diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index 0dfee81..da6f1e9 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -1210,6 +1210,7 @@ config KPROBES a probepoint and specifies the callback. Kprobes is useful for kernel debugging, non-intrusive instrumentation and testing. If in doubt, say "N". + endmenu source "arch/i386/Kconfig.debug" @@ -1259,3 +1260,30 @@ config X86_TRAMPOLINE config KTIME_SCALAR bool default y + + +menu "LITMUS^RT" + + +config SCHED_TASK_TRACE + bool "Trace real-time tasks" + default y + help + Include support for the sched_trace_XXX() tracing functions. This + allows the collection of real-time task events such as job + completions, job releases, early completions, etc. This results in a + small overhead in the scheduling code. Disable if the overhead is not + acceptable (e.g., benchmarking). + +config SCHED_DEBUG_TRACE + bool "TRACE() debugging" + default y + help + Include support for sched_trace_log_messageg(), which is used to + implement TRACE(). If disabled, no TRACE() messages will be included + in the kernel, and no overheads due to debugging statements will be + incurred by the scheduler. Disable if the overhead is not acceptable + (e.g. benchmarking). + + +endmenu diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c index 776d9be..2e8909f 100644 --- a/arch/i386/kernel/apic.c +++ b/arch/i386/kernel/apic.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -43,6 +44,8 @@ #include "io_ports.h" +#include + /* * cpu_mask that denotes the CPUs that needs timer interrupt coming in as * IPIs in place of local APIC timers @@ -54,6 +57,15 @@ static cpumask_t timer_bcast_ipi; */ static int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */ +/* + * Definitions and variables related to quantum synchronization. + */ +#define WAIT_TO_SYNC 30000 /* time after boot until sync */ +static int stagger = 0; /* are we using staggered quanta? */ +static atomic_t qsync_time = ATOMIC_INIT(INITIAL_JIFFIES); +static atomic_t quantum_sync_barrier = ATOMIC_INIT(0); +static atomic_t sync_done = ATOMIC_INIT(0); + static inline void lapic_disable(void) { enable_local_apic = -1; @@ -786,6 +798,23 @@ static int __init apic_set_verbosity(char *str) __setup("apic=", apic_set_verbosity); +/* + * Determine whether to use aligned or staggerd quanta. + */ + +static int __init apic_synch_type(char *str) +{ + if (strcmp("aligned", str) == 0) + stagger = 0; + else if (strcmp("staggered", str) == 0) + stagger = 1; + else + stagger = 0; /* aligned quanta by default */ + return 1; +} + +__setup("quanta=", apic_synch_type); + static int __init detect_init_APIC (void) { u32 h, l, features; @@ -1198,6 +1227,47 @@ EXPORT_SYMBOL(switch_ipi_to_APIC_timer); #undef APIC_DIVISOR /* + * This function is called to align all quanta, and to stagger quanta if + * necessary. It relies on a barrier to synchronize all processors, so + * that they all reset their APIC timers at the same time. If quanta + * should be staggered, the appropriate stagger delay is then added at + * each processor. + */ + +void synchronize_quanta(void) +{ + int cpu = smp_processor_id(); + int total_cpus = num_online_cpus(); + int stagger_interval = jiffies_to_usecs(1) / total_cpus; + + /* + * Disable APIC timer, wait for all other processors to reach barrier, + * and re-enable all timers concurrently. 
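The stagger arithmetic described above divides one timer quantum evenly across the online CPUs. The following standalone sketch (plain user-space C, not part of the patch) prints the per-CPU delays that synchronize_quanta() would apply, assuming HZ=1000 (so jiffies_to_usecs(1) is 1000 us) and four online CPUs.

/* Illustrative only: mirrors the stagger arithmetic in synchronize_quanta().
 * Assumes 1000 us per jiffie (HZ=1000); not part of the patch itself.
 */
#include <stdio.h>

int main(void)
{
    int total_cpus = 4;              /* num_online_cpus() in the patch  */
    int usecs_per_jiffie = 1000;     /* jiffies_to_usecs(1) with HZ=1000 */
    int stagger_interval = usecs_per_jiffie / total_cpus;
    int cpu;

    /* Each CPU delays cpu * stagger_interval microseconds after the
     * barrier, so quantum boundaries are spread evenly over one quantum.
     */
    for (cpu = 0; cpu < total_cpus; cpu++)
        printf("cpu %d: stagger delay = %d us\n", cpu, cpu * stagger_interval);
    return 0;
}

With "quanta=staggered" this prints delays of 0, 250, 500 and 750 us; with "quanta=aligned" every CPU resumes its APIC timer with no extra delay.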
+ */ + disable_APIC_timer(); + atomic_inc(&quantum_sync_barrier); + while (atomic_read(&quantum_sync_barrier) < total_cpus) { + /* Delay, otherwise atomic_inc's cannot occur. */ + udelay(1); + } + + /* Add necessary stagger for this CPU, if required. */ + if (stagger) { + int stagger_us = cpu * stagger_interval; + udelay(stagger_us); + } + + /* Re-enable all timers. */ + __setup_APIC_LVTT(calibration_result); + enable_APIC_timer(); + + /* The first CPU signals that quantum sync is complete. */ + if (cpu == 0) + atomic_inc(&sync_done); +} + + +/* * Local timer interrupt handler. It does both profiling and * process statistics/rescheduling. * @@ -1209,11 +1279,32 @@ EXPORT_SYMBOL(switch_ipi_to_APIC_timer); inline void smp_local_timer_interrupt(void) { +/* s64 offset; */ + + TS_TICK_START; + profile_tick(CPU_PROFILING); #ifdef CONFIG_SMP update_process_times(user_mode_vm(get_irq_regs())); #endif + /* Print out timing data - can be commented out if necessary. */ +/* offset = get_nsec_offset(); */ +/* TRACE("%d\n", offset); */ + + /* + * Synchronize quanta if we have reached qsync_time plus wait + * interval. The synchronization code itself is placed in its own + * (non-inline) function, to avoid issues with creating an inline + * function that is too large. + */ + if (unlikely(!atomic_read(&sync_done) && + time_after(jiffies, + (unsigned long)(atomic_read(&qsync_time) + + msecs_to_jiffies(WAIT_TO_SYNC))))) { + synchronize_quanta(); + } + /* * We take the 'long' return path, and there every subsystem * grabs the apropriate locks (kernel lock/ irq lock). @@ -1224,6 +1315,7 @@ inline void smp_local_timer_interrupt(void) * Currently this isn't too much of an issue (performance wise), * we can take more than 100K local irqs per second on a 100 MHz P5. */ + TS_TICK_END; } /* diff --git a/arch/i386/kernel/i386_ksyms.c b/arch/i386/kernel/i386_ksyms.c index e3d4b73..9670f77 100644 --- a/arch/i386/kernel/i386_ksyms.c +++ b/arch/i386/kernel/i386_ksyms.c @@ -6,6 +6,7 @@ EXPORT_SYMBOL(__down_failed); EXPORT_SYMBOL(__down_failed_interruptible); EXPORT_SYMBOL(__down_failed_trylock); EXPORT_SYMBOL(__up_wakeup); + /* Networking helper routines. 
*/ EXPORT_SYMBOL(csum_partial_copy_generic); diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S index 2697e92..9a5348f 100644 --- a/arch/i386/kernel/syscall_table.S +++ b/arch/i386/kernel/syscall_table.S @@ -319,3 +319,28 @@ ENTRY(sys_call_table) .long sys_move_pages .long sys_getcpu .long sys_epoll_pwait + /* LITMUS syscalls */ + .long sys_sched_setpolicy /* 320 */ + .long sys_sched_getpolicy + .long sys_set_rt_mode + .long sys_set_rt_task_param + .long sys_get_rt_task_param + .long sys_prepare_rt_task /* 325 */ + .long sys_ni_syscall /* CLEANUP: sys_reset_stat */ + .long sys_sleep_next_period + .long sys_scheduler_setup + .long sys_enter_np + .long sys_exit_np /* 330 */ + .long sys_pi_sema_init + .long sys_pi_down + .long sys_pi_up + .long sys_pi_sema_free + .long sys_sema_init /* 335 */ + .long sys_down + .long sys_up + .long sys_sema_free + .long sys_srp_sema_init + .long sys_srp_down /* 340 */ + .long sys_srp_up + .long sys_reg_task_srp_sem + .long sys_srp_sema_free /* 343 */ diff --git a/include/asm-i386/semaphore.h b/include/asm-i386/semaphore.h index 4e34a46..7212f4b 100644 --- a/include/asm-i386/semaphore.h +++ b/include/asm-i386/semaphore.h @@ -45,6 +45,7 @@ struct semaphore { atomic_t count; int sleepers; wait_queue_head_t wait; + int used; /* allows semaphores to allocated to user space processes */ }; diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h index 833fa17..ac5756d 100644 --- a/include/asm-i386/unistd.h +++ b/include/asm-i386/unistd.h @@ -325,10 +325,36 @@ #define __NR_move_pages 317 #define __NR_getcpu 318 #define __NR_epoll_pwait 319 +/* LITMUS */ +#define __NR_sched_setpolicy 320 +#define __NR_sched_getpolicy 321 +/* Syscall definitions for mode change and task creation-manipulation */ +#define __NR_set_rt_mode 322 +#define __NR_set_rt_task_param 323 +#define __NR_get_rt_task_param 324 +#define __NR_prepare_rt_task 325 +#define __NR_reset_stat 326 +#define __NR_sleep_next_period 327 +#define __NR_scheduler_setup 328 +#define __NR_enter_np 329 +#define __NR_exit_np 330 +#define __NR_pi_sema_init 331 +#define __NR_pi_down 332 +#define __NR_pi_up 333 +#define __NR_pi_sema_free 334 +#define __NR_sema_init 335 +#define __NR_down 336 +#define __NR_up 337 +#define __NR_sema_free 338 +#define __NR_srp_sema_init 339 +#define __NR_srp_down 340 +#define __NR_srp_up 341 +#define __NR_reg_task_srp_sem 342 +#define __NR_srp_sema_free 343 #ifdef __KERNEL__ -#define NR_syscalls 320 +#define NR_syscalls 343 #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR diff --git a/include/linux/edf_common.h b/include/linux/edf_common.h new file mode 100644 index 0000000..6b0eb2f --- /dev/null +++ b/include/linux/edf_common.h @@ -0,0 +1,77 @@ +/* EDF common data structures and utility functions shared by all EDF + * based scheduler plugins + */ + +/* CLEANUP: Add comments and make it less messy. + * + */ + +#ifndef __UNC_EDF_COMMON_H__ +#define __UNC_EDF_COMMON_H__ + +struct _edf_domain; + +typedef int (*edf_check_resched_needed_t)(struct _edf_domain *edf); +typedef struct _edf_domain { + /* runnable rt tasks are in here */ + rwlock_t ready_lock; + struct list_head ready_queue; + + /* real-time tasks waiting for release are in here */ + spinlock_t release_lock; + struct list_head release_queue; + + /* how do we check if we need to kick another CPU? 
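The new table slots and __NR_* numbers above can be exercised from user space with raw syscall(2) calls before any library wrapper exists. The sketch below is illustrative only: the rt_param struct layout is copied from include/linux/rt_param.h added later in this patch, MODE_RT_RUN (1) comes from include/linux/litmus.h, and a real build would pull both from the patched kernel headers instead of re-declaring them.

/* Sketch only: drives the LITMUS syscalls added to the table above by raw
 * number. Field names follow include/linux/rt_param.h from this patch.
 */
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#define __NR_set_rt_mode       322   /* as defined in asm-i386/unistd.h above */
#define __NR_set_rt_task_param 323

typedef struct rt_param {
    unsigned long exec_cost;   /* worst-case execution cost, in quanta */
    unsigned long period;      /* period, in quanta                    */
    unsigned int  cpu;         /* partition, for partitioned plugins   */
    int           class;       /* RT_CLASS_HARD == 0                   */
} rt_param_t;

int main(void)
{
    rt_param_t p = { .exec_cost = 2, .period = 10, .cpu = 0, .class = 0 };

    /* Register the calling process as a (2, 10) task, then request the
     * switch to real-time mode (MODE_RT_RUN == 1).
     */
    if (syscall(__NR_set_rt_task_param, getpid(), &p) < 0)
        perror("set_rt_task_param");
    if (syscall(__NR_set_rt_mode, 1) < 0)
        perror("set_rt_mode");
    return 0;
}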
*/ + edf_check_resched_needed_t check_resched; +} edf_domain_t; + +#define next_ready(edf) \ + (list_entry((edf)->ready_queue.next, struct task_struct, rt_list)) + +void edf_domain_init(edf_domain_t *edf, edf_check_resched_needed_t f); + +int edf_higher_prio(struct task_struct* first, + struct task_struct* second); + +void __add_ready(edf_domain_t* edf, struct task_struct *new); +void __add_release(edf_domain_t* edf, struct task_struct *task); + +struct task_struct* __take_ready(edf_domain_t* edf); +struct task_struct* __peek_ready(edf_domain_t* edf); + + +void try_release_pending(edf_domain_t* edf); +void __release_pending(edf_domain_t* edf); +void __prepare_new_release(struct task_struct *t, jiffie_t start); +#define prepare_new_release(t) __prepare_new_release(t, jiffies) +void prepare_for_next_period(struct task_struct *t); +void prepare_new_releases(edf_domain_t *edf, jiffie_t start); +void __prepare_new_releases(edf_domain_t *edf, jiffie_t start); +int preemption_needed(edf_domain_t* edf, struct task_struct *t); +long edf_sleep_next_period(void); + +#define job_completed(t) (!is_be(t) && \ + (t)->rt_param.times.exec_time == (t)->rt_param.basic_params.exec_cost) + +static inline void add_ready(edf_domain_t* edf, struct task_struct *new) +{ + unsigned long flags; + /* first we need the write lock for edf_ready_queue */ + write_lock_irqsave(&edf->ready_lock, flags); + __add_ready(edf, new); + write_unlock_irqrestore(&edf->ready_lock, flags); +} + +static inline void add_release(edf_domain_t* edf, struct task_struct *task) +{ + unsigned long flags; + /* first we need the write lock for edf_ready_queue */ + spin_lock_irqsave(&edf->release_lock, flags); + __add_release(edf, task); + spin_unlock_irqrestore(&edf->release_lock, flags); +} + +int edf_set_hp_task(struct pi_semaphore *sem); +int edf_set_hp_cpu_task(struct pi_semaphore *sem, int cpu); + +#endif diff --git a/include/linux/feather_buffer.h b/include/linux/feather_buffer.h new file mode 100644 index 0000000..c477772 --- /dev/null +++ b/include/linux/feather_buffer.h @@ -0,0 +1,108 @@ +#ifndef _FEATHER_BUFFER_H_ +#define _FEATHER_BUFFER_H_ + +/* requires UINT_MAX and memcpy */ + +static inline int fetch_and_inc(int *val) +{ + int ret = 1; + __asm__ __volatile__("lock; xaddl %0, %1" : "+r" (ret), "+m" (*val) : : "memory" ); + return ret; +} + +static inline int fetch_and_dec(int *val) +{ + int ret = -1; + __asm__ __volatile__("lock; xaddl %0, %1" : "+r" (ret), "+m" (*val) : : "memory" ); + return ret; +} + +#define SLOT_FREE 0 +#define SLOT_BUSY 1 +#define SLOT_READY 2 + +struct ft_buffer { + unsigned int slot_count; + unsigned int slot_size; + + int free_count; + unsigned int write_idx; + unsigned int read_idx; + + char* slots; + void* buffer_mem; + unsigned int failed_writes; +}; + +static inline int init_ft_buffer(struct ft_buffer* buf, + unsigned int slot_count, + unsigned int slot_size, + char* slots, + void* buffer_mem) +{ + int i = 0; + if (!slot_count || UINT_MAX % slot_count != slot_count - 1) { + /* The slot count must divide UNIT_MAX + 1 so that when it + * wraps around the index correctly points to 0. 
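The slot_count test above (UINT_MAX % slot_count == slot_count - 1) accepts exactly the counts that divide UINT_MAX + 1, i.e. powers of two for a 32-bit unsigned int, which is what lets write_idx wrap around without corrupting the modulo indexing. A quick standalone check (not part of the patch) makes that concrete.

/* Illustrative only: shows which slot counts pass the init_ft_buffer() test.
 * A count passes iff it divides UINT_MAX + 1, i.e. it is a power of two.
 */
#include <limits.h>
#include <stdio.h>

static int slot_count_ok(unsigned int slot_count)
{
    return slot_count && UINT_MAX % slot_count == slot_count - 1;
}

int main(void)
{
    unsigned int candidates[] = { 1, 100, 128, 1000, 1024 };
    unsigned int i;

    for (i = 0; i < sizeof(candidates) / sizeof(candidates[0]); i++)
        printf("slot_count %4u: %s\n", candidates[i],
               slot_count_ok(candidates[i]) ? "ok (power of two)" : "rejected");
    return 0;
}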
+ */ + return 0; + } else { + buf->slot_count = slot_count; + buf->slot_size = slot_size; + buf->slots = slots; + buf->buffer_mem = buffer_mem; + buf->free_count = slot_count; + buf->write_idx = 0; + buf->read_idx = 0; + buf->failed_writes = 0; + for (i = 0; i < slot_count; i++) + buf->slots[i] = SLOT_FREE; + return 1; + } +} + +static inline int ft_buffer_start_write(struct ft_buffer* buf, void **ptr) +{ + int free = fetch_and_dec(&buf->free_count); + unsigned int idx; + if (free <= 0) { + fetch_and_inc(&buf->free_count); + *ptr = 0; + fetch_and_inc(&buf->failed_writes); + return 0; + } else { + idx = fetch_and_inc((int*) &buf->write_idx) % buf->slot_count; + buf->slots[idx] = SLOT_BUSY; + *ptr = ((char*) buf->buffer_mem) + idx * buf->slot_size; + return 1; + } +} + +static inline void ft_buffer_finish_write(struct ft_buffer* buf, void *ptr) +{ + unsigned int idx = ((char*) ptr - (char*) buf->buffer_mem) / buf->slot_size; + buf->slots[idx] = SLOT_READY; +} + + +/* exclusive reader access is assumed */ +static inline int ft_buffer_read(struct ft_buffer* buf, void* dest) +{ + unsigned int idx; + if (buf->free_count == buf->slot_count) + /* nothing available */ + return 0; + idx = buf->read_idx % buf->slot_count; + if (buf->slots[idx] == SLOT_READY) { + memcpy(dest, ((char*) buf->buffer_mem) + idx * buf->slot_size, + buf->slot_size); + buf->slots[idx] = SLOT_FREE; + buf->read_idx++; + fetch_and_inc(&buf->free_count); + return 1; + } else + return 0; +} + + +#endif diff --git a/include/linux/feather_trace.h b/include/linux/feather_trace.h new file mode 100644 index 0000000..57a21a5 --- /dev/null +++ b/include/linux/feather_trace.h @@ -0,0 +1,93 @@ +#ifndef _FEATHER_TRACE_H_ +#define _FEATHER_TRACE_H_ + +#define feather_callback __attribute__((regparm(0))) + +/* make the compiler reload any register that is not saved in + * a cdecl function call + */ +#define CLOBBER_LIST "memory", "cc", "eax", "ecx", "edx" + +#define ft_event(id, callback) \ + __asm__ __volatile__( \ + "1: jmp 2f \n\t" \ + " call " #callback " \n\t" \ + ".section __event_table, \"aw\" \n\t" \ + ".long " #id ", 0, 1b, 2f \n\t" \ + ".previous \n\t" \ + "2: \n\t" \ + : : : CLOBBER_LIST) + +#define ft_event0(id, callback) \ + __asm__ __volatile__( \ + "1: jmp 2f \n\t" \ + " subl $4, %%esp \n\t" \ + " movl $" #id ", (%%esp) \n\t" \ + " call " #callback " \n\t" \ + " addl $4, %%esp \n\t" \ + ".section __event_table, \"aw\" \n\t" \ + ".long " #id ", 0, 1b, 2f \n\t" \ + ".previous \n\t" \ + "2: \n\t" \ + : : : CLOBBER_LIST) + +#define ft_event1(id, callback, param) \ + __asm__ __volatile__( \ + "1: jmp 2f \n\t" \ + " subl $8, %%esp \n\t" \ + " movl %0, 4(%%esp) \n\t" \ + " movl $" #id ", (%%esp) \n\t" \ + " call " #callback " \n\t" \ + " addl $8, %%esp \n\t" \ + ".section __event_table, \"aw\" \n\t" \ + ".long " #id ", 0, 1b, 2f \n\t" \ + ".previous \n\t" \ + "2: \n\t" \ + : : "r" (param) : CLOBBER_LIST) + +#define ft_event2(id, callback, param, param2) \ + __asm__ __volatile__( \ + "1: jmp 2f \n\t" \ + " subl $12, %%esp \n\t" \ + " movl %1, 8(%%esp) \n\t" \ + " movl %0, 4(%%esp) \n\t" \ + " movl $" #id ", (%%esp) \n\t" \ + " call " #callback " \n\t" \ + " addl $12, %%esp \n\t" \ + ".section __event_table, \"aw\" \n\t" \ + ".long " #id ", 0, 1b, 2f \n\t" \ + ".previous \n\t" \ + "2: \n\t" \ + : : "r" (param), "r" (param2) : CLOBBER_LIST) + + +#define ft_event3(id, callback, p, p2, p3) \ + __asm__ __volatile__( \ + "1: jmp 2f \n\t" \ + " subl $16, %%esp \n\t" \ + " movl %1, 12(%%esp) \n\t" \ + " movl %1, 8(%%esp) \n\t" \ + " movl %0, 
4(%%esp) \n\t" \ + " movl $" #id ", (%%esp) \n\t" \ + " call " #callback " \n\t" \ + " addl $16, %%esp \n\t" \ + ".section __event_table, \"aw\" \n\t" \ + ".long " #id ", 0, 1b, 2f \n\t" \ + ".previous \n\t" \ + "2: \n\t" \ + : : "r" (p), "r" (p2), "r" (p3) : CLOBBER_LIST) + + +static inline unsigned long long ft_read_tsc(void) +{ + unsigned long long ret; + __asm__ __volatile__("rdtsc" : "=A" (ret)); + return ret; +} + +int ft_enable_event(unsigned long id); +int ft_disable_event(unsigned long id); +int ft_is_event_enabled(unsigned long id); +int ft_disable_all_events(void); + +#endif diff --git a/include/linux/fifo_common.h b/include/linux/fifo_common.h new file mode 100644 index 0000000..5364e2b --- /dev/null +++ b/include/linux/fifo_common.h @@ -0,0 +1,33 @@ +/* FIFO common definitions and utility functions. + */ +#ifndef __UNC_SCHED_FIFO_H__ +#define __UNC_SCHED_FIFO_H__ + +typedef struct { + struct list_head queue; + atomic_t count; + spinlock_t lock; + unsigned int time_slice; +} fifo_domain_t; + +#define FIFO_INIT(name, time_slice) \ + { LIST_HEAD_INIT(name.queue), \ + ATOMIC_INIT(0), \ + SPIN_LOCK_UNLOCKED, \ + time_slice} + +void fifo_domain_init(fifo_domain_t* fifo, unsigned int exec_budget); +void fifo_enqueue(fifo_domain_t* fifo, struct task_struct* task); +void fifo_add(fifo_domain_t* fifo, struct task_struct* task); +void lifo_add(fifo_domain_t* fifo, struct task_struct* task); +struct task_struct* __fifo_take(fifo_domain_t* fifo); +struct task_struct* fifo_take(fifo_domain_t* fifo); +struct task_struct* fifo_take_rq(fifo_domain_t* fifo, runqueue_t* rq, int cpu); + +static inline int fifo_jobs_pending(fifo_domain_t* fifo) +{ + return atomic_read(&fifo->count) > 0; +} + + +#endif diff --git a/include/linux/litmus.h b/include/linux/litmus.h new file mode 100644 index 0000000..73ea643 --- /dev/null +++ b/include/linux/litmus.h @@ -0,0 +1,124 @@ +/* + * Constant definitions related to + * scheduling policy. + */ + +#ifndef _LINUX_LITMUS_H_ +#define _LINUX_LITMUS_H_ + +#include +#include + +typedef enum { + SCHED_BEG = 0, + SCHED_LINUX = 0, + SCHED_PFAIR = 1, + SCHED_PFAIR_STAGGER = 2, + SCHED_PART_EDF = 3, + SCHED_PART_EEVDF = 4, + SCHED_GLOBAL_EDF = 5, + SCHED_PFAIR_DESYNC = 6, + SCHED_GLOBAL_EDF_NP = 7, + SCHED_CUSTOM = 8, + SCHED_EDF_HSB = 9, + SCHED_GSN_EDF = 10, + SCHED_PSN_EDF = 11, + + /* Add your scheduling policy here */ + + SCHED_END = 11, + SCHED_DEFAULT = 0, + SCHED_INVALID = -1, +} spolicy; + +/* no options */ +#define SCHED_NONE 0 +/* make scheduling decisions at quantum boundaries */ +#define SCHED_QUANTUM 1 +/* only schedule RT tasks at slot boundaries */ +#define SCHED_RT_AT_BOUND 2 +/* default slot size - number of 1ms jiffies in a scheduling quantum */ +#define DEFAULT_SLOT_SIZE 1 +/* stagger value for no staggering of slot boundaries */ +#define DEFAULT_NO_STAGGER 0 +/* default stagger - number of 1ms jiffies by which processors + * are staggered, modulo the slot size + */ +#define DEFAULT_STAGGER 2 + +/* Runtime modes */ +/* CLEANUP: Should maybe an enum? 
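The ft_eventN() macros and the feather_callback attribute introduced in include/linux/feather_trace.h above pair an instrumentation site with a callback that is patched in and out at run time; include/linux/trace.h later in this patch wires ft_event0() to save_timestamp() in exactly this way. The fragment below is only a kernel-context sketch of such an event site; the event id 42 and the count_event()/demo_* names are invented for illustration.

/* Kernel-context sketch (not part of the patch): a feather-trace event site. */
#include <linux/feather_trace.h>

unsigned long demo_hit_count;

/* Runs only while event 42 is enabled; feather_callback forces the
 * stack-based calling convention that ft_event0() relies on, so the id
 * pushed by the macro arrives as the first stack argument.
 */
feather_callback void count_event(unsigned long id)
{
    demo_hit_count++;
}

void demo_hot_path(void)
{
    /* Compiles to a two-byte jump over the call while the event is off;
     * ft_enable_event(42) rewrites the jump offset to 0 to activate it,
     * and ft_disable_event(42) restores the skip.
     */
    ft_event0(42, count_event);
}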
*/ +#define MAX_MODES 2 +#define MODE_NON_RT 0 +#define MODE_RT_RUN 1 + +/* Plugin boot options, for convenience */ +#define PLUGIN_LINUX "linux" +#define PLUGIN_PFAIR "pfair" +#define PLUGIN_PART_EDF "part_edf" +#define PLUGIN_GLOBAL_EDF "global_edf" +#define PLUGIN_PFAIR_STAGGER "stagger" +#define PLUGIN_PFAIR_DESYNC "desync" +#define PLUGIN_GLOBAL_EDF_NP "global_edf_np" +#define PLUGIN_EDF_HSB "edf_hsb" +#define PLUGIN_GSN_EDF "gsn_edf" +#define PLUGIN_PSN_EDF "psn_edf" + + +/* Additional clone flags + Indicates that the thread is to be used in + realtime mode, therefore it should not be + woken up in a linux manner, + we just set its state to TASK_STOPPED + It must be prepared and added to the ready queue explicitly +*/ + +/* Type definition for our quantums */ +typedef unsigned long long quantum_t; + +extern spolicy sched_policy; +extern int sched_options; +/* Make this function available to plugins */ +void set_sched_options(int); + +extern unsigned long slot_size; +extern unsigned long stagger_offset; + +/* RT mode start time */ +extern volatile unsigned long rt_start_time; + +/* Here we store the current mode of the system */ +extern atomic_t rt_mode; + +#define get_rt_mode() (atomic_read(&rt_mode)) +#define set_rt_mode(a) atomic_set(&rt_mode,(a)) + +/* CLEANUP: Should be queue_lock, does it really belong here? */ +extern spinlock_t litmus_task_set_lock; + + +#define TRACE(fmt, args...) \ + sched_trace_log_message("%d: " fmt, raw_smp_processor_id(), ## args) + +#define TRACE_TASK(t, fmt, args...) \ + TRACE("(%s/%d) " fmt, (t)->comm, (t)->pid, ##args) + +#define TRACE_CUR(fmt, args...) \ + TRACE_TASK(current, fmt, ## args) + +/* in_list - is a given list_head queued on some list? + */ +static inline int in_list(struct list_head* list) +{ + return !( /* case 1: deleted */ + (list->next == LIST_POISON1 && + list->prev == LIST_POISON2) + || + /* case 2: initialized */ + (list->next == list && + list->prev == list) + ); +} + + +#endif diff --git a/include/linux/pfair_common.h b/include/linux/pfair_common.h new file mode 100644 index 0000000..67e18c6 --- /dev/null +++ b/include/linux/pfair_common.h @@ -0,0 +1,40 @@ +/* PFAIR common data structures and utility functions shared by all PFAIR + * based scheduler plugins + */ + +#ifndef __UNC_PFAIR_COMMON_H__ +#define __UNC_PFAIR_COMMON_H__ + +#include +#include + +typedef struct _pfair_domain { + /* Global lock to protect the data structures */ + queuelock_t pfair_lock; + /* runnable rt tasks are in here */ + struct list_head ready_queue; + + /* real-time tasks waiting for release are in here */ + struct list_head release_queue; + + /* CPU's in the domain */ + cpumask_t domain_cpus; + +} pfair_domain_t; + +#define next_ready(pfair) \ + (list_entry((pfair)->ready_queue.next, struct task_struct, rt_list)) +void pfair_domain_init(pfair_domain_t *pfair); +void pfair_add_ready(pfair_domain_t* pfair, struct task_struct *new); +struct task_struct* __pfair_take_ready(pfair_domain_t* pfair); +void pfair_add_release(pfair_domain_t* pfair, struct task_struct *task); +void pfair_try_release_pending(pfair_domain_t* pfair); +void __pfair_prepare_new_release(struct task_struct *t, jiffie_t start); + +void pfair_prepare_next_job(struct task_struct *t); +void pfair_prepare_next_subtask(struct task_struct *t); + +void pfair_prepare_new_releases(pfair_domain_t *pfair, jiffie_t start); + +#endif + diff --git a/include/linux/pfair_math.h b/include/linux/pfair_math.h new file mode 100644 index 0000000..dab1778 --- /dev/null +++ b/include/linux/pfair_math.h @@ 
-0,0 +1,77 @@ +/* PFAIR Mathematical functions */ +#ifndef __UNC_PFAIR_MATH_H__ +#define __UNC_PFAIR_MATH_H__ + +#include +#include +#include +#include + +/* +* This file defines mathematical functions "ceiling", "floor", +* and PFAIR specific functions for computing the release and +* the deadline of a subtask, as well as tie breakers: +* b-bit and group deadline. +*/ +static inline quantum_t FLOOR(quantum_t a, unsigned long b) +{ + BUG_ON( b == 0); + do_div(a, b); + return a; +} +static inline quantum_t CEIL(quantum_t a, unsigned long b) +{ + quantum_t t = FLOOR(a, b); + return (quantum_t)((t * b == a) ? t : (t + 1)); +} + + +/* +* invariant - i-1=get_passed_quanta(t) +* +* release time of i-th subtask of j-th job is +* r_{ij}+\lfloor i-1/wt(T) \rfloor +* This operation should be robust to wrap-around +* so we can compare the result with jiffies safely +*/ +static inline quantum_t release_time(struct task_struct * t) +{ + quantum_t e = get_exec_cost(t); + quantum_t p = get_rt_period(t); + return FLOOR((get_passed_quanta(t)) * p, e); +} +/* +* deadline time of i-th subtask of j-th job is +* r_{ij}+\lceil i/wt(T) \rceil +* This operation should be robust to wrap-around +* so we can compare the result with jiffies safely +*/ +static inline quantum_t pfair_deadline(struct task_struct * t) +{ + quantum_t e = get_exec_cost(t); + quantum_t p = get_rt_period(t); + return CEIL((get_passed_quanta(t) + 1) * p, e); +} +/* In PFAIR b-bit is defined as +* \lceil i/wt(T) \rceil-\lfloor i/wt(T) \rfloor +*/ +static inline int b_bit(struct task_struct *t) +{ + quantum_t e = get_exec_cost(t); + quantum_t p = get_rt_period(t); + return CEIL((get_passed_quanta(t) + 1) * p, e)- + FLOOR((get_passed_quanta(t) + 1) * p, e); +} +/* +* Group deadline +*/ +static inline quantum_t group_deadline(struct task_struct * t) +{ + quantum_t p = get_rt_period(t); + quantum_t e = get_exec_cost(t); + quantum_t stage1 = CEIL((get_passed_quanta(t) + 1) * p, e); + quantum_t stage2 = CEIL(stage1 * (p - e), p); + return CEIL(stage2 * p, p - e); +} + +#endif /* __UNC_PFAIR_MATH_H__ */ diff --git a/include/linux/queuelock.h b/include/linux/queuelock.h new file mode 100644 index 0000000..454ff81 --- /dev/null +++ b/include/linux/queuelock.h @@ -0,0 +1,98 @@ +#ifndef _UNC_QUEUELOCK_H_ +#define _UNC_QUEUELOCK_H_ +/** +* Queue lock +* +* This is an implementation of T. Anderson's queue lock. +* It strives to follow the normal Linux locking conventions +* as much as possible. The rules for acquiring a lock are: +* +* 1) The caller must ensure interrupts and preemptions are disabled. +* +* 2) The caller _cannot_ recursively acquire the lock. +* +* 3) The caller may not sleep while holding the lock. This is currently +* not enforced, but it will not work. +*/ + +#include +#include +#include + +typedef struct { + /* pad the values being spun on to make sure + that they are cache local + */ + union { + volatile enum { + MUST_WAIT, + HAS_LOCK + } val; + char padding[SMP_CACHE_BYTES]; + } slots[NR_CPUS]; + + /* since spin_slot is not being spun on it can be + * in a shared cache line. next_slot will be evicted + * anyway on every attempt to acquire the lock. + */ + int spin_slot[NR_CPUS]; + + /* The next slot that will be available. 
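The window formulas in pfair_math.h above are easier to check with concrete numbers. The standalone sketch below (plain user-space C, not patch code) evaluates release_time(), pfair_deadline() and b_bit() for a task with exec cost 2 and period 5 (weight 2/5), where get_passed_quanta() plays the role of i - 1 for the i-th subtask of the current job.

/* Illustrative only: Pfair subtask windows for exec_cost e = 2, period p = 5. */
#include <stdio.h>

typedef unsigned long long quantum_t;

static quantum_t FLOOR(quantum_t a, unsigned long b) { return a / b; }
static quantum_t CEIL(quantum_t a, unsigned long b)
{
    quantum_t t = FLOOR(a, b);
    return t * b == a ? t : t + 1;
}

int main(void)
{
    unsigned long e = 2, p = 5;
    quantum_t passed;

    for (passed = 0; passed < e; passed++) {
        quantum_t release  = FLOOR(passed * p, e);        /* release_time()   */
        quantum_t deadline = CEIL((passed + 1) * p, e);   /* pfair_deadline() */
        int b = (int)(CEIL((passed + 1) * p, e) - FLOOR((passed + 1) * p, e));

        printf("subtask %llu: window [%llu, %llu), b-bit %d\n",
               passed + 1, release, deadline, b);
    }
    return 0;
}

For weight 2/5 this prints the expected windows [0, 3) with b-bit 1 and [2, 5) with b-bit 0.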
+ */ + atomic_t next_slot; +} queuelock_t; + + +static inline void queue_lock_init(queuelock_t *lock) +{ + int i; + for (i = 0; i < NR_CPUS; i++) { + lock->slots[i].val = MUST_WAIT; + lock->spin_slot[i] = i; + } + lock->slots[0].val = HAS_LOCK; + atomic_set(&lock->next_slot, 0); +} + + +static inline void queue_lock(queuelock_t *lock) +{ + int me = smp_processor_id(); + volatile int* spin_var; + /* Get slot to spin on. atomic_inc_return() returns the incremented + * value, so take one of again + */ + lock->spin_slot[me] = atomic_inc_return(&lock->next_slot) - 1; + /* check for wrap-around + * This could probably optimized away if we ensure that NR_CPUS divides + * INT_MAX... + */ + if (unlikely(lock->spin_slot[me] == NR_CPUS - 1)) + atomic_add(-NR_CPUS, &lock->next_slot); + /* range limit*/ + lock->spin_slot[me] %= NR_CPUS; + /* spin until you acquire the lock */ + spin_var = (int*) &lock->slots[lock->spin_slot[me]].val; + while (*spin_var == MUST_WAIT) + cpu_relax(); + + /* reset the lock */ + lock->slots[lock->spin_slot[me]].val = MUST_WAIT; + barrier(); +} + + +static inline void queue_unlock(queuelock_t *lock) +{ + int me = smp_processor_id(); + barrier(); + lock->slots[(lock->spin_slot[me] + 1) % NR_CPUS].val = HAS_LOCK; +} + +#define queue_lock_irqsave(lock, flags) \ + do { local_irq_save(flags); queue_lock(lock); } while (0); + +#define queue_unlock_irqrestore(lock, flags) \ + do { queue_unlock(lock); local_irq_restore(flags); } while (0); + +#endif /* _UNC_QUEUELOCK_H_ */ diff --git a/include/linux/rt_param.h b/include/linux/rt_param.h new file mode 100644 index 0000000..a305619 --- /dev/null +++ b/include/linux/rt_param.h @@ -0,0 +1,174 @@ +/* + * Definition of the scheduler plugin interface. + * + */ +#ifndef _LINUX_RT_PARAM_H_ +#define _LINUX_RT_PARAM_H_ + +#include + +typedef unsigned long jiffie_t; + +/* different types of clients */ +typedef enum { + RT_CLASS_HARD, + RT_CLASS_SOFT, + RT_CLASS_BEST_EFFORT +} task_class_t; + +typedef struct rt_param { + unsigned long exec_cost; + unsigned long period; + unsigned int cpu; + task_class_t class; +} rt_param_t; + +typedef struct { + /* when will this task be release the next time? */ + jiffie_t release; + /* time instant the last job was released */ + jiffie_t last_release; + /* what is the current deadline? */ + jiffie_t deadline; + /* b-bit tie breaker for PFAIR, it is ignored in EDF */ + int b_bit; + /* group deadline tie breaker, it is ignored in EDF */ + jiffie_t group_deadline; + /* how long has this task executed so far? + * In case of capacity sharing a job completion cannot be + * detected by checking time_slice == 0 as the job may have + * executed while using another capacity. Use this counter + * to keep track of the time spent on a CPU by a job. + * + * In other words: The number of consumed quanta since the + * last job release. + */ + unsigned int exec_time; +} in_times_t; + + +/* RT task parameters for scheduling extensions + * These parameters are inherited during clone and therefore must + * be explicitly set up before the task set is launched. + */ +typedef struct task_rt_param { + /* Real-time marker */ + int is_realtime; + /* user controlled parameters */ + rt_param_t basic_params; + /* is the task sleeping? */ + unsigned int flags; + /* task representing the current "inherited" task + * priority, assigned by inherit_priority and + * return priority in the scheduler plugins. + * could point to self if PI does not result in + * an increased task priority. 
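A kernel-context sketch (not part of the patch) of the calling convention for the queue lock above: callers go through the *_irqsave wrappers so that interrupts and preemption are off as required, and the lock is granted in FIFO order of the atomic_inc_return() on next_slot. The stat_lock and rt_schedule_count names are invented for illustration.

/* Kernel-context sketch: protecting a shared counter with the queue lock. */
#include <linux/queuelock.h>

static queuelock_t stat_lock;
static unsigned long rt_schedule_count;

void stats_init(void)
{
    queue_lock_init(&stat_lock);
}

void stats_account_schedule(void)
{
    unsigned long flags;

    /* queue_lock() requires interrupts and preemption to be disabled,
     * so use the irqsave wrapper from queuelock.h.
     */
    queue_lock_irqsave(&stat_lock, flags);
    rt_schedule_count++;
    queue_unlock_irqrestore(&stat_lock, flags);
}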
+ */ + struct task_struct* inh_task; + + unsigned int is_non_preemptable; + + /* put information for feedback control stuff and + * information about the performance of the task here + */ + struct { + /* How many non-tardy jobs since the last tardy job? */ + unsigned int nontardy_jobs_ctr; + } stats; + + in_times_t times; + in_times_t backup; + + /* is this task under control of litmus? + * + * this is necessary because otherwise signal delivery code + * may try to wake up a task that is already queued in plugin + * data structures. + */ + int litmus_controlled:1; + int subject_to_srp:1; + + + /* This field can be used by plugins to store where the task + * is currently scheduled. It is the responsibility of the + * plugin to avoid race conditions. + */ + int scheduled_on; + + /* This field can be used by plugins to store where the task + * is currently linked. It is the responsibility of the plugin + * to avoid race conditions. + */ + int linked_on; +} task_rt_param_t; + +/* Possible RT flags */ +#define RT_F_RUNNING 0x00000000 +#define RT_F_SLEEP 0x00000001 +#define RT_F_EXP_QUANTA 0x00000002 +#define RT_F_NON_PREEMTABLE 0x00000004 +#define RT_F_EXIT_SEM 0x00000008 + +/* Realtime utility macros */ +#define get_passed_quanta(t) ((t)->rt_param.times.exec_time) +#define inc_passed_quanta(t) ((t)->rt_param.times.exec_time += 1) +#define get_rt_flags(t) ((t)->rt_param.flags) +#define set_rt_flags(t,f) (t)->rt_param.flags=(f) +#define get_exec_cost(t) ((t)->rt_param.basic_params.exec_cost) +#define get_rt_period(t) ((t)->rt_param.basic_params.period) +#define set_rt_period(t,p) (t)->rt_param.basic_params.period=(p) +#define set_exec_cost(t,e) (t)->rt_param.basic_params.exec_cost=(e) +#define get_partition(t) (t)->rt_param.basic_params.cpu +#define get_deadline(t) ((t)->rt_param.times.deadline) +#define get_class(t) ((t)->rt_param.basic_params.class) + +#define is_realtime(t) ((t)->rt_param.is_realtime) +#define is_subject_to_srp(t) ((t)->rt_param.subject_to_srp) +#define is_hrt(t) \ + ((t)->rt_param.basic_params.class == RT_CLASS_HARD) +#define is_srt(t) \ + ((t)->rt_param.basic_params.class == RT_CLASS_SOFT) +#define is_be(t) \ + ((t)->rt_param.basic_params.class == RT_CLASS_BEST_EFFORT) +#define is_np(t) ((t)->rt_param.is_non_preemptable) + +#define clear_rt_params(t) \ +memset(&(t)->rt_param,0, sizeof(struct task_rt_param)) + +#define get_last_release_time(t) ((t)->rt_param.times.last_release) +#define set_last_release_time(t,r) ((t)->rt_param.times.last_release=(r)) + +#define get_release(t) ((t)->rt_param.times.release) +#define set_release(t,r) ((t)->rt_param.times.release=(r)) + +/* honor the flag that is set when scheduling is in progress + * This is some dirty hack in Linux that creates race conditions in our code + * if don't pay attention to it. 
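As a quick numeric reading of the task_slack() macro above (illustrative user-space C, not patch code): slack is the time remaining until the deadline minus the work the job still owes.

#include <stdio.h>

int main(void)
{
    int deadline = 100, now = 90;      /* jiffies                        */
    int exec_cost = 4, exec_time = 1;  /* quanta budgeted / consumed     */
    int slack = deadline - now - (exec_cost - exec_time);

    /* 10 quanta to go, 3 still needed, so 7 quanta of slack. */
    printf("slack = %d quanta\n", slack);
    return 0;
}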
+ */ +#define is_running(t) \ + ((t)->state == TASK_RUNNING || \ + (t)->thread_info->preempt_count & PREEMPT_ACTIVE) + +#define is_blocked(t) (!is_running(t)) +#define is_released(t) (time_before_eq((t)->rt_param.times.release, jiffies)) +#define is_tardy(t) (time_before_eq((t)->rt_param.times.deadline, jiffies)) +#define task_slack(t) ( (int) (t)->rt_param.times.deadline - (int) jiffies - \ + (int) ((t)->rt_param.basic_params.exec_cost - \ + (t)->rt_param.times.exec_time)) + + +/* real-time comparison macros */ +#define earlier_deadline(a, b) (time_before(\ + (a)->rt_param.times.deadline,\ + (b)->rt_param.times.deadline)) +#define earlier_release(a, b) (time_before(\ + (a)->rt_param.times.release,\ + (b)->rt_param.times.release)) + +#define backup_times(t) do { (t)->rt_param.backup=(t)->rt_param.times; \ + } while(0); +#define restore_times(t) do { (t)->rt_param.times=(t)->rt_param.backup; \ + } while(0); + + +#endif diff --git a/include/linux/sched.h b/include/linux/sched.h index 4463735..f533ae3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -3,6 +3,8 @@ #include /* For AT_VECTOR_SIZE */ +#include + /* * cloning flags: */ @@ -26,6 +28,8 @@ #define CLONE_STOPPED 0x02000000 /* Start in stopped state */ #define CLONE_NEWUTS 0x04000000 /* New utsname group? */ #define CLONE_NEWIPC 0x08000000 /* New ipcs */ +#define CLONE_REALTIME 0x10000000 /* LITMUS real-time task creation */ + /* * Scheduling policies @@ -1051,6 +1055,12 @@ struct task_struct { #ifdef CONFIG_FAULT_INJECTION int make_it_fail; #endif + /* litmus parameters and state */ + task_rt_param_t rt_param; + + /* allow scheduler plugins to queue in release lists, etc. */ + struct list_head rt_list; + }; static inline pid_t process_group(struct task_struct *tsk) diff --git a/include/linux/sched_plugin.h b/include/linux/sched_plugin.h new file mode 100644 index 0000000..6f09512 --- /dev/null +++ b/include/linux/sched_plugin.h @@ -0,0 +1,168 @@ +/* + * Definition of the scheduler plugin interface. + * + */ +#ifndef _LINUX_SCHED_PLUGIN_H_ +#define _LINUX_SCHED_PLUGIN_H_ + +#include + +/* struct for semaphore with priority inheritance */ +struct pi_semaphore { + atomic_t count; + int sleepers; + wait_queue_head_t wait; + union { + /* highest-prio holder/waiter */ + struct task_struct *task; + struct task_struct* cpu_task[NR_CPUS]; + } hp; + /* current lock holder */ + struct task_struct *holder; + /* is the semaphore being used? */ + int used; +}; + + +/* Enforce runqueues to be opaque objects. + * + * This allows us to pass around pointers to runqueues, + * without actually having to rip it out of sched.c. It + * also discourages plugins from trying to be + * overly clever. + */ +typedef void runqueue_t; + +/********************* real-time callbacks ********************/ + +/* Special plugin shutdown hook that clear plugin data structures + Currently is not supported +*/ +typedef void (*plugin_shutdown_hook_t) (void); + + +/********************* scheduler invocation ******************/ + +typedef enum { + NO_RESCHED = 0, + FORCE_RESCHED = 1 +} reschedule_check_t; + + +/* Plugin-specific realtime tick handler */ +typedef reschedule_check_t (*scheduler_tick_t) (void); +/* Novell make sched decision function */ +typedef int (*schedule_t) (struct task_struct * prev, + struct task_struct ** next, + runqueue_t * rq); +/* Clean up after the task switch has occured. + * This function is called after every (even non-rt) task switch. 
+ */ +typedef void (*finish_switch_t)(struct task_struct *prev); + + +/********************* task state changes ********************/ + +/* called to setup a new real-time task */ +typedef long (*prepare_task_t) (struct task_struct *task); +/* called to re-introduce a task after blocking */ +typedef void (*wake_up_task_t) (struct task_struct *task); +/* called to notify the plugin of a blocking real-time task + * it will only be called for real-time tasks and before schedule is called */ +typedef void (*task_blocks_t) (struct task_struct *task); +/* called when a real-time task exits. Free any allocated resources */ +typedef long (*tear_down_t) (struct task_struct *); + +/* called when a real-time task wants to enter a non-preemptable section */ +typedef long (*enter_np_t) (struct task_struct *); +/* called when a real-time task wants to leave a non-preemptable section */ +typedef long (*exit_np_t) (struct task_struct *); + + +/* Called when the new_owner is released from the wait queue + * it should now inherit the priority from sem, _before_ it gets readded + * to any queue + */ +typedef long (*inherit_priority_t) (struct pi_semaphore *sem, + struct task_struct *new_owner); + +/* Called when the current task releases a semahpore where it might have + * inherited a piority from + */ +typedef long (*return_priority_t) (struct pi_semaphore *sem); + +/* Called when a task tries to acquire a semaphore and fails. Check if its + * priority is higher than that of the current holder. + */ +typedef long (*pi_block_t) (struct pi_semaphore *sem, struct task_struct *t); + + +/********************* sys call backends ********************/ +/* This function causes the caller to sleep until the next release */ +typedef long (*sleep_next_period_t) (void); + +typedef int (*scheduler_setup_t) (int cmd, void __user *parameter); + +typedef int (*mode_change_t) (int); + +struct sched_plugin { + /* basic info */ + char *plugin_name; + int ready_to_use; + + /* management interface */ + plugin_shutdown_hook_t shutdown_hook; /*currently unsupported */ + mode_change_t mode_change; + + /* scheduler invocation */ + scheduler_tick_t scheduler_tick; + scheduler_tick_t algo_scheduler_tick; + schedule_t schedule; + finish_switch_t finish_switch; + + /* syscall backend */ + sleep_next_period_t sleep_next_period; + scheduler_setup_t scheduler_setup; + + /* task state changes */ + prepare_task_t prepare_task; + wake_up_task_t wake_up_task; + task_blocks_t task_blocks; + tear_down_t tear_down; + + /* non-preemptable sections */ + enter_np_t enter_np; + exit_np_t exit_np; + + /* priority inheritance */ + inherit_priority_t inherit_priority; + return_priority_t return_priority; + pi_block_t pi_block; +} __attribute__ ((__aligned__(SMP_CACHE_BYTES))); + +typedef struct sched_plugin sched_plugin_t; + +extern sched_plugin_t *curr_sched_plugin; + + +/* common scheduler tick */ +reschedule_check_t rt_scheduler_tick(void); + + +/* Don't pull in our definitions on top of the real ones + * in sched.c! + */ +#ifndef __SCHED_C__ + +/* External linux scheduler facilities */ +void deactivate_task(struct task_struct *, runqueue_t *); +/* This function is defined in sched.c. We need acces to it for + * indirect switching. 
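A kernel-context sketch of how the callback table above might be filled in by a plugin; every demo_* name here is invented for illustration, and the real plugins added by this patch (sched_gsn_edf.c, sched_psn_edf.c, and so on) provide full implementations of the remaining hooks.

/* Kernel-context sketch (not part of the patch): a skeletal plugin. */
#include <linux/sched.h>
#include <linux/sched_plugin.h>

static reschedule_check_t demo_scheduler_tick(void)
{
    /* Never forces the scheduler to run in this skeleton. */
    return NO_RESCHED;
}

static long demo_prepare_task(struct task_struct *t)
{
    return 0;   /* accept every task without any per-task setup */
}

static long demo_tear_down(struct task_struct *t)
{
    return 0;   /* nothing to free */
}

sched_plugin_t demo_plugin = {
    .plugin_name    = "demo",
    .ready_to_use   = 1,
    .scheduler_tick = demo_scheduler_tick,
    .prepare_task   = demo_prepare_task,
    .tear_down      = demo_tear_down,
    /* hooks left NULL are simply not provided by this skeleton */
};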
+ */ +void __activate_task(struct task_struct *, runqueue_t *); +void __setscheduler(struct task_struct *, int, int); + +#endif + +extern int get_sched_options(void); +#endif diff --git a/include/linux/sched_trace.h b/include/linux/sched_trace.h new file mode 100644 index 0000000..47cd4ed --- /dev/null +++ b/include/linux/sched_trace.h @@ -0,0 +1,150 @@ +/* sched_trace.h -- record scheduler events to a byte stream for offline analysis. + */ +#ifndef _LINUX_SCHED_TRACE_H_ +#define _LINUX_SCHED_TRACE_H_ + +#include + +typedef enum { + ST_INVOCATION = 0, + ST_ARRIVAL = 1, + ST_DEPARTURE = 2, + ST_PREEMPTION = 3, + ST_SCHEDULED = 4, + ST_JOB_RELEASE = 5, + ST_JOB_COMPLETION = 6, + ST_CAPACITY_RELEASE = 7, + ST_CAPACITY_ALLOCATION = 8, +} trace_type_t; + +typedef struct { + trace_type_t trace:8; + unsigned long long timestamp; +} trace_header_t; + + +typedef struct { + unsigned int is_rt:1; + unsigned int is_server:1; + task_class_t class:4; + unsigned int budget:24; + u32 deadline; + + pid_t pid; +} task_info_t; + +typedef struct { + trace_header_t header; + unsigned long flags; +} invocation_record_t; + +typedef struct { + trace_header_t header; + task_info_t task; +} arrival_record_t; + +typedef struct { + trace_header_t header; + task_info_t task; +} departure_record_t; + +typedef struct { + trace_header_t header; + task_info_t task; + task_info_t by; +} preemption_record_t; + +typedef struct { + trace_header_t header; + task_info_t task; +} scheduled_record_t; + +typedef struct { + trace_header_t header; + task_info_t task; + u16 period; + u16 wcet; +} release_record_t; + +typedef struct { + trace_header_t header; + task_info_t task; + u16 period; + u16 wcet; + int tardiness; +} completion_record_t; + +typedef struct { + trace_header_t header; + task_info_t task; +} cap_release_record_t; + +typedef struct { + trace_header_t header; + task_info_t task; + u16 budget; + u32 deadline; + pid_t donor; +} cap_allocation_record_t; + +#ifdef CONFIG_SCHED_TASK_TRACE +void sched_trace_scheduler_invocation(void); + +void sched_trace_task_arrival(struct task_struct *t); +void sched_trace_task_departure(struct task_struct *t); +void sched_trace_task_preemption(struct task_struct *t, + struct task_struct* by); +void sched_trace_task_scheduled(struct task_struct *); + +void sched_trace_job_release(struct task_struct *t); +void sched_trace_job_completion(struct task_struct *t); + +void sched_trace_capacity_release(struct task_struct *t); +void sched_trace_capacity_allocation(struct task_struct *t, + u16 budget, u32 deadline, pid_t donor); + +void sched_trace_capacity_alloc_srv(pid_t srv, u32 srv_dl, task_class_t cls, + u16 srv_budget, + u16 budget, u32 deadline, pid_t donor); + +void sched_trace_server_release(int id, unsigned int wcet, + unsigned int period, + task_class_t class); + +void sched_trace_server_completion(int id, unsigned int budget, + jiffie_t deadline, + task_class_t class); + +void sched_trace_server_scheduled(int id, task_class_t class, + unsigned int budget, jiffie_t deadline); + +#else +#define sched_trace_scheduler_invocation(x) + +#define sched_trace_task_arrival(t) +#define sched_trace_task_departure(t) +#define sched_trace_task_preemption(t, by) +#define sched_trace_task_scheduled(t) +#define sched_trace_job_release(t) +#define sched_trace_job_completion(t) +#define sched_trace_capacity_release(t) +#define sched_trace_capacity_allocation(t, budget, deadline, donor) +#define sched_trace_capacity_alloc_srv(srv, srv_dl, cls, srv_budget,\ + budget, deadline, donor) +#define 
sched_trace_server_release(id, wcet, period, class) +#define sched_trace_server_completion(id, budget, deadline, class) +#define sched_trace_server_scheduled(id, class, budget, deadline) +#endif + + +#ifdef CONFIG_SCHED_DEBUG_TRACE +void sched_trace_log_message(const char* fmt, ...); + +#else + +#define sched_trace_log_message(fmt, ...) + +#endif + + +#endif diff --git a/include/linux/trace.h b/include/linux/trace.h new file mode 100644 index 0000000..9e457aa --- /dev/null +++ b/include/linux/trace.h @@ -0,0 +1,74 @@ + +#ifndef _SYS_TRACE_H_ +#define _SYS_TRACE_H_ + +#include +#include + + +/*********************** TIMESTAMPS ************************/ + +struct timestamp { + unsigned long event; + unsigned long long timestamp; + unsigned int seq_no; + int cpu; +}; + + +/* buffer holding time stamps - will be provided by driver */ +extern struct ft_buffer* trace_ts_buf; + +/* save_timestamp: stores current time as struct timestamp + * in trace_ts_buf + */ +asmlinkage void save_timestamp(unsigned long event); + +#define TIMESTAMP(id) ft_event0(id, save_timestamp) + +/* Convention for timestamps + * ========================= + * + * In order to process the trace files with a common tool, we use the following + * convention to measure execution times: The end time id of a code segment is + * always the next number after the start time event id. + */ + +#define TS_SCHED_START TIMESTAMP(100) +#define TS_SCHED_END TIMESTAMP(101) +#define TS_CXS_START TIMESTAMP(102) +#define TS_CXS_END TIMESTAMP(103) + +#define TS_TICK_START TIMESTAMP(110) +#define TS_TICK_END TIMESTAMP(111) + +#define TS_PLUGIN_SCHED_START TIMESTAMP(120) +#define TS_PLUGIN_SCHED_END TIMESTAMP(121) + +#define TS_PLUGIN_TICK_START TIMESTAMP(130) +#define TS_PLUGIN_TICK_END TIMESTAMP(131) + +#define TS_ENTER_NP_START TIMESTAMP(140) +#define TS_ENTER_NP_END TIMESTAMP(141) + +#define TS_EXIT_NP_START TIMESTAMP(150) +#define TS_EXIT_NP_END TIMESTAMP(151) + +#define TS_SRP_UP_START TIMESTAMP(160) +#define TS_SRP_UP_END TIMESTAMP(161) +#define TS_SRP_DOWN_START TIMESTAMP(162) +#define TS_SRP_DOWN_END TIMESTAMP(163) + +#define TS_PI_UP_START TIMESTAMP(170) +#define TS_PI_UP_END TIMESTAMP(171) +#define TS_PI_DOWN_START TIMESTAMP(172) +#define TS_PI_DOWN_END TIMESTAMP(173) + +#define TS_FIFO_UP_START TIMESTAMP(180) +#define TS_FIFO_UP_END TIMESTAMP(181) +#define TS_FIFO_DOWN_START TIMESTAMP(182) +#define TS_FIFO_DOWN_END TIMESTAMP(183) + + + +#endif /* !_SYS_TRACE_H_ */ diff --git a/include/linux/wait.h b/include/linux/wait.h index e820d00..c7e96b6 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -161,6 +161,8 @@ wait_queue_head_t *FASTCALL(bit_waitqueue(void *, int)); #define wake_up_locked(x) __wake_up_locked((x), TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE) #define wake_up_interruptible_sync(x) __wake_up_sync((x),TASK_INTERRUPTIBLE, 1) +#define pi_wake_up(x) __pi_wake_up(x, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1, NULL) + #define __wait_event(wq, condition) \ do { \ DEFINE_WAIT(__wait); \ diff --git a/kernel/Makefile b/kernel/Makefile index 14f4d45..ce9dfa0 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -8,7 +8,12 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ signal.o sys.o kmod.o workqueue.o pid.o \ rcupdate.o extable.o params.o posix-timers.o \ kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ - hrtimer.o rwsem.o latency.o nsproxy.o srcu.o + hrtimer.o rwsem.o latency.o nsproxy.o srcu.o \ + sched_plugin.o litmus.o sched_trace.o \ + edf_common.o fifo_common.o pfair_common.o\ + 
sched_global_edf.o sched_part_edf.o sched_edf_hsb.o sched_pfair.o \ + sched_gsn_edf.o sched_psn_edf.o litmus_sem.o \ + trace.o ft_event.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-y += time/ diff --git a/kernel/edf_common.c b/kernel/edf_common.c new file mode 100644 index 0000000..fa83450 --- /dev/null +++ b/kernel/edf_common.c @@ -0,0 +1,299 @@ +/* + * kernel/edf_common.c + * + * Common functions for EDF based scheduler. + */ + +#include +#include +#include + +#include +#include +#include + +#include + + +static int dummy_resched(edf_domain_t *edf) +{ + return 0; +} + +void edf_domain_init(edf_domain_t *edf, edf_check_resched_needed_t f) +{ + BUG_ON(!edf); + if (!f) + f = dummy_resched; + INIT_LIST_HEAD(&edf->ready_queue); + INIT_LIST_HEAD(&edf->release_queue); + edf->ready_lock = RW_LOCK_UNLOCKED; + edf->release_lock = SPIN_LOCK_UNLOCKED; + edf->check_resched = f; +} + + +/* edf_higher_prio - returns true if first has a higher EDF priority + * than second. Deadline ties are broken by PID. + * + * first first must not be NULL and a real-time task. + * second may be NULL or a non-rt task. + */ +int edf_higher_prio(struct task_struct* first, + struct task_struct* second) +{ + struct task_struct *first_task = first; + struct task_struct *second_task = second; + + /* Check for inherited priorities. Change task + * used for comparison in such a case. + */ + if (first && first->rt_param.inh_task) + first_task = first->rt_param.inh_task; + if (second && second->rt_param.inh_task) + second_task = second->rt_param.inh_task; + + return + /* does the second task exist and is it a real-time task? If + * not, the first task (which is a RT task) has higher + * priority. + */ + !second_task || !is_realtime(second_task) || + + /* is the deadline of the first task earlier? + * Then it has higher priority. + */ + earlier_deadline(first_task, second_task) || + + /* Do we have a deadline tie? + * Then break by PID. + */ + (get_deadline(first_task) == get_deadline(second_task) && + (first_task->pid < second_task->pid || + + /* If the PIDs are the same then the task with the inherited + * priority wins. + */ + (first_task->pid == second_task->pid && + !second->rt_param.inh_task))); +} + + +/* add_ready - add a real-time task to the edf ready queue. It must be runnable. + * @new: the newly released task + */ +void __add_ready(edf_domain_t* edf, struct task_struct *new) +{ + struct list_head *pos; + struct task_struct *queued; + unsigned int passed = 0; + + BUG_ON(!new); + TRACE("edf: adding %s/%d (%u, %u) to ready queue\n", + new->comm, new->pid, get_exec_cost(new), get_rt_period(new)); + + /* find a spot where our deadline is earlier than the next */ + list_for_each(pos, &edf->ready_queue) { + queued = list_entry(pos, struct task_struct, rt_list); + if (unlikely(edf_higher_prio(new, queued))) { + /* the task at pos has a later deadline */ + /* insert the new task in front of it */ + __list_add(&new->rt_list, pos->prev, pos); + goto out; + } + passed++; + } + /* if we get to this point either the list is empty or new has the + * lowest priority. Let's add it to the end. 
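The boolean expression in edf_higher_prio() above encodes a deadline-first order with PID as the tie-break, after first substituting any inherited priority via inh_task. The patch-independent restatement below (plain C, toy types invented here) ignores the inheritance substitution and shows only the deadline/PID ordering.

#include <stdio.h>

struct toy_task {
    int pid;
    unsigned long deadline;   /* absolute deadline in quanta */
};

/* Non-zero iff a has higher EDF priority than b: earlier deadline wins,
 * equal deadlines fall back to the lower PID. edf_higher_prio() adds a
 * further tie-break for priority inheritance on top of this.
 */
static int toy_higher_prio(const struct toy_task *a, const struct toy_task *b)
{
    if (a->deadline != b->deadline)
        return a->deadline < b->deadline;
    return a->pid < b->pid;
}

int main(void)
{
    struct toy_task t1 = { .pid = 101, .deadline = 20 };
    struct toy_task t2 = { .pid =  99, .deadline = 20 };

    /* Same deadline, so the lower PID (99) wins: prints 1. */
    printf("%d\n", toy_higher_prio(&t2, &t1));
    return 0;
}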
*/ + list_add_tail(&new->rt_list, &edf->ready_queue); + out: + if (!passed) + edf->check_resched(edf); +} + +struct task_struct* __take_ready(edf_domain_t* edf) +{ + struct task_struct *t = __peek_ready(edf); + + /* kick it out of the ready list */ + if (t) + list_del(&t->rt_list); + return t; +} + + +struct task_struct* __peek_ready(edf_domain_t* edf) +{ + struct task_struct *t = NULL; + /* either not yet released, preempted, or non-rt */ + if (!list_empty(&edf->ready_queue)) + /* take next rt task */ + t = list_entry(edf->ready_queue.next, struct task_struct, + rt_list); + return t; +} + + +/* add_release - add a real-time task to the edf release queue. + * @task: the sleeping task + */ +void __add_release(edf_domain_t* edf, struct task_struct *task) +{ + struct list_head *pos; + struct task_struct *queued; + + BUG_ON(!task); + /* first we need the lock for edf_release_queue */ + TRACE("edf: adding %s/%d (%u, %u) to release queue\n", + task->comm, task->pid, get_exec_cost(task), get_rt_period(task)); + + /* find a spot where our deadline is earlier than the next */ + list_for_each_prev(pos, &edf->release_queue) { + queued = list_entry(pos, struct task_struct, rt_list); + if ((unlikely(earlier_release(queued, task)))) { + /* the task at pos has an earlier release */ + /* insert the new task in behind it */ + __list_add(&task->rt_list, pos, pos->next); + return; + } + } + /* if we get to this point either the list is empty or task has the + * earliest release. Let's add it to the front. */ + list_add(&task->rt_list, &edf->release_queue); +} + +void __release_pending(edf_domain_t* edf) +{ + struct list_head *pos, *save; + struct task_struct *queued; + list_for_each_safe(pos, save, &edf->release_queue) { + queued = list_entry(pos, struct task_struct, rt_list); + if (likely(is_released(queued))) { + /* this one is ready to go*/ + list_del(pos); + set_rt_flags(queued, RT_F_RUNNING); + + sched_trace_job_release(queued); + + /* now it can be picked up */ + barrier(); + add_ready(edf, queued); + } + else + /* the release queue is ordered */ + break; + } +} + +void try_release_pending(edf_domain_t* edf) +{ + unsigned long flags; + + if (spin_trylock_irqsave(&edf->release_lock, flags)) { + __release_pending(edf); + spin_unlock_irqrestore(&edf->release_lock, flags); + } +} + +void __prepare_new_release(struct task_struct *t, jiffie_t start) { + t->rt_param.times.deadline = start; + t->rt_param.stats.nontardy_jobs_ctr = 0xf0000000; + prepare_for_next_period(t); + set_rt_flags(t, RT_F_RUNNING); +} + +void prepare_for_next_period(struct task_struct *t) +{ + BUG_ON(!t); + /* update tardy job ctr */ + if (jiffies > t->rt_param.times.deadline) + t->rt_param.stats.nontardy_jobs_ctr = 0; + else + t->rt_param.stats.nontardy_jobs_ctr++; + /* prepare next release */ + t->rt_param.times.release = t->rt_param.times.deadline; + t->rt_param.times.deadline += get_rt_period(t); + t->rt_param.times.exec_time = 0; + t->time_slice = get_exec_cost(t); + + /* who uses this? statistics? 
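prepare_for_next_period() above advances the job window from the old deadline, not from the completion time, so a job that finishes early or late does not shift subsequent releases. A short numeric illustration (plain C, not patch code) for a task with exec cost 2 and period 10 whose current deadline is quantum 30:

#include <stdio.h>

int main(void)
{
    unsigned long period = 10, exec_cost = 2;
    unsigned long deadline = 30;          /* current job's deadline       */
    unsigned long release, time_slice;

    release    = deadline;                /* next release = old deadline  */
    deadline  += period;                  /* next deadline a period later */
    time_slice = exec_cost;               /* budget refilled to exec_cost;
                                             times.exec_time resets to 0  */

    /* prints: next job: release 30, deadline 40, budget 2 */
    printf("next job: release %lu, deadline %lu, budget %lu\n",
           release, deadline, time_slice);
    return 0;
}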
*/ + t->first_time_slice = 0; +} + +void prepare_new_releases(edf_domain_t *edf, jiffie_t start) +{ + unsigned long flags; + + spin_lock_irqsave(&edf->release_lock, flags); + write_lock(&edf->ready_lock); + + __prepare_new_releases(edf, start); + + write_unlock(&edf->ready_lock); + spin_unlock_irqrestore(&edf->release_lock, flags); +} + +void __prepare_new_releases(edf_domain_t *edf, jiffie_t start) +{ + + struct list_head tmp_list; + struct list_head *pos, *n; + struct task_struct *t; + + INIT_LIST_HEAD(&tmp_list); + + while (!list_empty(&edf->release_queue)) { + pos = edf->release_queue.next; + list_del(pos); + list_add(pos, &tmp_list); + } + while (!list_empty(&edf->ready_queue)) { + pos = edf->ready_queue.next; + list_del(pos); + list_add(pos, &tmp_list); + } + + list_for_each_safe(pos, n, &tmp_list) { + t = list_entry(pos, struct task_struct, rt_list); + list_del(pos); + __prepare_new_release(t, start); + __add_release(edf, t); + } + +} + +/* need_to_preempt - check whether the task t needs to be preempted + * call only with irqs disabled and with ready_lock acquired + */ +int preemption_needed(edf_domain_t* edf, struct task_struct *t) +{ + /* we need the read lock for edf_ready_queue */ + /* no need to preempt if there is nothing pending */ + if (list_empty(&edf->ready_queue)) + return 0; + /* we need to reschedule if t doesn't exist */ + if (!t) + return 1; + /* don't preempt if t is non-preemptable */ + if (!is_np(t)) + /* make sure to get non-rt stuff out of the way */ + return !is_realtime(t) || edf_higher_prio(next_ready(edf), t); + return 0; +} + + +/* + * Deactivate current task until the beginning of the next period. + */ +long edf_sleep_next_period(void) +{ + /* Mark that we do not excute anymore */ + set_rt_flags(current, RT_F_SLEEP); + /* call schedule, this will return when a new job arrives + * it also takes care of preparing for the next release + */ + sched_trace_job_completion(current); + schedule(); + return 0; +} + diff --git a/kernel/fifo_common.c b/kernel/fifo_common.c new file mode 100644 index 0000000..98186cd --- /dev/null +++ b/kernel/fifo_common.c @@ -0,0 +1,118 @@ +/* + * kernel/fifo_common.c + * + * Fifo helper functions. Could one day be a FIFO plugin if someone + * is interested. + * + * The current FIFO implementaion automatically chops Linux tasks into + * smaller jobs by assigning a fixed time slice. Once that time slice expires, + * it is treated as a new job release (that is queued in the back). + * + * The result is that it provides FIFO properties on a job level and round-robin + * on a task level if the tasks execute continuously. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include + +/* This function is defined in sched.c. We need access it for + * indirect switching. 
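A kernel-context sketch (not part of the patch) of how a plugin could drive the FIFO helper declared in include/linux/fifo_common.h and implemented below; the be_* names and the 50-quantum slice are invented here for illustration.

/* Kernel-context sketch: a best-effort domain built on the FIFO helper. */
#include <linux/sched.h>
#include <linux/sched_plugin.h>
#include <linux/fifo_common.h>

static fifo_domain_t be_domain;

void be_init(void)
{
    /* Each best-effort "job" gets a 50-quantum slice; when it expires,
     * fifo_enqueue() treats the task as a fresh release and requeues it
     * at the tail, yielding round-robin behavior across tasks.
     */
    fifo_domain_init(&be_domain, 50);
}

void be_requeue(struct task_struct *t)
{
    fifo_enqueue(&be_domain, t);
}

struct task_struct *be_pick_next(void)
{
    /* Returns NULL when no best-effort work is pending. */
    return fifo_jobs_pending(&be_domain) ? fifo_take(&be_domain) : NULL;
}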
+ */ +void __activate_task(struct task_struct *p, runqueue_t *rq); + +void fifo_domain_init(fifo_domain_t* fifo, unsigned int exec_budget) +{ + INIT_LIST_HEAD(&fifo->queue); + atomic_set(&fifo->count, 0); + fifo->time_slice = exec_budget; + fifo->lock = SPIN_LOCK_UNLOCKED; +} + +void fifo_add(fifo_domain_t* fifo, struct task_struct* task) +{ + unsigned long flags; + + spin_lock_irqsave(&fifo->lock, flags); + + list_add_tail(&task->run_list, &fifo->queue); + atomic_inc(&fifo->count); + + spin_unlock_irqrestore(&fifo->lock, flags); +} + +void lifo_add(fifo_domain_t* fifo, struct task_struct* task) +{ + unsigned long flags; + + spin_lock_irqsave(&fifo->lock, flags); + + list_add(&task->run_list, &fifo->queue); + atomic_inc(&fifo->count); + + spin_unlock_irqrestore(&fifo->lock, flags); +} + +/* This is a best-effort attempt at maintaining FIFO order. + * If we re-add a task comming from a preemption, it should go to + * the front as it arived early than the other queued tasks. + * Of course, this is not guaranteed to work correctly. Right now, + * it is only used for best-effort jobs, so it doesn't really matter + * all that much. A correct implementation would have to maintain + * arrival times and perform cross-processor preemptions... + */ +void fifo_enqueue(fifo_domain_t* fifo, struct task_struct* task) +{ + task->array = NULL; + + if (!task->time_slice) { + task->time_slice = fifo->time_slice; + sched_trace_job_release(task); + fifo_add(fifo, task); + } else + lifo_add(fifo, task); +} + +struct task_struct* __fifo_take(fifo_domain_t* fifo) +{ + struct task_struct * task = NULL; + + if (atomic_read(&fifo->count)) { + BUG_ON(list_empty(&fifo->queue)); + task = list_entry(fifo->queue.next, struct task_struct, + run_list); + list_del(fifo->queue.next); + atomic_dec(&fifo->count); + } + + return task; +} + +struct task_struct* fifo_take(fifo_domain_t* fifo) +{ + unsigned long flags; + struct task_struct* t; + + spin_lock_irqsave(&fifo->lock, flags); + t = __fifo_take(fifo); + spin_unlock_irqrestore(&fifo->lock, flags); + return t; +} + + +struct task_struct* fifo_take_rq(fifo_domain_t* fifo, runqueue_t* rq, int cpu) +{ + struct task_struct *task = fifo_take(fifo); + + if (task) { + set_task_cpu(task, cpu); + __activate_task(task, rq); + } + return task; +} diff --git a/kernel/fork.c b/kernel/fork.c index d57118d..6874058 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -57,6 +57,9 @@ #include #include +#include +#include + /* * Protected counters by write_lock_irq(&tasklist_lock) */ @@ -118,6 +121,9 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); + if (is_realtime(tsk)) + curr_sched_plugin->tear_down(tsk); + security_task_free(tsk); free_uid(tsk->user); put_group_info(tsk->group_info); diff --git a/kernel/ft_event.c b/kernel/ft_event.c new file mode 100644 index 0000000..10318ee --- /dev/null +++ b/kernel/ft_event.c @@ -0,0 +1,104 @@ +#include + +#include + +/* the feather trace management functions assume + * exclusive access to the event table + */ + + +#define BYTE_JUMP 0xeb +#define BYTE_JUMP_LEN 0x02 + +/* for each event, there is an entry in the event table */ +struct trace_event { + long id; + long count; + long start_addr; + long end_addr; +}; + +extern struct trace_event __start___event_table[]; +extern struct trace_event __stop___event_table[]; + +int ft_enable_event(unsigned long id) +{ + struct trace_event* te = __start___event_table; + int count = 0; + char* delta; + unsigned char* instr; + + while (te < 
__stop___event_table) { + if (te->id == id && ++te->count == 1) { + instr = (unsigned char*) te->start_addr; + /* make sure we don't clobber something wrong */ + if (*instr == BYTE_JUMP) { + delta = (((unsigned char*) te->start_addr) + 1); + *delta = 0; + } + } + if (te->id == id) + count++; + te++; + } + return count; +} + +int ft_disable_event(unsigned long id) +{ + struct trace_event* te = __start___event_table; + int count = 0; + char* delta; + unsigned char* instr; + + while (te < __stop___event_table) { + if (te->id == id && --te->count == 0) { + instr = (unsigned char*) te->start_addr; + if (*instr == BYTE_JUMP) { + delta = (((unsigned char*) te->start_addr) + 1); + *delta = te->end_addr - te->start_addr - + BYTE_JUMP_LEN; + } + } + if (te->id == id) + count++; + te++; + } + return count; +} + +int ft_disable_all_events(void) +{ + struct trace_event* te = __start___event_table; + int count = 0; + char* delta; + unsigned char* instr; + + while (te < __stop___event_table) { + if (te->count) { + instr = (unsigned char*) te->start_addr; + if (*instr == BYTE_JUMP) { + delta = (((unsigned char*) te->start_addr) + + 1); + *delta = te->end_addr - te->start_addr - + BYTE_JUMP_LEN; + te->count = 0; + count++; + } + } + te++; + } + return count; +} + +int ft_is_event_enabled(unsigned long id) +{ + struct trace_event* te = __start___event_table; + + while (te < __stop___event_table) { + if (te->id == id) + return te->count; + te++; + } + return 0; +} diff --git a/kernel/litmus.c b/kernel/litmus.c new file mode 100644 index 0000000..02d6851 --- /dev/null +++ b/kernel/litmus.c @@ -0,0 +1,523 @@ +/* litmus.c -- Implementation of the LITMUS syscalls, the LITMUS intialization, + * and the common tick function. + */ +#include +#include + +#include +#include +#include +#include + +#include + +/* Variables that govern the scheduling process */ +spolicy sched_policy = SCHED_DEFAULT; +int sched_options = 0; + +/* avoid races with multiple task wake-ups */ +spinlock_t litmus_task_set_lock = SPIN_LOCK_UNLOCKED; + +/* This is a flag for switching the system into RT mode when it is booted up + * In RT-mode non-realtime tasks are shut down and scheduled as spare + * time available + */ + +/* The system is booting in non-realtime mode */ +atomic_t rt_mode = ATOMIC_INIT(MODE_NON_RT); +/* Here we specify a mode change to be made */ +atomic_t new_mode = ATOMIC_INIT(MODE_NON_RT); +/* Number of RT tasks that exist in the system */ +atomic_t n_rt_tasks = ATOMIC_INIT(0); + +/* Only one process can perform mode change */ +static queuelock_t mode_change_lock; + +/* A time instant when we switched to RT mode */ +volatile jiffie_t rt_start_time = 0; + +/** + * sys_set_rt_mode + * @newmode: new mode the scheduler must be switched to + * External syscall for setting the RT mode flag + * Returns EINVAL if mode is not recognized or mode transition is + * not permitted + * On success 0 is returned + * + * FIXME: In a "real" OS we cannot just let any user switch the mode... + */ +asmlinkage long sys_set_rt_mode(int newmode) +{ + if ((newmode == MODE_NON_RT) || (newmode == MODE_RT_RUN)) { + printk(KERN_INFO "real-time mode switch to %s\n", + (newmode == MODE_RT_RUN ? 
"rt" : "non-rt")); + atomic_set(&new_mode, newmode); + return 0; + } + return -EINVAL; +} + +/* + * sys_set_task_rt_param + * @pid: Pid of the task which scheduling parameters must be changed + * @param: New real-time extension parameters such as the execution cost and + * period + * Syscall for manipulating with task rt extension params + * Returns EFAULT if param is NULL. + * ESRCH if pid is not corrsponding + * to a valid task. + * EINVAL if either period or execution cost is <=0 + * 0 if success + * + * FIXME: This code is racy during real-time mode. + */ +asmlinkage long sys_set_rt_task_param(pid_t pid, rt_param_t __user * param) +{ + rt_param_t tp; + struct task_struct *target; + int retval = -EINVAL; + + printk("Setting up rt task parameters for process %d.\n", pid); + + if (pid < 0 || param == 0) { + goto out; + } + if (copy_from_user(&tp, param, sizeof(tp))) { + retval = -EFAULT; + goto out; + } + + /* Task search and manipulation must be protected */ + read_lock_irq(&tasklist_lock); + if (!(target = find_task_by_pid(pid))) { + retval = -ESRCH; + goto out_unlock; + } + if (tp.exec_cost <= 0) + goto out_unlock; + if (tp.period <= 0) + goto out_unlock; + if (!cpu_online(tp.cpu)) + goto out_unlock; + if (tp.period < tp.exec_cost) + { + printk(KERN_INFO "litmus: real-time task %d rejected " + "because wcet > period\n", pid); + goto out_unlock; + } + + /* Assign params */ + target->rt_param.basic_params = tp; + + retval = 0; + out_unlock: + read_unlock_irq(&tasklist_lock); + out: + return retval; +} + +/* Getter of task's RT params + * returns EINVAL if param or pid is NULL + * returns ESRCH if pid does not correspond to a valid task + * returns EFAULT if copying of parameters has failed. + */ +asmlinkage long sys_get_rt_task_param(pid_t pid, rt_param_t __user * param) +{ + int retval = -EINVAL; + struct task_struct *source; + rt_param_t lp; + if (param == 0 || pid < 0) + goto out; + read_lock(&tasklist_lock); + if (!(source = find_task_by_pid(pid))) { + retval = -ESRCH; + goto out_unlock; + } + lp = source->rt_param.basic_params; + read_unlock(&tasklist_lock); + /* Do copying outside the lock */ + retval = + copy_to_user(param, &lp, sizeof(lp)) ? -EFAULT : 0; + return retval; + out_unlock: + read_unlock(&tasklist_lock); + out: + return retval; + +} + +/* + * sys_prepare_rt_task + * @pid: Pid of the task we want to prepare for RT mode + * Syscall for adding a task to RT queue, plugin dependent. + * Must be called before RT tasks are going to start up. + * Returns EPERM if current plugin does not define prepare operation + * or scheduling policy does not allow the operation. + * ESRCH if pid does not correspond to a valid task. 
+ * EINVAL if a task is non-realtime or in invalid state + * from underlying plugin function + * EAGAIN if a task is not in the right state + * ENOMEM if there is no memory space to handle this task + * 0 if success + */ +asmlinkage long sys_prepare_rt_task(pid_t pid) +{ + int retval = -EINVAL; + struct task_struct *target = 0; + /* If a plugin does not define preparation mode then nothing to do */ + if (curr_sched_plugin->prepare_task == 0 + || sched_policy == SCHED_DEFAULT) { + retval = -EPERM; + goto out_prepare; + } + read_lock_irq(&tasklist_lock); + if (!(target = find_task_by_pid(pid))) { + retval = -ESRCH; + goto out_prepare_unlock; + } + if (!cpu_online(get_partition(target))) + { + printk(KERN_WARNING "litmus prepare: cpu %d is not online\n", + get_partition(target)); + goto out_prepare_unlock; + } + retval = curr_sched_plugin->prepare_task(target); + if (!retval) { + atomic_inc(&n_rt_tasks); + target->rt_param.is_realtime = 1; + } + out_prepare_unlock: + read_unlock_irq(&tasklist_lock); + out_prepare: + return retval; +} + +/* implemented in kernel/litmus_sem.c */ +void srp_ceiling_block(void); + +/* + * This is the crucial function for periodic task implementation, + * It checks if a task is periodic, checks if such kind of sleep + * is permitted and calls plugin-specific sleep, which puts the + * task into a wait array. + * returns 0 on successful wakeup + * returns EPERM if current conditions do not permit such sleep + * returns EINVAL if current task is not able to go to sleep + */ +asmlinkage long sys_sleep_next_period(void) +{ + int retval = -EPERM; + if (!is_realtime(current)) { + retval = -EINVAL; + goto out; + } + /* Task with negative or zero period cannot sleep */ + if (get_rt_period(current) <= 0) { + retval = -EINVAL; + goto out; + } + /* The plugin has to put the task into an + * appropriate queue and call schedule + */ + retval = curr_sched_plugin->sleep_next_period(); + if (!retval && is_subject_to_srp(current)) + srp_ceiling_block(); + out: + return retval; +} + +/* + * sys_enter_np() allows real-time tasks to request to enter a + * non-preemptable section. + * returns 0 if the request was granted. + * returns EPERM if current scheduler plugin does not allow the task to + * enter a non-preemptable section + * returns EINVAL if current task is not a real-time task + */ +asmlinkage long sys_enter_np(void) +{ + int retval = -EINVAL; + preempt_disable(); + TS_ENTER_NP_START; + if (!is_realtime(current)) + goto out; + /* Let the plugin decide. The default callback will return -EPERM. + */ + retval = curr_sched_plugin->enter_np(current); + TRACE("enter_np(%s/%d) => %d and np=%d\n", + current->comm, current->pid, retval, is_np(current)); + out: + TS_ENTER_NP_END; + preempt_enable(); + return retval; +} + +/* + * sys_exit_np() allows real-time tasks to signal that they leave a + * non-preemptable section. + * returns 0 if the signal was valid and processed. + * returns EPERM if current scheduler plugin does not allow the task to + * exit a non-preemptable section at the current time + * returns EINVAL if current task is not a real-time task + */ +asmlinkage long sys_exit_np(void) +{ + int retval = -EINVAL; + preempt_disable(); + TS_EXIT_NP_START; + if (!is_realtime(current)) + goto out; + /* Let the plugin decide. The default callback will return -EPERM. 
+ */ + retval = curr_sched_plugin->exit_np(current); + TRACE("exit_np(%s/%d) => %d and np=%d\n", + current->comm, current->pid, retval, is_np(current)); + out: + TS_EXIT_NP_END; + preempt_enable(); + return retval; +} + + +/* Set scheduling options for all cpus. */ +void set_sched_options(int options) +{ + sched_options = options; +} + +/* The LITMUS tick function. It manages the change to and from real-time mode + * and then calls the plugin's tick function. + */ +reschedule_check_t __sched rt_scheduler_tick(void) +{ + /* Check for mode change */ + if ((get_rt_mode() != atomic_read(&new_mode))) { + queue_lock(&mode_change_lock); + // If the mode is already changed, proceed + if (get_rt_mode() == atomic_read(&new_mode)) { + queue_unlock(&mode_change_lock); + goto proceed; + } + // change the mode + if ((atomic_read(&new_mode) == MODE_RT_RUN)) { + /* The deferral of entering real-time mode should be + * handled by deferring task releases in the plugin. + * The plugin interface does not really need to know + * about quanta, that is the plugin's job. + */ + + /* update rt start time */ + rt_start_time = jiffies; + printk(KERN_INFO "Real-Time mode enabled\n"); + } + if (curr_sched_plugin->mode_change) + curr_sched_plugin-> + mode_change(atomic_read(&new_mode)); + set_rt_mode(atomic_read(&new_mode)); + queue_unlock(&mode_change_lock); + } + + proceed: + /* Call plugin-defined tick handler + * + * It is the plugin's tick handler' job to detect quantum + * boundaries in pfair. + */ + return curr_sched_plugin->algo_scheduler_tick(); +} + +asmlinkage spolicy sys_sched_setpolicy(spolicy newpolicy) +{ + /* Dynamic policy change is disabled at the moment */ + return SCHED_INVALID; +} + +asmlinkage spolicy sys_sched_getpolicy(void) +{ + return sched_policy; +} + + +asmlinkage int sys_scheduler_setup(int cmd, void __user *parameter) +{ + return curr_sched_plugin->scheduler_setup(cmd, parameter); +} + +#ifdef CONFIG_MAGIC_SYSRQ +/* We offer the possibility to change the real-time mode of the system + * with a magic sys request. This helps in debugging in case the system fails + * to perform its planned switch back to normal mode. This may happen if we have + * total system utilization and the task that is supposed to do the switch is + * always preempted (if it is not a real-time task). + */ + +int sys_kill(int pid, int sig); + +static void sysrq_handle_toGgle_rt_mode(int key, struct tty_struct *tty) +{ + sys_set_rt_mode(get_rt_mode() == MODE_NON_RT); +} + +static struct sysrq_key_op sysrq_toGgle_rt_mode_op = { + .handler = sysrq_handle_toGgle_rt_mode, + .help_msg = "toGgle-rt-mode", + .action_msg = "real-time mode changed", +}; + +static void sysrq_handle_kill_rt_tasks(int key, struct tty_struct *tty) +{ + struct task_struct *t; + read_lock(&tasklist_lock); + for_each_process(t) { + if (is_realtime(t)) { + sys_kill(t->pid, SIGKILL); + } + } + read_unlock(&tasklist_lock); +} + +static struct sysrq_key_op sysrq_kill_rt_tasks_op = { + .handler = sysrq_handle_kill_rt_tasks, + .help_msg = "Quit-rt-tasks", + .action_msg = "sent SIGKILL to all real-time tasks", +}; +#endif + +/* + * Scheduler initialization so that customized scheduler is + * enabled at boot time + * by setting boot option "rtsched=plugin_name", e.g. "rtsched=pfair" + */ + +/* All we need to know about other plugins is their initialization + * functions. These functions initialize internal data structures of a + * scheduler and return a pointer to initialized sched_plugin data + * structure with pointers to scheduling function implementations. 
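+ * A minimal sketch of such an init function is shown below. The field
+ * names follow the ones used elsewhere in this file; the callbacks and
+ * the plugin itself are hypothetical and the block is illustrative
+ * only, not part of the build.
+ */
+#if 0 /* illustration only */
+static sched_plugin_t demo_plugin = {
+	.plugin_name  = "DEMO",
+	.schedule     = demo_schedule,		/* hypothetical callbacks */
+	.prepare_task = demo_prepare_task,
+	/* callbacks left NULL are later replaced with litmus_dummy_*()
+	 * by the CHECK() macro in boot_sched_setup() below */
+};
+
+sched_plugin_t *init_demo_plugin(void)
+{
+	/* set up internal queues and locks once, then always hand back
+	 * the same descriptor */
+	return &demo_plugin;
+}
+#endif
+
+/* Note: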
+ * If called repeatedly these init functions just return an existing + * plugin pointer. + */ +sched_plugin_t *init_global_edf_plugin(void); +sched_plugin_t *init_global_edf_np_plugin(void); +sched_plugin_t *init_part_edf_plugin(void); +sched_plugin_t *init_edf_hsb_plugin(void); +sched_plugin_t *init_pfair_plugin(void); +sched_plugin_t *init_gsn_edf_plugin(void); +sched_plugin_t *init_psn_edf_plugin(void); + +/* keep everything needed to setup plugins in one place */ + +/* we are lazy, so we use a convention for function naming to fill + * a table + */ +#define PLUGIN(caps, small) \ + {PLUGIN_ ## caps, SCHED_ ## caps, init_ ## small ## _plugin} + +#define init_nosetup_plugin 0 + +static struct { + const char *name; + const spolicy policy_id; + sched_plugin_t *(*init) (void); +} available_plugins[] = { + PLUGIN(LINUX, nosetup), + PLUGIN(GLOBAL_EDF_NP, global_edf_np), + PLUGIN(GLOBAL_EDF, global_edf), + PLUGIN(PART_EDF, part_edf), + PLUGIN(EDF_HSB, edf_hsb), + PLUGIN(PFAIR, pfair), + PLUGIN(GSN_EDF, gsn_edf), + PLUGIN(PSN_EDF, psn_edf), + + /********************************************* + * Add your custom plugin here + **********************************************/ +}; + +/* Some plugins may leave important functions unused. We define dummies + * so that we don't have to check for null pointers all over the place. + */ +void litmus_dummy_finish_switch(struct task_struct * prev); +int litmus_dummy_schedule(struct task_struct * prev, struct task_struct** next, + runqueue_t* q); +reschedule_check_t litmus_dummy_scheduler_tick(void); +long litmus_dummy_prepare_task(struct task_struct *t); +void litmus_dummy_wake_up_task(struct task_struct *task); +void litmus_dummy_task_blocks(struct task_struct *task); +long litmus_dummy_tear_down(struct task_struct *task); +int litmus_dummy_scheduler_setup(int cmd, void __user *parameter); +long litmus_dummy_sleep_next_period(void); +long litmus_dummy_enter_np(struct task_struct *task); +long litmus_dummy_exit_np(struct task_struct *task); +long litmus_dummy_inherit_priority(struct pi_semaphore *sem, + struct task_struct *new_owner); +long litmus_dummy_return_priority(struct pi_semaphore *sem); +long litmus_dummy_pi_block(struct pi_semaphore *sem, + struct task_struct *t); + +#define CHECK(func) {\ + if (!curr_sched_plugin->func) \ + curr_sched_plugin->func = litmus_dummy_ ## func;} + +static int boot_sched_setup(char *plugin_name) +{ + int i = 0; + + /* Common initializers, + * mode change lock is used to enforce single mode change + * operation. + */ + queue_lock_init(&mode_change_lock); + + printk("Starting LITMUS^RT kernel\n"); + + /* Look for a matching plugin. 
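+ * The plugin name is taken from the "rtsched=" boot parameter (see the
+ * __setup() call at the end of this file) and is compared against the
+ * name strings recorded in available_plugins[] above.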
+ */ + for (i = 0; i < ARRAY_SIZE(available_plugins); i++) { + if (!strcmp(plugin_name, available_plugins[i].name)) { + printk("Using %s scheduler plugin\n", plugin_name); + sched_policy = available_plugins[i].policy_id; + if (available_plugins[i].init) + curr_sched_plugin = available_plugins[i].init(); + goto out; + } + } + + + /* Otherwise we have default linux scheduler */ + printk("Plugin name %s is unknown, using default %s\n", plugin_name, + curr_sched_plugin->plugin_name); + +out: + /* make sure we don't trip over null pointers later */ + CHECK(finish_switch); + CHECK(schedule); + CHECK(scheduler_tick); + CHECK(wake_up_task); + CHECK(tear_down); + CHECK(task_blocks); + CHECK(prepare_task); + CHECK(scheduler_setup); + CHECK(sleep_next_period); + CHECK(enter_np); + CHECK(exit_np); + CHECK(inherit_priority); + CHECK(return_priority); + CHECK(pi_block); + +#ifdef CONFIG_MAGIC_SYSRQ + /* offer some debugging help */ + if (!register_sysrq_key('g', &sysrq_toGgle_rt_mode_op)) + printk("Registered eXit real-time mode magic sysrq.\n"); + else + printk("Could not register eXit real-time mode magic sysrq.\n"); + if (!register_sysrq_key('q', &sysrq_kill_rt_tasks_op)) + printk("Registered kill rt tasks magic sysrq.\n"); + else + printk("Could not register kill rt tasks magic sysrq.\n"); +#endif + printk("Litmus setup complete."); + return 1; +} + +/* Register for boot option */ +__setup("rtsched=", boot_sched_setup); diff --git a/kernel/litmus_sem.c b/kernel/litmus_sem.c new file mode 100644 index 0000000..71233cc --- /dev/null +++ b/kernel/litmus_sem.c @@ -0,0 +1,755 @@ + +/* + * SMP- and interrupt-safe semaphores. Also PI and SRP implementations. + * Much of the code here is borrowed from include/asm-i386/semaphore.h. + * + * NOTE: This implementation is very much a prototype and horribly insecure. It + * is intended to be a proof of concept, not a feature-complete solution. 
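+ *
+ * A typical (hypothetical) userspace sequence, using thin wrappers
+ * around the syscalls defined below, would be:
+ *
+ *	id = sema_init();	allocate a free semaphore slot
+ *	down(id);		block, in FIFO order, until the lock is free
+ *	... critical section ...
+ *	up(id);			wake the next waiter, if any
+ *	sema_free(id);		wake all remaining waiters, release the slot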
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include +/* ************************************************************************** */ +/* STANDARD FIFO SEMAPHORES */ +/* ************************************************************************** */ + +#define MAX_SEMAPHORES 256 + +struct semaphore sems[MAX_SEMAPHORES]; /* all sems */ +typedef int sema_id; /* Userspace ID of a semaphore */ + +static int rt_fifo_wake_up(wait_queue_t *wait, unsigned mode, int sync, + void *key) +{ + struct task_struct* t = (struct task_struct*) wait->private; + set_rt_flags(t, RT_F_EXIT_SEM); + TRACE_TASK(t, "woken up by rt_fifo_wake_up(), set RT_F_EXIT_SEM\n"); + default_wake_function(wait, mode, sync, key); + /* for reason why we always return 1 see rt_pi_wake_up() below */ + return 1; +} + +static fastcall void rt_fifo_up(struct semaphore * sem) +{ + TRACE_CUR("releases lock %p\n"); + preempt_disable(); + TS_FIFO_UP_START; + if (atomic_inc_return(&sem->count) < 1) + /* there is a task queued */ + wake_up(&sem->wait); + TS_FIFO_UP_END; + preempt_enable(); +} + +/* not optimized like the Linux down() implementation, but then + * again we incur the cost of a syscall anyway, so this hardly matters + */ +static fastcall void rt_fifo_down(struct semaphore * sem) +{ + struct task_struct *tsk = current; + wait_queue_t wait = { + .private = tsk, + .func = rt_fifo_wake_up, + .task_list = {NULL, NULL} + }; + + preempt_disable(); + TS_FIFO_DOWN_START; + + spin_lock(&sem->wait.lock); + if (atomic_dec_return(&sem->count) < 0 || + waitqueue_active(&sem->wait)) { + /* we need to suspend */ + tsk->state = TASK_UNINTERRUPTIBLE; + add_wait_queue_exclusive_locked(&sem->wait, &wait); + + TRACE_CUR("suspends on lock %p\n", sem); + + /* release lock before sleeping */ + spin_unlock(&sem->wait.lock); + + TS_FIFO_DOWN_END; + preempt_enable(); + + /* we depend on the FIFO order + * Thus, we don't need to recheck when we wake up, we + * are guaranteed to have the lock since there is only one + * wake up per release + */ + schedule(); + + TRACE_CUR("woke up, now owns lock %p\n", sem); + + /* try_to_wake_up() set our state to TASK_RUNNING, + * all we need to do is to remove our wait queue entry + */ + spin_lock(&sem->wait.lock); + remove_wait_queue_locked(&sem->wait, &wait); + spin_unlock(&sem->wait.lock); + } else { + TRACE_CUR("acquired lock %p, no contention\n", sem); + spin_unlock(&sem->wait.lock); + TS_FIFO_DOWN_END; + preempt_enable(); + } +} + + + +/* Initialize semaphores at boot time. */ +static int __init sema_boot_init(void) +{ + sema_id sem_id; + + printk("Initializing semaphores..."); + for (sem_id = 0; sem_id < MAX_SEMAPHORES; sem_id++) + sems[sem_id].used = 0; + printk(" done!\n"); + + return 0; +} +__initcall(sema_boot_init); + +/* Find a free semaphore and return. */ +asmlinkage long sys_sema_init (void) +{ + sema_id sem_id; + + for (sem_id = 0; sem_id < MAX_SEMAPHORES; sem_id++) { + if (!cmpxchg(&sems[sem_id].used, 0, 1)) { + sema_init(&sems[sem_id], 1); + return sem_id; + } + } + return -ENOMEM; +} + +asmlinkage long sys_down(sema_id sem_id) +{ + if (sem_id < 0 || sem_id >= MAX_SEMAPHORES) + return -EINVAL; + + if (!sems[sem_id].used) + return -EINVAL; + /* This allows for FIFO sems and gives others a chance... 
*/ + rt_fifo_down(sems + sem_id); + return 0; +} + +asmlinkage long sys_up(sema_id sem_id) +{ + if (sem_id < 0 || sem_id >= MAX_SEMAPHORES) + return -EINVAL; + + if (!sems[sem_id].used) + return -EINVAL; + rt_fifo_up(sems + sem_id); + return 0; +} + +asmlinkage long sys_sema_free(sema_id sem_id) +{ + struct list_head *tmp, *next; + unsigned long flags; + + if (sem_id < 0 || sem_id >= MAX_SEMAPHORES) + return -EINVAL; + + if (!sems[sem_id].used) + return -EINVAL; + + spin_lock_irqsave(&sems[sem_id].wait.lock, flags); + if (waitqueue_active(&sems[sem_id].wait)) { + list_for_each_safe(tmp, next, &sems[sem_id].wait.task_list) { + wait_queue_t *curr = list_entry(tmp, wait_queue_t, + task_list); + list_del(tmp); + set_rt_flags((struct task_struct*)curr->private, + RT_F_EXIT_SEM); + curr->func(curr, + TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, + 0, NULL); + } + } + + spin_unlock_irqrestore(&sems[sem_id].wait.lock, flags); + sems[sem_id].used = 0; + + return 0; +} + + + + +/* ************************************************************************** */ +/* PRIORITY INHERITANCE */ +/* ************************************************************************** */ + + +#define MAX_PI_SEMAPHORES 256 + +struct pi_semaphore pi_sems[MAX_PI_SEMAPHORES]; /* all PI sems */ +typedef int pi_sema_id; /* Userspace ID of a pi_semaphore */ + +struct wq_pair { + struct task_struct* tsk; + struct pi_semaphore* sem; +}; + +static int rt_pi_wake_up(wait_queue_t *wait, unsigned mode, int sync, + void *key) +{ + struct wq_pair* wqp = (struct wq_pair*) wait->private; + set_rt_flags(wqp->tsk, RT_F_EXIT_SEM); + curr_sched_plugin->inherit_priority(wqp->sem, wqp->tsk); + TRACE_TASK(wqp->tsk, + "woken up by rt_pi_wake_up() (RT_F_SEM_EXIT, PI)\n"); + /* point to task for default_wake_function() */ + wait->private = wqp->tsk; + default_wake_function(wait, mode, sync, key); + + /* Always return true since we know that if we encountered a task + * that was already running the wake_up raced with the schedule in + * rt_pi_down(). In that case the task in rt_pi_down() will be scheduled + * immediately and own the lock. We must not wake up another task in + * any case. + */ + return 1; +} + + +/* caller is responsible for locking */ +int edf_set_hp_task(struct pi_semaphore *sem) +{ + struct list_head *tmp, *next; + struct task_struct *queued; + int ret = 0; + + sem->hp.task = NULL; + list_for_each_safe(tmp, next, &sem->wait.task_list) { + queued = ((struct wq_pair*) + list_entry(tmp, wait_queue_t, + task_list)->private)->tsk; + + /* Compare task prios, find high prio task. */ + if (edf_higher_prio(queued, sem->hp.task)) { + sem->hp.task = queued; + ret = 1; + } + } + return ret; +} + + +/* caller is responsible for locking */ +int edf_set_hp_cpu_task(struct pi_semaphore *sem, int cpu) +{ + struct list_head *tmp, *next; + struct task_struct *queued; + int ret = 0; + + sem->hp.cpu_task[cpu] = NULL; + list_for_each_safe(tmp, next, &sem->wait.task_list) { + queued = ((struct wq_pair*) + list_entry(tmp, wait_queue_t, + task_list)->private)->tsk; + + /* Compare task prios, find high prio task. */ + if (get_partition(queued) == cpu && + edf_higher_prio(queued, sem->hp.cpu_task[cpu])) { + sem->hp.cpu_task[cpu] = queued; + ret = 1; + } + } + return ret; +} + + +/* Initialize PI semaphores at boot time. 
*/ +static int __init pi_sema_boot_init(void) +{ + pi_sema_id sem_id; + + printk("Initializing PI semaphores..."); + for (sem_id = 0; sem_id < MAX_PI_SEMAPHORES; sem_id++) + pi_sems[sem_id].used = 0; + printk(" done!\n"); + + return 0; +} +__initcall(pi_sema_boot_init); + +/* Find a free semaphore and return. */ +asmlinkage long sys_pi_sema_init (void) +{ + pi_sema_id sem_id; + int i = 0; + + for (sem_id = 0; sem_id < MAX_PI_SEMAPHORES; sem_id++) { + if (!cmpxchg(&pi_sems[sem_id].used, 0, 1)) { + atomic_set(&pi_sems[sem_id].count, 1); + pi_sems[sem_id].sleepers = 0; + init_waitqueue_head(&pi_sems[sem_id].wait); + pi_sems[sem_id].hp.task = NULL; + pi_sems[sem_id].holder = NULL; + for (i = 0; i < NR_CPUS; i++) + pi_sems[sem_id].hp.cpu_task[i] = NULL; + return sem_id; + } + } + return -ENOMEM; +} + +asmlinkage long sys_pi_down(pi_sema_id sem_id) +{ + struct pi_semaphore * sem; + unsigned long flags; + struct task_struct *tsk = current; + struct wq_pair pair; + long ret = -EINVAL; + wait_queue_t wait = { + .private = &pair, + .func = rt_pi_wake_up, + .task_list = {NULL, NULL} + }; + + preempt_disable(); + TS_PI_DOWN_START; + + if (sem_id < 0 || sem_id >= MAX_PI_SEMAPHORES) + goto out; + + if (!pi_sems[sem_id].used) + goto out; + + sem = pi_sems + sem_id; + pair.tsk = tsk; + pair.sem = sem; + spin_lock_irqsave(&sem->wait.lock, flags); + + if (atomic_dec_return(&sem->count) < 0 || + waitqueue_active(&sem->wait)) { + /* we need to suspend */ + tsk->state = TASK_UNINTERRUPTIBLE; + add_wait_queue_exclusive_locked(&sem->wait, &wait); + + TRACE_CUR("suspends on PI lock %p\n", sem); + curr_sched_plugin->pi_block(sem, tsk); + + /* release lock before sleeping */ + spin_unlock_irqrestore(&sem->wait.lock, flags); + + TS_PI_DOWN_END; + preempt_enable(); + + + /* we depend on the FIFO order + * Thus, we don't need to recheck when we wake up, we + * are guaranteed to have the lock since there is only one + * wake up per release + */ + schedule(); + + TRACE_CUR("woke up, now owns PI lock %p\n", sem); + + /* try_to_wake_up() set our state to TASK_RUNNING, + * all we need to do is to remove our wait queue entry + */ + remove_wait_queue(&sem->wait, &wait); + } else { + /* no priority inheritance necessary, since there are no queued + * tasks. + */ + TRACE_CUR("acquired PI lock %p, no contention\n", sem); + sem->holder = tsk; + sem->hp.task = tsk; + curr_sched_plugin->inherit_priority(sem, tsk); + spin_unlock_irqrestore(&sem->wait.lock, flags); + out: + TS_PI_DOWN_END; + preempt_enable(); + } + ret = 0; + return ret; +} + +asmlinkage long sys_pi_up(pi_sema_id sem_id) +{ + unsigned long flags; + long ret = -EINVAL; + struct pi_semaphore * sem; + + preempt_disable(); + TS_PI_UP_START; + + if (sem_id < 0 || sem_id >= MAX_PI_SEMAPHORES) + goto out; + + if (!pi_sems[sem_id].used) + goto out; + + sem = pi_sems + sem_id; + spin_lock_irqsave(&sem->wait.lock, flags); + + TRACE_CUR("releases PI lock %p\n", sem); + curr_sched_plugin->return_priority(sem); + sem->holder = NULL; + if (atomic_inc_return(&sem->count) < 1) + /* there is a task queued */ + wake_up_locked(&sem->wait); + + spin_unlock_irqrestore(&sem->wait.lock, flags); + + ret = 0; + out: + TS_PI_UP_END; + preempt_enable(); + return ret; +} + +/* Clear wait queue and wakeup waiting tasks, and free semaphore. 
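+ * As with the plain FIFO semaphores above, the intended life cycle is
+ * pi_sema_init(), then paired pi_down()/pi_up() calls, then
+ * pi_sema_free(); freeing a semaphore that still has waiters simply
+ * wakes all of them up.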
*/ +asmlinkage long sys_pi_sema_free(pi_sema_id sem_id) +{ + struct list_head *tmp, *next; + unsigned long flags; + + if (sem_id < 0 || sem_id >= MAX_PI_SEMAPHORES) + return -EINVAL; + + if (!pi_sems[sem_id].used) + return -EINVAL; + + spin_lock_irqsave(&pi_sems[sem_id].wait.lock, flags); + if (waitqueue_active(&pi_sems[sem_id].wait)) { + list_for_each_safe(tmp, next, + &pi_sems[sem_id].wait.task_list) { + wait_queue_t *curr = list_entry(tmp, wait_queue_t, + task_list); + list_del(tmp); + set_rt_flags((struct task_struct*)curr->private, + RT_F_EXIT_SEM); + curr->func(curr, + TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, + 0, NULL); + } + } + + spin_unlock_irqrestore(&pi_sems[sem_id].wait.lock, flags); + pi_sems[sem_id].used = 0; + + return 0; +} + + + + +/* ************************************************************************** */ +/* STACK RESOURCE POLICY */ +/* ************************************************************************** */ + +#define MAX_SRP_SEMAPHORES 256 + +struct srp_priority { + struct list_head list; + unsigned int period; + pid_t pid; +}; + +#define list2prio(l) list_entry(l, struct srp_priority, list) + +static int srp_higher_prio(struct srp_priority* first, + struct srp_priority* second) +{ + if (!first->period) + return 0; + else + return !second->period || + first->period < second->period || ( + first->period == second->period && + first->pid < second->pid); +} + +struct srp { + struct list_head ceiling; + wait_queue_head_t ceiling_blocked; +}; + +#define system_ceiling(srp) list2prio(srp->ceiling.next) + +static int srp_exceeds_ceiling(struct task_struct* first, + struct srp* srp) +{ + return list_empty(&srp->ceiling) || + get_rt_period(first) < system_ceiling(srp)->period || + (get_rt_period(first) == system_ceiling(srp)->period && + first->pid < system_ceiling(srp)->pid); +} + +static void srp_add_prio(struct srp* srp, struct srp_priority* prio) +{ + struct list_head *pos; + if (in_list(&prio->list)) { + TRACE_CUR("WARNING: SRP violation detected, prio is already in " + "ceiling list!\n"); + return; + } + list_for_each(pos, &srp->ceiling) + if (unlikely(srp_higher_prio(prio, list2prio(pos)))) { + __list_add(&prio->list, pos->prev, pos); + return; + } + + list_add_tail(&prio->list, &srp->ceiling); +} + +/* struct for uniprocessor SRP "semaphore" */ +struct srp_semaphore { + struct srp_priority ceiling; + int cpu; /* cpu associated with this "semaphore" and resource */ + int claimed; /* is the resource claimed (ceiling should be used)? */ + int used; /* is the semaphore being used? */ +}; + + +struct srp_semaphore srp_sems[MAX_SRP_SEMAPHORES]; /* all SRP sems */ +typedef int srp_sema_id; /* Userspace ID of a srp_semaphore */ + +DEFINE_PER_CPU(struct srp, srp); + +/* Initialize SRP semaphores at boot time. */ +static int __init srp_sema_boot_init(void) +{ + srp_sema_id sem_id; + int i; + + printk("Initializing SRP semaphores..."); + for (sem_id = 0; sem_id < MAX_SRP_SEMAPHORES; sem_id++) { + srp_sems[sem_id].used = 0; + srp_sems[sem_id].claimed = 0; + srp_sems[sem_id].cpu = -1; + INIT_LIST_HEAD(&srp_sems[sem_id].ceiling.list); + } + for (i = 0; i < NR_CPUS; i++) { + init_waitqueue_head(&per_cpu(srp, i).ceiling_blocked); + INIT_LIST_HEAD(&per_cpu(srp, i).ceiling); + } + printk(" done!\n"); + + return 0; +} +__initcall(srp_sema_boot_init); + +/* Find a free semaphore and return. 
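+ * SRP semaphores are strictly per-partition: the slot records the
+ * calling task's CPU, and sys_srp_down()/sys_srp_up() reject requests
+ * issued from any other CPU.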
*/ +asmlinkage long sys_srp_sema_init (void) +{ + srp_sema_id sem_id; + + if (!is_realtime(current)) + return -EPERM; + + for (sem_id = 0; sem_id < MAX_SRP_SEMAPHORES; sem_id++) { + if (!cmpxchg(&srp_sems[sem_id].used, 0, 1)) { + srp_sems[sem_id].ceiling.period = 0; + srp_sems[sem_id].cpu = get_partition(current); + return sem_id; + } + } + return -ENOMEM; +} + +/* SRP task priority comparison function. Smaller periods have highest + * priority, tie-break is PID. + */ + +/* Adjust the system-wide priority ceiling if resource is claimed. */ +asmlinkage long sys_srp_down(srp_sema_id sem_id) +{ + int cpu; + int ret = -EINVAL; + + /* disabling preemptions is sufficient protection since + * SRP is strictly per CPU and we don't interfere with any + * interrupt handlers + */ + preempt_disable(); + TS_SRP_DOWN_START; + + + cpu = smp_processor_id(); + + if (sem_id < 0 || sem_id >= MAX_SRP_SEMAPHORES || + srp_sems[sem_id].cpu != cpu) + goto out; + + if (!srp_sems[sem_id].used) + goto out; + + /* claim... */ + srp_sems[sem_id].claimed = 1; + /* ...and update ceiling */ + srp_add_prio(&__get_cpu_var(srp), &srp_sems[sem_id].ceiling); + + ret = 0; + out: + TS_SRP_DOWN_END; + preempt_enable(); + return ret; +} + +/* Adjust the system-wide priority ceiling if resource is freed. */ +asmlinkage long sys_srp_up(srp_sema_id sem_id) +{ + int cpu; + int ret = -EINVAL; + + preempt_disable(); + TS_SRP_UP_START; + + cpu = smp_processor_id(); + + if (sem_id < 0 || sem_id >= MAX_SRP_SEMAPHORES || + srp_sems[sem_id].cpu != cpu) + goto out; + + if (!srp_sems[sem_id].used) + goto out; + + srp_sems[sem_id].claimed = 0; + /* Determine new system priority ceiling for this CPU. */ + if (in_list(&srp_sems[sem_id].ceiling.list)) + list_del(&srp_sems[sem_id].ceiling.list); + else + TRACE_CUR("WARNING: SRP violation detected, prio not in ceiling" + " list!\n"); + + /* Wake tasks on this CPU, if they exceed current ceiling. */ + wake_up_all(&__get_cpu_var(srp).ceiling_blocked); + ret = 0; + out: + TS_SRP_UP_END; + preempt_enable(); + return ret; +} + +/* Indicate that task will use a resource associated with a given + * semaphore. Should be done *a priori* before RT task system is + * executed, so this does *not* update the system priority + * ceiling! (The ceiling would be meaningless anyway, as the SRP + * breaks without this a priori knowledge.) 
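+ *
+ * A hypothetical setup sequence (userspace syscall wrappers assumed):
+ *
+ *	id = srp_sema_init();		called by a real-time task
+ *	reg_task_srp_sem(id, pid);	once per task sharing the resource
+ *	... start the task system; tasks then bracket each access
+ *	with srp_down(id) and srp_up(id) ...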
+ */ +asmlinkage long sys_reg_task_srp_sem(srp_sema_id sem_id, pid_t t_pid) +{ + struct pid *task_pid; + struct task_struct *t; + struct srp_priority t_prio; + + if (sem_id < 0 || sem_id >= MAX_SRP_SEMAPHORES) + return -EINVAL; + + task_pid = find_get_pid(t_pid); + if (!task_pid) + return -EINVAL; + + t = get_pid_task(task_pid, PIDTYPE_PID); + if (!t) + return -EINVAL; + + if (!is_realtime(t)) + return -EPERM; + + if (!srp_sems[sem_id].used) + return -EINVAL; + + if (srp_sems[sem_id].cpu != get_partition(t)) + return -EINVAL; + + preempt_disable(); + t->rt_param.subject_to_srp = 1; + t_prio.period = get_rt_period(t); + t_prio.pid = t->pid; + if (srp_higher_prio(&t_prio, &srp_sems[sem_id].ceiling)) { + srp_sems[sem_id].ceiling.period = t_prio.period; + srp_sems[sem_id].ceiling.pid = t_prio.pid; + } + + preempt_enable(); + + return 0; +} + +static int srp_wake_up(wait_queue_t *wait, unsigned mode, int sync, + void *key) +{ + int cpu = smp_processor_id(); + struct task_struct *tsk = wait->private; + if (cpu != get_partition(tsk)) + TRACE_TASK(tsk, "srp_wake_up on wrong cpu, partition is %d\b", + get_partition(tsk)); + else if (srp_exceeds_ceiling(tsk, &__get_cpu_var(srp))) + return default_wake_function(wait, mode, sync, key); + return 0; +} + + +/* Wait for current task priority to exceed system-wide priority ceiling. + * Can be used to determine when it is safe to run a job after its release. + */ +void srp_ceiling_block(void) +{ + struct task_struct *tsk = current; + wait_queue_t wait = { + .private = tsk, + .func = srp_wake_up, + .task_list = {NULL, NULL} + }; + + preempt_disable(); + if (!srp_exceeds_ceiling(tsk, &__get_cpu_var(srp))) { + tsk->state = TASK_UNINTERRUPTIBLE; + add_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait); + TRACE_CUR("is priority ceiling blocked.\n"); + preempt_enable(); + schedule(); + TRACE_CUR("finally exceeds system ceiling.\n"); + remove_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait); + } else { + TRACE_CUR("is not priority ceiling blocked\n"); + preempt_enable(); + } +} + +/* Free semaphore, adjusting the system-wide priority ceiling if necessary. */ +asmlinkage long sys_srp_sema_free(srp_sema_id sem_id) +{ + int cpu; + + preempt_disable(); + cpu = smp_processor_id(); + + if (sem_id < 0 || sem_id >= MAX_SRP_SEMAPHORES || + srp_sems[sem_id].cpu != cpu) + return -EINVAL; + + srp_sems[sem_id].claimed = 0; + srp_sems[sem_id].used = 0; + + preempt_enable(); + return 0; +} + + + +/* ************************************************************************** */ + + + diff --git a/kernel/pfair_common.c b/kernel/pfair_common.c new file mode 100644 index 0000000..a9e636d --- /dev/null +++ b/kernel/pfair_common.c @@ -0,0 +1,241 @@ +/* + * Common functions for PFAIR based scheduler. 
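+ *
+ * Each job is divided into subtasks; a subtask is characterized by its
+ * release, deadline, b-bit and group deadline (computed in
+ * pfair_prepare_next_subtask() below), and is_pfair_hp() orders two
+ * subtasks by exactly these parameters.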
+ */ + +#include +#include +#include + +#include +#include +#include + +#include +#include +/* Comparison of two tasks whether + * the lhs has higher priority than the rhs */ +int is_pfair_hp(struct task_struct *lhs, struct task_struct *rhs) +{ + /* Favor subtasks with earlier deadlines */ + if(time_before(get_deadline(lhs), get_deadline(rhs))) + return 1; + if(get_deadline(lhs) == get_deadline(rhs)) { + /* If deadlines are equal, + * favor non-zero b-bit (a heavy task) */ + if(lhs->rt_param.times.b_bit > rhs->rt_param.times.b_bit) + return 1; + + if(lhs->rt_param.times.b_bit == rhs->rt_param.times.b_bit && + lhs->rt_param.times.b_bit == 1) + /* If b-bit is 1, favor tasks with later + * group deadline */ + return time_after(lhs->rt_param.times.group_deadline, + rhs->rt_param.times.group_deadline); + + } + return 0; +} + +void pfair_domain_init(pfair_domain_t *pfair) +{ + BUG_ON(!pfair); + INIT_LIST_HEAD(&pfair->ready_queue); + INIT_LIST_HEAD(&pfair->release_queue); + queue_lock_init(&pfair->pfair_lock); + cpus_setall(pfair->domain_cpus); + /* Use cpu 0 to keep the system alive + * TODO: Remove later or make it configurable + * */ + cpu_clear(0, pfair->domain_cpus); +} + + +/* add_ready - add a real-time task to the PFAIR ready queue. + * It must be runnable. Global domain lock must be held before + * calling this function. + * + * @new: the newly released task + */ +void pfair_add_ready(pfair_domain_t* pfair, struct task_struct *new) +{ + struct list_head *pos; + struct task_struct *queued; + + BUG_ON(!new); + /* find a spot where our deadline is earlier than the next */ + list_for_each(pos, &pfair->ready_queue) { + queued = list_entry(pos, struct task_struct, rt_list); + if (unlikely(is_pfair_hp(new, queued))) { + /* the task at pos has a later deadline */ + /* insert the new task in front of it */ + __list_add(&new->rt_list, pos->prev, pos); + return; + } + } + /* if we get to this point either the list is empty or new has the + * lowest priority. Let's add it to the end. */ + list_add_tail(&new->rt_list, &pfair->ready_queue); +} +/** + * Extraction function. + */ +struct task_struct* __pfair_take_ready(pfair_domain_t* pfair) +{ + struct task_struct *t = NULL; + /* either not yet released, preempted, or non-rt */ + if (!list_empty(&pfair->ready_queue)) { + + /* take next rt task */ + t = list_entry(pfair->ready_queue.next, struct task_struct, + rt_list); + + /* kick it out of the ready list */ + list_del(&t->rt_list); + } + return t; +} + + +/* add_release - add a real-time task to the PFAIR release queue. + * Domain lock must be acquired before the function is called. + * + * @task: the sleeping task + */ +void pfair_add_release(pfair_domain_t* pfair, struct task_struct *task) +{ + struct list_head *pos; + struct task_struct *queued; + + BUG_ON(!task); + /* find a spot where our deadline is earlier than the next */ + list_for_each_prev(pos, &pfair->release_queue) { + queued = list_entry(pos, struct task_struct, rt_list); + if ((unlikely(time_before(queued->rt_param.times.release, + task->rt_param.times.release)))) { + /* the task at pos has an earlier release */ + /* insert the new task in behind it */ + __list_add(&task->rt_list, pos, pos->next); + return; + } + } + /* if we get to this point either the list is empty or task has the + * earliest release. Let's add it to the front. */ + list_add(&task->rt_list, &pfair->release_queue); +} +/** + * This function is called from tick handler, it acquires the lock + * automatically. Only one processor effectively merges the queues. 
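+ * Competing processors serialize on pfair_lock, so a late caller simply
+ * finds that all jobs due by now have already been moved to the ready
+ * queue.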
+ */ +void pfair_try_release_pending(pfair_domain_t* pfair) +{ + unsigned long flags; + struct list_head *pos, *save; + struct task_struct *queued; + queue_lock_irqsave(&pfair->pfair_lock, flags); + + list_for_each_safe(pos, save, &pfair->release_queue) { + queued = list_entry(pos, struct task_struct, rt_list); + if (likely(time_before_eq( + queued->rt_param.times.release, jiffies))) { + /* this one is ready to go*/ + list_del(pos); + set_rt_flags(queued, RT_F_RUNNING); + + sched_trace_job_release(queued); + /* now it can be picked up */ + barrier(); + pfair_add_ready(pfair, queued); + } + else + /* the release queue is ordered */ + break; + } + queue_unlock_irqrestore(&pfair->pfair_lock, flags); +} +/* + * Subtask preparation. Assuming that last_release + * denotes the time when the job was released. + */ +void pfair_prepare_next_subtask(struct task_struct *t) +{ + BUG_ON(!t); + /* assign subtask release time, deadline, b-bit, + * and group deadline + */ + t->rt_param.times.release = t->rt_param.times.last_release + +release_time(t); + t->rt_param.times.deadline = t->rt_param.times.last_release + +pfair_deadline(t); + t->rt_param.times.b_bit = b_bit(t); + t->rt_param.times.group_deadline = t->rt_param.times.last_release + +group_deadline(t); +} + +void pfair_prepare_next_job(struct task_struct *t) +{ + BUG_ON(!t); + /* update tardy job ctr */ + if (jiffies > t->rt_param.times.deadline) + t->rt_param.stats.nontardy_jobs_ctr = 0; + else + t->rt_param.stats.nontardy_jobs_ctr++; + + /* prepare next job release */ + /* make passed quantums zero so that we could compute new release times + * and deadlines for subtasks correctly + */ + t->rt_param.times.exec_time = 0; + /* assign job-wide release time, + * this is the starting point to + * compute subtask releases, deadlines and group deadlines + */ + t->rt_param.times.last_release = t->rt_param.times.last_release + +get_rt_period(t); + /* Release the first subtask. */ + pfair_prepare_next_subtask(t); + t->first_time_slice = 0; +} + +void __pfair_prepare_new_release(struct task_struct *t, jiffie_t start) +{ + t->rt_param.times.release = start; + t->rt_param.times.last_release = start; + t->rt_param.stats.nontardy_jobs_ctr = 0xf0000000; + t->rt_param.times.exec_time = 0; + t->first_time_slice = 0; + pfair_prepare_next_subtask(t); + set_rt_flags(t, RT_F_RUNNING); +} + +void pfair_prepare_new_releases(pfair_domain_t *pfair, jiffie_t start) +{ + unsigned long flags; + struct list_head tmp_list; + struct list_head *pos, *n; + struct task_struct *t; + + INIT_LIST_HEAD(&tmp_list); + + queue_lock_irqsave(&pfair->pfair_lock, flags); + + + while (!list_empty(&pfair->release_queue)) { + pos = pfair->release_queue.next; + list_del(pos); + list_add(pos, &tmp_list); + } + while (!list_empty(&pfair->ready_queue)) { + pos = pfair->ready_queue.next; + list_del(pos); + list_add(pos, &tmp_list); + } + + list_for_each_safe(pos, n, &tmp_list) { + t = list_entry(pos, struct task_struct, rt_list); + list_del(pos); + __pfair_prepare_new_release(t, start); + pfair_add_release(pfair, t); + } + queue_unlock_irqrestore(&pfair->pfair_lock, flags); +} + diff --git a/kernel/sched.c b/kernel/sched.c index cca93cc..40cf184 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -56,6 +56,14 @@ #include +#include +#define __SCHED_C__ +#include +#include +#include + +#include + /* * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], @@ -836,7 +844,7 @@ static int effective_prio(struct task_struct *p) * keep the priority unchanged. 
Otherwise, update priority * to the normal priority: */ - if (!rt_prio(p->prio)) + if (!rt_prio(p->prio) && !is_realtime(p)) return p->normal_prio; return p->prio; } @@ -844,7 +852,7 @@ static int effective_prio(struct task_struct *p) /* * __activate_task - move a task to the runqueue. */ -static void __activate_task(struct task_struct *p, struct rq *rq) +void __activate_task(struct task_struct *p, struct rq *rq) { struct prio_array *target = rq->active; @@ -999,7 +1007,7 @@ out: /* * deactivate_task - remove a task from the runqueue. */ -static void deactivate_task(struct task_struct *p, struct rq *rq) +void deactivate_task(struct task_struct *p, struct rq *rq) { dec_nr_running(p, rq); dequeue_task(p, p->array); @@ -1408,13 +1416,44 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) #endif rq = task_rq_lock(p, &flags); + + if (is_realtime(p)) + TRACE("try_to_wake_up(%s/%d)\n", p->comm, p->pid); + old_state = p->state; if (!(old_state & state)) - goto out; + goto out; if (p->array) goto out_running; + + spin_lock(&litmus_task_set_lock); + if (p->rt_param.litmus_controlled) { + /* Already included. This can happen + * if the task dropped all locks to call + * schedule() but a wake up raced and came in + * early. + */ + + spin_unlock(&litmus_task_set_lock); + goto out_running; + } + + sched_trace_task_arrival(p); + if (is_realtime(p)) { + p->rt_param.litmus_controlled = 1; + curr_sched_plugin->wake_up_task(p); + + spin_unlock(&litmus_task_set_lock); + goto out_running; + } + + p->rt_param.litmus_controlled = 0; + spin_unlock(&litmus_task_set_lock); + + + cpu = task_cpu(p); this_cpu = smp_processor_id(); @@ -1580,6 +1619,7 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags) cpu = sched_balance_self(cpu, SD_BALANCE_FORK); #endif set_task_cpu(p, cpu); + clear_rt_params(p); /* * We mark the process as running here, but have not actually @@ -1595,6 +1635,10 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags) p->prio = current->normal_prio; INIT_LIST_HEAD(&p->run_list); + INIT_LIST_HEAD(&p->rt_list); + p->rt_param.basic_params.class = RT_CLASS_BEST_EFFORT; + p->rt_param.litmus_controlled = 0; + p->rt_param.inh_task = NULL; p->array = NULL; #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) if (unlikely(sched_info_on())) @@ -1647,6 +1691,13 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) unsigned long flags; int this_cpu, cpu; + if (clone_flags & CLONE_REALTIME) { + /* just mark the task as stopped */ + /* CLEANUP: Do we have to remove the task from the rq? */ + p->state = TASK_STOPPED; + return; + } + rq = task_rq_lock(p, &flags); BUG_ON(p->state != TASK_RUNNING); this_cpu = smp_processor_id(); @@ -1730,6 +1781,9 @@ void fastcall sched_exit(struct task_struct *p) unsigned long flags; struct rq *rq; + if (is_realtime(p)) + return; + /* * If the child was a (relative-) CPU hog then decrease * the sleep_avg of the parent as well. @@ -1801,6 +1855,13 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) */ prev_state = prev->state; finish_arch_switch(prev); + /* Requeue previous real-time task before we drop the rq lock, cause + * that may lead to a preemption. 
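+ * The requeuing itself is done by the plugin's finish_switch()
+ * callback invoked right below.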
+ */ + curr_sched_plugin->finish_switch(prev); + sched_trace_task_scheduled(current); + /* trace before IRQs are enabled */ + TS_CXS_END; finish_lock_switch(rq, prev); if (mm) mmdrop(mm); @@ -1811,7 +1872,7 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) */ kprobe_flush_task(prev); put_task_struct(prev); - } + } } /** @@ -2990,7 +3051,7 @@ static inline void idle_balance(int cpu, struct rq *rq) static inline void wake_priority_sleeper(struct rq *rq) { #ifdef CONFIG_SCHED_SMT - if (!rq->nr_running) + if (!rq->nr_running || get_rt_mode() == MODE_RT_RUN) return; spin_lock(&rq->lock); @@ -3220,14 +3281,29 @@ void scheduler_tick(void) update_cpu_clock(p, rq, now); - if (p == rq->idle) - /* Task on the idle queue */ - wake_priority_sleeper(rq); - else - task_running_tick(rq, p); + /* check whether the RT scheduler plugin requires a call to + * schedule + */ + TS_PLUGIN_TICK_START; + if (curr_sched_plugin->scheduler_tick() == FORCE_RESCHED) + set_tsk_need_resched(p); + TS_PLUGIN_TICK_END; + + /* real-time accounting is done by the plugin + * call linux functions only for background tasks + */ + if (!is_realtime(p)) { + if (p == rq->idle) + /* Task on the idle queue */ + wake_priority_sleeper(rq); + else + task_running_tick(rq, p); + } + #ifdef CONFIG_SMP update_load(rq); - if (time_after_eq(jiffies, rq->next_balance)) + if (time_after_eq(jiffies, rq->next_balance) && + get_rt_mode() == MODE_NON_RT) raise_softirq(SCHED_SOFTIRQ); #endif } @@ -3420,6 +3496,7 @@ asmlinkage void __sched schedule(void) long *switch_count; struct rq *rq; + /* * Test if we are atomic. Since do_exit() needs to call into * schedule() atomically, we ignore that path for now. @@ -3427,8 +3504,9 @@ asmlinkage void __sched schedule(void) */ if (unlikely(in_atomic() && !current->exit_state)) { printk(KERN_ERR "BUG: scheduling while atomic: " - "%s/0x%08x/%d\n", - current->comm, preempt_count(), current->pid); + "%s/0x%08x/%d %s\n", + current->comm, preempt_count(), current->pid, + is_realtime(current) ? "rt" : "non-rt"); debug_show_held_locks(current); if (irqs_disabled()) print_irqtrace_events(current); @@ -3438,6 +3516,7 @@ asmlinkage void __sched schedule(void) need_resched: preempt_disable(); + TS_SCHED_START; prev = current; release_kernel_lock(prev); need_resched_nonpreemptible: @@ -3470,6 +3549,7 @@ need_resched_nonpreemptible: spin_lock_irq(&rq->lock); switch_count = &prev->nivcsw; + /* check for blocking tasks */ if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { switch_count = &prev->nvcsw; if (unlikely((prev->state & TASK_INTERRUPTIBLE) && @@ -3478,13 +3558,66 @@ need_resched_nonpreemptible: else { if (prev->state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible++; + /* we need to remove real-time tasks from the runqueue*/ + + /* protect against races with signal delivery and IO + * interrupts on other CPUs + * + * FIXME: This is probably not sufficient, + * as (in theory) after + * unlocking the task_set_lock this task could + * be scheduled elsewere before we switched away + * from it. This has not been observed + * yet. To get this locking right is tricky. + */ + spin_lock(&litmus_task_set_lock); + if (prev->rt_param.litmus_controlled) + prev->rt_param.litmus_controlled = 0; + spin_unlock(&litmus_task_set_lock); + + if (is_realtime(prev)) { + TRACE("schedule: %s/%d blocks. state = %d\n", + prev->comm, prev->pid, prev->state); + curr_sched_plugin->task_blocks(prev); + /* Enable this for all tasks to get _a lot_ of + * data. Can be helpful for debugging. 
+ */ + sched_trace_task_departure(prev); + } + + /* only indirect switching is supported in the current + * version of LITMUS + */ deactivate_task(prev, rq); } } + next = NULL; + + /* consult the real-time plugin */ + TS_PLUGIN_SCHED_START; + curr_sched_plugin->schedule(prev, &next, rq); + TS_PLUGIN_SCHED_END; + /* If the real-time plugin wants to switch to a specific task + * it'll be on the rq and have the highest priority. There will + * be exaclty one such task, thus the selection of the next task + * is unambiguous and the following code can only get + * triggered if there are no RT tasks pending (on this CPU). Thus, + * we may as well skip it. + */ + if (next) + goto switch_tasks; + cpu = smp_processor_id(); if (unlikely(!rq->nr_running)) { - idle_balance(cpu, rq); + /* only load-balance if we are not in RT mode + * + * TODO: Maybe this can be relaxed by modifiying the + * load-balancing routines in such a way that they never touch + * real-time tasks. + */ + if (get_rt_mode() == MODE_NON_RT) + idle_balance(cpu, rq); if (!rq->nr_running) { next = rq->idle; rq->expired_timestamp = 0; @@ -3528,7 +3661,7 @@ need_resched_nonpreemptible: } } next->sleep_type = SLEEP_NORMAL; - if (dependent_sleeper(cpu, rq, next)) + if (get_rt_mode() == MODE_NON_RT && dependent_sleeper(cpu, rq, next)) next = rq->idle; switch_tasks: if (next == rq->idle) @@ -3546,7 +3679,11 @@ switch_tasks: prev->timestamp = prev->last_ran = now; sched_info_switch(prev, next); + TS_SCHED_END; if (likely(prev != next)) { + TS_CXS_START; + if (is_running(prev)) + sched_trace_task_preemption(prev, next); next->timestamp = now; rq->nr_switches++; rq->curr = next; @@ -3560,9 +3697,10 @@ switch_tasks: * CPUs since it called schedule(), thus the 'rq' on its stack * frame will be invalid. */ - finish_task_switch(this_rq(), prev); - } else + finish_task_switch(this_rq(), prev); + } else { spin_unlock_irq(&rq->lock); + } prev = current; if (unlikely(reacquire_kernel_lock(prev) < 0)) @@ -3691,6 +3829,7 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, } } + /** * __wake_up - wake up threads blocked on a waitqueue. * @q: the waitqueue @@ -3709,6 +3848,7 @@ void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, } EXPORT_SYMBOL(__wake_up); + /* * Same as __wake_up but called with the spinlock in wait_queue_head_t held. */ @@ -3717,6 +3857,7 @@ void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) __wake_up_common(q, mode, 1, 0, NULL); } + /** * __wake_up_sync - wake up threads blocked on a waitqueue. * @q: the waitqueue @@ -4175,7 +4316,7 @@ static inline struct task_struct *find_process_by_pid(pid_t pid) } /* Actually do priority change: must hold rq lock. 
*/ -static void __setscheduler(struct task_struct *p, int policy, int prio) +void __setscheduler(struct task_struct *p, int policy, int prio) { BUG_ON(p->array); @@ -5397,7 +5538,7 @@ static struct notifier_block __cpuinitdata migration_notifier = { .priority = 10 }; -int __init migration_init(void) +int __init linux_migration_init(void) { void *cpu = (void *)(long)smp_processor_id(); int err; @@ -6859,7 +7000,7 @@ static int update_sched_domains(struct notifier_block *nfb, return NOTIFY_OK; } -void __init sched_init_smp(void) +void __init linux_sched_init_smp(void) { cpumask_t non_isolated_cpus; @@ -6877,7 +7018,7 @@ void __init sched_init_smp(void) BUG(); } #else -void __init sched_init_smp(void) +void __init linux_sched_init_smp(void) { } #endif /* CONFIG_SMP */ @@ -6892,7 +7033,7 @@ int in_sched_functions(unsigned long addr) && addr < (unsigned long)__sched_text_end); } -void __init sched_init(void) +void __init linux_sched_init(void) { int i, j, k; diff --git a/kernel/sched_edf_hsb.c b/kernel/sched_edf_hsb.c new file mode 100644 index 0000000..d190426 --- /dev/null +++ b/kernel/sched_edf_hsb.c @@ -0,0 +1,1802 @@ +/* + * kernel/sched_edf_hsb.c + * + * Implementation of the EDF-HSB scheduler plugin. + * + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +/* undefine to remove capacity sharing */ +#define HSB_CAP_SHARE_ENABLED + +/* fake server PIDs */ +#define HRT_BASE_PID 50000 +#define SRT_BASE_PID 60000 + + +/******************************************************************************/ +/* Capacity queue */ +/******************************************************************************/ + +int cap_check_resched(jiffie_t deadline); + +typedef struct { + int budget; + jiffie_t deadline; + pid_t donor; + + struct list_head list; +} capacity_t; + +typedef struct { + spinlock_t lock; + struct list_head queue; +} capacity_queue_t; + +#define next_cap(q) list_entry((q)->queue.next, capacity_t, list) + +void capacity_queue_init(capacity_queue_t* queue) +{ + queue->lock = SPIN_LOCK_UNLOCKED; + INIT_LIST_HEAD(&queue->queue); +} + +void __add_capacity(capacity_queue_t* queue, capacity_t *cap) +{ + struct list_head* pos; + capacity_t* queued; + + list_for_each_prev(pos, &queue->queue) { + queued = list_entry(pos, capacity_t, list); + if ( time_before_eq(queued->deadline, cap->deadline)) { + __list_add(&cap->list, pos, pos->next); + return; + } + } + list_add(&cap->list, &queue->queue); +} + +int __capacity_available(capacity_queue_t* queue) +{ + capacity_t *cap; + + while (!list_empty(&queue->queue)) { + cap = list_entry(queue->queue.next, capacity_t, list); + + + if (time_before_eq(cap->deadline, jiffies)) { + list_del(queue->queue.next); + kfree(cap); + cap = NULL; + } else + break; + } + + return !list_empty(&queue->queue); +} + +void __return_capacity(capacity_queue_t* queue, capacity_t *cap) +{ + if (!cap->budget || time_before_eq(cap->deadline, jiffies)) + kfree(cap); + else + __add_capacity(queue, cap); +} + + +void return_capacity(capacity_queue_t* queue, capacity_t *cap) + +{ + unsigned long flags; + + if (!cap->budget || time_before_eq(cap->deadline, jiffies)) + kfree(cap); + else { + spin_lock_irqsave(&queue->lock, flags); + __add_capacity(queue, cap); + spin_unlock_irqrestore(&queue->lock, flags); + } +} + + +#define MIN_TIME_DELTA 1 +#define MIN_BUDGET 1 + +#ifdef HSB_CAP_SHARE_ENABLED +void release_capacity(capacity_queue_t* queue, unsigned int budget, + jiffie_t deadline, struct task_struct* t) +{ + capacity_t* cap; + unsigned 
long flags; + + if (deadline >= jiffies + MIN_TIME_DELTA && budget >= MIN_BUDGET) { + cap = kmalloc(sizeof(capacity_t), GFP_ATOMIC); + if (cap) { + cap->budget = budget; + cap->deadline = deadline; + if (t) + cap->donor = t->pid; + else + cap->donor = 0; + spin_lock_irqsave(&queue->lock, flags); + __add_capacity(queue, cap); + cap_check_resched(next_cap(queue)->deadline); + spin_unlock_irqrestore(&queue->lock, flags); + if (t) + sched_trace_capacity_release(t); + } + } +} + +void __release_capacity(capacity_queue_t* queue, unsigned int budget, + jiffie_t deadline, struct task_struct* t) +{ + capacity_t* cap; + + if (deadline >= jiffies + MIN_TIME_DELTA && budget >= MIN_BUDGET) { + cap = kmalloc(sizeof(capacity_t), GFP_ATOMIC); + if (cap) { + cap->budget = budget; + cap->deadline = deadline; + if (t) + cap->donor = t->pid; + else + cap->donor = 0; + /* no locking, no resched check -- called from schedule */ + __add_capacity(queue, cap); + if (t) + sched_trace_capacity_release(t); + } + } +} + + +capacity_t* __take_capacity(capacity_queue_t* queue, jiffie_t deadline, int deadline_matters) +{ + capacity_t* cap = NULL; + + while (!list_empty(&queue->queue)) { + cap = list_entry(queue->queue.next, capacity_t, list); + + if (deadline_matters && time_before(deadline, cap->deadline)) { + cap = NULL; + break; + } + + list_del(queue->queue.next); + if (cap->deadline > jiffies) { + if (cap->deadline - jiffies < cap->budget) + cap->budget = cap->deadline - jiffies; + break; + } + kfree(cap); + cap = NULL; + } + + return cap; +} +#else + +/* no capacity sharing */ +void release_capacity(capacity_queue_t* queue, unsigned int budget, + jiffie_t deadline, struct task_struct* t) +{ +} + +capacity_t* __take_capacity(capacity_queue_t* queue, jiffie_t deadline, int deadline_matters) +{ + return NULL; +} +#endif + + +/******************************************************************************/ +/* server abstractions */ +/******************************************************************************/ + + +/* hrt_server_t - Abstraction of a hard real-time server. + * + * One HRT server per CPU. If it is unused period and wcet may be zero. + * HRT servers are strictly periodic and retain their budget. + */ +typedef struct { + edf_domain_t domain; + + unsigned int period; + unsigned int wcet; + + jiffie_t deadline; + int budget; +} hrt_server_t; + +/* be_server_t - Abstraction of best-effort server. + * + * This is pretty much only an accounting abstraction. + */ +typedef struct { + unsigned int period; + unsigned int wcet; + + jiffie_t deadline; + jiffie_t release; + int budget; + + struct list_head list; + pid_t pid; +} be_server_t; + +/* cast to int to allow for negative slack, i.e. 
tardiness */ +#define server_slack(srv) \ + ( ((int) (srv)->deadline - (int) jiffies) - (int) (srv)->budget ) + +typedef struct { + int cpu; + + hrt_server_t hrt; + be_server_t* be; + capacity_t* cap; + + task_class_t exec_class; + jiffie_t cur_deadline; + atomic_t will_schedule; + + struct list_head list; + spinlock_t lock; +} cpu_state_t; + + +DEFINE_PER_CPU(cpu_state_t, hsb_cpu_state); + +#define hrt_dom(cpu) (&per_cpu(hsb_cpu_state, cpu).hrt.domain) + +#define set_will_schedule() \ + (atomic_set(&__get_cpu_var(hsb_cpu_state).will_schedule, 1)) +#define clear_will_schedule() \ + (atomic_set(&__get_cpu_var(hsb_cpu_state).will_schedule, 0)) +#define test_will_schedule(cpu) \ + (atomic_read(&per_cpu(hsb_cpu_state, cpu).will_schedule)) + + +static void prepare_hrt_release(hrt_server_t *srv, jiffie_t start) +{ + if (srv->period && srv->wcet) { + srv->deadline = start; + srv->budget = 0; + } +} + +static void check_for_hrt_release(hrt_server_t *srv) { + if (srv->wcet && srv->period && + time_before_eq(srv->deadline, jiffies)) { + srv->deadline += srv->period; + srv->budget = srv->wcet; + sched_trace_server_release(HRT_BASE_PID + smp_processor_id(), + srv->budget, srv->period, RT_CLASS_HARD); + } +} + +/* A HRT client is eligible if either its deadline is before the + * the server deadline or if the server has zero slack. The server + * must have budget left. + */ +static inline int hrt_client_eligible(hrt_server_t *srv) +{ + if (!list_empty(&srv->domain.ready_queue)) + return srv->budget && ( + time_before(get_deadline(next_ready(&srv->domain)), + srv->deadline) + || server_slack(srv) <= 0); + else + return 0; +} + +static void hsb_cpu_state_init(cpu_state_t* cpu_state, + edf_check_resched_needed_t check, + int cpu) +{ + edf_domain_init(&cpu_state->hrt.domain, check); + cpu_state->hrt.budget = 0; + cpu_state->hrt.deadline = 0; + cpu_state->hrt.period = 0; + cpu_state->hrt.wcet = 0; + + cpu_state->be = NULL; + cpu_state->cap = NULL; + + cpu_state->cur_deadline = 0; + cpu_state->cpu = cpu; + cpu_state->lock = SPIN_LOCK_UNLOCKED; + cpu_state->exec_class = RT_CLASS_BEST_EFFORT; + + atomic_set(&cpu_state->will_schedule, 0); + INIT_LIST_HEAD(&cpu_state->list); +} + +/******************************************************************************/ +/* BE queue functions - mostly like edf_common.c */ +/******************************************************************************/ + +#define be_earlier_deadline(a, b) (time_before(\ + (a)->deadline, (b)->deadline)) +#define be_earlier_release(a, b) (time_before(\ + (a)->release, (b)->release)) + + +static void be_add_ready(edf_domain_t* edf, be_server_t *new) +{ + unsigned long flags; + struct list_head *pos; + be_server_t *queued; + unsigned int passed = 0; + + BUG_ON(!new); + /* first we need the write lock for edf_ready_queue */ + write_lock_irqsave(&edf->ready_lock, flags); + /* find a spot where our deadline is earlier than the next */ + list_for_each(pos, &edf->ready_queue) { + queued = list_entry(pos, be_server_t, list); + if (unlikely(be_earlier_deadline(new, queued))) { + __list_add(&new->list, pos->prev, pos); + goto out; + } + passed++; + } + /* if we get to this point either the list is empty or new has the + * lowest priority. Let's add it to the end. 
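+	 * (If no queued server was passed on the way here, the new server
+	 * now heads the ready queue, which is why check_resched() is
+	 * invoked below.)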
*/ + list_add_tail(&new->list, &edf->ready_queue); + out: + if (!passed) + edf->check_resched(edf); + write_unlock_irqrestore(&edf->ready_lock, flags); +} + +static be_server_t* be_take_ready(edf_domain_t* edf) +{ + be_server_t *t = NULL; + + if (!list_empty(&edf->ready_queue)) { + t = list_entry(edf->ready_queue.next, be_server_t, list); + /* kick it out of the ready list */ + list_del(&t->list); + } + return t; +} + +/*static be_server_t* get_be_server(edf_domain_t* edf) +{ + be_server_t *t = NULL; + + spin_lock(&edf->release_lock); + write_lock(&edf->ready_lock); + t = be_take_ready(edf); + + if (!t && !list_empty(&edf->release_queue)) { + t = list_entry(edf->release_queue.next, be_server_t, list); + + list_del(&t->list); + } + + write_unlock(&edf->ready_lock); + spin_unlock(&edf->release_lock); + return t; +}*/ + +static void be_add_release(edf_domain_t* edf, be_server_t *srv) +{ + unsigned long flags; + struct list_head *pos; + be_server_t *queued; + + spin_lock_irqsave(&edf->release_lock, flags); + list_for_each_prev(pos, &edf->release_queue) { + queued = list_entry(pos, be_server_t, list); + if ((unlikely(be_earlier_release(queued, srv)))) { + /* the task at pos has an earlier release */ + /* insert the new task in behind it */ + __list_add(&srv->list, pos, pos->next); + goto out; + } + } + + list_add(&srv->list, &edf->release_queue); + out: + spin_unlock_irqrestore(&edf->release_lock, flags); +} + +static void be_try_release_pending(edf_domain_t* edf) +{ + unsigned long flags; + struct list_head *pos, *save; + be_server_t *queued; + + if (spin_trylock_irqsave(&edf->release_lock, flags)) { + list_for_each_safe(pos, save, &edf->release_queue) { + queued = list_entry(pos, be_server_t, list); + if (likely(time_before_eq( + queued->release, + jiffies))) { + list_del(pos); + be_add_ready(edf, queued); + sched_trace_server_release( + queued->pid, queued->budget, + queued->period, RT_CLASS_BEST_EFFORT); + } else + /* the release queue is ordered */ + break; + } + spin_unlock_irqrestore(&edf->release_lock, flags); + } +} + +static void be_prepare_new_release(be_server_t *t, jiffie_t start) { + t->release = start; + t->deadline = t->release + t->period; + t->budget = t->wcet; +} + +static void be_prepare_new_releases(edf_domain_t *edf, jiffie_t start) +{ + unsigned long flags; + struct list_head tmp_list; + struct list_head *pos, *n; + be_server_t *t; + + INIT_LIST_HEAD(&tmp_list); + + spin_lock_irqsave(&edf->release_lock, flags); + write_lock(&edf->ready_lock); + + + while (!list_empty(&edf->release_queue)) { + pos = edf->release_queue.next; + list_del(pos); + list_add(pos, &tmp_list); + } + + while (!list_empty(&edf->ready_queue)) { + pos = edf->ready_queue.next; + list_del(pos); + list_add(pos, &tmp_list); + + } + + write_unlock(&edf->ready_lock); + spin_unlock_irqrestore(&edf->release_lock, flags); + + list_for_each_safe(pos, n, &tmp_list) { + t = list_entry(pos, be_server_t, list); + list_del(pos); + be_prepare_new_release(t, start); + be_add_release(edf, t); + } + +} + +static void be_prepare_for_next_period(be_server_t *t) +{ + BUG_ON(!t); + /* prepare next release */ + t->release = t->deadline; + t->deadline += t->period; + t->budget = t->wcet; +} + +#define be_next_ready(edf) \ + list_entry((edf)->ready_queue.next, be_server_t, list) + + +/* need_to_preempt - check whether the task t needs to be preempted by a + * best-effort server. 
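+ *
+ * A preemption is needed if this CPU does not currently run soft real-time
+ * work, or if the soft real-time work it runs (an SRT job or a spare
+ * capacity) has a later deadline than the first queued BE server.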
+ */ +static inline int be_preemption_needed(edf_domain_t* edf, cpu_state_t* state) +{ + /* we need the read lock for edf_ready_queue */ + if (!list_empty(&edf->ready_queue)) + { + + if (state->exec_class == RT_CLASS_SOFT) { + if (state->cap) + return time_before( + be_next_ready(edf)->deadline, + state->cap->deadline); + else + return time_before( + be_next_ready(edf)->deadline, + state->cur_deadline); + } else + return 1; + } + return 0; +} + +static void be_enqueue(edf_domain_t* edf, be_server_t* srv) +{ + int new_release = 0; + if (!srv->budget) { + be_prepare_for_next_period(srv); + new_release = 1; + } + + if (time_before_eq(srv->release, jiffies) && + get_rt_mode() == MODE_RT_RUN) { + be_add_ready(edf, srv); + if (new_release) + sched_trace_server_release( + srv->pid, srv->budget, + srv->period, RT_CLASS_BEST_EFFORT); + } else + be_add_release(edf, srv); +} + +static void be_preempt(edf_domain_t *be, cpu_state_t *state) +{ + be_server_t *srv; + + spin_lock(&state->lock); + srv = state->be; + state->be = NULL; + spin_unlock(&state->lock); + + /* add outside of lock to avoid deadlock */ + if (srv) + be_enqueue(be, srv); +} + + +/******************************************************************************/ +/* Actual HSB implementation */ +/******************************************************************************/ + +/* always acquire the cpu lock as the last lock to avoid deadlocks */ +static spinlock_t hsb_cpu_lock = SPIN_LOCK_UNLOCKED; +/* the cpus queue themselves according to priority in here */ +static LIST_HEAD(hsb_cpu_queue); + + +/* the global soft real-time domain */ +static edf_domain_t srt; +/* the global best-effort server domain + * belongs conceptually to the srt domain, but has + * be_server_t* queued instead of tast_t* + */ +static edf_domain_t be; + +static fifo_domain_t hsb_fifo; + +static capacity_queue_t cap_queue; + + + + +/* adjust_cpu_queue - Move the cpu entry to the correct place to maintain + * order in the cpu queue. + * + */ +static void adjust_cpu_queue(task_class_t class, jiffie_t deadline, + be_server_t *be) +{ + struct list_head *pos; + cpu_state_t *other; + cpu_state_t *entry; + + spin_lock(&hsb_cpu_lock); + + entry = &__get_cpu_var(hsb_cpu_state); + + spin_lock(&entry->lock); + entry->exec_class = class; + entry->cur_deadline = deadline; + entry->be = be; + + spin_unlock(&entry->lock); + + + + if (be) + sched_trace_server_scheduled( + be->pid, RT_CLASS_BEST_EFFORT, be->budget, + be->deadline); + else if (class == RT_CLASS_HARD) + sched_trace_server_scheduled( + HRT_BASE_PID + smp_processor_id(), RT_CLASS_HARD, + entry->hrt.budget, entry->hrt.deadline); + + list_del(&entry->list); + /* If we do not execute real-time jobs we just move + * to the end of the queue . + * If we execute hard real-time jobs we move the start + * of the queue. 
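+	 * CPUs running soft real-time work are ordered by their current
+	 * deadline, so the tail of hsb_cpu_queue always holds the best
+	 * candidate for a preemption check.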
+ */ + + switch (entry->exec_class) { + case RT_CLASS_HARD: + list_add(&entry->list, &hsb_cpu_queue); + break; + + case RT_CLASS_SOFT: + list_for_each(pos, &hsb_cpu_queue) { + other = list_entry(pos, cpu_state_t, list); + if (other->exec_class > RT_CLASS_SOFT || + time_before_eq(entry->cur_deadline, + other->cur_deadline)) + { + __list_add(&entry->list, pos->prev, pos); + goto out; + } + } + /* possible fall through if lowest SRT priority */ + + case RT_CLASS_BEST_EFFORT: + list_add_tail(&entry->list, &hsb_cpu_queue); + break; + + default: + /* something wrong in the variable */ + BUG(); + } + out: + spin_unlock(&hsb_cpu_lock); +} + + +/* hrt_check_resched - check whether the HRT server on given CPU needs to + * preempt the running task. + */ +static int hrt_check_resched(edf_domain_t *edf) +{ + hrt_server_t *srv = container_of(edf, hrt_server_t, domain); + cpu_state_t *state = container_of(srv, cpu_state_t, hrt); + int ret = 0; + + spin_lock(&state->lock); + + if (hrt_client_eligible(srv)) { + if (state->exec_class > RT_CLASS_HARD || + time_before( + get_deadline(next_ready(edf)), + state->cur_deadline) + ) { + if (state->cpu == smp_processor_id()) + set_tsk_need_resched(current); + else + smp_send_reschedule(state->cpu); + } + } + + spin_unlock(&state->lock); + return ret; +} + + +/* srt_check_resched - Check whether another CPU needs to switch to a SRT task. + * + * The function only checks and kicks the last CPU. It will reschedule and + * kick the next if necessary, and so on. The caller is responsible for making + * sure that it is not the last entry or that a reschedule is not necessary. + * + * Caller must hold edf->ready_lock! + */ +static int srt_check_resched(edf_domain_t *edf) +{ + cpu_state_t *last; + int ret = 0; + + spin_lock(&hsb_cpu_lock); + + if (!list_empty(&srt.ready_queue)) { + last = list_entry(hsb_cpu_queue.prev, cpu_state_t, list); + /* guard against concurrent updates */ + spin_lock(&last->lock); + if (last->exec_class == RT_CLASS_BEST_EFFORT || ( + last->exec_class == RT_CLASS_SOFT && + time_before(get_deadline(next_ready(&srt)), + last->cur_deadline))) + { + if (smp_processor_id() == last->cpu) + set_tsk_need_resched(current); + else + if (!test_will_schedule(last->cpu)) + smp_send_reschedule(last->cpu); + ret = 1; + } + spin_unlock(&last->lock); + } + + spin_unlock(&hsb_cpu_lock); + return ret; +} + + +/* be_check_resched - Check whether another CPU needs to switch to a BE server.. + * + * Caller must hold edf->ready_lock! 
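+ *
+ * As in srt_check_resched(), only the last entry of hsb_cpu_queue (the CPU
+ * running the lowest-priority work) is examined and kicked if necessary.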
+ */ +static int be_check_resched(edf_domain_t *edf) +{ + cpu_state_t *last; + int soft, bg; + int ret = 0; + + spin_lock(&hsb_cpu_lock); + + if (!list_empty(&be.ready_queue)) { + last = list_entry(hsb_cpu_queue.prev, cpu_state_t, list); + /* guard against concurrent updates */ + spin_lock(&last->lock); + + bg = last->exec_class == RT_CLASS_BEST_EFFORT; + soft = last->exec_class == RT_CLASS_SOFT; + + if (bg || (soft && time_before(be_next_ready(&be)->deadline, + last->cur_deadline))) + { + if (smp_processor_id() == last->cpu) + set_tsk_need_resched(current); + else + if (!test_will_schedule(last->cpu)) + smp_send_reschedule(last->cpu); + ret = 1; + } + + spin_unlock(&last->lock); + } + + spin_unlock(&hsb_cpu_lock); + return ret; +} + + +int cap_check_resched(jiffie_t deadline) +{ + unsigned long flags; + cpu_state_t *last; + int soft, bg; + int ret = 0; + + + + if (get_rt_mode() == MODE_RT_RUN) { + spin_lock_irqsave(&hsb_cpu_lock, flags); + + last = list_entry(hsb_cpu_queue.prev, cpu_state_t, list); + /* guard against concurrent updates */ + spin_lock(&last->lock); + + bg = last->exec_class == RT_CLASS_BEST_EFFORT; + soft = last->exec_class == RT_CLASS_SOFT; + + if (bg || (soft && time_before(deadline, + last->cur_deadline))) + { + if (smp_processor_id() == last->cpu) + set_tsk_need_resched(current); + else + if (!test_will_schedule(last->cpu)) + smp_send_reschedule(last->cpu); + ret = 1; + } + + spin_unlock(&last->lock); + + spin_unlock_irqrestore(&hsb_cpu_lock, flags); + } + return ret; +} + +int fifo_check_resched(void) +{ + unsigned long flags; + cpu_state_t *last; + int ret = 0; + + if (get_rt_mode() == MODE_RT_RUN) { + spin_lock_irqsave(&hsb_cpu_lock, flags); + + + last = list_entry(hsb_cpu_queue.prev, cpu_state_t, list); + /* guard against concurrent updates */ + + spin_lock(&last->lock); + + if (last->exec_class == RT_CLASS_BEST_EFFORT) + { + if (smp_processor_id() == last->cpu) + set_tsk_need_resched(current); + else + if (!test_will_schedule(last->cpu)) + smp_send_reschedule(last->cpu); + ret = 1; + } + + spin_unlock(&last->lock); + + spin_unlock_irqrestore(&hsb_cpu_lock, flags); + } + return ret; +} + + + +static inline int hsb_preemption_needed(edf_domain_t* edf, cpu_state_t* state) +{ + /* we need the read lock for edf_ready_queue */ + if (!list_empty(&edf->ready_queue)) + { + if (state->exec_class == RT_CLASS_SOFT) { + if (state->cap) + return time_before(get_deadline(next_ready(edf)) + , state->cap->deadline); + else + return time_before(get_deadline(next_ready(edf)) + , state->cur_deadline); + } else + return 1; + } + return 0; +} + +static inline int cap_preemption_needed(capacity_queue_t* q, cpu_state_t* state) +{ + /* we need the read lock for edf_ready_queue */ + if (!list_empty(&q->queue)) + { + if (state->exec_class == RT_CLASS_SOFT) { + if (state->cap) + return time_before(next_cap(q)->deadline + , state->cap->deadline); + else + return time_before(next_cap(q)->deadline + , state->cur_deadline); + } else + return 1; + } + return 0; +} + +/* hsb_scheduler_tick - this function is called for every local timer + * interrupt. + * + * checks whether the current task has expired and checks + * whether we need to preempt it if it has not expired + */ +static reschedule_check_t hsb_scheduler_tick(void) +{ + unsigned long flags; + struct task_struct *t = current; + int resched = 0; + + cpu_state_t *state = &__get_cpu_var(hsb_cpu_state); + + /* expire tasks even if not in real-time mode + * this makes sure that at the end of real-time mode + * no tasks "run away forever". 
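+	 *
+	 * Budgets are charged below in layers: the BE server first (unless
+	 * we run on a spare capacity), then the spare capacity itself, then
+	 * the HRT server, and finally the task's own time slice.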
+ */ + + /* charge BE server only if we are not running on a spare capacity */ + if (state->be && !state->cap && --state->be->budget <= 0) { + sched_trace_server_completion(state->be->pid, 0, + state->be->deadline, + RT_CLASS_BEST_EFFORT); + be_preempt(&be, state); + resched = 1; + } + + if (state->cap) + if (--state->cap->budget <= 0 || + time_before_eq(state->cap->deadline, jiffies)) { + kfree(state->cap); + state->cap = NULL; + resched = 1; + } + + if (is_realtime(t)) { + if (is_hrt(t) && (--state->hrt.budget <= 0)) { + sched_trace_server_completion( + HRT_BASE_PID + smp_processor_id(), 0, + state->hrt.deadline, RT_CLASS_HARD); + resched = 1; + } + + /* account for received service... */ + t->rt_param.times.exec_time++; + + /* ...and charge current budget */ + if (!state->cap) { + --t->time_slice; + /* a task always should be able to finish its job */ + BUG_ON(!is_be(t) && !t->time_slice && !job_completed(t)); + } + + if (job_completed(t) || (is_be(t) && !t->time_slice)) { + sched_trace_job_completion(t); + set_rt_flags(t, RT_F_SLEEP); + resched = 1; + } + } + + + if (get_rt_mode() == MODE_RT_RUN) + { + try_release_pending(&state->hrt.domain); + check_for_hrt_release(&state->hrt); + try_release_pending(&srt); + be_try_release_pending(&be); + + if (!resched) + switch (state->exec_class) { + case RT_CLASS_HARD: + read_lock_irqsave(&state->hrt.domain.ready_lock, + flags); + resched = preemption_needed(&state->hrt.domain, + t); + read_unlock_irqrestore( + &state->hrt.domain.ready_lock, flags); + break; + + case RT_CLASS_SOFT: + case RT_CLASS_BEST_EFFORT: + local_irq_save(flags); + + /* check for HRT jobs */ + read_lock(&state->hrt.domain.ready_lock); + resched = hrt_client_eligible(&state->hrt); + read_unlock(&state->hrt.domain.ready_lock); + + /* check for spare capacities */ + if (!resched) { + spin_lock(&cap_queue.lock); + resched = + cap_preemption_needed(&cap_queue, + state); + spin_unlock(&cap_queue.lock); + } + + /* check for SRT jobs */ + if (!resched) { + read_lock(&srt.ready_lock); + resched = hsb_preemption_needed( + &srt, state); + read_unlock(&srt.ready_lock); + } + + /* check for BE jobs */ + if (!resched) { + read_lock(&be.ready_lock); + resched = be_preemption_needed( + &be, state); + read_unlock(&be.ready_lock); + } + + /* check for background jobs */ + if (!resched && !is_realtime(current)) + resched = fifo_jobs_pending(&hsb_fifo); + local_irq_restore(flags); + break; + + default: + /* something wrong in the variable */ + BUG(); + } + } + + if (resched) { + set_will_schedule(); + return FORCE_RESCHED; + } else + return NO_RESCHED; +} + +static int schedule_hrt(struct task_struct * prev, + struct task_struct ** next, runqueue_t * rq) +{ + unsigned long flags; + int deactivate = 1; + cpu_state_t *state; + + + state = &__get_cpu_var(hsb_cpu_state); + + write_lock_irqsave(&state->hrt.domain.ready_lock, flags); + + + if (state->cap) { + /* hrt_schedule does not have the cap_queue lock */ + return_capacity(&cap_queue, state->cap); + state->cap = NULL; + } + + if (is_hrt(prev) && is_released(prev) && is_running(prev) + && !preemption_needed(&state->hrt.domain, prev)) { + /* This really should only happen if the task has + * 100% utilization or when we got a bogus/delayed + * resched IPI. 
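+		 * Keeping prev avoids a needless deactivate/activate round
+		 * trip through the run queue.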
+ */ + TRACE("HRT: prev will be next, already released\n"); + *next = prev; + deactivate = 0; + } else { + /* either not yet released, preempted, or non-rt */ + *next = __take_ready(&state->hrt.domain); + /* the logic in hsb_schedule makes sure *next must exist + * if we get here */ + BUG_ON(!*next); + /* stick the task into the runqueue */ + __activate_task(*next, rq); + set_task_cpu(*next, smp_processor_id()); + } + + set_rt_flags(*next, RT_F_RUNNING); + adjust_cpu_queue(RT_CLASS_HARD, get_deadline(*next), NULL); + clear_will_schedule(); + + write_unlock_irqrestore(&state->hrt.domain.ready_lock, flags); + return deactivate; +} + + +static struct task_struct* find_min_slack_task(struct task_struct *prev, + edf_domain_t* edf) +{ + struct list_head *pos; + struct task_struct* tsk = NULL; + struct task_struct* cur; + + if (is_realtime(prev) && is_running(prev) && + get_rt_flags(prev) != RT_F_SLEEP) + tsk = prev; + list_for_each(pos, &edf->ready_queue) { + cur = list_entry(pos, struct task_struct, rt_list); + if (!tsk || task_slack(tsk) > task_slack(cur)) + tsk = cur; + } + return tsk; +} + +static struct task_struct* null_heuristic(struct task_struct *prev, + edf_domain_t* edf, + fifo_domain_t* fifo) +{ + if (fifo_jobs_pending( fifo)) + return NULL; + else if (!list_empty(&edf->ready_queue)) + return list_entry(edf->ready_queue.next, + struct task_struct, rt_list); + else + return NULL; +} + +/*static struct task_struct* history_heuristic(struct task_struct *prev, edf_domain_t* edf) +{ + struct list_head *pos; + struct task_struct* tsk = NULL; + struct task_struct* cur; + + if (is_realtime(prev) && is_running(prev) && + get_rt_flags(prev) != RT_F_SLEEP) + tsk = prev; + list_for_each(pos, &edf->ready_queue) { + cur = list_entry(pos, struct task_struct, rt_list); + if (!tsk || + tsk->rt_param.stats.nontardy_jobs_ctr > + cur->rt_param.stats.nontardy_jobs_ctr) + tsk = cur; + } + if (tsk && tsk->rt_param.stats.nontardy_jobs_ctr < 5) + return tsk; + else + return NULL; +} +*/ +/* TODO: write slack heuristic.*/ +/*static struct task_struct* slack_heuristic(struct task_struct *prev, edf_domain_t* edf) +{ + struct list_head *pos; + struct task_struct* tsk = NULL; + struct task_struct* cur; + + if (is_realtime(prev) && is_running(prev) && + get_rt_flags(prev) != RT_F_SLEEP) + tsk = prev; + list_for_each(pos, &edf->ready_queue) { + cur = list_entry(pos, struct task_struct, rt_list); + if (!tsk || + tsk->rt_param.stats.nontardy_job_ctr > + cur->rt_param.stats.nontardy_job_ctr) + tsk = cur; + } + if (tsk && tsk->rt_param.stats.nontardy_job_ctr < 5) + return tsk; + else + return NULL; +}*/ + + +/* caller holds all locks + */ + +static int schedule_capacity(struct task_struct *prev, + struct task_struct **next, runqueue_t *rq) +{ + cpu_state_t *state = &__get_cpu_var(hsb_cpu_state); + capacity_t* old; + + if (state->cap) { + old = state->cap; + state->cap = __take_capacity(&cap_queue, old->deadline, 1); + if (!state->cap) + state->cap = old; + else + __return_capacity(&cap_queue, old); + } else + state->cap = __take_capacity(&cap_queue, 0, 0); + + + /* pick a task likely to be tardy */ + *next = find_min_slack_task(prev, &srt); + + /* only give away spare capacities if there is no task that + * is going to be tardy + */ + if (*next && task_slack(*next) >= 0) + *next = null_heuristic(prev, &srt, &hsb_fifo); + if (*next && *next != prev) + list_del(&(*next)->rt_list); + + + /* if there is none pick a BE job */ + if (!*next) { + if (is_realtime(prev) && is_be(prev) && is_running(prev) && + get_rt_flags(prev) 
!= RT_F_SLEEP) + *next = prev; + else + *next = fifo_take(&hsb_fifo); + } + + if (state->be) + be_preempt(&be, state); + BUG_ON(!state->cap); + if (*next && state->cap->donor) { + sched_trace_capacity_allocation( + *next, state->cap->budget, state->cap->deadline, + state->cap->donor); + } + + return *next != prev; +} + + + +#define BG 0 +#define SRT 1 +#define BE 2 +#define CAP 3 + +static inline int what_first(edf_domain_t *be, edf_domain_t *srt, capacity_queue_t* q) +{ + jiffie_t sdl = 0, bdl= 0, cdl = 0, cur; + int _srt = !list_empty(&srt->ready_queue); + int _be = !list_empty(&be->ready_queue); + int _cap = __capacity_available(q); + + + int ret = BG; /* nothing ready => background mode*/ + cur = 0; + + if (_srt) + sdl = get_deadline(next_ready(srt)); + if (_be) + bdl = be_next_ready(be)->deadline; + if (_cap) + cdl = next_cap(q)->deadline; + + + + if (_cap) { + ret = CAP; + cur = cdl; + } + if (_srt && (time_before(sdl, cur) || !ret)) { + ret = SRT; + cur = sdl; + } + if (_be && (time_before(bdl, cur) || !ret)) { + ret = BE; + cur = bdl; + } + return ret; +} + + + +static int schedule_srt_be_cap(struct task_struct *prev, + struct task_struct **next, runqueue_t *rq) +{ + task_class_t class = RT_CLASS_BEST_EFFORT; + jiffie_t deadline = 0; + unsigned long flags; + int deactivate = 1; + be_server_t* bes; + cpu_state_t* state; + int type; + +reschedule: + write_lock_irqsave(&srt.ready_lock, flags); + write_lock(&be.ready_lock); + spin_lock(&cap_queue.lock); + + + state = &__get_cpu_var(hsb_cpu_state); + bes = NULL; + + clear_will_schedule(); + + if (is_realtime(prev) && (is_released(prev) || is_be(prev)) && + is_running(prev) && !hsb_preemption_needed(&srt, state) && + !be_preemption_needed(&be, state) + ) { + /* Our current task's next job has already been + * released and has higher priority than the highest + * prioriy waiting task; in other words: it is tardy. + * We just keep it. 
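+		 * (The preemption checks above already established that no
+		 * queued SRT job and no queued BE server needs to preempt
+		 * this CPU.)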
+ */ + TRACE("prev will be next, already released\n"); + *next = prev; + class = prev->rt_param.basic_params.class; + deadline = get_deadline(*next); + deactivate = 0; + } else { + /* either not yet released, preempted, or non-rt */ + type = what_first(&be, &srt, &cap_queue); + switch (type) { + case CAP: + /* capacity */ + deactivate = schedule_capacity(prev, next, rq); + deadline = state->cap->deadline; + if (*next) + class = RT_CLASS_SOFT; + else + class = RT_CLASS_BEST_EFFORT; + break; + case BE: + /* be */ + *next = NULL; + bes = be_take_ready(&be); + if (bes) { + class = RT_CLASS_SOFT; + deadline = bes->deadline; + *next = fifo_take(&hsb_fifo); + if (!*next) { + /* deactivate */ + __release_capacity(&cap_queue, + bes->budget, + bes->deadline, NULL); + bes->budget = 0; + barrier(); + spin_unlock(&cap_queue.lock); + write_unlock(&be.ready_lock); + write_unlock_irqrestore(&srt.ready_lock, + flags); + be_enqueue(&be, bes); + goto reschedule; + } + } + break; + case SRT: + /* srt */ + *next = __take_ready(&srt); + if (*next) { + class = RT_CLASS_SOFT; + deadline = get_deadline(*next); + } + break; + case BG: + /* background server mode */ + class = RT_CLASS_BEST_EFFORT; + deadline = 0; + *next = fifo_take(&hsb_fifo); + break; + } + + + /* give back capacities */ + if (type != CAP && state->cap) { + __return_capacity(&cap_queue, state->cap); + state->cap = NULL; + } + if (*next && deactivate) { + /* mark the task as executing on this cpu */ + set_task_cpu(*next, smp_processor_id()); + /* stick the task into the runqueue */ + __activate_task(*next, rq); + } + } + + adjust_cpu_queue(class, deadline, bes); + + switch (type) { + case BG: + break; + case BE: + be.check_resched(&be); + break; + case SRT: + srt.check_resched(&srt); + break; + case CAP: + if (!list_empty(&cap_queue.queue)) + cap_check_resched(list_entry(cap_queue.queue.next, + capacity_t, list)->deadline); + break; + } + + + if(*next) + set_rt_flags(*next, RT_F_RUNNING); + + spin_unlock(&cap_queue.lock); + write_unlock(&be.ready_lock); + write_unlock_irqrestore(&srt.ready_lock, flags); + return deactivate; +} + + +static int hsb_schedule(struct task_struct * prev, struct task_struct ** next, + runqueue_t * rq) +{ + int need_deactivate = 1; + cpu_state_t *state = NULL; + + preempt_disable(); + + state = &__get_cpu_var(hsb_cpu_state); + + be_preempt(&be, state); + + + if (is_realtime(prev) && !is_be(prev) && + get_rt_flags(prev) == RT_F_SLEEP) + { + TRACE("preparing %d for next period\n", prev->pid); + release_capacity(&cap_queue, prev->time_slice, + prev->rt_param.times.deadline, prev); + prepare_for_next_period(prev); + } + + if (get_rt_mode() == MODE_RT_RUN) { + /* we need to schedule hrt if a hrt job is pending or when + * we have a non expired hrt job on the cpu + */ + + if (hrt_client_eligible(&state->hrt) || + unlikely((is_hrt(prev) && is_running(prev) && + get_rt_flags(prev) != RT_F_SLEEP))) { + if (state->cap) { + return_capacity(&cap_queue, state->cap); + state->cap = NULL; + } + need_deactivate = schedule_hrt(prev, next, rq); + } else + need_deactivate = schedule_srt_be_cap(prev, next, rq); + + } + + if (is_realtime(prev) && need_deactivate && prev->array) { + /* take it out of the run queue */ + deactivate_task(prev, rq); + } + + preempt_enable(); + + return 0; +} + +/* put task into correct queue */ +static inline void hsb_add_release(struct task_struct *t) +{ + if (is_hrt(t)) + add_release(hrt_dom(get_partition(t)), t); + else if (is_srt(t)) + add_release(&srt, t); + else if (is_be(t)) { + t->time_slice = 0; + 
fifo_enqueue(&hsb_fifo, t); + fifo_check_resched(); + } else + BUG(); + +} + +/* put task into correct queue */ +static inline void hsb_add_ready(struct task_struct *t) +{ + if (is_hrt(t)) + add_ready(hrt_dom(get_partition(t)), t); + else if (is_srt(t)) + add_ready(&srt, t); + else if (is_be(t)) { + fifo_enqueue(&hsb_fifo, t); + fifo_check_resched(); + } + else + BUG(); +} + + +/* _finish_switch - we just finished the switch away from prev + * it is now safe to requeue the task + */ +static void hsb_finish_switch(struct task_struct *prev) +{ + if (!is_realtime(prev) || !is_running(prev)) + return; + + TRACE("finish switch for %d\n", prev->pid); + + if (is_be(prev)) { + fifo_enqueue(&hsb_fifo, prev); + return; + } + + if (get_rt_flags(prev) == RT_F_SLEEP || + get_rt_mode() != MODE_RT_RUN) { + /* this task has expired + * _schedule has already taken care of updating + * the release and + * deadline. We just must check if has been released. + */ + if (is_released(prev) && get_rt_mode() == MODE_RT_RUN) { + sched_trace_job_release(prev); + hsb_add_ready(prev); + TRACE("%d goes straight to ready queue\n", prev->pid); + } + else + /* it has got to wait */ + hsb_add_release(prev); + } + else { + /* this is a forced preemption + * thus the task stays in the ready_queue + * we only must make it available to other cpus + */ + hsb_add_ready(prev); + } +} + + +/* Prepare a task for running in RT mode + * Enqueues the task into master queue data structure + * returns + * -EPERM if task is not TASK_STOPPED + */ +static long hsb_prepare_task(struct task_struct * t) +{ + TRACE("edf-hsb: prepare task %d\n", t->pid); + + if (t->state == TASK_STOPPED) { + __setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1); + + if (get_rt_mode() == MODE_RT_RUN && !is_be(t)) + /* The action is already on. + * Prepare immediate release + */ + prepare_new_release(t); + /* The task should be running in the queue, otherwise signal + * code will try to wake it up with fatal consequences. + */ + t->state = TASK_RUNNING; + if (is_be(t)) + t->rt_param.times.deadline = 0; + hsb_add_release(t); + return 0; + } + else + return -EPERM; +} + +static void hsb_wake_up_task(struct task_struct *task) +{ + /* We must determine whether task should go into the release + * queue or into the ready queue. It may enter the ready queue + * if it has credit left in its time slice and has not yet reached + * its deadline. If it is now passed its deadline we assume this the + * arrival of a new sporadic job and thus put it in the ready queue + * anyway.If it has zero budget and the next release is in the future + * it has to go to the release queue. + */ + TRACE("edf-hsb: wake up %d with budget=%d\n", + task->pid, task->time_slice); + task->state = TASK_RUNNING; + + if (is_be(task)) { + hsb_add_release(task); + } + else if (is_tardy(task)) { + /* new sporadic release */ + prepare_new_release(task); + sched_trace_job_release(task); + hsb_add_ready(task); + } + else if (task->time_slice) { + /* came back in time before deadline + * TODO: clip budget to fit into period, otherwise it could + * cause a deadline overrun in the next period, i.e. + * over allocation in the next period. + */ + set_rt_flags(task, RT_F_RUNNING); + hsb_add_ready(task); + } + else { + hsb_add_release(task); + } + +} + +static void hsb_task_blocks(struct task_struct *t) +{ + /* CLEANUP: The BUG_ON actually triggerd in a really weierd case if a + * BEST_EFFORT gets caught in a migration right after execv + * The next version of Litmus should deal with this more gracefully. 
+ */ + + /*BUG_ON(!is_realtime(t));*/ + /* not really anything to do since it can only block if + * it is running, and when it is not running it is not in any + * queue anyway. + * + * TODO: Check whether the assumption is correct for SIGKILL and + * SIGSTOP. + */ + TRACE("task %d blocks with budget=%d\n", t->pid, t->time_slice); + /*BUG_ON(t->rt_list.next != LIST_POISON1);*/ + /*BUG_ON(t->rt_list.prev != LIST_POISON2);*/ + + if (is_be(t)) + sched_trace_job_completion(t); +} + + +/* When _tear_down is called, the task should not be in any queue any more + * as it must have blocked first. We don't have any internal state for the task, + * it is all in the task_struct. + */ +static long hsb_tear_down(struct task_struct * t) +{ + /* CLEANUP: see hsb_task_blocks */ + /*BUG_ON(!is_realtime(t)); + TRACE("edf-hsb: tear down called for %d \n", t->pid); + BUG_ON(t->array); + BUG_ON(t->rt_list.next != LIST_POISON1); + BUG_ON(t->rt_list.prev != LIST_POISON2);*/ + return 0; +} + +static int hsb_mode_change(int new_mode) +{ + int cpu; + cpu_state_t *entry; + jiffie_t start; + + TRACE("[%d] edf-hsb: mode changed to %d\n", smp_processor_id(), + new_mode); + if (new_mode == MODE_RT_RUN) { + start = jiffies + 20; + prepare_new_releases(&srt, start); + be_prepare_new_releases(&be, start); + + /* initialize per CPU state + * we can't do this at boot time because we don't know + * which CPUs will be online and we can't put non-existing + * cpus into the queue + */ + spin_lock(&hsb_cpu_lock); + /* get old cruft out of the way in case we reenter real-time + * mode for a second time + */ + while (!list_empty(&hsb_cpu_queue)) + list_del(hsb_cpu_queue.next); + /* reinitialize */ + for_each_online_cpu(cpu) { + entry = &per_cpu(hsb_cpu_state, cpu); + atomic_set(&entry->will_schedule, 0); + entry->exec_class = RT_CLASS_BEST_EFFORT; + entry->cur_deadline = 0; + list_add(&entry->list, &hsb_cpu_queue); + + prepare_new_releases(&entry->hrt.domain, start); + prepare_hrt_release(&entry->hrt, start); + } + spin_unlock(&hsb_cpu_lock); + + } + TRACE("[%d] edf-hsb: mode change done\n", smp_processor_id()); + return 0; +} + + +typedef enum { + EDF_HSB_SET_HRT, + EDF_HSB_GET_HRT, + EDF_HSB_CREATE_BE +} edf_hsb_setup_cmds_t; + +typedef struct { + int cpu; + unsigned int wcet; + unsigned int period; +} setup_hrt_param_t; + +typedef struct { + unsigned int wcet; + unsigned int period; +} create_be_param_t; + +typedef struct { + union { + setup_hrt_param_t setup_hrt; + create_be_param_t create_be; + }; +} param_t; + +static pid_t next_be_server_pid = SRT_BASE_PID; + +static int hsb_scheduler_setup(int cmd, void __user* up) +{ + unsigned long flags; + int error = -EINVAL; + cpu_state_t* state; + be_server_t* srv; + param_t param; + + switch (cmd) { + case EDF_HSB_SET_HRT: + if (copy_from_user(¶m, up, sizeof(setup_hrt_param_t))) { + error = -EFAULT; + goto out; + } + if (!cpu_online(param.setup_hrt.cpu)) { + printk(KERN_WARNING "scheduler setup: " + "CPU %d is not online!\n", param.setup_hrt.cpu); + error = -EINVAL; + goto out; + } + if (param.setup_hrt.period < param.setup_hrt.wcet) { + printk(KERN_WARNING "period < wcet!\n"); + error = -EINVAL; + goto out; + } + + state = &per_cpu(hsb_cpu_state, param.setup_hrt.cpu); + spin_lock_irqsave(&state->lock, flags); + + state->hrt.wcet = param.setup_hrt.wcet; + state->hrt.period = param.setup_hrt.period; + + spin_unlock_irqrestore(&state->lock, flags); + + printk(KERN_WARNING "edf-hsb: set HRT #%d to (%d, %d)\n", + param.setup_hrt.cpu, param.setup_hrt.wcet, + param.setup_hrt.period); + + 
error = 0; + + break; + + case EDF_HSB_GET_HRT: + if (copy_from_user(¶m, up, sizeof(setup_hrt_param_t))) { + error = -EFAULT; + goto out; + } + if (!cpu_online(param.setup_hrt.cpu)) { + error = -EINVAL; + goto out; + } + state = &per_cpu(hsb_cpu_state, param.setup_hrt.cpu); + spin_lock_irqsave(&state->lock, flags); + + param.setup_hrt.wcet = state->hrt.wcet; + param.setup_hrt.period = state->hrt.period; + + spin_unlock_irqrestore(&state->lock, flags); + + if (copy_to_user(up, ¶m, sizeof(setup_hrt_param_t))) { + error = -EFAULT; + goto out; + } + error = 0; + break; + + case EDF_HSB_CREATE_BE: + if (copy_from_user(¶m, up, sizeof(create_be_param_t))) { + error = -EFAULT; + goto out; + } + if (param.create_be.period < param.create_be.wcet || + !param.create_be.period || !param.create_be.wcet) { + error = -EINVAL; + goto out; + } + srv = (be_server_t*) kmalloc(sizeof(be_server_t), GFP_KERNEL); + if (!srv) { + error = -ENOMEM; + goto out; + } + srv->wcet = param.create_be.wcet; + srv->period = param.create_be.period; + srv->pid = next_be_server_pid++; + INIT_LIST_HEAD(&srv->list); + be_prepare_new_release(srv, jiffies); + be_enqueue(&be, srv); + + printk(KERN_WARNING "edf-hsb: created a BE with (%d, %d)\n", + param.create_be.wcet, param.create_be.period); + + error = 0; + break; + + default: + printk(KERN_WARNING "edf-hsb: unknown command %d\n", cmd); + } + +out: + return error; +} + +/* Plugin object */ +static sched_plugin_t s_plugin __cacheline_aligned_in_smp = { + .ready_to_use = 0 +}; + + +/* + * Plugin initialization code. + */ +#define INIT_SCHED_PLUGIN (struct sched_plugin){\ + .plugin_name = "EDF-HSB",\ + .ready_to_use = 1,\ + .algo_scheduler_tick = hsb_scheduler_tick,\ + .scheduler_tick = rt_scheduler_tick,\ + .prepare_task = hsb_prepare_task,\ + .sleep_next_period = edf_sleep_next_period,\ + .tear_down = hsb_tear_down,\ + .shutdown_hook = 0,\ + .schedule = hsb_schedule,\ + .finish_switch = hsb_finish_switch,\ + .mode_change = hsb_mode_change,\ + .wake_up_task = hsb_wake_up_task,\ + .task_blocks = hsb_task_blocks, \ + .scheduler_setup = hsb_scheduler_setup \ +} + + +sched_plugin_t *__init init_edf_hsb_plugin(void) +{ + int i; + + if (!s_plugin.ready_to_use) + { + set_sched_options(SCHED_NONE); + capacity_queue_init(&cap_queue); + edf_domain_init(&srt, srt_check_resched); + edf_domain_init(&be, be_check_resched); + fifo_domain_init(&hsb_fifo, 50); + for (i = 0; i < NR_CPUS; i++) + { + hsb_cpu_state_init(&per_cpu(hsb_cpu_state, i), + hrt_check_resched, i); + printk("HRT server %d initialized.\n", i); + } + s_plugin = INIT_SCHED_PLUGIN; + } + return &s_plugin; +} diff --git a/kernel/sched_global_edf.c b/kernel/sched_global_edf.c new file mode 100644 index 0000000..0781de1 --- /dev/null +++ b/kernel/sched_global_edf.c @@ -0,0 +1,565 @@ +/* + * kernel/sched-global-edf.c + * + * Re-Implementation of the Global EDF scheduler. + * + * This version works without using the struct queue. It uses the + * builtin kernel lists. + */ + +#include +#include +#include + +#include +#include +#include +#include + + +/* cpu_entry_t - maintain state of the priority of cpu's current task + * this is needed to check for priority inversions. 
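+ *
+ * Entries are kept in gedf_cpu_queue, ordered by the deadline of the task
+ * each CPU currently runs (CPUs without real-time work at the tail), so the
+ * last entry is the one that should be preempted first.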
+ */ +typedef struct { + int cpu; + int executes_realtime; + jiffie_t cur_deadline; + struct list_head list; + atomic_t will_schedule; +} cpu_entry_t; +DEFINE_PER_CPU(cpu_entry_t, gedf_cpu_entries); + +#define set_will_schedule() \ + (atomic_set(&__get_cpu_var(gedf_cpu_entries).will_schedule, 1)) +#define clear_will_schedule() \ + (atomic_set(&__get_cpu_var(gedf_cpu_entries).will_schedule, 0)) +#define test_will_schedule(cpu) \ + (atomic_read(&per_cpu(gedf_cpu_entries, cpu).will_schedule)) + + +/* always acquire the cpu lock as the last lock to avoid deadlocks */ +static spinlock_t gedf_cpu_lock = SPIN_LOCK_UNLOCKED; +/* the cpus queue themselves according to priority in here */ +static LIST_HEAD(gedf_cpu_queue); + + +static edf_domain_t gedf; + +#define DUMP(args...) TRACE(args) + +/* adjust_cpu_queue - Move the cpu entry to the correct place to maintain + * order in the cpu queue. Caller must hold ready write lock. + * + */ +static void adjust_cpu_queue(int exec_rt, jiffie_t deadline) +{ + struct list_head *pos; + cpu_entry_t *other; + cpu_entry_t *entry; + + spin_lock(&gedf_cpu_lock); + + entry = &__get_cpu_var(gedf_cpu_entries); + entry->executes_realtime = exec_rt; + entry->cur_deadline = deadline; + + /* TODO: move instead of del+reinsert */ + list_del(&entry->list); + /* if we do not execute real-time jobs we just move + * to the end of the queue + */ + if (entry->executes_realtime) + list_for_each(pos, &gedf_cpu_queue) { + other = list_entry(pos, cpu_entry_t, list); + if (!other->executes_realtime || + time_before_eq(entry->cur_deadline, + other->cur_deadline)) + { + __list_add(&entry->list, pos->prev, pos); + goto out; + } + } + /* if we get this far we have the lowest priority task */ + list_add_tail(&entry->list, &gedf_cpu_queue); + + out: + spin_unlock(&gedf_cpu_lock); +} + + +/* check_reschedule_needed - Check whether another CPU needs to reschedule. + * + * The function only checks and kicks the last CPU. It will reschedule and + * kick the next if necessary, and so on. The caller is responsible for making + * sure that it is not the last entry or that a reschedule is not necessary. + * + * TODO: This function is probably way too trigger happy. It should only send + * IPIs if the other CPU is not going to reschedule anyway. But that is + * hard to detect reliably. Too many schedules will hurt performance + * but do not cause incorrect schedules. + */ +static int gedf_check_resched(edf_domain_t *edf) +{ + cpu_entry_t *last; + int ret = 0; + + spin_lock(&gedf_cpu_lock); + + if (!list_empty(&edf->ready_queue)) { + last = list_entry(gedf_cpu_queue.prev, cpu_entry_t, list); + if (!last->executes_realtime || + time_before(next_ready(edf)->rt_param.times.deadline, + last->cur_deadline)) + { + if (smp_processor_id() == last->cpu) + set_tsk_need_resched(current); + else + if (!test_will_schedule(last->cpu)) + smp_send_reschedule(last->cpu); + ret = 1; + } + } + + spin_unlock(&gedf_cpu_lock); + return ret; +} + + + +/* gedf_scheduler_tick - this function is called for every local timer + * interrupt. + * + * checks whether the current task has expired and checks + * whether we need to preempt it if it has not expired + */ +static reschedule_check_t gedf_scheduler_tick(void) +{ + unsigned long flags; + struct task_struct *t = current; + reschedule_check_t want_resched = NO_RESCHED; + + /* expire tasks even if not in real-time mode + * this makes sure that at the end of real-time mode + * no tasks "run away forever". 
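+	 *
+	 * The BUG_ON below is merely a sanity check that the remaining
+	 * budget is still within a plausible range.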
+ */ + BUG_ON(is_realtime(t) && t->time_slice > 100000); + if (is_realtime(t) && (!--t->time_slice)) { + /* this task has exhausted its budget in this period */ + set_rt_flags(t, RT_F_SLEEP); + want_resched = FORCE_RESCHED; + set_will_schedule(); + sched_trace_job_completion(t); + } + if (get_rt_mode() == MODE_RT_RUN) + { + /* check whether anything is waiting to be released + * this could probably be moved to the global timer + * interrupt handler since the state will only change + * once per jiffie + */ + try_release_pending(&gedf); + if (want_resched != FORCE_RESCHED) + { + read_lock_irqsave(&gedf.ready_lock, flags); + if (preemption_needed(&gedf, t)) + { + want_resched = FORCE_RESCHED; + set_will_schedule(); + } + read_unlock_irqrestore(&gedf.ready_lock, flags); + } + } + return want_resched; +} + +/* This is main Global EDF schedule function + * + * Assumes the caller holds the lock for rq and that irqs are disabled + * This is function only works for indirect switching + */ +static int gedf_schedule(struct task_struct * prev, + struct task_struct ** next, + runqueue_t * rq) +{ + int need_deactivate = 1; + int rt; + jiffie_t deadline; + unsigned long flags; + + + if (is_realtime(prev) && get_rt_flags(prev) == RT_F_SLEEP) + { + DUMP("preparing %d for next period\n", prev->pid); + prepare_for_next_period(prev); + } + + if (get_rt_mode() == MODE_RT_RUN) { + write_lock_irqsave(&gedf.ready_lock, flags); + + clear_will_schedule(); + + if (is_realtime(prev) && is_released(prev) && is_running(prev) + && !preemption_needed(&gedf, prev)) { + /* Our current task's next job has already been + * released and has higher priority than the highest + * prioriy waiting task; in other words: it is tardy. + * We just keep it. + */ + DUMP("prev will be next, already released\n"); + *next = prev; + rt = 1; + deadline = prev->rt_param.times.deadline; + need_deactivate = 0; + } else { + /* either not yet released, preempted, or non-rt */ + *next = __take_ready(&gedf); + if (*next) { + /* mark the task as executing on this cpu */ + set_task_cpu(*next, smp_processor_id()); + + /* stick the task into the runqueue */ + __activate_task(*next, rq); + rt = 1; + deadline = (*next)->rt_param.times.deadline; + } + else + rt = deadline = 0; + } + + adjust_cpu_queue(rt, deadline); + + if (rt) { + set_rt_flags(*next, RT_F_RUNNING); + gedf.check_resched(&gedf); + } + write_unlock_irqrestore(&gedf.ready_lock, flags); + } + + if (is_realtime(prev) && need_deactivate && prev->array) { + /* take it out of the run queue */ + deactivate_task(prev, rq); + } + + /* don't put back into release yet. + * We first need to actually switch + * stacks before we can execute it + * on a different CPU */ + + /* in the current implementation nobody cares about the return value */ + return 0; +} + + +/* _finish_switch - we just finished the switch away from prev + * it is now safe to requeue the task + */ +static void gedf_finish_switch(struct task_struct *prev) +{ + if (!is_realtime(prev) || !is_running(prev)) + return; + + /*printk(KERN_INFO "gedf finish switch for %d\n", prev->pid);*/ + if (get_rt_flags(prev) == RT_F_SLEEP || + get_rt_mode() != MODE_RT_RUN) { + /* this task has expired + * _schedule has already taken care of updating + * the release and + * deadline. We just must check if has been released. 
+ */ + if (time_before_eq(prev->rt_param.times.release, jiffies) + && get_rt_mode() == MODE_RT_RUN) { + /* already released */ + add_ready(&gedf, prev); + DUMP("%d goes straight to ready queue\n", prev->pid); + } + else + /* it has got to wait */ + add_release(&gedf, prev); + } + else { + /* this is a forced preemption + * thus the task stays in the ready_queue + * we only must make it available to others + */ + add_ready(&gedf, prev); + } +} + + +/* Prepare a task for running in RT mode + * Enqueues the task into master queue data structure + * returns + * -EPERM if task is not TASK_STOPPED + */ +static long gedf_prepare_task(struct task_struct * t) +{ + TRACE("global edf: prepare task %d\n", t->pid); + + if (t->state == TASK_STOPPED) { + __setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1); + + if (get_rt_mode() == MODE_RT_RUN) + /* The action is already on. + * Prepare immediate release + */ + prepare_new_release(t); + /* The task should be running in the queue, otherwise signal + * code will try to wake it up with fatal consequences. + */ + t->state = TASK_RUNNING; + add_release(&gedf, t); + return 0; + } + else + return -EPERM; +} + +static void gedf_wake_up_task(struct task_struct *task) +{ + /* We must determine whether task should go into the release + * queue or into the ready queue. It may enter the ready queue + * if it has credit left in its time slice and has not yet reached + * its deadline. If it is now passed its deadline we assume this the + * arrival of a new sporadic job and thus put it in the ready queue + * anyway.If it has zero budget and the next release is in the future + * it has to go to the release queue. + */ + TRACE("global edf: wake up %d with budget=%d\n", + task->pid, task->time_slice); + task->state = TASK_RUNNING; + if (is_tardy(task)) { + /* new sporadic release */ + prepare_new_release(task); + sched_trace_job_release(task); + add_ready(&gedf, task); + } + else if (task->time_slice) { + /* came back in time before deadline + * TODO: clip budget to fit into period, otherwise it could + * cause a deadline overrun in the next period, i.e. + * over allocation in the next period. + */ + set_rt_flags(task, RT_F_RUNNING); + add_ready(&gedf, task); + } + else { + add_release(&gedf, task); + } + +} + +static void gedf_task_blocks(struct task_struct *t) +{ + BUG_ON(!is_realtime(t)); + /* not really anything to do since it can only block if + * it is running, and when it is not running it is not in any + * queue anyway. + * + * TODO: Check whether the assumption is correct for SIGKILL and + * SIGSTOP. + */ + TRACE("task %d blocks with budget=%d\n", t->pid, t->time_slice); + BUG_ON(t->rt_list.next != LIST_POISON1); + BUG_ON(t->rt_list.prev != LIST_POISON2); +} + + +/* When _tear_down is called, the task should not be in any queue any more + * as it must have blocked first. We don't have any internal state for the task, + * it is all in the task_struct. 
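+ *
+ * The BUG_ONs below simply assert this: the task must be off the run queue
+ * (p->array is NULL) and off any rt_list (list_del() leaves the poison
+ * pointers in place).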
+ */ +static long gedf_tear_down(struct task_struct * t) +{ + BUG_ON(!is_realtime(t)); + TRACE("global edf: tear down called for %d \n", t->pid); + BUG_ON(t->array); + BUG_ON(t->rt_list.next != LIST_POISON1); + BUG_ON(t->rt_list.prev != LIST_POISON2); + return 0; +} + + +static int gedf_mode_change(int new_mode) +{ + int cpu; + cpu_entry_t *entry; + +/* printk(KERN_INFO "[%d] global edf: mode changed to %d\n", smp_processor_id(), + new_mode);*/ + if (new_mode == MODE_RT_RUN) { + prepare_new_releases(&gedf, jiffies + 10); + + /* initialize per CPU state + * we can't do this at boot time because we don't know + * which CPUs will be online and we can't put non-existing + * cpus into the queue + */ + spin_lock(&gedf_cpu_lock); + /* get old cruft out of the way in case we reenter real-time + * mode for a second time + */ + while (!list_empty(&gedf_cpu_queue)) + list_del(gedf_cpu_queue.next); + /* reinitialize */ + for_each_online_cpu(cpu) { + entry = &per_cpu(gedf_cpu_entries, cpu); + atomic_set(&entry->will_schedule, 0); + entry->executes_realtime = 0; + entry->cur_deadline = 0; + entry->cpu = cpu; + list_add(&entry->list, &gedf_cpu_queue); + } + spin_unlock(&gedf_cpu_lock); + } + /*printk(KERN_INFO "[%d] global edf: mode change done\n", smp_processor_id()); */ + return 0; +} + + +/* Plugin object */ +static sched_plugin_t s_plugin __cacheline_aligned_in_smp = { + .ready_to_use = 0 +}; + + +/* + * Plugin initialization code. + */ +#define INIT_SCHED_PLUGIN (struct sched_plugin){\ + .plugin_name = "Global EDF",\ + .ready_to_use = 1,\ + .algo_scheduler_tick = gedf_scheduler_tick,\ + .scheduler_tick = rt_scheduler_tick,\ + .prepare_task = gedf_prepare_task,\ + .sleep_next_period = edf_sleep_next_period,\ + .tear_down = gedf_tear_down,\ + .shutdown_hook = 0,\ + .schedule = gedf_schedule,\ + .finish_switch = gedf_finish_switch,\ + .mode_change = gedf_mode_change,\ + .wake_up_task = gedf_wake_up_task,\ + .task_blocks = gedf_task_blocks \ + } + + +sched_plugin_t *__init init_global_edf_plugin(void) +{ + if (!s_plugin.ready_to_use) + { + set_sched_options(SCHED_NONE); + edf_domain_init(&gedf, gedf_check_resched); + s_plugin = INIT_SCHED_PLUGIN; + } + return &s_plugin; +} + + + +/*****************************************************************************/ +/*****************************************************************************/ +/*****************************************************************************/ +/* NON-PREEMPTIVE GLOBAL EDF */ + + +/* gedf_np_scheduler_tick - this function is called for every local timer + * interrupt. + * + * checks whether the current task has expired and checks + * whether we need to preempt it if it has not expired + */ +static reschedule_check_t gedf_np_scheduler_tick(void) +{ + if (get_rt_mode() == MODE_RT_RUN) + { + /* check whether anything is waiting to be released + * this could probably be moved to the global timer + * interrupt handler since the state will only change + * once per jiffie + */ + try_release_pending(&gedf); + } + + /* expire tasks even if not in real-time mode + * this makes sure that at the end of real-time mode + * no tasks "run away forever". + */ + BUG_ON(current->time_slice > 1000); + if (is_realtime(current) && (!--current->time_slice)) { + /* this task has exhausted its budget in this period */ + set_rt_flags(current, RT_F_SLEEP); + return FORCE_RESCHED; + } + else + return NO_RESCHED; +} + +/* gedf_np_check_resched - Check whether another CPU needs to reschedule. + * + * The function only checks and kicks the last CPU. 
It will reschedule and + * kick the next if necessary, and so on. The caller is responsible for making + * sure that it is not the last entry or that a reschedule is not necessary. + * + */ +static int gedf_np_check_resched(edf_domain_t *edf) +{ + cpu_entry_t *last; + int ret = 0; + + spin_lock(&gedf_cpu_lock); + + if (!list_empty(&edf->ready_queue)) { + last = list_entry(gedf_cpu_queue.prev, cpu_entry_t, list); + /* preemption happens only for non-realtime tasks */ + if (!last->executes_realtime) + { + if (smp_processor_id() == last->cpu) + set_tsk_need_resched(current); + else + smp_send_reschedule(last->cpu); + ret = 1; + goto out; + } + } + + out: + spin_unlock(&gedf_cpu_lock); + return ret; +} + + +/* non-preemptive global EDF + * + * Non-preemptive EDF is almost the same as normal EDF. We only have to + * adjust the scheduler tick and the resched function. + */ +#define INIT_SCHED_PLUGIN_NP (struct sched_plugin){\ + .plugin_name = "Non-Preemptive Global EDF",\ + .ready_to_use = 1,\ + .algo_scheduler_tick = gedf_np_scheduler_tick,\ + .scheduler_tick = rt_scheduler_tick,\ + .prepare_task = gedf_prepare_task,\ + .sleep_next_period = edf_sleep_next_period,\ + .tear_down = gedf_tear_down,\ + .shutdown_hook = 0,\ + .schedule = gedf_schedule,\ + .finish_switch = gedf_finish_switch,\ + .mode_change = gedf_mode_change,\ + .wake_up_task = gedf_wake_up_task,\ + .task_blocks = gedf_task_blocks \ + } + + +/* as we only set the plugin at boot time, + * we use the same structure as preemptive EDF. This simplifies a lot + * of the funtions. + */ +sched_plugin_t* __init init_global_edf_np_plugin(void) +{ + if (!s_plugin.ready_to_use) + { + set_sched_options(SCHED_NONE); + edf_domain_init(&gedf, gedf_np_check_resched); + s_plugin = INIT_SCHED_PLUGIN_NP; + } + return &s_plugin; +} diff --git a/kernel/sched_gsn_edf.c b/kernel/sched_gsn_edf.c new file mode 100644 index 0000000..9042588 --- /dev/null +++ b/kernel/sched_gsn_edf.c @@ -0,0 +1,760 @@ +/* + * kernel/sched_gsn_edf.c + * + * Implementation of the GSN-EDF scheduling algorithm. + * + * This version uses the simple approach and serializes all scheduling + * decisions by the use of a queue lock. This is probably not the + * best way to do it, but it should suffice for now. It should not + * affect the benchmarks since all synchronization primitives will + * take the same performance hit, if any. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +int in_gsnedf_schedule[NR_CPUS] = {0, 0, 0, 0}; +int in_gsnedf_scheduler_tick[NR_CPUS] = {0, 0, 0, 0}; +int in_gsnedf_finish_switch[NR_CPUS] = {0, 0, 0, 0}; + +/* cpu_entry_t - maintain the linked and scheduled state + */ +typedef struct { + int cpu; + struct task_struct* linked; /* only RT tasks */ + struct task_struct* scheduled; /* only RT tasks */ + struct list_head list; + atomic_t will_schedule; /* prevent unneeded IPIs */ +} cpu_entry_t; +DEFINE_PER_CPU(cpu_entry_t, gsnedf_cpu_entries); + +#define set_will_schedule() \ + (atomic_set(&__get_cpu_var(gsnedf_cpu_entries).will_schedule, 1)) +#define clear_will_schedule() \ + (atomic_set(&__get_cpu_var(gsnedf_cpu_entries).will_schedule, 0)) +#define test_will_schedule(cpu) \ + (atomic_read(&per_cpu(gsnedf_cpu_entries, cpu).will_schedule)) + + +#define NO_CPU 0xffffffff + +/* The gsnedf_lock is used to serialize all scheduling events. 
+ * It protects + */ +static queuelock_t gsnedf_lock; +/* the cpus queue themselves according to priority in here */ +static LIST_HEAD(gsnedf_cpu_queue); + +static edf_domain_t gsnedf; + + +/* update_cpu_position - Move the cpu entry to the correct place to maintain + * order in the cpu queue. Caller must hold gsnedf lock. + */ +static void update_cpu_position(cpu_entry_t *entry) +{ + cpu_entry_t *other; + struct list_head *pos; + list_del(&entry->list); + /* if we do not execute real-time jobs we just move + * to the end of the queue + */ + if (entry->linked) { + list_for_each(pos, &gsnedf_cpu_queue) { + other = list_entry(pos, cpu_entry_t, list); + if (edf_higher_prio(entry->linked, other->linked)) { + __list_add(&entry->list, pos->prev, pos); + return; + } + } + } + /* if we get this far we have the lowest priority job */ + list_add_tail(&entry->list, &gsnedf_cpu_queue); +} + +/* link_task_to_cpu - Update the link of a CPU. + * Handles the case where the to-be-linked task is already + * scheduled on a different CPU. + */ +static noinline void link_task_to_cpu(struct task_struct* linked, + cpu_entry_t *entry) + +{ + cpu_entry_t *sched; + struct task_struct* tmp; + int on_cpu; + + BUG_ON(linked && !is_realtime(linked)); + + /* Currently linked task is set to be unlinked. */ + if (entry->linked) { + entry->linked->rt_param.linked_on = NO_CPU; + } + + /* Link new task to CPU. */ + if (linked) { + set_rt_flags(linked, RT_F_RUNNING); + /* handle task is already scheduled somewhere! */ + on_cpu = linked->rt_param.scheduled_on; + if (on_cpu != NO_CPU) { + sched = &per_cpu(gsnedf_cpu_entries, on_cpu); + /* this should only happen if not linked already */ + BUG_ON(sched->linked == linked); + + /* If we are already scheduled on the CPU to which we + * wanted to link, we don't need to do the swap -- + * we just link ourselves to the CPU and depend on + * the caller to get things right. + */ + if (entry != sched) { + tmp = sched->linked; + linked->rt_param.linked_on = sched->cpu; + sched->linked = linked; + update_cpu_position(sched); + linked = tmp; + } + } + if (linked) /* might be NULL due to swap */ + linked->rt_param.linked_on = entry->cpu; + } + entry->linked = linked; + update_cpu_position(entry); +} + +/* unlink - Make sure a task is not linked any longer to an entry + * where it was linked before. Must hold gsnedf_lock. + */ +static noinline void unlink(struct task_struct* t) +{ + cpu_entry_t *entry; + + BUG_ON(!t); + + if (t->rt_param.linked_on != NO_CPU) { + /* unlink */ + entry = &per_cpu(gsnedf_cpu_entries, t->rt_param.linked_on); + t->rt_param.linked_on = NO_CPU; + link_task_to_cpu(NULL, entry); + } else if (in_list(&t->rt_list)) { + /* This is an interesting situation: t is scheduled, + * but was just recently unlinked. It cannot be + * linked anywhere else (because then it would have + * been relinked to this CPU), thus it must be in some + * queue. We must remove it from the list in this + * case. + */ + list_del(&t->rt_list); + } +} + + +/* preempt - force a CPU to reschedule + */ +static noinline void preempt(cpu_entry_t *entry) +{ + if (entry->scheduled && is_np(entry->scheduled)) + return; + if (smp_processor_id() == entry->cpu) + set_tsk_need_resched(current); + else + if (!test_will_schedule(entry->cpu)) + smp_send_reschedule(entry->cpu); +} + +/* requeue - Put an unlinked task into gsn-edf domain. + * Caller must hold gsnedf_lock. 
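+ *
+ * A job that has completed (RT_F_SLEEP), or any job requeued outside of
+ * real-time mode, goes to the ready queue only if it is already released
+ * and the mode is MODE_RT_RUN; otherwise it is placed on the release queue.
+ * A job that was merely preempted goes straight back into the ready queue.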
+ */ +static noinline void requeue(struct task_struct* task) +{ + BUG_ON(!task); + /* sanity check rt_list before insertion */ + BUG_ON(in_list(&task->rt_list)); + + if (get_rt_flags(task) == RT_F_SLEEP || + get_rt_mode() != MODE_RT_RUN) { + /* this task has expired + * _schedule has already taken care of updating + * the release and + * deadline. We just must check if it has been released. + */ + if (is_released(task) && get_rt_mode() == MODE_RT_RUN) + __add_ready(&gsnedf, task); + else { + /* it has got to wait */ + __add_release(&gsnedf, task); + } + + } else + /* this is a forced preemption + * thus the task stays in the ready_queue + * we only must make it available to others + */ + __add_ready(&gsnedf, task); +} + +/* gsnedf_job_arrival: task is either resumed or released */ +static noinline void gsnedf_job_arrival(struct task_struct* task) +{ + cpu_entry_t* last; + + BUG_ON(list_empty(&gsnedf_cpu_queue)); + BUG_ON(!task); + + /* first queue arriving job */ + requeue(task); + + /* then check for any necessary preemptions */ + last = list_entry(gsnedf_cpu_queue.prev, cpu_entry_t, list); + if (preemption_needed(&gsnedf, last->linked)) { + /* preemption necessary */ + task = __take_ready(&gsnedf); + TRACE("job_arrival: task %d linked to %d\n", task->pid, last->cpu); + if (last->linked) + requeue(last->linked); + + link_task_to_cpu(task, last); + preempt(last); + } +} + +/* check for current job releases */ +static noinline void gsnedf_release_jobs(void) +{ + struct list_head *pos, *save; + struct task_struct *queued; + + list_for_each_safe(pos, save, &gsnedf.release_queue) { + queued = list_entry(pos, struct task_struct, rt_list); + if (likely(is_released(queued))) { + /* this one is ready to go*/ + list_del(pos); + set_rt_flags(queued, RT_F_RUNNING); + + sched_trace_job_release(queued); + gsnedf_job_arrival(queued); + } + else + /* the release queue is ordered */ + break; + } +} + +/* gsnedf_scheduler_tick - this function is called for every local timer + * interrupt. + * + * checks whether the current task has expired and checks + * whether we need to preempt it if it has not expired + */ +static reschedule_check_t gsnedf_scheduler_tick(void) +{ + unsigned long flags; + struct task_struct* t = current; + reschedule_check_t want_resched = NO_RESCHED; + cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries); + + /* debug */ + in_gsnedf_scheduler_tick[smp_processor_id()] = 1; + + /* expire tasks even if not in real-time mode + * this makes sure that at the end of real-time mode + * no task "runs away forever". 
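+ *
+ * When the budget reaches zero the job is marked RT_F_SLEEP, its next
+ * period is set up via prepare_for_next_period(), and the task is
+ * unlinked and requeued; non-preemptive tasks are skipped here and
+ * handled later in gsnedf_exit_np().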
+ */ + if (is_realtime(t)) + TRACE_TASK(t, "scheduler tick\n"); + + if (is_realtime(t) && t->time_slice && !--t->time_slice) { + if (!is_np(t)) { /* np tasks will be preempted when they become + preemptable again */ + set_rt_flags(t, RT_F_SLEEP); + want_resched = FORCE_RESCHED; + set_will_schedule(); + sched_trace_job_completion(t); + /* prepare for next period */ + prepare_for_next_period(t); + queue_lock_irqsave(&gsnedf_lock, flags); + /* unlink */ + unlink(t); + /* requeue */ + gsnedf_job_arrival(t); + queue_unlock_irqrestore(&gsnedf_lock, flags); + } else + TRACE("gsnedf_scheduler_tick: " + "%d is non-preemptable, " + "preemption delayed.\n", t->pid); + + } + if (get_rt_mode() == MODE_RT_RUN) { + in_gsnedf_scheduler_tick[smp_processor_id()] = 666; + + queue_lock_irqsave(&gsnedf_lock, flags); + + /* (1) try to release pending jobs */ + gsnedf_release_jobs(); + + /* (2) check if we need to reschedule */ + if (entry->linked != entry->scheduled && + (!entry->scheduled || !is_np(entry->scheduled))) { + want_resched = FORCE_RESCHED; + set_will_schedule(); + } + queue_unlock_irqrestore(&gsnedf_lock, flags); + } + + /* debug */ + in_gsnedf_scheduler_tick[smp_processor_id()] = 0; + + return want_resched; +} + +/* This is main Global EDF schedule function + * + * Assumes the caller holds the lock for rq and that irqs are disabled + */ +static int gsnedf_schedule(struct task_struct * prev, + struct task_struct ** next, + runqueue_t * rq) +{ + cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries); + + in_gsnedf_schedule[smp_processor_id()] = 1; + + /* will be released in finish_switch */ + queue_lock(&gsnedf_lock); + clear_will_schedule(); + + /* (1) check for blocking jobs */ + if (prev == entry->linked && + (get_rt_mode() != MODE_RT_RUN || !is_running(prev))) { + link_task_to_cpu(NULL, entry); + } + + /* (2) if not linked then get rt task */ + if (get_rt_mode() == MODE_RT_RUN && !entry->linked) { + link_task_to_cpu(__take_ready(&gsnedf), entry); + } + + /* (3) if linked different from scheduled + * select linked as next + */ + BUG_ON(entry->scheduled && entry->scheduled != prev); + if (entry->linked != entry->scheduled) { + /* do we need to take care of a previously scheduled + * job? */ + if (entry->scheduled) { + BUG_ON(!is_realtime(prev)); + if (prev->array) + /* take it out of the run queue */ + deactivate_task(prev, rq); + } + /* do we need to schedule a linked job? 
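+ *
+ * entry->linked is where GSN-EDF wants the job to execute, while
+ * entry->scheduled (updated in gsnedf_finish_switch()) is what actually
+ * runs on this CPU; the block below reconciles the two on the Linux
+ * runqueue.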
*/ + if (entry->linked) { + *next = entry->linked; + /* mark the task as executing on this cpu */ + set_task_cpu(*next, smp_processor_id()); + /* stick the task into the runqueue */ + __activate_task(*next, rq); + } + } else + *next = entry->linked; + + /* unlock in case that we don't affect real-time tasks or + * if nothing changed and finish_switch won't be called + */ + if (prev == *next || (!is_realtime(prev) && !*next)) + queue_unlock(&gsnedf_lock); + + in_gsnedf_schedule[smp_processor_id()] = 0; + + return 0; +} + + +/* _finish_switch - we just finished the switch away from prev + */ +static void gsnedf_finish_switch(struct task_struct *prev) +{ + cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries); + + in_gsnedf_finish_switch[smp_processor_id()] = 1; + + if (is_realtime(current)) + entry->scheduled = current; + else + entry->scheduled = NULL; + + prev->rt_param.scheduled_on = NO_CPU; + current->rt_param.scheduled_on = smp_processor_id(); + + /* unlock in case schedule() left it locked */ + if (is_realtime(current) || is_realtime(prev)) + queue_unlock(&gsnedf_lock); + + + in_gsnedf_finish_switch[smp_processor_id()] = 0; +} + + +/* Prepare a task for running in RT mode + * Enqueues the task into master queue data structure + * returns + * -EPERM if task is not TASK_STOPPED + */ +static long gsnedf_prepare_task(struct task_struct * t) +{ + unsigned long flags; + TRACE("gsn edf: prepare task %d\n", t->pid); + + if (t->state == TASK_STOPPED) { + __setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1); + + t->rt_param.scheduled_on = NO_CPU; + t->rt_param.linked_on = NO_CPU; + t->rt_param.is_non_preemptable = 0; + if (get_rt_mode() == MODE_RT_RUN) + /* The action is already on. + * Prepare immediate release + */ + prepare_new_release(t); + /* The task should be running in the queue, otherwise signal + * code will try to wake it up with fatal consequences. + */ + t->state = TASK_RUNNING; + + queue_lock_irqsave(&gsnedf_lock, flags); + requeue(t); + queue_unlock_irqrestore(&gsnedf_lock, flags); + return 0; + } + else + return -EPERM; +} + +static void gsnedf_wake_up_task(struct task_struct *task) +{ + unsigned long flags; + /* We must determine whether task should go into the release + * queue or into the ready queue. It may enter the ready queue + * if it has credit left in its time slice and has not yet reached + * its deadline. If it is now passed its deadline we assume this the + * arrival of a new sporadic job and thus put it in the ready queue + * anyway.If it has zero budget and the next release is in the future + * it has to go to the release queue. + */ + TRACE("gsnedf: %d unsuspends with budget=%d\n", + task->pid, task->time_slice); + task->state = TASK_RUNNING; + + /* We need to take suspensions because of semaphores into + * account! If a job resumes after being suspended due to acquiring + * a semaphore, it should never be treated as a new job release. 
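+ *
+ * Three cases are handled below: a resume after a semaphore-related
+ * suspension (RT_F_EXIT_SEM) simply continues the current job, a tardy
+ * task receives a fresh sporadic release, and a task that returns
+ * before its deadline with remaining budget just keeps running.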
+ */ + if (get_rt_flags(task) == RT_F_EXIT_SEM) { + set_rt_flags(task, RT_F_RUNNING); + } else { + if (is_tardy(task)) { + /* new sporadic release */ + prepare_new_release(task); + sched_trace_job_release(task); + } + else if (task->time_slice) + /* came back in time before deadline + */ + set_rt_flags(task, RT_F_RUNNING); + } + + queue_lock_irqsave(&gsnedf_lock, flags); + gsnedf_job_arrival(task); + queue_unlock_irqrestore(&gsnedf_lock, flags); +} + +static void gsnedf_task_blocks(struct task_struct *t) +{ + unsigned long flags; + + /* unlink if necessary */ + queue_lock_irqsave(&gsnedf_lock, flags); + unlink(t); + queue_unlock_irqrestore(&gsnedf_lock, flags); + + BUG_ON(!is_realtime(t)); + TRACE("task %d suspends with budget=%d\n", t->pid, t->time_slice); + BUG_ON(t->rt_list.next != LIST_POISON1); + BUG_ON(t->rt_list.prev != LIST_POISON2); +} + + +/* When _tear_down is called, the task should not be in any queue any more + * as it must have blocked first. We don't have any internal state for the task, + * it is all in the task_struct. + */ +static long gsnedf_tear_down(struct task_struct * t) +{ + BUG_ON(!is_realtime(t)); + TRACE_TASK(t, "tear down called"); + BUG_ON(t->array); + BUG_ON(t->rt_list.next != LIST_POISON1); + BUG_ON(t->rt_list.prev != LIST_POISON2); + return 0; +} + + +static long gsnedf_enter_np(struct task_struct * t) +{ + unsigned long flags; + + queue_lock_irqsave(&gsnedf_lock, flags); + t->rt_param.is_non_preemptable++; + queue_unlock_irqrestore(&gsnedf_lock, flags); + return 0; +} + +static long gsnedf_exit_np(struct task_struct * t) +{ + unsigned long flags; + int ret = 0; + cpu_entry_t *entry; + + queue_lock_irqsave(&gsnedf_lock, flags); + if (is_np(t)) { + t->rt_param.is_non_preemptable--; + entry = &__get_cpu_var(gsnedf_cpu_entries); + if (!is_np(t) && (!t->time_slice || entry->linked != t)) { + BUG_ON(t != entry->scheduled); + /* t is now preemptable and not linked */ + set_will_schedule(); + if (!t->time_slice) { + set_rt_flags(t, RT_F_SLEEP); + sched_trace_job_completion(t); + /* prepare for next period */ + prepare_for_next_period(t); + } + /* unlink */ + unlink(t); + /* requeue */ + gsnedf_job_arrival(t); + /* reschedule if necessary */ + if (entry->linked != entry->scheduled) { + TRACE("gsnedf_exit_np: delayed " + "preemption of %d\n", + t->pid); + set_tsk_need_resched(current); + } else + TRACE("gsnedf_exit_np: no preemption-necessary, " + " %s/%d got relinked\n", + entry->scheduled->comm, + entry->scheduled->pid); + } + } else + ret = -EPERM; + queue_unlock_irqrestore(&gsnedf_lock, flags); + return ret; +} + +static long gsnedf_pi_block(struct pi_semaphore *sem, + struct task_struct *new_waiter) +{ + /* This callback has to handle the situation where a new waiter is + * added to the wait queue of the semaphore. + * + * We must check if has a higher priority than the currently + * highest-priority task, and then potentially reschedule. 
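+ *
+ * If the new waiter beats the semaphore's current hp.task under EDF, it
+ * becomes the new hp.task and the holder inherits it via
+ * rt_param.inh_task; unlinking and re-adding the holder then makes the
+ * priority change take effect in the cpu queue.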
+ */ + + BUG_ON(!new_waiter); + + if (edf_higher_prio(new_waiter, sem->hp.task)) { + TRACE_TASK(new_waiter, " boosts priority\n"); + /* called with IRQs disabled */ + queue_lock(&gsnedf_lock); + /* store new highest-priority task */ + sem->hp.task = new_waiter; + if (sem->holder) { + /* let holder inherit */ + sem->holder->rt_param.inh_task = new_waiter; + unlink(sem->holder); + gsnedf_job_arrival(sem->holder); + } + queue_unlock(&gsnedf_lock); + } + + return 0; +} + +static long gsnedf_inherit_priority(struct pi_semaphore *sem, + struct task_struct *new_owner) +{ + /* We don't need to acquire the gsnedf_lock since at the time of this + * call new_owner isn't actually scheduled yet (it's still sleeping) + * and since the calling function already holds sem->wait.lock, which + * prevents concurrent sem->hp.task changes. + */ + + if (sem->hp.task && sem->hp.task != new_owner) { + new_owner->rt_param.inh_task = sem->hp.task; + TRACE_TASK(new_owner, "inherited priority from %s/%d\n", + sem->hp.task->comm, sem->hp.task->pid); + } else + TRACE_TASK(new_owner, + "cannot inherit priority, " + "no higher priority job waits.\n"); + return 0; +} + +/* This function is called on a semaphore release, and assumes that + * the current task is also the semaphore holder. + */ +static long gsnedf_return_priority(struct pi_semaphore *sem) +{ + struct task_struct* t = current; + int ret = 0; + + /* Find new highest-priority semaphore task + * if holder task is the current hp.task. + * + * Calling function holds sem->wait.lock. + */ + if (t == sem->hp.task) + edf_set_hp_task(sem); + + TRACE_CUR("gsnedf_return_priority for lock %p\n", sem); + + if (t->rt_param.inh_task) { + /* interrupts already disabled by PI code */ + queue_lock(&gsnedf_lock); + + /* Reset inh_task to NULL. */ + t->rt_param.inh_task = NULL; + + /* Check if rescheduling is necessary */ + unlink(t); + gsnedf_job_arrival(t); + queue_unlock(&gsnedf_lock); + } + + return ret; +} + +/* + * Deactivate current task until the beginning of the next period. + */ +static long gsnedf_sleep_next_period(void) +{ + unsigned long flags; + struct task_struct* t = current; + + queue_lock_irqsave(&gsnedf_lock, flags); + + /* Mark that we do not excute anymore */ + set_rt_flags(t, RT_F_SLEEP); + sched_trace_job_completion(t); + /* prepare for next period */ + prepare_for_next_period(t); + + /* unlink */ + unlink(t); + /* requeue */ + gsnedf_job_arrival(t); + + /* will reschedule on return to user mode */ + set_tsk_need_resched(t); + + queue_unlock_irqrestore(&gsnedf_lock, flags); + + return 0; +} + + +static int gsnedf_mode_change(int new_mode) +{ + unsigned long flags; + int cpu; + cpu_entry_t *entry; + + if (new_mode == MODE_RT_RUN) { + queue_lock_irqsave(&gsnedf_lock, flags); + + __prepare_new_releases(&gsnedf, jiffies + 10); + + /* get old cruft out of the way in case we reenter real-time + * mode for a second time + */ + while (!list_empty(&gsnedf_cpu_queue)) + list_del(gsnedf_cpu_queue.next); + /* reinitialize */ + for_each_online_cpu(cpu) { + entry = &per_cpu(gsnedf_cpu_entries, cpu); + atomic_set(&entry->will_schedule, 0); + entry->linked = NULL; + entry->scheduled = NULL; + list_add(&entry->list, &gsnedf_cpu_queue); + } + + queue_unlock_irqrestore(&gsnedf_lock, flags); + + } + return 0; +} + + +/* Plugin object */ +static sched_plugin_t s_plugin __cacheline_aligned_in_smp = { + .ready_to_use = 0 +}; + + +/* + * Plugin initialization code. 
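+ *
+ * How the plugin is selected at boot is not shown in this patch;
+ * conceptually, activation amounts to something like
+ *
+ *	curr_sched_plugin = init_gsn_edf_plugin();
+ *
+ * (see kernel/sched_plugin.c), after which the callbacks below are used
+ * for all scheduling decisions.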
+ */ +#define INIT_SCHED_PLUGIN (struct sched_plugin){ \ + .plugin_name = "GSN-EDF", \ + .ready_to_use = 1, \ + .algo_scheduler_tick = gsnedf_scheduler_tick, \ + .scheduler_tick = rt_scheduler_tick, \ + .prepare_task = gsnedf_prepare_task, \ + .sleep_next_period = gsnedf_sleep_next_period, \ + .tear_down = gsnedf_tear_down, \ + .schedule = gsnedf_schedule, \ + .finish_switch = gsnedf_finish_switch, \ + .mode_change = gsnedf_mode_change, \ + .wake_up_task = gsnedf_wake_up_task, \ + .task_blocks = gsnedf_task_blocks, \ + .enter_np = gsnedf_enter_np, \ + .exit_np = gsnedf_exit_np, \ + .inherit_priority = gsnedf_inherit_priority, \ + .return_priority = gsnedf_return_priority, \ + .pi_block = gsnedf_pi_block \ +} + + +sched_plugin_t *__init init_gsn_edf_plugin(void) +{ + int cpu; + cpu_entry_t *entry; + + if (!s_plugin.ready_to_use) + { + /* initialize CPU state */ + for (cpu = 0; cpu < NR_CPUS; cpu++) { + entry = &per_cpu(gsnedf_cpu_entries, cpu); + atomic_set(&entry->will_schedule, 0); + entry->linked = NULL; + entry->scheduled = NULL; + entry->cpu = cpu; + } + + queue_lock_init(&gsnedf_lock); + set_sched_options(SCHED_NONE); + edf_domain_init(&gsnedf, NULL); + s_plugin = INIT_SCHED_PLUGIN; + } + return &s_plugin; +} + + diff --git a/kernel/sched_part_edf.c b/kernel/sched_part_edf.c new file mode 100644 index 0000000..c382722 --- /dev/null +++ b/kernel/sched_part_edf.c @@ -0,0 +1,345 @@ +/* + * kernel/sched_part_edf.c + * + * Implementation of the partitioned EDF scheduler plugin. + */ + +#include +#include +#include +#include + +#include +#include +#include + + +typedef struct { + edf_domain_t domain; + int cpu; + struct task_struct* scheduled; /* only RT tasks */ + spinlock_t lock; +} part_edf_domain_t; + + +#define local_edf (&__get_cpu_var(part_edf_domains).domain) +#define local_pedf (&__get_cpu_var(part_edf_domains)) +#define remote_edf(cpu) (&per_cpu(part_edf_domains, cpu).domain) +#define remote_pedf(cpu) (&per_cpu(part_edf_domains, cpu)) +#define task_edf(task) remote_edf(get_partition(task)) + +static void part_edf_domain_init(part_edf_domain_t* pedf, + edf_check_resched_needed_t check, + int cpu) +{ + edf_domain_init(&pedf->domain, check); + pedf->cpu = cpu; + pedf->lock = SPIN_LOCK_UNLOCKED; + pedf->scheduled = NULL; +} + +DEFINE_PER_CPU(part_edf_domain_t, part_edf_domains); + +/* This check is trivial in partioned systems as we only have to consider + * the CPU of the partition. + * + */ +static int part_edf_check_resched(edf_domain_t *edf) +{ + part_edf_domain_t *pedf = container_of(edf, part_edf_domain_t, domain); + int ret = 0; + + spin_lock(&pedf->lock); + + /* because this is a callback from edf_domain_t we already hold + * the necessary lock for the ready queue + */ + if (preemption_needed(edf, pedf->scheduled)) { + if (pedf->cpu == smp_processor_id()) + set_tsk_need_resched(current); + else + smp_send_reschedule(pedf->cpu); + ret = 1; + } + spin_unlock(&pedf->lock); + return ret; +} + + +static reschedule_check_t part_edf_scheduler_tick(void) +{ + unsigned long flags; + struct task_struct *t = current; + reschedule_check_t want_resched = NO_RESCHED; + edf_domain_t *edf = local_edf; + part_edf_domain_t *pedf = local_pedf; + + /* Check for inconsistency. 
We don't need the lock for this since + * ->scheduled is only changed in schedule, which obviously is not + * executing in parallel on this CPU + */ + BUG_ON(is_realtime(t) && t != pedf->scheduled); + + /* expire tasks even if not in real-time mode + * this makes sure that at the end of real-time mode + * no tasks "run away forever". + */ + if (is_realtime(t) && (!--t->time_slice)) { + /* this task has exhausted its budget in this period */ + set_rt_flags(t, RT_F_SLEEP); + want_resched = FORCE_RESCHED; + } + if (get_rt_mode() == MODE_RT_RUN) + { + /* check whether anything is waiting to be released + * this could probably be moved to the global timer + * interrupt handler since the state will only change + * once per jiffie + */ + try_release_pending(edf); + if (want_resched != FORCE_RESCHED) + { + read_lock_irqsave(&edf->ready_lock, flags); + if (preemption_needed(edf, t)) + want_resched = FORCE_RESCHED; + read_unlock_irqrestore(&edf->ready_lock, flags); + } + } + return want_resched; +} + +static int part_edf_schedule(struct task_struct * prev, + struct task_struct ** next, + runqueue_t * rq) +{ + int need_deactivate = 1; + part_edf_domain_t* pedf = local_pedf; + edf_domain_t* edf = &pedf->domain; + + + if (is_realtime(prev) && get_rt_flags(prev) == RT_F_SLEEP) + prepare_for_next_period(prev); + + if (get_rt_mode() == MODE_RT_RUN) { + write_lock(&edf->ready_lock); + if (is_realtime(prev) && is_released(prev) && is_running(prev) + && !preemption_needed(edf, prev)) { + /* this really should only happen if the task has + * 100% utilization... + */ + TRACE("prev will be next, already released\n"); + *next = prev; + need_deactivate = 0; + } else { + /* either not yet released, preempted, or non-rt */ + *next = __take_ready(edf); + if (*next) { + /* stick the task into the runqueue */ + __activate_task(*next, rq); + set_task_cpu(*next, smp_processor_id()); + } + } + spin_lock(&pedf->lock); + pedf->scheduled = *next; + spin_unlock(&pedf->lock); + if (*next) + set_rt_flags(*next, RT_F_RUNNING); + + write_unlock(&edf->ready_lock); + } + + if (is_realtime(prev) && need_deactivate && prev->array) { + /* take it out of the run queue */ + deactivate_task(prev, rq); + } + + return 0; +} + + +static void part_edf_finish_switch(struct task_struct *prev) +{ + edf_domain_t* edf = local_edf; + + if (!is_realtime(prev) || !is_running(prev)) + return; + + if (get_rt_flags(prev) == RT_F_SLEEP || + get_rt_mode() != MODE_RT_RUN) { + /* this task has expired + * _schedule has already taken care of updating + * the release and + * deadline. We just must check if has been released. + */ + if (is_released(prev) && get_rt_mode() == MODE_RT_RUN) { + /* already released */ + add_ready(edf, prev); + TRACE("%d goes straight to ready queue\n", prev->pid); + } else + /* it has got to wait */ + add_release(edf, prev); + } else { + /* this is a forced preemption + * thus the task stays in the ready_queue + * we only must make it available to others + */ + add_ready(edf, prev); + } +} + + +/* Prepare a task for running in RT mode + * Enqueues the task into master queue data structure + * returns + * -EPERM if task is not TASK_STOPPED + */ +static long part_edf_prepare_task(struct task_struct * t) +{ + edf_domain_t* edf = task_edf(t); + + + TRACE("[%d] part edf: prepare task %d on CPU %d\n", + smp_processor_id(), t->pid, get_partition(t)); + if (t->state == TASK_STOPPED) { + __setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1); + + if (get_rt_mode() == MODE_RT_RUN) + /* The action is already on. + * Prepare immediate release. 
+ */ + prepare_new_release(t); + /* The task should be running in the queue, otherwise signal + * code will try to wake it up with fatal consequences. + */ + t->state = TASK_RUNNING; + add_release(edf, t); + return 0; + } else + return -EPERM; +} + +static void part_edf_wake_up_task(struct task_struct *task) +{ + edf_domain_t* edf; + + edf = task_edf(task); + + /* We must determine whether task should go into the release + * queue or into the ready queue. It may enter the ready queue + * if it has credit left in its time slice and has not yet reached + * its deadline. If it is now passed its deadline we assume this the + * arrival of a new sporadic job and thus put it in the ready queue + * anyway.If it has zero budget and the next release is in the future + * it has to go to the release queue. + */ + TRACE("part edf: wake up %d with budget=%d for cpu %d\n", + task->pid, task->time_slice, get_partition(task)); + task->state = TASK_RUNNING; + if (is_tardy(task)) { + /* new sporadic release */ + prepare_new_release(task); + add_ready(edf, task); + + } else if (task->time_slice) { + /* came back in time before deadline + * TODO: clip budget to fit into period, otherwise it could + * cause a deadline overrun in the next period, i.e. + * over allocation in the next period. + */ + set_rt_flags(task, RT_F_RUNNING); + add_ready(edf, task); + + } else { + add_release(edf, task); + } + +} + +static void part_edf_task_blocks(struct task_struct *t) +{ + BUG_ON(!is_realtime(t)); + /* not really anything to do since it can only block if + * it is running, and when it is not running it is not in any + * queue anyway. + * + * TODO: Check whether the assumption is correct for SIGKILL and + * SIGSTOP. + */ + TRACE("task %d blocks with budget=%d\n", t->pid, t->time_slice); + BUG_ON(in_list(&t->rt_list)); +} + + +/* When _tear_down is called, the task should not be in any queue any more + * as it must have blocked first. We don't have any internal state for the task, + * it is all in the task_struct. + */ +static long part_edf_tear_down(struct task_struct * t) +{ + BUG_ON(!is_realtime(t)); + TRACE("part edf: tear down called for %d \n", t->pid); + BUG_ON(t->array); + BUG_ON(in_list(&t->rt_list)); + return 0; +} + + +static int part_edf_mode_change(int new_mode) +{ + int cpu; + + if (new_mode == MODE_RT_RUN) + for_each_online_cpu(cpu) + prepare_new_releases(remote_edf(cpu), jiffies); + TRACE("[%d] part edf: mode changed to %d\n", + smp_processor_id(), new_mode); + return 0; +} + + +/* Plugin object */ +static sched_plugin_t s_plugin __cacheline_aligned_in_smp = { + .ready_to_use = 0 +}; + + +/* + * Plugin initialization code. 
+ */ +#define INIT_SCHED_PLUGIN (struct sched_plugin) {\ + .plugin_name = "Partitioned EDF",\ + .ready_to_use = 1,\ + .algo_scheduler_tick = part_edf_scheduler_tick,\ + .scheduler_tick = rt_scheduler_tick,\ + .prepare_task = part_edf_prepare_task,\ + .sleep_next_period = edf_sleep_next_period,\ + .tear_down = part_edf_tear_down,\ + .shutdown_hook = NULL,\ + .schedule = part_edf_schedule,\ + .finish_switch = part_edf_finish_switch,\ + .mode_change = part_edf_mode_change,\ + .wake_up_task = part_edf_wake_up_task,\ + .task_blocks = part_edf_task_blocks \ +} + + +sched_plugin_t *__init init_part_edf_plugin(void) +{ + int i; + + if (!s_plugin.ready_to_use) + { + set_sched_options(SCHED_NONE); + for (i = 0; i < NR_CPUS; i++) + { + part_edf_domain_init(remote_pedf(i), + part_edf_check_resched, i); + printk("CPU partition %d initialized.", i); + } + s_plugin = INIT_SCHED_PLUGIN; + } + return &s_plugin; +} + + + diff --git a/kernel/sched_pfair.c b/kernel/sched_pfair.c new file mode 100644 index 0000000..4fa6ba2 --- /dev/null +++ b/kernel/sched_pfair.c @@ -0,0 +1,507 @@ +/* + * + * Implementation of synchronized PFAIR PD2 scheduler + * + */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +struct cpu_state { + struct task_struct * t; + volatile jiffie_t jiffie_marker; +}; +/* PFAIR scheduling domain, release and ready queues */ +static pfair_domain_t pfair __cacheline_aligned_in_smp; + +/* An indicator that quantum boundary was crossed + * and a decision has to be made + */ +static int sync_go[NR_CPUS]; + + +/* A collection of CPU states protected by pfair lock */ +DEFINE_PER_CPU(struct cpu_state, states); + +/* + * This function gets called by the timer code, with HZ frequency + * with interrupts disabled. + * + * The function merges the release queue with the ready queue + * and indicates that quantum boundary was crossed. + * + * It also suggests to schedule off currently running + * real-time task if the mode is non-real-time. + */ +static reschedule_check_t pfair_scheduler_tick(void) +{ + int want_resched = NO_RESCHED; + sync_go[smp_processor_id()] = 0; + if (!cpu_isset(smp_processor_id(), pfair.domain_cpus)) + goto out; + /* Now determine if we want current task to be preempted */ + if (get_rt_mode() == MODE_RT_RUN) { + pfair_try_release_pending(&pfair); + want_resched = FORCE_RESCHED; + /* indicate that the interrupt fired */ + sync_go[smp_processor_id()] = 1; + barrier(); + } else if (is_realtime(current) && is_running(current)) { + /* In non real-time mode we want to + * schedule off real-time tasks */ + want_resched = FORCE_RESCHED; + } else if (is_realtime(current) && !is_running(current)) { + TRACE("[%d] %d Timer interrupt on not runninng %d\n", + smp_processor_id(), + jiffies-rt_start_time, current->pid); + } +out: + return want_resched; +} + +/** + * This function is called by the processor + * that performs rescheduling. It saves the timing + * parameters of currently running jobs that were not rescheduled yet + * and releases next subtask for these jobs placing them into + * release and ready queues. 
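+ *
+ * This "pretend" release gives the local CPU a globally consistent view
+ * of the next quantum: tasks still running on CPUs that have not passed
+ * through schedule() yet (see find_participants()) are provisionally
+ * treated as if their current subtask had completed. rollback_release()
+ * undoes these changes once the selection has been made.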
+ */ +static void pretend_release(cpumask_t p) +{ + int i = 0; + struct task_struct * t = NULL; + /* for all the tasks increment the number of used quanta + * and release next subtask or job depending on the number + * of used quanta + */ + for_each_cpu_mask(i, p) { + t = per_cpu(states, i).t; + if (t != NULL) { + backup_times(t); + inc_passed_quanta(t); + if ( get_passed_quanta(t) == get_exec_cost(t)) { + pfair_prepare_next_job(t); + } else { + pfair_prepare_next_subtask(t); + } + /* + TRACE("[%d] %d pretending release %d with (%d, %d)\n", + smp_processor_id(), + jiffies-rt_start_time,t->pid, + get_release(t)-rt_start_time, + get_deadline(t)-rt_start_time);*/ + /* detect if the job or subtask has to be released now*/ + if (time_before_eq(get_release(t), jiffies)) + pfair_add_ready(&pfair, t); + else + pfair_add_release(&pfair, t); + } + } +} +/* + * Rollback the the pretended release of tasks. + * Timing parameters are restored and tasks are removed + * from the queues as it was before calling the schedule() function. + * + */ +static void rollback_release(cpumask_t p) +{ + int i = -1; + struct task_struct * t = NULL; + /* + * Rollback the pretended changes + */ + for_each_cpu_mask(i, p) { + t = per_cpu(states, i).t; + if (t != NULL) { + restore_times(t); + if(t->rt_list.prev != LIST_POISON1 || + t->rt_list.next != LIST_POISON2) { + /* Delete the task from a queue */ + list_del(&t->rt_list); + } + } + } +} + +/* + * The procedure creates a list of cpu's whose tasks have not been + * rescheduled yet. These are CPU's with jiffie marker different from + * the value of jiffies. + */ +static void find_participants(cpumask_t * target) +{ + cpumask_t res;int i; + cpus_clear(res); + for_each_online_cpu(i) { + if(per_cpu(states, i).jiffie_marker != jiffies) + cpu_set(i, res); + } + /* Examine only cpus in the domain */ + cpus_and(res, pfair.domain_cpus, res); + (*target) = res; +} + +/* + * This is main PFAIR schedule function, + * each processor pretends that some currently running tasks are + * released in the next quantum and determines whether it should + * keep the task that is currently running (this is usually the case + * for heavy tasks). +*/ +static int pfair_schedule(struct task_struct *prev, + struct task_struct **next, + runqueue_t * rq) +{ + int cpu =-1; + int k =-1; + int need_deactivate = 1; + int keep =0; + unsigned long flags; + cpumask_t participants; + /* A temporary array */ + struct task_struct * rs_old_ptr[NR_CPUS]; + + *next = NULL; + cpu = smp_processor_id(); + /* CPU's not in the domain just bypass */ + if (!cpu_isset(cpu, pfair.domain_cpus)) { + goto out; + } + queue_lock_irqsave(&pfair.pfair_lock, flags); + + /* If we happen to run in non-realtime mode + * then we have to schedule off currently running tasks + * */ + if (get_rt_mode() != MODE_RT_RUN) { + if (is_realtime(prev)) { + per_cpu(states, cpu).t = NULL; + TRACE("[%d] %d Suspending %d\n", + cpu, jiffies - rt_start_time, + prev->pid); + /* Move the task to the + * release queue for future runs + * FIXME: Do something smarter. + * For example create a set where + * prepared or inactive tasks are placed + * and then released. 
+ * */ + set_release(prev, get_release(prev) + 1000); + pfair_add_release(&pfair, prev); + } + goto out_deactivate; + } + /* If the current task stops or dies */ + if (is_realtime(prev) && !is_running(prev)) { + /* remove it from the running set */ + per_cpu(states, cpu).t = NULL; + } + /* Make pfair decisions at quantum boundaries only, + * but schedule off stopped or dead tasks */ + + if ((sync_go[cpu]--) != 1) + goto out_deactivate; + + /*TRACE("[%d] %d Scheduler activation", cpu, jiffies-rt_start_time); + cpus_and(res, pfair.domain_cpus, cpu_online_map); + for_each_cpu_mask(k, res) { + TRACE("%d" ,(per_cpu(states, k).jiffie_marker!=jiffies)); + } + TRACE("\n");*/ + + /* Find processors that have not rescheduled yet */ + find_participants(&participants); + /* For each task on remote cpu's pretend release */ + pretend_release(participants); + /* Clear temporary array */ + for_each_possible_cpu(k) { rs_old_ptr[k] = NULL; } + /* Select a new subset of eligible tasks */ + for_each_cpu_mask(k, participants) { + rs_old_ptr[k] = __pfair_take_ready (&pfair); + /* Check if our current task must be scheduled in the next quantum */ + if (rs_old_ptr[k] == per_cpu(states, cpu).t) { + /* this is our current task, keep it */ + *next = per_cpu(states, cpu).t; + need_deactivate = 0; + keep = 1; + break; + } + } + /* Put all the extracted tasks back into the ready queue */ + for_each_cpu_mask(k, participants) { + if (rs_old_ptr[k] != NULL){ + pfair_add_ready(&pfair, rs_old_ptr[k]); + rs_old_ptr[k] = NULL; + } + } + /* Rollback the pretended release, + * task parameters are restored and running tasks are removed + * from queues */ + rollback_release(participants); + /* + * If the current task is not scheduled in the next quantum + * then select a new pfair task + */ + if(!keep) { + *next = per_cpu(states, cpu).t = __pfair_take_ready(&pfair); + if (*next != NULL) { + /*TRACE("[%d] %d Scheduling %d with (%d, %d)\n", + cpu, jiffies-rt_start_time, + get_release(*next), + get_deadline(*next)); + */ + set_task_cpu(*next, cpu); + __activate_task(*next, rq); + } + } else { + if (is_realtime(prev)) { + /*TRACE("[%d] %d prev==next %d\n", + cpu,jiffies-rt_start_time, + (prev)->pid);*/ + + /* The task will not be switched off but we + * need to track the execution time + */ + inc_passed_quanta(prev); + } + } + + /*Show that our task does not participate in subsequent selections*/ + __get_cpu_var(states).jiffie_marker = jiffies; + +out_deactivate: + if ( is_realtime(prev) && need_deactivate && prev->array) { + /* take prev out of the linux run queue */ + deactivate_task(prev, rq); + } + queue_unlock_irqrestore(&pfair.pfair_lock, flags); +out: + return 0; +} + +static void pfair_finish_task_switch(struct task_struct *t) +{ + if (!is_realtime(t) || !is_running(t)) + return; + + queue_lock(&pfair.pfair_lock); + /* Release in real-time mode only, + * if the mode is non real-time, then + * the task is already in the release queue + * with the time far in the future + */ + if (get_rt_mode() == MODE_RT_RUN) { + inc_passed_quanta(t); + if ( get_passed_quanta(t) == get_exec_cost(t)) { + sched_trace_job_completion(t); + pfair_prepare_next_job(t); + } else { + pfair_prepare_next_subtask(t); + } + /*TRACE("[%d] %d releasing %d with (%d, %d)\n", + smp_processor_id(), + jiffies-rt_start_time, + t->pid, + get_release(t)-rt_start_time, + get_deadline(t)-rt_start_time);*/ + if (time_before_eq(get_release(t), jiffies)) + pfair_add_ready(&pfair, t); + else + pfair_add_release(&pfair, t); + } + queue_unlock(&pfair.pfair_lock); +} + +/* 
Prepare a task for running in RT mode + * Enqueues the task into master queue data structure + * returns + * -EPERM if task is not TASK_STOPPED + */ +static long pfair_prepare_task(struct task_struct * t) +{ + unsigned long flags; + TRACE("pfair: prepare task %d\n", t->pid); + if (t->state == TASK_STOPPED) { + __setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1); + + if (get_rt_mode() == MODE_RT_RUN) + /* The action is already on. + * Prepare immediate release + */ + __pfair_prepare_new_release(t, jiffies); + /* The task should be running in the queue, otherwise signal + * code will try to wake it up with fatal consequences. + */ + t->state = TASK_RUNNING; + queue_lock_irqsave(&pfair.pfair_lock, flags); + pfair_add_release(&pfair, t); + queue_unlock_irqrestore(&pfair.pfair_lock, flags); + return 0; + } else + return -EPERM; +} + + + +static void pfair_wake_up_task(struct task_struct *task) +{ + + unsigned long flags; + + /* We must determine whether task should go into the release + * queue or into the ready queue. + * The task enters the ready queue if the previous deadline was missed, + * so we treat the invoked job as a new sporadic release. + * + * The job can also enter the ready queue if it was invoked before its + * global deadline, but its budjet must be clipped down to one quantum + */ + task->state = TASK_RUNNING; + if (time_after_eq(jiffies, task->rt_param.times.last_release + + get_rt_period(task))) { + /* new sporadic release */ + TRACE("[%d] Sporadic release of %d at %d\n", + smp_processor_id(), + jiffies-rt_start_time, + task->pid); + __pfair_prepare_new_release(task, jiffies); + queue_lock_irqsave(&pfair.pfair_lock, flags); + sched_trace_job_release(task); + pfair_add_ready(&pfair, task); + queue_unlock_irqrestore(&pfair.pfair_lock, flags); + } else if (task->time_slice) { + /* came back in time before deadline + * clip the budget to be the last subtask of a job or + * the new job. 
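+ *
+ * Setting exec_time to cost - 1 treats the job as if only its last
+ * subtask were left (one quantum of budget); if the cost is a single
+ * quantum, the next job is prepared instead.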
+ */ + task->rt_param.times.exec_time = get_exec_cost(task) - 1; + if (task->rt_param.times.exec_time == 0) { + pfair_prepare_next_job(task); + } else { + pfair_prepare_next_subtask(task); + } + TRACE("[%d] %d Resume of %d with %d, %d, %d\n", + smp_processor_id(), jiffies-rt_start_time, + task->pid, get_release(task)-rt_start_time, + get_deadline(task)-rt_start_time, + get_passed_quanta(task)); + + set_rt_flags(task, RT_F_RUNNING); + queue_lock_irqsave(&pfair.pfair_lock, flags); + sched_trace_job_release(task); + if (time_after_eq(jiffies, get_release(task))) { + pfair_add_ready(&pfair, task); + } else { + pfair_add_release(&pfair, task); + } + queue_unlock_irqrestore(&pfair.pfair_lock, flags); + + } else { + TRACE("[%d] %d Strange release of %d with %d, %d, %d\n", + smp_processor_id(), jiffies-rt_start_time, + task->pid, + get_release(task), get_deadline(task), + get_passed_quanta(task)); + + queue_lock_irqsave(&pfair.pfair_lock, flags); + pfair_add_release(&pfair, task); + queue_unlock_irqrestore(&pfair.pfair_lock, flags); + } +} + + +static void pfair_task_blocks(struct task_struct *t) +{ + unsigned long flags; + int i; + cpumask_t res; + BUG_ON(!is_realtime(t)); + /* If the task blocks, then it must be removed from the running set */ + queue_lock_irqsave(&pfair.pfair_lock, flags); + cpus_and(res,pfair.domain_cpus, cpu_online_map); + for_each_cpu_mask(i, res) { + if (per_cpu(states, i).t == t) + per_cpu(states, i).t = NULL; + } + /* If the task is running and in some + * list it might have been released by another + * processor + */ + if((t->rt_list.next != LIST_POISON1 || + t->rt_list.prev != LIST_POISON2)) { + TRACE("[%d] %d task %d is deleted from the list\n", + smp_processor_id(), + jiffies-rt_start_time, t->pid); + list_del(&t->rt_list); + } + queue_unlock_irqrestore(&pfair.pfair_lock, flags); + TRACE("[%d] %d task %d blocks with budget=%d state=%d\n", + smp_processor_id(), jiffies-rt_start_time, + t->pid, t->time_slice, t->state); +} + +static long pfair_tear_down(struct task_struct * t) +{ + BUG_ON(!is_realtime(t)); + TRACE("pfair: tear down called for %d \n", t->pid); + BUG_ON(t->array); + BUG_ON(t->rt_list.next != LIST_POISON1); + BUG_ON(t->rt_list.prev != LIST_POISON2); + return 0; +} + +static int pfair_mode_change(int new_mode) +{ + printk(KERN_INFO "[%d] pfair mode change %d\n", + smp_processor_id(), new_mode); + if (new_mode == MODE_RT_RUN) { + pfair_prepare_new_releases(&pfair, jiffies + 10); + } + printk(KERN_INFO "[%d] pfair: mode change done\n", smp_processor_id()); + return 0; +} + +/* Plugin object */ +static sched_plugin_t s_plugin __cacheline_aligned_in_smp = { + .ready_to_use = 0 +}; +/* +* PFAIR plugin initialization macro. +*/ +#define INIT_PFAIR_PLUGIN (struct sched_plugin){\ + .plugin_name = "PFAIR",\ + .ready_to_use = 1,\ + .algo_scheduler_tick = pfair_scheduler_tick,\ + .scheduler_tick = rt_scheduler_tick,\ + .prepare_task = pfair_prepare_task,\ + .sleep_next_period = 0,\ + .tear_down = pfair_tear_down,\ + .shutdown_hook = 0,\ + .schedule = pfair_schedule,\ + .finish_switch = pfair_finish_task_switch,\ + .mode_change = pfair_mode_change,\ + .wake_up_task = pfair_wake_up_task,\ + .task_blocks = pfair_task_blocks \ + } + +sched_plugin_t* __init init_pfair_plugin(void) +{ + int i=0; + if (!s_plugin.ready_to_use) { + set_sched_options(SCHED_NONE); + pfair_domain_init(&pfair); + for (i=0; i +#include + + +/* These are the original Linux initialization functions. + * We replace them here with our initialization code and call them + * after setting up LITMUS. 
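+ *
+ * sched_init(), sched_init_smp() and migration_init() below are the
+ * entry points the rest of the kernel still calls; each does any
+ * LITMUS-specific setup and then forwards to the corresponding
+ * linux_*() function declared here.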
+ */ +void linux_sched_init(void); +void linux_sched_init_smp(void); +int linux_migration_init(void); + +/************************************************************* + * Dummy plugin functions * + *************************************************************/ + +void litmus_dummy_finish_switch(struct task_struct * prev) +{ +} + +int litmus_dummy_schedule(struct task_struct * prev, + struct task_struct** next, + runqueue_t* q) +{ + return 0; +} + +reschedule_check_t litmus_dummy_scheduler_tick(void) +{ + return NO_RESCHED; +} + + +long litmus_dummy_prepare_task(struct task_struct *t) +{ + return 0; +} + +void litmus_dummy_wake_up_task(struct task_struct *task) +{ + printk(KERN_WARNING "task %d: unhandled real-time wake up!\n", + task->pid); +} + +void litmus_dummy_task_blocks(struct task_struct *task) +{ +} + +long litmus_dummy_tear_down(struct task_struct *task) +{ + return 0; +} + +long litmus_dummy_enter_np(struct task_struct *task) +{ + return -EPERM; +} + +long litmus_dummy_exit_np(struct task_struct *task) +{ + return -EPERM; +} + +int litmus_dummy_scheduler_setup(int cmd, void __user *parameter) +{ + return -EPERM; +} + +long litmus_dummy_sleep_next_period(void) +{ + return -EPERM; +} + +long litmus_dummy_inherit_priority(struct pi_semaphore *sem, + struct task_struct *new_owner) +{ + return -EPERM; +} + +long litmus_dummy_return_priority(struct pi_semaphore *sem) +{ + return -EPERM; +} + +long litmus_dummy_pi_block(struct pi_semaphore *sem, + struct task_struct *new_waiter) +{ + return -EPERM; +} + + +/* The default scheduler plugin. It doesn't do anything and lets Linux do its + * job. + */ + +sched_plugin_t linux_sched_plugin = { + .plugin_name = "Linux", + .ready_to_use = 1, + .algo_scheduler_tick = 0, + .scheduler_tick = litmus_dummy_scheduler_tick, + .prepare_task = litmus_dummy_prepare_task, + .tear_down = litmus_dummy_tear_down, + .wake_up_task = litmus_dummy_wake_up_task, + .task_blocks = litmus_dummy_task_blocks, + .sleep_next_period = litmus_dummy_sleep_next_period, + .shutdown_hook = 0, + .schedule = litmus_dummy_schedule, + .finish_switch = litmus_dummy_finish_switch, + .scheduler_setup = litmus_dummy_scheduler_setup, + .inherit_priority = litmus_dummy_inherit_priority, + .return_priority = litmus_dummy_return_priority, + .pi_block = litmus_dummy_pi_block +}; + +/* + * The reference to current plugin that is used to schedule tasks within + * the system. It stores references to actual function implementations + * Should be initialized by calling "init_***_plugin()" + */ +sched_plugin_t *curr_sched_plugin = &linux_sched_plugin; + + +/* At sched-init */ +void __init sched_init(void) +{ + printk("Entering custom sched init, plugin %s\n", + curr_sched_plugin->plugin_name); + /* Init tracing facility before plugin functions are called */ + + /* CLEANUP: reenable this if needed + pstats = INIT_PSTATS; + + */ + + /* Call linux sched init tasks */ + linux_sched_init(); + printk("Sched init complete\n"); +} + +void __init sched_init_smp(void) +{ + printk("Entering custom SMP init, plugin %s\n", + curr_sched_plugin->plugin_name); + /* Call linux smp initializer */ + linux_sched_init_smp(); + /* Enable tracing facilities here */ + /* + CLEANUP: Reenable if needed. 
+ if (smp_processor_id() == 0) { + if (init_trace()) { + printk("Tracing disabled\n"); + } else { + printk("Default tracing enabled\n"); + } + } */ + printk("Sched init SMP complete\n"); +} + +int __init migration_init(void) +{ + printk("Entering migration init\n"); + + /* Call linux migration init as it was before */ + return linux_migration_init(); +} diff --git a/kernel/sched_psn_edf.c b/kernel/sched_psn_edf.c new file mode 100644 index 0000000..8c3c2d8 --- /dev/null +++ b/kernel/sched_psn_edf.c @@ -0,0 +1,531 @@ + +/* + * kernel/sched_psn_edf.c + * + * Implementation of the PSN-EDF scheduler plugin. + * Based on kern/sched_part_edf.c and kern/sched_gsn_edf.c. + * + * Suspensions and non-preemptable sections are supported. + * Priority inheritance is not supported. + */ + +#include +#include +#include +#include + +#include +#include +#include + + +typedef struct { + edf_domain_t domain; + int cpu; + struct task_struct* scheduled; /* only RT tasks */ + spinlock_t lock; /* protects the domain and + * serializes scheduling decisions + */ +} psnedf_domain_t; + +DEFINE_PER_CPU(psnedf_domain_t, psnedf_domains); + +#define local_edf (&__get_cpu_var(psnedf_domains).domain) +#define local_pedf (&__get_cpu_var(psnedf_domains)) +#define remote_edf(cpu) (&per_cpu(psnedf_domains, cpu).domain) +#define remote_pedf(cpu) (&per_cpu(psnedf_domains, cpu)) +#define task_edf(task) remote_edf(get_partition(task)) +#define task_pedf(task) remote_pedf(get_partition(task)) + + +static void psnedf_domain_init(psnedf_domain_t* pedf, + edf_check_resched_needed_t check, + int cpu) +{ + edf_domain_init(&pedf->domain, check); + pedf->cpu = cpu; + pedf->lock = SPIN_LOCK_UNLOCKED; + pedf->scheduled = NULL; +} + +/* we assume the lock is being held */ +static void preempt(psnedf_domain_t *pedf) +{ + /* don't interrupt non-preemptable tasks */ + if (pedf->scheduled && is_np(pedf->scheduled)) + return; + + if (pedf->cpu == smp_processor_id()) + set_tsk_need_resched(current); + else + smp_send_reschedule(pedf->cpu); +} + +/* This check is trivial in partioned systems as we only have to consider + * the CPU of the partition. + */ +static int psnedf_check_resched(edf_domain_t *edf) +{ + psnedf_domain_t *pedf = container_of(edf, psnedf_domain_t, domain); + int ret = 0; + + /* because this is a callback from edf_domain_t we already hold + * the necessary lock for the ready queue + */ + if (preemption_needed(edf, pedf->scheduled)) { + preempt(pedf); + ret = 1; + } + return ret; +} + + +static reschedule_check_t psnedf_scheduler_tick(void) +{ + unsigned long flags; + struct task_struct *t = current; + reschedule_check_t want_resched = NO_RESCHED; + edf_domain_t *edf = local_edf; + psnedf_domain_t *pedf = local_pedf; + + /* Check for inconsistency. We don't need the lock for this since + * ->scheduled is only changed in schedule, which obviously is not + * executing in parallel on this CPU + */ + BUG_ON(is_realtime(t) && t != pedf->scheduled); + + if (is_realtime(t)) + TRACE("%s/%d was hit by scheduler tick\n", t->comm, t->pid); + + /* expire tasks even if not in real-time mode + * this makes sure that at the end of real-time mode + * no tasks "run away forever". 
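+ *
+ * Unlike GSN-EDF, budget exhaustion here only marks the job RT_F_SLEEP
+ * and forces a reschedule; prepare_for_next_period() and the requeue
+ * happen later in psnedf_schedule() and psnedf_finish_switch().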
+ */ + if (is_realtime(t) && t->time_slice && !--t->time_slice) { + if (!is_np(t)) { + TRACE("%s/%d was marked as RT_F_SLEEP, " + "state=%d\n", + t->comm, t->pid, t->state); + set_rt_flags(t, RT_F_SLEEP); + want_resched = FORCE_RESCHED; + } else + TRACE("psnedf_scheduler_tick: " + "%d is non-preemptable, " + "preemption delayed.\n", t->pid); + } + + if (get_rt_mode() == MODE_RT_RUN) + { + /* check whether anything is waiting to be released + * this could probably be moved to the global timer + * interrupt handler since the state will only change + * once per jiffie + */ + spin_lock_irqsave(&pedf->lock, flags); + __release_pending(edf); + if (want_resched != FORCE_RESCHED && + preemption_needed(edf, t)) + want_resched = FORCE_RESCHED; + + spin_unlock_irqrestore(&pedf->lock, flags); + + } + return want_resched; +} + +static int psnedf_schedule(struct task_struct * prev, + struct task_struct ** next, + runqueue_t * rq) +{ + int need_deactivate = 1; + psnedf_domain_t* pedf = local_pedf; + edf_domain_t* edf = &pedf->domain; + + /* if a real-time task is non-preemptable, then schedule it again. + */ + if (is_realtime(prev) && + is_running(prev) && + is_np(prev)) { + TRACE("psnedf_schedule: is_np(%d) = %d => reschedule prev\n", + prev->pid, is_np(prev)); + *next = prev; + return 0; + } + + if (is_realtime(prev) && get_rt_flags(prev) == RT_F_SLEEP) + prepare_for_next_period(prev); + + if (get_rt_mode() == MODE_RT_RUN) { + spin_lock(&pedf->lock); + if (is_realtime(prev) && is_released(prev) && is_running(prev) + && !preemption_needed(edf, prev)) { + /* this really should only happen if the task has + * 100% utilization... + */ + TRACE("prev will be next, already released\n"); + *next = prev; + need_deactivate = 0; + } else { + /* either not yet released, preempted, or non-rt */ + *next = __take_ready(edf); + if (*next) { + /* stick the task into the runqueue */ + __activate_task(*next, rq); + set_task_cpu(*next, smp_processor_id()); + } + } + pedf->scheduled = *next; + if (*next) + set_rt_flags(*next, RT_F_RUNNING); + + spin_unlock(&pedf->lock); + } + + if (is_realtime(prev) && need_deactivate && prev->array) { + /* take it out of the run queue */ + deactivate_task(prev, rq); + } + + return 0; +} + + +static void psnedf_finish_switch(struct task_struct *prev) +{ + edf_domain_t* edf = local_edf; + psnedf_domain_t* pedf = local_pedf; + + if (!is_realtime(prev)) + return; + + if (is_blocked(prev)) { + TRACE("psdnedf: %s/%d is not requeued " + "(state=%d, prev->preempt_count=%x", + prev->comm, prev->pid, prev->state, + prev->thread_info->preempt_count); + return; + } + + if (prev->state != TASK_RUNNING) + TRACE("psdnedf: %s/%d is requeued because of preempt hack" + "(state=%d, prev->preempt_count=%x", + prev->comm, prev->pid, prev->state, + prev->thread_info->preempt_count); + + /* IRQs are still disabled from by schedule(), which is calling us */ + spin_lock(&pedf->lock); + if ((get_rt_flags(prev) == RT_F_SLEEP && !is_released(prev)) || + get_rt_mode() != MODE_RT_RUN) + __add_release(edf, prev); /* it has got to wait */ + else + __add_ready(edf, prev); + spin_unlock(&pedf->lock); +} + + +/* Prepare a task for running in RT mode + * Enqueues the task into master queue data structure + * returns + * -EPERM if task is not TASK_STOPPED + */ +static long psnedf_prepare_task(struct task_struct * t) +{ + edf_domain_t* edf = task_edf(t); + psnedf_domain_t* pedf = task_pedf(t); + unsigned long flags; + + TRACE("[%d] psn edf: prepare task %d on CPU %d\n", + smp_processor_id(), t->pid, get_partition(t)); + if 
(t->state == TASK_STOPPED) { + __setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1); + + if (get_rt_mode() == MODE_RT_RUN) + /* The action is already on. + * Prepare immediate release. + */ + prepare_new_release(t); + /* The task should be running in the queue, otherwise signal + * code will try to wake it up with fatal consequences. + */ + t->state = TASK_RUNNING; + spin_lock_irqsave(&pedf->lock, flags); + __add_release(edf, t); + spin_unlock_irqrestore(&pedf->lock, flags); + return 0; + } else + return -EPERM; +} + +static void psnedf_wake_up_task(struct task_struct *task) +{ + unsigned long flags; + psnedf_domain_t* pedf = task_pedf(task); + edf_domain_t* edf = task_edf(task); + + TRACE("psnedf: %d unsuspends with budget=%d\n", + task->pid, task->time_slice); + + BUG_ON(in_list(&task->rt_list)); + + task->state = TASK_RUNNING; + + /* We need to take suspensions because of semaphores into + * account! If a job resumes after being suspended due to acquiring + * a semaphore, it should never be treated as a new job release. + */ + if (is_tardy(task) && get_rt_flags(task) != RT_F_EXIT_SEM) { + /* new sporadic release */ + prepare_new_release(task); + sched_trace_job_release(task); + } + + spin_lock_irqsave(&pedf->lock, flags); + if (task->time_slice) { + set_rt_flags(task, RT_F_RUNNING); + __add_ready(edf, task); + } else + __add_release(edf, task); + spin_unlock_irqrestore(&pedf->lock, flags); + +} + +static void psnedf_task_blocks(struct task_struct *t) +{ + BUG_ON(!is_realtime(t)); + /* not really anything to do since it can only block if + * it is running, and when it is not running it is not in any + * queue anyway. + */ + TRACE("task %d blocks with budget=%d\n", t->pid, t->time_slice); + BUG_ON(in_list(&t->rt_list)); +} + + +/* When _tear_down is called, the task should not be in any queue any more + * as it must have blocked first. We don't have any internal state for the task, + * it is all in the task_struct. 
+ */ +static long psnedf_tear_down(struct task_struct * t) +{ + BUG_ON(!is_realtime(t)); + TRACE_TASK(t, "tear down called"); + BUG_ON(t->array); + BUG_ON(in_list(&t->rt_list)); + return 0; +} + +static long psnedf_enter_np(struct task_struct * t) +{ + unsigned long flags; + psnedf_domain_t* pedf = task_pedf(t); + + spin_lock_irqsave(&pedf->lock, flags); + t->rt_param.is_non_preemptable++; + spin_unlock_irqrestore(&pedf->lock, flags); + return 0; +} + +static long psnedf_exit_np(struct task_struct * t) +{ + unsigned long flags; + psnedf_domain_t* pedf = task_pedf(t); + int ret = 0; + + spin_lock_irqsave(&pedf->lock, flags); + if (is_np(t)) { + if (!--t->rt_param.is_non_preemptable && + (!t->time_slice || + preemption_needed(task_edf(t), t))) { + if (!t->time_slice) { + set_rt_flags(t, RT_F_SLEEP); + TRACE("psnedf_exit_np: delayed preemption " + "of %d\n", t->pid); + } + BUG_ON(t != local_pedf->scheduled); + set_tsk_need_resched(t); + } + } else + ret = -EPERM; + spin_unlock_irqrestore(&pedf->lock, flags); + return ret; +} + +static long psnedf_pi_block(struct pi_semaphore *sem, + struct task_struct *new_waiter) +{ + psnedf_domain_t* pedf; + edf_domain_t* edf; + struct task_struct* t; + int cpu = get_partition(new_waiter); + + BUG_ON(!new_waiter); + + if (edf_higher_prio(new_waiter, sem->hp.cpu_task[cpu])) { + TRACE_TASK(new_waiter, " boosts priority\n"); + pedf = task_pedf(new_waiter); + edf = task_edf(new_waiter); + + /* interrupts already disabled */ + spin_lock(&pedf->lock); + + /* store new highest-priority task */ + sem->hp.cpu_task[cpu] = new_waiter; + if (sem->holder && + get_partition(sem->holder) == get_partition(new_waiter)) { + /* let holder inherit */ + sem->holder->rt_param.inh_task = new_waiter; + t = sem->holder; + if (in_list(&t->rt_list)) { + /* queued in domain*/ + list_del(&t->rt_list); + /* readd to make priority change take place */ + if (is_released(t)) + __add_ready(edf, t); + else + __add_release(edf, t); + } + } + + /* check if we need to reschedule */ + if (preemption_needed(edf, current)) + preempt(pedf); + + spin_unlock(&pedf->lock); + } + + return 0; +} + +static long psnedf_inherit_priority(struct pi_semaphore *sem, + struct task_struct *new_owner) +{ + int cpu = get_partition(new_owner); + + new_owner->rt_param.inh_task = sem->hp.cpu_task[cpu]; + if (sem->hp.cpu_task[cpu] && new_owner != sem->hp.cpu_task[cpu]) { + TRACE_TASK(new_owner, + "inherited priority from %s/%d\n", + sem->hp.cpu_task[cpu]->comm, + sem->hp.cpu_task[cpu]->pid); + } else + TRACE_TASK(new_owner, + "cannot inherit priority: " + "no higher priority job waits on this CPU!\n"); + /* make new owner non-preemptable as required by FMLP/FLEX under + * PSN-EDF. + */ + psnedf_enter_np(new_owner); + return 0; +} + + +/* This function is called on a semaphore release, and assumes that + * the current task is also the semaphore holder. + */ +static long psnedf_return_priority(struct pi_semaphore *sem) +{ + struct task_struct* t = current; + psnedf_domain_t* pedf = task_pedf(t); + edf_domain_t* edf = task_edf(t); + int ret = 0; + int cpu = get_partition(current); + + + /* Find new highest-priority semaphore task + * if holder task is the current hp.cpu_task[cpu]. + * + * Calling function holds sem->wait.lock. 
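+ *
+ * The holder first hands the semaphore's hp record over to the
+ * remaining waiters via edf_set_hp_cpu_task(), leaves the
+ * non-preemptive section it entered in psnedf_inherit_priority(),
+ * drops any inherited priority, and finally preempts if a
+ * higher-priority job is now eligible on this partition.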
+ */ + if (t == sem->hp.cpu_task[cpu]) + edf_set_hp_cpu_task(sem, cpu); + + psnedf_exit_np(t); + if (current->rt_param.inh_task) { + TRACE_CUR("return priority of %s/%d\n", + current->rt_param.inh_task->comm, + current->rt_param.inh_task->pid); + spin_lock(&pedf->lock); + + /* Reset inh_task to NULL. */ + current->rt_param.inh_task = NULL; + + /* check if we need to reschedule */ + if (preemption_needed(edf, current)) + preempt(pedf); + + spin_unlock(&pedf->lock); + } else + TRACE_CUR(" no priority to return %p\n", sem); + + return ret; +} + + +static int psnedf_mode_change(int new_mode) +{ + int cpu; + + if (new_mode == MODE_RT_RUN) + for_each_online_cpu(cpu) { + spin_lock(&remote_pedf(cpu)->lock); + __prepare_new_releases(remote_edf(cpu), jiffies); + spin_unlock(&remote_pedf(cpu)->lock); + } + + TRACE("[%d] psn edf: mode changed to %d\n", + smp_processor_id(), new_mode); + return 0; +} + + +/* Plugin object */ +static sched_plugin_t s_plugin __cacheline_aligned_in_smp = { + .ready_to_use = 0 +}; + + +/* + * Plugin initialization code. + */ +#define INIT_SCHED_PLUGIN (struct sched_plugin) {\ + .plugin_name = "PSN-EDF",\ + .ready_to_use = 1,\ + .algo_scheduler_tick = psnedf_scheduler_tick,\ + .scheduler_tick = rt_scheduler_tick,\ + .prepare_task = psnedf_prepare_task,\ + .sleep_next_period = edf_sleep_next_period,\ + .tear_down = psnedf_tear_down,\ + .shutdown_hook = NULL,\ + .schedule = psnedf_schedule,\ + .finish_switch = psnedf_finish_switch,\ + .mode_change = psnedf_mode_change,\ + .wake_up_task = psnedf_wake_up_task,\ + .task_blocks = psnedf_task_blocks, \ + .enter_np = psnedf_enter_np, \ + .exit_np = psnedf_exit_np, \ + .pi_block = psnedf_pi_block, \ + .inherit_priority = psnedf_inherit_priority, \ + .return_priority = psnedf_return_priority \ +} + + +sched_plugin_t *__init init_psn_edf_plugin(void) +{ + int i; + + if (!s_plugin.ready_to_use) + { + set_sched_options(SCHED_NONE); + for (i = 0; i < NR_CPUS; i++) + { + psnedf_domain_init(remote_pedf(i), + psnedf_check_resched, i); + printk("PSN-EDF: CPU partition %d initialized.\n", i); + } + s_plugin = INIT_SCHED_PLUGIN; + } + return &s_plugin; +} + + + diff --git a/kernel/sched_trace.c b/kernel/sched_trace.c new file mode 100644 index 0000000..be495f1 --- /dev/null +++ b/kernel/sched_trace.c @@ -0,0 +1,725 @@ +/* sched_trace.c -- record scheduling events to a byte stream. + * + * TODO: Move ring buffer to a lockfree implementation. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + + +typedef struct { + /* guard read and write pointers */ + spinlock_t lock; + /* guard against concurrent freeing of buffer */ + rwlock_t del_lock; + + /* memory allocated for ring buffer */ + unsigned long order; + char* buf; + char* end; + + /* Read/write pointer. May not cross. + * They point to the position of next write and + * last read. 
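+ *
+ * writep is the next byte to be written, readp the last byte consumed,
+ * so the slot at readp never holds live data: the buffer is empty when
+ * writep == readp + 1 (the initial state set up by rb_alloc_buf()) and
+ * the usable capacity is one byte less than the allocation.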
+	 */
+	char*	writep;
+	char*	readp;
+
+} ring_buffer_t;
+
+#define EMPTY_RING_BUFFER { \
+	.lock = SPIN_LOCK_UNLOCKED, \
+	.del_lock = RW_LOCK_UNLOCKED, \
+	.buf = NULL, \
+	.end = NULL, \
+	.writep = NULL, \
+	.readp = NULL \
+}
+
+void rb_init(ring_buffer_t* buf)
+{
+	*buf = (ring_buffer_t) EMPTY_RING_BUFFER;
+}
+
+int rb_alloc_buf(ring_buffer_t* buf, unsigned long order)
+{
+	unsigned long flags;
+	int error = 0;
+	char *mem;
+
+	/* do memory allocation while not atomic */
+	mem = (char *) __get_free_pages(GFP_KERNEL, order);
+	if (!mem)
+		return -ENOMEM;
+	write_lock_irqsave(&buf->del_lock, flags);
+	BUG_ON(buf->buf);
+	buf->buf = mem;
+	buf->end = buf->buf + PAGE_SIZE * (1 << order) - 1;
+	memset(buf->buf, 0xff, buf->end - buf->buf);
+	buf->order = order;
+	buf->writep = buf->buf + 1;
+	buf->readp = buf->buf;
+	write_unlock_irqrestore(&buf->del_lock, flags);
+	return error;
+}
+
+int rb_free_buf(ring_buffer_t* buf)
+{
+	unsigned long flags;
+	int error = 0;
+	write_lock_irqsave(&buf->del_lock, flags);
+	BUG_ON(!buf->buf);
+	free_pages((unsigned long) buf->buf, buf->order);
+	buf->buf = NULL;
+	buf->end = NULL;
+	buf->writep = NULL;
+	buf->readp = NULL;
+	write_unlock_irqrestore(&buf->del_lock, flags);
+	return error;
+}
+
+/* Assumption: concurrent writes are serialized externally
+ *
+ * Will only succeed if there is enough space for all len bytes.
+ */
+int rb_put(ring_buffer_t* buf, char* mem, size_t len)
+{
+	unsigned long flags;
+	char *r, *w;
+	int error = 0;
+	read_lock_irqsave(&buf->del_lock, flags);
+	if (!buf->buf) {
+		error = -ENODEV;
+		goto out;
+	}
+	spin_lock(&buf->lock);
+	r = buf->readp;
+	w = buf->writep;
+	spin_unlock(&buf->lock);
+	if (r < w && buf->end - w >= len - 1) {
+		/* easy case: there is enough space in the buffer
+		 * to write it in one continuous chunk */
+		memcpy(w, mem, len);
+		w += len;
+		if (w > buf->end)
+			/* special case: fit exactly into buffer
+			 * w is now buf->end + 1
+			 */
+			w = buf->buf;
+	} else if (w < r && r - w >= len) { /* >= len because may not cross */
+		/* we are constrained by the read pointer, but there
+		 * is enough space
+		 */
+		memcpy(w, mem, len);
+		w += len;
+	} else if (r <= w && buf->end - w < len - 1) {
+		/* the wrap-around case: there may or may not be space */
+		if ((buf->end - w) + (r - buf->buf) >= len - 1) {
+			/* copy chunk that fits at the end */
+			memcpy(w, mem, buf->end - w + 1);
+			mem += buf->end - w + 1;
+			len -= (buf->end - w + 1);
+			w = buf->buf;
+			/* copy the rest */
+			memcpy(w, mem, len);
+			w += len;
+		}
+		else
+			error = -ENOMEM;
+	} else {
+		error = -ENOMEM;
+	}
+	if (!error) {
+		spin_lock(&buf->lock);
+		buf->writep = w;
+		spin_unlock(&buf->lock);
+	}
+ out:
+	read_unlock_irqrestore(&buf->del_lock, flags);
+	return error;
+}
+
+/* Assumption: concurrent reads are serialized externally */
+int rb_get(ring_buffer_t* buf, char* mem, size_t len)
+{
+	unsigned long flags;
+	char *r, *w;
+	int error = 0;
+	read_lock_irqsave(&buf->del_lock, flags);
+	if (!buf->buf) {
+		error = -ENODEV;
+		goto out;
+	}
+	spin_lock(&buf->lock);
+	r = buf->readp;
+	w = buf->writep;
+	spin_unlock(&buf->lock);
+
+	if (w <= r && buf->end - r >= len) {
+		/* easy case: there is enough data in the buffer
+		 * to get it in one chunk */
+		memcpy(mem, r + 1, len);
+		r += len;
+		error = len;
+
+	} else if (r + 1 < w && w - r - 1 >= len) {
+		/* we are constrained by the write pointer, but
+		 * there is enough data
+		 */
+		memcpy(mem, r + 1, len);
+		r += len;
+		error = len;
+
+	} else if (r + 1 < w && w - r - 1 < len) {
+		/* we are constrained by the write pointer and
+		 * there is not enough data
+		 */
+		memcpy(mem, r + 1, w - r - 1);
+		error = w - r - 1;
+		r += w - r - 1;
+
+	} else if (w <= r && buf->end - r < len) {
+		/* the wrap-around case: there may or may not be enough data;
+		 * first let's get what is available
+		 */
+		memcpy(mem, r + 1, buf->end - r);
+		error += (buf->end - r);
+		mem += (buf->end - r);
+		len -= (buf->end - r);
+		r += (buf->end - r);
+
+		if (w > buf->buf) {
+			/* there is more to get */
+			r = buf->buf - 1;
+			if (w - r >= len) {
+				/* plenty */
+				memcpy(mem, r + 1, len);
+				error += len;
+				r += len;
+			} else {
+				memcpy(mem, r + 1, w - r - 1);
+				error += w - r - 1;
+				r += w - r - 1;
+			}
+		}
+	} /* nothing available */
+
+	if (error > 0) {
+		spin_lock(&buf->lock);
+		buf->readp = r;
+		spin_unlock(&buf->lock);
+	}
+ out:
+	read_unlock_irqrestore(&buf->del_lock, flags);
+	return error;
+}
+
+
+
+/******************************************************************************/
+/* DEVICE FILE DRIVER */
+/******************************************************************************/
+
+
+
+/* Allocate a buffer of about 1 MB per CPU.
+ *
+ */
+#define BUFFER_ORDER 8
+
+typedef struct {
+	ring_buffer_t buf;
+	atomic_t reader_cnt;
+	struct semaphore reader_mutex;
+} trace_buffer_t;
+
+
+/* This does not initialize the semaphore!! */
+
+#define EMPTY_TRACE_BUFFER \
+	{ .buf = EMPTY_RING_BUFFER, .reader_cnt = ATOMIC_INIT(0)}
+
+static DEFINE_PER_CPU(trace_buffer_t, trace_buffer);
+
+#ifdef CONFIG_SCHED_DEBUG_TRACE
+static spinlock_t log_buffer_lock = SPIN_LOCK_UNLOCKED;
+#endif
+static trace_buffer_t log_buffer = EMPTY_TRACE_BUFFER;
+
+static void init_buffers(void)
+{
+	int i;
+
+	for (i = 0; i < NR_CPUS; i++) {
+		rb_init(&per_cpu(trace_buffer, i).buf);
+		init_MUTEX(&per_cpu(trace_buffer, i).reader_mutex);
+		atomic_set(&per_cpu(trace_buffer, i).reader_cnt, 0);
+	}
+	/* only initialize the mutex, the rest was initialized as part
+	 * of the static initialization macro
+	 */
+	init_MUTEX(&log_buffer.reader_mutex);
+}
+
+static int trace_release(struct inode *in, struct file *filp)
+{
+	int error = -EINVAL;
+	trace_buffer_t* buf = filp->private_data;
+
+	BUG_ON(!filp->private_data);
+
+	if (down_interruptible(&buf->reader_mutex)) {
+		error = -ERESTARTSYS;
+		goto out;
+	}
+
+	/* last release must deallocate buffers */
+	if (atomic_dec_return(&buf->reader_cnt) == 0) {
+		error = rb_free_buf(&buf->buf);
+	}
+
+	up(&buf->reader_mutex);
+ out:
+	return error;
+}
+
+static ssize_t trace_read(struct file *filp, char __user *to, size_t len,
+			  loff_t *f_pos)
+{
+	/* we ignore f_pos, this is strictly sequential */
+
+	ssize_t error = -EINVAL;
+	char* mem;
+	trace_buffer_t *buf = filp->private_data;
+
+	if (down_interruptible(&buf->reader_mutex)) {
+		error = -ERESTARTSYS;
+		goto out;
+	}
+
+	if (len > 64 * 1024)
+		len = 64 * 1024;
+	mem = kmalloc(len, GFP_KERNEL);
+	if (!mem) {
+		error = -ENOMEM;
+		goto out_unlock;
+	}
+
+	error = rb_get(&buf->buf, mem, len);
+	while (!error) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(110);
+		if (signal_pending(current))
+			error = -ERESTARTSYS;
+		else
+			error = rb_get(&buf->buf, mem, len);
+	}
+
+	if (error > 0 && copy_to_user(to, mem, error))
+		error = -EFAULT;
+
+	kfree(mem);
+ out_unlock:
+	up(&buf->reader_mutex);
+ out:
+	return error;
+}
+
+
+/* trace_open - Open one of the per-CPU sched_trace buffers.
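+ *
+ * A rough sketch of how user space might consume such a buffer; the device
+ * node name is an assumption, only the major number (TRACE_MAJOR below) and
+ * the minor == CPU id convention are defined in this file:
+ *
+ *	// mknod /dev/schedtrace0 c 250 0   (minor number selects the CPU)
+ *	int fd = open("/dev/schedtrace0", O_RDONLY);
+ *	char chunk[4096];
+ *	ssize_t n;
+ *	while ((n = read(fd, chunk, sizeof(chunk))) > 0)
+ *		write(STDOUT_FILENO, chunk, n);	// raw trace records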
+ */
+static int trace_open(struct inode *in, struct file *filp)
+{
+	int error = -EINVAL;
+	int cpu = MINOR(in->i_rdev);
+	trace_buffer_t* buf;
+
+	if (!cpu_online(cpu)) {
+		printk(KERN_WARNING "sched trace: "
+		       "CPU #%d is not online. (open failed)\n", cpu);
+		error = -ENODEV;
+		goto out;
+	}
+
+	buf = &per_cpu(trace_buffer, cpu);
+
+	if (down_interruptible(&buf->reader_mutex)) {
+		error = -ERESTARTSYS;
+		goto out;
+	}
+
+	/* first open must allocate buffers */
+	if (atomic_inc_return(&buf->reader_cnt) == 1) {
+		if ((error = rb_alloc_buf(&buf->buf, BUFFER_ORDER)))
+		{
+			atomic_dec(&buf->reader_cnt);
+			goto out_unlock;
+		}
+	}
+
+	error = 0;
+	filp->private_data = buf;
+
+ out_unlock:
+	up(&buf->reader_mutex);
+ out:
+	return error;
+}
+
+/* log_open - open the global log message ring buffer.
+ */
+static int log_open(struct inode *in, struct file *filp)
+{
+	int error = -EINVAL;
+	trace_buffer_t* buf;
+
+	buf = &log_buffer;
+
+	if (down_interruptible(&buf->reader_mutex)) {
+		error = -ERESTARTSYS;
+		goto out;
+	}
+
+	/* first open must allocate buffers */
+	if (atomic_inc_return(&buf->reader_cnt) == 1) {
+		if ((error = rb_alloc_buf(&buf->buf, BUFFER_ORDER)))
+		{
+			atomic_dec(&buf->reader_cnt);
+			goto out_unlock;
+		}
+	}
+
+	error = 0;
+	filp->private_data = buf;
+
+ out_unlock:
+	up(&buf->reader_mutex);
+ out:
+	return error;
+}
+
+/******************************************************************************/
+/* Device Registration */
+/******************************************************************************/
+
+/* The major numbers are from the unassigned/local use block.
+ *
+ * This should be converted to dynamic allocation at some point...
+ */
+#define TRACE_MAJOR 250
+#define LOG_MAJOR 251
+
+/* trace_fops - The file operations for accessing the per-CPU scheduling event
+ * trace buffers.
+ */
+struct file_operations trace_fops = {
+	.owner = THIS_MODULE,
+	.open = trace_open,
+	.release = trace_release,
+	.read = trace_read,
+};
+
+/* log_fops - The file operations for accessing the global LITMUS log message
+ * buffer.
+ *
+ * Except for opening the device file, it uses the same operations as trace_fops.
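+ *
+ * A device node has to be created by hand here as well; the path below is
+ * only an assumption, this file merely registers LOG_MAJOR:
+ *
+ *	# mknod /dev/litmus_log c 251 0
+ *	# cat /dev/litmus_log > litmus.log	(collects TRACE() text output)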
+ */
+struct file_operations log_fops = {
+	.owner = THIS_MODULE,
+	.open = log_open,
+	.release = trace_release,
+	.read = trace_read,
+};
+
+static int __init register_buffer_dev(const char* name,
+				      struct file_operations* fops,
+				      int major, int count)
+{
+	dev_t trace_dev;
+	struct cdev *cdev;
+	int error = 0;
+
+	trace_dev = MKDEV(major, 0);
+	error = register_chrdev_region(trace_dev, count, name);
+	if (error)
+	{
+		printk(KERN_WARNING "sched trace: "
+		       "Could not register major/minor number %d\n", major);
+		return error;
+	}
+	cdev = cdev_alloc();
+	if (!cdev) {
+		printk(KERN_WARNING "sched trace: "
+		       "Could not get a cdev for %s.\n", name);
+		return -ENOMEM;
+	}
+	cdev->owner = THIS_MODULE;
+	cdev->ops = fops;
+	error = cdev_add(cdev, trace_dev, count);
+	if (error) {
+		printk(KERN_WARNING "sched trace: "
+		       "add_cdev failed for %s.\n", name);
+		return -ENOMEM;
+	}
+	return error;
+
+}
+
+static int __init init_sched_trace(void)
+{
+	int error1 = 0, error2 = 0;
+
+	printk("Initializing scheduler trace device\n");
+	init_buffers();
+
+	error1 = register_buffer_dev("schedtrace", &trace_fops,
+				     TRACE_MAJOR, NR_CPUS);
+
+	error2 = register_buffer_dev("litmus_log", &log_fops,
+				     LOG_MAJOR, 1);
+	if (error1 || error2)
+		return min(error1, error2);
+	else
+		return 0;
+}
+
+module_init(init_sched_trace);
+
+/******************************************************************************/
+/* KERNEL API */
+/******************************************************************************/
+
+/* The per-CPU buffer for formatting LITMUS log messages. Don't put it on the
+ * stack; it is too big for that, and the kernel gets very picky with nested
+ * interrupts and small stacks.
+ */
+
+#ifdef CONFIG_SCHED_DEBUG_TRACE
+
+#define MSG_SIZE 255
+static DEFINE_PER_CPU(char[MSG_SIZE], fmt_buffer);
+
+/* sched_trace_log_message - This is the only function that accesses the
+ * log buffer inside the kernel for writing.
+ * Concurrent access to it is serialized via the
+ * log_buffer_lock.
+ *
+ * The maximum length of a formatted message is MSG_SIZE - 1 characters.
+ */
+void sched_trace_log_message(const char* fmt, ...)
+{
+	unsigned long flags;
+	va_list args;
+	size_t len;
+	char* buf;
+
+	va_start(args, fmt);
+	local_irq_save(flags);
+
+	/* format message */
+	buf = __get_cpu_var(fmt_buffer);
+	len = vscnprintf(buf, MSG_SIZE, fmt, args);
+
+	spin_lock(&log_buffer_lock);
+	/* Don't copy the trailing null byte, we don't want null bytes
+	 * in a text file.
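+	 * (vscnprintf() returns the number of characters written to the
+	 * buffer, excluding the terminating '\0', so passing len to rb_put()
+	 * below achieves exactly that.)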
+ */ + rb_put(&log_buffer.buf, buf, len); + spin_unlock(&log_buffer_lock); + + local_irq_restore(flags); + va_end(args); +} + +#endif + +#ifdef CONFIG_SCHED_TASK_TRACE + +static inline void __put_trace(char* mem, size_t size) +{ + trace_buffer_t* buf = &__get_cpu_var(trace_buffer); + rb_put(&buf->buf, mem, size); +} + +#define put_trace(obj) \ + if (get_rt_mode() == MODE_RT_RUN) \ + __put_trace((char *) &obj, sizeof(obj)) + +#define header(rec, type) \ +{ \ + rec.header.trace = type; \ + rec.header.timestamp = sched_clock(); \ +} + +#define tinfo(info, t) \ +{ \ + info.is_rt = is_realtime(t); \ + info.is_server = 0; \ + info.class = get_class(t); \ + info.budget = (t)->time_slice; \ + info.pid = (t)->pid; \ + info.deadline = (t)->rt_param.times.deadline; \ +} + +#define rtinfo(info, t) \ +{ \ + info.wcet = get_exec_cost(t); \ + info.period = get_rt_period(t); \ +} + +void sched_trace_scheduler_invocation(void) +{ + invocation_record_t rec; + header(rec, ST_INVOCATION); + rec.flags = current->flags; + put_trace(rec); +} + +void sched_trace_task_arrival(struct task_struct *t) +{ + arrival_record_t rec; + header(rec, ST_ARRIVAL); + tinfo(rec.task, t); + put_trace(rec); +} + + +void sched_trace_task_departure(struct task_struct *t) +{ + departure_record_t rec; + header(rec, ST_DEPARTURE); + tinfo(rec.task, t); + put_trace(rec); +} + +void sched_trace_task_preemption(struct task_struct *t, struct task_struct* by) +{ + preemption_record_t rec; + header(rec, ST_PREEMPTION); + tinfo(rec.task, t); + tinfo(rec.by, by); + put_trace(rec); +} + + +void sched_trace_task_scheduled(struct task_struct *t) +{ + scheduled_record_t rec; + header(rec, ST_SCHEDULED); + tinfo(rec.task, t); + put_trace(rec); +} + + +void sched_trace_job_release(struct task_struct *t) +{ + release_record_t rec; + header(rec, ST_JOB_RELEASE); + tinfo(rec.task, t); + rtinfo(rec, t); + put_trace(rec); +} + +void sched_trace_job_completion(struct task_struct *t) +{ + completion_record_t rec; + header(rec, ST_JOB_COMPLETION); + tinfo(rec.task, t); + rtinfo(rec, t); + rec.tardiness = jiffies - t->rt_param.times.deadline; + put_trace(rec); +} + + +void sched_trace_server_scheduled(int id, task_class_t class, + unsigned int budget, jiffie_t deadline) +{ + scheduled_record_t rec; + header(rec, ST_SCHEDULED); + rec.task.pid = id; + rec.task.is_rt = 1; + rec.task.is_server = 1; + rec.task.class = class; + rec.task.budget = budget; + rec.task.deadline = deadline; + put_trace(rec); +} + +void sched_trace_server_release(int id, unsigned int wcet, + unsigned int period, task_class_t class) +{ + release_record_t rec; + header(rec, ST_JOB_RELEASE); + rec.task.pid = id; + rec.task.is_rt = 1; + rec.task.is_server = 1; + rec.task.class = class; + rec.task.budget = wcet; + rec.period = period; + rec.wcet = wcet; + put_trace(rec); +} + +void sched_trace_server_completion(int id, unsigned int budget, + jiffie_t deadline, task_class_t class) +{ + completion_record_t rec; + header(rec, ST_JOB_COMPLETION); + rec.task.pid = id; + rec.task.is_rt = 1; + rec.task.is_server = 1; + rec.task.class = class; + rec.task.budget = budget; + rec.task.deadline = deadline; + rec.period = 0; + rec.tardiness = jiffies - deadline; + put_trace(rec); + +} + +void sched_trace_capacity_release(struct task_struct *t) +{ + cap_release_record_t rec; + header(rec, ST_CAPACITY_RELEASE); + tinfo(rec.task, t); + put_trace(rec); +} + +void sched_trace_capacity_allocation(struct task_struct *t, u16 budget, u32 deadline, + pid_t donor) +{ + cap_allocation_record_t rec; + header(rec, 
ST_CAPACITY_ALLOCATION); + tinfo(rec.task, t); + rec.donor = donor; + rec.budget = budget; + rec.deadline = deadline; + put_trace(rec); +} + +void sched_trace_capacity_alloc_srv(pid_t srv, u32 srv_dl, task_class_t cls, + u16 srv_budget, + u16 budget, u32 deadline, pid_t donor) +{ + cap_allocation_record_t rec; + header(rec, ST_CAPACITY_ALLOCATION); + rec.task.pid = srv; + rec.task.is_rt = 1; + rec.task.is_server = 1; + rec.task.class = cls; + rec.task.budget = srv_budget; + rec.task.deadline = srv_dl; + rec.donor = donor; + rec.budget = budget; + rec.deadline = deadline; + put_trace(rec); +} + +#endif diff --git a/kernel/timer.c b/kernel/timer.c index c2a8ccf..77a1b6b 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -737,6 +737,27 @@ static inline s64 __get_nsec_offset(void) return ns_offset; } +/* Non-static, non-inline, public version of function above. + * It's up to the programmer to decide how to use it, no guarantees + * about anything are made here. + */ +s64 get_nsec_offset(void) +{ + cycle_t cycle_now, cycle_delta; + s64 ns_offset; + + /* read clocksource: */ + cycle_now = clocksource_read(clock); + + /* calculate the delta since the last update_wall_time: */ + cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; + + /* convert to nanoseconds: */ + ns_offset = cyc2ns(clock, cycle_delta); + + return ns_offset; +} + /** * __get_realtime_clock_ts - Returns the time of day in a timespec * @ts: pointer to the timespec to be set @@ -789,6 +810,7 @@ void do_gettimeofday(struct timeval *tv) } EXPORT_SYMBOL(do_gettimeofday); + /** * do_settimeofday - Sets the time of day * @tv: pointer to the timespec variable containing the new time diff --git a/kernel/trace.c b/kernel/trace.c new file mode 100644 index 0000000..ecebe6c --- /dev/null +++ b/kernel/trace.c @@ -0,0 +1,257 @@ +#include +#include +#include +#include +#include + +#include + +/******************************************************************************/ +/* Allocation */ +/******************************************************************************/ + +struct ft_buffer* trace_ts_buf = NULL; + +static unsigned int ts_seq_no = 0; + +feather_callback void save_timestamp(unsigned long event) +{ + unsigned int seq_no = fetch_and_inc((int *) &ts_seq_no); + struct timestamp *ts; + if (ft_buffer_start_write(trace_ts_buf, (void**) &ts)) { + ts->event = event; + ts->timestamp = ft_read_tsc(); + ts->seq_no = seq_no; + ts->cpu = raw_smp_processor_id(); + ft_buffer_finish_write(trace_ts_buf, ts); + } +} + +static struct ft_buffer* alloc_ft_buffer(unsigned int count, size_t size) +{ + struct ft_buffer* buf; + size_t total = (size + 1) * count; + char* mem; + int order = 0, pages = 1; + + buf = kmalloc(sizeof(struct ft_buffer), GFP_KERNEL); + if (!buf) + return NULL; + + total = (total / PAGE_SIZE) + (total % PAGE_SIZE != 0); + while (pages < total) { + order++; + pages *= 2; + } + + mem = (char*) __get_free_pages(GFP_KERNEL, order); + if (!mem) { + kfree(buf); + return NULL; + } + + if (!init_ft_buffer(buf, count, size, + mem + (count * size), /* markers at the end */ + mem)) { /* buffer objects */ + free_pages((unsigned long) mem, order); + kfree(buf); + return NULL; + } + return buf; +} + +static void free_ft_buffer(struct ft_buffer* buf) +{ + int order = 0, pages = 1; + size_t total; + + if (buf) { + total = (buf->slot_size + 1) * buf->slot_count; + total = (total / PAGE_SIZE) + (total % PAGE_SIZE != 0); + while (pages < total) { + order++; + pages *= 2; + } + free_pages((unsigned long) buf->buffer_mem, order); + kfree(buf); 
+ } +} + + +/******************************************************************************/ +/* DEVICE FILE DRIVER */ +/******************************************************************************/ + +#define NO_TIMESTAMPS 262144 + +static int trace_release(struct inode *in, struct file *filp) +{ + int error = -EINVAL; + + /* disable events */ + ft_disable_all_events(); + set_current_state(TASK_UNINTERRUPTIBLE); + /* wait for any pending events to complete */ + schedule_timeout(HZ); + printk(KERN_ALERT "Failed trace writes: %u\n", + trace_ts_buf->failed_writes); + free_ft_buffer(trace_ts_buf); + trace_ts_buf = NULL; + return error; +} + +static ssize_t trace_read(struct file *filp, char __user *to, size_t len, + loff_t *f_pos) +{ + /* we ignore f_pos, this is strictly sequential */ + ssize_t error = 0; + struct timestamp ts; + + while (len >= sizeof(struct timestamp)) { + if (ft_buffer_read(trace_ts_buf, &ts)) { + if (copy_to_user(to, &ts, sizeof(struct timestamp))) { + error = -EFAULT; + break; + } else { + len -= sizeof(struct timestamp); + to += sizeof(struct timestamp); + error += sizeof(struct timestamp); + } + } else { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(50); + if (signal_pending(current)) { + error = -ERESTARTSYS; + break; + } + } + } + return error; +} + +#define ENABLE_CMD 0 +#define DISABLE_CMD 1 + +static ssize_t trace_write(struct file *filp, const char __user *from, + size_t len, loff_t *f_pos) +{ + ssize_t error = -EINVAL; + unsigned long cmd; + unsigned long id; + + if (len % sizeof(long) || len < 2 * sizeof(long)) + goto out; + + if (copy_from_user(&cmd, from, sizeof(long))) { + error = -EFAULT; + goto out; + } + len -= sizeof(long); + from += sizeof(long); + + if (cmd != ENABLE_CMD && cmd != DISABLE_CMD) + goto out; + + error = sizeof(long); + while (len) { + if (copy_from_user(&id, from, sizeof(long))) { + error = -EFAULT; + goto out; + } + len -= sizeof(long); + from += sizeof(long); + if (cmd) { + printk(KERN_INFO + "Disabling feather-trace event %lu.\n", id); + ft_disable_event(id); + } else { + printk(KERN_INFO + "Enabling feather-trace event %lu.\n", id); + ft_enable_event(id); + } + error += sizeof(long); + } + + + out: + return error; +} + +static int trace_open(struct inode *in, struct file *filp) +{ + int err = 0; + unsigned int count = NO_TIMESTAMPS; + while (count && !trace_ts_buf) { + printk("trace: trying to allocate %u time stamps.\n", count); + trace_ts_buf = alloc_ft_buffer(count, sizeof(struct timestamp)); + count /= 2; + } + if (!trace_ts_buf) + err = -ENOMEM; + + return err; +} + +/******************************************************************************/ +/* Device Registration */ +/******************************************************************************/ + +#define FT_TRACE_MAJOR 252 + +struct file_operations ft_trace_fops = { + .owner = THIS_MODULE, + .open = trace_open, + .release = trace_release, + .write = trace_write, + .read = trace_read, +}; + + +static int __init register_buffer_dev(const char* name, + struct file_operations* fops, + int major, int count) +{ + dev_t trace_dev; + struct cdev *cdev; + int error = 0; + + trace_dev = MKDEV(major, 0); + error = register_chrdev_region(trace_dev, count, name); + if (error) + { + printk(KERN_WARNING "trace: " + "Could not register major/minor number %d\n", major); + return error; + } + cdev = cdev_alloc(); + if (!cdev) { + printk(KERN_WARNING "trace: " + "Could not get a cdev for %s.\n", name); + return -ENOMEM; + } + cdev->owner = THIS_MODULE; + cdev->ops = fops; 
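+	/* cdev_add() makes the device live immediately, so it has to be the
+	 * last step of the setup. */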
+ error = cdev_add(cdev, trace_dev, count); + if (error) { + printk(KERN_WARNING "trace: " + "add_cdev failed for %s.\n", name); + return -ENOMEM; + } + return error; + +} + +static int __init init_sched_trace(void) +{ + int error = 0; + + printk("Initializing Feather-Trace device\n"); + /* dummy entry to make linker happy */ + ft_event0(666, save_timestamp); + + error = register_buffer_dev("ft_trace", &ft_trace_fops, + FT_TRACE_MAJOR, 1); + return error; +} + +module_init(init_sched_trace); diff --git a/lib/semaphore-sleepers.c b/lib/semaphore-sleepers.c index 1281805..3f4d543 100644 --- a/lib/semaphore-sleepers.c +++ b/lib/semaphore-sleepers.c @@ -108,7 +108,7 @@ fastcall int __sched __down_interruptible(struct semaphore * sem) /* * With signals pending, this turns into * the trylock failure case - we won't be - * sleeping, and we* can't get the lock as + * sleeping, and we can't get the lock as * it has contention. Just correct the count * and exit. */
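
The write() handler of the Feather-Trace device (kernel/trace.c above) implements a small binary protocol: the first long of the written buffer is a command (ENABLE_CMD = 0, DISABLE_CMD = 1) and every following long is an event ID. A minimal user-space sketch of driving it is given below; the device path and the helper name are assumptions, only FT_TRACE_MAJOR (252) is fixed by the patch, so the node has to be created manually (e.g. mknod /dev/ft_trace c 252 0):

	#include <fcntl.h>
	#include <unistd.h>

	/* Enable the given Feather-Trace events (n must be at least 1) and
	 * return the open file descriptor (keep it open and read() struct
	 * timestamp records from it), or -1 on failure. */
	static int ft_enable_events(const char *dev, const unsigned long *ids, int n)
	{
		unsigned long msg[1 + n];	/* command word followed by event ids */
		int i, fd = open(dev, O_RDWR);

		if (fd < 0)
			return -1;
		msg[0] = 0;			/* ENABLE_CMD */
		for (i = 0; i < n; i++)
			msg[1 + i] = ids[i];
		if (write(fd, msg, sizeof(msg)) != (ssize_t) sizeof(msg)) {
			close(fd);
			return -1;
		}
		return fd;
	}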