Makefile | 2 +- arch/i386/Kconfig | 28 ++ arch/i386/kernel/apic.c | 92 +++++ arch/i386/kernel/i386_ksyms.c | 1 + arch/i386/kernel/signal.c | 3 +- arch/i386/kernel/smp.c | 1 + arch/i386/kernel/syscall_table.S | 22 + fs/exec.c | 5 +- fs/inode.c | 2 + include/asm-i386/unistd.h | 25 ++- include/linux/completion.h | 2 + include/linux/fs.h | 5 + include/linux/sched.h | 14 + include/linux/uaccess.h | 16 + include/litmus/edf_common.h | 27 ++ include/litmus/fdso.h | 78 ++++ include/litmus/feather_buffer.h | 108 +++++ include/litmus/feather_trace.h | 93 +++++ include/litmus/jobs.h | 9 + include/litmus/litmus.h | 200 +++++++++ include/litmus/rm_common.h | 44 ++ include/litmus/rt_domain.h | 94 +++++ include/litmus/rt_param.h | 177 ++++++++ include/litmus/sched_plugin.h | 120 ++++++ include/litmus/sched_trace.h | 31 ++ include/litmus/trace.h | 106 +++++ kernel/exit.c | 4 + kernel/fork.c | 5 + kernel/sched.c | 177 ++++++++- lib/semaphore-sleepers.c | 2 +- litmus/Makefile | 9 + litmus/edf_common.c | 95 +++++ litmus/fdso.c | 289 +++++++++++++ litmus/ft_event.c | 104 +++++ litmus/jobs.c | 43 ++ litmus/litmus.c | 830 ++++++++++++++++++++++++++++++++++++++ litmus/litmus_sem.c | 551 +++++++++++++++++++++++++ litmus/pcp.c | 764 +++++++++++++++++++++++++++++++++++ litmus/rm_common.c | 76 ++++ litmus/rt_domain.c | 130 ++++++ litmus/sched_gsn_edf.c | 733 +++++++++++++++++++++++++++++++++ litmus/sched_plugin.c | 169 ++++++++ litmus/sched_psn_edf.c | 458 +++++++++++++++++++++ litmus/sched_rm.c | 397 ++++++++++++++++++ litmus/sched_trace.c | 541 +++++++++++++++++++++++++ litmus/sync.c | 84 ++++ litmus/trace.c | 302 ++++++++++++++ 47 files changed, 7052 insertions(+), 16 deletions(-) diff --git a/Makefile b/Makefile index 7e2750f..79cf62b 100644 --- a/Makefile +++ b/Makefile @@ -553,7 +553,7 @@ export mod_strip_cmd ifeq ($(KBUILD_EXTMOD),) -core-y += kernel/ mm/ fs/ ipc/ security/ crypto/ block/ +core-y += kernel/ mm/ fs/ ipc/ security/ crypto/ block/ litmus/ vmlinux-dirs := $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \ $(core-y) $(core-m) $(drivers-y) $(drivers-m) \ diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index 0dfee81..da6f1e9 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -1210,6 +1210,7 @@ config KPROBES a probepoint and specifies the callback. Kprobes is useful for kernel debugging, non-intrusive instrumentation and testing. If in doubt, say "N". + endmenu source "arch/i386/Kconfig.debug" @@ -1259,3 +1260,30 @@ config X86_TRAMPOLINE config KTIME_SCALAR bool default y + + +menu "LITMUS^RT" + + +config SCHED_TASK_TRACE + bool "Trace real-time tasks" + default y + help + Include support for the sched_trace_XXX() tracing functions. This + allows the collection of real-time task events such as job + completions, job releases, early completions, etc. This results in a + small overhead in the scheduling code. Disable if the overhead is not + acceptable (e.g., benchmarking). + +config SCHED_DEBUG_TRACE + bool "TRACE() debugging" + default y + help + Include support for sched_trace_log_messageg(), which is used to + implement TRACE(). If disabled, no TRACE() messages will be included + in the kernel, and no overheads due to debugging statements will be + incurred by the scheduler. Disable if the overhead is not acceptable + (e.g. benchmarking). 
+ + +endmenu diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c index 776d9be..36b0159 100644 --- a/arch/i386/kernel/apic.c +++ b/arch/i386/kernel/apic.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -43,6 +44,8 @@ #include "io_ports.h" +#include + /* * cpu_mask that denotes the CPUs that needs timer interrupt coming in as * IPIs in place of local APIC timers @@ -54,6 +57,15 @@ static cpumask_t timer_bcast_ipi; */ static int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */ +/* + * Definitions and variables related to quantum synchronization. + */ +#define WAIT_TO_SYNC 30000 /* time after boot until sync */ +static int stagger = 0; /* are we using staggered quanta? */ +static atomic_t qsync_time = ATOMIC_INIT(INITIAL_JIFFIES); +static atomic_t quantum_sync_barrier = ATOMIC_INIT(0); +static atomic_t sync_done = ATOMIC_INIT(0); + static inline void lapic_disable(void) { enable_local_apic = -1; @@ -786,6 +798,23 @@ static int __init apic_set_verbosity(char *str) __setup("apic=", apic_set_verbosity); +/* + * Determine whether to use aligned or staggerd quanta. + */ + +static int __init apic_synch_type(char *str) +{ + if (strcmp("aligned", str) == 0) + stagger = 0; + else if (strcmp("staggered", str) == 0) + stagger = 1; + else + stagger = 0; /* aligned quanta by default */ + return 1; +} + +__setup("quanta=", apic_synch_type); + static int __init detect_init_APIC (void) { u32 h, l, features; @@ -1198,6 +1227,47 @@ EXPORT_SYMBOL(switch_ipi_to_APIC_timer); #undef APIC_DIVISOR /* + * This function is called to align all quanta, and to stagger quanta if + * necessary. It relies on a barrier to synchronize all processors, so + * that they all reset their APIC timers at the same time. If quanta + * should be staggered, the appropriate stagger delay is then added at + * each processor. + */ + +void synchronize_quanta(void) +{ + int cpu = smp_processor_id(); + int total_cpus = num_online_cpus(); + int stagger_interval = jiffies_to_usecs(1) / total_cpus; + + /* + * Disable APIC timer, wait for all other processors to reach barrier, + * and re-enable all timers concurrently. + */ + disable_APIC_timer(); + atomic_inc(&quantum_sync_barrier); + while (atomic_read(&quantum_sync_barrier) < total_cpus) { + /* Delay, otherwise atomic_inc's cannot occur. */ + udelay(1); + } + + /* Add necessary stagger for this CPU, if required. */ + if (stagger) { + int stagger_us = cpu * stagger_interval; + udelay(stagger_us); + } + + /* Re-enable all timers. */ + __setup_APIC_LVTT(calibration_result); + enable_APIC_timer(); + + /* The first CPU signals that quantum sync is complete. */ + if (cpu == 0) + atomic_inc(&sync_done); +} + + +/* * Local timer interrupt handler. It does both profiling and * process statistics/rescheduling. * @@ -1209,11 +1279,32 @@ EXPORT_SYMBOL(switch_ipi_to_APIC_timer); inline void smp_local_timer_interrupt(void) { +/* s64 offset; */ + + TS_TICK_START; + profile_tick(CPU_PROFILING); #ifdef CONFIG_SMP update_process_times(user_mode_vm(get_irq_regs())); #endif + /* Print out timing data - can be commented out if necessary. */ +/* offset = get_nsec_offset(); */ +/* TRACE("%d\n", offset); */ + + /* + * Synchronize quanta if we have reached qsync_time plus wait + * interval. The synchronization code itself is placed in its own + * (non-inline) function, to avoid issues with creating an inline + * function that is too large. 
+ */ + if (unlikely(!atomic_read(&sync_done) && + time_after(jiffies, + (unsigned long)(atomic_read(&qsync_time) + + msecs_to_jiffies(WAIT_TO_SYNC))))) { + synchronize_quanta(); + } + /* * We take the 'long' return path, and there every subsystem * grabs the apropriate locks (kernel lock/ irq lock). @@ -1224,6 +1315,7 @@ inline void smp_local_timer_interrupt(void) * Currently this isn't too much of an issue (performance wise), * we can take more than 100K local irqs per second on a 100 MHz P5. */ + TS_TICK_END; } /* diff --git a/arch/i386/kernel/i386_ksyms.c b/arch/i386/kernel/i386_ksyms.c index e3d4b73..9670f77 100644 --- a/arch/i386/kernel/i386_ksyms.c +++ b/arch/i386/kernel/i386_ksyms.c @@ -6,6 +6,7 @@ EXPORT_SYMBOL(__down_failed); EXPORT_SYMBOL(__down_failed_interruptible); EXPORT_SYMBOL(__down_failed_trylock); EXPORT_SYMBOL(__up_wakeup); + /* Networking helper routines. */ EXPORT_SYMBOL(csum_partial_copy_generic); diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c index 65d7620..e95d732 100644 --- a/arch/i386/kernel/signal.c +++ b/arch/i386/kernel/signal.c @@ -651,7 +651,6 @@ void do_notify_resume(struct pt_regs *regs, void *_unused, /* deal with pending signal delivery */ if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK)) - do_signal(regs); - + do_signal(regs); clear_thread_flag(TIF_IRET); } diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c index 5285aff..91921a3 100644 --- a/arch/i386/kernel/smp.c +++ b/arch/i386/kernel/smp.c @@ -605,6 +605,7 @@ void smp_send_stop(void) */ fastcall void smp_reschedule_interrupt(struct pt_regs *regs) { + set_tsk_need_resched(current); ack_APIC_irq(); } diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S index 2697e92..48e5e8e 100644 --- a/arch/i386/kernel/syscall_table.S +++ b/arch/i386/kernel/syscall_table.S @@ -319,3 +319,25 @@ ENTRY(sys_call_table) .long sys_move_pages .long sys_getcpu .long sys_epoll_pwait + /* LITMUS syscalls */ + .long sys_set_rt_task_param /* 320 */ + .long sys_get_rt_task_param + .long sys_task_mode_transition + .long sys_sleep_next_period + .long sys_register_np_flag + .long sys_exit_np /* 325 */ + .long sys_od_open + .long sys_od_close + .long sys_pi_down + .long sys_pi_up + .long sys_srp_down /* 330 */ + .long sys_srp_up + .long sys_reg_task_srp_sem + .long sys_query_job_no + .long sys_wait_for_job_release + .long sys_wait_for_ts_release /* 335 */ + .long sys_release_ts + .long sys_pcp_down + .long sys_pcp_up + .long sys_dpcp_invoke + .long sys_dpcp_agent /* 340 */ diff --git a/fs/exec.c b/fs/exec.c index 11fe93f..353d6e3 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -54,6 +54,8 @@ #include #include +#include + #ifdef CONFIG_KMOD #include #endif @@ -1140,7 +1142,8 @@ int do_execve(char * filename, if (IS_ERR(file)) goto out_kfree; - sched_exec(); + sched_exec(); + litmus_exec(); bprm->p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *); diff --git a/fs/inode.c b/fs/inode.c index bf21dc6..fcf8ce3 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -205,6 +205,8 @@ void inode_init_once(struct inode *inode) INIT_LIST_HEAD(&inode->inotify_watches); mutex_init(&inode->inotify_mutex); #endif + INIT_LIST_HEAD(&inode->i_obj_list); + mutex_init(&inode->i_obj_mutex); } EXPORT_SYMBOL(inode_init_once); diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h index 833fa17..d0ba5c3 100644 --- a/include/asm-i386/unistd.h +++ b/include/asm-i386/unistd.h @@ -325,10 +325,33 @@ #define __NR_move_pages 317 #define __NR_getcpu 318 #define __NR_epoll_pwait 319 +/* LITMUS */ +#define 
__NR_set_rt_task_param 320 +#define __NR_get_rt_task_param 321 +#define __NR_task_mode 322 +#define __NR_sleep_next_period 323 +#define __NR_register_np_flag 324 +#define __NR_exit_np 325 +#define __NR_od_open 326 +#define __NR_od_close 327 +#define __NR_pi_down 328 +#define __NR_pi_up 329 +#define __NR_srp_down 330 +#define __NR_srp_up 331 +#define __NR_reg_task_srp_sem 332 +#define __NR_query_job_no 333 +#define __NR_wait_for_job_release 334 +#define __NR_wait_for_ts_release 335 +#define __NR_release_ts 336 +#define __NR_pcp_down 337 +#define __NR_pcp_up 338 +#define __NR_dpcp_invoke 339 +#define __NR_dpcp_agent 340 + #ifdef __KERNEL__ -#define NR_syscalls 320 +#define NR_syscalls 343 #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR diff --git a/include/linux/completion.h b/include/linux/completion.h index 268c5a4..dc633ed 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h @@ -51,6 +51,8 @@ extern unsigned long FASTCALL(wait_for_completion_interruptible_timeout( extern void FASTCALL(complete(struct completion *)); extern void FASTCALL(complete_all(struct completion *)); +extern void FASTCALL(complete_n(struct completion *, int n)); + #define INIT_COMPLETION(x) ((x).done = 0) diff --git a/include/linux/fs.h b/include/linux/fs.h index 1410e53..4e1117c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -524,6 +524,8 @@ static inline int mapping_writably_mapped(struct address_space *mapping) #define i_size_ordered_init(inode) do { } while (0) #endif +struct inode_obj_id_table; + struct inode { struct hlist_node i_hash; struct list_head i_list; @@ -589,6 +591,9 @@ struct inode { void *i_security; #endif void *i_private; /* fs or device private pointer */ + + struct list_head i_obj_list; + struct mutex i_obj_mutex; }; /* diff --git a/include/linux/sched.h b/include/linux/sched.h index 4463735..c7929d6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -3,6 +3,8 @@ #include /* For AT_VECTOR_SIZE */ +#include + /* * cloning flags: */ @@ -796,6 +798,8 @@ enum sleep_type { SLEEP_INTERRUPTED, }; +struct od_table_entry; + struct prio_array; struct task_struct { @@ -1051,6 +1055,16 @@ struct task_struct { #ifdef CONFIG_FAULT_INJECTION int make_it_fail; #endif + /* litmus parameters and state */ + struct rt_param rt_param; + + /* allow scheduler plugins to queue in release lists, etc. + * Cleanup: Move this into the rt_param struct. + */ + struct list_head rt_list; + + /* references to PI semaphores, etc. */ + struct od_table_entry* od_table; }; static inline pid_t process_group(struct task_struct *tsk) diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 975c963..6ae0ff9 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -84,4 +84,20 @@ static inline unsigned long __copy_from_user_nocache(void *to, ret; \ }) +/* This is a naive attempt at a write version of the above native Linux macro. 
+ */ +#define poke_kernel_address(val, addr) \ + ({ \ + long ret; \ + mm_segment_t old_fs = get_fs(); \ + \ + set_fs(KERNEL_DS); \ + pagefault_disable(); \ + ret = __put_user(val, (__force typeof(val) __user *)(addr)); \ + pagefault_enable(); \ + set_fs(old_fs); \ + ret; \ + }) + + #endif /* __LINUX_UACCESS_H__ */ diff --git a/include/litmus/edf_common.h b/include/litmus/edf_common.h new file mode 100644 index 0000000..df711f5 --- /dev/null +++ b/include/litmus/edf_common.h @@ -0,0 +1,27 @@ +/* EDF common data structures and utility functions shared by all EDF + * based scheduler plugins + */ + +/* CLEANUP: Add comments and make it less messy. + * + */ + +#ifndef __UNC_EDF_COMMON_H__ +#define __UNC_EDF_COMMON_H__ + +#include + + +void edf_domain_init(rt_domain_t* rt, check_resched_needed_t resched); + +int edf_higher_prio(struct task_struct* first, + struct task_struct* second); + +int edf_ready_order(struct list_head* a, struct list_head* b); + +int edf_preemption_needed(rt_domain_t* rt, struct task_struct *t); + +#define job_completed(t) (!is_be(t) && \ + (t)->rt_param.times.exec_time == (t)->rt_param.basic_params.exec_cost) + +#endif diff --git a/include/litmus/fdso.h b/include/litmus/fdso.h new file mode 100644 index 0000000..5544c1b --- /dev/null +++ b/include/litmus/fdso.h @@ -0,0 +1,78 @@ +/* fdso.h - file descriptor attached shared objects + * + * (c) 2007 B. Brandenburg, LITMUS^RT project + */ + +#ifndef _LINUX_FDSO_H_ +#define _LINUX_FDSO_H_ + +#include +#include + +#include + +#define MAX_OBJECT_DESCRIPTORS 32 + +typedef enum { + MIN_OBJ_TYPE = 0, + + PI_SEM = 0, + SRP_SEM = 1, + PCP_SEM = 2, + MPCP_SEM = 3, + + MAX_OBJ_TYPE = 3 +} obj_type_t; + +struct inode_obj_id { + struct list_head list; + atomic_t count; + struct inode* inode; + + obj_type_t type; + void* obj; + unsigned int id; +}; + + +struct od_table_entry { + unsigned int used; + + struct inode_obj_id* obj; + void* extra; +}; + +struct fdso_ops { + void* (*create) (void); + void (*destroy)(void*); + int (*open) (struct od_table_entry*, void* __user); + int (*close) (struct od_table_entry*); +}; + +/* translate a userspace supplied od into the raw table entry + * returns NULL if od is invalid + */ +struct od_table_entry* __od_lookup(int od); + +/* translate a userspace supplied od into the associated object + * returns NULL if od is invalid + */ +static inline void* od_lookup(int od, obj_type_t type) +{ + struct od_table_entry* e = __od_lookup(od); + return e && e->obj->type == type ? e->obj->obj : NULL; +} + +static inline void* od_lookup2(int od, obj_type_t type, obj_type_t type2) +{ + struct od_table_entry* e = __od_lookup(od); + return e && (e->obj->type == type || e->obj->type == type2) ? 
+ e->obj->obj : NULL; +} + +#define lookup_pi_sem(od) ((struct pi_semaphore*) od_lookup(od, PI_SEM)) +#define lookup_srp_sem(od) ((struct srp_semaphore*) od_lookup(od, SRP_SEM)) +#define lookup_pcp_sem(od) ((struct pcp_semaphore*) \ + od_lookup2(od, PCP_SEM, MPCP_SEM)) + +#endif diff --git a/include/litmus/feather_buffer.h b/include/litmus/feather_buffer.h new file mode 100644 index 0000000..c788227 --- /dev/null +++ b/include/litmus/feather_buffer.h @@ -0,0 +1,108 @@ +#ifndef _FEATHER_BUFFER_H_ +#define _FEATHER_BUFFER_H_ + +/* requires UINT_MAX and memcpy */ + +static inline int fetch_and_inc(int *val) +{ + int ret = 1; + __asm__ __volatile__("lock; xaddl %0, %1" : "+r" (ret), "+m" (*val) : : "memory" ); + return ret; +} + +static inline int fetch_and_dec(int *val) +{ + int ret = -1; + __asm__ __volatile__("lock; xaddl %0, %1" : "+r" (ret), "+m" (*val) : : "memory" ); + return ret; +} + +#define SLOT_FREE 0 +#define SLOT_BUSY 1 +#define SLOT_READY 2 + +struct ft_buffer { + unsigned int slot_count; + unsigned int slot_size; + + int free_count; + unsigned int write_idx; + unsigned int read_idx; + + char* slots; + void* buffer_mem; + unsigned int failed_writes; +}; + +static inline int init_ft_buffer(struct ft_buffer* buf, + unsigned int slot_count, + unsigned int slot_size, + char* slots, + void* buffer_mem) +{ + int i = 0; + if (!slot_count || UINT_MAX % slot_count != slot_count - 1) { + /* The slot count must divide UNIT_MAX + 1 so that when it + * wraps around the index correctly points to 0. + */ + return 0; + } else { + buf->slot_count = slot_count; + buf->slot_size = slot_size; + buf->slots = slots; + buf->buffer_mem = buffer_mem; + buf->free_count = slot_count; + buf->write_idx = 0; + buf->read_idx = 0; + buf->failed_writes = 0; + for (i = 0; i < slot_count; i++) + buf->slots[i] = SLOT_FREE; + return 1; + } +} + +static inline int ft_buffer_start_write(struct ft_buffer* buf, void **ptr) +{ + int free = fetch_and_dec(&buf->free_count); + unsigned int idx; + if (free <= 0) { + fetch_and_inc(&buf->free_count); + *ptr = 0; + fetch_and_inc(&buf->failed_writes); + return 0; + } else { + idx = fetch_and_inc((int*) &buf->write_idx) % buf->slot_count; + buf->slots[idx] = SLOT_BUSY; + *ptr = ((char*) buf->buffer_mem) + idx * buf->slot_size; + return 1; + } +} + +static inline void ft_buffer_finish_write(struct ft_buffer* buf, void *ptr) +{ + unsigned int idx = ((char*) ptr - (char*) buf->buffer_mem) / buf->slot_size; + buf->slots[idx] = SLOT_READY; +} + + +/* exclusive reader access is assumed */ +static inline int ft_buffer_read(struct ft_buffer* buf, void* dest) +{ + unsigned int idx; + if (buf->free_count == buf->slot_count) + /* nothing available */ + return 0; + idx = buf->read_idx % buf->slot_count; + if (buf->slots[idx] == SLOT_READY) { + memcpy(dest, ((char*) buf->buffer_mem) + idx * buf->slot_size, + buf->slot_size); + buf->slots[idx] = SLOT_FREE; + buf->read_idx++; + fetch_and_inc(&buf->free_count); + return 1; + } else + return 0; +} + + +#endif diff --git a/include/litmus/feather_trace.h b/include/litmus/feather_trace.h new file mode 100644 index 0000000..5c37ea7 --- /dev/null +++ b/include/litmus/feather_trace.h @@ -0,0 +1,93 @@ +#ifndef _FEATHER_TRACE_H_ +#define _FEATHER_TRACE_H_ + +#define feather_callback __attribute__((regparm(0))) + +/* make the compiler reload any register that is not saved in + * a cdecl function call + */ +#define CLOBBER_LIST "memory", "cc", "eax", "ecx", "edx" + +#define ft_event(id, callback) \ + __asm__ __volatile__( \ + "1: jmp 2f \n\t" \ + " call " 
#callback " \n\t" \ + ".section __event_table, \"aw\" \n\t" \ + ".long " #id ", 0, 1b, 2f \n\t" \ + ".previous \n\t" \ + "2: \n\t" \ + : : : CLOBBER_LIST) + +#define ft_event0(id, callback) \ + __asm__ __volatile__( \ + "1: jmp 2f \n\t" \ + " subl $4, %%esp \n\t" \ + " movl $" #id ", (%%esp) \n\t" \ + " call " #callback " \n\t" \ + " addl $4, %%esp \n\t" \ + ".section __event_table, \"aw\" \n\t" \ + ".long " #id ", 0, 1b, 2f \n\t" \ + ".previous \n\t" \ + "2: \n\t" \ + : : : CLOBBER_LIST) + +#define ft_event1(id, callback, param) \ + __asm__ __volatile__( \ + "1: jmp 2f \n\t" \ + " subl $8, %%esp \n\t" \ + " movl %0, 4(%%esp) \n\t" \ + " movl $" #id ", (%%esp) \n\t" \ + " call " #callback " \n\t" \ + " addl $8, %%esp \n\t" \ + ".section __event_table, \"aw\" \n\t" \ + ".long " #id ", 0, 1b, 2f \n\t" \ + ".previous \n\t" \ + "2: \n\t" \ + : : "r" (param) : CLOBBER_LIST) + +#define ft_event2(id, callback, param, param2) \ + __asm__ __volatile__( \ + "1: jmp 2f \n\t" \ + " subl $12, %%esp \n\t" \ + " movl %1, 8(%%esp) \n\t" \ + " movl %0, 4(%%esp) \n\t" \ + " movl $" #id ", (%%esp) \n\t" \ + " call " #callback " \n\t" \ + " addl $12, %%esp \n\t" \ + ".section __event_table, \"aw\" \n\t" \ + ".long " #id ", 0, 1b, 2f \n\t" \ + ".previous \n\t" \ + "2: \n\t" \ + : : "r" (param), "r" (param2) : CLOBBER_LIST) + + +#define ft_event3(id, callback, p, p2, p3) \ + __asm__ __volatile__( \ + "1: jmp 2f \n\t" \ + " subl $16, %%esp \n\t" \ + " movl %1, 12(%%esp) \n\t" \ + " movl %1, 8(%%esp) \n\t" \ + " movl %0, 4(%%esp) \n\t" \ + " movl $" #id ", (%%esp) \n\t" \ + " call " #callback " \n\t" \ + " addl $16, %%esp \n\t" \ + ".section __event_table, \"aw\" \n\t" \ + ".long " #id ", 0, 1b, 2f \n\t" \ + ".previous \n\t" \ + "2: \n\t" \ + : : "r" (p), "r" (p2), "r" (p3) : CLOBBER_LIST) + + +static inline unsigned long long ft_read_tsc(void) +{ + unsigned long long ret; + __asm__ __volatile__("rdtsc" : "=A" (ret)); + return ret; +} + +int ft_enable_event(unsigned long id); +int ft_disable_event(unsigned long id); +int ft_is_event_enabled(unsigned long id); +int ft_disable_all_events(void); + +#endif diff --git a/include/litmus/jobs.h b/include/litmus/jobs.h new file mode 100644 index 0000000..9bd361e --- /dev/null +++ b/include/litmus/jobs.h @@ -0,0 +1,9 @@ +#ifndef __LITMUS_JOBS_H__ +#define __LITMUS_JOBS_H__ + +void prepare_for_next_period(struct task_struct *t); +void release_at(struct task_struct *t, lt_t start); +long complete_job(void); + +#endif + diff --git a/include/litmus/litmus.h b/include/litmus/litmus.h new file mode 100644 index 0000000..5853ed5 --- /dev/null +++ b/include/litmus/litmus.h @@ -0,0 +1,200 @@ +/* + * Constant definitions related to + * scheduling policy. + */ + +#ifndef _LINUX_LITMUS_H_ +#define _LINUX_LITMUS_H_ + +#include +#include + +typedef enum { + SCHED_LINUX = 0, + SCHED_GSN_EDF = 10, + SCHED_PSN_EDF = 11, + /* Add your scheduling policy here */ + + SCHED_DEFAULT = 0, + SCHED_INVALID = -1, +} spolicy; + + +typedef enum { + LITMUS_RESERVED_RANGE = 1024, + +} sched_setup_cmd_t; + +/* per-task modes */ +enum rt_task_mode_t { + BACKGROUND_TASK = 0, + LITMUS_RT_TASK = 1 +}; + +/* Plugin boot options, for convenience */ +#define PLUGIN_LINUX "linux" +#define PLUGIN_GSN_EDF "gsn_edf" +#define PLUGIN_PSN_EDF "psn_edf" + +extern spolicy sched_policy; + +/* RT mode start time */ +extern volatile unsigned long rt_start_time; + +#define TRACE(fmt, args...) \ + sched_trace_log_message("%d: " fmt, raw_smp_processor_id(), ## args) + +#define TRACE_TASK(t, fmt, args...) 
\ + TRACE("(%s/%d) " fmt, (t)->comm, (t)->pid, ##args) + +#define TRACE_CUR(fmt, args...) \ + TRACE_TASK(current, fmt, ## args) + +#define TRACE_BUG_ON(cond) \ + do { if (cond) TRACE("BUG_ON(%s) at %s:%d " \ + "called from %p current=%s/%d state=%d " \ + "flags=%x partition=%d cpu=%d rtflags=%d"\ + " job=%u knp=%d timeslice=%u\n", \ + #cond, __FILE__, __LINE__, __builtin_return_address(0), current->comm, \ + current->pid, current->state, current->flags, \ + get_partition(current), smp_processor_id(), get_rt_flags(current), \ + current->rt_param.job_params.job_no, current->rt_param.kernel_np, \ + current->time_slice\ + ); } while(0); + + +/* in_list - is a given list_head queued on some list? + */ +static inline int in_list(struct list_head* list) +{ + return !( /* case 1: deleted */ + (list->next == LIST_POISON1 && + list->prev == LIST_POISON2) + || + /* case 2: initialized */ + (list->next == list && + list->prev == list) + ); +} + +typedef int (*prio_cmp_t)(struct task_struct* first, + struct task_struct* second); + +typedef int (*list_cmp_t)(struct list_head*, struct list_head*); + +static inline unsigned int list_insert(struct list_head* new, + struct list_head* head, + list_cmp_t order_before) +{ + struct list_head *pos; + unsigned int passed = 0; + + BUG_ON(!new); + + /* find a spot where the new entry is less than the next */ + list_for_each(pos, head) { + if (unlikely(order_before(new, pos))) { + /* pos is not less than new, thus insert here */ + __list_add(new, pos->prev, pos); + goto out; + } + passed++; + } + /* if we get to this point either the list is empty or every entry + * queued element is less than new. + * Let's add new to the end. */ + list_add_tail(new, head); + out: + return passed; +} + +void list_qsort(struct list_head* list, list_cmp_t less_than); + + +#define RT_PREEMPTIVE 0x2050 /* = NP */ +#define RT_NON_PREEMPTIVE 0x4e50 /* = P */ +#define RT_EXIT_NP_REQUESTED 0x5251 /* = RQ */ + +/* returns 1 if task t has registered np flag and set it to RT_NON_PREEMPTIVE + */ +int is_np(struct task_struct *t); + +/* request that the task should call sys_exit_np() + */ +void request_exit_np(struct task_struct *t); + +/* kill naughty tasks + */ +void scheduler_signal(struct task_struct *t, unsigned int signal); +void send_scheduler_signals(void); +void np_mem_kill(struct task_struct *t); + +void litmus_fork(struct task_struct *tsk); +void litmus_exec(void); +/* clean up real-time state of a task */ +void exit_litmus(struct task_struct *dead_tsk); + +long transition_to_rt(struct task_struct* tsk); +long transition_to_be(struct task_struct* tsk); + +#define is_realtime(t) ((t)->rt_param.is_realtime) +#define rt_transition_pending(t) \ + ((t)->rt_param.transition_pending) + +/* Realtime utility macros */ +#define get_rt_flags(t) ((t)->rt_param.flags) +#define set_rt_flags(t,f) (t)->rt_param.flags=(f) +#define get_exec_cost(t) ((t)->rt_param.task_params.exec_cost) +#define get_exec_time(t) ((t)->rt_param.job_params.exec_time) +#define get_rt_period(t) ((t)->rt_param.task_params.period) +#define get_partition(t) (t)->rt_param.task_params.cpu +#define get_deadline(t) ((t)->rt_param.job_params.deadline) +#define get_class(t) ((t)->rt_param.task_params.cls) + +inline static int budget_exhausted(struct task_struct* t) +{ + return get_exec_time(t) >= get_exec_cost(t); +} + + +#define is_hrt(t) \ + ((t)->rt_param.task_params.class == RT_CLASS_HARD) +#define is_srt(t) \ + ((t)->rt_param.task_params.class == RT_CLASS_SOFT) +#define is_be(t) \ + ((t)->rt_param.task_params.class == 
RT_CLASS_BEST_EFFORT) + +#define get_release(t) ((t)->rt_param.job_params.release) + +/* Honor the flag in the preempt_count variable that is set + * when scheduling is in progress. + */ +#define is_running(t) \ + ((t)->state == TASK_RUNNING || \ + (t)->thread_info->preempt_count & PREEMPT_ACTIVE) + +#define is_blocked(t) \ + (!is_running(t)) +#define is_released(t, now) \ + (lt_before_eq(get_release(t), now)) +#define is_tardy(t, now) \ + (lt_before_eq((t)->rt_param.job_params.deadline, now)) + +/* real-time comparison macros */ +#define earlier_deadline(a, b) (lt_before(\ + (a)->rt_param.job_params.deadline,\ + (b)->rt_param.job_params.deadline)) +#define earlier_release(a, b) (lt_before(\ + (a)->rt_param.job_params.release,\ + (b)->rt_param.job_params.release)) + +#define shorter_period(a, b) (lt_before(\ + (a)->rt_param.task_params.period, \ + (b)->rt_param.task_params.period)) + +#define make_np(t) do {t->rt_param.kernel_np++;} while(0); +#define take_np(t) do {t->rt_param.kernel_np--;} while(0); + +void srp_ceiling_block(void); + +#endif diff --git a/include/litmus/rm_common.h b/include/litmus/rm_common.h new file mode 100644 index 0000000..11e8365 --- /dev/null +++ b/include/litmus/rm_common.h @@ -0,0 +1,44 @@ +/* rate monotonic helper functions. + */ + + +#ifndef __UNC_RM_COMMON_H__ +#define __UNC_RM_COMMON_H__ + +#include + +static inline int _rm_higher_prio(struct pcp_priority *p1, + struct pcp_priority *p2) +{ + /* does the second task exist and is it a real-time task? If + * not, the first task (which is a RT task) has higher + * priority. + */ + + if (unlikely(!p2)) + return 1; + + if (p1->in_global_cs == p2->in_global_cs) { + /* tie break by RM priority */ + if (p1->prio == p2->prio) + /* tie break equal periods by PID */ + return p1->pid < p2->pid; + else + /* shorter period or lower index has higher priority */ + return p1->prio < p2->prio; + } else + /* gcs always have higher priority */ + return p1->in_global_cs > p2->in_global_cs; +} + + +void rm_domain_init(rt_domain_t* rt, check_resched_needed_t resched); + +int rm_higher_prio(struct task_struct* first, + struct task_struct* second); + +int rm_ready_order(struct list_head* a, struct list_head* b); + +int rm_preemption_needed(rt_domain_t* rt, struct task_struct *t); + +#endif diff --git a/include/litmus/rt_domain.h b/include/litmus/rt_domain.h new file mode 100644 index 0000000..79b6034 --- /dev/null +++ b/include/litmus/rt_domain.h @@ -0,0 +1,94 @@ +/* CLEANUP: Add comments and make it less messy. + * + */ + +#ifndef __UNC_RT_DOMAIN_H__ +#define __UNC_RT_DOMAIN_H__ + +struct _rt_domain; + +typedef int (*check_resched_needed_t)(struct _rt_domain *rt); +typedef void (*release_at_t)(struct task_struct *t, lt_t start); + +typedef struct _rt_domain { + /* runnable rt tasks are in here */ + rwlock_t ready_lock; + struct list_head ready_queue; + + /* real-time tasks waiting for release are in here */ + spinlock_t release_lock; + struct list_head release_queue; + + /* how do we check if we need to kick another CPU? */ + check_resched_needed_t check_resched; + + /* how are tasks ordered in the ready queue? 
*/ + list_cmp_t order; +} rt_domain_t; + +#define next_ready(rt) \ + (list_entry((rt)->ready_queue.next, struct task_struct, rt_list)) + +#define ready_jobs_pending(rt) \ + (!list_empty(&(rt)->ready_queue)) + +void rt_domain_init(rt_domain_t *rt, check_resched_needed_t f, + list_cmp_t order); + +void __add_ready(rt_domain_t* rt, struct task_struct *new); +void __add_release(rt_domain_t* rt, struct task_struct *task); + +struct task_struct* __take_ready(rt_domain_t* rt); +struct task_struct* __peek_ready(rt_domain_t* rt); + +void try_release_pending(rt_domain_t* rt); +void __release_pending(rt_domain_t* rt); + +static inline void add_ready(rt_domain_t* rt, struct task_struct *new) +{ + unsigned long flags; + /* first we need the write lock for rt_ready_queue */ + write_lock_irqsave(&rt->ready_lock, flags); + __add_ready(rt, new); + write_unlock_irqrestore(&rt->ready_lock, flags); +} + +static inline struct task_struct* take_ready(rt_domain_t* rt) +{ + unsigned long flags; + struct task_struct* ret; + /* first we need the write lock for rt_ready_queue */ + write_lock_irqsave(&rt->ready_lock, flags); + ret = __take_ready(rt); + write_unlock_irqrestore(&rt->ready_lock, flags); + return ret; +} + + +static inline void add_release(rt_domain_t* rt, struct task_struct *task) +{ + unsigned long flags; + /* first we need the write lock for rt_ready_queue */ + spin_lock_irqsave(&rt->release_lock, flags); + __add_release(rt, task); + spin_unlock_irqrestore(&rt->release_lock, flags); +} + +static inline int __jobs_pending(rt_domain_t* rt) +{ + return !list_empty(&rt->ready_queue); +} + +static inline int jobs_pending(rt_domain_t* rt) +{ + unsigned long flags; + int ret; + /* first we need the write lock for rt_ready_queue */ + read_lock_irqsave(&rt->ready_lock, flags); + ret = __jobs_pending(rt); + read_unlock_irqrestore(&rt->ready_lock, flags); + return ret; +} + + +#endif diff --git a/include/litmus/rt_param.h b/include/litmus/rt_param.h new file mode 100644 index 0000000..37a4495 --- /dev/null +++ b/include/litmus/rt_param.h @@ -0,0 +1,177 @@ +/* + * Definition of the scheduler plugin interface. + * + */ +#ifndef _LINUX_RT_PARAM_H_ +#define _LINUX_RT_PARAM_H_ + +/* Litmus time type. */ +typedef unsigned long long lt_t; + +static inline int lt_after(lt_t a, lt_t b) +{ + return ((long long) b) - ((long long) a) < 0; +} +#define lt_before(a, b) lt_after(b, a) + +static inline int lt_after_eq(lt_t a, lt_t b) +{ + return ((long long) a) - ((long long) b) >= 0; +} +#define lt_before_eq(a, b) lt_after_eq(b, a) + +/* different types of clients */ +typedef enum { + RT_CLASS_HARD, + RT_CLASS_SOFT, + RT_CLASS_BEST_EFFORT +} task_class_t; + +struct rt_task { + lt_t exec_cost; + lt_t period; + lt_t phase; + lt_t prio; + unsigned int cpu; + task_class_t cls; +}; + +#define DPCP_WAIT 0x1 +#define DPCP_COMPLETE 0x2 + +/* don't export internal data structures to user space (liblitmus) */ +#ifdef __KERNEL__ + +#include + +struct rt_job { + /* Time instant the the job was or will be released. */ + lt_t release; + /* What is the current deadline? */ + lt_t deadline; + /* How much service has this job received so far? + */ + lt_t exec_time; + + /* Which job is this. This is used to let user space + * specify which job to wait for, which is important if jobs + * overrun. If we just call sys_sleep_next_period() then we + * will unintentionally miss jobs after an overrun. + * + * Increase this sequence number when a job is released. + */ + unsigned int job_no; + + /* when did this job start executing? 
*/ + lt_t exec_start; +}; + + +/* make priority inheritance cleaner for PCP */ +struct pcp_priority { + lt_t prio; + int in_global_cs; + int pid; +}; + +struct pcp_semaphore; + +/* RT task parameters for scheduling extensions + * These parameters are inherited during clone and therefore must + * be explicitly set up before the task set is launched. + */ +struct rt_param { + /* is the task sleeping? */ + unsigned int flags:8; + + /* Real-time marker: 1 iff it is a LITMUS real-time task. + */ + unsigned int is_realtime:1; + + /* is a BE->RT or RT->BE transition pending? */ + unsigned int transition_pending:1; + + /* is this task under control of litmus? + * + * this is necessary because otherwise signal delivery code + * may try to wake up a task that is already queued in plugin + * data structures. + * + * bbb: I believe this flag is fundamentally flawed and should be + * taken out in the redesign. + */ + unsigned int litmus_controlled:1; + + /* do we need to check for srp blocking? */ + unsigned int srp_non_recurse:1; + + /* if a BE->RT transition failed, then this field contains the error */ + unsigned long transition_error; + + /* user controlled parameters */ + struct rt_task task_params; + + /* timing parameters */ + struct rt_job job_params; + + + /* task representing the current "inherited" task + * priority, assigned by inherit_priority and + * return priority in the scheduler plugins. + * could point to self if PI does not result in + * an increased task priority. + */ + struct task_struct* inh_task; + + /* Don't just dereference this pointer in kernel space! + * It might very well point to junk or nothing at all. + * NULL indicates that the task has not requested any non-preemptable + * section support. + * Not inherited upon fork. + */ + short* np_flag; + + /* For the FMLP under PSN-EDF, it is required to make the task + * non-preemptive from kernel space. In order not to interfere with + * user space, this counter indicates the kernel space np setting. + * kernel_np > 0 => task is non-preemptive + */ + unsigned int kernel_np; + + /* This field can be used by plugins to store where the task + * is currently scheduled. It is the responsibility of the + * plugin to avoid race conditions. + * + * Used by GSN-EDF. + */ + int scheduled_on; + + /* This field can be used by plugins to store where the task + * is currently linked. It is the responsibility of the plugin + * to avoid race conditions. + * + * Used by GSN-EDF. + */ + int linked_on; + + /* Used by RM + */ + struct pcp_priority pcp_prio; + struct pcp_priority* cur_prio; + struct list_head owned_semaphores; + struct pcp_semaphore* blocked_on; + + /* Fields saved before BE->RT transition. + */ + int old_policy; + int old_prio; +}; + +/* Possible RT flags */ +#define RT_F_RUNNING 0x00000000 +#define RT_F_SLEEP 0x00000001 +#define RT_F_EXIT_SEM 0x00000008 + +#endif + +#endif diff --git a/include/litmus/sched_plugin.h b/include/litmus/sched_plugin.h new file mode 100644 index 0000000..337668f --- /dev/null +++ b/include/litmus/sched_plugin.h @@ -0,0 +1,120 @@ +/* + * Definition of the scheduler plugin interface. 
+ * + */ +#ifndef _LINUX_SCHED_PLUGIN_H_ +#define _LINUX_SCHED_PLUGIN_H_ + +#include +#include + +/* struct for semaphore with priority inheritance */ +struct pi_semaphore { + atomic_t count; + int sleepers; + wait_queue_head_t wait; + union { + /* highest-prio holder/waiter */ + struct task_struct *task; + struct task_struct* cpu_task[NR_CPUS]; + } hp; + /* current lock holder */ + struct task_struct *holder; +}; + +int set_hp_task(struct pi_semaphore *sem, prio_cmp_t cmp); +int set_hp_cpu_task(struct pi_semaphore *sem, int cpu, prio_cmp_t cmp); + +/********************* scheduler invocation ******************/ + +/* Plugin-specific realtime tick handler */ +typedef void (*scheduler_tick_t) (void); +/* Plugin-specific scheduling decision function */ +typedef int (*schedule_t) (struct task_struct * prev, + struct task_struct ** next); +/* Clean up after the task switch has occurred. + * This function is called after every (even non-rt) task switch. + */ +typedef void (*finish_switch_t)(struct task_struct *prev); + + +/********************* task state changes ********************/ + +/* called to set up a new real-time task */ +typedef long (*prepare_task_t) (struct task_struct *task); +/* called to re-introduce a task after blocking */ +typedef void (*wake_up_task_t) (struct task_struct *task); +/* called to notify the plugin of a blocking real-time task + * it will only be called for real-time tasks and before schedule is called */ +typedef void (*task_blocks_t) (struct task_struct *task); +/* called when a real-time task exits. Free any allocated resources */ +typedef long (*tear_down_t) (struct task_struct *); + +/* Called when the new_owner is released from the wait queue + * it should now inherit the priority from sem, _before_ it gets re-added + * to any queue + */ +typedef long (*inherit_priority_t) (struct pi_semaphore *sem, + struct task_struct *new_owner); + +/* Called when the current task releases a semaphore from which it might + * have inherited a priority + */ +typedef long (*return_priority_t) (struct pi_semaphore *sem); + +/* Called when a task tries to acquire a semaphore and fails. Check if its + * priority is higher than that of the current holder.
+ */ +typedef long (*pi_block_t) (struct pi_semaphore *sem, struct task_struct *t); + + +/********************* sys call backends ********************/ +/* This function causes the caller to sleep until the next release */ +typedef long (*sleep_next_period_t) (void); + +struct sched_plugin { + struct list_head list; + /* basic info */ + char *plugin_name; + unsigned int srp_active:1; + unsigned int pcp_active:1; + + /* scheduler invocation */ + scheduler_tick_t scheduler_tick; + schedule_t schedule; + finish_switch_t finish_switch; + + /* syscall backend */ + sleep_next_period_t sleep_next_period; + + /* task state changes */ + prepare_task_t prepare_task; + wake_up_task_t wake_up_task; + task_blocks_t task_blocks; + tear_down_t tear_down; + + /* priority inheritance */ + inherit_priority_t inherit_priority; + return_priority_t return_priority; + pi_block_t pi_block; +} __attribute__ ((__aligned__(SMP_CACHE_BYTES))); + + +extern struct sched_plugin *curr_sched_plugin; + +int register_sched_plugin(struct sched_plugin* plugin); +struct sched_plugin* find_sched_plugin(const char* name); +int print_sched_plugins(char* buf, int max); + +static inline int pcp_active(void) +{ + return curr_sched_plugin->pcp_active; +} + +static inline int srp_active(void) +{ + return curr_sched_plugin->srp_active; +} + + +#endif diff --git a/include/litmus/sched_trace.h b/include/litmus/sched_trace.h new file mode 100644 index 0000000..f9938c2 --- /dev/null +++ b/include/litmus/sched_trace.h @@ -0,0 +1,31 @@ +/* sched_trace.h -- record scheduler events to a byte stream for offline analysis. + */ +#ifndef _LINUX_SCHED_TRACE_H_ +#define _LINUX_SCHED_TRACE_H_ + +#include + +/* dummies, need to be re-implemented */ + +/* used in sched.c */ +#define sched_trace_task_arrival(t) +#define sched_trace_task_departure(t) +#define sched_trace_task_preemption(t, by) +#define sched_trace_task_scheduled(t) + +/* used in scheduler plugins */ +#define sched_trace_job_release(t) +#define sched_trace_job_completion(t) + + +#ifdef CONFIG_SCHED_DEBUG_TRACE +void sched_trace_log_message(const char* fmt, ...); + +#else + +#define sched_trace_log_message(fmt, ...) + +#endif + + +#endif diff --git a/include/litmus/trace.h b/include/litmus/trace.h new file mode 100644 index 0000000..5c2c2c0 --- /dev/null +++ b/include/litmus/trace.h @@ -0,0 +1,106 @@ + +#ifndef _SYS_TRACE_H_ +#define _SYS_TRACE_H_ + +#include +#include + + +/*********************** TIMESTAMPS ************************/ + +struct timestamp { + unsigned long event; + unsigned long long timestamp; + unsigned int seq_no; + int cpu; +}; + + +/* buffer holding time stamps - will be provided by driver */ +extern struct ft_buffer* trace_ts_buf; + +/* save_timestamp: stores current time as struct timestamp + * in trace_ts_buf + */ +asmlinkage void save_timestamp(unsigned long event); + +#define TIMESTAMP(id) ft_event0(id, save_timestamp) + +/* Convention for timestamps + * ========================= + * + * In order to process the trace files with a common tool, we use the following + * convention to measure execution times: The end time id of a code segment is + * always the next number after the start time event id. 
+ */ + +#define TS_SCHED_START TIMESTAMP(100) +#define TS_SCHED_END TIMESTAMP(101) +#define TS_CXS_START TIMESTAMP(102) +#define TS_CXS_END TIMESTAMP(103) + +#define TS_TICK_START TIMESTAMP(110) +#define TS_TICK_END TIMESTAMP(111) + +#define TS_PLUGIN_SCHED_START TIMESTAMP(120) +#define TS_PLUGIN_SCHED_END TIMESTAMP(121) + +#define TS_PLUGIN_TICK_START TIMESTAMP(130) +#define TS_PLUGIN_TICK_END TIMESTAMP(131) + +#define TS_ENTER_NP_START TIMESTAMP(140) +#define TS_ENTER_NP_END TIMESTAMP(141) + +#define TS_EXIT_NP_START TIMESTAMP(150) +#define TS_EXIT_NP_END TIMESTAMP(151) + +#define TS_SRP_UP_START TIMESTAMP(160) +#define TS_SRP_UP_END TIMESTAMP(161) +#define TS_SRP_DOWN_START TIMESTAMP(162) +#define TS_SRP_DOWN_END TIMESTAMP(163) + +#define TS_PI_UP_START TIMESTAMP(170) +#define TS_PI_UP_END TIMESTAMP(171) +#define TS_PI_DOWN_START TIMESTAMP(172) +#define TS_PI_DOWN_END TIMESTAMP(173) + +#define TS_FIFO_UP_START TIMESTAMP(180) +#define TS_FIFO_UP_END TIMESTAMP(181) +#define TS_FIFO_DOWN_START TIMESTAMP(182) +#define TS_FIFO_DOWN_END TIMESTAMP(183) + +#define PCP1 200 +#define PCP2 204 + +#define DPCP 210 +#define MPCP 220 +#define FMLP 230 +#define SRPT 240 + +#define TS_PCP_UP_START TIMESTAMP(PCP1) +#define TS_PCP_UP_END TIMESTAMP(PCP1 + 1) +#define TS_PCP1_DOWN_START TIMESTAMP(PCP1 + 2) +#define TS_PCP1_DOWN_END TIMESTAMP(PCP1 + 3) +#define TS_PCP2_DOWN_START TIMESTAMP(PCP2 + 2) +#define TS_PCP2_DOWN_END TIMESTAMP(PCP2 + 3) + + +#define TS_DPCP_INVOKE_START TIMESTAMP(DPCP) +#define TS_DPCP_INVOKE_END TIMESTAMP(DPCP + 1) +#define TS_DPCP_AGENT1_START TIMESTAMP(DPCP + 2) +#define TS_DPCP_AGENT1_END TIMESTAMP(DPCP + 3) +#define TS_DPCP_AGENT2_START TIMESTAMP(DPCP + 4) +#define TS_DPCP_AGENT2_END TIMESTAMP(DPCP + 5) + + +#define TS_MPCP_UP_START TIMESTAMP(MPCP) +#define TS_MPCP_UP_END TIMESTAMP(MPCP + 1) +#define TS_MPCP_DOWN_START TIMESTAMP(MPCP + 2) +#define TS_MPCP_DOWN_END TIMESTAMP(MPCP + 3) + + +#define TS_SRPT_START TIMESTAMP(SRPT) +#define TS_SRPT_END TIMESTAMP(SRPT + 1) + + +#endif /* !_SYS_TRACE_H_ */ diff --git a/kernel/exit.c b/kernel/exit.c index fec12eb..8a0eb79 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -50,6 +50,8 @@ extern void sem_exit (void); +extern void exit_od_table(struct task_struct* t); + static void exit_mm(struct task_struct * tsk); static void __unhash_process(struct task_struct *p) @@ -916,6 +918,8 @@ fastcall NORET_TYPE void do_exit(long code) if (unlikely(tsk->audit_context)) audit_free(tsk); + exit_od_table(tsk); + taskstats_exit(tsk, group_dead); exit_mm(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index d57118d..6fa6e03 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -57,6 +57,9 @@ #include #include +#include +#include + /* * Protected counters by write_lock_irq(&tasklist_lock) */ @@ -118,6 +121,8 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); + exit_litmus(tsk); + security_task_free(tsk); free_uid(tsk->user); put_group_info(tsk->group_info); diff --git a/kernel/sched.c b/kernel/sched.c index cca93cc..fb35f31 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -56,6 +56,12 @@ #include +#include +#include +#include +#include +#include + /* * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], @@ -836,7 +842,7 @@ static int effective_prio(struct task_struct *p) * keep the priority unchanged. 
Otherwise, update priority * to the normal priority: */ - if (!rt_prio(p->prio)) + if (!rt_prio(p->prio) && !is_realtime(p)) return p->normal_prio; return p->prio; } @@ -844,7 +850,7 @@ static int effective_prio(struct task_struct *p) /* * __activate_task - move a task to the runqueue. */ -static void __activate_task(struct task_struct *p, struct rq *rq) +void __activate_task(struct task_struct *p, struct rq *rq) { struct prio_array *target = rq->active; @@ -999,7 +1005,7 @@ out: /* * deactivate_task - remove a task from the runqueue. */ -static void deactivate_task(struct task_struct *p, struct rq *rq) +void deactivate_task(struct task_struct *p, struct rq *rq) { dec_nr_running(p, rq); dequeue_task(p, p->array); @@ -1408,6 +1414,10 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) #endif rq = task_rq_lock(p, &flags); + + if (is_realtime(p)) + TRACE("try_to_wake_up(%s/%d)\n", p->comm, p->pid); + old_state = p->state; if (!(old_state & state)) goto out; @@ -1415,6 +1425,12 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) if (p->array) goto out_running; + sched_trace_task_arrival(p); + if (is_realtime(p)) { + curr_sched_plugin->wake_up_task(p); + goto out_running; + } + cpu = task_cpu(p); this_cpu = smp_processor_id(); @@ -1576,6 +1592,8 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags) { int cpu = get_cpu(); + litmus_fork(p); + #ifdef CONFIG_SMP cpu = sched_balance_self(cpu, SD_BALANCE_FORK); #endif @@ -1730,6 +1748,9 @@ void fastcall sched_exit(struct task_struct *p) unsigned long flags; struct rq *rq; + if (is_realtime(p)) + return; + /* * If the child was a (relative-) CPU hog then decrease * the sleep_avg of the parent as well. @@ -1765,6 +1786,31 @@ static inline void prepare_task_switch(struct rq *rq, struct task_struct *next) prepare_arch_switch(next); } +static void litmus_transition(struct task_struct *tsk, struct rq *rq) +{ + int wakeup = 0; + WARN_ON(tsk->state != TASK_STOPPED); + + tsk->rt_param.transition_pending = 0; + if (is_realtime(tsk)) { + /* RT -> BE transition */ + tsk->rt_param.transition_error = transition_to_be(tsk); + wakeup = tsk->rt_param.transition_error == 0; + } else { + /* BE -> RT transition */ + tsk->rt_param.transition_error = transition_to_rt(tsk); + /* If it was rejected as a real-time task, then + * keep it running as a best-effort task. + */ + wakeup = tsk->rt_param.transition_error != 0; + } + if (wakeup) { + /* we still hold the runqueue lock */ + tsk->state = TASK_RUNNING; + __activate_task(tsk, rq); + } +} + /** * finish_task_switch - clean up after a task-switch * @rq: runqueue associated with task-switch @@ -1801,6 +1847,15 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) */ prev_state = prev->state; finish_arch_switch(prev); + /* Requeue previous real-time task before we drop the rq lock, cause + * that may lead to a preemption. + */ + curr_sched_plugin->finish_switch(prev); + sched_trace_task_scheduled(current); + if (rt_transition_pending(prev)) + litmus_transition(prev, rq); + /* trace before IRQs are enabled */ + TS_CXS_END; finish_lock_switch(rq, prev); if (mm) mmdrop(mm); @@ -2095,6 +2150,10 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, struct sched_domain *sd, enum idle_type idle, int *all_pinned) { + /* Don't migrate LITMUS^RT tasks. 
*/ + if (is_realtime(p)) + return 0; + /* * We do not migrate tasks that are: * 1) running (obviously), or @@ -3220,11 +3279,30 @@ void scheduler_tick(void) update_cpu_clock(p, rq, now); + /* real-time accounting is done by the plugin + * call linux functions only for background tasks + */ if (p == rq->idle) - /* Task on the idle queue */ - wake_priority_sleeper(rq); - else + /* Task on the idle queue */ + wake_priority_sleeper(rq); + else if (is_realtime(p)) { + /* time accounting for LITMUS^RT tasks */ + p->rt_param.job_params.exec_time += + now - p->rt_param.job_params.exec_start; + p->rt_param.job_params.exec_start = now; + } else + /* normal Linux tasks */ task_running_tick(rq, p); + + /* check whether the RT scheduler plugin requires a call to + * schedule + */ + TS_PLUGIN_TICK_START; + curr_sched_plugin->scheduler_tick(); + TS_PLUGIN_TICK_END; + + send_scheduler_signals(); + #ifdef CONFIG_SMP update_load(rq); if (time_after_eq(jiffies, rq->next_balance)) @@ -3406,6 +3484,7 @@ static inline int interactive_sleep(enum sleep_type sleep_type) sleep_type == SLEEP_INTERRUPTED); } + /* * schedule() is the main scheduler function. */ @@ -3420,6 +3499,7 @@ asmlinkage void __sched schedule(void) long *switch_count; struct rq *rq; + /* * Test if we are atomic. Since do_exit() needs to call into * schedule() atomically, we ignore that path for now. @@ -3427,8 +3507,9 @@ asmlinkage void __sched schedule(void) */ if (unlikely(in_atomic() && !current->exit_state)) { printk(KERN_ERR "BUG: scheduling while atomic: " - "%s/0x%08x/%d\n", - current->comm, preempt_count(), current->pid); + "%s/0x%08x/%d %s\n", + current->comm, preempt_count(), current->pid, + is_realtime(current) ? "rt" : "non-rt"); debug_show_held_locks(current); if (irqs_disabled()) print_irqtrace_events(current); @@ -3438,6 +3519,7 @@ asmlinkage void __sched schedule(void) need_resched: preempt_disable(); + TS_SCHED_START; prev = current; release_kernel_lock(prev); need_resched_nonpreemptible: @@ -3470,6 +3552,7 @@ need_resched_nonpreemptible: spin_lock_irq(&rq->lock); switch_count = &prev->nivcsw; + /* check for blocking tasks */ if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { switch_count = &prev->nvcsw; if (unlikely((prev->state & TASK_INTERRUPTIBLE) && @@ -3478,11 +3561,60 @@ need_resched_nonpreemptible: else { if (prev->state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible++; + + if (is_realtime(prev)) { + TRACE_TASK(prev, "blocks, state = %d\n", + prev->state); + curr_sched_plugin->task_blocks(prev); + /* Enable this for all tasks to get _a lot_ of + * data. Can be helpful for debugging. + */ + sched_trace_task_departure(prev); + } + /* only indirect switching is supported in the current + * version of LITMUS + */ deactivate_task(prev, rq); } } + next = NULL; + + if (is_realtime(prev)) { + /* If we are invoked after scheduler_tick(), then + * prev is charged a tiny amount of overhead time. + * Since analysis has (or should have) accounted for + * overheads, this is ok. 
+ */ + prev->rt_param.job_params.exec_time += + now - prev->rt_param.job_params.exec_start; + prev->rt_param.job_params.exec_start = now; + } + + /* consult the real-time plugin */ + TS_PLUGIN_SCHED_START; + curr_sched_plugin->schedule(prev, &next); + TS_PLUGIN_SCHED_END; + cpu = smp_processor_id(); + + if (prev != next && is_realtime(prev) && is_running(prev)) + deactivate_task(prev, rq); + if (next && prev != next) { + __activate_task(next, rq); + set_task_cpu(next, cpu); + } + + /* If the real-time plugin wants to switch to a specific task + * it'll be on the rq and have the highest priority. There will + * be exaclty one such task, thus the selection of the next task + * is unambiguous and the following code can only get + * triggered if there are no RT tasks pending (on this CPU). Thus, + * we may as well skip it. + */ + if (next) + goto switch_tasks; + if (unlikely(!rq->nr_running)) { idle_balance(cpu, rq); if (!rq->nr_running) { @@ -3546,12 +3678,17 @@ switch_tasks: prev->timestamp = prev->last_ran = now; sched_info_switch(prev, next); + TS_SCHED_END; if (likely(prev != next)) { + TS_CXS_START; + if (is_running(prev)) + sched_trace_task_preemption(prev, next); next->timestamp = now; rq->nr_switches++; rq->curr = next; ++*switch_count; + next->rt_param.job_params.exec_start = now; prepare_task_switch(rq, next); prev = context_switch(rq, prev, next); barrier(); @@ -3561,8 +3698,11 @@ switch_tasks: * frame will be invalid. */ finish_task_switch(this_rq(), prev); - } else + } else { spin_unlock_irq(&rq->lock); + } + + send_scheduler_signals(); prev = current; if (unlikely(reacquire_kernel_lock(prev) < 0)) @@ -3570,6 +3710,8 @@ switch_tasks: preempt_enable_no_resched(); if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) goto need_resched; + if (srp_active()) + srp_ceiling_block(); } EXPORT_SYMBOL(schedule); @@ -3691,6 +3833,7 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, } } + /** * __wake_up - wake up threads blocked on a waitqueue. * @q: the waitqueue @@ -3709,6 +3852,7 @@ void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, } EXPORT_SYMBOL(__wake_up); + /* * Same as __wake_up but called with the spinlock in wait_queue_head_t held. */ @@ -3717,6 +3861,7 @@ void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) __wake_up_common(q, mode, 1, 0, NULL); } + /** * __wake_up_sync - wake up threads blocked on a waitqueue. * @q: the waitqueue @@ -3772,6 +3917,18 @@ void fastcall complete_all(struct completion *x) } EXPORT_SYMBOL(complete_all); +void fastcall complete_n(struct completion *x, int n) +{ + unsigned long flags; + + spin_lock_irqsave(&x->wait.lock, flags); + x->done += n; + __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, + n, 0, NULL); + spin_unlock_irqrestore(&x->wait.lock, flags); +} +EXPORT_SYMBOL(complete_n); + void fastcall __sched wait_for_completion(struct completion *x) { might_sleep(); @@ -4175,7 +4332,7 @@ static inline struct task_struct *find_process_by_pid(pid_t pid) } /* Actually do priority change: must hold rq lock. 
*/ -static void __setscheduler(struct task_struct *p, int policy, int prio) +void __setscheduler(struct task_struct *p, int policy, int prio) { BUG_ON(p->array); diff --git a/lib/semaphore-sleepers.c b/lib/semaphore-sleepers.c index 1281805..3f4d543 100644 --- a/lib/semaphore-sleepers.c +++ b/lib/semaphore-sleepers.c @@ -108,7 +108,7 @@ fastcall int __sched __down_interruptible(struct semaphore * sem) /* * With signals pending, this turns into * the trylock failure case - we won't be - * sleeping, and we* can't get the lock as + * sleeping, and we can't get the lock as * it has contention. Just correct the count * and exit. */ diff --git a/litmus/Makefile b/litmus/Makefile new file mode 100644 index 0000000..db2518d --- /dev/null +++ b/litmus/Makefile @@ -0,0 +1,9 @@ +# +# Makefile for LITMUS^RT +# + +obj-y = sched_plugin.o litmus.o sched_trace.o \ + edf_common.o rm_common.o\ + sched_gsn_edf.o sched_psn_edf.o litmus_sem.o \ + trace.o ft_event.o rt_domain.o fdso.o \ + sched_rm.o sync.o jobs.o pcp.o diff --git a/litmus/edf_common.c b/litmus/edf_common.c new file mode 100644 index 0000000..2a52835 --- /dev/null +++ b/litmus/edf_common.c @@ -0,0 +1,95 @@ +/* + * kernel/edf_common.c + * + * Common functions for EDF based scheduler. + */ + +#include +#include +#include + +#include +#include +#include + + +#include + +/* edf_higher_prio - returns true if first has a higher EDF priority + * than second. Deadline ties are broken by PID. + * + * first first must not be NULL and a real-time task. + * second may be NULL or a non-rt task. + */ +int edf_higher_prio(struct task_struct* first, + struct task_struct* second) +{ + struct task_struct *first_task = first; + struct task_struct *second_task = second; + + /* Check for inherited priorities. Change task + * used for comparison in such a case. + */ + if (first && first->rt_param.inh_task) + first_task = first->rt_param.inh_task; + if (second && second->rt_param.inh_task) + second_task = second->rt_param.inh_task; + + return + /* does the second task exist and is it a real-time task? If + * not, the first task (which is a RT task) has higher + * priority. + */ + !second_task || !is_realtime(second_task) || + + /* is the deadline of the first task earlier? + * Then it has higher priority. + */ + earlier_deadline(first_task, second_task) || + + /* Do we have a deadline tie? + * Then break by PID. + */ + (get_deadline(first_task) == get_deadline(second_task) && + (first_task->pid < second_task->pid || + + /* If the PIDs are the same then the task with the inherited + * priority wins. + */ + (first_task->pid == second_task->pid && + !second->rt_param.inh_task))); +} + +int edf_ready_order(struct list_head* a, struct list_head* b) +{ + return edf_higher_prio( + list_entry(a, struct task_struct, rt_list), + list_entry(b, struct task_struct, rt_list)); +} + +void edf_domain_init(rt_domain_t* rt, check_resched_needed_t resched) +{ + rt_domain_init(rt, resched, edf_ready_order); +} + +/* need_to_preempt - check whether the task t needs to be preempted + * call only with irqs disabled and with ready_lock acquired + * THIS DOES NOT TAKE NON-PREEMPTIVE SECTIONS INTO ACCOUNT! 
+ */ +int edf_preemption_needed(rt_domain_t* rt, struct task_struct *t) +{ + /* we need the read lock for edf_ready_queue */ + /* no need to preempt if there is nothing pending */ + if (!ready_jobs_pending(rt)) + return 0; + /* we need to reschedule if t doesn't exist */ + if (!t) + return 1; + + /* NOTE: We cannot check for non-preemptibility since we + * don't know what address space we're currently in. + */ + + /* make sure to get non-rt stuff out of the way */ + return !is_realtime(t) || edf_higher_prio(next_ready(rt), t); +} diff --git a/litmus/fdso.c b/litmus/fdso.c new file mode 100644 index 0000000..ded9918 --- /dev/null +++ b/litmus/fdso.c @@ -0,0 +1,289 @@ +/* fdso.c - file descriptor attached shared objects + * + * (c) 2007 B. Brandenburg, LITMUS^RT project + * + * Notes: + * - objects descriptor (OD) tables are not cloned during a fork. + * - objects are created on-demand, and freed after the last reference + * is dropped. + * - for now, object types are hard coded. + * - As long as we have live objects, we keep a reference to the inode. + */ + +#include +#include +#include +#include +#include + +#include + +extern struct fdso_ops pi_sem_ops; +extern struct fdso_ops srp_sem_ops; +extern struct fdso_ops pcp_sem_ops; +extern struct fdso_ops mpcp_sem_ops; + +static const struct fdso_ops* fdso_ops[] = { + &pi_sem_ops, + &srp_sem_ops, + &pcp_sem_ops, + &mpcp_sem_ops, +}; + +static void* fdso_create(obj_type_t type) +{ + return fdso_ops[type]->create(); +} + +static void fdso_destroy(obj_type_t type, void* obj) +{ + fdso_ops[type]->destroy(obj); +} + +static int fdso_open(struct od_table_entry* entry, void* __user config) +{ + if (fdso_ops[entry->obj->type]->open) + return fdso_ops[entry->obj->type]->open(entry, config); + else + return 0; +} + +static int fdso_close(struct od_table_entry* entry) +{ + if (fdso_ops[entry->obj->type]->close) + return fdso_ops[entry->obj->type]->close(entry); + else + return 0; +} + +/* inode must be locked already */ +static struct inode_obj_id* alloc_inode_obj(struct inode* inode, + obj_type_t type, + unsigned int id) +{ + struct inode_obj_id* obj; + void* raw_obj; + + raw_obj = fdso_create(type); + if (!raw_obj) + return NULL; + + obj = kmalloc(sizeof(struct inode_obj_id), GFP_KERNEL); + if (!obj) + return NULL; + INIT_LIST_HEAD(&obj->list); + atomic_set(&obj->count, 1); + obj->type = type; + obj->id = id; + obj->obj = raw_obj; + obj->inode = inode; + + list_add(&obj->list, &inode->i_obj_list); + atomic_inc(&inode->i_count); +/* + printk(KERN_DEBUG "alloc_inode_obj(%p, %d, %d): object created\n", + inode, type, id); +*/ + return obj; +} + +/* inode must be locked already */ +static struct inode_obj_id* get_inode_obj(struct inode* inode, + obj_type_t type, + unsigned int id) +{ + struct list_head* pos; + struct inode_obj_id* obj = NULL; + + list_for_each(pos, &inode->i_obj_list) { + obj = list_entry(pos, struct inode_obj_id, list); + if (obj->id == id && obj->type == type) { + atomic_inc(&obj->count); + return obj; + } + } +/* + printk(KERN_DEBUG "get_inode_obj(%p, %d, %d): couldn't find object\n", + inode, type, id); +*/ + return NULL; +} + + +static void put_inode_obj(struct inode_obj_id* obj) +{ + struct inode* inode; + int let_go = 0; + + inode = obj->inode; + if (atomic_dec_and_test(&obj->count)) { + + mutex_lock(&inode->i_obj_mutex); + /* no new references can be obtained */ + if (!atomic_read(&obj->count)) { + list_del(&obj->list); + fdso_destroy(obj->type, obj->obj); + kfree(obj); + let_go = 1; + } + mutex_unlock(&inode->i_obj_mutex); + if 
(let_go) + iput(inode); + } +} + +static struct od_table_entry* get_od_entry(struct task_struct* t) +{ + struct od_table_entry* table; + int i; + + + table = t->od_table; + if (!table) { + table = (struct od_table_entry*) + kzalloc(sizeof(struct od_table_entry) * + MAX_OBJECT_DESCRIPTORS, GFP_KERNEL); + t->od_table = table; + } + + for (i = 0; table && i < MAX_OBJECT_DESCRIPTORS; i++) + if (!table[i].used) { + table[i].used = 1; + return table + i; + } + return NULL; +} + +static int put_od_entry(struct od_table_entry* od) +{ + put_inode_obj(od->obj); + od->used = 0; + return 0; +} + +void exit_od_table(struct task_struct* t) +{ + int i; + + if (t->od_table) { + for (i = 0; i < MAX_OBJECT_DESCRIPTORS; i++) + if (t->od_table[i].used) + put_od_entry(t->od_table + i); + kfree(t->od_table); + t->od_table = NULL; + } +} + +static int do_sys_od_open(struct file* file, obj_type_t type, int id, + void* __user config) +{ + int idx = 0, err; + struct inode* inode; + struct inode_obj_id* obj = NULL; + struct od_table_entry* entry; + + inode = file->f_dentry->d_inode; + + entry = get_od_entry(current); + if (!entry) + return -ENOMEM; + + mutex_lock(&inode->i_obj_mutex); + obj = get_inode_obj(inode, type, id); + if (!obj) + obj = alloc_inode_obj(inode, type, id); + if (!obj) { + idx = -ENOMEM; + entry->used = 0; + } else { + entry->obj = obj; + entry->extra = NULL; + idx = entry - current->od_table; + } + + mutex_unlock(&inode->i_obj_mutex); + + /* FIXME: What if the allocation failed? */ + err = fdso_open(entry, config); + if (err < 0) { + /* The class rejected the open call. + * We need to clean up and tell user space. + */ + put_od_entry(entry); + idx = err; + } + + return idx; +} + + +struct od_table_entry* __od_lookup(int od) +{ + struct task_struct *t = current; + + if (!t->od_table) + return NULL; + if (od < 0 || od >= MAX_OBJECT_DESCRIPTORS) + return NULL; + if (!t->od_table[od].used) + return NULL; + return t->od_table + od; +} + + +asmlinkage int sys_od_open(int fd, int type, int obj_id, void* __user config) +{ + int ret = 0; + struct file* file; + + /* + 1) get file from fd, get inode from file + 2) lock inode + 3) try to lookup object + 4) if not present create and enqueue object, inc inode refcnt + 5) increment refcnt of object + 6) alloc od_table_entry, setup ptrs + 7) unlock inode + 8) return offset in od_table as OD + */ + + if (type < MIN_OBJ_TYPE || type > MAX_OBJ_TYPE) { + ret = -EINVAL; + goto out; + } + + file = fget(fd); + if (!file) { + ret = -EBADF; + goto out; + } + + ret = do_sys_od_open(file, type, obj_id, config); + + fput(file); + +out: + return ret; +} + + +asmlinkage int sys_od_close(int od) +{ + int ret = -EINVAL; + struct task_struct *t = current; + + if (od < 0 || od >= MAX_OBJECT_DESCRIPTORS) + return ret; + + if (!t->od_table || !t->od_table[od].used) + return ret; + + + /* give the class a chance to reject the close + */ + ret = fdso_close(t->od_table + od); + if (ret == 0) + ret = put_od_entry(t->od_table + od); + + return ret; +} diff --git a/litmus/ft_event.c b/litmus/ft_event.c new file mode 100644 index 0000000..db9f4ea --- /dev/null +++ b/litmus/ft_event.c @@ -0,0 +1,104 @@ +#include + +#include + +/* the feather trace management functions assume + * exclusive access to the event table + */ + + +#define BYTE_JUMP 0xeb +#define BYTE_JUMP_LEN 0x02 + +/* for each event, there is an entry in the event table */ +struct trace_event { + long id; + long count; + long start_addr; + long end_addr; +}; + +extern struct trace_event __start___event_table[]; +extern struct 
trace_event __stop___event_table[]; + +int ft_enable_event(unsigned long id) +{ + struct trace_event* te = __start___event_table; + int count = 0; + char* delta; + unsigned char* instr; + + while (te < __stop___event_table) { + if (te->id == id && ++te->count == 1) { + instr = (unsigned char*) te->start_addr; + /* make sure we don't clobber something wrong */ + if (*instr == BYTE_JUMP) { + delta = (((unsigned char*) te->start_addr) + 1); + *delta = 0; + } + } + if (te->id == id) + count++; + te++; + } + return count; +} + +int ft_disable_event(unsigned long id) +{ + struct trace_event* te = __start___event_table; + int count = 0; + char* delta; + unsigned char* instr; + + while (te < __stop___event_table) { + if (te->id == id && --te->count == 0) { + instr = (unsigned char*) te->start_addr; + if (*instr == BYTE_JUMP) { + delta = (((unsigned char*) te->start_addr) + 1); + *delta = te->end_addr - te->start_addr - + BYTE_JUMP_LEN; + } + } + if (te->id == id) + count++; + te++; + } + return count; +} + +int ft_disable_all_events(void) +{ + struct trace_event* te = __start___event_table; + int count = 0; + char* delta; + unsigned char* instr; + + while (te < __stop___event_table) { + if (te->count) { + instr = (unsigned char*) te->start_addr; + if (*instr == BYTE_JUMP) { + delta = (((unsigned char*) te->start_addr) + + 1); + *delta = te->end_addr - te->start_addr - + BYTE_JUMP_LEN; + te->count = 0; + count++; + } + } + te++; + } + return count; +} + +int ft_is_event_enabled(unsigned long id) +{ + struct trace_event* te = __start___event_table; + + while (te < __stop___event_table) { + if (te->id == id) + return te->count; + te++; + } + return 0; +} diff --git a/litmus/jobs.c b/litmus/jobs.c new file mode 100644 index 0000000..e294bc5 --- /dev/null +++ b/litmus/jobs.c @@ -0,0 +1,43 @@ +/* litmus/jobs.c - common job control code + */ + +#include + +#include +#include + +void prepare_for_next_period(struct task_struct *t) +{ + BUG_ON(!t); + /* prepare next release */ + t->rt_param.job_params.release = t->rt_param.job_params.deadline; + t->rt_param.job_params.deadline += get_rt_period(t); + t->rt_param.job_params.exec_time = 0; + /* update job sequence number */ + t->rt_param.job_params.job_no++; + + /* don't confuse Linux */ + t->time_slice = 1; +} + +void release_at(struct task_struct *t, lt_t start) +{ + t->rt_param.job_params.deadline = start; + prepare_for_next_period(t); + set_rt_flags(t, RT_F_RUNNING); +} + + +/* + * Deactivate current task until the beginning of the next period. + */ +long complete_job(void) +{ + /* Mark that we do not excute anymore */ + set_rt_flags(current, RT_F_SLEEP); + /* call schedule, this will return when a new job arrives + * it also takes care of preparing for the next release + */ + schedule(); + return 0; +} diff --git a/litmus/litmus.c b/litmus/litmus.c new file mode 100644 index 0000000..77aad7d --- /dev/null +++ b/litmus/litmus.c @@ -0,0 +1,830 @@ +/* litmus.c -- Implementation of the LITMUS syscalls, the LITMUS intialization code, + * and the procfs interface.. + */ +#include +#include +#include + +#include +#include + + +#include +#include +#include + +#include + +/* Number of RT tasks that exist in the system */ +atomic_t rt_task_count = ATOMIC_INIT(0); +static DEFINE_SPINLOCK(task_transition_lock); + +/* To send signals from the scheduler + * Must drop locks first. 
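
The job bookkeeping in prepare_for_next_period() and release_at() above reduces to simple arithmetic on the release time and deadline. The standalone sketch below replays that arithmetic for a task with a period of 10 time units released at time 100 (types simplified; in the kernel, lt_t is an unsigned 64-bit time value):

#include <stdio.h>

typedef unsigned long long lt_t;

struct job_params { lt_t release, deadline; unsigned int job_no; };

/* mirrors prepare_for_next_period(): the next job is released at the
 * old deadline, and its deadline moves one period into the future */
static void next_period(struct job_params *j, lt_t period)
{
        j->release = j->deadline;
        j->deadline += period;
        j->job_no++;
}

/* mirrors release_at(): the first job is released at 'start' */
static void release_at_time(struct job_params *j, lt_t start, lt_t period)
{
        j->deadline = start;
        next_period(j, period);
}

int main(void)
{
        struct job_params j = { 0, 0, 0 };
        const lt_t period = 10;
        int i;

        release_at_time(&j, 100, period);
        for (i = 0; i < 3; i++) {
                /* prints jobs 1..3: (100,110), (110,120), (120,130) */
                printf("job %u: release=%llu deadline=%llu\n",
                       j.job_no, j.release, j.deadline);
                next_period(&j, period);
        }
        return 0;
}
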
+ */ +static LIST_HEAD(sched_sig_list); +static DEFINE_SPINLOCK(sched_sig_list_lock); + +/* + * sys_set_task_rt_param + * @pid: Pid of the task which scheduling parameters must be changed + * @param: New real-time extension parameters such as the execution cost and + * period + * Syscall for manipulating with task rt extension params + * Returns EFAULT if param is NULL. + * ESRCH if pid is not corrsponding + * to a valid task. + * EINVAL if either period or execution cost is <=0 + * EPERM if pid is a real-time task + * 0 if success + * + * Only non-real-time tasks may be configured with this system call + * to avoid races with the scheduler. In practice, this means that a + * task's parameters must be set _before_ calling sys_prepare_rt_task() + */ +asmlinkage long sys_set_rt_task_param(pid_t pid, struct rt_task __user * param) +{ + struct rt_task tp; + struct task_struct *target; + int retval = -EINVAL; + + printk("Setting up rt task parameters for process %d.\n", pid); + + if (pid < 0 || param == 0) { + goto out; + } + if (copy_from_user(&tp, param, sizeof(tp))) { + retval = -EFAULT; + goto out; + } + + /* Task search and manipulation must be protected */ + read_lock_irq(&tasklist_lock); + if (!(target = find_task_by_pid(pid))) { + retval = -ESRCH; + goto out_unlock; + } + + if (is_realtime(target)) { + /* The task is already a real-time task. + * We cannot not allow parameter changes at this point. + */ + retval = -EBUSY; + goto out_unlock; + } + + if (tp.exec_cost <= 0) + goto out_unlock; + if (tp.period <= 0) + goto out_unlock; + if (!cpu_online(tp.cpu)) + goto out_unlock; + if (tp.period < tp.exec_cost) + { + printk(KERN_INFO "litmus: real-time task %d rejected " + "because wcet > period\n", pid); + goto out_unlock; + } + + target->rt_param.task_params = tp; + + retval = 0; + out_unlock: + read_unlock_irq(&tasklist_lock); + out: + return retval; +} + +/* Getter of task's RT params + * returns EINVAL if param or pid is NULL + * returns ESRCH if pid does not correspond to a valid task + * returns EFAULT if copying of parameters has failed. + */ +asmlinkage long sys_get_rt_task_param(pid_t pid, struct rt_task __user * param) +{ + int retval = -EINVAL; + struct task_struct *source; + struct rt_task lp; + if (param == 0 || pid < 0) + goto out; + read_lock(&tasklist_lock); + if (!(source = find_task_by_pid(pid))) { + retval = -ESRCH; + goto out_unlock; + } + lp = source->rt_param.task_params; + read_unlock(&tasklist_lock); + /* Do copying outside the lock */ + retval = + copy_to_user(param, &lp, sizeof(lp)) ? -EFAULT : 0; + return retval; + out_unlock: + read_unlock(&tasklist_lock); + out: + return retval; + +} + +/* sys_task_mode_transition + * @target_mode: The desired execution mode after the system call completes. + * Either BACKGROUND_TASK or LITMUS_RT_TASK. + * Allow a normal task to become a real-time task, vice versa. + * Returns EINVAL if illegal transition requested. + * 0 if task mode was changed succesfully + * other if plugin failed. + */ +asmlinkage long sys_task_mode_transition(int target_mode) +{ + int retval = -EINVAL; + struct task_struct *t = current; + + if (( is_realtime(t) && target_mode == BACKGROUND_TASK) || + (!is_realtime(t) && target_mode == LITMUS_RT_TASK)) { + TRACE_TASK(t, "attempts mode transition to %s\n", + is_realtime(t) ? 
"best-effort" : "real-time"); + preempt_disable(); + t->rt_param.transition_pending = 1; + t->state = TASK_STOPPED; + preempt_enable_no_resched(); + + schedule(); + + retval = t->rt_param.transition_error; + } + return retval; +} + +/* implemented in kernel/litmus_sem.c */ +void srp_ceiling_block(void); + +/* + * This is the crucial function for periodic task implementation, + * It checks if a task is periodic, checks if such kind of sleep + * is permitted and calls plugin-specific sleep, which puts the + * task into a wait array. + * returns 0 on successful wakeup + * returns EPERM if current conditions do not permit such sleep + * returns EINVAL if current task is not able to go to sleep + */ +asmlinkage long sys_sleep_next_period(void) +{ + int retval = -EPERM; + if (!is_realtime(current)) { + retval = -EINVAL; + goto out; + } + /* Task with negative or zero period cannot sleep */ + if (get_rt_period(current) <= 0) { + retval = -EINVAL; + goto out; + } + /* The plugin has to put the task into an + * appropriate queue and call schedule + */ + retval = curr_sched_plugin->sleep_next_period(); + out: + return retval; +} + +/* This is an "improved" version of sys_sleep_next_period() that + * addresses the problem of unintentionally missing a job after + * an overrun. + * + * returns 0 on successful wakeup + * returns EPERM if current conditions do not permit such sleep + * returns EINVAL if current task is not able to go to sleep + */ +asmlinkage long sys_wait_for_job_release(unsigned int job) +{ + int retval = -EPERM; + if (!is_realtime(current)) { + retval = -EINVAL; + goto out; + } + + /* Task with negative or zero period cannot sleep */ + if (get_rt_period(current) <= 0) { + retval = -EINVAL; + goto out; + } + + retval = 0; + + /* first wait until we have "reached" the desired job + * + * This implementation has at least two problems: + * + * 1) It doesn't gracefully handle the wrap around of + * job_no. Since LITMUS is a prototype, this is not much + * of a problem right now. + * + * 2) It is theoretically racy if a job release occurs + * between checking job_no and calling sleep_next_period(). + * A proper solution would requiring adding another callback + * in the plugin structure and testing the condition with + * interrupts disabled. + * + * FIXME: At least problem 2 should be taken care of eventually. + */ + while (!retval && job > current->rt_param.job_params.job_no) + /* If the last job overran then job <= job_no and we + * don't send the task to sleep. + */ + retval = curr_sched_plugin->sleep_next_period(); + out: + return retval; +} + +/* This is a helper syscall to query the current job sequence number. + * + * returns 0 on successful query + * returns EPERM if task is not a real-time task. + * returns EFAULT if &job is not a valid pointer. 
+ */ +asmlinkage long sys_query_job_no(unsigned int __user *job) +{ + int retval = -EPERM; + if (is_realtime(current)) + retval = put_user(current->rt_param.job_params.job_no, job); + + return retval; +} + +struct sched_sig { + struct list_head list; + struct task_struct* task; + unsigned int signal:31; + int force:1; +}; + +static void __scheduler_signal(struct task_struct *t, unsigned int signo, + int force) +{ + struct sched_sig* sig; + + sig = kmalloc(GFP_ATOMIC, sizeof(struct sched_sig)); + if (!sig) { + TRACE_TASK(t, "dropping signal: %u\n", t); + return; + } + + spin_lock(&sched_sig_list_lock); + + sig->signal = signo; + sig->force = force; + sig->task = t; + get_task_struct(t); + list_add(&sig->list, &sched_sig_list); + + spin_unlock(&sched_sig_list_lock); +} + +void scheduler_signal(struct task_struct *t, unsigned int signo) +{ + __scheduler_signal(t, signo, 0); +} + +void force_scheduler_signal(struct task_struct *t, unsigned int signo) +{ + __scheduler_signal(t, signo, 1); +} + +/* FIXME: get rid of the locking and do this on a per-processor basis */ +void send_scheduler_signals(void) +{ + unsigned long flags; + struct list_head *p, *extra; + struct siginfo info; + struct sched_sig* sig; + struct task_struct* t; + struct list_head claimed; + + if (spin_trylock_irqsave(&sched_sig_list_lock, flags)) { + if (list_empty(&sched_sig_list)) + p = NULL; + else { + p = sched_sig_list.next; + list_del(&sched_sig_list); + INIT_LIST_HEAD(&sched_sig_list); + } + spin_unlock_irqrestore(&sched_sig_list_lock, flags); + + /* abort if there are no signals */ + if (!p) + return; + + /* take signal list we just obtained */ + list_add(&claimed, p); + + list_for_each_safe(p, extra, &claimed) { + list_del(p); + sig = list_entry(p, struct sched_sig, list); + t = sig->task; + info.si_signo = sig->signal; + info.si_errno = 0; + info.si_code = SI_KERNEL; + info.si_pid = 1; + info.si_uid = 0; + TRACE("sending signal %d to %d\n", info.si_signo, + t->pid); + if (sig->force) + force_sig_info(sig->signal, &info, t); + else + send_sig_info(sig->signal, &info, t); + put_task_struct(t); + kfree(sig); + } + } + +} + +static inline void np_mem_error(struct task_struct* t, const char* reason) +{ + if (t->state != TASK_DEAD && !(t->flags & PF_EXITING)) { + TRACE("np section: %s => %s/%d killed\n", + reason, t->comm, t->pid); + force_scheduler_signal(t, SIGKILL); + } +} + +/* sys_register_np_flag() allows real-time tasks to register an + * np section indicator. + * returns 0 if the flag was successfully registered + * returns EINVAL if current task is not a real-time task + * returns EFAULT if *flag couldn't be written + */ +asmlinkage long sys_register_np_flag(short __user *flag) +{ + int retval = -EINVAL; + short test_val = RT_PREEMPTIVE; + + /* avoid races with the scheduler */ + preempt_disable(); + TRACE("reg_np_flag(%p) for %s/%d\n", flag, + current->comm, current->pid); + + /* Let's first try to write to the address. + * That way it is initialized and any bugs + * involving dangling pointers will caught + * early. + * NULL indicates disabling np section support + * and should not be tested. 
+ */ + if (flag) + retval = poke_kernel_address(test_val, flag); + else + retval = 0; + TRACE("reg_np_flag: retval=%d\n", retval); + if (unlikely(0 != retval)) + np_mem_error(current, "np flag: not writable"); + else + /* the pointer is ok */ + current->rt_param.np_flag = flag; + + preempt_enable(); + return retval; +} + + +void request_exit_np(struct task_struct *t) +{ + int ret; + short flag; + + /* We can only do this if t is actually currently scheduled on this CPU + * because otherwise we are in the wrong address space. Thus make sure + * to check. + */ + BUG_ON(t != current); + + if (unlikely(!is_realtime(t) || !t->rt_param.np_flag)) { + TRACE_TASK(t, "request_exit_np(): BAD TASK!\n"); + return; + } + + flag = RT_EXIT_NP_REQUESTED; + ret = poke_kernel_address(flag, t->rt_param.np_flag + 1); + TRACE("request_exit_np(%s/%d)\n", t->comm, t->pid); + if (unlikely(0 != ret)) + np_mem_error(current, "request_exit_np(): flag not writable"); + +} + + +int is_np(struct task_struct* t) +{ + int ret; + unsigned short flag = 0x5858; /* = XX, looks nicer in debug*/ + + BUG_ON(t != current); + + if (unlikely(t->rt_param.kernel_np)) + return 1; + else if (unlikely(t->rt_param.np_flag == NULL) || + t->flags & PF_EXITING || + t->state == TASK_DEAD) + return 0; + else { + /* This is the tricky part. The process has registered a + * non-preemptive section marker. We now need to check whether + * it is set to to NON_PREEMPTIVE. Along the way we could + * discover that the pointer points to an unmapped region (=> + * kill the task) or that the location contains some garbage + * value (=> also kill the task). Killing the task in any case + * forces userspace to play nicely. Any bugs will be discovered + * immediately. + */ + ret = probe_kernel_address(t->rt_param.np_flag, flag); + if (0 == ret && (flag == RT_NON_PREEMPTIVE || + flag == RT_PREEMPTIVE)) + return flag != RT_PREEMPTIVE; + else { + /* either we could not read from the address or + * it contained garbage => kill the process + * FIXME: Should we cause a SEGFAULT instead? + */ + TRACE("is_np: ret=%d flag=%c%c (%x)\n", ret, + flag & 0xff, (flag >> 8) & 0xff, flag); + np_mem_error(t, "is_np() could not read"); + return 0; + } + } +} + +/* + * sys_exit_np() allows real-time tasks to signal that it left a + * non-preemptable section. It will be called after the kernel requested a + * callback in the preemption indicator flag. + * returns 0 if the signal was valid and processed. + * returns EINVAL if current task is not a real-time task + */ +asmlinkage long sys_exit_np(void) +{ + int retval = -EINVAL; + + TS_EXIT_NP_START; + + if (!is_realtime(current)) + goto out; + + TRACE("sys_exit_np(%s/%d)\n", current->comm, current->pid); + /* force rescheduling so that we can be preempted */ + set_tsk_need_resched(current); + retval = 0; + out: + + TS_EXIT_NP_END; + return retval; +} + +void __setscheduler(struct task_struct *, int, int); + +/* p is a real-time task. Re-init its state as a best-effort task. */ +static void reinit_litmus_state(struct task_struct* p, int restore) +{ + struct rt_task user_config = {}; + __user short *np_flag = NULL; + + if (restore) { + /* Safe user-space provided configuration data. + * FIXME: This is missing service levels for adaptive tasks. + */ + user_config = p->rt_param.task_params; + np_flag = p->rt_param.np_flag; + } + + /* We probably should not be inheriting any task's priority + * at this point in time. + */ + WARN_ON(p->rt_param.inh_task); + + /* We need to restore the priority of the task. 
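
Judging from is_np() above (which reads the first short of the registered flag) and request_exit_np() (which writes RT_EXIT_NP_REQUESTED to np_flag + 1), the registered flag appears to be a pair of adjacent shorts: the first owned by the task, the second written by the kernel. Under that assumption, a user-space non-preemptive section would look roughly like the sketch below; the wrapper functions are illustrative, and the RT_* constants are expected to come from the LITMUS^RT user-space headers (not shown here):

/* assumed layout: flag[0] is written by the task, flag[1] by the kernel */
static volatile short np_flag[2] = { RT_PREEMPTIVE, 0 };

int register_np_flag(volatile short *flag); /* assumed wrapper, sys_register_np_flag() */
int exit_np(void);                          /* assumed wrapper, sys_exit_np() */

static void np_section_example(void)
{
        register_np_flag(np_flag);              /* once, after becoming real-time */

        np_flag[0] = RT_NON_PREEMPTIVE;         /* enter the NP section */
        /* ... short critical section, no blocking ... */
        np_flag[0] = RT_PREEMPTIVE;             /* leave the NP section */

        /* if the scheduler asked us to yield while we were non-preemptive,
         * acknowledge the request so we can be preempted promptly */
        if (np_flag[1] == RT_EXIT_NP_REQUESTED) {
                np_flag[1] = 0;
                exit_np();
        }
}
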
*/ + __setscheduler(p, p->rt_param.old_policy, p->rt_param.old_prio); + + /* Cleanup everything else. */ + memset(&p->rt_param, 0, sizeof(struct rt_task)); + + /* Restore preserved fields. */ + if (restore) { + p->rt_param.task_params = user_config; + p->rt_param.np_flag = np_flag; + } +} + +long transition_to_rt(struct task_struct* tsk) +{ + long retval; + long flags; + + BUG_ON(is_realtime(tsk)); + + if (get_rt_period(tsk) == 0 || + get_exec_cost(tsk) > get_rt_period(tsk)) { + TRACE_TASK(tsk, "litmus prepare: invalid task parameters " + "(%lu, %lu)\n", + get_exec_cost(tsk), get_rt_period(tsk)); + return -EINVAL; + } + + if (!cpu_online(get_partition(tsk))) + { + TRACE_TASK(tsk, "litmus prepare: cpu %d is not online\n", + get_partition(tsk)); + return -EINVAL; + } + + tsk->rt_param.old_prio = tsk->rt_priority; + tsk->rt_param.old_policy = tsk->policy; + INIT_LIST_HEAD(&tsk->rt_list); + + /* avoid scheduler plugin changing underneath us */ + spin_lock_irqsave(&task_transition_lock, flags); + retval = curr_sched_plugin->prepare_task(tsk); + + if (!retval) { + atomic_inc(&rt_task_count); + __setscheduler(tsk, SCHED_FIFO, MAX_RT_PRIO - 1); + tsk->rt_param.is_realtime = 1; + tsk->rt_param.litmus_controlled = 1; + } + spin_unlock_irqrestore(&task_transition_lock, flags); + + return retval; +} + +long transition_to_be(struct task_struct* tsk) +{ + BUG_ON(!is_realtime(tsk)); + + curr_sched_plugin->tear_down(tsk); + atomic_dec(&rt_task_count); + reinit_litmus_state(tsk, 1); + return 0; +} + + +/* Switching a plugin in use is tricky. + * We must watch out that no real-time tasks exists + * (and that none is created in parallel) and that the plugin is not + * currently in use on any processor (in theory). + * + * For now, we don't enforce the second part since it is unlikely to cause + * any trouble by itself as long as we don't unload modules. + */ +int switch_sched_plugin(struct sched_plugin* plugin) +{ + long flags; + int ret = 0; + + BUG_ON(!plugin); + + /* stop task transitions */ + spin_lock_irqsave(&task_transition_lock, flags); + + /* don't switch if there are active real-time tasks */ + if (atomic_read(&rt_task_count) == 0) { + printk(KERN_INFO "Switching to LITMUS^RT plugin %s.\n", plugin->plugin_name); + curr_sched_plugin = plugin; + } else + ret = -EBUSY; + + spin_unlock_irqrestore(&task_transition_lock, flags); + return ret; +} + +/* Called upon fork. + * p is the newly forked task. + */ +void litmus_fork(struct task_struct* p) +{ + if (is_realtime(p)) + /* clean out any litmus related state, don't preserve anything*/ + reinit_litmus_state(p, 0); +} + +/* Called upon execve(). + * current is doing the exec. + * Don't let address space specific stuff leak. 
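
Taken together, sys_set_rt_task_param(), sys_task_mode_transition(), and sys_sleep_next_period() above suggest the following life cycle for a periodic user-space task. As in the earlier sketches, the wrapper functions are assumed thin wrappers around the system calls and are not part of this patch; BACKGROUND_TASK, LITMUS_RT_TASK, and the full struct rt_task definition are expected from the LITMUS^RT user-space headers:

#include <unistd.h>

struct rt_task;                         /* full definition: litmus/rt_param.h */

/* assumed wrappers around the LITMUS^RT system calls shown above */
int set_rt_task_param(pid_t pid, struct rt_task *param);
int task_mode(int target_mode);         /* sys_task_mode_transition() */
int sleep_next_period(void);            /* sys_sleep_next_period() */
void do_work(void);                     /* the application's per-job work */

void periodic_main(struct rt_task *params)
{
        int job;

        /* 1. configure while still best-effort; sys_set_rt_task_param()
         *    rejects exec_cost > period and offline CPUs */
        set_rt_task_param(getpid(), params);

        /* 2. become a real-time task */
        task_mode(LITMUS_RT_TASK);

        for (job = 0; job < 1000; job++) {
                do_work();              /* one job's worth of work */
                sleep_next_period();    /* suspend until the next release */
        }

        /* 3. return to best-effort scheduling */
        task_mode(BACKGROUND_TASK);
}
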
+ */ +void litmus_exec(void) +{ + struct task_struct* p = current; + + if (is_realtime(p)) { + WARN_ON(p->rt_param.inh_task); + p->rt_param.np_flag = NULL; + } +} + +void exit_litmus(struct task_struct *dead_tsk) +{ + if (is_realtime(dead_tsk)) + transition_to_be(dead_tsk); +} + + +void list_qsort(struct list_head* list, list_cmp_t less_than) +{ + struct list_head lt; + struct list_head geq; + struct list_head *pos, *extra, *pivot; + int n_lt = 0, n_geq = 0; + BUG_ON(!list); + + if (list->next == list) + return; + + INIT_LIST_HEAD(<); + INIT_LIST_HEAD(&geq); + + pivot = list->next; + list_del(pivot); + list_for_each_safe(pos, extra, list) { + list_del(pos); + if (less_than(pos, pivot)) { + list_add(pos, <); + n_lt++; + } else { + list_add(pos, &geq); + n_geq++; + } + } + if (n_lt < n_geq) { + list_qsort(<, less_than); + list_qsort(&geq, less_than); + } else { + list_qsort(&geq, less_than); + list_qsort(<, less_than); + } + list_splice(&geq, list); + list_add(pivot, list); + list_splice(<, list); +} + +#ifdef CONFIG_MAGIC_SYSRQ +int sys_kill(int pid, int sig); + +static void sysrq_handle_kill_rt_tasks(int key, struct tty_struct *tty) +{ + struct task_struct *t; + read_lock(&tasklist_lock); + for_each_process(t) { + if (is_realtime(t)) { + sys_kill(t->pid, SIGKILL); + } + } + read_unlock(&tasklist_lock); +} + +static struct sysrq_key_op sysrq_kill_rt_tasks_op = { + .handler = sysrq_handle_kill_rt_tasks, + .help_msg = "Quit-rt-tasks", + .action_msg = "sent SIGKILL to all real-time tasks", +}; +#endif + +static int proc_read_stats(char *page, char **start, + off_t off, int count, + int *eof, void *data) +{ + int len; + + len = snprintf(page, PAGE_SIZE, + "real-time task count = %d\n", + atomic_read(&rt_task_count)); + return len; +} + +static int proc_read_plugins(char *page, char **start, + off_t off, int count, + int *eof, void *data) +{ + int len; + + len = print_sched_plugins(page, PAGE_SIZE); + return len; +} + +static int proc_read_curr(char *page, char **start, + off_t off, int count, + int *eof, void *data) +{ + int len; + + len = snprintf(page, PAGE_SIZE, "%s\n", curr_sched_plugin->plugin_name); + return len; +} + +static int proc_write_curr(struct file *file, + const char *buffer, + unsigned long count, + void *data) +{ + int len, ret; + char name[65]; + struct sched_plugin* found; + + if(count > 64) + len = 64; + else + len = count; + + if(copy_from_user(name, buffer, len)) + return -EFAULT; + + name[len] = '\0'; + /* chomp name */ + if (len > 1 && name[len - 1] == '\n') + name[len - 1] = '\0'; + + found = find_sched_plugin(name); + + if (found) { + ret = switch_sched_plugin(found); + if (ret != 0) + printk(KERN_INFO "Could not switch plugin: %d\n", ret); + } else + printk(KERN_INFO "Plugin '%s' is unknown.\n", name); + + return len; +} + + +static struct proc_dir_entry *litmus_dir = NULL, + *curr_file = NULL, + *stat_file = NULL, + *plugs_file = NULL; + +static int __init init_litmus_proc(void) +{ + litmus_dir = proc_mkdir("litmus", NULL); + if (!litmus_dir) { + printk(KERN_ERR "Could not allocate LITMUS^RT procfs entry.\n"); + return -ENOMEM; + } + litmus_dir->owner = THIS_MODULE; + + curr_file = create_proc_entry("active_plugin", + 0644, litmus_dir); + if (!curr_file) { + printk(KERN_ERR "Could not allocate active_plugin " + "procfs entry.\n"); + return -ENOMEM; + } + curr_file->owner = THIS_MODULE; + curr_file->read_proc = proc_read_curr; + curr_file->write_proc = proc_write_curr; + + stat_file = create_proc_read_entry("stats", 0444, litmus_dir, + proc_read_stats, NULL); + + 
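
The procfs entries created in init_litmus_proc() can be driven from ordinary user space. The standalone program below prints the active plugin, the task count, and the plugin list, and optionally requests a plugin switch, which switch_sched_plugin() refuses with -EBUSY while real-time tasks exist. The plugin name strings themselves are defined by the individual plugins and are not shown in this hunk:

#include <stdio.h>

/* paths correspond to the entries created in init_litmus_proc() */
static void dump(const char *path)
{
        char line[128];
        FILE *f = fopen(path, "r");

        if (!f)
                return;
        while (fgets(line, sizeof(line), f))
                printf("%s: %s", path, line);
        fclose(f);
}

int main(int argc, char **argv)
{
        dump("/proc/litmus/active_plugin");
        dump("/proc/litmus/stats");
        dump("/proc/litmus/plugins");

        if (argc > 1) {         /* argv[1]: name of a registered plugin */
                FILE *f = fopen("/proc/litmus/active_plugin", "w");
                if (f) {
                        fprintf(f, "%s\n", argv[1]);
                        fclose(f);
                }
        }
        return 0;
}
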
plugs_file = create_proc_read_entry("plugins", 0444, litmus_dir, + proc_read_plugins, NULL); + + return 0; +} + +static void exit_litmus_proc(void) +{ + if (plugs_file) + remove_proc_entry("plugins", litmus_dir); + if (stat_file) + remove_proc_entry("stats", litmus_dir); + if (curr_file) + remove_proc_entry("active_plugin", litmus_dir); + if (litmus_dir) + remove_proc_entry("litmus", NULL); +} + +extern struct sched_plugin linux_sched_plugin; + +static int __init _init_litmus(void) +{ + /* Common initializers, + * mode change lock is used to enforce single mode change + * operation. + */ + printk("Starting LITMUS^RT kernel\n"); + + register_sched_plugin(&linux_sched_plugin); + +#ifdef CONFIG_MAGIC_SYSRQ + /* offer some debugging help */ + if (!register_sysrq_key('q', &sysrq_kill_rt_tasks_op)) + printk("Registered kill rt tasks magic sysrq.\n"); + else + printk("Could not register kill rt tasks magic sysrq.\n"); +#endif + + init_litmus_proc(); + + return 0; +} + +static void _exit_litmus(void) +{ + exit_litmus_proc(); +} + +module_init(_init_litmus); +module_exit(_exit_litmus); diff --git a/litmus/litmus_sem.c b/litmus/litmus_sem.c new file mode 100644 index 0000000..7179b43 --- /dev/null +++ b/litmus/litmus_sem.c @@ -0,0 +1,551 @@ +/* + * PI semaphores and SRP implementations. + * Much of the code here is borrowed from include/asm-i386/semaphore.h. + * + * NOTE: This implementation is very much a prototype and horribly insecure. It + * is intended to be a proof of concept, not a feature-complete solution. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +/* ************************************************************************** */ +/* PRIORITY INHERITANCE */ +/* ************************************************************************** */ + +static void* create_pi_semaphore(void) +{ + struct pi_semaphore* sem; + int i; + + sem = kmalloc(sizeof(struct pi_semaphore), GFP_KERNEL); + if (!sem) + return NULL; + atomic_set(&sem->count, 1); + sem->sleepers = 0; + init_waitqueue_head(&sem->wait); + sem->hp.task = NULL; + sem->holder = NULL; + for (i = 0; i < NR_CPUS; i++) + sem->hp.cpu_task[i] = NULL; + return sem; +} + +static void destroy_pi_semaphore(void* sem) +{ + /* XXX assert invariants */ + kfree(sem); +} + +struct fdso_ops pi_sem_ops = { + .create = create_pi_semaphore, + .destroy = destroy_pi_semaphore +}; + +struct wq_pair { + struct task_struct* tsk; + struct pi_semaphore* sem; +}; + +static int rt_pi_wake_up(wait_queue_t *wait, unsigned mode, int sync, + void *key) +{ + struct wq_pair* wqp = (struct wq_pair*) wait->private; + set_rt_flags(wqp->tsk, RT_F_EXIT_SEM); + curr_sched_plugin->inherit_priority(wqp->sem, wqp->tsk); + TRACE_TASK(wqp->tsk, + "woken up by rt_pi_wake_up() (RT_F_SEM_EXIT, PI)\n"); + /* point to task for default_wake_function() */ + wait->private = wqp->tsk; + default_wake_function(wait, mode, sync, key); + + /* Always return true since we know that if we encountered a task + * that was already running the wake_up raced with the schedule in + * rt_pi_down(). In that case the task in rt_pi_down() will be scheduled + * immediately and own the lock. We must not wake up another task in + * any case. 
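
Before the slow path is examined in detail, it is worth recalling how these PI semaphores are reached from user space: a semaphore is attached to an inode via sys_od_open() and then locked through its object descriptor. The wrapper names and the PI_SEM object-type constant below are assumptions; the object-type enumeration lives in litmus/fdso.h, which is not part of this hunk:

#include <fcntl.h>
#include <unistd.h>

/* assumed wrappers around sys_od_open(), sys_od_close(),
 * sys_pi_down(), and sys_pi_up() */
int od_open(int fd, int type, int obj_id, void *config);
int od_close(int od);
int pi_down(int sem_od);
int pi_up(int sem_od);

void pi_example(const char *shared_file)
{
        /* all tasks that attach (fd, PI_SEM, 0) share the same semaphore */
        int fd = open(shared_file, O_RDONLY);
        int od = od_open(fd, PI_SEM, 0, NULL);  /* PI_SEM: assumed constant */

        pi_down(od);
        /* ... critical section, priority inheritance in effect ... */
        pi_up(od);

        od_close(od);
        close(fd);
}
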
+ */ + return 1; +} + +/* caller is responsible for locking */ +int set_hp_task(struct pi_semaphore *sem, prio_cmp_t higher_prio) +{ + struct list_head *tmp, *next; + struct task_struct *queued; + int ret = 0; + + sem->hp.task = NULL; + list_for_each_safe(tmp, next, &sem->wait.task_list) { + queued = ((struct wq_pair*) + list_entry(tmp, wait_queue_t, + task_list)->private)->tsk; + + /* Compare task prios, find high prio task. */ + if (higher_prio(queued, sem->hp.task)) { + sem->hp.task = queued; + ret = 1; + } + } + return ret; +} + +/* caller is responsible for locking */ +int set_hp_cpu_task(struct pi_semaphore *sem, int cpu, prio_cmp_t higher_prio) +{ + struct list_head *tmp, *next; + struct task_struct *queued; + int ret = 0; + + sem->hp.cpu_task[cpu] = NULL; + list_for_each_safe(tmp, next, &sem->wait.task_list) { + queued = ((struct wq_pair*) + list_entry(tmp, wait_queue_t, + task_list)->private)->tsk; + + /* Compare task prios, find high prio task. */ + if (get_partition(queued) == cpu && + higher_prio(queued, sem->hp.cpu_task[cpu])) { + sem->hp.cpu_task[cpu] = queued; + ret = 1; + } + } + return ret; +} + +int do_pi_down(struct pi_semaphore* sem) +{ + unsigned long flags; + struct task_struct *tsk = current; + struct wq_pair pair; + int suspended = 1; + wait_queue_t wait = { + .private = &pair, + .func = rt_pi_wake_up, + .task_list = {NULL, NULL} + }; + + pair.tsk = tsk; + pair.sem = sem; + spin_lock_irqsave(&sem->wait.lock, flags); + + if (atomic_dec_return(&sem->count) < 0 || + waitqueue_active(&sem->wait)) { + /* we need to suspend */ + tsk->state = TASK_UNINTERRUPTIBLE; + add_wait_queue_exclusive_locked(&sem->wait, &wait); + + TRACE_CUR("suspends on PI lock %p\n", sem); + curr_sched_plugin->pi_block(sem, tsk); + + /* release lock before sleeping */ + spin_unlock_irqrestore(&sem->wait.lock, flags); + + TS_PI_DOWN_END; + preempt_enable_no_resched(); + + + /* we depend on the FIFO order + * Thus, we don't need to recheck when we wake up, we + * are guaranteed to have the lock since there is only one + * wake up per release + */ + schedule(); + + TRACE_CUR("woke up, now owns PI lock %p\n", sem); + + /* try_to_wake_up() set our state to TASK_RUNNING, + * all we need to do is to remove our wait queue entry + */ + remove_wait_queue(&sem->wait, &wait); + } else { + /* no priority inheritance necessary, since there are no queued + * tasks. 
+ */ + suspended = 0; + TRACE_CUR("acquired PI lock %p, no contention\n", sem); + sem->holder = tsk; + sem->hp.task = tsk; + curr_sched_plugin->inherit_priority(sem, tsk); + spin_unlock_irqrestore(&sem->wait.lock, flags); + } + return suspended; +} + +void do_pi_up(struct pi_semaphore* sem) +{ + unsigned long flags; + + spin_lock_irqsave(&sem->wait.lock, flags); + + TRACE_CUR("releases PI lock %p\n", sem); + curr_sched_plugin->return_priority(sem); + sem->holder = NULL; + if (atomic_inc_return(&sem->count) < 1) + /* there is a task queued */ + wake_up_locked(&sem->wait); + + spin_unlock_irqrestore(&sem->wait.lock, flags); +} + +asmlinkage long sys_pi_down(int sem_od) +{ + long ret = 0; + struct pi_semaphore * sem; + int suspended = 0; + + preempt_disable(); + TS_PI_DOWN_START; + + sem = lookup_pi_sem(sem_od); + if (sem) + suspended = do_pi_down(sem); + else + ret = -EINVAL; + + if (!suspended) { + TS_PI_DOWN_END; + preempt_enable(); + } + + return ret; +} + +asmlinkage long sys_pi_up(int sem_od) +{ + long ret = 0; + struct pi_semaphore * sem; + + preempt_disable(); + TS_PI_UP_START; + + sem = lookup_pi_sem(sem_od); + if (sem) + do_pi_up(sem); + else + ret = -EINVAL; + + + TS_PI_UP_END; + preempt_enable(); + + return ret; +} + + +/* ************************************************************************** */ +/* STACK RESOURCE POLICY */ +/* ************************************************************************** */ + + +struct srp_priority { + struct list_head list; + unsigned int period; + pid_t pid; +}; + +#define list2prio(l) list_entry(l, struct srp_priority, list) + +/* SRP task priority comparison function. Smaller periods have highest + * priority, tie-break is PID. Special case: period == 0 <=> no priority + */ +static int srp_higher_prio(struct srp_priority* first, + struct srp_priority* second) +{ + if (!first->period) + return 0; + else + return !second->period || + first->period < second->period || ( + first->period == second->period && + first->pid < second->pid); +} + +struct srp { + struct list_head ceiling; + wait_queue_head_t ceiling_blocked; +}; + + +atomic_t srp_objects_in_use = ATOMIC_INIT(0); + +DEFINE_PER_CPU(struct srp, srp); + + +/* Initialize SRP semaphores at boot time. */ +static int __init srp_init(void) +{ + int i; + + printk("Initializing SRP per-CPU ceilings..."); + for (i = 0; i < NR_CPUS; i++) { + init_waitqueue_head(&per_cpu(srp, i).ceiling_blocked); + INIT_LIST_HEAD(&per_cpu(srp, i).ceiling); + } + printk(" done!\n"); + + return 0; +} +module_init(srp_init); + + +#define system_ceiling(srp) list2prio(srp->ceiling.next) + + +#define UNDEF_SEM -2 + + +/* struct for uniprocessor SRP "semaphore" */ +struct srp_semaphore { + struct srp_priority ceiling; + struct task_struct* owner; + int cpu; /* cpu associated with this "semaphore" and resource */ +}; + +#define ceiling2sem(c) container_of(c, struct srp_semaphore, ceiling) + +static int srp_exceeds_ceiling(struct task_struct* first, + struct srp* srp) +{ + return list_empty(&srp->ceiling) || + get_rt_period(first) < system_ceiling(srp)->period || + (get_rt_period(first) == system_ceiling(srp)->period && + first->pid < system_ceiling(srp)->pid) || + ceiling2sem(system_ceiling(srp))->owner == first; +} + +static void srp_add_prio(struct srp* srp, struct srp_priority* prio) +{ + struct list_head *pos; + if (in_list(&prio->list)) { + printk(KERN_CRIT "WARNING: SRP violation detected, prio is already in " + "ceiling list! 
cpu=%d, srp=%p\n", smp_processor_id(), ceiling2sem(prio)); + return; + } + list_for_each(pos, &srp->ceiling) + if (unlikely(srp_higher_prio(prio, list2prio(pos)))) { + __list_add(&prio->list, pos->prev, pos); + return; + } + + list_add_tail(&prio->list, &srp->ceiling); +} + + +static void* create_srp_semaphore(void) +{ + struct srp_semaphore* sem; + + sem = kmalloc(sizeof(*sem), GFP_KERNEL); + if (!sem) + return NULL; + + INIT_LIST_HEAD(&sem->ceiling.list); + sem->ceiling.period = 0; + sem->cpu = UNDEF_SEM; + sem->owner = NULL; + atomic_inc(&srp_objects_in_use); + return sem; +} + +static noinline int open_srp_semaphore(struct od_table_entry* entry, void* __user arg) +{ + struct srp_semaphore* sem = (struct srp_semaphore*) entry->obj->obj; + int ret = 0; + struct task_struct* t = current; + struct srp_priority t_prio; + + TRACE("opening SRP semaphore %p, cpu=%d\n", sem, sem->cpu); + if (!srp_active()) + return -EBUSY; + + if (sem->cpu == UNDEF_SEM) + sem->cpu = get_partition(t); + else if (sem->cpu != get_partition(t)) + ret = -EPERM; + + if (ret == 0) { + t_prio.period = get_rt_period(t); + t_prio.pid = t->pid; + if (srp_higher_prio(&t_prio, &sem->ceiling)) { + sem->ceiling.period = t_prio.period; + sem->ceiling.pid = t_prio.pid; + } + } + + return ret; +} + +static void destroy_srp_semaphore(void* sem) +{ + /* XXX invariants */ + atomic_dec(&srp_objects_in_use); + kfree(sem); +} + +struct fdso_ops srp_sem_ops = { + .create = create_srp_semaphore, + .open = open_srp_semaphore, + .destroy = destroy_srp_semaphore +}; + + +void do_srp_down(struct srp_semaphore* sem) +{ + /* Update ceiling. */ + srp_add_prio(&__get_cpu_var(srp), &sem->ceiling); + WARN_ON(sem->owner != NULL); + sem->owner = current; + TRACE_CUR("acquired srp 0x%p\n", sem); +} + +void do_srp_up(struct srp_semaphore* sem) +{ + /* Determine new system priority ceiling for this CPU. */ + WARN_ON(!in_list(&sem->ceiling.list)); + if (in_list(&sem->ceiling.list)) + list_del(&sem->ceiling.list); + + sem->owner = NULL; + + /* Wake tasks on this CPU, if they exceed current ceiling. */ + TRACE_CUR("released srp 0x%p\n", sem); + wake_up_all(&__get_cpu_var(srp).ceiling_blocked); +} + +/* Adjust the system-wide priority ceiling if resource is claimed. */ +asmlinkage long sys_srp_down(int sem_od) +{ + int cpu; + int ret = -EINVAL; + struct srp_semaphore* sem; + + /* disabling preemptions is sufficient protection since + * SRP is strictly per CPU and we don't interfere with any + * interrupt handlers + */ + preempt_disable(); + TS_SRP_DOWN_START; + + cpu = smp_processor_id(); + sem = lookup_srp_sem(sem_od); + if (sem && sem->cpu == cpu) { + do_srp_down(sem); + ret = 0; + } + + TS_SRP_DOWN_END; + preempt_enable(); + return ret; +} + +/* Adjust the system-wide priority ceiling if resource is freed. 
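
The SRP rules above come down to two comparisons: srp_higher_prio() orders ceilings by period with a PID tie-break (a zero period meaning "no priority"), and srp_exceeds_ceiling() lets a task proceed only if it beats or already owns the current system ceiling. A standalone restatement of the ordering, useful for checking the tie-break behaviour:

#include <stdio.h>

struct srp_prio { unsigned int period; int pid; };

/* same rule as srp_higher_prio(): a shorter period wins, PIDs break
 * ties, and a zero period never has priority */
static int srp_higher(struct srp_prio a, struct srp_prio b)
{
        if (!a.period)
                return 0;
        return !b.period || a.period < b.period ||
               (a.period == b.period && a.pid < b.pid);
}

int main(void)
{
        struct srp_prio ceiling    = { 50, 10 };        /* current system ceiling */
        struct srp_prio short_task = { 20, 99 };
        struct srp_prio long_task  = { 80,  3 };

        /* the period-20 task exceeds the ceiling, the period-80 task does not */
        printf("%d %d\n", srp_higher(short_task, ceiling),      /* 1 */
               srp_higher(long_task, ceiling));                 /* 0 */
        return 0;
}
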
*/ +asmlinkage long sys_srp_up(int sem_od) +{ + int cpu; + int ret = -EINVAL; + struct srp_semaphore* sem; + + preempt_disable(); + TS_SRP_UP_START; + + cpu = smp_processor_id(); + sem = lookup_srp_sem(sem_od); + + if (sem && sem->cpu == cpu) { + do_srp_up(sem); + ret = 0; + } + + TS_SRP_UP_END; + preempt_enable(); + return ret; +} + +asmlinkage long sys_reg_task_srp_sem(int sem_od) +{ + /* unused */ + return 0; +} + +static int srp_wake_up(wait_queue_t *wait, unsigned mode, int sync, + void *key) +{ + int cpu = smp_processor_id(); + struct task_struct *tsk = wait->private; + if (cpu != get_partition(tsk)) + TRACE_TASK(tsk, "srp_wake_up on wrong cpu, partition is %d\b", + get_partition(tsk)); + else if (srp_exceeds_ceiling(tsk, &__get_cpu_var(srp))) + return default_wake_function(wait, mode, sync, key); + return 0; +} + + + +static void do_ceiling_block(struct task_struct *tsk) +{ + wait_queue_t wait = { + .private = tsk, + .func = srp_wake_up, + .task_list = {NULL, NULL} + }; + + tsk->state = TASK_UNINTERRUPTIBLE; + add_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait); + tsk->rt_param.srp_non_recurse = 1; + preempt_enable_no_resched(); + schedule(); + preempt_disable(); + tsk->rt_param.srp_non_recurse = 0; + remove_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait); +} + +/* Wait for current task priority to exceed system-wide priority ceiling. + */ +void srp_ceiling_block(void) +{ + struct task_struct *tsk = current; + + TS_SRPT_START; + + /* Only applies to real-time tasks, but optimize for RT tasks. */ + if (unlikely(!is_realtime(tsk))) + return; + + /* Avoid recursive ceiling blocking. */ + if (unlikely(tsk->rt_param.srp_non_recurse)) + return; + + /* Bail out early if there aren't any SRP resources around. */ + if (likely(!atomic_read(&srp_objects_in_use))) + return; + + preempt_disable(); + if (!srp_exceeds_ceiling(tsk, &__get_cpu_var(srp))) { + TRACE_CUR("is priority ceiling blocked.\n"); + TS_SRPT_END; + while (!srp_exceeds_ceiling(tsk, &__get_cpu_var(srp))) + do_ceiling_block(tsk); + TRACE_CUR("finally exceeds system ceiling.\n"); + } else { + TS_SRPT_END; + TRACE_CUR("is not priority ceiling blocked\n"); + } + preempt_enable(); +} + +/* ************************************************************************** */ + + + diff --git a/litmus/pcp.c b/litmus/pcp.c new file mode 100644 index 0000000..06030d4 --- /dev/null +++ b/litmus/pcp.c @@ -0,0 +1,764 @@ +/* pcp.c -- Implementations of the PCP, D-PCP, and M-PCP. + * + */ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +/* from sched_rm.c */ +void rm_set_prio(struct task_struct *t, struct pcp_priority* new_prio); + +#define GLOBAL_SEM -1 +#define UNDEF_SEM -2 + +#define get_prio(t) ((t)->rt_param.cur_prio) +#define get_base_prio(t) (&((t)->rt_param.pcp_prio)) + + +struct dpcp_request { + struct list_head list; + struct completion done; + long arg; + lt_t prio; + int pid; +}; + +struct pcp_semaphore { + int cpu; + + /* waiting tasks */ + wait_queue_head_t blocked; + struct pcp_priority* blocked_prio; + + /* system ceiling support */ + struct list_head list; + struct pcp_priority ceiling; + + /* task_struct owned_semaphore list */ + struct list_head owned_list; + + /* Current lock holder. + * NULL implies unlocked. 
+ */ + struct task_struct* holder; + + /* D-PCP support */ + spinlock_t dpcp_lock; + struct list_head dpcp_requests; + int dpcp_count; + struct dpcp_request* dpcp_current; + struct completion dpcp_job; + struct task_struct* dpcp_agent; +}; + +static DEFINE_PER_CPU(spinlock_t, pcp_lock); +static DEFINE_PER_CPU(struct list_head, sys_ceiling); + +static noinline void init_pcp_sem(struct pcp_semaphore *sem, int cpu) +{ + sem->cpu = cpu; + init_waitqueue_head(&sem->blocked); + INIT_LIST_HEAD(&sem->list); + INIT_LIST_HEAD(&sem->owned_list); + INIT_LIST_HEAD(&sem->dpcp_requests); + sem->holder = NULL; + sem->dpcp_current = NULL; + sem->blocked_prio = NULL; + sem->ceiling = (struct pcp_priority) {ULLONG_MAX, 0, INT_MAX}; + init_completion(&sem->dpcp_job); + spin_lock_init(&sem->dpcp_lock); + sem->dpcp_count = 0; + sem->dpcp_agent = NULL; +} + +static noinline int tsk_pcp_higher_prio(struct task_struct* t, + struct pcp_priority* p2) +{ + return _rm_higher_prio(t->rt_param.cur_prio, p2); +} + +static noinline struct pcp_semaphore* get_ceiling(int cpu) +{ + struct list_head *ceil_list = &per_cpu(sys_ceiling, cpu); + if (list_empty(ceil_list)) + return NULL; + return list_entry(ceil_list->next, struct pcp_semaphore, list); +} + +static noinline void raise_ceiling(struct pcp_semaphore* sem, int cpu) +{ + struct list_head *ceil_list = &per_cpu(sys_ceiling, cpu); + list_add(&sem->list, ceil_list); +} + +static noinline int exceeds_ceiling(struct task_struct* t, + struct pcp_semaphore* ceil) +{ + return !ceil || ceil->holder == t || + tsk_pcp_higher_prio(t, &ceil->ceiling); +} + +static noinline void give_priority(struct task_struct* t, struct pcp_semaphore* sem) +{ + struct pcp_semaphore* next; + /* sem->blocked_prio can be NULL, but _rm_higher_prio() handles that */ + + /* only update if we actually exceed existing priorities */ + if (_rm_higher_prio(get_prio(t), sem->blocked_prio) && + _rm_higher_prio(get_prio(t), get_base_prio(sem->holder))) { + /* we need to register our priority */ + sem->blocked_prio = get_prio(t); + + /* only update task if it results in a priority increase */ + if (_rm_higher_prio(get_prio(t), get_prio(sem->holder))) { + /* update prio */ + TRACE("PCP: %s/%d inherits from %s/%d\n", + sem->holder->comm, sem->holder->pid, + t->comm, t->pid); + rm_set_prio(sem->holder, get_prio(t)); + /* check if recipient is blocked, too */ + next = sem->holder->rt_param.blocked_on; + if (next) + /* Transitive priority inheritance. + * Recurse. 
+ */ + give_priority(sem->holder, next); + } + } +} + +static noinline long local_pcp_down(struct pcp_semaphore *sem) +{ + long ret = 0; + struct task_struct* t = current; + struct pcp_semaphore* ceiling; + int cpu; + int ceiling_passed = 0; + + /* don't allow recursive locking */ + if (sem->holder == t) + return -EINVAL; + + cpu = smp_processor_id(); + if (cpu != sem->cpu) { + preempt_enable(); + return -EPERM; + } + + + /* first we need to pass the local system ceiling */ + while (!ceiling_passed) { + ceiling = get_ceiling(cpu); + TRACE_TASK(t, "PCP: I want %p, ceiling is %p\n", sem, ceiling); + ceiling_passed = exceeds_ceiling(t, ceiling); + if (!ceiling_passed) { + /* block on sys_ceiling */ + DECLARE_WAITQUEUE(waitq, t); + TRACE_TASK(t, "blocks on PCP system ceiling\n"); + add_wait_queue(&ceiling->blocked, &waitq); + /* initiate priority inheritance */ + give_priority(t, ceiling); + t->rt_param.blocked_on = ceiling; + t->state = TASK_UNINTERRUPTIBLE; + preempt_enable_no_resched(); + TS_PCP1_DOWN_END; + schedule(); + preempt_disable(); + t->rt_param.blocked_on = NULL; + remove_wait_queue(&ceiling->blocked, &waitq); + } else { + if (ceiling) + TRACE_TASK(t, + "system ceiling passed: {%llu, %d, %d} < " + "{%llu, %d, %d}\n", + ceiling->ceiling.prio, + ceiling->ceiling.in_global_cs, + ceiling->ceiling.pid, + t->rt_param.cur_prio->prio, + t->rt_param.cur_prio->in_global_cs, + t->rt_param.cur_prio->pid + ); + else + TRACE_TASK(t, + "system ceiling passed: NULL < " + "{%llu, %d, %d}\n", + t->rt_param.cur_prio->prio, + t->rt_param.cur_prio->in_global_cs, + t->rt_param.cur_prio->pid + ); + TS_PCP1_DOWN_END; + } + } + + TS_PCP2_DOWN_START; + /* Since we have passed the priority ceiling the semaphore cannot be + * in use. If it were in use then the ceiling would be at least as high + * as our priority. + */ + WARN_ON(sem->holder); + + TRACE_TASK(t, "taking PCP semaphore 0x%p, owner:%p\n", sem, sem->holder); + + /* We can become the owner. */ + sem->holder = t; + list_add(&sem->owned_list, &t->rt_param.owned_semaphores); + + /* We need to update the system ceiling, but only + * if the new ceiling is higher than the old. 
+ */ + ceiling = get_ceiling(cpu); + /* if the priorities are equal then t already owns ceiling, + * otherwise it would not have gotten past the system ceiling + */ + if (!ceiling || _rm_higher_prio(&sem->ceiling, &ceiling->ceiling)) { + raise_ceiling(sem, cpu); + TRACE_TASK(t, "raised ceiling on %d\n", cpu); + } + + TS_PCP2_DOWN_END; + return ret; +} + +static noinline struct pcp_priority* fetch_highest_prio(struct task_struct *t) +{ + struct pcp_priority *prio; + struct list_head* pos; + struct pcp_semaphore* sem; + + /* base case is that the task uses its normal priority */ + prio = get_base_prio(t); + + /* now search the list of semaphores that we own for a higher priority + * to inherit + */ + list_for_each(pos, &t->rt_param.owned_semaphores) { + sem = list_entry(pos, struct pcp_semaphore, owned_list); + /* sem->blocked_prio could be NULL */ + if (!_rm_higher_prio(prio, sem->blocked_prio)) + prio = sem->blocked_prio; + } + return prio; +} + +static noinline long local_pcp_up(struct pcp_semaphore *sem) +{ + long ret = 0; + struct task_struct* t = current; + int cpu; + + cpu = smp_processor_id(); + + if (cpu != sem->cpu) + return -EPERM; + + if (sem->holder == t) { + TRACE_TASK(t, "giving up PCP semaphore 0x%p.\n", sem); + + /* we need to unblock all tasks in the wait_queue */ + wake_up_all(&sem->blocked); + + /* unlock semaphore */ + sem->holder = NULL; + list_del(&sem->owned_list); + + /* remove from system ceiling list */ + if (in_list(&sem->list)) + list_del(&sem->list); + + if (sem->blocked_prio == get_prio(t)) { + /* We are currently inheriting from this + * semaphore. We need to figure out which priority + * we should fall back to. + */ + TRACE_TASK(t, "giving up inherited prio.\n"); + rm_set_prio(t, fetch_highest_prio(t)); + } + /* reset semaphore priority inheritance */ + sem->blocked_prio = NULL; + } else { + TRACE_TASK(t, "local_pcp_up EINVAL 0x%p.\n", sem); + ret = -EINVAL; + } + + TS_PCP_UP_END; + return ret; +} + +static noinline struct task_struct* wqlist2task(struct list_head* l) +{ + return (struct task_struct*) + list_entry(l, wait_queue_t, task_list)->private; +} + +static noinline int wait_order(struct list_head* la, struct list_head* lb) +{ + return rm_higher_prio(wqlist2task(la), wqlist2task(lb)); +} + +/* The default function is too picky. + * We really only want to wake up one task. + */ +int single_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) +{ + int ret = default_wake_function(wait, mode, sync, key); + if (!ret) + TRACE("Overriding default_wake_function() return code.\n"); + return 1; +} + +static noinline long global_pcp_down(struct pcp_semaphore* sem) +{ + unsigned long flags; + long ret = 0; + struct task_struct* t = current; + + /* don't allow recursive locking */ + if (sem->holder == t) + return -EINVAL; + + spin_lock_irqsave(&sem->blocked.lock, flags); + + /* Get the global priority. Do this before + * we block, so that we wake up as a high-priority task. + */ + t->rt_param.pcp_prio.in_global_cs = 1; + rm_set_prio(t, &t->rt_param.pcp_prio); + + if (sem->holder) { + /* semaphore is not free. We need to block. 
*/ + DECLARE_WAITQUEUE(waitq, t); + TRACE_TASK(t, "blocks on MPCP semaphore %p.\n", sem); + waitq.flags = WQ_FLAG_EXCLUSIVE; + waitq.func = single_wake_function; + /* insert ordered by priority */ + list_insert(&waitq.task_list, &sem->blocked.task_list, + wait_order); + t->state = TASK_UNINTERRUPTIBLE; + spin_unlock_irqrestore(&sem->blocked.lock, flags); + preempt_enable_no_resched(); + TS_MPCP_DOWN_END; + + schedule(); + + preempt_disable(); + /* once we wake up we are the owner of the lock */ + spin_lock_irqsave(&sem->blocked.lock, flags); + remove_wait_queue_locked(&sem->blocked, &waitq); + } else { + /* semaphore is free. We can proceed. */ + TS_MPCP_DOWN_END; + sem->holder = t; + } + if (sem->holder != t) { + if (sem->holder) + TRACE("expected %s/%d, but I am %s/%d\n", + sem->holder->comm, sem->holder->pid, t->comm, t->pid); + else + TRACE("expected NULL, but I am %s/%d\n", + t->comm, t->pid); + } + TRACE_TASK(t, "acquired MPCP semaphore %p.\n", sem); + + + spin_unlock_irqrestore(&sem->blocked.lock, flags); + return ret; +} + +static noinline long global_pcp_up(struct pcp_semaphore* sem) +{ + unsigned long flags; + long ret = 0; + struct task_struct* t = current; + + if (sem->holder != t) + return -EINVAL; + + TRACE_TASK(t, "releasing MPCP semaphore %p.\n", sem); + + spin_lock_irqsave(&sem->blocked.lock, flags); + if (waitqueue_active(&sem->blocked)) { + /* pass ownership on */ + sem->holder = wqlist2task(sem->blocked.task_list.next); + TRACE_TASK(t, "waking up next (=%s/%d) on MPCP semaphore %p.\n", + sem->holder->comm, sem->holder->pid, sem); + /* wake up first */ + wake_up_locked(&sem->blocked); + } else + sem->holder = NULL; + + /* restore our own priority */ + t->rt_param.pcp_prio.in_global_cs = 0; + rm_set_prio(t, &t->rt_param.pcp_prio); + + TS_MPCP_UP_END; + spin_unlock_irqrestore(&sem->blocked.lock, flags); + return ret; +} + +static noinline int request_order(struct list_head* la, struct list_head* lb) +{ + struct dpcp_request *a, *b; + a = list_entry(la, struct dpcp_request, list); + b = list_entry(lb, struct dpcp_request, list); + return a->prio < b->prio; +} + +static noinline long dpcp_invoke(struct pcp_semaphore* sem, long arg) +{ + unsigned long flags; + long ret = 0; + struct task_struct* t = current, *a; + struct dpcp_request req; + + spin_lock_irqsave(&sem->dpcp_lock, flags); + + init_completion(&req.done); + req.arg = arg; + req.prio = t->rt_param.pcp_prio.prio; + req.pid = t->rt_param.pcp_prio.pid; + + list_insert(&req.list, &sem->dpcp_requests, + request_order); + + if (!(sem->dpcp_count++)) { + /* agent needs to be awakened */ + TRACE_TASK(t, "waking DPCP agent for %p.\n", sem); + if (sem->dpcp_agent) { + a = sem->dpcp_agent; + /* set agent priority */ + a->rt_param.pcp_prio.in_global_cs = 1; + a->rt_param.pcp_prio.prio = req.prio; + rm_set_prio(a, &a->rt_param.pcp_prio); + } + complete(&sem->dpcp_job); + } + + spin_unlock_irqrestore(&sem->dpcp_lock, flags); + TRACE_TASK(t, "blocking on DPCP sem %p.\n", sem); + preempt_enable_no_resched(); + TS_DPCP_INVOKE_END; + + wait_for_completion(&req.done); + + preempt_disable(); + /* we don't need to clean up, the remote agent did that for us */ + return ret; +} + +static noinline long dpcp_agent(struct pcp_semaphore* sem, long flags, long *arg) +{ + unsigned long spinflags; + long ret = 0; + struct task_struct* t = current; + + spin_lock_irqsave(&sem->dpcp_lock, spinflags); + + /* defend against multiple concurrent agents */ + if (sem->dpcp_agent && sem->dpcp_agent != t) { + spin_unlock_irqrestore(&sem->dpcp_lock, 
spinflags); + return -EBUSY; + } else + sem->dpcp_agent = t; + + if (sem->cpu != get_partition(t)) { + int cpu = smp_processor_id(); + spin_unlock_irqrestore(&sem->dpcp_lock, spinflags); + printk(KERN_CRIT + "dpcp_agent: sem->cpu: %d, but agent " + "is on %d, and part=%d\n", + sem->cpu, cpu, get_partition(t)); + return -EINVAL; + } + + if ((flags & DPCP_COMPLETE) && sem->dpcp_current) { + TRACE_TASK(t, "completing DPCP sem %p.\n", sem); + /* we need to release the holder */ + complete(&sem->dpcp_current->done); + sem->dpcp_count--; + sem->dpcp_current = NULL; + } + + if (flags & DPCP_WAIT) { + do { + if (sem->dpcp_count) { + /* pass ownership on */ + sem->dpcp_current = list_entry( + sem->dpcp_requests.next, + struct dpcp_request, list); + list_del(sem->dpcp_requests.next); + t->rt_param.pcp_prio.in_global_cs = 1; + t->rt_param.pcp_prio.prio = + sem->dpcp_current->prio; + t->rt_param.pcp_prio.pid = sem->dpcp_current->pid; + rm_set_prio(t, &t->rt_param.pcp_prio); + TS_DPCP_AGENT2_END; + } else { + /* need to wait */ + spin_unlock_irqrestore(&sem->dpcp_lock, + spinflags); + TRACE_TASK(t, "agent waiting for " + "DPCP sem %p.\n", sem); + + preempt_enable_no_resched(); + TS_DPCP_AGENT2_END; + ret = wait_for_completion_interruptible(&sem->dpcp_job); + preempt_disable(); + TRACE_TASK(t, "got DPCP job on sem %p, " + "ret=%d.\n", sem, ret); + spin_lock_irqsave(&sem->dpcp_lock, spinflags); + if (ret != 0) { + /* FIXME: set priority */ + break; + } + } + } while (!sem->dpcp_current); + if (ret == 0) + *arg = sem->dpcp_current->arg; + } else { + /* restore our own priority */ + t->rt_param.pcp_prio.in_global_cs = 0; + t->rt_param.pcp_prio.prio = ULLONG_MAX; + rm_set_prio(t, &t->rt_param.pcp_prio); + sem->dpcp_agent = NULL; + } + + spin_unlock_irqrestore(&sem->dpcp_lock, spinflags); + return ret; +} + + +/* system calls */ + +asmlinkage long sys_pcp_down(int sem_od) +{ + long ret = 0; + struct pcp_semaphore * sem; + + preempt_disable(); + TS_MPCP_DOWN_START; + TS_PCP1_DOWN_START; + + if (!is_realtime(current)) { + ret = -EPERM; + goto out; + } + + sem = lookup_pcp_sem(sem_od); + if (sem) { + if (sem->cpu != GLOBAL_SEM) + ret = local_pcp_down(sem); + else + ret = global_pcp_down(sem); + } else + ret = -EINVAL; + +out: + preempt_enable(); + return ret; +} + +asmlinkage long sys_pcp_up(int sem_od) +{ + long ret = 0; + struct pcp_semaphore * sem; + + preempt_disable(); + TS_PCP_UP_START; + TS_MPCP_UP_START; + + if (!is_realtime(current)) { + ret = -EPERM; + goto out; + } + + sem = lookup_pcp_sem(sem_od); + if (sem) { + if (sem->cpu != GLOBAL_SEM) + ret = local_pcp_up(sem); + else + ret = global_pcp_up(sem); + } else + ret = -EINVAL; + +out: + preempt_enable(); + return ret; +} + + +asmlinkage long sys_dpcp_invoke(int sem_od, long arg) +{ + long ret = 0; + struct pcp_semaphore * sem; + + preempt_disable(); + TS_DPCP_INVOKE_START; + + if (!is_realtime(current)) { + ret = -EPERM; + goto out; + } + + sem = lookup_pcp_sem(sem_od); + if (sem) { + ret = dpcp_invoke(sem, arg); + } else + ret = -EINVAL; + +out: + preempt_enable(); + return ret; +} + +asmlinkage long sys_dpcp_agent(int sem_od, long flags, long __user *__arg) +{ + long ret = 0; + long arg; + struct pcp_semaphore * sem; + + preempt_disable(); + TS_DPCP_AGENT1_START; + + if (!is_realtime(current)) { + ret = -EPERM; + goto out; + } + + sem = lookup_pcp_sem(sem_od); + if (sem) { + TS_DPCP_AGENT1_END; + if (flags & DPCP_COMPLETE) { + TS_PCP_UP_START; + local_pcp_up(sem); + } + TS_DPCP_AGENT2_START; + ret = dpcp_agent(sem, flags, &arg); + if (ret == 0 && 
(flags & DPCP_WAIT)) { + ret = put_user(arg, __arg); + if (ret == 0) { + TS_PCP1_DOWN_START; + local_pcp_down(sem); + } + } + } else + ret = -EINVAL; + +out: + preempt_enable(); + return ret; +} + + +/* FDSO callbacks */ + +static noinline void* create_pcp_semaphore(void) +{ + struct pcp_semaphore* sem; + + sem = kmalloc(sizeof(struct pcp_semaphore), GFP_KERNEL); + if (!sem) + return NULL; + init_pcp_sem(sem, UNDEF_SEM); + TRACE("allocated PCP semaphore %p\n", sem); + return sem; +} + +static noinline void destroy_pcp_semaphore(void* obj) +{ + struct pcp_semaphore* sem = (struct pcp_semaphore*) obj; + WARN_ON(sem->holder); + WARN_ON(in_list(&sem->list)); + kfree(sem); +} + +static noinline void update_pcp_ceiling(struct pcp_semaphore* sem, struct task_struct* t, int global) +{ + struct pcp_priority prio = {get_rt_period(t), 1, t->pid}; + if (global && !sem->ceiling.in_global_cs) + sem->ceiling.in_global_cs = 1; + if (_rm_higher_prio(&prio, &sem->ceiling)) + sem->ceiling = prio; +} + +static noinline int open_pcp_semaphore(struct od_table_entry* entry, void __user *__arg) +{ + struct pcp_semaphore* sem = (struct pcp_semaphore*) entry->obj->obj; + int *arg = (int*) __arg; + struct task_struct* t = current; + int cpu= get_partition(t); + + TRACE("opening PCP semaphore %p, cpu=%d\n", sem, sem->cpu); + if (!pcp_active()) + return -EBUSY; + + if (arg && get_user(cpu, arg) != 0) + return -EFAULT; + + if (sem->cpu == UNDEF_SEM) + sem->cpu = cpu; + + update_pcp_ceiling(sem, t, sem->cpu != get_partition(t)); + + return 0; +} + +static noinline void update_mpcp_ceiling(struct pcp_semaphore* sem, struct task_struct* t) +{ + struct pcp_priority prio = {get_rt_period(t), 1, t->pid}; + if (_rm_higher_prio(&prio, &sem->ceiling)) + sem->ceiling = prio; +} + +static noinline int open_mpcp_semaphore(struct od_table_entry* entry, void* __user arg) +{ + struct pcp_semaphore* sem = (struct pcp_semaphore*) entry->obj->obj; + int ret = 0; + struct task_struct* t = current; + + if (!pcp_active()) + return -EBUSY; + + if (sem->cpu == UNDEF_SEM) + sem->cpu = GLOBAL_SEM; + + update_mpcp_ceiling(sem, t); + + return ret; +} + +struct fdso_ops pcp_sem_ops = { + .create = create_pcp_semaphore, + .destroy = destroy_pcp_semaphore, + .open = open_pcp_semaphore +}; + +struct fdso_ops mpcp_sem_ops = { + .create = create_pcp_semaphore, + .destroy = destroy_pcp_semaphore, + .open = open_mpcp_semaphore +}; + +static noinline int __init pcp_boot_init(void) +{ + int i; + + printk("Initializing PCP per-CPU ceilings..."); + for (i = 0; i < NR_CPUS; i++) { + INIT_LIST_HEAD(&per_cpu(sys_ceiling, i)); + per_cpu(pcp_lock, i) = __SPIN_LOCK_UNLOCKED(pcp_lock); + } + printk(" done!\n"); + + return 0; +} + +module_init(pcp_boot_init); diff --git a/litmus/rm_common.c b/litmus/rm_common.c new file mode 100644 index 0000000..9bf21fd --- /dev/null +++ b/litmus/rm_common.c @@ -0,0 +1,76 @@ +/* + * litmus/rm_common.c + * + * Common functions for RM based schedulers. + * + * FIXME: Too much code duplication with edf_common.c + */ + +#include +#include +#include + +#include +#include +#include + + +#include + +/* rm_higher_prio - returns true if first has a higher RM priority + * than second. Period ties are broken by PID. + * + * first first must not be NULL and a real-time task. + * second may be NULL or a non-rt task. 
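+ *
+ * Illustration only (a sketch, not used by the code): the fields are
+ * initialized as in sched_rm.c, where prio holds the task's period,
+ * so a smaller prio value means a higher RM priority:
+ *
+ *	struct pcp_priority a = { .prio = 10, .in_global_cs = 0, .pid = 101 };
+ *	struct pcp_priority b = { .prio = 20, .in_global_cs = 0, .pid =  99 };
+ *
+ *	_rm_higher_prio(&a, &b)	- true, a has the shorter period
+ *	b.prio = 10;
+ *	_rm_higher_prio(&a, &b)	- periods tie, the PID decides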
+ */ +int rm_higher_prio(struct task_struct* first, + struct task_struct* second) +{ + struct pcp_priority *p1, *p2; + + /* verify assumptions in DEBUG build */ + BUG_ON(!first); + BUG_ON(!is_realtime(first)); + BUG_ON(second && !is_realtime(second) && second->rt_param.cur_prio); + + p1 = first->rt_param.cur_prio; + + /* if second is not a real-time task, then cur_prio is NULL */ + p2 = second ? second->rt_param.cur_prio : NULL; + return _rm_higher_prio(p1, p2); +} + +int rm_ready_order(struct list_head* a, struct list_head* b) +{ + return rm_higher_prio( + list_entry(a, struct task_struct, rt_list), + list_entry(b, struct task_struct, rt_list)); +} + + +void rm_domain_init(rt_domain_t* rt, check_resched_needed_t resched) +{ + rt_domain_init(rt, resched, rm_ready_order); +} + +/* need_to_preempt - check whether the task t needs to be preempted + * call only with irqs disabled and with ready_lock acquired + * THIS DOES NOT TAKE NON-PREEMPTIVE SECTIONS INTO ACCOUNT! + */ +int rm_preemption_needed(rt_domain_t* rt, struct task_struct *t) +{ + /* we need the read lock for edf_ready_queue */ + /* no need to preempt if there is nothing pending */ + if (!ready_jobs_pending(rt)) + return 0; + /* we need to reschedule if t doesn't exist */ + if (!t) + return 1; + + /* NOTE: We cannot check for non-preemptibility since we + * don't know what address space we're currently in. + */ + + /* make sure to get non-rt stuff out of the way */ + return !is_realtime(t) || rm_higher_prio(next_ready(rt), t); +} diff --git a/litmus/rt_domain.c b/litmus/rt_domain.c new file mode 100644 index 0000000..fe7bd29 --- /dev/null +++ b/litmus/rt_domain.c @@ -0,0 +1,130 @@ +/* + * kernel/rt_domain.c + * + * LITMUS real-time infrastructure. This file contains the + * functions that manipulate RT domains. RT domains are an abstraction + * of a ready queue and a release queue. + */ + +#include +#include +#include + +#include +#include +#include + +#include + + +static int dummy_resched(rt_domain_t *rt) +{ + return 0; +} + +static int dummy_order(struct list_head* a, struct list_head* b) +{ + return 0; +} + +int release_order(struct list_head* a, struct list_head* b) +{ + return earlier_release( + list_entry(a, struct task_struct, rt_list), + list_entry(b, struct task_struct, rt_list)); +} + + +void rt_domain_init(rt_domain_t *rt, + check_resched_needed_t f, + list_cmp_t order) +{ + BUG_ON(!rt); + if (!f) + f = dummy_resched; + if (!order) + order = dummy_order; + INIT_LIST_HEAD(&rt->ready_queue); + INIT_LIST_HEAD(&rt->release_queue); + rt->ready_lock = RW_LOCK_UNLOCKED; + rt->release_lock = SPIN_LOCK_UNLOCKED; + rt->check_resched = f; + rt->order = order; +} + +/* add_ready - add a real-time task to the rt ready queue. It must be runnable. + * @new: the newly released task + */ +void __add_ready(rt_domain_t* rt, struct task_struct *new) +{ + TRACE("rt: adding %s/%d (%llu, %llu) to ready queue at %llu\n", + new->comm, new->pid, get_exec_cost(new), get_rt_period(new), + sched_clock()); + + if (!list_insert(&new->rt_list, &rt->ready_queue, rt->order)) + rt->check_resched(rt); +} + +struct task_struct* __take_ready(rt_domain_t* rt) +{ + struct task_struct *t = __peek_ready(rt); + + /* kick it out of the ready list */ + if (t) + list_del(&t->rt_list); + return t; +} + +struct task_struct* __peek_ready(rt_domain_t* rt) +{ + if (!list_empty(&rt->ready_queue)) + return next_ready(rt); + else + return NULL; +} + +/* add_release - add a real-time task to the rt release queue. 
+ * @task: the sleeping task + */ +void __add_release(rt_domain_t* rt, struct task_struct *task) +{ + TRACE("rt: adding %s/%d (%llu, %llu) rel=%llu to release queue\n", + task->comm, task->pid, get_exec_cost(task), get_rt_period(task), + get_release(task)); + + list_insert(&task->rt_list, &rt->release_queue, release_order); +} + +void __release_pending(rt_domain_t* rt) +{ + struct list_head *pos, *save; + struct task_struct *queued; + lt_t now = sched_clock(); + list_for_each_safe(pos, save, &rt->release_queue) { + queued = list_entry(pos, struct task_struct, rt_list); + if (likely(is_released(queued, now))) { + /* this one is ready to go*/ + list_del(pos); + set_rt_flags(queued, RT_F_RUNNING); + + sched_trace_job_release(queued); + + /* now it can be picked up */ + barrier(); + add_ready(rt, queued); + } + else + /* the release queue is ordered */ + break; + } +} + +void try_release_pending(rt_domain_t* rt) +{ + unsigned long flags; + + if (spin_trylock_irqsave(&rt->release_lock, flags)) { + __release_pending(rt); + spin_unlock_irqrestore(&rt->release_lock, flags); + } +} diff --git a/litmus/sched_gsn_edf.c b/litmus/sched_gsn_edf.c new file mode 100644 index 0000000..314f8a1 --- /dev/null +++ b/litmus/sched_gsn_edf.c @@ -0,0 +1,733 @@ +/* + * kernel/sched_gsn_edf.c + * + * Implementation of the GSN-EDF scheduling algorithm. + * + * This version uses the simple approach and serializes all scheduling + * decisions by the use of a queue lock. This is probably not the + * best way to do it, but it should suffice for now. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +/* Overview of GSN-EDF operations. + * + * For a detailed explanation of GSN-EDF have a look at the FMLP paper. This + * description only covers how the individual operations are implemented in + * LITMUS. + * + * link_task_to_cpu(T, cpu) - Low-level operation to update the linkage + * structure (NOT the actually scheduled + * task). If there is another linked task To + * already it will set To->linked_on = NO_CPU + * (thereby removing its association with this + * CPU). However, it will not requeue the + * previously linked task (if any). It will set + * T's state to RT_F_RUNNING and check whether + * it is already running somewhere else. If T + * is scheduled somewhere else it will link + * it to that CPU instead (and pull the linked + * task to cpu). T may be NULL. + * + * unlink(T) - Unlink removes T from all scheduler data + * structures. If it is linked to some CPU it + * will link NULL to that CPU. If it is + * currently queued in the gsnedf queue it will + * be removed from the T->rt_list. It is safe to + * call unlink(T) if T is not linked. T may not + * be NULL. + * + * requeue(T) - Requeue will insert T into the appropriate + * queue. If the system is in real-time mode and + * the T is released already, it will go into the + * ready queue. If the system is not in + * real-time mode is T, then T will go into the + * release queue. If T's release time is in the + * future, it will go into the release + * queue. That means that T's release time/job + * no/etc. has to be updated before requeu(T) is + * called. It is not safe to call requeue(T) + * when T is already queued. T may not be NULL. + * + * gsnedf_job_arrival(T) - This is the catch all function when T enters + * the system after either a suspension or at a + * job release. 
It will queue T (which means it + * is not safe to call gsnedf_job_arrival(T) if + * T is already queued) and then check whether a + * preemption is necessary. If a preemption is + * necessary it will update the linkage + * accordingly and cause scheduled to be called + * (either with an IPI or need_resched). It is + * safe to call gsnedf_job_arrival(T) if T's + * next job has not been actually released yet + * (releast time in the future). T will be put + * on the release queue in that case. + * + * job_completion(T) - Take care of everything that needs to be done + * to prepare T for its next release and place + * it in the right queue with + * gsnedf_job_arrival(). + * + * + * When we now that T is linked to CPU then link_task_to_cpu(NULL, CPU) is + * equivalent to unlink(T). Note that if you unlink a task from a CPU none of + * the functions will automatically propagate pending task from the ready queue + * to a linked task. This is the job of the calling function ( by means of + * __take_ready). + */ + + +/* cpu_entry_t - maintain the linked and scheduled state + */ +typedef struct { + int cpu; + struct task_struct* linked; /* only RT tasks */ + struct task_struct* scheduled; /* only RT tasks */ + struct list_head list; + atomic_t will_schedule; /* prevent unneeded IPIs */ +} cpu_entry_t; +DEFINE_PER_CPU(cpu_entry_t, gsnedf_cpu_entries); + +#define set_will_schedule() \ + (atomic_set(&__get_cpu_var(gsnedf_cpu_entries).will_schedule, 1)) +#define clear_will_schedule() \ + (atomic_set(&__get_cpu_var(gsnedf_cpu_entries).will_schedule, 0)) +#define test_will_schedule(cpu) \ + (atomic_read(&per_cpu(gsnedf_cpu_entries, cpu).will_schedule)) + + +#define NO_CPU 0xffffffff + +/* The gsnedf_lock is used to serialize all scheduling events. + * It protects + */ +static DEFINE_SPINLOCK(gsnedf_lock); +/* the cpus queue themselves according to priority in here */ +static LIST_HEAD(gsnedf_cpu_queue); + +static rt_domain_t gsnedf; + + +/* update_cpu_position - Move the cpu entry to the correct place to maintain + * order in the cpu queue. Caller must hold gsnedf lock. + * + * This really should be a heap. + */ +static void update_cpu_position(cpu_entry_t *entry) +{ + cpu_entry_t *other; + struct list_head *pos; + + if (likely(in_list(&entry->list))) + list_del(&entry->list); + /* if we do not execute real-time jobs we just move + * to the end of the queue + */ + if (entry->linked) { + list_for_each(pos, &gsnedf_cpu_queue) { + other = list_entry(pos, cpu_entry_t, list); + if (edf_higher_prio(entry->linked, other->linked)) { + __list_add(&entry->list, pos->prev, pos); + return; + } + } + } + /* if we get this far we have the lowest priority job */ + list_add_tail(&entry->list, &gsnedf_cpu_queue); +} + +/* link_task_to_cpu - Update the link of a CPU. + * Handles the case where the to-be-linked task is already + * scheduled on a different CPU. + */ +static noinline void link_task_to_cpu(struct task_struct* linked, + cpu_entry_t *entry) +{ + cpu_entry_t *sched; + struct task_struct* tmp; + int on_cpu; + + BUG_ON(linked && !is_realtime(linked)); + + /* Currently linked task is set to be unlinked. */ + if (entry->linked) { + entry->linked->rt_param.linked_on = NO_CPU; + } + + /* Link new task to CPU. */ + if (linked) { + set_rt_flags(linked, RT_F_RUNNING); + /* handle task is already scheduled somewhere! 
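+ * If it is, we must not link it here while it is still running on
+ * another CPU. The code below instead swaps the linked tasks of the
+ * two CPUs: the running job stays linked to the CPU it occupies, and
+ * whatever that CPU had linked before becomes our candidate.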
*/ + on_cpu = linked->rt_param.scheduled_on; + if (on_cpu != NO_CPU) { + sched = &per_cpu(gsnedf_cpu_entries, on_cpu); + /* this should only happen if not linked already */ + BUG_ON(sched->linked == linked); + + /* If we are already scheduled on the CPU to which we + * wanted to link, we don't need to do the swap -- + * we just link ourselves to the CPU and depend on + * the caller to get things right. + */ + if (entry != sched) { + tmp = sched->linked; + linked->rt_param.linked_on = sched->cpu; + sched->linked = linked; + update_cpu_position(sched); + linked = tmp; + } + } + if (linked) /* might be NULL due to swap */ + linked->rt_param.linked_on = entry->cpu; + } + entry->linked = linked; + update_cpu_position(entry); +} + +/* unlink - Make sure a task is not linked any longer to an entry + * where it was linked before. Must hold gsnedf_lock. + */ +static noinline void unlink(struct task_struct* t) +{ + cpu_entry_t *entry; + + if (unlikely(!t)) { + TRACE_BUG_ON(!t); + return; + } + + if (t->rt_param.linked_on != NO_CPU) { + /* unlink */ + entry = &per_cpu(gsnedf_cpu_entries, t->rt_param.linked_on); + t->rt_param.linked_on = NO_CPU; + link_task_to_cpu(NULL, entry); + } else if (in_list(&t->rt_list)) { + /* This is an interesting situation: t is scheduled, + * but was just recently unlinked. It cannot be + * linked anywhere else (because then it would have + * been relinked to this CPU), thus it must be in some + * queue. We must remove it from the list in this + * case. + */ + list_del(&t->rt_list); + } +} + + +/* preempt - force a CPU to reschedule + */ +static noinline void preempt(cpu_entry_t *entry) +{ + /* We cannot make the is_np() decision here if it is a remote CPU + * because requesting exit_np() requires that we currently use the + * address space of the task. Thus, in the remote case we just send + * the IPI and let schedule() handle the problem. + */ + + if (smp_processor_id() == entry->cpu) { + if (entry->scheduled && is_np(entry->scheduled)) + request_exit_np(entry->scheduled); + else + set_tsk_need_resched(current); + } else + /* in case that it is a remote CPU we have to defer the + * the decision to the remote CPU + * FIXME: We could save a few IPI's here if we leave the flag + * set when we are waiting for a np_exit(). + */ + if (!test_will_schedule(entry->cpu)) + smp_send_reschedule(entry->cpu); +} + +/* requeue - Put an unlinked task into gsn-edf domain. + * Caller must hold gsnedf_lock. + */ +static noinline void requeue(struct task_struct* task) +{ + BUG_ON(!task); + /* sanity check rt_list before insertion */ + BUG_ON(in_list(&task->rt_list)); + + if (get_rt_flags(task) == RT_F_SLEEP) { + /* this task has expired + * _schedule has already taken care of updating + * the release and + * deadline. We just must check if it has been released. 
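+ * (RT_F_SLEEP was set by job_completion(), which also called
+ * prepare_for_next_period(), so the release time consulted by
+ * is_released() below already belongs to the *next* job.)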
+ */ + if (is_released(task, sched_clock())) + __add_ready(&gsnedf, task); + else { + /* it has got to wait */ + __add_release(&gsnedf, task); + } + + } else + /* this is a forced preemption + * thus the task stays in the ready_queue + * we only must make it available to others + */ + __add_ready(&gsnedf, task); +} + +/* gsnedf_job_arrival: task is either resumed or released */ +static noinline void gsnedf_job_arrival(struct task_struct* task) +{ + cpu_entry_t* last; + + BUG_ON(list_empty(&gsnedf_cpu_queue)); + BUG_ON(!task); + + /* first queue arriving job */ + requeue(task); + + /* then check for any necessary preemptions */ + last = list_entry(gsnedf_cpu_queue.prev, cpu_entry_t, list); + if (edf_preemption_needed(&gsnedf, last->linked)) { + /* preemption necessary */ + task = __take_ready(&gsnedf); + TRACE("job_arrival: task %d linked to %d\n", + task->pid, last->cpu); + if (last->linked) + requeue(last->linked); + + link_task_to_cpu(task, last); + preempt(last); + } +} + +/* check for current job releases */ +static noinline void gsnedf_release_jobs(void) +{ + struct list_head *pos, *save; + struct task_struct *queued; + lt_t now = sched_clock(); + + + list_for_each_safe(pos, save, &gsnedf.release_queue) { + queued = list_entry(pos, struct task_struct, rt_list); + if (likely(is_released(queued, now))) { + /* this one is ready to go*/ + list_del(pos); + set_rt_flags(queued, RT_F_RUNNING); + + sched_trace_job_release(queued); + gsnedf_job_arrival(queued); + } + else + /* the release queue is ordered */ + break; + } +} + +/* gsnedf_scheduler_tick - this function is called for every local timer + * interrupt. + * + * checks whether the current task has expired and checks + * whether we need to preempt it if it has not expired + */ +static void gsnedf_scheduler_tick(void) +{ + unsigned long flags; + struct task_struct* t = current; + + if (is_realtime(t) && budget_exhausted(t)) { + if (!is_np(t)) { + /* np tasks will be preempted when they become + * preemptable again + */ + set_tsk_need_resched(t); + set_will_schedule(); + TRACE("gsnedf_scheduler_tick: " + "%d is preemptable " + " => FORCE_RESCHED\n", t->pid); + } else { + TRACE("gsnedf_scheduler_tick: " + "%d is non-preemptable, " + "preemption delayed.\n", t->pid); + request_exit_np(t); + } + } + + /* only the first CPU needs to release jobs */ + if (smp_processor_id() == 0) { + spin_lock_irqsave(&gsnedf_lock, flags); + + /* Try to release pending jobs */ + gsnedf_release_jobs(); + + /* We don't need to check linked != scheduled since + * set_tsk_need_resched has been set by preempt() if necessary. + */ + + spin_unlock_irqrestore(&gsnedf_lock, flags); + } +} + +/* caller holds gsnedf_lock */ +static noinline void job_completion(struct task_struct *t) +{ + BUG_ON(!t); + + sched_trace_job_completion(t); + + TRACE_TASK(t, "job_completion().\n"); + + /* set flags */ + set_rt_flags(t, RT_F_SLEEP); + /* prepare for next period */ + prepare_for_next_period(t); + /* unlink */ + unlink(t); + /* requeue + * But don't requeue a blocking task. */ + if (is_running(t)) + gsnedf_job_arrival(t); +} + + +/* Getting schedule() right is a bit tricky. schedule() may not make any + * assumptions on the state of the current task since it may be called for a + * number of reasons. The reasons include a scheduler_tick() determined that it + * was necessary, because sys_exit_np() was called, because some Linux + * subsystem determined so, or even (in the worst case) because there is a bug + * hidden somewhere. 
Thus, we must take extreme care to determine what the + * current state is. + * + * The CPU could currently be scheduling a task (or not), be linked (or not). + * + * The following assertions for the scheduled task could hold: + * + * - !is_running(scheduled) // the job blocks + * - scheduled->timeslice == 0 // the job completed (forcefully) + * - get_rt_flag() == RT_F_SLEEP // the job completed (by syscall) + * - linked != scheduled // we need to reschedule (for any reason) + * - is_np(scheduled) // rescheduling must be delayed, + * sys_exit_np must be requested + * + * Any of these can occur together. + */ +static int gsnedf_schedule(struct task_struct * prev, + struct task_struct ** next) +{ + cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries); + int out_of_time, sleep, preempt, np, exists, blocks; + + /* Will be released in finish_switch. */ + spin_lock(&gsnedf_lock); + clear_will_schedule(); + + /* sanity checking */ + BUG_ON(entry->scheduled && entry->scheduled != prev); + BUG_ON(entry->scheduled && !is_realtime(prev)); + BUG_ON(is_realtime(prev) && !entry->scheduled); + + /* (0) Determine state */ + exists = entry->scheduled != NULL; + blocks = exists && !is_running(entry->scheduled); + out_of_time = exists && budget_exhausted(entry->scheduled); + np = exists && is_np(entry->scheduled); + sleep = exists && get_rt_flags(entry->scheduled) == RT_F_SLEEP; + preempt = entry->scheduled != entry->linked; + + /* If a task blocks we have no choice but to reschedule. + */ + if (blocks) + unlink(entry->scheduled); + + /* Request a sys_exit_np() call if we would like to preempt but cannot. + * We need to make sure to update the link structure anyway in case + * that we are still linked. Multiple calls to request_exit_np() don't + * hurt. + */ + if (np && (out_of_time || preempt || sleep)) { + unlink(entry->scheduled); + request_exit_np(entry->scheduled); + } + + /* Any task that is preemptable and either exhausts its execution + * budget or wants to sleep completes. We may have to reschedule after + * this. + */ + if (!np && (out_of_time || sleep)) + job_completion(entry->scheduled); + + /* Link pending task if we became unlinked. + */ + if (!entry->linked) + link_task_to_cpu(__take_ready(&gsnedf), entry); + + /* The final scheduling decision. Do we need to switch for some reason? + * If linked different from scheduled select linked as next. + */ + if ((!np || blocks) && + entry->linked != entry->scheduled) { + /* Schedule a linked job? */ + if (entry->linked) + *next = entry->linked; + } else + /* Only override Linux scheduler if we have real-time task + * scheduled that needs to continue. + */ + if (exists) + *next = prev; + + spin_unlock(&gsnedf_lock); + + /* don't race with a concurrent switch */ + if (*next && prev != *next) + while ((*next)->rt_param.scheduled_on != NO_CPU) + cpu_relax(); + return 0; +} + + +/* _finish_switch - we just finished the switch away from prev + */ +static void gsnedf_finish_switch(struct task_struct *prev) +{ + cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries); + + entry->scheduled = is_realtime(current) ? 
current : NULL; + + prev->rt_param.scheduled_on = NO_CPU; + current->rt_param.scheduled_on = smp_processor_id(); +} + + +/* Prepare a task for running in RT mode + * Enqueues the task into master queue data structure + * returns + * -EPERM if task is not TASK_STOPPED + */ +static long gsnedf_prepare_task(struct task_struct * t) +{ + unsigned long flags; + TRACE("gsn edf: prepare task %d\n", t->pid); + + if (t->state == TASK_STOPPED) { + t->rt_param.scheduled_on = NO_CPU; + t->rt_param.linked_on = NO_CPU; + + /* delay by 1ms */ + release_at(t, sched_clock() + 1000000); + + /* The task should be running in the queue, otherwise signal + * code will try to wake it up with fatal consequences. + */ + t->state = TASK_RUNNING; + spin_lock_irqsave(&gsnedf_lock, flags); + t->rt_param.litmus_controlled = 1; + requeue(t); + spin_unlock_irqrestore(&gsnedf_lock, flags); + return 0; + } + else + return -EPERM; +} + +static void gsnedf_wake_up_task(struct task_struct *task) +{ + unsigned long flags; + lt_t now; + /* We must determine whether task should go into the release + * queue or into the ready queue. It may enter the ready queue + * if it has credit left in its time slice and has not yet reached + * its deadline. If it is now passed its deadline we assume this the + * arrival of a new sporadic job and thus put it in the ready queue + * anyway.If it has zero budget and the next release is in the future + * it has to go to the release queue. + */ + TRACE("gsnedf: %d unsuspends with budget=%d\n", + task->pid, task->time_slice); + + spin_lock_irqsave(&gsnedf_lock, flags); + if (!task->rt_param.litmus_controlled) { + task->rt_param.litmus_controlled = 1; + /* We need to take suspensions because of semaphores into + * account! If a job resumes after being suspended due to acquiring + * a semaphore, it should never be treated as a new job release. + */ + if (get_rt_flags(task) == RT_F_EXIT_SEM) { + set_rt_flags(task, RT_F_RUNNING); + } else { + now = sched_clock(); + if (is_tardy(task, now)) { + /* new sporadic release */ + release_at(task, now); + sched_trace_job_release(task); + } + else if (task->time_slice) + /* came back in time before deadline + */ + set_rt_flags(task, RT_F_RUNNING); + } + task->state = TASK_RUNNING; + gsnedf_job_arrival(task); + } + spin_unlock_irqrestore(&gsnedf_lock, flags); +} + +static void gsnedf_task_blocks(struct task_struct *t) +{ + unsigned long flags; + + /* unlink if necessary */ + spin_lock_irqsave(&gsnedf_lock, flags); + unlink(t); + t->rt_param.litmus_controlled = 0; + spin_unlock_irqrestore(&gsnedf_lock, flags); + + BUG_ON(!is_realtime(t)); + TRACE("task %d suspends with budget=%d\n", t->pid, t->time_slice); + BUG_ON(t->rt_list.next != LIST_POISON1); + BUG_ON(t->rt_list.prev != LIST_POISON2); +} + + +/* When _tear_down is called, the task should not be in any queue any more + * as it must have blocked first. We don't have any internal state for the task, + * it is all in the task_struct. + */ +static long gsnedf_tear_down(struct task_struct * t) +{ + BUG_ON(!is_realtime(t)); + TRACE_TASK(t, "RIP\n"); + BUG_ON(t->array); + BUG_ON(t->rt_list.next != LIST_POISON1); + BUG_ON(t->rt_list.prev != LIST_POISON2); + return 0; +} + +static long gsnedf_pi_block(struct pi_semaphore *sem, + struct task_struct *new_waiter) +{ + /* This callback has to handle the situation where a new waiter is + * added to the wait queue of the semaphore. + * + * We must check if has a higher priority than the currently + * highest-priority task, and then potentially reschedule. 
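+ *
+ * Worked example (illustrative numbers only): the holder runs with a
+ * deadline 50 ms away, no earlier-deadline waiter has been recorded in
+ * sem->hp.task yet, and new_waiter's deadline is 10 ms away. Then
+ * edf_higher_prio(new_waiter, sem->hp.task) holds, new_waiter is
+ * recorded as sem->hp.task, and the holder inherits its priority via
+ * inh_task; unlink()/gsnedf_job_arrival() re-insert the holder so the
+ * linkage reflects the boost. The boost lasts until
+ * gsnedf_return_priority() clears inh_task when the semaphore is
+ * released.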
+ */ + + BUG_ON(!new_waiter); + + if (edf_higher_prio(new_waiter, sem->hp.task)) { + TRACE_TASK(new_waiter, " boosts priority\n"); + /* called with IRQs disabled */ + spin_lock(&gsnedf_lock); + /* store new highest-priority task */ + sem->hp.task = new_waiter; + if (sem->holder) { + /* let holder inherit */ + sem->holder->rt_param.inh_task = new_waiter; + unlink(sem->holder); + gsnedf_job_arrival(sem->holder); + } + spin_unlock(&gsnedf_lock); + } + + return 0; +} + +static long gsnedf_inherit_priority(struct pi_semaphore *sem, + struct task_struct *new_owner) +{ + /* We don't need to acquire the gsnedf_lock since at the time of this + * call new_owner isn't actually scheduled yet (it's still sleeping) + * and since the calling function already holds sem->wait.lock, which + * prevents concurrent sem->hp.task changes. + */ + + if (sem->hp.task && sem->hp.task != new_owner) { + new_owner->rt_param.inh_task = sem->hp.task; + TRACE_TASK(new_owner, "inherited priority from %s/%d\n", + sem->hp.task->comm, sem->hp.task->pid); + } else + TRACE_TASK(new_owner, + "cannot inherit priority, " + "no higher priority job waits.\n"); + return 0; +} + +/* This function is called on a semaphore release, and assumes that + * the current task is also the semaphore holder. + */ +static long gsnedf_return_priority(struct pi_semaphore *sem) +{ + struct task_struct* t = current; + int ret = 0; + + /* Find new highest-priority semaphore task + * if holder task is the current hp.task. + * + * Calling function holds sem->wait.lock. + */ + if (t == sem->hp.task) + set_hp_task(sem, edf_higher_prio); + + TRACE_CUR("gsnedf_return_priority for lock %p\n", sem); + + if (t->rt_param.inh_task) { + /* interrupts already disabled by PI code */ + spin_lock(&gsnedf_lock); + + /* Reset inh_task to NULL. */ + t->rt_param.inh_task = NULL; + + /* Check if rescheduling is necessary */ + unlink(t); + gsnedf_job_arrival(t); + spin_unlock(&gsnedf_lock); + } + + return ret; +} + +/* Plugin object */ +static struct sched_plugin gsn_edf_plugin __cacheline_aligned_in_smp = { + .plugin_name = "GSN-EDF", + .scheduler_tick = gsnedf_scheduler_tick, + .prepare_task = gsnedf_prepare_task, + .sleep_next_period = complete_job, + .tear_down = gsnedf_tear_down, + .schedule = gsnedf_schedule, + .finish_switch = gsnedf_finish_switch, + .wake_up_task = gsnedf_wake_up_task, + .task_blocks = gsnedf_task_blocks, + .inherit_priority = gsnedf_inherit_priority, + .return_priority = gsnedf_return_priority, + .pi_block = gsnedf_pi_block +}; + + +static int __init init_gsn_edf(void) +{ + int cpu; + cpu_entry_t *entry; + + /* initialize CPU state */ + for (cpu = 0; cpu < NR_CPUS; cpu++) { + entry = &per_cpu(gsnedf_cpu_entries, cpu); + atomic_set(&entry->will_schedule, 0); + entry->linked = NULL; + entry->scheduled = NULL; + entry->cpu = cpu; + INIT_LIST_HEAD(&entry->list); + } + + edf_domain_init(&gsnedf, NULL); + return register_sched_plugin(&gsn_edf_plugin); +} + + +module_init(init_gsn_edf); diff --git a/litmus/sched_plugin.c b/litmus/sched_plugin.c new file mode 100644 index 0000000..f05fc56 --- /dev/null +++ b/litmus/sched_plugin.c @@ -0,0 +1,169 @@ +/* sched_plugin.c -- core infrastructure for the scheduler plugin system + * + * This file includes the initialization of the plugin system, the no-op Linux + * scheduler plugin and some dummy functions. 
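+ *
+ * For reference, a plugin is typically defined and registered from its
+ * own module init, roughly as follows (a sketch modeled on the GSN-EDF
+ * and PSN-EDF plugins; the "demo_*" names are made up):
+ *
+ *	static struct sched_plugin demo_plugin = {
+ *		.plugin_name	= "DEMO",
+ *		.scheduler_tick	= demo_scheduler_tick,
+ *		.schedule	= demo_schedule,
+ *		.prepare_task	= demo_prepare_task,
+ *		.tear_down	= demo_tear_down,
+ *	};
+ *
+ *	static int __init init_demo(void)
+ *	{
+ *		return register_sched_plugin(&demo_plugin);
+ *	}
+ *	module_init(init_demo);
+ *
+ * Callbacks that are left NULL are filled in with the litmus_dummy_*()
+ * defaults by register_sched_plugin().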
+ */ + +#include +#include + +#include +#include + + +/************************************************************* + * Dummy plugin functions * + *************************************************************/ + +static void litmus_dummy_finish_switch(struct task_struct * prev) +{ +} + +static int litmus_dummy_schedule(struct task_struct * prev, + struct task_struct** next) +{ + return 0; +} + +static void litmus_dummy_scheduler_tick(void) +{ +} + +static long litmus_dummy_prepare_task(struct task_struct *t) +{ + return -ENOSYS; +} + +static void litmus_dummy_wake_up_task(struct task_struct *task) +{ + printk(KERN_WARNING "task %d: unhandled real-time wake up!\n", + task->pid); +} + +static void litmus_dummy_task_blocks(struct task_struct *task) +{ +} + +static long litmus_dummy_tear_down(struct task_struct *task) +{ + return 0; +} + +static long litmus_dummy_sleep_next_period(void) +{ + return -ENOSYS; +} + +static long litmus_dummy_inherit_priority(struct pi_semaphore *sem, + struct task_struct *new_owner) +{ + return -ENOSYS; +} + +static long litmus_dummy_return_priority(struct pi_semaphore *sem) +{ + return -ENOSYS; +} + +static long litmus_dummy_pi_block(struct pi_semaphore *sem, + struct task_struct *new_waiter) +{ + return -ENOSYS; +} + + +/* The default scheduler plugin. It doesn't do anything and lets Linux do its + * job. + */ +struct sched_plugin linux_sched_plugin = { + .plugin_name = "Linux", + .scheduler_tick = litmus_dummy_scheduler_tick, + .prepare_task = litmus_dummy_prepare_task, + .tear_down = litmus_dummy_tear_down, + .wake_up_task = litmus_dummy_wake_up_task, + .task_blocks = litmus_dummy_task_blocks, + .sleep_next_period = litmus_dummy_sleep_next_period, + .schedule = litmus_dummy_schedule, + .finish_switch = litmus_dummy_finish_switch, + .inherit_priority = litmus_dummy_inherit_priority, + .return_priority = litmus_dummy_return_priority, + .pi_block = litmus_dummy_pi_block +}; + +/* + * The reference to current plugin that is used to schedule tasks within + * the system. It stores references to actual function implementations + * Should be initialized by calling "init_***_plugin()" + */ +struct sched_plugin *curr_sched_plugin = &linux_sched_plugin; + +/* the list of registered scheduling plugins */ +static LIST_HEAD(sched_plugins); +static DEFINE_SPINLOCK(sched_plugins_lock); + +#define CHECK(func) {\ + if (!plugin->func) \ + plugin->func = litmus_dummy_ ## func;} + +/* FIXME: get reference to module */ +int register_sched_plugin(struct sched_plugin* plugin) +{ + printk(KERN_INFO "Registering LITMUS^RT plugin %s.\n", + plugin->plugin_name); + + /* make sure we don't trip over null pointers later */ + CHECK(finish_switch); + CHECK(schedule); + CHECK(scheduler_tick); + CHECK(wake_up_task); + CHECK(tear_down); + CHECK(task_blocks); + CHECK(prepare_task); + CHECK(sleep_next_period); + CHECK(inherit_priority); + CHECK(return_priority); + CHECK(pi_block); + + spin_lock(&sched_plugins_lock); + list_add(&plugin->list, &sched_plugins); + spin_unlock(&sched_plugins_lock); + + return 0; +} + + +/* FIXME: reference counting, etc. 
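+ *
+ * Intended use (sketch only; the actual plugin switch lives elsewhere
+ * in litmus.c and may perform additional checks):
+ *
+ *	struct sched_plugin *p = find_sched_plugin("PSN-EDF");
+ *	if (p)
+ *		curr_sched_plugin = p;
+ *
+ * A NULL return means no plugin of that name has been registered.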
*/ +struct sched_plugin* find_sched_plugin(const char* name) +{ + struct list_head *pos; + struct sched_plugin *plugin; + + spin_lock(&sched_plugins_lock); + list_for_each(pos, &sched_plugins) { + plugin = list_entry(pos, struct sched_plugin, list); + if (!strcmp(plugin->plugin_name, name)) + goto out_unlock; + } + plugin = NULL; + +out_unlock: + spin_unlock(&sched_plugins_lock); + return plugin; +} + +int print_sched_plugins(char* buf, int max) +{ + int count = 0; + struct list_head *pos; + struct sched_plugin *plugin; + + spin_lock(&sched_plugins_lock); + list_for_each(pos, &sched_plugins) { + plugin = list_entry(pos, struct sched_plugin, list); + count += snprintf(buf + count, max - count, "%s\n", plugin->plugin_name); + if (max - count <= 0) + break; + } + spin_unlock(&sched_plugins_lock); + return count; +} diff --git a/litmus/sched_psn_edf.c b/litmus/sched_psn_edf.c new file mode 100644 index 0000000..27f4b5c --- /dev/null +++ b/litmus/sched_psn_edf.c @@ -0,0 +1,458 @@ + +/* + * kernel/sched_psn_edf.c + * + * Implementation of the PSN-EDF scheduler plugin. + * Based on kern/sched_part_edf.c and kern/sched_gsn_edf.c. + * + * Suspensions and non-preemptable sections are supported. + * Priority inheritance is not supported. + */ + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + + +typedef struct { + rt_domain_t domain; + int cpu; + struct task_struct* scheduled; /* only RT tasks */ + spinlock_t lock; /* protects the domain and + * serializes scheduling decisions + */ +} psnedf_domain_t; + +DEFINE_PER_CPU(psnedf_domain_t, psnedf_domains); + +#define local_edf (&__get_cpu_var(psnedf_domains).domain) +#define local_pedf (&__get_cpu_var(psnedf_domains)) +#define remote_edf(cpu) (&per_cpu(psnedf_domains, cpu).domain) +#define remote_pedf(cpu) (&per_cpu(psnedf_domains, cpu)) +#define task_edf(task) remote_edf(get_partition(task)) +#define task_pedf(task) remote_pedf(get_partition(task)) + + +static void psnedf_domain_init(psnedf_domain_t* pedf, + check_resched_needed_t check, + int cpu) +{ + edf_domain_init(&pedf->domain, check); + pedf->cpu = cpu; + pedf->lock = SPIN_LOCK_UNLOCKED; + pedf->scheduled = NULL; +} + +static void requeue(struct task_struct* t, rt_domain_t *edf) +{ + /* only requeue if t is actually running */ + BUG_ON(!is_running(t)); + + if (t->state != TASK_RUNNING) + TRACE_TASK(t, "requeue: !TASK_RUNNING"); + + set_rt_flags(t, RT_F_RUNNING); + if (is_released(t, sched_clock())) + __add_ready(edf, t); + else + __add_release(edf, t); /* it has got to wait */ +} + +/* we assume the lock is being held */ +static void preempt(psnedf_domain_t *pedf) +{ + if (smp_processor_id() == pedf->cpu) { + if (pedf->scheduled && is_np(pedf->scheduled)) + request_exit_np(pedf->scheduled); + else + set_tsk_need_resched(current); + } else + /* in case that it is a remote CPU we have to defer the + * the decision to the remote CPU + */ + smp_send_reschedule(pedf->cpu); +} + +/* This check is trivial in partioned systems as we only have to consider + * the CPU of the partition. 
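+ *
+ * This is the check_resched_needed_t callback that psnedf_domain_init()
+ * passes to edf_domain_init(); __add_ready() in rt_domain.c invokes it,
+ * with the ready-queue lock already held, when a newly added job may
+ * warrant a preemption.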
+ */ +static int psnedf_check_resched(rt_domain_t *edf) +{ + psnedf_domain_t *pedf = container_of(edf, psnedf_domain_t, domain); + int ret = 0; + + /* because this is a callback from rt_domain_t we already hold + * the necessary lock for the ready queue + */ + if (edf_preemption_needed(edf, pedf->scheduled)) { + preempt(pedf); + ret = 1; + } + return ret; +} + + +static void psnedf_scheduler_tick(void) +{ + unsigned long flags; + struct task_struct *t = current; + rt_domain_t *edf = local_edf; + psnedf_domain_t *pedf = local_pedf; + + /* Check for inconsistency. We don't need the lock for this since + * ->scheduled is only changed in schedule, which obviously is not + * executing in parallel on this CPU + */ + BUG_ON(is_realtime(t) && t != pedf->scheduled); + + if (is_realtime(t) && budget_exhausted(t)) { + if (!is_np(t)) + set_tsk_need_resched(t); + else { + TRACE("psnedf_scheduler_tick: " + "%d is non-preemptable, " + "preemption delayed.\n", t->pid); + request_exit_np(t); + } + } + + spin_lock_irqsave(&pedf->lock, flags); + __release_pending(edf); + if (edf_preemption_needed(edf, t)) + set_tsk_need_resched(t); + spin_unlock_irqrestore(&pedf->lock, flags); +} + +static void job_completion(struct task_struct* t) +{ + TRACE_TASK(t, "job_completion().\n"); + set_rt_flags(t, RT_F_SLEEP); + prepare_for_next_period(t); +} + +static int psnedf_schedule(struct task_struct * prev, + struct task_struct ** next) +{ + psnedf_domain_t* pedf = local_pedf; + rt_domain_t* edf = &pedf->domain; + + int out_of_time, sleep, preempt, + np, exists, blocks, resched; + + spin_lock(&pedf->lock); + + /* sanity checking */ + BUG_ON(pedf->scheduled && pedf->scheduled != prev); + BUG_ON(pedf->scheduled && !is_realtime(prev)); + + /* (0) Determine state */ + exists = pedf->scheduled != NULL; + blocks = exists && !is_running(pedf->scheduled); + out_of_time = exists && budget_exhausted(pedf->scheduled); + np = exists && is_np(pedf->scheduled); + sleep = exists && get_rt_flags(pedf->scheduled) == RT_F_SLEEP; + preempt = edf_preemption_needed(edf, prev); + + /* If we need to preempt do so. + * The following checks set resched to 1 in case of special + * circumstances. + */ + resched = preempt; + + /* If a task blocks we have no choice but to reschedule. + */ + if (blocks) + resched = 1; + + /* Request a sys_exit_np() call if we would like to preempt but cannot. + * Multiple calls to request_exit_np() don't hurt. + */ + if (np && (out_of_time || preempt || sleep)) + request_exit_np(pedf->scheduled); + + /* Any task that is preemptable and either exhausts its execution + * budget or wants to sleep completes. We may have to reschedule after + * this. + */ + if (!np && (out_of_time || sleep)) { + job_completion(pedf->scheduled); + resched = 1; + } + + /* The final scheduling decision. Do we need to switch for some reason? + * Switch if we are in RT mode and have no task or if we need to + * resched. + */ + *next = NULL; + if ((!np || blocks) && (resched || !exists)) { + /* Take care of a previously scheduled + * job by taking it out of the Linux runqueue. + */ + if (pedf->scheduled) { + /* as opposed to global schedulers that switch without + * a lock being held we can requeue already here since + * no other CPU will schedule from this domain. + */ + if (!blocks) + requeue(pedf->scheduled, edf); + } + *next = __take_ready(edf); + } else + /* Only override Linux scheduler if we have a real-time task + * scheduled that needs to continue. 
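+ * Returning prev here (instead of leaving *next NULL) keeps the
+ * current real-time job running; otherwise the stock Linux scheduler
+ * would be free to pick one of its own tasks.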
+ */ + if (exists) + *next = prev; + + if (*next) + set_rt_flags(*next, RT_F_RUNNING); + + pedf->scheduled = *next; + spin_unlock(&pedf->lock); + return 0; +} + + +/* Prepare a task for running in RT mode + * Enqueues the task into master queue data structure + * returns + * -EPERM if task is not TASK_STOPPED + */ +static long psnedf_prepare_task(struct task_struct * t) +{ + rt_domain_t* edf = task_edf(t); + psnedf_domain_t* pedf = task_pedf(t); + unsigned long flags; + + TRACE("[%d] psn edf: prepare task %d on CPU %d\n", + smp_processor_id(), t->pid, get_partition(t)); + if (t->state == TASK_STOPPED) { + + /* 1ms delay */ + release_at(t, sched_clock() + 1000000); + + /* The task should be running in the queue, otherwise signal + * code will try to wake it up with fatal consequences. + */ + t->state = TASK_RUNNING; + spin_lock_irqsave(&pedf->lock, flags); + t->rt_param.litmus_controlled = 1; + __add_release(edf, t); + spin_unlock_irqrestore(&pedf->lock, flags); + return 0; + } else + return -EPERM; +} + +static void psnedf_wake_up_task(struct task_struct *task) +{ + unsigned long flags; + psnedf_domain_t* pedf = task_pedf(task); + rt_domain_t* edf = task_edf(task); + lt_t now; + + TRACE("psnedf: %d unsuspends with budget=%d\n", + task->pid, task->time_slice); + + spin_lock_irqsave(&pedf->lock, flags); + if (!task->rt_param.litmus_controlled) { + BUG_ON(in_list(&task->rt_list)); + task->rt_param.litmus_controlled = 1; + /* We need to take suspensions because of semaphores into + * account! If a job resumes after being suspended due to acquiring + * a semaphore, it should never be treated as a new job release. + */ + now = sched_clock(); + if (is_tardy(task, now) && + get_rt_flags(task) != RT_F_EXIT_SEM) { + /* new sporadic release */ + release_at(task, now); + sched_trace_job_release(task); + } + task->state = TASK_RUNNING; + requeue(task, edf); + } + spin_unlock_irqrestore(&pedf->lock, flags); +} + +static void psnedf_task_blocks(struct task_struct *t) +{ + BUG_ON(!is_realtime(t)); + /* not really anything to do since it can only block if + * it is running, and when it is not running it is not in any + * queue anyway. + */ + TRACE("task %d blocks with budget=%d\n", t->pid, t->time_slice); + BUG_ON(in_list(&t->rt_list)); + t->rt_param.litmus_controlled = 0; +} + + +/* When _tear_down is called, the task should not be in any queue any more + * as it must have blocked first. We don't have any internal state for the task, + * it is all in the task_struct. 
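+ * (The BUG_ON(t->array) below also asserts that the task has left the
+ * Linux run queue; in 2.6 the array pointer is only set while a task
+ * is enqueued there.)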
+ */ +static long psnedf_tear_down(struct task_struct * t) +{ + BUG_ON(!is_realtime(t)); + TRACE_TASK(t, "tear down called"); + BUG_ON(t->array); + BUG_ON(in_list(&t->rt_list)); + return 0; +} + +static long psnedf_pi_block(struct pi_semaphore *sem, + struct task_struct *new_waiter) +{ + psnedf_domain_t* pedf; + rt_domain_t* edf; + struct task_struct* t; + int cpu = get_partition(new_waiter); + + BUG_ON(!new_waiter); + + if (edf_higher_prio(new_waiter, sem->hp.cpu_task[cpu])) { + TRACE_TASK(new_waiter, " boosts priority\n"); + pedf = task_pedf(new_waiter); + edf = task_edf(new_waiter); + + /* interrupts already disabled */ + spin_lock(&pedf->lock); + + /* store new highest-priority task */ + sem->hp.cpu_task[cpu] = new_waiter; + if (sem->holder && + get_partition(sem->holder) == get_partition(new_waiter)) { + /* let holder inherit */ + sem->holder->rt_param.inh_task = new_waiter; + t = sem->holder; + if (in_list(&t->rt_list)) { + /* queued in domain*/ + list_del(&t->rt_list); + /* readd to make priority change take place */ + if (is_released(t, sched_clock())) + __add_ready(edf, t); + else + __add_release(edf, t); + } + } + + /* check if we need to reschedule */ + if (edf_preemption_needed(edf, current)) + preempt(pedf); + + spin_unlock(&pedf->lock); + } + + return 0; +} + +static long psnedf_inherit_priority(struct pi_semaphore *sem, + struct task_struct *new_owner) +{ + int cpu = get_partition(new_owner); + + /* FIXME: This doesn't look correct at all! + * Why do we inherit in any case??? + */ + new_owner->rt_param.inh_task = sem->hp.cpu_task[cpu]; + if (sem->hp.cpu_task[cpu] && new_owner != sem->hp.cpu_task[cpu]) { + TRACE_TASK(new_owner, + "inherited priority from %s/%d\n", + sem->hp.cpu_task[cpu]->comm, + sem->hp.cpu_task[cpu]->pid); + } else + TRACE_TASK(new_owner, + "cannot inherit priority: " + "no higher priority job waits on this CPU!\n"); + /* make new owner non-preemptable as required by FMLP under + * PSN-EDF. + */ + make_np(new_owner); + return 0; +} + + +/* This function is called on a semaphore release, and assumes that + * the current task is also the semaphore holder. + */ +static long psnedf_return_priority(struct pi_semaphore *sem) +{ + struct task_struct* t = current; + psnedf_domain_t* pedf = task_pedf(t); + rt_domain_t* edf = task_edf(t); + int ret = 0; + int cpu = get_partition(current); + + + /* Find new highest-priority semaphore task + * if holder task is the current hp.cpu_task[cpu]. + * + * Calling function holds sem->wait.lock. + */ + if (t == sem->hp.cpu_task[cpu]) + set_hp_cpu_task(sem, cpu, edf_higher_prio); + + take_np(t); + if (current->rt_param.inh_task) { + TRACE_CUR("return priority of %s/%d\n", + current->rt_param.inh_task->comm, + current->rt_param.inh_task->pid); + spin_lock(&pedf->lock); + + /* Reset inh_task to NULL. 
*/ + current->rt_param.inh_task = NULL; + + /* check if we need to reschedule */ + if (edf_preemption_needed(edf, current)) + preempt(pedf); + + spin_unlock(&pedf->lock); + } else + TRACE_CUR(" no priority to return %p\n", sem); + + return ret; +} + + +/* Plugin object */ +static struct sched_plugin psn_edf_plugin __cacheline_aligned_in_smp = { + .plugin_name = "PSN-EDF", + .srp_active = 1, + .scheduler_tick = psnedf_scheduler_tick, + .prepare_task = psnedf_prepare_task, + .sleep_next_period = complete_job, + .tear_down = psnedf_tear_down, + .schedule = psnedf_schedule, + .wake_up_task = psnedf_wake_up_task, + .task_blocks = psnedf_task_blocks, + .pi_block = psnedf_pi_block, + .inherit_priority = psnedf_inherit_priority, + .return_priority = psnedf_return_priority +}; + + +static int __init init_psn_edf(void) +{ + int i; + + for (i = 0; i < NR_CPUS; i++) + { + psnedf_domain_init(remote_pedf(i), + psnedf_check_resched, i); + printk("PSN-EDF: CPU partition %d initialized.\n", i); + } + return register_sched_plugin(&psn_edf_plugin); +} + + + +module_init(init_psn_edf); diff --git a/litmus/sched_rm.c b/litmus/sched_rm.c new file mode 100644 index 0000000..57acde4 --- /dev/null +++ b/litmus/sched_rm.c @@ -0,0 +1,397 @@ + +/* RM implementation. + * Will support the M-PCP eventually. + */ + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + + +typedef struct { + rt_domain_t domain; + int cpu; + struct task_struct* scheduled; /* only RT tasks */ + spinlock_t lock; /* protects the domain and + * serializes scheduling decisions + */ +} rm_domain_t; + +DEFINE_PER_CPU(rm_domain_t, rm_domains); + +#define local_dom (&__get_cpu_var(rm_domains).domain) +#define local_part (&__get_cpu_var(rm_domains)) +#define remote_dom(cpu) (&per_cpu(rm_domains, cpu).domain) +#define remote_part(cpu) (&per_cpu(rm_domains, cpu)) +#define task_dom(task) remote_dom(get_partition(task)) +#define task_part(task) remote_part(get_partition(task)) + + +static void prm_domain_init(rm_domain_t* part, + check_resched_needed_t check, + int cpu) +{ + rm_domain_init(&part->domain, check); + part->cpu = cpu; + part->lock = SPIN_LOCK_UNLOCKED; + part->scheduled = NULL; +} + +static void requeue(struct task_struct* t, rt_domain_t *dom) +{ + /* only requeue if t is actually running */ + BUG_ON(!is_running(t)); + + if (t->state != TASK_RUNNING) + TRACE_TASK(t, "requeue: !TASK_RUNNING"); + + set_rt_flags(t, RT_F_RUNNING); + if (is_released(t, sched_clock())) + __add_ready(dom, t); + else + __add_release(dom, t); /* it has got to wait */ +} + +/* we assume the lock is being held */ +static void preempt(rm_domain_t *part) +{ + if (smp_processor_id() == part->cpu) { + if (part->scheduled && is_np(part->scheduled)) + request_exit_np(part->scheduled); + else + set_tsk_need_resched(current); + } else + /* in case that it is a remote CPU we have to defer the + * the decision to the remote CPU + */ + smp_send_reschedule(part->cpu); +} + +/* This check is trivial in partioned systems as we only have to consider + * the CPU of the partition. 
+ */ +static int rm_check_resched(rt_domain_t *dom) +{ + rm_domain_t *part = container_of(dom, rm_domain_t, domain); + int ret = 0; + + /* because this is a callback from rt_domain_t we already hold + * the necessary lock for the ready queue + */ + if (rm_preemption_needed(dom, part->scheduled)) { + preempt(part); + ret = 1; + } + return ret; +} + +static void __rm_set_prio(struct task_struct *t, struct pcp_priority* new_prio, + rm_domain_t* part) +{ + t->rt_param.cur_prio = new_prio; + if (in_list(&t->rt_list)) { + list_del(&t->rt_list); + requeue(t, &part->domain); + } else + rm_check_resched(&part->domain); +} + +/* call only with IRQs disabled */ +void rm_set_prio(struct task_struct *t, struct pcp_priority* new_prio) +{ + unsigned long flags; + rm_domain_t *part = task_part(t); + + BUG_ON(!is_realtime(t)); + spin_lock_irqsave(&part->lock, flags); + __rm_set_prio(t, new_prio, part); + spin_unlock_irqrestore(&part->lock, flags); +} + +static void rm_scheduler_tick(void) +{ + unsigned long flags; + struct task_struct *t = current; + rt_domain_t *dom = local_dom; + rm_domain_t *part = local_part; + + /* Check for inconsistency. We don't need the lock for this since + * ->scheduled is only changed in schedule, which obviously is not + * executing in parallel on this CPU + */ + BUG_ON(is_realtime(t) && t != part->scheduled); + +/* if (is_realtime(t) && budget_exhausted(t)) { + if (!is_np(t)) + set_tsk_need_resched(t); + else { + TRACE("rm_scheduler_tick: " + "%d is non-preemptable, " + "preemption delayed.\n", t->pid); + request_exit_np(t); + } + } +*/ + spin_lock_irqsave(&part->lock, flags); + __release_pending(dom); + if (rm_preemption_needed(dom, t)) + set_tsk_need_resched(t); + spin_unlock_irqrestore(&part->lock, flags); +} + +static void job_completion(struct task_struct* t) +{ + TRACE_TASK(t, "job_completion().\n"); + set_rt_flags(t, RT_F_SLEEP); + prepare_for_next_period(t); +} + +static int rm_schedule(struct task_struct * prev, + struct task_struct ** next) +{ + rm_domain_t* part = local_part; + rt_domain_t* dom = &part->domain; + + int sleep, preempt, + np, exists, blocks, resched; +// int out_of_time; + + spin_lock(&part->lock); + + /* sanity checking */ + BUG_ON(part->scheduled && part->scheduled != prev); + BUG_ON(part->scheduled && !is_realtime(prev)); + + /* (0) Determine state */ + exists = part->scheduled != NULL; + blocks = exists && !is_running(part->scheduled); +// out_of_time = exists && budget_exhausted(part->scheduled); +#define out_of_time 0 + np = exists && is_np(part->scheduled); + sleep = exists && get_rt_flags(part->scheduled) == RT_F_SLEEP; + preempt = rm_preemption_needed(dom, prev); + + /* If we need to preempt do so. + * The following checks set resched to 1 in case of special + * circumstances. + */ + resched = preempt; + + /* If a task blocks we have no choice but to reschedule. + */ + if (blocks) + resched = 1; + + /* Request a sys_exit_np() call if we would like to preempt but cannot. + * Multiple calls to request_exit_np() don't hurt. + */ + if (np && (out_of_time || preempt || sleep)) + request_exit_np(part->scheduled); + + /* Any task that is preemptable and either exhausts its execution + * budget or wants to sleep completes. We may have to reschedule after + * this. + */ + if (!np && (out_of_time || sleep)) { + job_completion(part->scheduled); + resched = 1; + } + + /* The final scheduling decision. Do we need to switch for some reason? + * Switch if we are in RT mode and have no task or if we need to + * resched. 
+ */ + *next = NULL; + if ((!np || blocks) && (resched || !exists)) { + /* Take care of a previously scheduled + * job by taking it out of the Linux runqueue. + */ + if (part->scheduled) { + /* as opposed to global schedulers that switch without + * a lock being held we can requeue already here since + * no other CPU will schedule from this domain. + */ + if (!blocks) + requeue(part->scheduled, dom); + } + *next = __take_ready(dom); + } else + /* Only override Linux scheduler if we have a real-time task + * scheduled that needs to continue. + */ + if (exists) + *next = prev; + + if (*next) + set_rt_flags(*next, RT_F_RUNNING); + + part->scheduled = *next; + spin_unlock(&part->lock); + return 0; +} + + +/* Prepare a task for running in RT mode + * Enqueues the task into master queue data structure + * returns + * -EPERM if task is not TASK_STOPPED + */ +static long rm_prepare_task(struct task_struct * t) +{ + rt_domain_t* dom = task_dom(t); + rm_domain_t* part = task_part(t); + unsigned long flags; + + TRACE("[%d] P-RM: prepare task %d on CPU %d\n", + smp_processor_id(), t->pid, get_partition(t)); + if (t->state == TASK_STOPPED) { +//FIXME if (!t->rt_param.task_params.prio) { + TRACE_TASK(t, "using rate-monotonic prio assignment\n"); + t->rt_param.pcp_prio.prio = get_rt_period(t); +// } else { +// TRACE_TASK(t, "using user-defined static prio assignment\n"); +// t->rt_param.pcp_prio.prio = t->rt_param.task_params.prio; +// } + t->rt_param.pcp_prio.in_global_cs = 0; + t->rt_param.pcp_prio.pid = t->pid; + t->rt_param.cur_prio = &t->rt_param.pcp_prio; + INIT_LIST_HEAD(&t->rt_param.owned_semaphores); + /* 1ms delay */ + release_at(t, sched_clock() + 1000000); + + /* The task should be running in the queue, otherwise signal + * code will try to wake it up with fatal consequences. + */ + t->state = TASK_RUNNING; + + spin_lock_irqsave(&part->lock, flags); + t->rt_param.litmus_controlled = 1; + __add_release(dom, t); + spin_unlock_irqrestore(&part->lock, flags); + return 0; + } else + return -EPERM; +} + +static void rm_wake_up_task(struct task_struct *task) +{ + unsigned long flags; + rm_domain_t* part = task_part(task); + rt_domain_t* dom = task_dom(task); + + TRACE_TASK(task, "P-RM: %d unsuspends.\n"); + + spin_lock_irqsave(&part->lock, flags); + if (!task->rt_param.litmus_controlled) { + BUG_ON(in_list(&task->rt_list)); + task->rt_param.litmus_controlled = 1; + task->state = TASK_RUNNING; + requeue(task, dom); + } + spin_unlock_irqrestore(&part->lock, flags); +} + +static void rm_task_blocks(struct task_struct *t) +{ + BUG_ON(!is_realtime(t)); + /* not really anything to do since it can only block if + * it is running, and when it is not running it is not in any + * queue anyway. + */ + TRACE("task %d blocks with budget=%d\n", t->pid, t->time_slice); + BUG_ON(in_list(&t->rt_list)); + t->rt_param.litmus_controlled = 0; +} + + +/* When _tear_down is called, the task should not be in any queue any more + * as it must have blocked first. We don't have any internal state for the task, + * it is all in the task_struct. 
+ */ +static long rm_tear_down(struct task_struct * t) +{ + BUG_ON(!is_realtime(t)); + TRACE_TASK(t, "tear down called"); + BUG_ON(t->array); + BUG_ON(in_list(&t->rt_list)); + return 0; +} + +static struct pcp_priority boosted = {0, 1, INT_MAX}; + +static long rm_pi_block(struct pi_semaphore *sem, + struct task_struct *new_waiter) +{ + return 0; +} + +static long rm_inherit_priority(struct pi_semaphore *sem, + struct task_struct *new_owner) +{ + rm_set_prio(new_owner, &boosted); + TRACE_TASK(new_owner, "priority boosted"); + make_np(new_owner); + return 0; +} + + +/* This function is called on a semaphore release, and assumes that + * the current task is also the semaphore holder. + */ +static long rm_return_priority(struct pi_semaphore *sem) +{ + struct task_struct* t = current; + + take_np(t); + /* reset prio to trigger resched if required */ + rm_set_prio(t, &t->rt_param.pcp_prio); + TRACE_TASK(t, "prio boost ended"); + return 0; +} + +/* Plugin object */ +static struct sched_plugin p_rm_plugin __cacheline_aligned_in_smp = { + .plugin_name = "P-RM", + /* PCP and SRP don't really work together, but this is something the + * user has to get right for the moment. + * System will not crash and burn, but timing correctness is not ensured. + * Just don't use both APIs at the same time for now. + */ + .pcp_active = 1, + .srp_active = 1, + .scheduler_tick = rm_scheduler_tick, + .prepare_task = rm_prepare_task, + .sleep_next_period = complete_job, + .tear_down = rm_tear_down, + .schedule = rm_schedule, + .wake_up_task = rm_wake_up_task, + .task_blocks = rm_task_blocks, + .pi_block = rm_pi_block, + .inherit_priority = rm_inherit_priority, + .return_priority = rm_return_priority +}; + +static int __init init_rm(void) +{ + int i; + + for (i = 0; i < NR_CPUS; i++) + { + prm_domain_init(remote_part(i), + rm_check_resched, i); + printk("P-RM: CPU partition %d initialized.\n", i); + } + return register_sched_plugin(&p_rm_plugin); +} + + + +module_init(init_rm); diff --git a/litmus/sched_trace.c b/litmus/sched_trace.c new file mode 100644 index 0000000..0976e83 --- /dev/null +++ b/litmus/sched_trace.c @@ -0,0 +1,541 @@ +/* sched_trace.c -- record scheduling events to a byte stream. + * + * TODO: Move ring buffer to a lockfree implementation. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + + +typedef struct { + /* guard read and write pointers */ + spinlock_t lock; + /* guard against concurrent freeing of buffer */ + rwlock_t del_lock; + + /* memory allocated for ring buffer */ + unsigned long order; + char* buf; + char* end; + + /* Read/write pointer. May not cross. + * They point to the position of next write and + * last read. 
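+ *
+ * Illustration (not to scale): 'R' is readp (the last byte handed to
+ * a reader), 'W' is writep (the next byte to be filled), '#' marks
+ * data written by rb_put() but not yet consumed by rb_get():
+ *
+ *	buf                                  end
+ *	 |  .  .  R  #  #  #  W  .  .  .  .  |
+ *
+ * rb_put() returns -ENOMEM rather than let writep overtake readp, and
+ * rb_get() is meant to hand out only the bytes strictly between the
+ * two pointers (wrapping at end).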
+ */ + char* writep; + char* readp; + +} ring_buffer_t; + +#define EMPTY_RING_BUFFER { \ + .lock = SPIN_LOCK_UNLOCKED, \ + .del_lock = RW_LOCK_UNLOCKED, \ + .buf = NULL, \ + .end = NULL, \ + .writep = NULL, \ + .readp = NULL \ +} + +void rb_init(ring_buffer_t* buf) +{ + *buf = (ring_buffer_t) EMPTY_RING_BUFFER; +} + +int rb_alloc_buf(ring_buffer_t* buf, unsigned long order) +{ + unsigned long flags; + int error = 0; + char *mem; + + /* do memory allocation while not atomic */ + mem = (char *) __get_free_pages(GFP_KERNEL, order); + if (!mem) + return -ENOMEM; + write_lock_irqsave(&buf->del_lock, flags); + BUG_ON(buf->buf); + buf->buf = mem; + buf->end = buf->buf + PAGE_SIZE * (1 << order) - 1; + memset(buf->buf, 0xff, buf->end - buf->buf); + buf->order = order; + buf->writep = buf->buf + 1; + buf->readp = buf->buf; + write_unlock_irqrestore(&buf->del_lock, flags); + return error; +} + +int rb_free_buf(ring_buffer_t* buf) +{ + unsigned long flags; + int error = 0; + write_lock_irqsave(&buf->del_lock, flags); + BUG_ON(!buf->buf); + free_pages((unsigned long) buf->buf, buf->order); + buf->buf = NULL; + buf->end = NULL; + buf->writep = NULL; + buf->readp = NULL; + write_unlock_irqrestore(&buf->del_lock, flags); + return error; +} + +/* Assumption: concurrent writes are serialized externally + * + * Will only succeed if there is enough space for all len bytes. + */ +int rb_put(ring_buffer_t* buf, char* mem, size_t len) +{ + unsigned long flags; + char* r , *w; + int error = 0; + read_lock_irqsave(&buf->del_lock, flags); + if (!buf->buf) { + error = -ENODEV; + goto out; + } + spin_lock(&buf->lock); + r = buf->readp; + w = buf->writep; + spin_unlock(&buf->lock); + if (r < w && buf->end - w >= len - 1) { + /* easy case: there is enough space in the buffer + * to write it in one continous chunk*/ + memcpy(w, mem, len); + w += len; + if (w > buf->end) + /* special case: fit exactly into buffer + * w is now buf->end + 1 + */ + w = buf->buf; + } else if (w < r && r - w >= len) { /* >= len because may not cross */ + /* we are constrained by the read pointer but we there + * is enough space + */ + memcpy(w, mem, len); + w += len; + } else if (r <= w && buf->end - w < len - 1) { + /* the wrap around case: there may or may not be space */ + if ((buf->end - w) + (r - buf->buf) >= len - 1) { + /* copy chunk that fits at the end */ + memcpy(w, mem, buf->end - w + 1); + mem += buf->end - w + 1; + len -= (buf->end - w + 1); + w = buf->buf; + /* copy the rest */ + memcpy(w, mem, len); + w += len; + } + else + error = -ENOMEM; + } else { + error = -ENOMEM; + } + if (!error) { + spin_lock(&buf->lock); + buf->writep = w; + spin_unlock(&buf->lock); + } + out: + read_unlock_irqrestore(&buf->del_lock, flags); + return error; +} + +/* Assumption: concurrent reads are serialized externally */ +int rb_get(ring_buffer_t* buf, char* mem, size_t len) +{ + unsigned long flags; + char* r , *w; + int error = 0; + read_lock_irqsave(&buf->del_lock, flags); + if (!buf->buf) { + error = -ENODEV; + goto out; + } + spin_lock(&buf->lock); + r = buf->readp; + w = buf->writep; + spin_unlock(&buf->lock); + + if (w <= r && buf->end - r >= len) { + /* easy case: there is enough data in the buffer + * to get it in one chunk*/ + memcpy(mem, r + 1, len); + r += len; + error = len; + + } else if (r + 1 < w && w - r - 1 >= len) { + /* we are constrained by the write pointer but + * there is enough data + */ + memcpy(mem, r + 1, len); + r += len; + error = len; + + } else if (r + 1 < w && w - r - 1 < len) { + /* we are constrained by the write 
pointer and there + * there is not enough data + */ + memcpy(mem, r + 1, w - r - 1); + error = w - r - 1; + r += w - r - 1; + + } else if (w <= r && buf->end - r < len) { + /* the wrap around case: there may or may not be enough data + * first let's get what is available + */ + memcpy(mem, r + 1, buf->end - r); + error += (buf->end - r); + mem += (buf->end - r); + len -= (buf->end - r); + r += (buf->end - r); + + if (w > buf->buf) { + /* there is more to get */ + r = buf->buf - 1; + if (w - r >= len) { + /* plenty */ + memcpy(mem, r + 1, len); + error += len; + r += len; + } else { + memcpy(mem, r + 1, w - r - 1); + error += w - r - 1; + r += w - r - 1; + } + } + } /* nothing available */ + + if (error > 0) { + spin_lock(&buf->lock); + buf->readp = r; + spin_unlock(&buf->lock); + } + out: + read_unlock_irqrestore(&buf->del_lock, flags); + return error; +} + + + +/******************************************************************************/ +/* DEVICE FILE DRIVER */ +/******************************************************************************/ + + + +/* Allocate a buffer of about 1 MB per CPU. + * + */ +#define BUFFER_ORDER 8 + +typedef struct { + ring_buffer_t buf; + atomic_t reader_cnt; + struct semaphore reader_mutex; +} trace_buffer_t; + + +/* This does not initialize the semaphore!! */ + +#define EMPTY_TRACE_BUFFER \ + { .buf = EMPTY_RING_BUFFER, .reader_cnt = ATOMIC_INIT(0)} + +static DEFINE_PER_CPU(trace_buffer_t, trace_buffer); + +#ifdef CONFIG_SCHED_DEBUG_TRACE +static spinlock_t log_buffer_lock = SPIN_LOCK_UNLOCKED; +#endif +static trace_buffer_t log_buffer = EMPTY_TRACE_BUFFER; + +static void init_buffers(void) +{ + int i; + + for (i = 0; i < NR_CPUS; i++) { + rb_init(&per_cpu(trace_buffer, i).buf); + init_MUTEX(&per_cpu(trace_buffer, i).reader_mutex); + atomic_set(&per_cpu(trace_buffer, i).reader_cnt, 0); + } + /* only initialize the mutex, the rest was initialized as part + * of the static initialization macro + */ + init_MUTEX(&log_buffer.reader_mutex); +} + +static int trace_release(struct inode *in, struct file *filp) +{ + int error = -EINVAL; + trace_buffer_t* buf = filp->private_data; + + BUG_ON(!filp->private_data); + + if (down_interruptible(&buf->reader_mutex)) { + error = -ERESTARTSYS; + goto out; + } + + /* last release must deallocate buffers */ + if (atomic_dec_return(&buf->reader_cnt) == 0) { + error = rb_free_buf(&buf->buf); + } + + up(&buf->reader_mutex); + out: + return error; +} + +static ssize_t trace_read(struct file *filp, char __user *to, size_t len, + loff_t *f_pos) +{ + /* we ignore f_pos, this is strictly sequential */ + + ssize_t error = -EINVAL; + char* mem; + trace_buffer_t *buf = filp->private_data; + + if (down_interruptible(&buf->reader_mutex)) { + error = -ERESTARTSYS; + goto out; + } + + if (len > 64 * 1024) + len = 64 * 1024; + mem = kmalloc(len, GFP_KERNEL); + if (!mem) { + error = -ENOMEM; + goto out_unlock; + } + + error = rb_get(&buf->buf, mem, len); + while (!error) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(110); + if (signal_pending(current)) + error = -ERESTARTSYS; + else + error = rb_get(&buf->buf, mem, len); + } + + if (error > 0 && copy_to_user(to, mem, error)) + error = -EFAULT; + + kfree(mem); + out_unlock: + up(&buf->reader_mutex); + out: + return error; +} + + +/* trace_open - Open one of the per-CPU sched_trace buffers. 
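trace_release() above drops the per-CPU buffer when the last reader disconnects, and trace_open() below allocates it on the first open. A minimal userspace sketch of this reference-counted open/release protocol, with a pthread mutex standing in for the kernel semaphore and malloc() for __get_free_pages(); all names are illustrative.

/* Userspace sketch of the first-open-allocates / last-release-frees protocol. */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t reader_mutex = PTHREAD_MUTEX_INITIALIZER;
static int reader_cnt = 0;
static char *buffer = NULL;

static int trace_open_sketch(void)
{
    int err = 0;

    pthread_mutex_lock(&reader_mutex);
    if (++reader_cnt == 1) {            /* first open allocates */
        buffer = malloc(1 << 20);       /* about 1 MB, as in BUFFER_ORDER above */
        if (!buffer) {
            reader_cnt--;               /* roll back on failure */
            err = -1;
        }
    }
    pthread_mutex_unlock(&reader_mutex);
    return err;
}

static void trace_release_sketch(void)
{
    pthread_mutex_lock(&reader_mutex);
    if (--reader_cnt == 0) {            /* last release frees */
        free(buffer);
        buffer = NULL;
    }
    pthread_mutex_unlock(&reader_mutex);
}

int main(void)
{
    if (trace_open_sketch() == 0) {
        printf("buffer allocated, %d reader(s)\n", reader_cnt);
        trace_release_sketch();
    }
    return 0;
}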
+ */ +static int trace_open(struct inode *in, struct file *filp) +{ + int error = -EINVAL; + int cpu = MINOR(in->i_rdev); + trace_buffer_t* buf; + + if (!cpu_online(cpu)) { + printk(KERN_WARNING "sched trace: " + "CPU #%d is not online. (open failed)\n", cpu); + error = -ENODEV; + goto out; + } + + buf = &per_cpu(trace_buffer, cpu); + + if (down_interruptible(&buf->reader_mutex)) { + error = -ERESTARTSYS; + goto out; + } + + /* first open must allocate buffers */ + if (atomic_inc_return(&buf->reader_cnt) == 1) { + if ((error = rb_alloc_buf(&buf->buf, BUFFER_ORDER))) + { + atomic_dec(&buf->reader_cnt); + goto out_unlock; + } + } + + error = 0; + filp->private_data = buf; + + out_unlock: + up(&buf->reader_mutex); + out: + return error; +} + +/* log_open - open the global log message ring buffer. + */ +static int log_open(struct inode *in, struct file *filp) +{ + int error = -EINVAL; + trace_buffer_t* buf; + + buf = &log_buffer; + + if (down_interruptible(&buf->reader_mutex)) { + error = -ERESTARTSYS; + goto out; + } + + /* first open must allocate buffers */ + if (atomic_inc_return(&buf->reader_cnt) == 1) { + if ((error = rb_alloc_buf(&buf->buf, BUFFER_ORDER))) + { + atomic_dec(&buf->reader_cnt); + goto out_unlock; + } + } + + error = 0; + filp->private_data = buf; + + out_unlock: + up(&buf->reader_mutex); + out: + return error; +} + +/******************************************************************************/ +/* Device Registration */ +/******************************************************************************/ + +/* the major numbes are from the unassigned/local use block + * + * This should be converted to dynamic allocation at some point... + */ +#define TRACE_MAJOR 250 +#define LOG_MAJOR 251 + +/* trace_fops - The file operations for accessing the per-CPU scheduling event + * trace buffers. + */ +struct file_operations trace_fops = { + .owner = THIS_MODULE, + .open = trace_open, + .release = trace_release, + .read = trace_read, +}; + +/* log_fops - The file operations for accessing the global LITMUS log message + * buffer. + * + * Except for opening the device file it uses the same operations as trace_fops. 
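Since the driver only registers major numbers 250 (per-CPU trace buffers) and 251 (global log), the corresponding device nodes have to be created by hand before user space can read them. The sketch below shows one plausible way to attach to the global log; the node path /dev/litmus_log is an assumption, any path created with major 251, minor 0 would do.

/* Userspace sketch: create the log device node and stream TRACE() output. */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <unistd.h>

#define LOG_MAJOR 251

int main(void)
{
    char buf[4096];
    ssize_t n;
    int fd;

    /* equivalent of: mknod /dev/litmus_log c 251 0 (ignore "already exists") */
    if (mknod("/dev/litmus_log", S_IFCHR | 0600, makedev(LOG_MAJOR, 0)) &&
        errno != EEXIST) {
        perror("mknod");
        return 1;
    }

    fd = open("/dev/litmus_log", O_RDONLY);  /* first open allocates the ring buffer */
    if (fd < 0) {
        perror("open");
        return 1;
    }

    /* trace_read() blocks until messages are available; loop until interrupted */
    while ((n = read(fd, buf, sizeof(buf))) > 0)
        fwrite(buf, 1, (size_t) n, stdout);

    close(fd);
    return 0;
}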
+ */ +struct file_operations log_fops = { + .owner = THIS_MODULE, + .open = log_open, + .release = trace_release, + .read = trace_read, +}; + +static int __init register_buffer_dev(const char* name, + struct file_operations* fops, + int major, int count) +{ + dev_t trace_dev; + struct cdev *cdev; + int error = 0; + + trace_dev = MKDEV(major, 0); + error = register_chrdev_region(trace_dev, count, name); + if (error) + { + printk(KERN_WARNING "sched trace: " + "Could not register major/minor number %d\n", major); + return error; + } + cdev = cdev_alloc(); + if (!cdev) { + printk(KERN_WARNING "sched trace: " + "Could not get a cdev for %s.\n", name); + return -ENOMEM; + } + cdev->owner = THIS_MODULE; + cdev->ops = fops; + error = cdev_add(cdev, trace_dev, count); + if (error) { + printk(KERN_WARNING "sched trace: " + "add_cdev failed for %s.\n", name); + return -ENOMEM; + } + return error; + +} + +static int __init init_sched_trace(void) +{ + int error1 = 0, error2 = 0; + + printk("Initializing scheduler trace device\n"); + init_buffers(); + + error1 = register_buffer_dev("schedtrace", &trace_fops, + TRACE_MAJOR, NR_CPUS); + + error2 = register_buffer_dev("litmus_log", &log_fops, + LOG_MAJOR, 1); + if (error1 || error2) + return min(error1, error2); + else + return 0; +} + +module_init(init_sched_trace); + +/******************************************************************************/ +/* KERNEL API */ +/******************************************************************************/ + +/* The per-CPU LITMUS log buffer. Don't put it on the stack, it is too big for + * that and the kernel gets very picky with nested interrupts and small stacks. + */ + +#ifdef CONFIG_SCHED_DEBUG_TRACE + +#define MSG_SIZE 255 +static DEFINE_PER_CPU(char[MSG_SIZE], fmt_buffer); + +/* sched_trace_log_message - This is the only function that accesses the the + * log buffer inside the kernel for writing. + * Concurrent access to it is serialized via the + * log_buffer_lock. + * + * The maximum length of a formatted message is 255. + */ +void sched_trace_log_message(const char* fmt, ...) +{ + unsigned long flags; + va_list args; + size_t len; + char* buf; + + va_start(args, fmt); + local_irq_save(flags); + + /* format message */ + buf = __get_cpu_var(fmt_buffer); + len = vscnprintf(buf, MSG_SIZE, fmt, args); + + spin_lock(&log_buffer_lock); + /* Don't copy the trailing null byte, we don't want null bytes + * in a text file. + */ + rb_put(&log_buffer.buf, buf, len); + spin_unlock(&log_buffer_lock); + + local_irq_restore(flags); + va_end(args); +} + +#endif + diff --git a/litmus/sync.c b/litmus/sync.c new file mode 100644 index 0000000..4405228 --- /dev/null +++ b/litmus/sync.c @@ -0,0 +1,84 @@ +/* litmus/sync.c - Support for synchronous and asynchronous task system releases. + * + * + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +static DECLARE_COMPLETION(ts_release); + +static long do_wait_for_ts_release(void) +{ + long ret = 0; + + /* If the interruption races with a release, the completion object + * may have a non-zero counter. To avoid this problem, this should + * be replaced by wait_for_completion(). + * + * For debugging purposes, this is interruptible for now. 
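The formatting path of sched_trace_log_message() above is short: render the message into a fixed 255-byte per-CPU buffer and push only the formatted bytes (no trailing null byte) into the log ring. A userspace sketch of the same idea, with vsnprintf() standing in for vscnprintf() and fwrite() for rb_put() under log_buffer_lock:

/* Userspace sketch of the TRACE() message formatting step. */
#include <stdarg.h>
#include <stdio.h>

#define MSG_SIZE 255

static char fmt_buffer[MSG_SIZE];

static void log_message(const char *fmt, ...)
{
    va_list args;
    int len;

    va_start(args, fmt);
    len = vsnprintf(fmt_buffer, MSG_SIZE, fmt, args);
    va_end(args);

    if (len > MSG_SIZE - 1)
        len = MSG_SIZE - 1;             /* vsnprintf reports would-be length on truncation */

    /* stand-in for rb_put(&log_buffer.buf, buf, len) */
    fwrite(fmt_buffer, 1, (size_t) len, stdout);
}

int main(void)
{
    log_message("job %d released at %lu\n", 3, 1000UL);
    return 0;
}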
+ */ + ret = wait_for_completion_interruptible(&ts_release); + + return ret; +} + + +static long do_release_ts(lt_t start) +{ + int task_count = 0; + long flags; + struct list_head *pos; + struct task_struct *t; + + + spin_lock_irqsave(&ts_release.wait.lock, flags); + + list_for_each(pos, &ts_release.wait.task_list) { + t = (struct task_struct*) list_entry(pos, + struct __wait_queue, + task_list)->private; + task_count++; + release_at(t, start + t->rt_param.task_params.phase); + } + + spin_unlock_irqrestore(&ts_release.wait.lock, flags); + + complete_n(&ts_release, task_count); + + return task_count; +} + + +asmlinkage long sys_wait_for_ts_release(void) +{ + long ret = -EPERM; + struct task_struct *t = current; + + if (is_realtime(t)) + ret = do_wait_for_ts_release(); + + return ret; +} + + +asmlinkage long sys_release_ts(lt_t __user *__delay) +{ + long ret; + lt_t delay; + + /* FIXME: check capabilities... */ + + ret = copy_from_user(&delay, __delay, sizeof(lt_t)); + if (ret == 0) + ret = do_release_ts(sched_clock() + delay); + + return ret; +} diff --git a/litmus/trace.c b/litmus/trace.c new file mode 100644 index 0000000..bcdf103 --- /dev/null +++ b/litmus/trace.c @@ -0,0 +1,302 @@ +#include +#include +#include +#include +#include + +#include + +/******************************************************************************/ +/* Allocation */ +/******************************************************************************/ + +struct ft_buffer* trace_ts_buf = NULL; + +static unsigned int ts_seq_no = 0; + +feather_callback void save_timestamp(unsigned long event) +{ + unsigned int seq_no = fetch_and_inc((int *) &ts_seq_no); + struct timestamp *ts; + if (ft_buffer_start_write(trace_ts_buf, (void**) &ts)) { + ts->event = event; + ts->timestamp = ft_read_tsc(); + ts->seq_no = seq_no; + ts->cpu = raw_smp_processor_id(); + ft_buffer_finish_write(trace_ts_buf, ts); + } +} + +static struct ft_buffer* alloc_ft_buffer(unsigned int count, size_t size) +{ + struct ft_buffer* buf; + size_t total = (size + 1) * count; + char* mem; + int order = 0, pages = 1; + + buf = kmalloc(sizeof(struct ft_buffer), GFP_KERNEL); + if (!buf) + return NULL; + + total = (total / PAGE_SIZE) + (total % PAGE_SIZE != 0); + while (pages < total) { + order++; + pages *= 2; + } + + mem = (char*) __get_free_pages(GFP_KERNEL, order); + if (!mem) { + kfree(buf); + return NULL; + } + + if (!init_ft_buffer(buf, count, size, + mem + (count * size), /* markers at the end */ + mem)) { /* buffer objects */ + free_pages((unsigned long) mem, order); + kfree(buf); + return NULL; + } + return buf; +} + +static void free_ft_buffer(struct ft_buffer* buf) +{ + int order = 0, pages = 1; + size_t total; + + if (buf) { + total = (buf->slot_size + 1) * buf->slot_count; + total = (total / PAGE_SIZE) + (total % PAGE_SIZE != 0); + while (pages < total) { + order++; + pages *= 2; + } + free_pages((unsigned long) buf->buffer_mem, order); + kfree(buf); + } +} + + +/******************************************************************************/ +/* DEVICE FILE DRIVER */ +/******************************************************************************/ + +#define NO_TIMESTAMPS 262144 + +static DECLARE_MUTEX(feather_lock); +static int use_count = 0; + +static int trace_release(struct inode *in, struct file *filp) +{ + int err = -EINVAL; + + if (down_interruptible(&feather_lock)) { + err = -ERESTARTSYS; + goto out; + } + + printk(KERN_ALERT "%s/%d disconnects from feather trace device. 
" + "use_count=%d\n", + current->comm, current->pid, use_count); + + if (use_count == 1) { + /* disable events */ + ft_disable_all_events(); + + /* wait for any pending events to complete */ + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(HZ); + + printk(KERN_ALERT "Failed trace writes: %u\n", + trace_ts_buf->failed_writes); + + free_ft_buffer(trace_ts_buf); + trace_ts_buf = NULL; + } + + use_count--; + up(&feather_lock); +out: + return err; +} + + +static ssize_t trace_read(struct file *filp, char __user *to, size_t len, + loff_t *f_pos) +{ + /* we ignore f_pos, this is strictly sequential */ + ssize_t error = 0; + struct timestamp ts; + + if (down_interruptible(&feather_lock)) { + error = -ERESTARTSYS; + goto out; + } + + + while (len >= sizeof(struct timestamp)) { + if (ft_buffer_read(trace_ts_buf, &ts)) { + if (copy_to_user(to, &ts, sizeof(struct timestamp))) { + error = -EFAULT; + break; + } else { + len -= sizeof(struct timestamp); + to += sizeof(struct timestamp); + error += sizeof(struct timestamp); + } + } else { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(50); + if (signal_pending(current)) { + error = -ERESTARTSYS; + break; + } + } + } + up(&feather_lock); +out: + return error; +} + +#define ENABLE_CMD 0 +#define DISABLE_CMD 1 + +static ssize_t trace_write(struct file *filp, const char __user *from, + size_t len, loff_t *f_pos) +{ + ssize_t error = -EINVAL; + unsigned long cmd; + unsigned long id; + + if (len % sizeof(long) || len < 2 * sizeof(long)) + goto out; + + if (copy_from_user(&cmd, from, sizeof(long))) { + error = -EFAULT; + goto out; + } + len -= sizeof(long); + from += sizeof(long); + + if (cmd != ENABLE_CMD && cmd != DISABLE_CMD) + goto out; + + if (down_interruptible(&feather_lock)) { + error = -ERESTARTSYS; + goto out; + } + + error = sizeof(long); + while (len) { + if (copy_from_user(&id, from, sizeof(long))) { + error = -EFAULT; + goto out; + } + len -= sizeof(long); + from += sizeof(long); + if (cmd) { + printk(KERN_INFO + "Disabling feather-trace event %lu.\n", id); + ft_disable_event(id); + } else { + printk(KERN_INFO + "Enabling feather-trace event %lu.\n", id); + ft_enable_event(id); + } + error += sizeof(long); + } + + up(&feather_lock); + out: + return error; +} + +static int trace_open(struct inode *in, struct file *filp) +{ + int err = 0; + unsigned int count = NO_TIMESTAMPS; + + if (down_interruptible(&feather_lock)) { + err = -ERESTARTSYS; + goto out; + } + + while (count && !trace_ts_buf) { + printk("trace: trying to allocate %u time stamps.\n", count); + trace_ts_buf = alloc_ft_buffer(count, sizeof(struct timestamp)); + count /= 2; + } + if (!trace_ts_buf) + err = -ENOMEM; + else + use_count++; + + up(&feather_lock); +out: + return err; +} + +/******************************************************************************/ +/* Device Registration */ +/******************************************************************************/ + +#define FT_TRACE_MAJOR 252 + +struct file_operations ft_trace_fops = { + .owner = THIS_MODULE, + .open = trace_open, + .release = trace_release, + .write = trace_write, + .read = trace_read, +}; + + +static int __init register_buffer_dev(const char* name, + struct file_operations* fops, + int major, int count) +{ + dev_t trace_dev; + struct cdev *cdev; + int error = 0; + + trace_dev = MKDEV(major, 0); + error = register_chrdev_region(trace_dev, count, name); + if (error) + { + printk(KERN_WARNING "trace: " + "Could not register major/minor number %d\n", major); + return error; + } + cdev = 
cdev_alloc(); + if (!cdev) { + printk(KERN_WARNING "trace: " + "Could not get a cdev for %s.\n", name); + return -ENOMEM; + } + cdev->owner = THIS_MODULE; + cdev->ops = fops; + error = cdev_add(cdev, trace_dev, count); + if (error) { + printk(KERN_WARNING "trace: " + "add_cdev failed for %s.\n", name); + return -ENOMEM; + } + return error; + +} + +static int __init init_sched_trace(void) +{ + int error = 0; + + printk("Initializing Feather-Trace device\n"); + /* dummy entry to make linker happy */ + ft_event0(666, save_timestamp); + + error = register_buffer_dev("ft_trace", &ft_trace_fops, + FT_TRACE_MAJOR, 1); + return error; +} + +module_init(init_sched_trace);
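To round off the Feather-Trace device above, the following userspace sketch shows how a tool might drive it: write a command word (ENABLE_CMD = 0 or DISABLE_CMD = 1) followed by event IDs, then read back binary struct timestamp records. The device node path and the struct timestamp layout shown here are assumptions (the real layout is defined in include/litmus/trace.h); only the command encoding and the blocking read behaviour are taken from trace_write()/trace_read() above.

/* Userspace sketch of a Feather-Trace client. */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

#define ENABLE_CMD  0L
#define DISABLE_CMD 1L

/* assumed layout; see include/litmus/trace.h for the real definition */
struct timestamp_sketch {
    unsigned long long timestamp;
    unsigned int       seq_no;
    unsigned int       event;
    unsigned int       cpu;
};

int main(void)
{
    long cmd[2] = { ENABLE_CMD, 100 };       /* enable event ID 100 (example value) */
    struct timestamp_sketch ts;
    int fd = open("/dev/ft_trace", O_RDWR);  /* node created via mknod with major 252 */

    if (fd < 0) {
        perror("open");
        return 1;
    }
    if (write(fd, cmd, sizeof(cmd)) != (ssize_t) sizeof(cmd))
        perror("write");

    /* read one record; trace_read() blocks until a timestamp is available */
    if (read(fd, &ts, sizeof(ts)) == (ssize_t) sizeof(ts))
        printf("event %u on cpu %u seq %u\n", ts.event, ts.cpu, ts.seq_no);

    cmd[0] = DISABLE_CMD;
    write(fd, cmd, sizeof(cmd));
    close(fd);
    return 0;
}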