Makefile | 2 +- arch/sparc64/Kconfig | 2 + arch/sparc64/kernel/smp.c | 1 + arch/sparc64/kernel/systbls.S | 20 +- arch/x86/Kconfig | 2 + arch/x86/kernel/Makefile_32 | 3 + arch/x86/kernel/ft_event.c | 104 ++++ arch/x86/kernel/smp_32.c | 1 + arch/x86/kernel/syscall_table_32.S | 16 + fs/exec.c | 3 + fs/inode.c | 2 + include/asm-sparc64/feather_trace.h | 22 + include/asm-sparc64/spinlock.h | 113 ++--- include/asm-sparc64/spinlock_types.h | 5 +- include/asm-sparc64/unistd.h | 6 +- include/asm-x86/feather_trace.h | 104 ++++ include/asm-x86/unistd_32.h | 6 +- include/linux/completion.h | 2 +- include/linux/fs.h | 5 + include/linux/sched.h | 11 + include/linux/tick.h | 3 + include/linux/time.h | 4 + include/linux/uaccess.h | 16 + include/litmus/edf_common.h | 25 + include/litmus/fdso.h | 69 +++ include/litmus/feather_buffer.h | 94 ++++ include/litmus/feather_trace.h | 37 ++ include/litmus/ftdev.h | 49 ++ include/litmus/heap.h | 327 +++++++++++++ include/litmus/jobs.h | 9 + include/litmus/litmus.h | 227 +++++++++ include/litmus/norqlock.h | 26 + include/litmus/rt_domain.h | 174 +++++++ include/litmus/rt_param.h | 167 +++++++ include/litmus/sched_plugin.h | 159 ++++++ include/litmus/sched_trace.h | 168 +++++++ include/litmus/trace.h | 103 ++++ include/litmus/unistd.h | 20 + kernel/exit.c | 4 + kernel/fork.c | 8 + kernel/printk.c | 10 +- kernel/sched.c | 96 ++++- kernel/sched_fair.c | 2 +- kernel/sched_rt.c | 2 +- kernel/time/tick-sched.c | 44 ++- litmus/Kconfig | 78 +++ litmus/Makefile | 16 + litmus/edf_common.c | 95 ++++ litmus/fdso.c | 282 +++++++++++ litmus/fmlp.c | 262 ++++++++++ litmus/ft_event.c | 43 ++ litmus/ftdev.c | 352 +++++++++++++ litmus/jobs.c | 43 ++ litmus/litmus.c | 851 ++++++++++++++++++++++++++++++++ litmus/norqlock.c | 56 +++ litmus/rt_domain.c | 289 +++++++++++ litmus/sched_cedf.c | 705 ++++++++++++++++++++++++++ litmus/sched_gsn_edf.c | 728 +++++++++++++++++++++++++++ litmus/sched_litmus.c | 230 +++++++++ litmus/sched_pfair.c | 895 ++++++++++++++++++++++++++++++++++ litmus/sched_plugin.c | 199 ++++++++ litmus/sched_psn_edf.c | 454 +++++++++++++++++ litmus/sched_task_trace.c | 192 ++++++++ litmus/sched_trace.c | 462 ++++++++++++++++++ litmus/srp.c | 318 ++++++++++++ litmus/sync.c | 90 ++++ litmus/trace.c | 83 ++++ 67 files changed, 8910 insertions(+), 86 deletions(-) diff --git a/Makefile b/Makefile index 189d8ef..d9e4495 100644 --- a/Makefile +++ b/Makefile @@ -597,7 +597,7 @@ export mod_strip_cmd ifeq ($(KBUILD_EXTMOD),) -core-y += kernel/ mm/ fs/ ipc/ security/ crypto/ block/ +core-y += kernel/ mm/ fs/ ipc/ security/ crypto/ block/ litmus/ vmlinux-dirs := $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \ $(core-y) $(core-m) $(drivers-y) $(drivers-m) \ diff --git a/arch/sparc64/Kconfig b/arch/sparc64/Kconfig index 10b212a..8d90b5a 100644 --- a/arch/sparc64/Kconfig +++ b/arch/sparc64/Kconfig @@ -471,3 +471,5 @@ source "security/Kconfig" source "crypto/Kconfig" source "lib/Kconfig" + +source "litmus/Kconfig" diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c index c399449..cd2bc7e 100644 --- a/arch/sparc64/kernel/smp.c +++ b/arch/sparc64/kernel/smp.c @@ -1033,6 +1033,7 @@ void smp_receive_signal(int cpu) void smp_receive_signal_client(int irq, struct pt_regs *regs) { clear_softint(1 << irq); + set_tsk_need_resched(current); } void smp_new_mmu_context_version_client(int irq, struct pt_regs *regs) diff --git a/arch/sparc64/kernel/systbls.S b/arch/sparc64/kernel/systbls.S index 06d1090..7fc7615 100644 --- a/arch/sparc64/kernel/systbls.S +++ 
b/arch/sparc64/kernel/systbls.S @@ -82,6 +82,13 @@ sys_call_table32: .word compat_sys_set_mempolicy, compat_sys_kexec_load, compat_sys_move_pages, sys_getcpu, compat_sys_epoll_pwait /*310*/ .word compat_sys_utimensat, compat_sys_signalfd, compat_sys_timerfd, sys_eventfd, compat_sys_fallocate +/*LITMUS, 315*/ + .word sys_set_rt_task_param, sys_get_rt_task_param, sys_complete_job, sys_register_np_flag, sys_exit_np +/*320*/ + .word sys_od_open, sys_od_close, sys_fmlp_down, sys_fmlp_up, sys_srp_down +/*325*/ .word sys_srp_up, sys_query_job_no, sys_wait_for_job_release, sys_wait_for_ts_release, sys_release_ts + + #endif /* CONFIG_COMPAT */ /* Now the 64-bit native Linux syscall table. */ @@ -154,6 +161,12 @@ sys_call_table: .word sys_set_mempolicy, sys_kexec_load, sys_move_pages, sys_getcpu, sys_epoll_pwait /*310*/ .word sys_utimensat, sys_signalfd, sys_timerfd, sys_eventfd, sys_fallocate +/*LITMUS, 315*/ + .word sys_set_rt_task_param, sys_get_rt_task_param, sys_complete_job, sys_register_np_flag, sys_exit_np +/*320*/ + .word sys_od_open, sys_od_close, sys_fmlp_down, sys_fmlp_up, sys_srp_down +/*325*/ .word sys_srp_up, sys_query_job_no, sys_wait_for_job_release, sys_wait_for_ts_release, sys_release_ts + #if defined(CONFIG_SUNOS_EMUL) || defined(CONFIG_SOLARIS_EMUL) || \ defined(CONFIG_SOLARIS_EMUL_MODULE) /* Now the 32-bit SunOS syscall table. */ @@ -271,6 +284,11 @@ sunos_sys_table: .word sunos_nosys, sunos_nosys, sunos_nosys .word sunos_nosys /*310*/ .word sunos_nosys, sunos_nosys, sunos_nosys - .word sunos_nosys, sunos_nosys + .word sunos_nosys, sunos_nosys, sunos_nosys + .word sunos_nosys, sunos_nosys, sunos_nosys + .word sunos_nosys +/*320*/ .word sunos_nosys, sunos_nosys, sunos_nosys + .word sunos_nosys, sunos_nosys, sunos_nosys + .word sunos_nosys, sunos_nosys, sunos_nosys #endif diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 80b7ba4..f99330f 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1620,3 +1620,5 @@ source "security/Kconfig" source "crypto/Kconfig" source "lib/Kconfig" + +source "litmus/Kconfig" diff --git a/arch/x86/kernel/Makefile_32 b/arch/x86/kernel/Makefile_32 index a7bc93c..5f87f32 100644 --- a/arch/x86/kernel/Makefile_32 +++ b/arch/x86/kernel/Makefile_32 @@ -49,6 +49,9 @@ obj-y += pcspeaker.o obj-$(CONFIG_SCx200) += scx200_32.o +obj-$(CONFIG_FEATHER_TRACE) += ft_event.o + + # vsyscall_32.o contains the vsyscall DSO images as __initdata. # We must build both images before we can assemble it. 
# Note: kbuild does not track this dependency due to usage of .incbin diff --git a/arch/x86/kernel/ft_event.c b/arch/x86/kernel/ft_event.c new file mode 100644 index 0000000..b1d80c5 --- /dev/null +++ b/arch/x86/kernel/ft_event.c @@ -0,0 +1,104 @@ +#include + +#include + +/* the feather trace management functions assume + * exclusive access to the event table + */ + + +#define BYTE_JUMP 0xeb +#define BYTE_JUMP_LEN 0x02 + +/* for each event, there is an entry in the event table */ +struct trace_event { + long id; + long count; + long start_addr; + long end_addr; +}; + +extern struct trace_event __start___event_table[]; +extern struct trace_event __stop___event_table[]; + +int ft_enable_event(unsigned long id) +{ + struct trace_event* te = __start___event_table; + int count = 0; + char* delta; + unsigned char* instr; + + while (te < __stop___event_table) { + if (te->id == id && ++te->count == 1) { + instr = (unsigned char*) te->start_addr; + /* make sure we don't clobber something wrong */ + if (*instr == BYTE_JUMP) { + delta = (((unsigned char*) te->start_addr) + 1); + *delta = 0; + } + } + if (te->id == id) + count++; + te++; + } + return count; +} + +int ft_disable_event(unsigned long id) +{ + struct trace_event* te = __start___event_table; + int count = 0; + char* delta; + unsigned char* instr; + + while (te < __stop___event_table) { + if (te->id == id && --te->count == 0) { + instr = (unsigned char*) te->start_addr; + if (*instr == BYTE_JUMP) { + delta = (((unsigned char*) te->start_addr) + 1); + *delta = te->end_addr - te->start_addr - + BYTE_JUMP_LEN; + } + } + if (te->id == id) + count++; + te++; + } + return count; +} + +int ft_disable_all_events(void) +{ + struct trace_event* te = __start___event_table; + int count = 0; + char* delta; + unsigned char* instr; + + while (te < __stop___event_table) { + if (te->count) { + instr = (unsigned char*) te->start_addr; + if (*instr == BYTE_JUMP) { + delta = (((unsigned char*) te->start_addr) + + 1); + *delta = te->end_addr - te->start_addr - + BYTE_JUMP_LEN; + te->count = 0; + count++; + } + } + te++; + } + return count; +} + +int ft_is_event_enabled(unsigned long id) +{ + struct trace_event* te = __start___event_table; + + while (te < __stop___event_table) { + if (te->id == id) + return te->count; + te++; + } + return 0; +} diff --git a/arch/x86/kernel/smp_32.c b/arch/x86/kernel/smp_32.c index fcaa026..1063dfc 100644 --- a/arch/x86/kernel/smp_32.c +++ b/arch/x86/kernel/smp_32.c @@ -641,6 +641,7 @@ static void native_smp_send_stop(void) fastcall void smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); + set_tsk_need_resched(current); __get_cpu_var(irq_stat).irq_resched_count++; } diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index 8344c70..f6fdb0a 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -324,3 +324,19 @@ ENTRY(sys_call_table) .long sys_timerfd .long sys_eventfd .long sys_fallocate + /* LITMUS */ + .long sys_set_rt_task_param /* 325 */ + .long sys_get_rt_task_param + .long sys_complete_job + .long sys_register_np_flag + .long sys_exit_np + .long sys_od_open /* 330 */ + .long sys_od_close + .long sys_fmlp_down + .long sys_fmlp_up + .long sys_srp_down + .long sys_srp_up /* 335 */ + .long sys_query_job_no + .long sys_wait_for_job_release + .long sys_wait_for_ts_release + .long sys_release_ts /* 339 */ diff --git a/fs/exec.c b/fs/exec.c index 282240a..6f47786 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -56,6 +56,8 @@ #include #include +#include + 
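/* The include added above presumably pulls in the LITMUS^RT core header that
 * declares litmus_exec(); the do_execve() hunk further below invokes it right
 * after sched_exec(). (The exact header name is not recoverable from this
 * flattened hunk.)
 */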
#ifdef CONFIG_KMOD #include #endif @@ -1309,6 +1311,7 @@ int do_execve(char * filename, goto out_kfree; sched_exec(); + litmus_exec(); bprm->file = file; bprm->filename = filename; diff --git a/fs/inode.c b/fs/inode.c index ed35383..ef71ea0 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -220,6 +220,8 @@ void inode_init_once(struct inode *inode) INIT_LIST_HEAD(&inode->inotify_watches); mutex_init(&inode->inotify_mutex); #endif + INIT_LIST_HEAD(&inode->i_obj_list); + mutex_init(&inode->i_obj_mutex); } EXPORT_SYMBOL(inode_init_once); diff --git a/include/asm-sparc64/feather_trace.h b/include/asm-sparc64/feather_trace.h new file mode 100644 index 0000000..35ec70f --- /dev/null +++ b/include/asm-sparc64/feather_trace.h @@ -0,0 +1,22 @@ +#ifndef _ARCH_FEATHER_TRACE_H +#define _ARCH_FEATHER_TRACE_H + +#include +#include + +static inline int fetch_and_inc(int *val) +{ + return atomic_add_ret(1, (atomic_t*) val) - 1; +} + +static inline int fetch_and_dec(int *val) +{ + return atomic_sub_ret(1, (atomic_t*) val) + 1; +} + +static inline unsigned long long ft_timestamp(void) +{ + return get_cycles(); +} + +#endif diff --git a/include/asm-sparc64/spinlock.h b/include/asm-sparc64/spinlock.h index 0006fe9..16931d4 100644 --- a/include/asm-sparc64/spinlock.h +++ b/include/asm-sparc64/spinlock.h @@ -15,93 +15,80 @@ * and rebuild your kernel. */ -/* All of these locking primitives are expected to work properly - * even in an RMO memory model, which currently is what the kernel - * runs in. - * - * There is another issue. Because we play games to save cycles - * in the non-contention case, we need to be extra careful about - * branch targets into the "spinning" code. They live in their - * own section, but the newer V9 branches have a shorter range - * than the traditional 32-bit sparc branch variants. The rule - * is that the branches that go into and out of the spinner sections - * must be pre-V9 branches. 
- */ - -#define __raw_spin_is_locked(lp) ((lp)->lock != 0) +#define __raw_spin_is_locked(lp) ((lp)->tail != (lp)->head) #define __raw_spin_unlock_wait(lp) \ do { rmb(); \ - } while((lp)->lock) + } while((lp)->tail != (lp)->head) + + static inline void __raw_spin_lock(raw_spinlock_t *lock) { - unsigned long tmp; - + int ticket, tmp; __asm__ __volatile__( -"1: ldstub [%1], %0\n" -" membar #StoreLoad | #StoreStore\n" -" brnz,pn %0, 2f\n" -" nop\n" -" .subsection 2\n" -"2: ldub [%1], %0\n" -" membar #LoadLoad\n" -" brnz,pt %0, 2b\n" -" nop\n" -" ba,a,pt %%xcc, 1b\n" -" .previous" - : "=&r" (tmp) - : "r" (lock) +"1: lduw [%2], %0 \n" /* read ticket */ +" add %0, 1, %1 \n" +" cas [%2], %0, %1 \n" +" cmp %0, %1 \n" +" be,a,pt %%icc, 2f \n" +" nop \n" +" membar #LoadLoad | #StoreLoad | #LoadStore\n" +" ba 1b\n" +" nop \n" +"2: lduw [%3], %1 \n" +" cmp %0, %1 \n" +" be,a,pt %%icc, 3f \n" +" nop \n" +" membar #LoadLoad | #StoreLoad | #LoadStore\n" +" ba 2b\n" +"3: membar #StoreStore | #StoreLoad" + : "=&r" (ticket), "=&r" (tmp) + : "r" (&lock->tail), "r" (&lock->head) : "memory"); } static inline int __raw_spin_trylock(raw_spinlock_t *lock) { - unsigned long result; - + int tail, head; __asm__ __volatile__( -" ldstub [%1], %0\n" -" membar #StoreLoad | #StoreStore" - : "=r" (result) - : "r" (lock) +" lduw [%2], %0 \n" /* read tail */ +" lduw [%3], %1 \n" /* read head */ +" cmp %0, %1 \n" +" bne,a,pn %%icc, 1f \n" +" nop \n" +" inc %1 \n" +" cas [%2], %0, %1 \n" /* try to inc ticket */ +" membar #StoreStore | #StoreLoad \n" +"1: " + : "=&r" (tail), "=&r" (head) + : "r" (&lock->tail), "r" (&lock->head) : "memory"); - return (result == 0UL); + return tail == head; } static inline void __raw_spin_unlock(raw_spinlock_t *lock) { + int tmp; __asm__ __volatile__( -" membar #StoreStore | #LoadStore\n" -" stb %%g0, [%0]" - : /* No outputs */ - : "r" (lock) +" membar #StoreStore | #LoadStore \n" +" lduw [%1], %0 \n" +" inc %0 \n" +" st %0, [%1] \n" +" membar #StoreStore | #StoreLoad" + : "=&r" (tmp) + : "r" (&lock->head) : "memory"); } -static inline void __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags) -{ - unsigned long tmp1, tmp2; +/* We don't handle this yet, but it looks like not re-enabling the interrupts + * works fine, too. For example, lockdep also does it like this. + */ +#define __raw_spin_lock_flags(l, f) __raw_spin_lock(l) + + - __asm__ __volatile__( -"1: ldstub [%2], %0\n" -" membar #StoreLoad | #StoreStore\n" -" brnz,pn %0, 2f\n" -" nop\n" -" .subsection 2\n" -"2: rdpr %%pil, %1\n" -" wrpr %3, %%pil\n" -"3: ldub [%2], %0\n" -" membar #LoadLoad\n" -" brnz,pt %0, 3b\n" -" nop\n" -" ba,pt %%xcc, 1b\n" -" wrpr %1, %%pil\n" -" .previous" - : "=&r" (tmp1), "=&r" (tmp2) - : "r"(lock), "r"(flags) - : "memory"); -} /* Multi-reader locks, these are much saner than the 32-bit Sparc ones... 
*/ diff --git a/include/asm-sparc64/spinlock_types.h b/include/asm-sparc64/spinlock_types.h index e128112..1a2e24b 100644 --- a/include/asm-sparc64/spinlock_types.h +++ b/include/asm-sparc64/spinlock_types.h @@ -6,10 +6,11 @@ #endif typedef struct { - volatile unsigned char lock; + int tail; + int head; } raw_spinlock_t; -#define __RAW_SPIN_LOCK_UNLOCKED { 0 } +#define __RAW_SPIN_LOCK_UNLOCKED { 0, 0 } typedef struct { volatile unsigned int lock; diff --git a/include/asm-sparc64/unistd.h b/include/asm-sparc64/unistd.h index cb751b4..ebebde6 100644 --- a/include/asm-sparc64/unistd.h +++ b/include/asm-sparc64/unistd.h @@ -333,7 +333,11 @@ #define __NR_eventfd 313 #define __NR_fallocate 314 -#define NR_SYSCALLS 315 +#define __NR_LITMUS 315 + +#include "litmus/unistd.h" + +#define NR_SYSCALLS 315 + NR_litmus_syscalls #ifdef __KERNEL__ /* sysconf options, for SunOS compatibility */ diff --git a/include/asm-x86/feather_trace.h b/include/asm-x86/feather_trace.h new file mode 100644 index 0000000..253067e --- /dev/null +++ b/include/asm-x86/feather_trace.h @@ -0,0 +1,104 @@ +#ifndef _ARCH_FEATHER_TRACE_H +#define _ARCH_FEATHER_TRACE_H + +static inline int fetch_and_inc(int *val) +{ + int ret = 1; + __asm__ __volatile__("lock; xaddl %0, %1" : "+r" (ret), "+m" (*val) : : "memory" ); + return ret; +} + +static inline int fetch_and_dec(int *val) +{ + int ret = -1; + __asm__ __volatile__("lock; xaddl %0, %1" : "+r" (ret), "+m" (*val) : : "memory" ); + return ret; +} + +#define feather_callback __attribute__((regparm(0))) + +/* make the compiler reload any register that is not saved in + * a cdecl function call + */ +#define CLOBBER_LIST "memory", "cc", "eax", "ecx", "edx" + +#define ft_event(id, callback) \ + __asm__ __volatile__( \ + "1: jmp 2f \n\t" \ + " call " #callback " \n\t" \ + ".section __event_table, \"aw\" \n\t" \ + ".long " #id ", 0, 1b, 2f \n\t" \ + ".previous \n\t" \ + "2: \n\t" \ + : : : CLOBBER_LIST) + +#define ft_event0(id, callback) \ + __asm__ __volatile__( \ + "1: jmp 2f \n\t" \ + " subl $4, %%esp \n\t" \ + " movl $" #id ", (%%esp) \n\t" \ + " call " #callback " \n\t" \ + " addl $4, %%esp \n\t" \ + ".section __event_table, \"aw\" \n\t" \ + ".long " #id ", 0, 1b, 2f \n\t" \ + ".previous \n\t" \ + "2: \n\t" \ + : : : CLOBBER_LIST) + +#define ft_event1(id, callback, param) \ + __asm__ __volatile__( \ + "1: jmp 2f \n\t" \ + " subl $8, %%esp \n\t" \ + " movl %0, 4(%%esp) \n\t" \ + " movl $" #id ", (%%esp) \n\t" \ + " call " #callback " \n\t" \ + " addl $8, %%esp \n\t" \ + ".section __event_table, \"aw\" \n\t" \ + ".long " #id ", 0, 1b, 2f \n\t" \ + ".previous \n\t" \ + "2: \n\t" \ + : : "r" (param) : CLOBBER_LIST) + +#define ft_event2(id, callback, param, param2) \ + __asm__ __volatile__( \ + "1: jmp 2f \n\t" \ + " subl $12, %%esp \n\t" \ + " movl %1, 8(%%esp) \n\t" \ + " movl %0, 4(%%esp) \n\t" \ + " movl $" #id ", (%%esp) \n\t" \ + " call " #callback " \n\t" \ + " addl $12, %%esp \n\t" \ + ".section __event_table, \"aw\" \n\t" \ + ".long " #id ", 0, 1b, 2f \n\t" \ + ".previous \n\t" \ + "2: \n\t" \ + : : "r" (param), "r" (param2) : CLOBBER_LIST) + + +#define ft_event3(id, callback, p, p2, p3) \ + __asm__ __volatile__( \ + "1: jmp 2f \n\t" \ + " subl $16, %%esp \n\t" \ + " movl %1, 12(%%esp) \n\t" \ + " movl %1, 8(%%esp) \n\t" \ + " movl %0, 4(%%esp) \n\t" \ + " movl $" #id ", (%%esp) \n\t" \ + " call " #callback " \n\t" \ + " addl $16, %%esp \n\t" \ + ".section __event_table, \"aw\" \n\t" \ + ".long " #id ", 0, 1b, 2f \n\t" \ + ".previous \n\t" \ + "2: \n\t" \ + : : "r" (p), "r" (p2), 
"r" (p3) : CLOBBER_LIST) + + +static inline unsigned long long ft_timestamp(void) +{ + unsigned long long ret; + __asm__ __volatile__("rdtsc" : "=A" (ret)); + return ret; +} + +#define __ARCH_HAS_FEATHER_TRACE + +#endif diff --git a/include/asm-x86/unistd_32.h b/include/asm-x86/unistd_32.h index 9b15545..36fec84 100644 --- a/include/asm-x86/unistd_32.h +++ b/include/asm-x86/unistd_32.h @@ -331,9 +331,13 @@ #define __NR_eventfd 323 #define __NR_fallocate 324 +#define __NR_LITMUS 325 + +#include "litmus/unistd.h" + #ifdef __KERNEL__ -#define NR_syscalls 325 +#define NR_syscalls 324 + NR_litmus_syscalls #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR diff --git a/include/linux/completion.h b/include/linux/completion.h index 33d6aaf..5b55e97 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h @@ -51,7 +51,7 @@ extern unsigned long wait_for_completion_interruptible_timeout( extern void complete(struct completion *); extern void complete_all(struct completion *); - +extern void complete_n(struct completion *, int n); #define INIT_COMPLETION(x) ((x).done = 0) #endif diff --git a/include/linux/fs.h b/include/linux/fs.h index b3ec4a4..22f856c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -588,6 +588,8 @@ static inline int mapping_writably_mapped(struct address_space *mapping) #define i_size_ordered_init(inode) do { } while (0) #endif +struct inode_obj_id_table; + struct inode { struct hlist_node i_hash; struct list_head i_list; @@ -653,6 +655,9 @@ struct inode { void *i_security; #endif void *i_private; /* fs or device private pointer */ + + struct list_head i_obj_list; + struct mutex i_obj_mutex; }; /* diff --git a/include/linux/sched.h b/include/linux/sched.h index cc14656..76e28f1 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -37,6 +37,7 @@ #define SCHED_BATCH 3 /* SCHED_ISO: reserved but not implemented yet */ #define SCHED_IDLE 5 +#define SCHED_LITMUS 6 #ifdef __KERNEL__ @@ -91,6 +92,8 @@ struct sched_param { #include +#include + struct exec_domain; struct futex_pi_state; struct bio; @@ -914,6 +917,8 @@ struct sched_entity { #endif }; +struct od_table_entry; + struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ void *stack; @@ -1178,6 +1183,12 @@ struct task_struct { int make_it_fail; #endif struct prop_local_single dirties; + + /* litmus parameters and state */ + struct rt_param rt_param; + + /* references to PI semaphores, etc. */ + struct od_table_entry* od_table; }; /* diff --git a/include/linux/tick.h b/include/linux/tick.h index f4a1395..7eae358 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -64,6 +64,9 @@ extern int tick_is_oneshot_available(void); extern struct tick_device *tick_get_device(int cpu); # ifdef CONFIG_HIGH_RES_TIMERS +#define LINUX_DEFAULT_TICKS 0 +#define LITMUS_ALIGNED_TICKS 1 +#define LITMUS_STAGGERED_TICKS 2 extern int tick_init_highres(void); extern int tick_program_event(ktime_t expires, int force); extern void tick_setup_sched_timer(void); diff --git a/include/linux/time.h b/include/linux/time.h index b04136d..3e8fd9e 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -173,6 +173,10 @@ static inline void timespec_add_ns(struct timespec *a, u64 ns) { ns += a->tv_nsec; while(unlikely(ns >= NSEC_PER_SEC)) { + /* The following asm() prevents the compiler from + * optimising this loop into a modulo operation. 
*/ + asm("" : "+r"(ns)); + ns -= NSEC_PER_SEC; a->tv_sec++; } diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 975c963..6ae0ff9 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -84,4 +84,20 @@ static inline unsigned long __copy_from_user_nocache(void *to, ret; \ }) +/* This is a naive attempt at a write version of the above native Linux macro. + */ +#define poke_kernel_address(val, addr) \ + ({ \ + long ret; \ + mm_segment_t old_fs = get_fs(); \ + \ + set_fs(KERNEL_DS); \ + pagefault_disable(); \ + ret = __put_user(val, (__force typeof(val) __user *)(addr)); \ + pagefault_enable(); \ + set_fs(old_fs); \ + ret; \ + }) + + #endif /* __LINUX_UACCESS_H__ */ diff --git a/include/litmus/edf_common.h b/include/litmus/edf_common.h new file mode 100644 index 0000000..32dcf9b --- /dev/null +++ b/include/litmus/edf_common.h @@ -0,0 +1,25 @@ +/* EDF common data structures and utility functions shared by all EDF + * based scheduler plugins + */ + +/* CLEANUP: Add comments and make it less messy. + * + */ + +#ifndef __UNC_EDF_COMMON_H__ +#define __UNC_EDF_COMMON_H__ + +#include + + +void edf_domain_init(rt_domain_t* rt, check_resched_needed_t resched, + release_jobs_t release); + +int edf_higher_prio(struct task_struct* first, + struct task_struct* second); + +int edf_preemption_needed(rt_domain_t* rt, struct task_struct *t); + +int edf_set_hp_task(struct pi_semaphore *sem); +int edf_set_hp_cpu_task(struct pi_semaphore *sem, int cpu); +#endif diff --git a/include/litmus/fdso.h b/include/litmus/fdso.h new file mode 100644 index 0000000..286e10f --- /dev/null +++ b/include/litmus/fdso.h @@ -0,0 +1,69 @@ +/* fdso.h - file descriptor attached shared objects + * + * (c) 2007 B. Brandenburg, LITMUS^RT project + */ + +#ifndef _LINUX_FDSO_H_ +#define _LINUX_FDSO_H_ + +#include +#include + +#include + +#define MAX_OBJECT_DESCRIPTORS 32 + +typedef enum { + MIN_OBJ_TYPE = 0, + + FMLP_SEM = 0, + SRP_SEM = 1, + + MAX_OBJ_TYPE = 1 +} obj_type_t; + +struct inode_obj_id { + struct list_head list; + atomic_t count; + struct inode* inode; + + obj_type_t type; + void* obj; + unsigned int id; +}; + + +struct od_table_entry { + unsigned int used; + + struct inode_obj_id* obj; + void* extra; +}; + +struct fdso_ops { + void* (*create) (void); + void (*destroy)(void*); + int (*open) (struct od_table_entry*, void* __user); + int (*close) (struct od_table_entry*); +}; + +/* translate a userspace supplied od into the raw table entry + * returns NULL if od is invalid + */ +struct od_table_entry* __od_lookup(int od); + +/* translate a userspace supplied od into the associated object + * returns NULL if od is invalid + */ +static inline void* od_lookup(int od, obj_type_t type) +{ + struct od_table_entry* e = __od_lookup(od); + return e && e->obj->type == type ? 
e->obj->obj : NULL; +} + +#define lookup_fmlp_sem(od)((struct pi_semaphore*) od_lookup(od, FMLP_SEM)) +#define lookup_srp_sem(od) ((struct srp_semaphore*) od_lookup(od, SRP_SEM)) +#define lookup_ics(od) ((struct ics*) od_lookup(od, ICS_ID)) + + +#endif diff --git a/include/litmus/feather_buffer.h b/include/litmus/feather_buffer.h new file mode 100644 index 0000000..6c18277 --- /dev/null +++ b/include/litmus/feather_buffer.h @@ -0,0 +1,94 @@ +#ifndef _FEATHER_BUFFER_H_ +#define _FEATHER_BUFFER_H_ + +/* requires UINT_MAX and memcpy */ + +#define SLOT_FREE 0 +#define SLOT_BUSY 1 +#define SLOT_READY 2 + +struct ft_buffer { + unsigned int slot_count; + unsigned int slot_size; + + int free_count; + unsigned int write_idx; + unsigned int read_idx; + + char* slots; + void* buffer_mem; + unsigned int failed_writes; +}; + +static inline int init_ft_buffer(struct ft_buffer* buf, + unsigned int slot_count, + unsigned int slot_size, + char* slots, + void* buffer_mem) +{ + int i = 0; + if (!slot_count || UINT_MAX % slot_count != slot_count - 1) { + /* The slot count must divide UNIT_MAX + 1 so that when it + * wraps around the index correctly points to 0. + */ + return 0; + } else { + buf->slot_count = slot_count; + buf->slot_size = slot_size; + buf->slots = slots; + buf->buffer_mem = buffer_mem; + buf->free_count = slot_count; + buf->write_idx = 0; + buf->read_idx = 0; + buf->failed_writes = 0; + for (i = 0; i < slot_count; i++) + buf->slots[i] = SLOT_FREE; + return 1; + } +} + +static inline int ft_buffer_start_write(struct ft_buffer* buf, void **ptr) +{ + int free = fetch_and_dec(&buf->free_count); + unsigned int idx; + if (free <= 0) { + fetch_and_inc(&buf->free_count); + *ptr = 0; + fetch_and_inc(&buf->failed_writes); + return 0; + } else { + idx = fetch_and_inc((int*) &buf->write_idx) % buf->slot_count; + buf->slots[idx] = SLOT_BUSY; + *ptr = ((char*) buf->buffer_mem) + idx * buf->slot_size; + return 1; + } +} + +static inline void ft_buffer_finish_write(struct ft_buffer* buf, void *ptr) +{ + unsigned int idx = ((char*) ptr - (char*) buf->buffer_mem) / buf->slot_size; + buf->slots[idx] = SLOT_READY; +} + + +/* exclusive reader access is assumed */ +static inline int ft_buffer_read(struct ft_buffer* buf, void* dest) +{ + unsigned int idx; + if (buf->free_count == buf->slot_count) + /* nothing available */ + return 0; + idx = buf->read_idx % buf->slot_count; + if (buf->slots[idx] == SLOT_READY) { + memcpy(dest, ((char*) buf->buffer_mem) + idx * buf->slot_size, + buf->slot_size); + buf->slots[idx] = SLOT_FREE; + buf->read_idx++; + fetch_and_inc(&buf->free_count); + return 1; + } else + return 0; +} + + +#endif diff --git a/include/litmus/feather_trace.h b/include/litmus/feather_trace.h new file mode 100644 index 0000000..f8fb7ba --- /dev/null +++ b/include/litmus/feather_trace.h @@ -0,0 +1,37 @@ +#ifndef _FEATHER_TRACE_H_ +#define _FEATHER_TRACE_H_ + +#include + +int ft_enable_event(unsigned long id); +int ft_disable_event(unsigned long id); +int ft_is_event_enabled(unsigned long id); +int ft_disable_all_events(void); + +#ifndef __ARCH_HAS_FEATHER_TRACE +/* provide default implementation */ + +#define feather_callback + +#define MAX_EVENTS 1024 + +extern int ft_events[MAX_EVENTS]; + +#define ft_event(id, callback) \ + if (ft_events[id]) callback(); + +#define ft_event0(id, callback) \ + if (ft_events[id]) callback(id); + +#define ft_event1(id, callback, param) \ + if (ft_events[id]) callback(id, param); + +#define ft_event2(id, callback, param, param2) \ + if (ft_events[id]) callback(id, param, 
param2); + +#define ft_event3(id, callback, p, p2, p3) \ + if (ft_events[id]) callback(id, p, p2, p3); +#endif + + +#endif diff --git a/include/litmus/ftdev.h b/include/litmus/ftdev.h new file mode 100644 index 0000000..7697b46 --- /dev/null +++ b/include/litmus/ftdev.h @@ -0,0 +1,49 @@ +#ifndef _LITMUS_FTDEV_H_ +#define _LITMUS_FTDEV_H_ + +#include +#include +#include +#include + +#define MAX_FTDEV_MINORS NR_CPUS + +#define FTDEV_ENABLE_CMD 0 +#define FTDEV_DISABLE_CMD 1 + +struct ftdev; + +/* return 0 if buffer can be opened, otherwise -$REASON */ +typedef int (*ftdev_can_open_t)(struct ftdev* dev, unsigned int buf_no); +/* return 0 on success, otherwise -$REASON */ +typedef int (*ftdev_alloc_t)(struct ftdev* dev, unsigned int buf_no); +typedef void (*ftdev_free_t)(struct ftdev* dev, unsigned int buf_no); + + +struct ftdev_event; + +struct ftdev_minor { + struct ft_buffer* buf; + unsigned int readers; + struct mutex lock; + /* FIXME: filter for authorized events */ + struct ftdev_event* events; +}; + +struct ftdev { + struct cdev cdev; + /* FIXME: don't waste memory, allocate dynamically */ + struct ftdev_minor minor[MAX_FTDEV_MINORS]; + unsigned int minor_cnt; + ftdev_alloc_t alloc; + ftdev_free_t free; + ftdev_can_open_t can_open; +}; + +struct ft_buffer* alloc_ft_buffer(unsigned int count, size_t size); +void free_ft_buffer(struct ft_buffer* buf); + +void ftdev_init(struct ftdev* ftdev, struct module* owner); +int register_ftdev(struct ftdev* ftdev, const char* name, int major); + +#endif diff --git a/include/litmus/heap.h b/include/litmus/heap.h new file mode 100644 index 0000000..e5b4746 --- /dev/null +++ b/include/litmus/heap.h @@ -0,0 +1,327 @@ +/* heaps.h -- Binomial Heaps + * + * (c) 2008 Bjoern Brandenburg + */ + +#ifndef HEAP_H +#define HEAP_H + +#define NOT_IN_HEAP UINT_MAX + +struct heap_node { + struct heap_node* parent; + struct heap_node* next; + struct heap_node* child; + + unsigned int degree; + void* value; + struct heap_node** ref; +}; + +struct heap { + struct heap_node* head; + /* We cache the minimum of the heap. + * This speeds up repeated peek operations. + */ + struct heap_node* min; +}; + +typedef int (*heap_prio_t)(struct heap_node* a, struct heap_node* b); + +static inline void heap_init(struct heap* heap) +{ + heap->head = NULL; + heap->min = NULL; +} + +static inline void heap_node_init(struct heap_node** _h, void* value) +{ + struct heap_node* h = *_h; + h->parent = NULL; + h->next = NULL; + h->child = NULL; + h->degree = NOT_IN_HEAP; + h->value = value; + h->ref = _h; +} + +struct heap_node* heap_node_alloc(int gfp_flags); +void heap_node_free(struct heap_node* hn); + +static inline int heap_node_in_heap(struct heap_node* h) +{ + return h->degree != NOT_IN_HEAP; +} + +static inline int heap_empty(struct heap* heap) +{ + return heap->head == NULL && heap->min == NULL; +} + +/* make child a subtree of root */ +static inline void __heap_link(struct heap_node* root, + struct heap_node* child) +{ + child->parent = root; + child->next = root->child; + root->child = child; + root->degree++; +} + +/* merge root lists */ +static inline struct heap_node* __heap_merge(struct heap_node* a, + struct heap_node* b) +{ + struct heap_node* head = NULL; + struct heap_node** pos = &head; + + while (a && b) { + if (a->degree < b->degree) { + *pos = a; + a = a->next; + } else { + *pos = b; + b = b->next; + } + pos = &(*pos)->next; + } + if (a) + *pos = a; + else + *pos = b; + return head; +} + +/* reverse a linked list of nodes. 
also clears parent pointer */ +static inline struct heap_node* __heap_reverse(struct heap_node* h) +{ + struct heap_node* tail = NULL; + struct heap_node* next; + + if (!h) + return h; + + h->parent = NULL; + while (h->next) { + next = h->next; + h->next = tail; + tail = h; + h = next; + h->parent = NULL; + } + h->next = tail; + return h; +} + +static inline void __heap_min(heap_prio_t higher_prio, struct heap* heap, + struct heap_node** prev, struct heap_node** node) +{ + struct heap_node *_prev, *cur; + *prev = NULL; + + if (!heap->head) { + *node = NULL; + return; + } + + *node = heap->head; + _prev = heap->head; + cur = heap->head->next; + while (cur) { + if (higher_prio(cur, *node)) { + *node = cur; + *prev = _prev; + } + _prev = cur; + cur = cur->next; + } +} + +static inline void __heap_union(heap_prio_t higher_prio, struct heap* heap, + struct heap_node* h2) +{ + struct heap_node* h1; + struct heap_node *prev, *x, *next; + if (!h2) + return; + h1 = heap->head; + if (!h1) { + heap->head = h2; + return; + } + h1 = __heap_merge(h1, h2); + prev = NULL; + x = h1; + next = x->next; + while (next) { + if (x->degree != next->degree || + (next->next && next->next->degree == x->degree)) { + /* nothing to do, advance */ + prev = x; + x = next; + } else if (higher_prio(x, next)) { + /* x becomes the root of next */ + x->next = next->next; + __heap_link(x, next); + } else { + /* next becomes the root of x */ + if (prev) + prev->next = next; + else + h1 = next; + __heap_link(next, x); + x = next; + } + next = x->next; + } + heap->head = h1; +} + +static inline struct heap_node* __heap_extract_min(heap_prio_t higher_prio, + struct heap* heap) +{ + struct heap_node *prev, *node; + __heap_min(higher_prio, heap, &prev, &node); + if (!node) + return NULL; + if (prev) + prev->next = node->next; + else + heap->head = node->next; + __heap_union(higher_prio, heap, __heap_reverse(node->child)); + return node; +} + +/* insert (and reinitialize) a node into the heap */ +static inline void heap_insert(heap_prio_t higher_prio, struct heap* heap, + struct heap_node* node) +{ + struct heap_node *min; + node->child = NULL; + node->parent = NULL; + node->next = NULL; + node->degree = 0; + if (heap->min && higher_prio(node, heap->min)) { + /* swap min cache */ + min = heap->min; + min->child = NULL; + min->parent = NULL; + min->next = NULL; + min->degree = 0; + __heap_union(higher_prio, heap, min); + heap->min = node; + } else + __heap_union(higher_prio, heap, node); +} + +static inline void __uncache_min(heap_prio_t higher_prio, struct heap* heap) +{ + struct heap_node* min; + if (heap->min) { + min = heap->min; + heap->min = NULL; + heap_insert(higher_prio, heap, min); + } +} + +/* merge addition into target */ +static inline void heap_union(heap_prio_t higher_prio, + struct heap* target, struct heap* addition) +{ + /* first insert any cached minima, if necessary */ + __uncache_min(higher_prio, target); + __uncache_min(higher_prio, addition); + __heap_union(higher_prio, target, addition->head); + /* this is a destructive merge */ + addition->head = NULL; +} + +static inline struct heap_node* heap_peek(heap_prio_t higher_prio, + struct heap* heap) +{ + if (!heap->min) + heap->min = __heap_extract_min(higher_prio, heap); + return heap->min; +} + +static inline struct heap_node* heap_take(heap_prio_t higher_prio, + struct heap* heap) +{ + struct heap_node *node; + if (!heap->min) + heap->min = __heap_extract_min(higher_prio, heap); + node = heap->min; + heap->min = NULL; + if (node) + node->degree = NOT_IN_HEAP; + 
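/* At this point 'node' is the highest-priority element: either the cached
 * minimum or the one just extracted from the root list. Resetting its degree
 * to NOT_IN_HEAP is what lets heap_node_in_heap() -- and therefore
 * is_queued() in rt_domain.h -- report that the node has left the heap.
 */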
return node; +} + +static inline void heap_delete(heap_prio_t higher_prio, struct heap* heap, + struct heap_node* node) +{ + struct heap_node *parent, *prev, *pos; + struct heap_node** tmp_ref; + void* tmp; + + if (heap->min != node) { + /* bubble up */ + parent = node->parent; + while (parent) { + /* swap parent and node */ + tmp = parent->value; + parent->value = node->value; + node->value = tmp; + /* swap references */ + *(parent->ref) = node; + *(node->ref) = parent; + tmp_ref = parent->ref; + parent->ref = node->ref; + node->ref = tmp_ref; + /* step up */ + node = parent; + parent = node->parent; + } + /* now delete: + * first find prev */ + prev = NULL; + pos = heap->head; + while (pos != node) { + prev = pos; + pos = pos->next; + } + /* we have prev, now remove node */ + if (prev) + prev->next = node->next; + else + heap->head = node->next; + __heap_union(higher_prio, heap, __heap_reverse(node->child)); + } else + heap->min = NULL; + node->degree = NOT_IN_HEAP; +} + +/* allocate a heap node for value and insert into the heap */ +static inline int heap_add(heap_prio_t higher_prio, struct heap* heap, + void* value, int gfp_flags) +{ + struct heap_node* hn = heap_node_alloc(gfp_flags); + if (likely(hn)) { + heap_node_init(&hn, value); + heap_insert(higher_prio, heap, hn); + } + return hn != NULL; +} + +static inline void* heap_take_del(heap_prio_t higher_prio, + struct heap* heap) +{ + struct heap_node* hn = heap_take(higher_prio, heap); + void* ret = NULL; + if (hn) { + ret = hn->value; + heap_node_free(hn); + } + return ret; +} +#endif diff --git a/include/litmus/jobs.h b/include/litmus/jobs.h new file mode 100644 index 0000000..9bd361e --- /dev/null +++ b/include/litmus/jobs.h @@ -0,0 +1,9 @@ +#ifndef __LITMUS_JOBS_H__ +#define __LITMUS_JOBS_H__ + +void prepare_for_next_period(struct task_struct *t); +void release_at(struct task_struct *t, lt_t start); +long complete_job(void); + +#endif + diff --git a/include/litmus/litmus.h b/include/litmus/litmus.h new file mode 100644 index 0000000..7ef5a62 --- /dev/null +++ b/include/litmus/litmus.h @@ -0,0 +1,227 @@ +/* + * Constant definitions related to + * scheduling policy. + */ + +#ifndef _LINUX_LITMUS_H_ +#define _LINUX_LITMUS_H_ + +#include +#include + +/* RT mode start time */ +extern volatile unsigned long rt_start_time; + +extern atomic_t __log_seq_no; + +#define TRACE(fmt, args...) \ + sched_trace_log_message("%d P%d: " fmt, atomic_add_return(1, &__log_seq_no), \ + raw_smp_processor_id(), ## args) + +#define TRACE_TASK(t, fmt, args...) \ + TRACE("(%s/%d) " fmt, (t)->comm, (t)->pid, ##args) + +#define TRACE_CUR(fmt, args...) \ + TRACE_TASK(current, fmt, ## args) + +#define TRACE_BUG_ON(cond) \ + do { if (cond) TRACE("BUG_ON(%s) at %s:%d " \ + "called from %p current=%s/%d state=%d " \ + "flags=%x partition=%d cpu=%d rtflags=%d"\ + " job=%u knp=%d timeslice=%u\n", \ + #cond, __FILE__, __LINE__, __builtin_return_address(0), current->comm, \ + current->pid, current->state, current->flags, \ + get_partition(current), smp_processor_id(), get_rt_flags(current), \ + current->rt_param.job_params.job_no, current->rt_param.kernel_np, \ + current->time_slice\ + ); } while(0); + + +/* in_list - is a given list_head queued on some list? 
+ */ +static inline int in_list(struct list_head* list) +{ + return !( /* case 1: deleted */ + (list->next == LIST_POISON1 && + list->prev == LIST_POISON2) + || + /* case 2: initialized */ + (list->next == list && + list->prev == list) + ); +} + +typedef int (*list_cmp_t)(struct list_head*, struct list_head*); + +static inline unsigned int list_insert(struct list_head* new, + struct list_head* head, + list_cmp_t order_before) +{ + struct list_head *pos; + unsigned int passed = 0; + + BUG_ON(!new); + + /* find a spot where the new entry is less than the next */ + list_for_each(pos, head) { + if (unlikely(order_before(new, pos))) { + /* pos is not less than new, thus insert here */ + __list_add(new, pos->prev, pos); + goto out; + } + passed++; + } + /* if we get to this point either the list is empty or every entry + * queued element is less than new. + * Let's add new to the end. */ + list_add_tail(new, head); + out: + return passed; +} + +void list_qsort(struct list_head* list, list_cmp_t less_than); + + +#define RT_PREEMPTIVE 0x2050 /* = NP */ +#define RT_NON_PREEMPTIVE 0x4e50 /* = P */ +#define RT_EXIT_NP_REQUESTED 0x5251 /* = RQ */ + + +/* kill naughty tasks + */ +void scheduler_signal(struct task_struct *t, unsigned int signal); +void send_scheduler_signals(void); +void np_mem_kill(struct task_struct *t); + +void litmus_fork(struct task_struct *tsk); +void litmus_exec(void); +/* clean up real-time state of a task */ +void exit_litmus(struct task_struct *dead_tsk); + +long litmus_admit_task(struct task_struct *tsk); +void litmus_exit_task(struct task_struct *tsk); + +#define is_realtime(t) ((t)->policy == SCHED_LITMUS) +#define rt_transition_pending(t) \ + ((t)->rt_param.transition_pending) + +#define tsk_rt(t) (&(t)->rt_param) + +/* Realtime utility macros */ +#define get_rt_flags(t) (tsk_rt(t)->flags) +#define set_rt_flags(t,f) (tsk_rt(t)->flags=(f)) +#define get_exec_cost(t) (tsk_rt(t)->task_params.exec_cost) +#define get_exec_time(t) (tsk_rt(t)->job_params.exec_time) +#define get_rt_period(t) (tsk_rt(t)->task_params.period) +#define get_rt_phase(t) (tsk_rt(t)->task_params.phase) +#define get_partition(t) (tsk_rt(t)->task_params.cpu) +#define get_deadline(t) (tsk_rt(t)->job_params.deadline) +#define get_class(t) (tsk_rt(t)->task_params.cls) + +inline static int budget_exhausted(struct task_struct* t) +{ + return get_exec_time(t) >= get_exec_cost(t); +} + + +#define is_hrt(t) \ + (tsk_rt(t)->task_params.class == RT_CLASS_HARD) +#define is_srt(t) \ + (tsk_rt(t)->task_params.class == RT_CLASS_SOFT) +#define is_be(t) \ + (tsk_rt(t)->task_params.class == RT_CLASS_BEST_EFFORT) + +#define get_release(t) (tsk_rt(t)->job_params.release) + +/* Our notion of time within LITMUS: kernel monotonic time. */ +static inline lt_t litmus_clock(void) +{ + return ktime_to_ns(ktime_get()); +} + +/* A macro to convert from nanoseconds to ktime_t. */ +#define ns_to_ktime(t) ktime_add_ns(ktime_set(0, 0), t) + +#define get_domain(t) (tsk_rt(t)->domain) + +/* Honor the flag in the preempt_count variable that is set + * when scheduling is in progress. 
+ */ +#define is_running(t) \ + ((t)->state == TASK_RUNNING || \ + task_thread_info(t)->preempt_count & PREEMPT_ACTIVE) + +#define is_blocked(t) \ + (!is_running(t)) +#define is_released(t, now) \ + (lt_before_eq(get_release(t), now)) +#define is_tardy(t, now) \ + (lt_before_eq(tsk_rt(t)->job_params.deadline, now)) + +/* real-time comparison macros */ +#define earlier_deadline(a, b) (lt_before(\ + (a)->rt_param.job_params.deadline,\ + (b)->rt_param.job_params.deadline)) +#define earlier_release(a, b) (lt_before(\ + (a)->rt_param.job_params.release,\ + (b)->rt_param.job_params.release)) + +#define make_np(t) do {t->rt_param.kernel_np++;} while(0); +#define take_np(t) do {t->rt_param.kernel_np--;} while(0); + +#ifdef CONFIG_SRP +void srp_ceiling_block(void); +#else +#define srp_ceiling_block() /* nothing */ +#endif + +#define heap2task(hn) ((struct task_struct*) hn->value) + + +#ifdef CONFIG_NP_SECTION +/* returns 1 if task t has registered np flag and set it to RT_NON_PREEMPTIVE + */ +int is_np(struct task_struct *t); + +/* request that the task should call sys_exit_np() + */ +void request_exit_np(struct task_struct *t); + +#else + +static inline int is_np(struct task_struct *t) +{ + return tsk_rt(t)->kernel_np; +} + +#define request_exit_np(t) + +#endif + +/* make the unit explicit */ +typedef unsigned long quanta_t; + +enum round { + FLOOR, + CEIL +}; + + +/* Tick period is used to convert ns-specified execution + * costs and periods into tick-based equivalents. + */ +extern ktime_t tick_period; + +static inline quanta_t time2quanta(lt_t time, enum round round) +{ + s64 quantum_length = ktime_to_ns(tick_period); + + if (do_div(time, quantum_length) && round == CEIL) + time++; + return (quanta_t) time; +} + +/* By how much is cpu staggered behind CPU 0? */ +u64 cpu_stagger_offset(int cpu); + +#endif diff --git a/include/litmus/norqlock.h b/include/litmus/norqlock.h new file mode 100644 index 0000000..e4c1d06 --- /dev/null +++ b/include/litmus/norqlock.h @@ -0,0 +1,26 @@ +#ifndef NORQLOCK_H +#define NORQLOCK_H + +typedef void (*work_t)(unsigned long arg); + +struct no_rqlock_work { + int active; + work_t work; + unsigned long arg; + struct no_rqlock_work* next; +}; + +void init_no_rqlock_work(struct no_rqlock_work* w, work_t work, + unsigned long arg); + +void __do_without_rqlock(struct no_rqlock_work *work); + +static inline void do_without_rqlock(struct no_rqlock_work *work) +{ + if (!test_and_set_bit(0, (void*)&work->active)) + __do_without_rqlock(work); +} + +void tick_no_rqlock(void); + +#endif diff --git a/include/litmus/rt_domain.h b/include/litmus/rt_domain.h new file mode 100644 index 0000000..7356ec7 --- /dev/null +++ b/include/litmus/rt_domain.h @@ -0,0 +1,174 @@ +/* CLEANUP: Add comments and make it less messy. 
+ * + */ + +#ifndef __UNC_RT_DOMAIN_H__ +#define __UNC_RT_DOMAIN_H__ + +#include +#include + +#define RELEASE_QUEUE_SLOTS 127 /* prime */ + +struct _rt_domain; + +typedef int (*check_resched_needed_t)(struct _rt_domain *rt); +typedef void (*release_jobs_t)(struct _rt_domain *rt, struct heap* tasks); + +int heap_earlier_release(struct heap_node *_a, struct heap_node *_b); + +struct release_heap { + struct list_head list; + lt_t release_time; + struct heap heap; +}; + +struct release_queue { + /* each slot maintains a list of release heaps sorted + * by release time */ + struct list_head slot[RELEASE_QUEUE_SLOTS]; + /* the heap of heaps ordered by release time */ + struct heap rel_heap; + /* the actual timer used to trigger releases */ + struct hrtimer timer; + /* used to determine when to start the timer */ + int timer_armed; + /* when will it go off? */ + lt_t timer_time; +}; + +typedef struct _rt_domain { + struct no_rqlock_work arm_timer; + + /* runnable rt tasks are in here */ + spinlock_t ready_lock; + struct heap ready_queue; + + /* real-time tasks waiting for release are in here */ + spinlock_t release_lock; + struct release_queue release_queue; + + /* for moving tasks to the release queue */ + spinlock_t tobe_lock; + struct list_head tobe_released; + + /* how do we check if we need to kick another CPU? */ + check_resched_needed_t check_resched; + + /* how do we release a job? */ + release_jobs_t release_jobs; + + /* how are tasks ordered in the ready queue? */ + heap_prio_t order; +} rt_domain_t; + +/* caller must hold release_lock */ +static inline int next_release(rt_domain_t *rt, lt_t *time) +{ + struct heap_node* top = heap_peek(heap_earlier_release, + &rt->release_queue.rel_heap); + if (top) + *time = ((struct release_heap*) top->value)->release_time; + return top != NULL; +} + +static inline struct task_struct* __next_ready(rt_domain_t* rt) +{ + struct heap_node *hn = heap_peek(rt->order, &rt->ready_queue); + if (hn) + return heap2task(hn); + else + return NULL; +} + +void rt_domain_init(rt_domain_t *rt, heap_prio_t order, + check_resched_needed_t check, + release_jobs_t relase); + +void __add_ready(rt_domain_t* rt, struct task_struct *new); +void __merge_ready(rt_domain_t* rt, struct heap *tasks); +void __add_release(rt_domain_t* rt, struct task_struct *task); + +static inline struct task_struct* __take_ready(rt_domain_t* rt) +{ + struct heap_node* hn = heap_take(rt->order, &rt->ready_queue); + if (hn) + return heap2task(hn); + else + return NULL; +} + +static inline struct task_struct* __peek_ready(rt_domain_t* rt) +{ + struct heap_node* hn = heap_peek(rt->order, &rt->ready_queue); + if (hn) + return heap2task(hn); + else + return NULL; +} + +static inline int is_queued(struct task_struct *t) +{ + return heap_node_in_heap(tsk_rt(t)->heap_node); +} + +static inline void remove(rt_domain_t* rt, struct task_struct *t) +{ + heap_delete(rt->order, &rt->ready_queue, tsk_rt(t)->heap_node); +} + +static inline void add_ready(rt_domain_t* rt, struct task_struct *new) +{ + unsigned long flags; + /* first we need the write lock for rt_ready_queue */ + spin_lock_irqsave(&rt->ready_lock, flags); + __add_ready(rt, new); + spin_unlock_irqrestore(&rt->ready_lock, flags); +} + +static inline void merge_ready(rt_domain_t* rt, struct heap* tasks) +{ + unsigned long flags; + spin_lock_irqsave(&rt->ready_lock, flags); + __merge_ready(rt, tasks); + spin_unlock_irqrestore(&rt->ready_lock, flags); +} + +static inline struct task_struct* take_ready(rt_domain_t* rt) +{ + unsigned long flags; + struct 
task_struct* ret; + /* first we need the write lock for rt_ready_queue */ + spin_lock_irqsave(&rt->ready_lock, flags); + ret = __take_ready(rt); + spin_unlock_irqrestore(&rt->ready_lock, flags); + return ret; +} + + +static inline void add_release(rt_domain_t* rt, struct task_struct *task) +{ + unsigned long flags; + /* first we need the write lock for rt_ready_queue */ + spin_lock_irqsave(&rt->tobe_lock, flags); + __add_release(rt, task); + spin_unlock_irqrestore(&rt->tobe_lock, flags); +} + +static inline int __jobs_pending(rt_domain_t* rt) +{ + return !heap_empty(&rt->ready_queue); +} + +static inline int jobs_pending(rt_domain_t* rt) +{ + unsigned long flags; + int ret; + /* first we need the write lock for rt_ready_queue */ + spin_lock_irqsave(&rt->ready_lock, flags); + ret = !heap_empty(&rt->ready_queue); + spin_unlock_irqrestore(&rt->ready_lock, flags); + return ret; +} + +#endif diff --git a/include/litmus/rt_param.h b/include/litmus/rt_param.h new file mode 100644 index 0000000..403ebc8 --- /dev/null +++ b/include/litmus/rt_param.h @@ -0,0 +1,167 @@ +/* + * Definition of the scheduler plugin interface. + * + */ +#ifndef _LINUX_RT_PARAM_H_ +#define _LINUX_RT_PARAM_H_ + +/* Litmus time type. */ +typedef unsigned long long lt_t; + +static inline int lt_after(lt_t a, lt_t b) +{ + return ((long long) b) - ((long long) a) < 0; +} +#define lt_before(a, b) lt_after(b, a) + +static inline int lt_after_eq(lt_t a, lt_t b) +{ + return ((long long) a) - ((long long) b) >= 0; +} +#define lt_before_eq(a, b) lt_after_eq(b, a) + +/* different types of clients */ +typedef enum { + RT_CLASS_HARD, + RT_CLASS_SOFT, + RT_CLASS_BEST_EFFORT +} task_class_t; + +struct rt_task { + lt_t exec_cost; + lt_t period; + lt_t phase; + unsigned int cpu; + task_class_t cls; +}; + +/* don't export internal data structures to user space (liblitmus) */ +#ifdef __KERNEL__ + +struct _rt_domain; +struct heap_node; + +struct rt_job { + /* Time instant the the job was or will be released. */ + lt_t release; + /* What is the current deadline? */ + lt_t deadline; + + /* How much service has this job received so far? */ + lt_t exec_time; + + /* Which job is this. This is used to let user space + * specify which job to wait for, which is important if jobs + * overrun. If we just call sys_sleep_next_period() then we + * will unintentionally miss jobs after an overrun. + * + * Increase this sequence number when a job is released. + */ + unsigned int job_no; + + /* when did this job start executing? */ + lt_t exec_start; +}; + + +struct pfair_param; + +/* RT task parameters for scheduling extensions + * These parameters are inherited during clone and therefore must + * be explicitly set up before the task set is launched. + */ +struct rt_param { + /* is the task sleeping? */ + unsigned int flags:8; + + /* do we need to check for srp blocking? */ + unsigned int srp_non_recurse:1; + + /* user controlled parameters */ + struct rt_task task_params; + + /* timing parameters */ + struct rt_job job_params; + + /* task representing the current "inherited" task + * priority, assigned by inherit_priority and + * return priority in the scheduler plugins. + * could point to self if PI does not result in + * an increased task priority. + */ + struct task_struct* inh_task; + + /* Don't just dereference this pointer in kernel space! + * It might very well point to junk or nothing at all. + * NULL indicates that the task has not requested any non-preemptable + * section support. + * Not inherited upon fork. 
+ */ + short* np_flag; + + /* For the FMLP under PSN-EDF, it is required to make the task + * non-preemptive from kernel space. In order not to interfere with + * user space, this counter indicates the kernel space np setting. + * kernel_np > 0 => task is non-preemptive + */ + unsigned int kernel_np; + + /* This field can be used by plugins to store where the task + * is currently scheduled. It is the responsibility of the + * plugin to avoid race conditions. + * + * This used by GSN-EDF and PFAIR. + */ + volatile int scheduled_on; + + /* Is the stack of the task currently in use? This is updated by + * the LITMUS core. + * + * Be careful to avoid deadlocks! + */ + volatile int stack_in_use; + + /* This field can be used by plugins to store where the task + * is currently linked. It is the responsibility of the plugin + * to avoid race conditions. + * + * Used by GSN-EDF. + */ + volatile int linked_on; + + /* PFAIR/PD^2 state. Allocated on demand. */ + struct pfair_param* pfair; + + /* Fields saved before BE->RT transition. + */ + int old_policy; + int old_prio; + + /* ready queue for this task */ + struct _rt_domain* domain; + + /* heap element for this task + * + * Warning: Don't statically allocate this node. The heap + * implementation swaps these between tasks, thus after + * dequeuing from a heap you may end up with a different node + * then the one you had when enqueuing the task. For the same + * reason, don't obtain and store references to this node + * other than this pointer (which is updated by the heap + * implementation). + */ + struct heap_node* heap_node; + + /* Used by rt_domain to queue task in release list. + */ + struct list_head list; +}; + +/* Possible RT flags */ +#define RT_F_RUNNING 0x00000000 +#define RT_F_SLEEP 0x00000001 +#define RT_F_EXIT_SEM 0x00000008 + +#endif + +#endif diff --git a/include/litmus/sched_plugin.h b/include/litmus/sched_plugin.h new file mode 100644 index 0000000..94952f6 --- /dev/null +++ b/include/litmus/sched_plugin.h @@ -0,0 +1,159 @@ +/* + * Definition of the scheduler plugin interface. + * + */ +#ifndef _LINUX_SCHED_PLUGIN_H_ +#define _LINUX_SCHED_PLUGIN_H_ + +#include + +/* struct for semaphore with priority inheritance */ +struct pi_semaphore { + atomic_t count; + int sleepers; + wait_queue_head_t wait; + union { + /* highest-prio holder/waiter */ + struct task_struct *task; + struct task_struct* cpu_task[NR_CPUS]; + } hp; + /* current lock holder */ + struct task_struct *holder; +}; + +/************************ setup/tear down ********************/ + +typedef long (*activate_plugin_t) (void); +typedef long (*deactivate_plugin_t) (void); + + + +/********************* scheduler invocation ******************/ + +/* Plugin-specific realtime tick handler */ +typedef void (*scheduler_tick_t) (struct task_struct *cur); +/* Novell make sched decision function */ +typedef struct task_struct* (*schedule_t)(struct task_struct * prev); +/* Clean up after the task switch has occured. + * This function is called after every (even non-rt) task switch. + */ +typedef void (*finish_switch_t)(struct task_struct *prev); + + +/********************* task state changes ********************/ + +/* Called to setup a new real-time task. + * Release the first job, enqueue, etc. + * Task may already be running. + */ +typedef void (*task_new_t) (struct task_struct *task, + int on_rq, + int running); + +/* Called to re-introduce a task after blocking. + * Can potentially be called multiple times. 
+ */ +typedef void (*task_wake_up_t) (struct task_struct *task); +/* called to notify the plugin of a blocking real-time task + * it will only be called for real-time tasks and before schedule is called */ +typedef void (*task_block_t) (struct task_struct *task); +/* Called when a real-time task exits or changes to a different scheduling + * class. + * Free any allocated resources + */ +typedef void (*task_exit_t) (struct task_struct *); + +/* Called when the new_owner is released from the wait queue + * it should now inherit the priority from sem, _before_ it gets readded + * to any queue + */ +typedef long (*inherit_priority_t) (struct pi_semaphore *sem, + struct task_struct *new_owner); + +/* Called when the current task releases a semahpore where it might have + * inherited a piority from + */ +typedef long (*return_priority_t) (struct pi_semaphore *sem); + +/* Called when a task tries to acquire a semaphore and fails. Check if its + * priority is higher than that of the current holder. + */ +typedef long (*pi_block_t) (struct pi_semaphore *sem, struct task_struct *t); + + + + +/********************* sys call backends ********************/ +/* This function causes the caller to sleep until the next release */ +typedef long (*complete_job_t) (void); + +typedef long (*admit_task_t)(struct task_struct* tsk); + +typedef void (*release_at_t)(struct task_struct *t, lt_t start); + +struct sched_plugin { + struct list_head list; + /* basic info */ + char *plugin_name; + + /* setup */ + activate_plugin_t activate_plugin; + deactivate_plugin_t deactivate_plugin; + +#ifdef CONFIG_SRP + unsigned int srp_active; +#endif + + /* scheduler invocation */ + scheduler_tick_t tick; + schedule_t schedule; + finish_switch_t finish_switch; + + /* syscall backend */ + complete_job_t complete_job; + release_at_t release_at; + + /* task state changes */ + admit_task_t admit_task; + + task_new_t task_new; + task_wake_up_t task_wake_up; + task_block_t task_block; + task_exit_t task_exit; + +#ifdef CONFIG_FMLP + /* priority inheritance */ + unsigned int fmlp_active; + inherit_priority_t inherit_priority; + return_priority_t return_priority; + pi_block_t pi_block; +#endif +} __attribute__ ((__aligned__(SMP_CACHE_BYTES))); + + +extern struct sched_plugin *litmus; + +int register_sched_plugin(struct sched_plugin* plugin); +struct sched_plugin* find_sched_plugin(const char* name); +int print_sched_plugins(char* buf, int max); + +static inline int srp_active(void) +{ +#ifdef CONFIG_SRP + return litmus->srp_active; +#else + return 0; +#endif +} +static inline int fmlp_active(void) +{ +#ifdef CONFIG_FMLP + return litmus->fmlp_active; +#else + return 0; +#endif +} + +extern struct sched_plugin linux_sched_plugin; + +#endif diff --git a/include/litmus/sched_trace.h b/include/litmus/sched_trace.h new file mode 100644 index 0000000..06e1aaa --- /dev/null +++ b/include/litmus/sched_trace.h @@ -0,0 +1,168 @@ +/* sched_trace.h -- record scheduler events to a byte stream for offline analysis. + */ +#ifndef _LINUX_SCHED_TRACE_H_ +#define _LINUX_SCHED_TRACE_H_ + +/* all times in nanoseconds */ + +struct st_trace_header { + u8 type; /* Of what type is this record? */ + u8 cpu; /* On which CPU was it recorded? */ + u16 pid; /* PID of the task. */ + u32 job; /* The job sequence number. */ +}; + +#define ST_NAME_LEN 16 +struct st_name_data { + char cmd[ST_NAME_LEN];/* The name of the executable of this process. 
*/ +}; + +struct st_param_data { /* regular params */ + u32 wcet; + u32 period; + u32 phase; + u8 partition; + u8 __unused[3]; +}; + +struct st_release_data { /* A job is was/is going to be released. */ + u64 release; /* What's the release time? */ + u64 deadline; /* By when must it finish? */ +}; + +struct st_assigned_data { /* A job was asigned to a CPU. */ + u64 when; + u8 target; /* Where should it execute? */ + u8 __unused[3]; +}; + +struct st_switch_to_data { /* A process was switched to on a given CPU. */ + u64 when; /* When did this occur? */ + u32 exec_time; /* Time the current job has executed. */ + +}; + +struct st_switch_away_data { /* A process was switched away from on a given CPU. */ + u64 when; + u64 exec_time; +}; + +struct st_completion_data { /* A job completed. */ + u64 when; + u8 forced:1; /* Set to 1 if job overran and kernel advanced to the + * next task automatically; set to 0 otherwise. + */ + u8 __uflags:7; + u8 __unused[3]; +}; + +struct st_block_data { /* A task blocks. */ + u64 when; + u64 __unused; +}; + +struct st_resume_data { /* A task resumes. */ + u64 when; + u64 __unused; +}; + +struct st_sys_release_data { + u64 when; + u64 release; +}; + +#define DATA(x) struct st_ ## x ## _data x; + +typedef enum { + ST_NAME = 1, /* Start at one, so that we can spot + * uninitialized records. */ + ST_PARAM, + ST_RELEASE, + ST_ASSIGNED, + ST_SWITCH_TO, + ST_SWITCH_AWAY, + ST_COMPLETION, + ST_BLOCK, + ST_RESUME, + ST_SYS_RELEASE, +} st_event_record_type_t; + +struct st_event_record { + struct st_trace_header hdr; + union { + u64 raw[2]; + + DATA(name); + DATA(param); + DATA(release); + DATA(assigned); + DATA(switch_to); + DATA(switch_away); + DATA(completion); + DATA(block); + DATA(resume); + DATA(sys_release); + + } data; +}; + +#undef DATA + +#ifdef __KERNEL__ + +#include +#include + +#ifdef CONFIG_SCHED_TASK_TRACE + +#define SCHED_TRACE(id, callback, task) \ + ft_event1(id, callback, task) +#define SCHED_TRACE2(id, callback, task, xtra) \ + ft_event2(id, callback, task, xtra) + +#else + +#define SCHED_TRACE(id, callback, task) /* no tracing */ +#define SCHED_TRACE2(id, callback, task, xtra) /* no tracing */ + +#endif + + +#define SCHED_TRACE_BASE_ID 500 + + +#define sched_trace_task_name(t) \ + SCHED_TRACE(SCHED_TRACE_BASE_ID + 1, do_sched_trace_task_name, t) +#define sched_trace_task_param(t) \ + SCHED_TRACE(SCHED_TRACE_BASE_ID + 2, do_sched_trace_task_param, t) +#define sched_trace_task_release(t) \ + SCHED_TRACE(SCHED_TRACE_BASE_ID + 3, do_sched_trace_task_release, t) +#define sched_trace_task_switch_to(t) \ + SCHED_TRACE(SCHED_TRACE_BASE_ID + 4, do_sched_trace_task_switch_to, t) +#define sched_trace_task_switch_away(t) \ + SCHED_TRACE(SCHED_TRACE_BASE_ID + 5, do_sched_trace_task_switch_away, t) +#define sched_trace_task_completion(t, forced) \ + SCHED_TRACE2(SCHED_TRACE_BASE_ID + 6, do_sched_trace_task_completion, t, \ + forced) +#define sched_trace_task_block(t) \ + SCHED_TRACE(SCHED_TRACE_BASE_ID + 7, do_sched_trace_task_block, t) +#define sched_trace_task_resume(t) \ + SCHED_TRACE(SCHED_TRACE_BASE_ID + 8, do_sched_trace_task_resume, t) + +#define sched_trace_sys_release(when) \ + SCHED_TRACE(SCHED_TRACE_BASE_ID + 9, do_sched_trace_sys_release, when) + +#define sched_trace_quantum_boundary() /* NOT IMPLEMENTED */ + +#ifdef CONFIG_SCHED_DEBUG_TRACE +void sched_trace_log_message(const char* fmt, ...); + +#else + +#define sched_trace_log_message(fmt, ...) 
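
Since the structures above define a fixed 24-byte record (an 8-byte header followed by a 16-byte data union), an offline reader can be very small. The following userspace sketch only assumes that the captured stream is a packed sequence of st_event_record entries.

#include <stdint.h>
#include <stdio.h>

/* Userspace mirror of the record layout defined above. */
struct st_trace_header {
	uint8_t  type;
	uint8_t  cpu;
	uint16_t pid;
	uint32_t job;
};

struct st_event_record {
	struct st_trace_header hdr;
	uint64_t data[2];	/* interpretation depends on hdr.type */
};

int main(int argc, char **argv)
{
	struct st_event_record rec;
	FILE *f = argc > 1 ? fopen(argv[1], "r") : NULL;

	if (!f)
		return 1;
	while (fread(&rec, sizeof(rec), 1, f) == 1) {
		if (!rec.hdr.type)	/* 0 = uninitialized record */
			continue;
		printf("type=%u cpu=%u pid=%u job=%u\n",
		       rec.hdr.type, rec.hdr.cpu, rec.hdr.pid, rec.hdr.job);
	}
	fclose(f);
	return 0;
}
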
+ +#endif + +#endif /* __KERNEL__ */ + +#endif diff --git a/include/litmus/trace.h b/include/litmus/trace.h new file mode 100644 index 0000000..b8157e8 --- /dev/null +++ b/include/litmus/trace.h @@ -0,0 +1,103 @@ +#ifndef _SYS_TRACE_H_ +#define _SYS_TRACE_H_ + +#ifdef CONFIG_FEATHER_TRACE + +#include +#include + + +/*********************** TIMESTAMPS ************************/ + +enum task_type_marker { + TSK_BE, + TSK_RT, + TSK_UNKNOWN +}; + +struct timestamp { + uint64_t timestamp; + uint32_t seq_no; + uint8_t cpu; + uint8_t event; + uint8_t task_type; +}; + +/* tracing callbacks */ +feather_callback void save_timestamp(unsigned long event); +feather_callback void save_timestamp_def(unsigned long event, unsigned long type); +feather_callback void save_timestamp_task(unsigned long event, unsigned long t_ptr); + +#define TIMESTAMP(id) ft_event0(id, save_timestamp) + +#define DTIMESTAMP(id, def) ft_event1(id, save_timestamp_def, def) + +#define TTIMESTAMP(id, task) ft_event1(id, save_timestamp_task, (unsigned long) task) + +#else /* !CONFIG_FEATHER_TRACE */ + +#define TIMESTAMP(id) /* no tracing */ + +#define DTIMESTAMP(id, def) /* no tracing */ + +#define TTIMESTAMP(id, task) /* no tracing */ + +#endif + + +/* Convention for timestamps + * ========================= + * + * In order to process the trace files with a common tool, we use the following + * convention to measure execution times: The end time id of a code segment is + * always the next number after the start time event id. + */ + +#define TS_SCHED_START DTIMESTAMP(100, TSK_UNKNOWN) /* we only + * care + * about + * next */ +#define TS_SCHED_END(t) TTIMESTAMP(101, t) +#define TS_SCHED2_START(t) TTIMESTAMP(102, t) +#define TS_SCHED2_END(t) TTIMESTAMP(103, t) + +#define TS_CXS_START(t) TTIMESTAMP(104, t) +#define TS_CXS_END(t) TTIMESTAMP(105, t) + +#define TS_RELEASE_START DTIMESTAMP(106, TSK_RT) +#define TS_RELEASE_END DTIMESTAMP(107, TSK_RT) + +#define TS_TICK_START(t) TTIMESTAMP(110, t) +#define TS_TICK_END(t) TTIMESTAMP(111, t) + + +#define TS_PLUGIN_SCHED_START /* TIMESTAMP(120) */ /* currently unused */ +#define TS_PLUGIN_SCHED_END /* TIMESTAMP(121) */ + +#define TS_PLUGIN_TICK_START /* TIMESTAMP(130) */ +#define TS_PLUGIN_TICK_END /* TIMESTAMP(131) */ + +#define TS_ENTER_NP_START TIMESTAMP(140) +#define TS_ENTER_NP_END TIMESTAMP(141) + +#define TS_EXIT_NP_START TIMESTAMP(150) +#define TS_EXIT_NP_END TIMESTAMP(151) + +#define TS_SRP_UP_START TIMESTAMP(160) +#define TS_SRP_UP_END TIMESTAMP(161) +#define TS_SRP_DOWN_START TIMESTAMP(162) +#define TS_SRP_DOWN_END TIMESTAMP(163) + +#define TS_PI_UP_START TIMESTAMP(170) +#define TS_PI_UP_END TIMESTAMP(171) +#define TS_PI_DOWN_START TIMESTAMP(172) +#define TS_PI_DOWN_END TIMESTAMP(173) + +#define TS_FIFO_UP_START TIMESTAMP(180) +#define TS_FIFO_UP_END TIMESTAMP(181) +#define TS_FIFO_DOWN_START TIMESTAMP(182) +#define TS_FIFO_DOWN_END TIMESTAMP(183) + + + +#endif /* !_SYS_TRACE_H_ */ diff --git a/include/litmus/unistd.h b/include/litmus/unistd.h new file mode 100644 index 0000000..8224235 --- /dev/null +++ b/include/litmus/unistd.h @@ -0,0 +1,20 @@ + +#define __LSC(x) (__NR_LITMUS + x) + +#define __NR_set_rt_task_param __LSC(0) +#define __NR_get_rt_task_param __LSC(1) +#define __NR_sleep_next_period __LSC(2) +#define __NR_register_np_flag __LSC(3) +#define __NR_exit_np __LSC(4) +#define __NR_od_open __LSC(5) +#define __NR_od_close __LSC(6) +#define __NR_fmlp_down __LSC(7) +#define __NR_fmlp_up __LSC(8) +#define __NR_srp_down __LSC(9) +#define __NR_srp_up __LSC(10) +#define 
__NR_query_job_no __LSC(11) +#define __NR_wait_for_job_release __LSC(12) +#define __NR_wait_for_ts_release __LSC(13) +#define __NR_release_ts __LSC(14) + +#define NR_litmus_syscalls 15 diff --git a/kernel/exit.c b/kernel/exit.c index 549c055..bc313b7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -52,6 +52,8 @@ extern void sem_exit (void); +extern void exit_od_table(struct task_struct* t); + static void exit_mm(struct task_struct * tsk); static void __unhash_process(struct task_struct *p) @@ -987,6 +989,8 @@ fastcall NORET_TYPE void do_exit(long code) if (unlikely(tsk->audit_context)) audit_free(tsk); + exit_od_table(tsk); + tsk->exit_code = code; taskstats_exit(tsk, group_dead); diff --git a/kernel/fork.c b/kernel/fork.c index 8dd8ff2..4c322d4 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -59,6 +59,9 @@ #include #include +#include +#include + /* * Protected counters by write_lock_irq(&tasklist_lock) */ @@ -121,6 +124,8 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); + exit_litmus(tsk); + security_task_free(tsk); free_uid(tsk->user); put_group_info(tsk->group_info); @@ -182,6 +187,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) *tsk = *orig; tsk->stack = ti; + /* Don't let the new task be a real-time task. */ + memset(&tsk->rt_param, 0, sizeof(struct rt_task)); + err = prop_local_init_single(&tsk->dirties); if (err) { free_thread_info(ti); diff --git a/kernel/printk.c b/kernel/printk.c index 89011bf..9eb2dc5 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -54,6 +54,12 @@ int console_printk[4] = { DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ }; +/* divert printk() messages when we have a LITMUS^RT + * debug listener + */ +#include +int trace_override = 0; + /* * Low level drivers may need that to know if they can schedule in * their unblank() callback or not. So let's export it. @@ -652,6 +658,8 @@ asmlinkage int vprintk(const char *fmt, va_list args) /* Emit the output into the temporary buffer */ printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args); + if (trace_override) + TRACE("%s", printk_buf); /* * Copy the output into log_buf. If the caller didn't provide @@ -932,7 +940,7 @@ int is_console_locked(void) void wake_up_klogd(void) { - if (!oops_in_progress && waitqueue_active(&log_wait)) + if (!trace_override && !oops_in_progress && waitqueue_active(&log_wait)) wake_up_interruptible(&log_wait); } diff --git a/kernel/sched.c b/kernel/sched.c index e76b11c..fdeced2 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -67,6 +67,11 @@ #include #include +#include +#include + +#include + /* * Scheduler clock - returns current time in nanosec units. * This is default implementation. @@ -324,6 +329,8 @@ struct rq { atomic_t nr_iowait; + struct task_struct* litmus_next; + #ifdef CONFIG_SMP struct sched_domain *sd; @@ -875,11 +882,12 @@ static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} #include "sched_idletask.c" #include "sched_fair.c" #include "sched_rt.c" +#include "../litmus/sched_litmus.c" #ifdef CONFIG_SCHED_DEBUG # include "sched_debug.c" #endif -#define sched_class_highest (&rt_sched_class) +#define sched_class_highest (&litmus_sched_class) /* * Update delta_exec, delta_fair fields for rq. 
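
From userspace these system calls are reached through the usual syscall(2) mechanism, so thin wrappers suffice. The sketch below assumes the __NR_ constants defined above are visible via the patched kernel headers.

#include <unistd.h>
#include <sys/syscall.h>

/* Thin wrappers around two of the LITMUS^RT syscalls declared above. */
static inline long lt_query_job_no(unsigned int *job)
{
	return syscall(__NR_query_job_no, job);
}

static inline long lt_sleep_next_period(void)
{
	/* give up the CPU until the next job release */
	return syscall(__NR_sleep_next_period);
}
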
@@ -1516,6 +1524,8 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) int new_cpu; #endif + if (is_realtime(p)) + TRACE_TASK(p, "try_to_wake_up()\n"); rq = task_rq_lock(p, &flags); old_state = p->state; if (!(old_state & state)) @@ -1529,7 +1539,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) this_cpu = smp_processor_id(); #ifdef CONFIG_SMP - if (unlikely(task_running(rq, p))) + if (unlikely(task_running(rq, p) || is_realtime(p))) goto out_activate; new_cpu = cpu; @@ -1650,8 +1660,10 @@ out_activate: out_running: p->state = TASK_RUNNING; out: + if (is_realtime(p)) + TRACE_TASK(p, "try_to_wake_up() done, p->state=%d\n", p->state); task_rq_unlock(rq, &flags); - + tick_no_rqlock(); return success; } @@ -1890,6 +1902,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) */ prev_state = prev->state; finish_arch_switch(prev); + litmus->finish_switch(prev); + prev->rt_param.stack_in_use = NO_CPU; finish_lock_switch(rq, prev); fire_sched_in_preempt_notifiers(current); if (mm) @@ -3480,6 +3494,7 @@ void scheduler_tick(void) struct task_struct *curr = rq->curr; u64 next_tick = rq->tick_timestamp + TICK_NSEC; + TS_TICK_START(current); spin_lock(&rq->lock); __update_rq_clock(rq); /* @@ -3491,12 +3506,17 @@ void scheduler_tick(void) update_cpu_load(rq); if (curr != rq->idle) /* FIXME: needed? */ curr->sched_class->task_tick(rq, curr); + TS_PLUGIN_TICK_START; + litmus_tick(rq, curr); + TS_PLUGIN_TICK_END; spin_unlock(&rq->lock); #ifdef CONFIG_SMP rq->idle_at_tick = idle_cpu(cpu); - trigger_load_balance(rq, cpu); + if (!is_realtime(current)) + trigger_load_balance(rq, cpu); #endif + TS_TICK_END(current); } #if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT) @@ -3594,11 +3614,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev) * Optimization: we know that if all tasks are in * the fair class we can call that function directly: */ - if (likely(rq->nr_running == rq->cfs.nr_running)) { + /* Don't do that for LITMUS. 
+ if (likely(rq->nr_running == rq->cfs.nr_running)) { p = fair_sched_class.pick_next_task(rq); if (likely(p)) return p; } + */ class = sched_class_highest; for ( ; ; ) { @@ -3633,6 +3655,7 @@ need_resched: release_kernel_lock(prev); need_resched_nonpreemptible: + TS_SCHED_START; schedule_debug(prev); @@ -3643,6 +3666,9 @@ need_resched_nonpreemptible: __update_rq_clock(rq); spin_lock(&rq->lock); clear_tsk_need_resched(prev); + TS_PLUGIN_SCHED_START; + litmus_schedule(rq, prev); + TS_PLUGIN_SCHED_END; if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { if (unlikely((prev->state & TASK_INTERRUPTIBLE) && @@ -3666,19 +3692,35 @@ need_resched_nonpreemptible: rq->nr_switches++; rq->curr = next; ++*switch_count; + sched_trace_task_switch_away(prev); + sched_trace_task_switch_to(next); + TS_SCHED_END(next); + TS_CXS_START(next); context_switch(rq, prev, next); /* unlocks the rq */ - } else + TS_CXS_END(current); + } else { + TS_SCHED_END(prev); spin_unlock_irq(&rq->lock); + } + TS_SCHED2_START(current); + + tick_no_rqlock(); if (unlikely(reacquire_kernel_lock(current) < 0)) { cpu = smp_processor_id(); rq = cpu_rq(cpu); + TS_SCHED2_END(current); goto need_resched_nonpreemptible; } preempt_enable_no_resched(); - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) + if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) { + TS_SCHED2_END(current); goto need_resched; + } + TS_SCHED2_END(current); + if (srp_active()) + srp_ceiling_block(); } EXPORT_SYMBOL(schedule); @@ -3886,6 +3928,18 @@ void complete_all(struct completion *x) } EXPORT_SYMBOL(complete_all); +void complete_n(struct completion *x, int n) +{ + unsigned long flags; + + spin_lock_irqsave(&x->wait.lock, flags); + x->done += n; + __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, + n, 0, NULL); + spin_unlock_irqrestore(&x->wait.lock, flags); +} +EXPORT_SYMBOL(complete_n); + static inline long __sched do_wait_for_common(struct completion *x, long timeout, int state) { @@ -4236,6 +4290,9 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) case SCHED_RR: p->sched_class = &rt_sched_class; break; + case SCHED_LITMUS: + p->sched_class = &litmus_sched_class; + break; } p->rt_priority = prio; @@ -4268,7 +4325,7 @@ recheck: policy = oldpolicy = p->policy; else if (policy != SCHED_FIFO && policy != SCHED_RR && policy != SCHED_NORMAL && policy != SCHED_BATCH && - policy != SCHED_IDLE) + policy != SCHED_IDLE && policy != SCHED_LITMUS) return -EINVAL; /* * Valid priorities for SCHED_FIFO and SCHED_RR are @@ -4282,6 +4339,9 @@ recheck: if (rt_policy(policy) != (param->sched_priority != 0)) return -EINVAL; + if (policy == SCHED_LITMUS && policy == p->policy) + return -EINVAL; + /* * Allow unprivileged RT tasks to decrease priority: */ @@ -4316,6 +4376,12 @@ recheck: return -EPERM; } + if (policy == SCHED_LITMUS) { + retval = litmus_admit_task(p); + if (retval) + return retval; + } + retval = security_task_setscheduler(p, policy, param); if (retval) return retval; @@ -4345,9 +4411,17 @@ recheck: p->sched_class->put_prev_task(rq, p); } + if (p->policy == SCHED_LITMUS) + litmus_exit_task(p); + oldprio = p->prio; __setscheduler(rq, p, policy, param->sched_priority); + if (policy == SCHED_LITMUS) { + p->rt_param.stack_in_use = running ? 
rq->cpu : NO_CPU; + litmus->task_new(p, on_rq, running); + } + if (on_rq) { if (running) p->sched_class->set_curr_task(rq); @@ -4364,6 +4438,7 @@ recheck: check_preempt_curr(rq, p); } } + __task_rq_unlock(rq); spin_unlock_irqrestore(&p->pi_lock, flags); @@ -4494,10 +4569,11 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) read_lock(&tasklist_lock); p = find_process_by_pid(pid); - if (!p) { + if (!p || is_realtime(p)) { + /* LITMUS tasks don't get to do this, transition to BE first */ read_unlock(&tasklist_lock); mutex_unlock(&sched_hotcpu_mutex); - return -ESRCH; + return p ? -EPERM : -ESRCH; } /* diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index da7c061..de30496 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -845,7 +845,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) struct sched_entity *se = &curr->se, *pse = &p->se; unsigned long gran; - if (unlikely(rt_prio(p->prio))) { + if (unlikely(rt_prio(p->prio) || p->policy == SCHED_LITMUS)) { update_rq_clock(rq); update_curr(cfs_rq); resched_task(curr); diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 9ba3daa..c7c938c 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -70,7 +70,7 @@ yield_task_rt(struct rq *rq) */ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p) { - if (p->prio < rq->curr->prio) + if (p->prio < rq->curr->prio || p->policy == SCHED_LITMUS) resched_task(rq->curr); } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index cb89fa8..d6dad22 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -568,6 +568,42 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) } /** + * tick_set_quanta_type - get the quanta type as a boot option + * Default is standard setup with ticks staggered over first + * half of tick period. + */ +int quanta_type = LINUX_DEFAULT_TICKS; +static int __init tick_set_quanta_type(char *str) +{ + if (strcmp("aligned", str) == 0) + quanta_type = LITMUS_ALIGNED_TICKS; + else if (strcmp("staggered", str) == 0) + quanta_type = LITMUS_STAGGERED_TICKS; + return 1; +} +__setup("quanta=", tick_set_quanta_type); + +u64 cpu_stagger_offset(int cpu) +{ + u64 offset = 0; + switch (quanta_type) { + case LITMUS_ALIGNED_TICKS: + offset = 0; + break; + case LITMUS_STAGGERED_TICKS: + offset = ktime_to_ns(tick_period); + do_div(offset, num_possible_cpus()); + offset *= cpu; + break; + default: + offset = ktime_to_ns(tick_period) >> 1; + do_div(offset, num_possible_cpus()); + offset *= cpu; + } + return offset; +} + +/** * tick_setup_sched_timer - setup the tick emulation timer */ void tick_setup_sched_timer(void) @@ -585,9 +621,11 @@ void tick_setup_sched_timer(void) /* Get the next period (per cpu) */ ts->sched_timer.expires = tick_init_jiffy_update(); - offset = ktime_to_ns(tick_period) >> 1; - do_div(offset, num_possible_cpus()); - offset *= smp_processor_id(); + + /* Offset must be set correctly to achieve desired quanta type. */ + offset = cpu_stagger_offset(smp_processor_id()); + + /* Add correct offset to expiration time. 
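
As a concrete illustration of cpu_stagger_offset(), assume HZ = 1000 (so tick_period = 1 ms) and num_possible_cpus() = 4:

  quanta=aligned    ->  offsets 0, 0, 0, 0
  quanta=staggered  ->  offsets 0 us, 250 us, 500 us, 750 us   (period / 4 * cpu)
  default           ->  offsets 0 us, 125 us, 250 us, 375 us   ((period / 2) / 4 * cpu,
                                                                i.e. staggered over the first
                                                                half of the tick period)
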
*/ ts->sched_timer.expires = ktime_add_ns(ts->sched_timer.expires, offset); for (;;) { diff --git a/litmus/Kconfig b/litmus/Kconfig new file mode 100644 index 0000000..9a2ab90 --- /dev/null +++ b/litmus/Kconfig @@ -0,0 +1,78 @@ +menu "LITMUS^RT" + +menu "Real-Time Synchronization" + +config NP_SECTION + bool "Non-preemptive section support" + depends on !SPARC64 + default n + help + Include support for flag-based non-preemptive section signaling + from userspace. + + (currently broken on SPARC64) + + Say Yes if you want FMLP short critical section synchronization support. + + +config SRP + bool "Stack Resource Policy (SRP)" + default n + help + Include support for Baker's Stack Resource Policy. + + Say Yes if you want FMLP local long critical section synchronization support. + +config FMLP + bool "FMLP support" + depends on NP_SECTION + default n + help + Include support for deterministic multiprocessor real-time + synchronization support. + + Say Yes if you want FMLP long critical section synchronization support. + +endmenu + +menu "Tracing" + +config SCHED_TASK_TRACE + bool "Trace real-time tasks" + default y + help + Include support for the sched_trace_XXX() tracing functions. This + allows the collection of real-time task events such as job + completions, job releases, early completions, etc. This results in a + small overhead in the scheduling code. Disable if the overhead is not + acceptable (e.g., benchmarking). + + Say Yes for debugging. + Say No for overhead tracing. + +config SCHED_DEBUG_TRACE + bool "TRACE() debugging" + default y + help + Include support for sched_trace_log_messageg(), which is used to + implement TRACE(). If disabled, no TRACE() messages will be included + in the kernel, and no overheads due to debugging statements will be + incurred by the scheduler. Disable if the overhead is not acceptable + (e.g. benchmarking). + + Say Yes for debugging. + Say No for overhead tracing. + +config FEATHER_TRACE + bool "Feather-Trace Instrumentation Support" + default y + help + Include Feather-Trace trace points. Currently not supported on + sparc64. + + Say Yes for overhead tracing. + + +endmenu + +endmenu diff --git a/litmus/Makefile b/litmus/Makefile new file mode 100644 index 0000000..fa39a2b --- /dev/null +++ b/litmus/Makefile @@ -0,0 +1,16 @@ +# +# Makefile for LITMUS^RT +# + +obj-y = sched_plugin.o litmus.o \ + edf_common.o jobs.o \ + rt_domain.o fdso.o sync.o \ + fmlp.o srp.o norqlock.o \ + sched_gsn_edf.o \ + sched_psn_edf.o \ + sched_cedf.o \ + sched_pfair.o + +obj-$(CONFIG_FEATHER_TRACE) += trace.o ft_event.o ftdev.o +obj-$(CONFIG_SCHED_TASK_TRACE) += sched_task_trace.o +obj-$(CONFIG_SCHED_DEBUG_TRACE) += sched_trace.o \ No newline at end of file diff --git a/litmus/edf_common.c b/litmus/edf_common.c new file mode 100644 index 0000000..84ece3e --- /dev/null +++ b/litmus/edf_common.c @@ -0,0 +1,95 @@ +/* + * kernel/edf_common.c + * + * Common functions for EDF based scheduler. + */ + +#include +#include +#include + +#include +#include +#include + + +#include + +/* edf_higher_prio - returns true if first has a higher EDF priority + * than second. Deadline ties are broken by PID. + * + * both first and second may be NULL + */ +int edf_higher_prio(struct task_struct* first, + struct task_struct* second) +{ + struct task_struct *first_task = first; + struct task_struct *second_task = second; + + /* Check for inherited priorities. Change task + * used for comparison in such a case. 
+ */ + if (first && first->rt_param.inh_task) + first_task = first->rt_param.inh_task; + if (second && second->rt_param.inh_task) + second_task = second->rt_param.inh_task; + + return + /* it has to exist in order to have higher priority */ + first_task && ( + /* does the second task exist and is it a real-time task? If + * not, the first task (which is a RT task) has higher + * priority. + */ + !second_task || !is_realtime(second_task) || + + /* is the deadline of the first task earlier? + * Then it has higher priority. + */ + earlier_deadline(first_task, second_task) || + + /* Do we have a deadline tie? + * Then break by PID. + */ + (get_deadline(first_task) == get_deadline(second_task) && + (first_task->pid < second_task->pid || + + /* If the PIDs are the same then the task with the inherited + * priority wins. + */ + (first_task->pid == second_task->pid && + !second->rt_param.inh_task)))); +} + +int edf_ready_order(struct heap_node* a, struct heap_node* b) +{ + return edf_higher_prio(heap2task(a), heap2task(b)); +} + +void edf_domain_init(rt_domain_t* rt, check_resched_needed_t resched, + release_jobs_t release) +{ + rt_domain_init(rt, edf_ready_order, resched, release); +} + +/* need_to_preempt - check whether the task t needs to be preempted + * call only with irqs disabled and with ready_lock acquired + * THIS DOES NOT TAKE NON-PREEMPTIVE SECTIONS INTO ACCOUNT! + */ +int edf_preemption_needed(rt_domain_t* rt, struct task_struct *t) +{ + /* we need the read lock for edf_ready_queue */ + /* no need to preempt if there is nothing pending */ + if (!__jobs_pending(rt)) + return 0; + /* we need to reschedule if t doesn't exist */ + if (!t) + return 1; + + /* NOTE: We cannot check for non-preemptibility since we + * don't know what address space we're currently in. + */ + + /* make sure to get non-rt stuff out of the way */ + return !is_realtime(t) || edf_higher_prio(__next_ready(rt), t); +} diff --git a/litmus/fdso.c b/litmus/fdso.c new file mode 100644 index 0000000..81ab0af --- /dev/null +++ b/litmus/fdso.c @@ -0,0 +1,282 @@ +/* fdso.c - file descriptor attached shared objects + * + * (c) 2007 B. Brandenburg, LITMUS^RT project + * + * Notes: + * - objects descriptor (OD) tables are not cloned during a fork. + * - objects are created on-demand, and freed after the last reference + * is dropped. + * - for now, object types are hard coded. + * - As long as we have live objects, we keep a reference to the inode. 
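
A worked example of the ordering implemented by edf_higher_prio(): consider tasks A (deadline 10 ms, PID 100), B (deadline 10 ms, PID 200) and C (deadline 12 ms, PID 50). Then edf_higher_prio(A, C) holds because A's deadline is earlier, and edf_higher_prio(A, B) holds because a deadline tie is broken in favor of the lower PID. If B had inherited A's priority (B->rt_param.inh_task == A), the comparison would be carried out with A's parameters on B's behalf, so only the PID tie-breaker (and, for equal PIDs, the inheritance rule itself) could separate them. Finally, any real-time task has higher priority than a best-effort task or a NULL second argument.
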
+ */ + +#include +#include +#include +#include +#include + +#include + +extern struct fdso_ops fmlp_sem_ops; +extern struct fdso_ops srp_sem_ops; + +static const struct fdso_ops* fdso_ops[] = { + &fmlp_sem_ops, + &srp_sem_ops, +}; + +static void* fdso_create(obj_type_t type) +{ + if (fdso_ops[type]->create) + return fdso_ops[type]->create(); + else + return NULL; +} + +static void fdso_destroy(obj_type_t type, void* obj) +{ + fdso_ops[type]->destroy(obj); +} + +static int fdso_open(struct od_table_entry* entry, void* __user config) +{ + if (fdso_ops[entry->obj->type]->open) + return fdso_ops[entry->obj->type]->open(entry, config); + else + return 0; +} + +static int fdso_close(struct od_table_entry* entry) +{ + if (fdso_ops[entry->obj->type]->close) + return fdso_ops[entry->obj->type]->close(entry); + else + return 0; +} + +/* inode must be locked already */ +static struct inode_obj_id* alloc_inode_obj(struct inode* inode, + obj_type_t type, + unsigned int id) +{ + struct inode_obj_id* obj; + void* raw_obj; + + raw_obj = fdso_create(type); + if (!raw_obj) + return NULL; + + obj = kmalloc(sizeof(struct inode_obj_id), GFP_KERNEL); + if (!obj) + return NULL; + INIT_LIST_HEAD(&obj->list); + atomic_set(&obj->count, 1); + obj->type = type; + obj->id = id; + obj->obj = raw_obj; + obj->inode = inode; + + list_add(&obj->list, &inode->i_obj_list); + atomic_inc(&inode->i_count); + + printk(KERN_DEBUG "alloc_inode_obj(%p, %d, %d): object created\n", inode, type, id); + return obj; +} + +/* inode must be locked already */ +static struct inode_obj_id* get_inode_obj(struct inode* inode, + obj_type_t type, + unsigned int id) +{ + struct list_head* pos; + struct inode_obj_id* obj = NULL; + + list_for_each(pos, &inode->i_obj_list) { + obj = list_entry(pos, struct inode_obj_id, list); + if (obj->id == id && obj->type == type) { + atomic_inc(&obj->count); + return obj; + } + } + printk(KERN_DEBUG "get_inode_obj(%p, %d, %d): couldn't find object\n", inode, type, id); + return NULL; +} + + +static void put_inode_obj(struct inode_obj_id* obj) +{ + struct inode* inode; + int let_go = 0; + + inode = obj->inode; + if (atomic_dec_and_test(&obj->count)) { + + mutex_lock(&inode->i_obj_mutex); + /* no new references can be obtained */ + if (!atomic_read(&obj->count)) { + list_del(&obj->list); + fdso_destroy(obj->type, obj->obj); + kfree(obj); + let_go = 1; + } + mutex_unlock(&inode->i_obj_mutex); + if (let_go) + iput(inode); + } +} + +static struct od_table_entry* get_od_entry(struct task_struct* t) +{ + struct od_table_entry* table; + int i; + + + table = t->od_table; + if (!table) { + table = (struct od_table_entry*) + kzalloc(sizeof(struct od_table_entry) * + MAX_OBJECT_DESCRIPTORS, GFP_KERNEL); + t->od_table = table; + } + + for (i = 0; table && i < MAX_OBJECT_DESCRIPTORS; i++) + if (!table[i].used) { + table[i].used = 1; + return table + i; + } + return NULL; +} + +static int put_od_entry(struct od_table_entry* od) +{ + put_inode_obj(od->obj); + od->used = 0; + return 0; +} + +void exit_od_table(struct task_struct* t) +{ + int i; + + if (t->od_table) { + for (i = 0; i < MAX_OBJECT_DESCRIPTORS; i++) + if (t->od_table[i].used) + put_od_entry(t->od_table + i); + kfree(t->od_table); + t->od_table = NULL; + } +} + +static int do_sys_od_open(struct file* file, obj_type_t type, int id, + void* __user config) +{ + int idx = 0, err; + struct inode* inode; + struct inode_obj_id* obj = NULL; + struct od_table_entry* entry; + + inode = file->f_dentry->d_inode; + + entry = get_od_entry(current); + if (!entry) + return -ENOMEM; 
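
To show what the fdso_ops indirection above expects of an object class, here is a hedged sketch of a hypothetical additional type; a real addition would also need a new obj_type_t value and an entry in the fdso_ops[] table. Note that .open and .close may be left NULL (fdso_open()/fdso_close() then simply succeed), a NULL .create makes every open fail with -ENOMEM, and .destroy is invoked unconditionally, so it must be provided.

#include <linux/slab.h>
#include <litmus/fdso.h>

/* Hypothetical object class, for illustration only. */
static void* demo_obj_create(void)
{
	return kzalloc(sizeof(int), GFP_KERNEL);	/* per-object state */
}

static void demo_obj_destroy(void* obj)
{
	kfree(obj);
}

struct fdso_ops demo_obj_ops = {
	.create  = demo_obj_create,
	.destroy = demo_obj_destroy,
	/* .open and .close left NULL: fdso_open()/fdso_close() treat a
	 * missing callback as "always succeeds" */
};
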
+ + mutex_lock(&inode->i_obj_mutex); + obj = get_inode_obj(inode, type, id); + if (!obj) + obj = alloc_inode_obj(inode, type, id); + if (!obj) { + idx = -ENOMEM; + entry->used = 0; + } else { + entry->obj = obj; + entry->extra = NULL; + idx = entry - current->od_table; + } + + mutex_unlock(&inode->i_obj_mutex); + + err = fdso_open(entry, config); + if (err < 0) { + /* The class rejected the open call. + * We need to clean up and tell user space. + */ + put_od_entry(entry); + idx = err; + } + + return idx; +} + + +struct od_table_entry* __od_lookup(int od) +{ + struct task_struct *t = current; + + if (!t->od_table) + return NULL; + if (od < 0 || od >= MAX_OBJECT_DESCRIPTORS) + return NULL; + if (!t->od_table[od].used) + return NULL; + return t->od_table + od; +} + + +asmlinkage int sys_od_open(int fd, int type, int obj_id, void* __user config) +{ + int ret = 0; + struct file* file; + + /* + 1) get file from fd, get inode from file + 2) lock inode + 3) try to lookup object + 4) if not present create and enqueue object, inc inode refcnt + 5) increment refcnt of object + 6) alloc od_table_entry, setup ptrs + 7) unlock inode + 8) return offset in od_table as OD + */ + + if (type < MIN_OBJ_TYPE || type > MAX_OBJ_TYPE) { + ret = -EINVAL; + goto out; + } + + file = fget(fd); + if (!file) { + ret = -EBADF; + goto out; + } + + ret = do_sys_od_open(file, type, obj_id, config); + + fput(file); + +out: + return ret; +} + + +asmlinkage int sys_od_close(int od) +{ + int ret = -EINVAL; + struct task_struct *t = current; + + if (od < 0 || od >= MAX_OBJECT_DESCRIPTORS) + return ret; + + if (!t->od_table || !t->od_table[od].used) + return ret; + + + /* give the class a chance to reject the close + */ + ret = fdso_close(t->od_table + od); + if (ret == 0) + ret = put_od_entry(t->od_table + od); + + return ret; +} diff --git a/litmus/fmlp.c b/litmus/fmlp.c new file mode 100644 index 0000000..f34eeea --- /dev/null +++ b/litmus/fmlp.c @@ -0,0 +1,262 @@ +/* + * FMLP implementation. + * Much of the code here is borrowed from include/asm-i386/semaphore.h. 
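
Viewed from userspace, an FMLP semaphore is reached through the od_open() path shown above. The sketch below is illustrative only: it assumes the program already runs as a LITMUS real-time task under a plugin with fmlp_active(), that the __NR_ constants are available from the patched headers, and it hard-codes the object type to 0 based on the fdso_ops[] ordering (the real constant lives in litmus/fdso.h).

#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>

#define FMLP_SEM_TYPE 0	/* inferred from fdso_ops[]; use the header constant in real code */

int main(void)
{
	/* any shared inode serves as the namespace for object ids */
	int fd = open("/tmp/fmlp_namespace", O_RDONLY | O_CREAT, 0666);
	int od = syscall(__NR_od_open, fd, FMLP_SEM_TYPE, 0, NULL);

	if (fd < 0 || od < 0)
		return 1;

	syscall(__NR_fmlp_down, od);	/* may suspend; FIFO hand-off on release */
	/* ... critical section ... */
	syscall(__NR_fmlp_up, od);

	syscall(__NR_od_close, od);
	close(fd);
	return 0;
}
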
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#ifdef CONFIG_FMLP + +static void* create_fmlp_semaphore(void) +{ + struct pi_semaphore* sem; + int i; + + sem = kmalloc(sizeof(struct pi_semaphore), GFP_KERNEL); + if (!sem) + return NULL; + atomic_set(&sem->count, 1); + sem->sleepers = 0; + init_waitqueue_head(&sem->wait); + sem->hp.task = NULL; + sem->holder = NULL; + for (i = 0; i < NR_CPUS; i++) + sem->hp.cpu_task[i] = NULL; + return sem; +} + +static int open_fmlp_semaphore(struct od_table_entry* entry, void* __user arg) +{ + if (!fmlp_active()) + return -EBUSY; + return 0; +} + +static void destroy_fmlp_semaphore(void* sem) +{ + /* XXX assert invariants */ + kfree(sem); +} + +struct fdso_ops fmlp_sem_ops = { + .create = create_fmlp_semaphore, + .open = open_fmlp_semaphore, + .destroy = destroy_fmlp_semaphore +}; + +struct wq_pair { + struct task_struct* tsk; + struct pi_semaphore* sem; +}; + +static int rt_pi_wake_up(wait_queue_t *wait, unsigned mode, int sync, + void *key) +{ + struct wq_pair* wqp = (struct wq_pair*) wait->private; + set_rt_flags(wqp->tsk, RT_F_EXIT_SEM); + litmus->inherit_priority(wqp->sem, wqp->tsk); + TRACE_TASK(wqp->tsk, + "woken up by rt_pi_wake_up() (RT_F_SEM_EXIT, PI)\n"); + /* point to task for default_wake_function() */ + wait->private = wqp->tsk; + default_wake_function(wait, mode, sync, key); + + /* Always return true since we know that if we encountered a task + * that was already running the wake_up raced with the schedule in + * rt_pi_down(). In that case the task in rt_pi_down() will be scheduled + * immediately and own the lock. We must not wake up another task in + * any case. + */ + return 1; +} + +/* caller is responsible for locking */ +int edf_set_hp_task(struct pi_semaphore *sem) +{ + struct list_head *tmp, *next; + struct task_struct *queued; + int ret = 0; + + sem->hp.task = NULL; + list_for_each_safe(tmp, next, &sem->wait.task_list) { + queued = ((struct wq_pair*) + list_entry(tmp, wait_queue_t, + task_list)->private)->tsk; + + /* Compare task prios, find high prio task. */ + if (edf_higher_prio(queued, sem->hp.task)) { + sem->hp.task = queued; + ret = 1; + } + } + return ret; +} + +/* caller is responsible for locking */ +int edf_set_hp_cpu_task(struct pi_semaphore *sem, int cpu) +{ + struct list_head *tmp, *next; + struct task_struct *queued; + int ret = 0; + + sem->hp.cpu_task[cpu] = NULL; + list_for_each_safe(tmp, next, &sem->wait.task_list) { + queued = ((struct wq_pair*) + list_entry(tmp, wait_queue_t, + task_list)->private)->tsk; + + /* Compare task prios, find high prio task. 
*/ + if (get_partition(queued) == cpu && + edf_higher_prio(queued, sem->hp.cpu_task[cpu])) { + sem->hp.cpu_task[cpu] = queued; + ret = 1; + } + } + return ret; +} + +static int do_fmlp_down(struct pi_semaphore* sem) +{ + unsigned long flags; + struct task_struct *tsk = current; + struct wq_pair pair; + int suspended = 1; + wait_queue_t wait = { + .private = &pair, + .func = rt_pi_wake_up, + .task_list = {NULL, NULL} + }; + + pair.tsk = tsk; + pair.sem = sem; + spin_lock_irqsave(&sem->wait.lock, flags); + + if (atomic_dec_return(&sem->count) < 0 || + waitqueue_active(&sem->wait)) { + /* we need to suspend */ + tsk->state = TASK_UNINTERRUPTIBLE; + add_wait_queue_exclusive_locked(&sem->wait, &wait); + + TRACE_CUR("suspends on PI lock %p\n", sem); + litmus->pi_block(sem, tsk); + + /* release lock before sleeping */ + spin_unlock_irqrestore(&sem->wait.lock, flags); + + TS_PI_DOWN_END; + preempt_enable_no_resched(); + + + /* we depend on the FIFO order + * Thus, we don't need to recheck when we wake up, we + * are guaranteed to have the lock since there is only one + * wake up per release + */ + schedule(); + + TRACE_CUR("woke up, now owns PI lock %p\n", sem); + + /* try_to_wake_up() set our state to TASK_RUNNING, + * all we need to do is to remove our wait queue entry + */ + remove_wait_queue(&sem->wait, &wait); + } else { + /* no priority inheritance necessary, since there are no queued + * tasks. + */ + suspended = 0; + TRACE_CUR("acquired PI lock %p, no contention\n", sem); + sem->holder = tsk; + sem->hp.task = tsk; + litmus->inherit_priority(sem, tsk); + spin_unlock_irqrestore(&sem->wait.lock, flags); + } + return suspended; +} + +static void do_fmlp_up(struct pi_semaphore* sem) +{ + unsigned long flags; + + spin_lock_irqsave(&sem->wait.lock, flags); + + TRACE_CUR("releases PI lock %p\n", sem); + litmus->return_priority(sem); + sem->holder = NULL; + if (atomic_inc_return(&sem->count) < 1) + /* there is a task queued */ + wake_up_locked(&sem->wait); + + spin_unlock_irqrestore(&sem->wait.lock, flags); +} + +asmlinkage long sys_fmlp_down(int sem_od) +{ + long ret = 0; + struct pi_semaphore * sem; + int suspended = 0; + + preempt_disable(); + TS_PI_DOWN_START; + + sem = lookup_fmlp_sem(sem_od); + if (sem) + suspended = do_fmlp_down(sem); + else + ret = -EINVAL; + + if (!suspended) { + TS_PI_DOWN_END; + preempt_enable(); + } + + return ret; +} + +asmlinkage long sys_fmlp_up(int sem_od) +{ + long ret = 0; + struct pi_semaphore * sem; + + preempt_disable(); + TS_PI_UP_START; + + sem = lookup_fmlp_sem(sem_od); + if (sem) + do_fmlp_up(sem); + else + ret = -EINVAL; + + + TS_PI_UP_END; + preempt_enable(); + + return ret; +} + +#else + +struct fdso_ops fmlp_sem_ops = {}; + +asmlinkage long sys_fmlp_down(int sem_od) +{ + return -ENOSYS; +} + +asmlinkage long sys_fmlp_up(int sem_od) +{ + return -ENOSYS; +} + +#endif diff --git a/litmus/ft_event.c b/litmus/ft_event.c new file mode 100644 index 0000000..6084b6d --- /dev/null +++ b/litmus/ft_event.c @@ -0,0 +1,43 @@ +#include + +#include + +#ifndef __ARCH_HAS_FEATHER_TRACE +/* provide dummy implementation */ + +int ft_events[MAX_EVENTS]; + +int ft_enable_event(unsigned long id) +{ + if (id < MAX_EVENTS) { + ft_events[id]++; + return 1; + } else + return 0; +} + +int ft_disable_event(unsigned long id) +{ + if (id < MAX_EVENTS && ft_events[id]) { + ft_events[id]--; + return 1; + } else + return 0; +} + +int ft_disable_all_events(void) +{ + int i; + + for (i = 0; i < MAX_EVENTS; i++) + ft_events[i] = 0; + + return MAX_EVENTS; +} + +int 
ft_is_event_enabled(unsigned long id) +{ + return id < MAX_EVENTS && ft_events[id]; +} + +#endif diff --git a/litmus/ftdev.c b/litmus/ftdev.c new file mode 100644 index 0000000..7c933ff --- /dev/null +++ b/litmus/ftdev.c @@ -0,0 +1,352 @@ +#include +#include +#include +#include +#include + +#include +#include +#include + +struct ft_buffer* alloc_ft_buffer(unsigned int count, size_t size) +{ + struct ft_buffer* buf; + size_t total = (size + 1) * count; + char* mem; + int order = 0, pages = 1; + + buf = kmalloc(sizeof(struct ft_buffer), GFP_KERNEL); + if (!buf) + return NULL; + + total = (total / PAGE_SIZE) + (total % PAGE_SIZE != 0); + while (pages < total) { + order++; + pages *= 2; + } + + mem = (char*) __get_free_pages(GFP_KERNEL, order); + if (!mem) { + kfree(buf); + return NULL; + } + + if (!init_ft_buffer(buf, count, size, + mem + (count * size), /* markers at the end */ + mem)) { /* buffer objects */ + free_pages((unsigned long) mem, order); + kfree(buf); + return NULL; + } + return buf; +} + +void free_ft_buffer(struct ft_buffer* buf) +{ + int order = 0, pages = 1; + size_t total; + + if (buf) { + total = (buf->slot_size + 1) * buf->slot_count; + total = (total / PAGE_SIZE) + (total % PAGE_SIZE != 0); + while (pages < total) { + order++; + pages *= 2; + } + free_pages((unsigned long) buf->buffer_mem, order); + kfree(buf); + } +} + +struct ftdev_event { + int id; + struct ftdev_event* next; +}; + +static int activate(struct ftdev_event** chain, int id) +{ + struct ftdev_event* ev = kmalloc(sizeof(struct ftdev_event), GFP_KERNEL); + if (ev) { + printk(KERN_INFO + "Enabling feather-trace event %d.\n", (int) id); + ft_enable_event(id); + ev->id = id; + ev->next = *chain; + *chain = ev; + } + return ev ? 0 : -ENOMEM; +} + +static void deactivate(struct ftdev_event** chain, int id) +{ + struct ftdev_event **cur = chain; + struct ftdev_event *nxt; + while (*cur) { + if ((*cur)->id == id) { + nxt = (*cur)->next; + kfree(*cur); + *cur = nxt; + printk(KERN_INFO + "Disabling feather-trace event %d.\n", (int) id); + ft_disable_event(id); + break; + } + cur = &(*cur)->next; + } +} + +static int ftdev_open(struct inode *in, struct file *filp) +{ + struct ftdev* ftdev; + struct ftdev_minor* ftdm; + unsigned int buf_idx = iminor(in); + int err = 0; + + ftdev = container_of(in->i_cdev, struct ftdev, cdev); + + if (buf_idx >= ftdev->minor_cnt) { + err = -ENODEV; + goto out; + } + if (ftdev->can_open && (err = ftdev->can_open(ftdev, buf_idx))) + goto out; + + ftdm = ftdev->minor + buf_idx; + filp->private_data = ftdm; + + if (mutex_lock_interruptible(&ftdm->lock)) { + err = -ERESTARTSYS; + goto out; + } + + if (!ftdm->readers && ftdev->alloc) + err = ftdev->alloc(ftdev, buf_idx); + if (0 == err) + ftdm->readers++; + + mutex_unlock(&ftdm->lock); +out: + return err; +} + +static int ftdev_release(struct inode *in, struct file *filp) +{ + struct ftdev* ftdev; + struct ftdev_minor* ftdm; + unsigned int buf_idx = iminor(in); + int err = 0; + + ftdev = container_of(in->i_cdev, struct ftdev, cdev); + + if (buf_idx >= ftdev->minor_cnt) { + err = -ENODEV; + goto out; + } + ftdm = ftdev->minor + buf_idx; + + if (mutex_lock_interruptible(&ftdm->lock)) { + err = -ERESTARTSYS; + goto out; + } + + if (ftdm->readers == 1) { + while (ftdm->events) + deactivate(&ftdm->events, ftdm->events->id); + + /* wait for any pending events to complete */ + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(HZ); + + printk(KERN_ALERT "Failed trace writes: %u\n", + ftdm->buf->failed_writes); + + if (ftdev->free) + 
ftdev->free(ftdev, buf_idx); + } + + ftdm->readers--; + mutex_unlock(&ftdm->lock); +out: + return err; +} + +/* based on ft_buffer_read + * @returns < 0 : page fault + * = 0 : no data available + * = 1 : one slot copied + */ +static int ft_buffer_copy_to_user(struct ft_buffer* buf, char __user *dest) +{ + unsigned int idx; + int err = 0; + if (buf->free_count != buf->slot_count) { + /* data available */ + idx = buf->read_idx % buf->slot_count; + if (buf->slots[idx] == SLOT_READY) { + err = copy_to_user(dest, ((char*) buf->buffer_mem) + + idx * buf->slot_size, + buf->slot_size); + if (err == 0) { + /* copy ok */ + buf->slots[idx] = SLOT_FREE; + buf->read_idx++; + fetch_and_inc(&buf->free_count); + err = 1; + } + } + } + return err; +} + +static ssize_t ftdev_read(struct file *filp, + char __user *to, size_t len, loff_t *f_pos) +{ + /* we ignore f_pos, this is strictly sequential */ + + ssize_t err = 0; + size_t chunk; + int copied; + struct ftdev_minor* ftdm = filp->private_data; + + if (mutex_lock_interruptible(&ftdm->lock)) { + err = -ERESTARTSYS; + goto out; + } + + + chunk = ftdm->buf->slot_size; + while (len >= chunk) { + copied = ft_buffer_copy_to_user(ftdm->buf, to); + if (copied == 1) { + len -= chunk; + to += chunk; + err += chunk; + } else if (err == 0 && copied == 0 && ftdm->events) { + /* Only wait if there are any events enabled and only + * if we haven't copied some data yet. We cannot wait + * here with copied data because that data would get + * lost if the task is interrupted (e.g., killed). + */ + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(50); + if (signal_pending(current)) { + if (err == 0) + /* nothing read yet, signal problem */ + err = -ERESTARTSYS; + break; + } + } else if (copied < 0) { + /* page fault */ + err = copied; + break; + } else + /* nothing left to get, return to user space */ + break; + } + mutex_unlock(&ftdm->lock); +out: + return err; +} + +typedef uint32_t cmd_t; + +static ssize_t ftdev_write(struct file *filp, const char __user *from, + size_t len, loff_t *f_pos) +{ + struct ftdev_minor* ftdm = filp->private_data; + ssize_t err = -EINVAL; + cmd_t cmd; + cmd_t id; + + if (len % sizeof(cmd_t) || len < 2 * sizeof(cmd_t)) + goto out; + + if (copy_from_user(&cmd, from, sizeof(cmd_t))) { + err = -EFAULT; + goto out; + } + len -= sizeof(cmd_t); + from += sizeof(cmd_t); + + if (cmd != FTDEV_ENABLE_CMD && cmd != FTDEV_DISABLE_CMD) + goto out; + + if (mutex_lock_interruptible(&ftdm->lock)) { + err = -ERESTARTSYS; + goto out; + } + + err = sizeof(cmd_t); + while (len) { + if (copy_from_user(&id, from, sizeof(cmd_t))) { + err = -EFAULT; + goto out_unlock; + } + /* FIXME: check id against list of acceptable events */ + len -= sizeof(cmd_t); + from += sizeof(cmd_t); + if (cmd == FTDEV_DISABLE_CMD) + deactivate(&ftdm->events, id); + else if (activate(&ftdm->events, id) != 0) { + err = -ENOMEM; + goto out_unlock; + } + err += sizeof(cmd_t); + } + +out_unlock: + mutex_unlock(&ftdm->lock); +out: + return err; +} + +struct file_operations ftdev_fops = { + .owner = THIS_MODULE, + .open = ftdev_open, + .release = ftdev_release, + .write = ftdev_write, + .read = ftdev_read, +}; + + +void ftdev_init(struct ftdev* ftdev, struct module* owner) +{ + int i; + cdev_init(&ftdev->cdev, &ftdev_fops); + ftdev->cdev.owner = owner; + ftdev->cdev.ops = &ftdev_fops; + ftdev->minor_cnt = 0; + for (i = 0; i < MAX_FTDEV_MINORS; i++) { + mutex_init(&ftdev->minor[i].lock); + ftdev->minor[i].readers = 0; + ftdev->minor[i].buf = NULL; + ftdev->minor[i].events = NULL; + } + 
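
The write/read protocol implemented above is easy to drive from userspace: a control write consists of a cmd_t command word followed by one or more cmd_t event ids, and reads hand back whole slots. A sketch follows; the device path and event id 100 (TS_SCHED_START above) are examples only, and FTDEV_ENABLE_CMD is assumed to come from the litmus/ftdev.h header.

#include <stdint.h>
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

typedef uint32_t cmd_t;

int main(int argc, char **argv)
{
	cmd_t enable[2] = { FTDEV_ENABLE_CMD, 100 };	/* enable event id 100 */
	char buf[4096];
	ssize_t n;
	int fd = argc > 1 ? open(argv[1], O_RDWR) : -1;

	if (fd < 0)
		return 1;
	if (write(fd, enable, sizeof(enable)) != (ssize_t) sizeof(enable))
		return 1;
	/* read() returns whole slots; it blocks while events are enabled
	 * and no data is available yet */
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stdout);
	close(fd);
	return 0;
}
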
ftdev->alloc = NULL; + ftdev->free = NULL; + ftdev->can_open = NULL; +} + +int register_ftdev(struct ftdev* ftdev, const char* name, int major) +{ + dev_t trace_dev; + int error = 0; + + trace_dev = MKDEV(major, 0); + error = register_chrdev_region(trace_dev, ftdev->minor_cnt, name); + if (error) + { + printk(KERN_WARNING "ftdev(%s): " + "Could not register major/minor number %d/%u\n", + name, major, ftdev->minor_cnt); + return error; + } + error = cdev_add(&ftdev->cdev, trace_dev, ftdev->minor_cnt); + if (error) { + printk(KERN_WARNING "ftdev(%s): " + "Could not add cdev for major/minor = %d/%u.\n", + name, major, ftdev->minor_cnt); + return error; + } + return error; +} diff --git a/litmus/jobs.c b/litmus/jobs.c new file mode 100644 index 0000000..e294bc5 --- /dev/null +++ b/litmus/jobs.c @@ -0,0 +1,43 @@ +/* litmus/jobs.c - common job control code + */ + +#include + +#include +#include + +void prepare_for_next_period(struct task_struct *t) +{ + BUG_ON(!t); + /* prepare next release */ + t->rt_param.job_params.release = t->rt_param.job_params.deadline; + t->rt_param.job_params.deadline += get_rt_period(t); + t->rt_param.job_params.exec_time = 0; + /* update job sequence number */ + t->rt_param.job_params.job_no++; + + /* don't confuse Linux */ + t->time_slice = 1; +} + +void release_at(struct task_struct *t, lt_t start) +{ + t->rt_param.job_params.deadline = start; + prepare_for_next_period(t); + set_rt_flags(t, RT_F_RUNNING); +} + + +/* + * Deactivate current task until the beginning of the next period. + */ +long complete_job(void) +{ + /* Mark that we do not excute anymore */ + set_rt_flags(current, RT_F_SLEEP); + /* call schedule, this will return when a new job arrives + * it also takes care of preparing for the next release + */ + schedule(); + return 0; +} diff --git a/litmus/litmus.c b/litmus/litmus.c new file mode 100644 index 0000000..314bdda --- /dev/null +++ b/litmus/litmus.c @@ -0,0 +1,851 @@ +/* litmus.c -- Implementation of the LITMUS syscalls, the LITMUS intialization code, + * and the procfs interface.. + */ +#include +#include +#include + +#include +#include +#include + +#include +#include +#include + +#include + +#include + +/* Number of RT tasks that exist in the system */ +atomic_t rt_task_count = ATOMIC_INIT(0); +static DEFINE_SPINLOCK(task_transition_lock); + +/* Give log messages sequential IDs. */ +atomic_t __log_seq_no = ATOMIC_INIT(0); + +/* To send signals from the scheduler + * Must drop locks first. + */ +static LIST_HEAD(sched_sig_list); +static DEFINE_SPINLOCK(sched_sig_list_lock); + +static struct kmem_cache * heap_node_cache; + +struct heap_node* heap_node_alloc(int gfp_flags) +{ + return kmem_cache_alloc(heap_node_cache, gfp_flags); +} + +void heap_node_free(struct heap_node* hn) +{ + kmem_cache_free(heap_node_cache, hn); +} + +/* + * sys_set_task_rt_param + * @pid: Pid of the task which scheduling parameters must be changed + * @param: New real-time extension parameters such as the execution cost and + * period + * Syscall for manipulating with task rt extension params + * Returns EFAULT if param is NULL. + * ESRCH if pid is not corrsponding + * to a valid task. + * EINVAL if either period or execution cost is <=0 + * EPERM if pid is a real-time task + * 0 if success + * + * Only non-real-time tasks may be configured with this system call + * to avoid races with the scheduler. 
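
A quick worked example of the bookkeeping in prepare_for_next_period(): suppose a task with a 10 ms period finishes job 3, whose release was at t = 30 ms and whose deadline is t = 40 ms. The next release is set to the old deadline (40 ms), the new deadline becomes 50 ms, exec_time is reset to 0, and job_no advances to 4. release_at(t, start) reuses the same helper by first seeding the deadline with the requested start time, so the first job is released at exactly 'start'.
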
In practice, this means that a + * task's parameters must be set _before_ calling sys_prepare_rt_task() + */ +asmlinkage long sys_set_rt_task_param(pid_t pid, struct rt_task __user * param) +{ + struct rt_task tp; + struct task_struct *target; + int retval = -EINVAL; + + printk("Setting up rt task parameters for process %d.\n", pid); + + if (pid < 0 || param == 0) { + goto out; + } + if (copy_from_user(&tp, param, sizeof(tp))) { + retval = -EFAULT; + goto out; + } + + /* Task search and manipulation must be protected */ + read_lock_irq(&tasklist_lock); + if (!(target = find_task_by_pid(pid))) { + retval = -ESRCH; + goto out_unlock; + } + + if (is_realtime(target)) { + /* The task is already a real-time task. + * We cannot not allow parameter changes at this point. + */ + retval = -EBUSY; + goto out_unlock; + } + + if (tp.exec_cost <= 0) + goto out_unlock; + if (tp.period <= 0) + goto out_unlock; + if (!cpu_online(tp.cpu)) + goto out_unlock; + if (tp.period < tp.exec_cost) + { + printk(KERN_INFO "litmus: real-time task %d rejected " + "because wcet > period\n", pid); + goto out_unlock; + } + + target->rt_param.task_params = tp; + + retval = 0; + out_unlock: + read_unlock_irq(&tasklist_lock); + out: + return retval; +} + +/* Getter of task's RT params + * returns EINVAL if param or pid is NULL + * returns ESRCH if pid does not correspond to a valid task + * returns EFAULT if copying of parameters has failed. + */ +asmlinkage long sys_get_rt_task_param(pid_t pid, struct rt_task __user * param) +{ + int retval = -EINVAL; + struct task_struct *source; + struct rt_task lp; + if (param == 0 || pid < 0) + goto out; + read_lock(&tasklist_lock); + if (!(source = find_task_by_pid(pid))) { + retval = -ESRCH; + goto out_unlock; + } + lp = source->rt_param.task_params; + read_unlock(&tasklist_lock); + /* Do copying outside the lock */ + retval = + copy_to_user(param, &lp, sizeof(lp)) ? -EFAULT : 0; + return retval; + out_unlock: + read_unlock(&tasklist_lock); + out: + return retval; + +} + +/* + * This is the crucial function for periodic task implementation, + * It checks if a task is periodic, checks if such kind of sleep + * is permitted and calls plugin-specific sleep, which puts the + * task into a wait array. + * returns 0 on successful wakeup + * returns EPERM if current conditions do not permit such sleep + * returns EINVAL if current task is not able to go to sleep + */ +asmlinkage long sys_complete_job(void) +{ + int retval = -EPERM; + if (!is_realtime(current)) { + retval = -EINVAL; + goto out; + } + /* Task with negative or zero period cannot sleep */ + if (get_rt_period(current) <= 0) { + retval = -EINVAL; + goto out; + } + /* The plugin has to put the task into an + * appropriate queue and call schedule + */ + retval = litmus->complete_job(); + out: + return retval; +} + +/* This is an "improved" version of sys_complete_job that + * addresses the problem of unintentionally missing a job after + * an overrun. 
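
Putting the admission path together from the userspace side: a launcher (or the task itself, while still best-effort) first stores the parameters with set_rt_task_param() and then transitions via sched_setscheduler() with the SCHED_LITMUS policy. The sketch below assumes the patched kernel headers provide struct rt_task, SCHED_LITMUS and the __NR_ constants, and that exec_cost/period are given in nanoseconds with exec_cost <= period and an online cpu (otherwise admission fails as shown above).

#include <sched.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

/* Sketch only: relies on the patched kernel headers for struct rt_task,
 * SCHED_LITMUS and the __NR_ syscall numbers. */
int become_rt_task(pid_t pid, unsigned long long wcet_ns,
		   unsigned long long period_ns, int cpu)
{
	struct rt_task tp;
	struct sched_param sp = { .sched_priority = 0 };

	memset(&tp, 0, sizeof(tp));
	tp.exec_cost = wcet_ns;		/* must not exceed tp.period */
	tp.period    = period_ns;
	tp.cpu       = cpu;		/* must be an online CPU */

	/* parameters can only be set while the task is not yet real-time */
	if (syscall(__NR_set_rt_task_param, pid, &tp) != 0)
		return -1;
	/* the actual transition (and litmus_admit_task()) happens here */
	return sched_setscheduler(pid, SCHED_LITMUS, &sp);
}
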
+ * + * returns 0 on successful wakeup + * returns EPERM if current conditions do not permit such sleep + * returns EINVAL if current task is not able to go to sleep + */ +asmlinkage long sys_wait_for_job_release(unsigned int job) +{ + int retval = -EPERM; + if (!is_realtime(current)) { + retval = -EINVAL; + goto out; + } + + /* Task with negative or zero period cannot sleep */ + if (get_rt_period(current) <= 0) { + retval = -EINVAL; + goto out; + } + + retval = 0; + + /* first wait until we have "reached" the desired job + * + * This implementation has at least two problems: + * + * 1) It doesn't gracefully handle the wrap around of + * job_no. Since LITMUS is a prototype, this is not much + * of a problem right now. + * + * 2) It is theoretically racy if a job release occurs + * between checking job_no and calling sleep_next_period(). + * A proper solution would requiring adding another callback + * in the plugin structure and testing the condition with + * interrupts disabled. + * + * FIXME: At least problem 2 should be taken care of eventually. + */ + while (!retval && job > current->rt_param.job_params.job_no) + /* If the last job overran then job <= job_no and we + * don't send the task to sleep. + */ + retval = litmus->complete_job(); + out: + return retval; +} + +/* This is a helper syscall to query the current job sequence number. + * + * returns 0 on successful query + * returns EPERM if task is not a real-time task. + * returns EFAULT if &job is not a valid pointer. + */ +asmlinkage long sys_query_job_no(unsigned int __user *job) +{ + int retval = -EPERM; + if (is_realtime(current)) + retval = put_user(current->rt_param.job_params.job_no, job); + + return retval; +} + +struct sched_sig { + struct list_head list; + struct task_struct* task; + unsigned int signal:31; + int force:1; +}; + +static void __scheduler_signal(struct task_struct *t, unsigned int signo, + int force) +{ + struct sched_sig* sig; + + sig = kmalloc(GFP_ATOMIC, sizeof(struct sched_sig)); + if (!sig) { + TRACE_TASK(t, "dropping signal: %u\n", t); + return; + } + + spin_lock(&sched_sig_list_lock); + + sig->signal = signo; + sig->force = force; + sig->task = t; + get_task_struct(t); + list_add(&sig->list, &sched_sig_list); + + spin_unlock(&sched_sig_list_lock); +} + +void scheduler_signal(struct task_struct *t, unsigned int signo) +{ + __scheduler_signal(t, signo, 0); +} + +void force_scheduler_signal(struct task_struct *t, unsigned int signo) +{ + __scheduler_signal(t, signo, 1); +} + +/* FIXME: get rid of the locking and do this on a per-processor basis */ +void send_scheduler_signals(void) +{ + unsigned long flags; + struct list_head *p, *extra; + struct siginfo info; + struct sched_sig* sig; + struct task_struct* t; + struct list_head claimed; + + if (spin_trylock_irqsave(&sched_sig_list_lock, flags)) { + if (list_empty(&sched_sig_list)) + p = NULL; + else { + p = sched_sig_list.next; + list_del(&sched_sig_list); + INIT_LIST_HEAD(&sched_sig_list); + } + spin_unlock_irqrestore(&sched_sig_list_lock, flags); + + /* abort if there are no signals */ + if (!p) + return; + + /* take signal list we just obtained */ + list_add(&claimed, p); + + list_for_each_safe(p, extra, &claimed) { + list_del(p); + sig = list_entry(p, struct sched_sig, list); + t = sig->task; + info.si_signo = sig->signal; + info.si_errno = 0; + info.si_code = SI_KERNEL; + info.si_pid = 1; + info.si_uid = 0; + TRACE("sending signal %d to %d\n", info.si_signo, + t->pid); + if (sig->force) + force_sig_info(sig->signal, &info, t); + else + 
send_sig_info(sig->signal, &info, t); + put_task_struct(t); + kfree(sig); + } + } + +} + +#ifdef CONFIG_NP_SECTION + +static inline void np_mem_error(struct task_struct* t, const char* reason) +{ + if (t->state != TASK_DEAD && !(t->flags & PF_EXITING)) { + TRACE("np section: %s => %s/%d killed\n", + reason, t->comm, t->pid); + force_scheduler_signal(t, SIGKILL); + } +} + +/* sys_register_np_flag() allows real-time tasks to register an + * np section indicator. + * returns 0 if the flag was successfully registered + * returns EINVAL if current task is not a real-time task + * returns EFAULT if *flag couldn't be written + */ +asmlinkage long sys_register_np_flag(short __user *flag) +{ + int retval = -EINVAL; + short test_val = RT_PREEMPTIVE; + + /* avoid races with the scheduler */ + preempt_disable(); + TRACE("reg_np_flag(%p) for %s/%d\n", flag, + current->comm, current->pid); + + /* Let's first try to write to the address. + * That way it is initialized and any bugs + * involving dangling pointers will caught + * early. + * NULL indicates disabling np section support + * and should not be tested. + */ + if (flag) + retval = poke_kernel_address(test_val, flag); + else + retval = 0; + TRACE("reg_np_flag: retval=%d\n", retval); + if (unlikely(0 != retval)) + np_mem_error(current, "np flag: not writable"); + else + /* the pointer is ok */ + current->rt_param.np_flag = flag; + + preempt_enable(); + return retval; +} + + +void request_exit_np(struct task_struct *t) +{ + int ret; + short flag; + + /* We can only do this if t is actually currently scheduled on this CPU + * because otherwise we are in the wrong address space. Thus make sure + * to check. + */ + BUG_ON(t != current); + + if (unlikely(!is_realtime(t) || !t->rt_param.np_flag)) { + TRACE_TASK(t, "request_exit_np(): BAD TASK!\n"); + return; + } + + flag = RT_EXIT_NP_REQUESTED; + ret = poke_kernel_address(flag, t->rt_param.np_flag + 1); + TRACE("request_exit_np(%s/%d)\n", t->comm, t->pid); + if (unlikely(0 != ret)) + np_mem_error(current, "request_exit_np(): flag not writable"); + +} + + +int is_np(struct task_struct* t) +{ + int ret; + unsigned short flag = 0x5858; /* = XX, looks nicer in debug*/ + + BUG_ON(t != current); + + if (unlikely(t->rt_param.kernel_np)) + return 1; + else if (unlikely(t->rt_param.np_flag == NULL) || + t->flags & PF_EXITING || + t->state == TASK_DEAD) + return 0; + else { + /* This is the tricky part. The process has registered a + * non-preemptive section marker. We now need to check whether + * it is set to to NON_PREEMPTIVE. Along the way we could + * discover that the pointer points to an unmapped region (=> + * kill the task) or that the location contains some garbage + * value (=> also kill the task). Killing the task in any case + * forces userspace to play nicely. Any bugs will be discovered + * immediately. + */ + ret = probe_kernel_address(t->rt_param.np_flag, flag); + if (0 == ret && (flag == RT_NON_PREEMPTIVE || + flag == RT_PREEMPTIVE)) + return flag != RT_PREEMPTIVE; + else { + /* either we could not read from the address or + * it contained garbage => kill the process + * FIXME: Should we cause a SEGFAULT instead? + */ + TRACE("is_np: ret=%d flag=%c%c (%x)\n", ret, + flag & 0xff, (flag >> 8) & 0xff, flag); + np_mem_error(t, "is_np() could not read"); + return 0; + } + } +} + +/* + * sys_exit_np() allows real-time tasks to signal that it left a + * non-preemptable section. It will be called after the kernel requested a + * callback in the preemption indicator flag. 
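
The flag protocol that is_np() and request_exit_np() implement can be driven from userspace roughly as follows. This is a sketch under several assumptions: the RT_PREEMPTIVE/RT_NON_PREEMPTIVE/RT_EXIT_NP_REQUESTED constants come from the LITMUS headers, the exit-request word is the short immediately following the registered flag (as the poke to np_flag + 1 above suggests), and a production version would additionally need compiler/memory barriers.

#include <unistd.h>
#include <sys/syscall.h>

/* [0] = preemptivity state, [1] = kernel's exit-request word (np_flag + 1) */
static volatile short np_flag[2];

static void np_register(void)
{
	/* only meaningful for a LITMUS real-time task; the kernel
	 * test-writes the flag to catch bad pointers early */
	syscall(__NR_register_np_flag, np_flag);
}

static void np_enter(void)
{
	np_flag[0] = RT_NON_PREEMPTIVE;
}

static void np_leave(void)
{
	np_flag[0] = RT_PREEMPTIVE;
	if (np_flag[1] == RT_EXIT_NP_REQUESTED) {
		np_flag[1] = 0;			/* acknowledge (assumed convention) */
		syscall(__NR_exit_np);		/* let the deferred preemption happen */
	}
}
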
+ * returns 0 if the signal was valid and processed. + * returns EINVAL if current task is not a real-time task + */ +asmlinkage long sys_exit_np(void) +{ + int retval = -EINVAL; + + TS_EXIT_NP_START; + + if (!is_realtime(current)) + goto out; + + TRACE("sys_exit_np(%s/%d)\n", current->comm, current->pid); + /* force rescheduling so that we can be preempted */ + set_tsk_need_resched(current); + retval = 0; + out: + + TS_EXIT_NP_END; + return retval; +} + +#else /* !CONFIG_NP_SECTION */ + +asmlinkage long sys_register_np_flag(short __user *flag) +{ + return -ENOSYS; +} + +asmlinkage long sys_exit_np(void) +{ + return -ENOSYS; +} + +#endif /* CONFIG_NP_SECTION */ + + +/* p is a real-time task. Re-init its state as a best-effort task. */ +static void reinit_litmus_state(struct task_struct* p, int restore) +{ + struct rt_task user_config = {}; + __user short *np_flag = NULL; + + if (restore) { + /* Safe user-space provided configuration data. */ + user_config = p->rt_param.task_params; + np_flag = p->rt_param.np_flag; + } + + /* We probably should not be inheriting any task's priority + * at this point in time. + */ + WARN_ON(p->rt_param.inh_task); + + /* We need to restore the priority of the task. */ +// __setscheduler(p, p->rt_param.old_policy, p->rt_param.old_prio); + + /* Cleanup everything else. */ + memset(&p->rt_param, 0, sizeof(struct rt_task)); + + /* Restore preserved fields. */ + if (restore) { + p->rt_param.task_params = user_config; + p->rt_param.np_flag = np_flag; + } +} + +long litmus_admit_task(struct task_struct* tsk) +{ + long retval = 0; + long flags; + + BUG_ON(is_realtime(tsk)); + + if (get_rt_period(tsk) == 0 || + get_exec_cost(tsk) > get_rt_period(tsk)) { + TRACE_TASK(tsk, "litmus admit: invalid task parameters " + "(%lu, %lu)\n", + get_exec_cost(tsk), get_rt_period(tsk)); + return -EINVAL; + } + + if (!cpu_online(get_partition(tsk))) + { + TRACE_TASK(tsk, "litmus admit: cpu %d is not online\n", + get_partition(tsk)); + return -EINVAL; + } + + INIT_LIST_HEAD(&tsk_rt(tsk)->list); + + /* avoid scheduler plugin changing underneath us */ + spin_lock_irqsave(&task_transition_lock, flags); + + /* allocate heap node for this task */ + tsk_rt(tsk)->heap_node = kmem_cache_alloc(heap_node_cache, GFP_ATOMIC); + if (!tsk_rt(tsk)->heap_node) { + printk(KERN_WARNING "litmus: no more heap node memory!?\n"); + retval = -ENOMEM; + } else + heap_node_init(&tsk_rt(tsk)->heap_node, tsk); + + if (!retval) + retval = litmus->admit_task(tsk); + + if (!retval) { + sched_trace_task_name(tsk); + sched_trace_task_param(tsk); + atomic_inc(&rt_task_count); + } + + spin_unlock_irqrestore(&task_transition_lock, flags); + + return retval; +} + +void litmus_exit_task(struct task_struct* tsk) +{ + if (is_realtime(tsk)) { + sched_trace_task_completion(tsk, 1); + litmus->task_exit(tsk); + BUG_ON(heap_node_in_heap(tsk_rt(tsk)->heap_node)); + kmem_cache_free(heap_node_cache, tsk_rt(tsk)->heap_node); + atomic_dec(&rt_task_count); + reinit_litmus_state(tsk, 1); + } +} + +/* Switching a plugin in use is tricky. + * We must watch out that no real-time tasks exists + * (and that none is created in parallel) and that the plugin is not + * currently in use on any processor (in theory). + * + * For now, we don't enforce the second part since it is unlikely to cause + * any trouble by itself as long as we don't unload modules. 
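For completeness, a hedged userspace sketch of the np-flag protocol implemented by sys_register_np_flag(), is_np(), request_exit_np() and sys_exit_np() above. RT_PREEMPTIVE, RT_NON_PREEMPTIVE and RT_EXIT_NP_REQUESTED come from the LITMUS^RT headers added elsewhere in this patch; register_np_flag() and exit_np() are assumed wrapper names for the two syscalls; the two-short layout (the kernel writes its exit request to the short following the flag) is inferred from request_exit_np() above.

/* flag word the scheduler reads, plus the word it writes its request to */
static volatile short np_ctl[2];

void np_setup(void)
{
	np_ctl[0] = RT_PREEMPTIVE;
	np_ctl[1] = 0;
	register_np_flag((short *) np_ctl);	/* -> sys_register_np_flag() */
}

void np_enter(void)
{
	np_ctl[0] = RT_NON_PREEMPTIVE;	/* is_np() now reports non-preemptive */
}

void np_leave(void)
{
	np_ctl[0] = RT_PREEMPTIVE;
	if (np_ctl[1] == RT_EXIT_NP_REQUESTED) {
		/* the scheduler wanted to preempt us while non-preemptive */
		np_ctl[1] = 0;
		exit_np();	/* -> sys_exit_np(), triggers rescheduling */
	}
}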
+ */ +int switch_sched_plugin(struct sched_plugin* plugin) +{ + long flags; + int ret = 0; + + BUG_ON(!plugin); + + /* stop task transitions */ + spin_lock_irqsave(&task_transition_lock, flags); + + /* don't switch if there are active real-time tasks */ + if (atomic_read(&rt_task_count) == 0) { + ret = litmus->deactivate_plugin(); + if (0 != ret) + goto out; + ret = plugin->activate_plugin(); + if (0 != ret) { + printk(KERN_INFO "Can't activate %s (%d).\n", + plugin->plugin_name, ret); + plugin = &linux_sched_plugin; + } + printk(KERN_INFO "Switching to LITMUS^RT plugin %s.\n", plugin->plugin_name); + litmus = plugin; + } else + ret = -EBUSY; +out: + spin_unlock_irqrestore(&task_transition_lock, flags); + return ret; +} + +/* Called upon fork. + * p is the newly forked task. + */ +void litmus_fork(struct task_struct* p) +{ + if (is_realtime(p)) + /* clean out any litmus related state, don't preserve anything*/ + reinit_litmus_state(p, 0); +} + +/* Called upon execve(). + * current is doing the exec. + * Don't let address space specific stuff leak. + */ +void litmus_exec(void) +{ + struct task_struct* p = current; + + if (is_realtime(p)) { + WARN_ON(p->rt_param.inh_task); + p->rt_param.np_flag = NULL; + } +} + +void exit_litmus(struct task_struct *dead_tsk) +{ + if (is_realtime(dead_tsk)) + litmus_exit_task(dead_tsk); +} + + +void list_qsort(struct list_head* list, list_cmp_t less_than) +{ + struct list_head lt; + struct list_head geq; + struct list_head *pos, *extra, *pivot; + int n_lt = 0, n_geq = 0; + BUG_ON(!list); + + if (list->next == list) + return; + + INIT_LIST_HEAD(<); + INIT_LIST_HEAD(&geq); + + pivot = list->next; + list_del(pivot); + list_for_each_safe(pos, extra, list) { + list_del(pos); + if (less_than(pos, pivot)) { + list_add(pos, <); + n_lt++; + } else { + list_add(pos, &geq); + n_geq++; + } + } + if (n_lt < n_geq) { + list_qsort(<, less_than); + list_qsort(&geq, less_than); + } else { + list_qsort(&geq, less_than); + list_qsort(<, less_than); + } + list_splice(&geq, list); + list_add(pivot, list); + list_splice(<, list); +} + +#ifdef CONFIG_MAGIC_SYSRQ +int sys_kill(int pid, int sig); + +static void sysrq_handle_kill_rt_tasks(int key, struct tty_struct *tty) +{ + struct task_struct *t; + read_lock(&tasklist_lock); + for_each_process(t) { + if (is_realtime(t)) { + sys_kill(t->pid, SIGKILL); + } + } + read_unlock(&tasklist_lock); +} + +static struct sysrq_key_op sysrq_kill_rt_tasks_op = { + .handler = sysrq_handle_kill_rt_tasks, + .help_msg = "Quit-rt-tasks", + .action_msg = "sent SIGKILL to all real-time tasks", +}; +#endif + +static int proc_read_stats(char *page, char **start, + off_t off, int count, + int *eof, void *data) +{ + int len; + + len = snprintf(page, PAGE_SIZE, + "real-time task count = %d\n", + atomic_read(&rt_task_count)); + return len; +} + +static int proc_read_plugins(char *page, char **start, + off_t off, int count, + int *eof, void *data) +{ + int len; + + len = print_sched_plugins(page, PAGE_SIZE); + return len; +} + +static int proc_read_curr(char *page, char **start, + off_t off, int count, + int *eof, void *data) +{ + int len; + + len = snprintf(page, PAGE_SIZE, "%s\n", litmus->plugin_name); + return len; +} + +static int proc_write_curr(struct file *file, + const char *buffer, + unsigned long count, + void *data) +{ + int len, ret; + char name[65]; + struct sched_plugin* found; + + if(count > 64) + len = 64; + else + len = count; + + if(copy_from_user(name, buffer, len)) + return -EFAULT; + + name[len] = '\0'; + /* chomp name */ + if (len > 1 && 
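A minimal usage sketch for list_qsort() above; the list_cmp_t callback type is assumed (from include/litmus/litmus.h, not shown in this hunk) to take two struct list_head pointers and return non-zero when the first entry should sort before the second, matching how less_than() is invoked in the implementation. struct item is purely illustrative.

#include <linux/list.h>

struct item {
	int key;
	struct list_head link;
};

/* non-zero iff a sorts before b (ascending key order) */
static int item_less(struct list_head *a, struct list_head *b)
{
	return list_entry(a, struct item, link)->key <
	       list_entry(b, struct item, link)->key;
}

static void sort_items(struct list_head *head)
{
	list_qsort(head, item_less);	/* sorts the entries on 'head' in place */
}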
name[len - 1] == '\n') + name[len - 1] = '\0'; + + found = find_sched_plugin(name); + + if (found) { + ret = switch_sched_plugin(found); + if (ret != 0) + printk(KERN_INFO "Could not switch plugin: %d\n", ret); + } else + printk(KERN_INFO "Plugin '%s' is unknown.\n", name); + + return len; +} + + +static struct proc_dir_entry *litmus_dir = NULL, + *curr_file = NULL, + *stat_file = NULL, + *plugs_file = NULL; + +static int __init init_litmus_proc(void) +{ + litmus_dir = proc_mkdir("litmus", NULL); + if (!litmus_dir) { + printk(KERN_ERR "Could not allocate LITMUS^RT procfs entry.\n"); + return -ENOMEM; + } + litmus_dir->owner = THIS_MODULE; + + curr_file = create_proc_entry("active_plugin", + 0644, litmus_dir); + if (!curr_file) { + printk(KERN_ERR "Could not allocate active_plugin " + "procfs entry.\n"); + return -ENOMEM; + } + curr_file->owner = THIS_MODULE; + curr_file->read_proc = proc_read_curr; + curr_file->write_proc = proc_write_curr; + + stat_file = create_proc_read_entry("stats", 0444, litmus_dir, + proc_read_stats, NULL); + + plugs_file = create_proc_read_entry("plugins", 0444, litmus_dir, + proc_read_plugins, NULL); + + return 0; +} + +static void exit_litmus_proc(void) +{ + if (plugs_file) + remove_proc_entry("plugins", litmus_dir); + if (stat_file) + remove_proc_entry("stats", litmus_dir); + if (curr_file) + remove_proc_entry("active_plugin", litmus_dir); + if (litmus_dir) + remove_proc_entry("litmus", NULL); +} + +extern struct sched_plugin linux_sched_plugin; + +static int __init _init_litmus(void) +{ + /* Common initializers, + * mode change lock is used to enforce single mode change + * operation. + */ + printk("Starting LITMUS^RT kernel\n"); + + register_sched_plugin(&linux_sched_plugin); + + heap_node_cache = KMEM_CACHE(heap_node, 0); + if (!heap_node_cache) + return -ENOMEM; + +#ifdef CONFIG_MAGIC_SYSRQ + /* offer some debugging help */ + if (!register_sysrq_key('q', &sysrq_kill_rt_tasks_op)) + printk("Registered kill rt tasks magic sysrq.\n"); + else + printk("Could not register kill rt tasks magic sysrq.\n"); +#endif + + init_litmus_proc(); + + return 0; +} + +static void _exit_litmus(void) +{ + exit_litmus_proc(); + kmem_cache_destroy(heap_node_cache); +} + +module_init(_init_litmus); +module_exit(_exit_litmus); diff --git a/litmus/norqlock.c b/litmus/norqlock.c new file mode 100644 index 0000000..11f85d3 --- /dev/null +++ b/litmus/norqlock.c @@ -0,0 +1,56 @@ +#include +#include +#include +#include +#include + +#include + +struct worklist { + struct no_rqlock_work* next; +}; + +static DEFINE_PER_CPU(struct worklist, norq_worklist) = {NULL}; + +void init_no_rqlock_work(struct no_rqlock_work* w, work_t work, + unsigned long arg) +{ + w->active = 0; + w->work = work; + w->arg = arg; + w->next = NULL; +} + +void __do_without_rqlock(struct no_rqlock_work *work) +{ + long flags; + struct worklist* wl; + + local_irq_save(flags); + wl = &__get_cpu_var(norq_worklist); + work->next = wl->next; + wl->next = work; + local_irq_restore(flags); +} + +void tick_no_rqlock(void) +{ + long flags; + struct no_rqlock_work *todo, *next; + + local_irq_save(flags); + + next = __get_cpu_var(norq_worklist).next; + __get_cpu_var(norq_worklist).next = NULL; + + while (next) { + todo = next; + next = next->next; + todo->next = NULL; + smp_mb__before_clear_bit(); + clear_bit(0, (void*) &todo->active); + todo->work(todo->arg); + } + + local_irq_restore(flags); +} diff --git a/litmus/rt_domain.c b/litmus/rt_domain.c new file mode 100644 index 0000000..be4ef5e --- /dev/null +++ b/litmus/rt_domain.c 
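As a usage note for the /proc interface created by init_litmus_proc() above: writing a plugin name to /proc/litmus/active_plugin goes through proc_write_curr() and switch_sched_plugin(), and is refused with -EBUSY while real-time tasks exist; reading the file back reports the active plugin. A hedged userspace helper:

#include <stdio.h>

/* Select the active LITMUS^RT plugin by name, e.g. "GSN-EDF" or "C-EDF". */
int set_active_plugin(const char *name)
{
	FILE *f = fopen("/proc/litmus/active_plugin", "w");

	if (!f)
		return -1;
	fprintf(f, "%s\n", name);	/* proc_write_curr() chomps the newline */
	return fclose(f);
}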
@@ -0,0 +1,289 @@ +/* + * kernel/rt_domain.c + * + * LITMUS real-time infrastructure. This file contains the + * functions that manipulate RT domains. RT domains are an abstraction + * of a ready queue and a release queue. + */ + +#include +#include +#include + +#include +#include +#include + +#include + +#include + +#include + +static int dummy_resched(rt_domain_t *rt) +{ + return 0; +} + +static int dummy_order(struct heap_node* a, struct heap_node* b) +{ + return 0; +} + +/* default implementation: use default lock */ +static void default_release_jobs(rt_domain_t* rt, struct heap* tasks) +{ + merge_ready(rt, tasks); +} + +static unsigned int time2slot(lt_t time) +{ + return (unsigned int) time2quanta(time, FLOOR) % RELEASE_QUEUE_SLOTS; +} + +int heap_earlier_release(struct heap_node *_a, struct heap_node *_b) +{ + struct release_heap *a = _a->value; + struct release_heap *b = _b->value; + return lt_before(a->release_time, b->release_time); +} + +/* Caller most hold release lock. + * Will return heap for given time. If no such heap exists prior to the invocation + * it will be created. + */ +static struct release_heap* get_release_heap(rt_domain_t *rt, lt_t release_time) +{ + struct list_head* pos; + struct release_heap* heap = NULL; + struct release_heap* rh; + unsigned int slot = time2slot(release_time); + int inserted; + + /* initialize pos for the case that the list is empty */ + pos = rt->release_queue.slot[slot].next; + list_for_each(pos, &rt->release_queue.slot[slot]) { + rh = list_entry(pos, struct release_heap, list); + if (release_time == rh->release_time) { + /* perfect match -- this happens on hyperperiod + * boundaries + */ + heap = rh; + break; + } else if (lt_before(release_time, rh->release_time)) { + /* we need to insert a new node since rh is + * already in the future + */ + break; + } + } + if (!heap) { + /* must create new node */ + /* FIXME: use a kmemcache_t */ + rh = kmalloc(sizeof(struct release_heap), GFP_ATOMIC); + if (unlikely(!rh)) + /* Should be handled somehow. + * For now, let's just hope there is + * sufficient memory. + */ + panic("rt_domain: no more memory?"); + rh->release_time = release_time; + heap_init(&rh->heap); + list_add(&rh->list, pos->prev); + inserted = heap_add(heap_earlier_release, + &rt->release_queue.rel_heap, rh, + GFP_ATOMIC); + if (unlikely(!inserted)) + panic("rt_domain: no more heap memory?"); + heap = rh; + } + return heap; +} + +static enum hrtimer_restart on_release_timer(struct hrtimer *timer) +{ + long flags; + rt_domain_t *rt; + struct release_heap* rh; + struct heap tasks; + struct list_head list, *pos, *safe; + lt_t release = 0; + int pending; + int repeat; + enum hrtimer_mode ret = HRTIMER_NORESTART; + + TS_RELEASE_START; + + INIT_LIST_HEAD(&list); + heap_init(&tasks); + + rt = container_of(timer, rt_domain_t, + release_queue.timer); + + do { + list_for_each_safe(pos, safe, &list) { + rh = list_entry(pos, struct release_heap, list); + heap_union(rt->order, &tasks, &rh->heap); + list_del(pos); + kfree(rh); + } + + /* call release callback */ + rt->release_jobs(rt, &tasks); + + + spin_lock_irqsave(&rt->release_lock, flags); + while ((pending = next_release(rt, &release))) { + if (lt_before(release, litmus_clock())) { + /* pick for release */ + rh = heap_take_del(heap_earlier_release, + &rt->release_queue.rel_heap); + list_move(&rh->list, &list); + } else + break; + } + repeat = !list_empty(&list); + if (!repeat) { + /* last iteration, setup timers, etc. 
*/ + if (!pending) { + rt->release_queue.timer_armed = 0; + ret = HRTIMER_NORESTART; + } else { + rt->release_queue.timer_time = release; + timer->expires = ns_to_ktime(release); + ret = HRTIMER_RESTART; + } + } + spin_unlock_irqrestore(&rt->release_lock, flags); + } while (repeat); + + TS_RELEASE_END; + + return ret; +} + +static void arm_release_timer(unsigned long _rt) +{ + rt_domain_t *rt = (rt_domain_t*) _rt; + unsigned long flags; + struct list_head list; + struct list_head *pos, *safe; + struct task_struct* t; + struct release_heap* rh; + int earlier, armed; + lt_t release = 0; + + local_irq_save(flags); + spin_lock(&rt->tobe_lock); + list_replace_init(&rt->tobe_released, &list); + spin_unlock(&rt->tobe_lock); + + /* We only have to defend against the ISR since norq callbacks + * are serialized. + */ + spin_lock(&rt->release_lock); + + list_for_each_safe(pos, safe, &list) { + t = list_entry(pos, struct task_struct, rt_param.list); + sched_trace_task_release(t); + list_del(pos); + rh = get_release_heap(rt, get_release(t)); + heap_add(rt->order, &rh->heap, t, GFP_ATOMIC); + } + + next_release(rt, &release); + armed = rt->release_queue.timer_armed; + earlier = lt_before(release, rt->release_queue.timer_time); + /* We'll do the actual arming in a sec. The ISR doesn't care what these + * flags say, and it'll be true before another instance of this + * function can observe the flag due to the sequential nature of norq + * work. + */ + rt->release_queue.timer_armed = 1; + rt->release_queue.timer_time = release; + spin_unlock(&rt->release_lock); + if (!armed || earlier) { + if (armed) { + /* need to cancel first */ + hrtimer_cancel(&rt->release_queue.timer); + } + hrtimer_start(&rt->release_queue.timer, + ns_to_ktime(release), + HRTIMER_MODE_ABS); + } + local_irq_restore(flags); +} + +void rt_domain_init(rt_domain_t *rt, + heap_prio_t order, + check_resched_needed_t check, + release_jobs_t release + ) +{ + int i; + + BUG_ON(!rt); + if (!check) + check = dummy_resched; + if (!release) + release = default_release_jobs; + if (!order) + order = dummy_order; + + heap_init(&rt->ready_queue); + INIT_LIST_HEAD(&rt->tobe_released); + rt->release_queue.timer_armed = 0; + for (i = 0; i < RELEASE_QUEUE_SLOTS; i++) + INIT_LIST_HEAD(&rt->release_queue.slot[i]); + + hrtimer_init(&rt->release_queue.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + rt->release_queue.timer.function = on_release_timer; +#ifdef CONFIG_HIGH_RES_TIMERS + rt->release_queue.timer.cb_mode = HRTIMER_CB_IRQSAFE; +#endif + + spin_lock_init(&rt->ready_lock); + spin_lock_init(&rt->release_lock); + spin_lock_init(&rt->tobe_lock); + + rt->check_resched = check; + rt->release_jobs = release; + rt->order = order; + init_no_rqlock_work(&rt->arm_timer, arm_release_timer, (unsigned long) rt); +} + +/* add_ready - add a real-time task to the rt ready queue. It must be runnable. + * @new: the newly released task + */ +void __add_ready(rt_domain_t* rt, struct task_struct *new) +{ + TRACE("rt: adding %s/%d (%llu, %llu) rel=%llu to ready queue at %llu\n", + new->comm, new->pid, get_exec_cost(new), get_rt_period(new), + get_release(new), litmus_clock()); + + BUG_ON(heap_node_in_heap(tsk_rt(new)->heap_node)); + + heap_insert(rt->order, &rt->ready_queue, tsk_rt(new)->heap_node); + rt->check_resched(rt); +} + +/* merge_ready - Add a sorted set of tasks to the rt ready queue. They must be runnable. 
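The rt_domain code above defers timer arming to the no-rqlock work facility from litmus/norqlock.c (see init_no_rqlock_work() in rt_domain_init() and the do_without_rqlock() call in __add_release() below). A minimal sketch of that pattern, assuming do_without_rqlock() is the queueing wrapper declared in litmus/norqlock.h; only __do_without_rqlock() is defined in this patch.

static void my_deferred_fn(unsigned long arg)
{
	/* runs from tick_no_rqlock(), i.e. without the runqueue lock held */
}

static struct no_rqlock_work my_work;

static void my_subsystem_init(void)
{
	init_no_rqlock_work(&my_work, my_deferred_fn, 0 /* opaque argument */);
}

static void my_subsystem_poke(void)
{
	/* queue the work; tick_no_rqlock() runs it on this CPU at the next tick */
	do_without_rqlock(&my_work);
}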
+ * @tasks - the newly released tasks + */ +void __merge_ready(rt_domain_t* rt, struct heap* tasks) +{ + heap_union(rt->order, &rt->ready_queue, tasks); + rt->check_resched(rt); +} + +/* add_release - add a real-time task to the rt release queue. + * @task: the sleeping task + */ +void __add_release(rt_domain_t* rt, struct task_struct *task) +{ + TRACE_TASK(task, "add_release(), rel=%llu\n", get_release(task)); + list_add(&tsk_rt(task)->list, &rt->tobe_released); + task->rt_param.domain = rt; + do_without_rqlock(&rt->arm_timer); +} + diff --git a/litmus/sched_cedf.c b/litmus/sched_cedf.c new file mode 100755 index 0000000..6c32e1c --- /dev/null +++ b/litmus/sched_cedf.c @@ -0,0 +1,705 @@ +/* + * kernel/sched_cedf.c + * + * Implementation of the Clustered EDF (C-EDF) scheduling algorithm. + * Linking is included so that support for synchronization (e.g., through + * the implementation of a "CSN-EDF" algorithm) can be added later if desired. + * + * This version uses the simple approach and serializes all scheduling + * decisions by the use of a queue lock. This is probably not the + * best way to do it, but it should suffice for now. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +/* Overview of C-EDF operations. + * + * link_task_to_cpu(T, cpu) - Low-level operation to update the linkage + * structure (NOT the actually scheduled + * task). If there is another linked task To + * already it will set To->linked_on = NO_CPU + * (thereby removing its association with this + * CPU). However, it will not requeue the + * previously linked task (if any). It will set + * T's state to RT_F_RUNNING and check whether + * it is already running somewhere else. If T + * is scheduled somewhere else it will link + * it to that CPU instead (and pull the linked + * task to cpu). T may be NULL. + * + * unlink(T) - Unlink removes T from all scheduler data + * structures. If it is linked to some CPU it + * will link NULL to that CPU. If it is + * currently queued in the cedf queue for + * a partition, it will be removed from + * the rt_domain. It is safe to call + * unlink(T) if T is not linked. T may not + * be NULL. + * + * requeue(T) - Requeue will insert T into the appropriate + * queue. If the system is in real-time mode and + * the T is released already, it will go into the + * ready queue. If the system is not in + * real-time mode is T, then T will go into the + * release queue. If T's release time is in the + * future, it will go into the release + * queue. That means that T's release time/job + * no/etc. has to be updated before requeue(T) is + * called. It is not safe to call requeue(T) + * when T is already queued. T may not be NULL. + * + * cedf_job_arrival(T) - This is the catch-all function when T enters + * the system after either a suspension or at a + * job release. It will queue T (which means it + * is not safe to call cedf_job_arrival(T) if + * T is already queued) and then check whether a + * preemption is necessary. If a preemption is + * necessary it will update the linkage + * accordingly and cause scheduled to be called + * (either with an IPI or need_resched). It is + * safe to call cedf_job_arrival(T) if T's + * next job has not been actually released yet + * (release time in the future). T will be put + * on the release queue in that case. + * + * job_completion(T) - Take care of everything that needs to be done + * to prepare T for its next release and place + * it in the right queue with + * cedf_job_arrival(). 
+ * + * + * When we now that T is linked to CPU then link_task_to_cpu(NULL, CPU) is + * equivalent to unlink(T). Note that if you unlink a task from a CPU none of + * the functions will automatically propagate pending task from the ready queue + * to a linked task. This is the job of the calling function ( by means of + * __take_ready). + */ + +/* cpu_entry_t - maintain the linked and scheduled state + */ +typedef struct { + int cpu; + struct task_struct* linked; /* only RT tasks */ + struct task_struct* scheduled; /* only RT tasks */ + struct list_head list; + atomic_t will_schedule; /* prevent unneeded IPIs */ +} cpu_entry_t; +DEFINE_PER_CPU(cpu_entry_t, cedf_cpu_entries); + +cpu_entry_t* cedf_cpu_entries_array[NR_CPUS]; + +#define set_will_schedule() \ + (atomic_set(&__get_cpu_var(cedf_cpu_entries).will_schedule, 1)) +#define clear_will_schedule() \ + (atomic_set(&__get_cpu_var(cedf_cpu_entries).will_schedule, 0)) +#define test_will_schedule(cpu) \ + (atomic_read(&per_cpu(cedf_cpu_entries, cpu).will_schedule)) + +#define NO_CPU 0xffffffff + +/* Cluster size -- currently four. This is a variable to allow for + * the possibility of changing the cluster size online in the future. + */ +int cluster_size = 4; + +typedef struct { + rt_domain_t domain; + int first_cpu; + int last_cpu; + + /* the cpus queue themselves according to priority in here */ + struct list_head cedf_cpu_queue; + + /* per-partition spinlock: protects the domain and + * serializes scheduling decisions + */ +#define slock domain.ready_lock +} cedf_domain_t; + +DEFINE_PER_CPU(cedf_domain_t*, cedf_domains) = NULL; + +cedf_domain_t* cedf_domains_array[NR_CPUS]; + + +/* These are defined similarly to partitioning, except that a + * tasks partition is any cpu of the cluster to which it + * is assigned, typically the lowest-numbered cpu. + */ +#define local_edf (&__get_cpu_var(cedf_domains)->domain) +#define local_cedf __get_cpu_var(cedf_domains) +#define remote_edf(cpu) (&per_cpu(cedf_domains, cpu)->domain) +#define remote_cedf(cpu) per_cpu(cedf_domains, cpu) +#define task_edf(task) remote_edf(get_partition(task)) +#define task_cedf(task) remote_cedf(get_partition(task)) + +/* update_cpu_position - Move the cpu entry to the correct place to maintain + * order in the cpu queue. Caller must hold cedf lock. + * + * This really should be a heap. + */ +static void update_cpu_position(cpu_entry_t *entry) +{ + cpu_entry_t *other; + struct list_head *cedf_cpu_queue = + &(remote_cedf(entry->cpu))->cedf_cpu_queue; + struct list_head *pos; + + BUG_ON(!cedf_cpu_queue); + + if (likely(in_list(&entry->list))) + list_del(&entry->list); + /* if we do not execute real-time jobs we just move + * to the end of the queue + */ + if (entry->linked) { + list_for_each(pos, cedf_cpu_queue) { + other = list_entry(pos, cpu_entry_t, list); + if (edf_higher_prio(entry->linked, other->linked)) { + __list_add(&entry->list, pos->prev, pos); + return; + } + } + } + /* if we get this far we have the lowest priority job */ + list_add_tail(&entry->list, cedf_cpu_queue); +} + +/* link_task_to_cpu - Update the link of a CPU. + * Handles the case where the to-be-linked task is already + * scheduled on a different CPU. + */ +static noinline void link_task_to_cpu(struct task_struct* linked, + cpu_entry_t *entry) +{ + cpu_entry_t *sched; + struct task_struct* tmp; + int on_cpu; + + BUG_ON(linked && !is_realtime(linked)); + + /* Cannot link task to a CPU that doesn't belong to its partition... 
*/ + BUG_ON(linked && remote_cedf(entry->cpu) != task_cedf(linked)); + + /* Currently linked task is set to be unlinked. */ + if (entry->linked) { + entry->linked->rt_param.linked_on = NO_CPU; + } + + /* Link new task to CPU. */ + if (linked) { + set_rt_flags(linked, RT_F_RUNNING); + /* handle task is already scheduled somewhere! */ + on_cpu = linked->rt_param.scheduled_on; + if (on_cpu != NO_CPU) { + sched = &per_cpu(cedf_cpu_entries, on_cpu); + /* this should only happen if not linked already */ + BUG_ON(sched->linked == linked); + + /* If we are already scheduled on the CPU to which we + * wanted to link, we don't need to do the swap -- + * we just link ourselves to the CPU and depend on + * the caller to get things right. + */ + if (entry != sched) { + tmp = sched->linked; + linked->rt_param.linked_on = sched->cpu; + sched->linked = linked; + update_cpu_position(sched); + linked = tmp; + } + } + if (linked) /* might be NULL due to swap */ + linked->rt_param.linked_on = entry->cpu; + } + entry->linked = linked; + + if (entry->linked) + TRACE_TASK(entry->linked, "linked to CPU %d, state:%d\n", + entry->cpu, entry->linked->state); + else + TRACE("NULL linked to CPU %d\n", entry->cpu); + + update_cpu_position(entry); +} + +/* unlink - Make sure a task is not linked any longer to an entry + * where it was linked before. Must hold cedf_lock. + */ +static noinline void unlink(struct task_struct* t) +{ + cpu_entry_t *entry; + + if (unlikely(!t)) { + TRACE_BUG_ON(!t); + return; + } + + if (t->rt_param.linked_on != NO_CPU) { + /* unlink */ + entry = &per_cpu(cedf_cpu_entries, t->rt_param.linked_on); + t->rt_param.linked_on = NO_CPU; + link_task_to_cpu(NULL, entry); + } else if (is_queued(t)) { + /* This is an interesting situation: t is scheduled, + * but was just recently unlinked. It cannot be + * linked anywhere else (because then it would have + * been relinked to this CPU), thus it must be in some + * queue. We must remove it from the list in this + * case. + */ + remove(task_edf(t), t); + } +} + + +/* preempt - force a CPU to reschedule + */ +static noinline void preempt(cpu_entry_t *entry) +{ + /* We cannot make the is_np() decision here if it is a remote CPU + * because requesting exit_np() requires that we currently use the + * address space of the task. Thus, in the remote case we just send + * the IPI and let schedule() handle the problem. + */ + + if (smp_processor_id() == entry->cpu) { + if (entry->scheduled && is_np(entry->scheduled)) + request_exit_np(entry->scheduled); + else + set_tsk_need_resched(current); + } else + /* in case that it is a remote CPU we have to defer the + * the decision to the remote CPU + * FIXME: We could save a few IPI's here if we leave the flag + * set when we are waiting for a np_exit(). + */ + if (!test_will_schedule(entry->cpu)) + smp_send_reschedule(entry->cpu); +} + +/* requeue - Put an unlinked task into c-edf domain. + * Caller must hold cedf_lock. + */ +static noinline void requeue(struct task_struct* task) +{ + cedf_domain_t* cedf; + rt_domain_t* edf; + + BUG_ON(!task); + /* sanity check rt_list before insertion */ + BUG_ON(is_queued(task)); + + /* Get correct real-time domain. 
*/ + cedf = task_cedf(task); + edf = &cedf->domain; + + if (is_released(task, litmus_clock())) + __add_ready(edf, task); + else { + /* it has got to wait */ + add_release(edf, task); + } +} + +static void check_for_preemptions(cedf_domain_t* cedf) +{ + cpu_entry_t *last; + struct task_struct *task; + struct list_head *cedf_cpu_queue; + cedf_cpu_queue = &cedf->cedf_cpu_queue; + + for(last = list_entry(cedf_cpu_queue->prev, cpu_entry_t, list); + edf_preemption_needed(&cedf->domain, last->linked); + last = list_entry(cedf_cpu_queue->prev, cpu_entry_t, list)) { + /* preemption necessary */ + task = __take_ready(&cedf->domain); + TRACE("check_for_preemptions: task %d linked to %d, state:%d\n", + task->pid, last->cpu, task->state); + if (last->linked) + requeue(last->linked); + link_task_to_cpu(task, last); + preempt(last); + } + +} + +/* cedf_job_arrival: task is either resumed or released */ +static noinline void cedf_job_arrival(struct task_struct* task) +{ + cedf_domain_t* cedf; + rt_domain_t* edf; + + BUG_ON(!task); + + /* Get correct real-time domain. */ + cedf = task_cedf(task); + edf = &cedf->domain; + + /* first queue arriving job */ + requeue(task); + + /* then check for any necessary preemptions */ + check_for_preemptions(cedf); +} + +/* check for current job releases */ +static void cedf_release_jobs(rt_domain_t* rt, struct heap* tasks) +{ + cedf_domain_t* cedf = container_of(rt, cedf_domain_t, domain); + unsigned long flags; + + spin_lock_irqsave(&cedf->slock, flags); + + __merge_ready(&cedf->domain, tasks); + check_for_preemptions(cedf); + spin_unlock_irqrestore(&cedf->slock, flags); +} + +/* cedf_tick - this function is called for every local timer + * interrupt. + * + * checks whether the current task has expired and checks + * whether we need to preempt it if it has not expired + */ +static void cedf_tick(struct task_struct* t) +{ + BUG_ON(!t); + + if (is_realtime(t) && budget_exhausted(t)) { + if (!is_np(t)) { + /* np tasks will be preempted when they become + * preemptable again + */ + set_tsk_need_resched(t); + set_will_schedule(); + TRACE("cedf_scheduler_tick: " + "%d is preemptable (state:%d) " + " => FORCE_RESCHED\n", t->pid, t->state); + } else { + TRACE("cedf_scheduler_tick: " + "%d is non-preemptable (state:%d), " + "preemption delayed.\n", t->pid, t->state); + request_exit_np(t); + } + } +} + +/* caller holds cedf_lock */ +static noinline void job_completion(struct task_struct *t, int forced) +{ + BUG_ON(!t); + + sched_trace_task_completion(t, forced); + + TRACE_TASK(t, "job_completion(). [state:%d]\n", t->state); + + /* set flags */ + set_rt_flags(t, RT_F_SLEEP); + /* prepare for next period */ + prepare_for_next_period(t); + /* unlink */ + unlink(t); + /* requeue + * But don't requeue a blocking task. */ + if (is_running(t)) + cedf_job_arrival(t); +} + +/* Getting schedule() right is a bit tricky. schedule() may not make any + * assumptions on the state of the current task since it may be called for a + * number of reasons. The reasons include a scheduler_tick() determined that it + * was necessary, because sys_exit_np() was called, because some Linux + * subsystem determined so, or even (in the worst case) because there is a bug + * hidden somewhere. Thus, we must take extreme care to determine what the + * current state is. + * + * The CPU could currently be scheduling a task (or not), be linked (or not). 
+ * + * The following assertions for the scheduled task could hold: + * + * - !is_running(scheduled) // the job blocks + * - scheduled->timeslice == 0 // the job completed (forcefully) + * - get_rt_flag() == RT_F_SLEEP // the job completed (by syscall) + * - linked != scheduled // we need to reschedule (for any reason) + * - is_np(scheduled) // rescheduling must be delayed, + * sys_exit_np must be requested + * + * Any of these can occur together. + */ +static struct task_struct* cedf_schedule(struct task_struct * prev) +{ + cedf_domain_t* cedf = local_cedf; + rt_domain_t* edf = &cedf->domain; + cpu_entry_t* entry = &__get_cpu_var(cedf_cpu_entries); + int out_of_time, sleep, preempt, np, + exists, blocks; + struct task_struct* next = NULL; + + BUG_ON(!prev); + BUG_ON(!cedf); + BUG_ON(!edf); + BUG_ON(!entry); + BUG_ON(cedf != remote_cedf(entry->cpu)); + BUG_ON(is_realtime(prev) && cedf != task_cedf(prev)); + + /* Will be released in finish_switch. */ + spin_lock(&cedf->slock); + clear_will_schedule(); + + /* sanity checking */ + BUG_ON(entry->scheduled && entry->scheduled != prev); + BUG_ON(entry->scheduled && !is_realtime(prev)); + BUG_ON(is_realtime(prev) && !entry->scheduled); + + /* (0) Determine state */ + exists = entry->scheduled != NULL; + blocks = exists && !is_running(entry->scheduled); + out_of_time = exists && budget_exhausted(entry->scheduled); + np = exists && is_np(entry->scheduled); + sleep = exists && get_rt_flags(entry->scheduled) == RT_F_SLEEP; + preempt = entry->scheduled != entry->linked; + + /* If a task blocks we have no choice but to reschedule. + */ + if (blocks) + unlink(entry->scheduled); + + /* Request a sys_exit_np() call if we would like to preempt but cannot. + * We need to make sure to update the link structure anyway in case + * that we are still linked. Multiple calls to request_exit_np() don't + * hurt. + */ + if (np && (out_of_time || preempt || sleep)) { + unlink(entry->scheduled); + request_exit_np(entry->scheduled); + } + + /* Any task that is preemptable and either exhausts its execution + * budget or wants to sleep completes. We may have to reschedule after + * this. Don't do a job completion if blocks (can't have timers + * running for blocked jobs). Preemption go first for the same reason. + */ + if (!np && (out_of_time || sleep) && !blocks && !preempt) + job_completion(entry->scheduled, !sleep); + + /* Link pending task if we became unlinked. + */ + if (!entry->linked) + link_task_to_cpu(__take_ready(edf), entry); + + /* The final scheduling decision. Do we need to switch for some reason? + * If linked different from scheduled select linked as next. + */ + if ((!np || blocks) && + entry->linked != entry->scheduled) { + /* Schedule a linked job? */ + if (entry->linked) { + entry->linked->rt_param.scheduled_on = entry->cpu; + next = entry->linked; + } + if (entry->scheduled) { + /* not gonna be scheduled soon */ + entry->scheduled->rt_param.scheduled_on = NO_CPU; + TRACE_TASK(entry->scheduled, "scheduled_on = NO_CPU\n"); + } + } else + /* Only override Linux scheduler if we have real-time task + * scheduled that needs to continue. + */ + if (exists) + next = prev; + + spin_unlock(&cedf->slock); + + return next; +} + +/* _finish_switch - we just finished the switch away from prev + */ +static void cedf_finish_switch(struct task_struct *prev) +{ + cpu_entry_t* entry = &__get_cpu_var(cedf_cpu_entries); + + BUG_ON(!prev); + BUG_ON(!entry); + + entry->scheduled = is_realtime(current) ? 
current : NULL; +} + +/* Prepare a task for running in RT mode + */ +static void cedf_task_new(struct task_struct *t, int on_rq, int running) +{ + unsigned long flags; + cedf_domain_t* cedf = task_cedf(t); + cpu_entry_t* entry; + + BUG_ON(!cedf); + + spin_lock_irqsave(&cedf->slock, flags); + if (running) { + entry = &per_cpu(cedf_cpu_entries, task_cpu(t)); + BUG_ON(!entry); + BUG_ON(entry->scheduled); + entry->scheduled = t; + t->rt_param.scheduled_on = task_cpu(t); + } else + t->rt_param.scheduled_on = NO_CPU; + t->rt_param.linked_on = NO_CPU; + + /* setup job params */ + release_at(t, litmus_clock()); + + cedf_job_arrival(t); + spin_unlock_irqrestore(&cedf->slock, flags); +} + + +static void cedf_task_wake_up(struct task_struct *task) +{ + unsigned long flags; + cedf_domain_t* cedf; + lt_t now; + + BUG_ON(!task); + + cedf = task_cedf(task); + BUG_ON(!cedf); + + spin_lock_irqsave(&cedf->slock, flags); + /* We need to take suspensions because of semaphores into + * account! If a job resumes after being suspended due to acquiring + * a semaphore, it should never be treated as a new job release. + */ + if (get_rt_flags(task) == RT_F_EXIT_SEM) { + set_rt_flags(task, RT_F_RUNNING); + } else { + now = litmus_clock(); + if (is_tardy(task, now)) { + /* new sporadic release */ + release_at(task, now); + sched_trace_task_release(task); + } + else if (task->time_slice) + /* came back in time before deadline + */ + set_rt_flags(task, RT_F_RUNNING); + } + cedf_job_arrival(task); + spin_unlock_irqrestore(&cedf->slock, flags); +} + + +static void cedf_task_block(struct task_struct *t) +{ + unsigned long flags; + + BUG_ON(!t); + + /* unlink if necessary */ + spin_lock_irqsave(&task_cedf(t)->slock, flags); + unlink(t); + spin_unlock_irqrestore(&task_cedf(t)->slock, flags); + + BUG_ON(!is_realtime(t)); +} + +static void cedf_task_exit(struct task_struct * t) +{ + unsigned long flags; + + BUG_ON(!t); + + /* unlink if necessary */ + spin_lock_irqsave(&task_cedf(t)->slock, flags); + unlink(t); + if (tsk_rt(t)->scheduled_on != NO_CPU) { + cedf_cpu_entries_array[tsk_rt(t)->scheduled_on]-> + scheduled = NULL; + tsk_rt(t)->scheduled_on = NO_CPU; + } + spin_unlock_irqrestore(&task_cedf(t)->slock, flags); + + BUG_ON(!is_realtime(t)); + TRACE_TASK(t, "RIP\n"); +} + +static long cedf_admit_task(struct task_struct* tsk) +{ + return (task_cpu(tsk) >= task_cedf(tsk)->first_cpu && + task_cpu(tsk) <= task_cedf(tsk)->last_cpu) ? 0 : -EINVAL; +} + + +/* Plugin object */ +static struct sched_plugin cedf_plugin __cacheline_aligned_in_smp = { + .plugin_name = "C-EDF", + .finish_switch = cedf_finish_switch, + .tick = cedf_tick, + .task_new = cedf_task_new, + .complete_job = complete_job, + .task_exit = cedf_task_exit, + .schedule = cedf_schedule, + .task_wake_up = cedf_task_wake_up, + .task_block = cedf_task_block, + .admit_task = cedf_admit_task +}; + +static void cedf_domain_init(int first_cpu, int last_cpu) +{ + int cpu; + + /* Create new domain for this cluster. */ + cedf_domain_t *new_cedf_domain = kmalloc(sizeof(cedf_domain_t), + GFP_KERNEL); + + /* Initialize cluster domain. */ + edf_domain_init(&new_cedf_domain->domain, NULL, + cedf_release_jobs); + new_cedf_domain->first_cpu = first_cpu; + new_cedf_domain->last_cpu = last_cpu; + INIT_LIST_HEAD(&new_cedf_domain->cedf_cpu_queue); + + /* Assign all cpus in cluster to point to this domain. 
*/ + for (cpu = first_cpu; cpu <= last_cpu; cpu++) { + remote_cedf(cpu) = new_cedf_domain; + cedf_domains_array[cpu] = new_cedf_domain; + } +} + +static int __init init_cedf(void) +{ + int cpu; + cpu_entry_t *entry; + + /* initialize CPU state */ + for (cpu = 0; cpu < NR_CPUS; cpu++) { + entry = &per_cpu(cedf_cpu_entries, cpu); + cedf_cpu_entries_array[cpu] = entry; + atomic_set(&entry->will_schedule, 0); + entry->linked = NULL; + entry->scheduled = NULL; + entry->cpu = cpu; + INIT_LIST_HEAD(&entry->list); + } + + /* initialize all cluster domains */ + for (cpu = 0; cpu < NR_CPUS; cpu += cluster_size) + cedf_domain_init(cpu, cpu+cluster_size-1); + + return register_sched_plugin(&cedf_plugin); +} + +module_init(init_cedf); + diff --git a/litmus/sched_gsn_edf.c b/litmus/sched_gsn_edf.c new file mode 100644 index 0000000..dada868 --- /dev/null +++ b/litmus/sched_gsn_edf.c @@ -0,0 +1,728 @@ +/* + * kernel/sched_gsn_edf.c + * + * Implementation of the GSN-EDF scheduling algorithm. + * + * This version uses the simple approach and serializes all scheduling + * decisions by the use of a queue lock. This is probably not the + * best way to do it, but it should suffice for now. + */ + +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +#include + +/* Overview of GSN-EDF operations. + * + * For a detailed explanation of GSN-EDF have a look at the FMLP paper. This + * description only covers how the individual operations are implemented in + * LITMUS. + * + * link_task_to_cpu(T, cpu) - Low-level operation to update the linkage + * structure (NOT the actually scheduled + * task). If there is another linked task To + * already it will set To->linked_on = NO_CPU + * (thereby removing its association with this + * CPU). However, it will not requeue the + * previously linked task (if any). It will set + * T's state to RT_F_RUNNING and check whether + * it is already running somewhere else. If T + * is scheduled somewhere else it will link + * it to that CPU instead (and pull the linked + * task to cpu). T may be NULL. + * + * unlink(T) - Unlink removes T from all scheduler data + * structures. If it is linked to some CPU it + * will link NULL to that CPU. If it is + * currently queued in the gsnedf queue it will + * be removed from the rt_domain. It is safe to + * call unlink(T) if T is not linked. T may not + * be NULL. + * + * requeue(T) - Requeue will insert T into the appropriate + * queue. If the system is in real-time mode and + * the T is released already, it will go into the + * ready queue. If the system is not in + * real-time mode is T, then T will go into the + * release queue. If T's release time is in the + * future, it will go into the release + * queue. That means that T's release time/job + * no/etc. has to be updated before requeu(T) is + * called. It is not safe to call requeue(T) + * when T is already queued. T may not be NULL. + * + * gsnedf_job_arrival(T) - This is the catch all function when T enters + * the system after either a suspension or at a + * job release. It will queue T (which means it + * is not safe to call gsnedf_job_arrival(T) if + * T is already queued) and then check whether a + * preemption is necessary. If a preemption is + * necessary it will update the linkage + * accordingly and cause scheduled to be called + * (either with an IPI or need_resched). It is + * safe to call gsnedf_job_arrival(T) if T's + * next job has not been actually released yet + * (releast time in the future). 
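Because cedf_admit_task() above only accepts a task that is currently running on a CPU inside its assigned cluster, userspace has to pin the task before admission. A hedged sketch, assuming the default cluster_size of 4 (so CPU 5, for example, belongs to the cluster spanning CPUs 4-7); assigning the partition CPU itself would go through sys_set_rt_task_param(), whose struct rt_task layout lives in include/litmus/rt_param.h and is not shown here.

#define _GNU_SOURCE
#include <sched.h>

/* Pin the calling task to 'cpu' so that cedf_admit_task() accepts it for the
 * cluster containing 'cpu'.  The partition field of its rt_task parameters
 * must name a CPU of the same cluster (set via sys_set_rt_task_param()). */
static int prepare_for_cedf(int cpu)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(cpu, &set);
	return sched_setaffinity(0, sizeof(set), &set);
}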
T will be put + * on the release queue in that case. + * + * job_completion(T) - Take care of everything that needs to be done + * to prepare T for its next release and place + * it in the right queue with + * gsnedf_job_arrival(). + * + * + * When we now that T is linked to CPU then link_task_to_cpu(NULL, CPU) is + * equivalent to unlink(T). Note that if you unlink a task from a CPU none of + * the functions will automatically propagate pending task from the ready queue + * to a linked task. This is the job of the calling function ( by means of + * __take_ready). + */ + + +/* cpu_entry_t - maintain the linked and scheduled state + */ +typedef struct { + int cpu; + struct task_struct* linked; /* only RT tasks */ + struct task_struct* scheduled; /* only RT tasks */ + atomic_t will_schedule; /* prevent unneeded IPIs */ + struct heap_node* hn; +} cpu_entry_t; +DEFINE_PER_CPU(cpu_entry_t, gsnedf_cpu_entries); + +cpu_entry_t* gsnedf_cpus[NR_CPUS]; + +#define set_will_schedule() \ + (atomic_set(&__get_cpu_var(gsnedf_cpu_entries).will_schedule, 1)) +#define clear_will_schedule() \ + (atomic_set(&__get_cpu_var(gsnedf_cpu_entries).will_schedule, 0)) +#define test_will_schedule(cpu) \ + (atomic_read(&per_cpu(gsnedf_cpu_entries, cpu).will_schedule)) + + +#define NO_CPU 0xffffffff + +/* the cpus queue themselves according to priority in here */ +static struct heap_node gsnedf_heap_node[NR_CPUS]; +static struct heap gsnedf_cpu_heap; + +static rt_domain_t gsnedf; +#define gsnedf_lock (gsnedf.ready_lock) + + +static int cpu_lower_prio(struct heap_node *_a, struct heap_node *_b) +{ + cpu_entry_t *a, *b; + a = _a->value; + b = _b->value; + /* Note that a and b are inverted: we want the lowest-priority CPU at + * the top of the heap. + */ + return edf_higher_prio(b->linked, a->linked); +} + +/* update_cpu_position - Move the cpu entry to the correct place to maintain + * order in the cpu queue. Caller must hold gsnedf lock. + */ +static void update_cpu_position(cpu_entry_t *entry) +{ + if (likely(heap_node_in_heap(entry->hn))) + heap_delete(cpu_lower_prio, &gsnedf_cpu_heap, entry->hn); + heap_insert(cpu_lower_prio, &gsnedf_cpu_heap, entry->hn); +} + +/* caller must hold gsnedf lock */ +static cpu_entry_t* lowest_prio_cpu(void) +{ + struct heap_node* hn; + hn = heap_peek(cpu_lower_prio, &gsnedf_cpu_heap); + return hn->value; +} + + +/* link_task_to_cpu - Update the link of a CPU. + * Handles the case where the to-be-linked task is already + * scheduled on a different CPU. + */ +static noinline void link_task_to_cpu(struct task_struct* linked, + cpu_entry_t *entry) +{ + cpu_entry_t *sched; + struct task_struct* tmp; + int on_cpu; + + BUG_ON(linked && !is_realtime(linked)); + + /* Currently linked task is set to be unlinked. */ + if (entry->linked) { + entry->linked->rt_param.linked_on = NO_CPU; + } + + /* Link new task to CPU. */ + if (linked) { + set_rt_flags(linked, RT_F_RUNNING); + /* handle task is already scheduled somewhere! */ + on_cpu = linked->rt_param.scheduled_on; + if (on_cpu != NO_CPU) { + sched = &per_cpu(gsnedf_cpu_entries, on_cpu); + /* this should only happen if not linked already */ + BUG_ON(sched->linked == linked); + + /* If we are already scheduled on the CPU to which we + * wanted to link, we don't need to do the swap -- + * we just link ourselves to the CPU and depend on + * the caller to get things right. 
+ */ + if (entry != sched) { + TRACE_TASK(linked, + "already scheduled on %d, updating link.\n", + sched->cpu); + tmp = sched->linked; + linked->rt_param.linked_on = sched->cpu; + sched->linked = linked; + update_cpu_position(sched); + linked = tmp; + } + } + if (linked) /* might be NULL due to swap */ + linked->rt_param.linked_on = entry->cpu; + } + entry->linked = linked; + if (linked) + TRACE_TASK(linked, "linked to %d.\n", entry->cpu); + else + TRACE("NULL linked to %d.\n", entry->cpu); + update_cpu_position(entry); +} + +/* unlink - Make sure a task is not linked any longer to an entry + * where it was linked before. Must hold gsnedf_lock. + */ +static noinline void unlink(struct task_struct* t) +{ + cpu_entry_t *entry; + + if (unlikely(!t)) { + TRACE_BUG_ON(!t); + return; + } + + if (t->rt_param.linked_on != NO_CPU) { + /* unlink */ + entry = &per_cpu(gsnedf_cpu_entries, t->rt_param.linked_on); + t->rt_param.linked_on = NO_CPU; + link_task_to_cpu(NULL, entry); + } else if (is_queued(t)) { + /* This is an interesting situation: t is scheduled, + * but was just recently unlinked. It cannot be + * linked anywhere else (because then it would have + * been relinked to this CPU), thus it must be in some + * queue. We must remove it from the list in this + * case. + */ + remove(&gsnedf, t); + } +} + + +/* preempt - force a CPU to reschedule + */ +static noinline void preempt(cpu_entry_t *entry) +{ + /* We cannot make the is_np() decision here if it is a remote CPU + * because requesting exit_np() requires that we currently use the + * address space of the task. Thus, in the remote case we just send + * the IPI and let schedule() handle the problem. + */ + + if (smp_processor_id() == entry->cpu) { + if (entry->scheduled && is_np(entry->scheduled)) + request_exit_np(entry->scheduled); + else + set_tsk_need_resched(current); + } else + /* in case that it is a remote CPU we have to defer the + * the decision to the remote CPU + * FIXME: We could save a few IPI's here if we leave the flag + * set when we are waiting for a np_exit(). + */ + if (!test_will_schedule(entry->cpu)) + smp_send_reschedule(entry->cpu); +} + +/* requeue - Put an unlinked task into gsn-edf domain. + * Caller must hold gsnedf_lock. 
+ */ +static noinline void requeue(struct task_struct* task) +{ + BUG_ON(!task); + /* sanity check before insertion */ + BUG_ON(is_queued(task)); + + if (is_released(task, litmus_clock())) + __add_ready(&gsnedf, task); + else { + /* it has got to wait */ + add_release(&gsnedf, task); + } +} + +/* check for any necessary preemptions */ +static void check_for_preemptions(void) +{ + struct task_struct *task; + cpu_entry_t* last; + + for(last = lowest_prio_cpu(); + edf_preemption_needed(&gsnedf, last->linked); + last = lowest_prio_cpu()) { + /* preemption necessary */ + task = __take_ready(&gsnedf); + TRACE("check_for_preemptions: attempting to link task %d to %d\n", + task->pid, last->cpu); + if (last->linked) + requeue(last->linked); + link_task_to_cpu(task, last); + preempt(last); + } +} + +/* gsnedf_job_arrival: task is either resumed or released */ +static noinline void gsnedf_job_arrival(struct task_struct* task) +{ + BUG_ON(!task); + + requeue(task); + check_for_preemptions(); +} + +static void gsnedf_release_jobs(rt_domain_t* rt, struct heap* tasks) +{ + unsigned long flags; + + spin_lock_irqsave(&gsnedf_lock, flags); + + __merge_ready(rt, tasks); + check_for_preemptions(); + + spin_unlock_irqrestore(&gsnedf_lock, flags); +} + +/* caller holds gsnedf_lock */ +static noinline void job_completion(struct task_struct *t, int forced) +{ + BUG_ON(!t); + + sched_trace_task_completion(t, forced); + + TRACE_TASK(t, "job_completion().\n"); + + /* set flags */ + set_rt_flags(t, RT_F_SLEEP); + /* prepare for next period */ + prepare_for_next_period(t); + if (is_released(t, litmus_clock())) + sched_trace_task_release(t); + /* unlink */ + unlink(t); + /* requeue + * But don't requeue a blocking task. */ + if (is_running(t)) + gsnedf_job_arrival(t); +} + +/* gsnedf_tick - this function is called for every local timer + * interrupt. + * + * checks whether the current task has expired and checks + * whether we need to preempt it if it has not expired + */ +static void gsnedf_tick(struct task_struct* t) +{ + if (is_realtime(t) && budget_exhausted(t)) { + if (!is_np(t)) { + /* np tasks will be preempted when they become + * preemptable again + */ + set_tsk_need_resched(t); + set_will_schedule(); + TRACE("gsnedf_scheduler_tick: " + "%d is preemptable " + " => FORCE_RESCHED\n", t->pid); + } else { + TRACE("gsnedf_scheduler_tick: " + "%d is non-preemptable, " + "preemption delayed.\n", t->pid); + request_exit_np(t); + } + } +} + +/* Getting schedule() right is a bit tricky. schedule() may not make any + * assumptions on the state of the current task since it may be called for a + * number of reasons. The reasons include a scheduler_tick() determined that it + * was necessary, because sys_exit_np() was called, because some Linux + * subsystem determined so, or even (in the worst case) because there is a bug + * hidden somewhere. Thus, we must take extreme care to determine what the + * current state is. + * + * The CPU could currently be scheduling a task (or not), be linked (or not). + * + * The following assertions for the scheduled task could hold: + * + * - !is_running(scheduled) // the job blocks + * - scheduled->timeslice == 0 // the job completed (forcefully) + * - get_rt_flag() == RT_F_SLEEP // the job completed (by syscall) + * - linked != scheduled // we need to reschedule (for any reason) + * - is_np(scheduled) // rescheduling must be delayed, + * sys_exit_np must be requested + * + * Any of these can occur together. 
+ */ +static struct task_struct* gsnedf_schedule(struct task_struct * prev) +{ + cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries); + int out_of_time, sleep, preempt, np, exists, blocks; + struct task_struct* next = NULL; + + /* Will be released in finish_switch. */ + spin_lock(&gsnedf_lock); + clear_will_schedule(); + + /* sanity checking */ + BUG_ON(entry->scheduled && entry->scheduled != prev); + BUG_ON(entry->scheduled && !is_realtime(prev)); + BUG_ON(is_realtime(prev) && !entry->scheduled); + + /* (0) Determine state */ + exists = entry->scheduled != NULL; + blocks = exists && !is_running(entry->scheduled); + out_of_time = exists && budget_exhausted(entry->scheduled); + np = exists && is_np(entry->scheduled); + sleep = exists && get_rt_flags(entry->scheduled) == RT_F_SLEEP; + preempt = entry->scheduled != entry->linked; + + TRACE_TASK(prev, "invoked gsnedf_schedule.\n"); + + if (exists) + TRACE_TASK(prev, + "blocks:%d out_of_time:%d np:%d sleep:%d preempt:%d " + "state:%d sig:%d\n", + blocks, out_of_time, np, sleep, preempt, + prev->state, signal_pending(prev)); + if (entry->linked && preempt) + TRACE_TASK(prev, "will be preempted by %s/%d\n", + entry->linked->comm, entry->linked->pid); + + + /* If a task blocks we have no choice but to reschedule. + */ + if (blocks) + unlink(entry->scheduled); + + /* Request a sys_exit_np() call if we would like to preempt but cannot. + * We need to make sure to update the link structure anyway in case + * that we are still linked. Multiple calls to request_exit_np() don't + * hurt. + */ + if (np && (out_of_time || preempt || sleep)) { + unlink(entry->scheduled); + request_exit_np(entry->scheduled); + } + + /* Any task that is preemptable and either exhausts its execution + * budget or wants to sleep completes. We may have to reschedule after + * this. Don't do a job completion if we block (can't have timers running + * for blocked jobs). Preemption go first for the same reason. + */ + if (!np && (out_of_time || sleep) && !blocks && !preempt) + job_completion(entry->scheduled, !sleep); + + /* Link pending task if we became unlinked. + */ + if (!entry->linked) + link_task_to_cpu(__take_ready(&gsnedf), entry); + + /* The final scheduling decision. Do we need to switch for some reason? + * If linked is different from scheduled, then select linked as next. + */ + if ((!np || blocks) && + entry->linked != entry->scheduled) { + /* Schedule a linked job? */ + if (entry->linked) { + entry->linked->rt_param.scheduled_on = entry->cpu; + next = entry->linked; + } + if (entry->scheduled) { + /* not gonna be scheduled soon */ + entry->scheduled->rt_param.scheduled_on = NO_CPU; + TRACE_TASK(entry->scheduled, "scheduled_on = NO_CPU\n"); + } + } else + /* Only override Linux scheduler if we have a real-time task + * scheduled that needs to continue. + */ + if (exists) + next = prev; + + spin_unlock(&gsnedf_lock); + + TRACE("gsnedf_lock released, next=0x%p\n", next); + + + if (next) + TRACE_TASK(next, "scheduled at %llu\n", litmus_clock()); + else if (exists && !next) + TRACE("becomes idle at %llu.\n", litmus_clock()); + + + return next; +} + + +/* _finish_switch - we just finished the switch away from prev + */ +static void gsnedf_finish_switch(struct task_struct *prev) +{ + cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries); + + entry->scheduled = is_realtime(current) ? 
current : NULL; + TRACE_TASK(prev, "switched away from\n"); +} + + +/* Prepare a task for running in RT mode + */ +static void gsnedf_task_new(struct task_struct * t, int on_rq, int running) +{ + unsigned long flags; + cpu_entry_t* entry; + + TRACE("gsn edf: task new %d\n", t->pid); + + spin_lock_irqsave(&gsnedf_lock, flags); + if (running) { + entry = &per_cpu(gsnedf_cpu_entries, task_cpu(t)); + BUG_ON(entry->scheduled); + entry->scheduled = t; + t->rt_param.scheduled_on = task_cpu(t); + } else + t->rt_param.scheduled_on = NO_CPU; + t->rt_param.linked_on = NO_CPU; + + /* setup job params */ + release_at(t, litmus_clock()); + + gsnedf_job_arrival(t); + spin_unlock_irqrestore(&gsnedf_lock, flags); +} + +static void gsnedf_task_wake_up(struct task_struct *task) +{ + unsigned long flags; + lt_t now; + + TRACE_TASK(task, "wake_up at %llu\n", litmus_clock()); + + spin_lock_irqsave(&gsnedf_lock, flags); + /* We need to take suspensions because of semaphores into + * account! If a job resumes after being suspended due to acquiring + * a semaphore, it should never be treated as a new job release. + */ + if (get_rt_flags(task) == RT_F_EXIT_SEM) { + set_rt_flags(task, RT_F_RUNNING); + } else { + now = litmus_clock(); + if (is_tardy(task, now)) { + /* new sporadic release */ + release_at(task, now); + sched_trace_task_release(task); + } + else if (task->time_slice) + /* came back in time before deadline + */ + set_rt_flags(task, RT_F_RUNNING); + } + gsnedf_job_arrival(task); + spin_unlock_irqrestore(&gsnedf_lock, flags); +} + +static void gsnedf_task_block(struct task_struct *t) +{ + unsigned long flags; + + TRACE_TASK(t, "block at %llu\n", litmus_clock()); + + /* unlink if necessary */ + spin_lock_irqsave(&gsnedf_lock, flags); + unlink(t); + spin_unlock_irqrestore(&gsnedf_lock, flags); + + BUG_ON(!is_realtime(t)); +} + + +static void gsnedf_task_exit(struct task_struct * t) +{ + unsigned long flags; + + /* unlink if necessary */ + spin_lock_irqsave(&gsnedf_lock, flags); + unlink(t); + if (tsk_rt(t)->scheduled_on != NO_CPU) { + gsnedf_cpus[tsk_rt(t)->scheduled_on]->scheduled = NULL; + tsk_rt(t)->scheduled_on = NO_CPU; + } + spin_unlock_irqrestore(&gsnedf_lock, flags); + + BUG_ON(!is_realtime(t)); + TRACE_TASK(t, "RIP\n"); +} + +#ifdef CONFIG_FMLP +static long gsnedf_pi_block(struct pi_semaphore *sem, + struct task_struct *new_waiter) +{ + /* This callback has to handle the situation where a new waiter is + * added to the wait queue of the semaphore. + * + * We must check if has a higher priority than the currently + * highest-priority task, and then potentially reschedule. + */ + + BUG_ON(!new_waiter); + + if (edf_higher_prio(new_waiter, sem->hp.task)) { + TRACE_TASK(new_waiter, " boosts priority\n"); + /* called with IRQs disabled */ + spin_lock(&gsnedf_lock); + /* store new highest-priority task */ + sem->hp.task = new_waiter; + if (sem->holder) { + /* let holder inherit */ + sem->holder->rt_param.inh_task = new_waiter; + unlink(sem->holder); + gsnedf_job_arrival(sem->holder); + } + spin_unlock(&gsnedf_lock); + } + + return 0; +} + +static long gsnedf_inherit_priority(struct pi_semaphore *sem, + struct task_struct *new_owner) +{ + /* We don't need to acquire the gsnedf_lock since at the time of this + * call new_owner isn't actually scheduled yet (it's still sleeping) + * and since the calling function already holds sem->wait.lock, which + * prevents concurrent sem->hp.task changes. 
+ */ + + if (sem->hp.task && sem->hp.task != new_owner) { + new_owner->rt_param.inh_task = sem->hp.task; + TRACE_TASK(new_owner, "inherited priority from %s/%d\n", + sem->hp.task->comm, sem->hp.task->pid); + } else + TRACE_TASK(new_owner, + "cannot inherit priority, " + "no higher priority job waits.\n"); + return 0; +} + +/* This function is called on a semaphore release, and assumes that + * the current task is also the semaphore holder. + */ +static long gsnedf_return_priority(struct pi_semaphore *sem) +{ + struct task_struct* t = current; + int ret = 0; + + /* Find new highest-priority semaphore task + * if holder task is the current hp.task. + * + * Calling function holds sem->wait.lock. + */ + if (t == sem->hp.task) + edf_set_hp_task(sem); + + TRACE_CUR("gsnedf_return_priority for lock %p\n", sem); + + if (t->rt_param.inh_task) { + /* interrupts already disabled by PI code */ + spin_lock(&gsnedf_lock); + + /* Reset inh_task to NULL. */ + t->rt_param.inh_task = NULL; + + /* Check if rescheduling is necessary */ + unlink(t); + gsnedf_job_arrival(t); + spin_unlock(&gsnedf_lock); + } + + return ret; +} + +#endif + +static long gsnedf_admit_task(struct task_struct* tsk) +{ + return 0; +} + + +/* Plugin object */ +static struct sched_plugin gsn_edf_plugin __cacheline_aligned_in_smp = { + .plugin_name = "GSN-EDF", + .finish_switch = gsnedf_finish_switch, + .tick = gsnedf_tick, + .task_new = gsnedf_task_new, + .complete_job = complete_job, + .task_exit = gsnedf_task_exit, + .schedule = gsnedf_schedule, + .task_wake_up = gsnedf_task_wake_up, + .task_block = gsnedf_task_block, +#ifdef CONFIG_FMLP + .fmlp_active = 1, + .pi_block = gsnedf_pi_block, + .inherit_priority = gsnedf_inherit_priority, + .return_priority = gsnedf_return_priority, +#endif + .admit_task = gsnedf_admit_task +}; + + +static int __init init_gsn_edf(void) +{ + int cpu; + cpu_entry_t *entry; + + heap_init(&gsnedf_cpu_heap); + /* initialize CPU state */ + for (cpu = 0; cpu < NR_CPUS; cpu++) { + entry = &per_cpu(gsnedf_cpu_entries, cpu); + gsnedf_cpus[cpu] = entry; + atomic_set(&entry->will_schedule, 0); + entry->linked = NULL; + entry->scheduled = NULL; + entry->cpu = cpu; + entry->hn = &gsnedf_heap_node[cpu]; + heap_node_init(&entry->hn, entry); + } + edf_domain_init(&gsnedf, NULL, gsnedf_release_jobs); + return register_sched_plugin(&gsn_edf_plugin); +} + + +module_init(init_gsn_edf); diff --git a/litmus/sched_litmus.c b/litmus/sched_litmus.c new file mode 100644 index 0000000..b4858f8 --- /dev/null +++ b/litmus/sched_litmus.c @@ -0,0 +1,230 @@ +/* This file is included from kernel/sched.c */ + +#include +#include + +static void update_time_litmus(struct rq *rq, struct task_struct *p) +{ + lt_t now = litmus_clock(); + p->rt_param.job_params.exec_time += + now - p->rt_param.job_params.exec_start; + p->rt_param.job_params.exec_start = now; +} + +static void double_rq_lock(struct rq *rq1, struct rq *rq2); +static void double_rq_unlock(struct rq *rq1, struct rq *rq2); + +static void litmus_tick(struct rq *rq, struct task_struct *p) +{ + if (is_realtime(p)) + update_time_litmus(rq, p); + litmus->tick(p); +} + +#define NO_CPU -1 + +static void litmus_schedule(struct rq *rq, struct task_struct *prev) +{ + struct rq* other_rq; + long prev_state; + lt_t _maybe_deadlock = 0; + /* WARNING: rq is _not_ locked! 
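+ * The plugin's schedule() callback below picks the next task; if that
+ * task still belongs to another runqueue, we migrate it here, spinning
+ * until its stack is no longer in use on the remote CPU.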
*/ + if (is_realtime(prev)) + update_time_litmus(rq, prev); + + /* let the plugin schedule */ + rq->litmus_next = litmus->schedule(prev); + + /* check if a global plugin pulled a task from a different RQ */ + if (rq->litmus_next && task_rq(rq->litmus_next) != rq) { + /* we need to migrate the task */ + other_rq = task_rq(rq->litmus_next); + TRACE_TASK(rq->litmus_next, "migrate from %d\n", other_rq->cpu); + + /* while we drop the lock, the prev task could change its + * state + */ + prev_state = prev->state; + mb(); + spin_unlock(&rq->lock); + + /* Don't race with a concurrent switch. + * This could deadlock in the case of cross or circular migrations. + * It's the job of the plugin to make sure that doesn't happen. + */ + TRACE_TASK(rq->litmus_next, "stack_in_use=%d\n", + rq->litmus_next->rt_param.stack_in_use); + if (rq->litmus_next->rt_param.stack_in_use != NO_CPU) { + TRACE_TASK(rq->litmus_next, "waiting to deschedule\n"); + _maybe_deadlock = litmus_clock(); + } + while (rq->litmus_next->rt_param.stack_in_use != NO_CPU) { + cpu_relax(); + mb(); + if (rq->litmus_next->rt_param.stack_in_use == NO_CPU) + TRACE_TASK(rq->litmus_next, + "descheduled. Proceeding.\n"); + if (lt_before(_maybe_deadlock + 10000000, litmus_clock())) { + /* We've been spinning for 10ms. + * Something can't be right! + * Let's abandon the task and bail out; at least + * we will have debug info instead of a hard + * deadlock. + */ + TRACE_TASK(rq->litmus_next, + "stack too long in use. Deadlock?\n"); + rq->litmus_next = NULL; + + /* bail out */ + spin_lock(&rq->lock); + return; + } + } +#ifdef __ARCH_WANT_UNLOCKED_CTXSW + if (rq->litmus_next->oncpu) + TRACE_TASK(rq->litmus_next, "waiting for !oncpu"); + while (rq->litmus_next->oncpu) { + cpu_relax(); + mb(); + } +#endif + double_rq_lock(rq, other_rq); + mb(); + if (prev->state != prev_state && is_realtime(prev)) { + TRACE_TASK(prev, + "state changed while we dropped" + " the lock: now=%d, old=%d\n", + prev->state, prev_state); + if (prev_state && !prev->state) { + /* prev task became unblocked + * we need to simulate normal sequence of events + * to scheduler plugins. + */ + litmus->task_block(prev); + litmus->task_wake_up(prev); + } + } + + set_task_cpu(rq->litmus_next, smp_processor_id()); + + /* DEBUG: now that we have the lock we need to make sure a + * couple of things still hold: + * - it is still a real-time task + * - it is still runnable (could have been stopped) + */ + if (!is_realtime(rq->litmus_next) || + !is_running(rq->litmus_next)) { + /* BAD BAD BAD */ + TRACE_TASK(rq->litmus_next, + "migration invariant FAILED: rt=%d running=%d\n", + is_realtime(rq->litmus_next), + is_running(rq->litmus_next)); + /* drop the task */ + rq->litmus_next = NULL; + } + /* release the other CPU's runqueue, but keep ours */ + spin_unlock(&other_rq->lock); + } + if (rq->litmus_next) + rq->litmus_next->rt_param.stack_in_use = rq->cpu; +} + +static void enqueue_task_litmus(struct rq *rq, struct task_struct *p, int wakeup) +{ + if (wakeup) { + sched_trace_task_resume(p); + litmus->task_wake_up(p); + } else + TRACE_TASK(p, "ignoring an enqueue, not a wake up.\n"); +} + +static void dequeue_task_litmus(struct rq *rq, struct task_struct *p, int sleep) +{ + if (sleep) { + litmus->task_block(p); + sched_trace_task_block(p); + } else + TRACE_TASK(p, "ignoring a dequeue, not going to sleep.\n"); +} + +static void yield_task_litmus(struct rq *rq) +{ + BUG_ON(rq->curr != current); + litmus->complete_job(); +} + +/* Plugins are responsible for this. 
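+ * Preemption checks are carried out by the active plugin itself (e.g.
+ * on job arrival or wake-up), so this callback is intentionally empty.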
+ */ +static void check_preempt_curr_litmus(struct rq *rq, struct task_struct *p) +{ +} + +/* has already been taken care of */ +static void put_prev_task_litmus(struct rq *rq, struct task_struct *p) +{ +} + +static struct task_struct *pick_next_task_litmus(struct rq *rq) +{ + struct task_struct* picked = rq->litmus_next; + rq->litmus_next = NULL; + if (picked) + picked->rt_param.job_params.exec_start = litmus_clock(); + return picked; +} + +static void task_tick_litmus(struct rq *rq, struct task_struct *p) +{ +} + +/* This is called when a task became a real-time task, either due + * to a SCHED_* class transition or due to PI mutex inheritance.\ + * We don't handle Linux PI mutex inheritance yet. Use LITMUS provided + * synchronization primitives instead. + */ +static void set_curr_task_litmus(struct rq *rq) +{ + rq->curr->rt_param.job_params.exec_start = litmus_clock(); +} + + +#ifdef CONFIG_SMP + +/* we don't repartition at runtime */ + +static unsigned long +load_balance_litmus(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned, int *this_best_prio) +{ + return 0; +} + +static int +move_one_task_litmus(struct rq *this_rq, int this_cpu, struct rq *busiest, + struct sched_domain *sd, enum cpu_idle_type idle) +{ + return 0; +} +#endif + +const struct sched_class litmus_sched_class = { + .next = &rt_sched_class, + .enqueue_task = enqueue_task_litmus, + .dequeue_task = dequeue_task_litmus, + .yield_task = yield_task_litmus, + + .check_preempt_curr = check_preempt_curr_litmus, + + .pick_next_task = pick_next_task_litmus, + .put_prev_task = put_prev_task_litmus, + +#ifdef CONFIG_SMP + .load_balance = load_balance_litmus, + .move_one_task = move_one_task_litmus, +#endif + + .set_curr_task = set_curr_task_litmus, + .task_tick = task_tick_litmus, +}; diff --git a/litmus/sched_pfair.c b/litmus/sched_pfair.c new file mode 100755 index 0000000..a733c95 --- /dev/null +++ b/litmus/sched_pfair.c @@ -0,0 +1,895 @@ +/* + * kernel/sched_pfair.c + * + * Implementation of the (global) Pfair scheduling algorithm. + * + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include + +struct subtask { + /* measured in quanta relative to job release */ + quanta_t release; + quanta_t deadline; + quanta_t overlap; /* called "b bit" by PD^2 */ + quanta_t group_deadline; +}; + +struct pfair_param { + quanta_t quanta; /* number of subtasks */ + quanta_t cur; /* index of current subtask */ + + quanta_t release; /* in quanta */ + quanta_t period; /* in quanta */ + + quanta_t last_quantum; /* when scheduled last */ + int last_cpu; /* where scheduled last */ + + unsigned int present; /* Can the task be scheduled? */ + unsigned int sporadic_release; /* On wakeup, new sporadic release? */ + + struct subtask subtasks[0]; /* allocate together with pfair_param */ +}; + +#define tsk_pfair(tsk) ((tsk)->rt_param.pfair) + +struct pfair_state { + int cpu; + volatile quanta_t cur_tick; /* updated by the CPU that is advancing + * the time */ + volatile quanta_t local_tick; /* What tick is the local CPU currently + * executing? Updated only by the local + * CPU. In QEMU, this may lag behind the + * current tick. In a real system, with + * proper timers and aligned quanta, + * that should only be the + * case for a very short time after the + * time advanced. With staggered quanta, + * it will lag for the duration of the + * offset. 
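+ * schedule_next_quantum() compares cur_tick and local_tick
+ * to detect quanta that a CPU failed to acknowledge; such
+ * quanta are counted in missed_quanta.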
+ */ + + struct task_struct* linked; /* the task that should be executing */ + struct task_struct* local; /* the local copy of linked */ + struct task_struct* scheduled; /* what is actually scheduled */ + + unsigned long missed_quanta; + lt_t offset; /* stagger offset */ +}; + +/* Currently, we limit the maximum period of any task to 2000 quanta. + * The reason is that it makes the implementation easier since we do not + * need to reallocate the release wheel on task arrivals. + * In the future + */ +#define PFAIR_MAX_PERIOD 2000 + +/* This is the release queue wheel. It is indexed by pfair_time % + * PFAIR_MAX_PERIOD. Each heap is ordered by PFAIR priority, so that it can be + * merged with the ready queue. + */ +static struct heap release_queue[PFAIR_MAX_PERIOD]; + +DEFINE_PER_CPU(struct pfair_state, pfair_state); +struct pfair_state* pstate[NR_CPUS]; /* short cut */ + +#define NO_CPU 0xffffffff + +static quanta_t pfair_time = 0; /* the "official" PFAIR clock */ +static quanta_t merge_time = 0; /* Updated after the release queue has been + * merged. Used by drop_all_references(). + */ + +static rt_domain_t pfair; + +/* The pfair_lock is used to serialize all scheduling events. + */ +#define pfair_lock pfair.ready_lock + +/* Enable for lots of trace info. + * #define PFAIR_DEBUG + */ + +#ifdef PFAIR_DEBUG +#define PTRACE_TASK(t, f, args...) TRACE_TASK(t, f, ## args) +#define PTRACE(f, args...) TRACE(f, ## args) +#else +#define PTRACE_TASK(t, f, args...) +#define PTRACE(f, args...) +#endif + +/* gcc will inline all of these accessor functions... */ +static struct subtask* cur_subtask(struct task_struct* t) +{ + return tsk_pfair(t)->subtasks + tsk_pfair(t)->cur; +} + +static quanta_t cur_deadline(struct task_struct* t) +{ + return cur_subtask(t)->deadline + tsk_pfair(t)->release; +} + + +static quanta_t cur_sub_release(struct task_struct* t) +{ + return cur_subtask(t)->release + tsk_pfair(t)->release; +} + +static quanta_t cur_release(struct task_struct* t) +{ +#ifdef EARLY_RELEASE + /* only the release of the first subtask counts when we early + * release */ + return tsk_pfair(t)->release; +#else + return cur_sub_release(t); +#endif +} + +static quanta_t cur_overlap(struct task_struct* t) +{ + return cur_subtask(t)->overlap; +} + +static quanta_t cur_group_deadline(struct task_struct* t) +{ + quanta_t gdl = cur_subtask(t)->group_deadline; + if (gdl) + return gdl + tsk_pfair(t)->release; + else + return gdl; +} + +static int is_present(struct task_struct* t) +{ + return t && tsk_pfair(t)->present; +} + +static int pfair_higher_prio(struct task_struct* first, + struct task_struct* second) +{ + return /* first task must exist */ + first && ( + /* Does the second task exist and is it a real-time task? If + * not, the first task (which is a RT task) has higher + * priority. + */ + !second || !is_realtime(second) || + + /* Is the (subtask) deadline of the first task earlier? + * Then it has higher priority. + */ + time_before(cur_deadline(first), cur_deadline(second)) || + + /* Do we have a deadline tie? + * Then break by B-bit. + */ + (cur_deadline(first) == cur_deadline(second) && + (cur_overlap(first) > cur_overlap(second) || + + /* Do we have a B-bit tie? + * Then break by group deadline. + */ + (cur_overlap(first) == cur_overlap(second) && + (time_after(cur_group_deadline(first), + cur_group_deadline(second)) || + + /* Do we have a group deadline tie? + * Then break by PID, which are unique. 
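+ * The resulting order is: earliest subtask deadline first,
+ * then larger b-bit, then later group deadline, then lower PID.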
+ */ + (cur_group_deadline(first) == + cur_group_deadline(second) && + first->pid < second->pid)))))); +} + +int pfair_ready_order(struct heap_node* a, struct heap_node* b) +{ + return pfair_higher_prio(heap2task(a), heap2task(b)); +} + +/* return the proper release queue for time t */ +static struct heap* relq(quanta_t t) +{ + struct heap* rq = &release_queue[t % PFAIR_MAX_PERIOD]; + return rq; +} + +static void prepare_release(struct task_struct* t, quanta_t at) +{ + tsk_pfair(t)->release = at; + tsk_pfair(t)->cur = 0; +} + +static void __pfair_add_release(struct task_struct* t, struct heap* queue) +{ + heap_insert(pfair_ready_order, queue, + tsk_rt(t)->heap_node); +} + +static void pfair_add_release(struct task_struct* t) +{ + BUG_ON(heap_node_in_heap(tsk_rt(t)->heap_node)); + __pfair_add_release(t, relq(cur_release(t))); +} + +/* pull released tasks from the release queue */ +static void poll_releases(quanta_t time) +{ + __merge_ready(&pfair, relq(time)); + merge_time = time; +} + +static void check_preempt(struct task_struct* t) +{ + int cpu = NO_CPU; + if (tsk_rt(t)->linked_on != tsk_rt(t)->scheduled_on && + tsk_pfair(t)->present) { + /* the task can be scheduled and + * is not scheduled where it ought to be scheduled + */ + cpu = tsk_rt(t)->linked_on != NO_CPU ? + tsk_rt(t)->linked_on : + tsk_rt(t)->scheduled_on; + PTRACE_TASK(t, "linked_on:%d, scheduled_on:%d\n", + tsk_rt(t)->linked_on, tsk_rt(t)->scheduled_on); + /* preempt */ + if (cpu == smp_processor_id()) + set_tsk_need_resched(current); + else { + smp_send_reschedule(cpu); + } + } +} + +/* caller must hold pfair_lock */ +static void drop_all_references(struct task_struct *t) +{ + int cpu; + struct pfair_state* s; + struct heap* q; + if (heap_node_in_heap(tsk_rt(t)->heap_node)) { + /* figure out what queue the node is in */ + if (time_before_eq(cur_release(t), merge_time)) + q = &pfair.ready_queue; + else + q = relq(cur_release(t)); + heap_delete(pfair_ready_order, q, + tsk_rt(t)->heap_node); + } + for (cpu = 0; cpu < NR_CPUS; cpu++) { + s = &per_cpu(pfair_state, cpu); + if (s->linked == t) + s->linked = NULL; + if (s->local == t) + s->local = NULL; + if (s->scheduled == t) + s->scheduled = NULL; + } +} + +/* returns 1 if the task needs to go the release queue */ +static int advance_subtask(quanta_t time, struct task_struct* t, int cpu) +{ + struct pfair_param* p = tsk_pfair(t); + int to_relq; + p->cur = (p->cur + 1) % p->quanta; + if (!p->cur) { + sched_trace_task_completion(t, 1); + if (tsk_pfair(t)->present) { + /* we start a new job */ + prepare_for_next_period(t); + sched_trace_task_release(t); + get_rt_flags(t) = RT_F_RUNNING; + p->release += p->period; + } else { + /* remove task from system until it wakes */ + drop_all_references(t); + tsk_pfair(t)->sporadic_release = 1; + TRACE_TASK(t, "on %d advanced to subtask %lu (not present)\n", + cpu, p->cur); + return 0; + } + } + to_relq = time_after(cur_release(t), time); + TRACE_TASK(t, "on %d advanced to subtask %lu -> to_relq=%d\n", + cpu, p->cur, to_relq); + return to_relq; +} + +static void advance_subtasks(quanta_t time) +{ + int cpu, missed; + struct task_struct* l; + struct pfair_param* p; + + for_each_online_cpu(cpu) { + l = pstate[cpu]->linked; + missed = pstate[cpu]->linked != pstate[cpu]->local; + if (l) { + p = tsk_pfair(l); + p->last_quantum = time; + p->last_cpu = cpu; + if (advance_subtask(time, l, cpu)) { + pstate[cpu]->linked = NULL; + pfair_add_release(l); + } + } + } +} + +static int target_cpu(quanta_t time, struct task_struct* t, int default_cpu) +{ + 
int cpu; + if (tsk_rt(t)->scheduled_on != NO_CPU) { + /* always observe scheduled_on linkage */ + default_cpu = tsk_rt(t)->scheduled_on; + } else if (tsk_pfair(t)->last_quantum == time - 1) { + /* back2back quanta */ + /* Only observe last_quantum if no scheduled_on is in the way. + * This should only kick in if a CPU missed quanta, and that + * *should* only happen in QEMU. + */ + cpu = tsk_pfair(t)->last_cpu; + if (!pstate[cpu]->linked || + tsk_rt(pstate[cpu]->linked)->scheduled_on != cpu) { + default_cpu = cpu; + } + } + return default_cpu; +} + +/* returns one if linking was redirected */ +static int pfair_link(quanta_t time, int cpu, + struct task_struct* t) +{ + int target = target_cpu(time, t, cpu); + struct task_struct* prev = pstate[cpu]->linked; + struct task_struct* other; + + if (target != cpu) { + other = pstate[target]->linked; + pstate[target]->linked = t; + tsk_rt(t)->linked_on = target; + if (!other) + /* linked ok, but reschedule this CPU */ + return 1; + if (target < cpu) { + /* link other to cpu instead */ + tsk_rt(other)->linked_on = cpu; + pstate[cpu]->linked = other; + if (prev) { + /* prev got pushed back into the ready queue */ + tsk_rt(prev)->linked_on = NO_CPU; + __add_ready(&pfair, prev); + } + /* we are done with this cpu */ + return 0; + } else { + /* re-add other, it's original CPU was not considered yet */ + tsk_rt(other)->linked_on = NO_CPU; + __add_ready(&pfair, other); + /* reschedule this CPU */ + return 1; + } + } else { + pstate[cpu]->linked = t; + tsk_rt(t)->linked_on = cpu; + if (prev) { + /* prev got pushed back into the ready queue */ + tsk_rt(prev)->linked_on = NO_CPU; + __add_ready(&pfair, prev); + } + /* we are done with this CPU */ + return 0; + } +} + +static void schedule_subtasks(quanta_t time) +{ + int cpu, retry; + + for_each_online_cpu(cpu) { + retry = 1; + while (retry) { + if (pfair_higher_prio(__peek_ready(&pfair), + pstate[cpu]->linked)) + retry = pfair_link(time, cpu, + __take_ready(&pfair)); + else + retry = 0; + } + } +} + +static void schedule_next_quantum(quanta_t time) +{ + int cpu; + + /* called with interrupts disabled */ + PTRACE("--- Q %lu at %llu PRE-SPIN\n", + time, litmus_clock()); + spin_lock(&pfair_lock); + PTRACE("<<< Q %lu at %llu\n", + time, litmus_clock()); + + sched_trace_quantum_boundary(); + + advance_subtasks(time); + poll_releases(time); + schedule_subtasks(time); + + for (cpu = 0; cpu < NR_CPUS; cpu++) + if (pstate[cpu]->linked) + PTRACE_TASK(pstate[cpu]->linked, + " linked on %d.\n", cpu); + else + PTRACE("(null) linked on %d.\n", cpu); + + /* We are done. Advance time. */ + mb(); + for (cpu = 0; cpu < NR_CPUS; cpu++) { + if (pstate[cpu]->local_tick != pstate[cpu]->cur_tick) { + TRACE("BAD Quantum not acked on %d " + "(l:%lu c:%lu p:%lu)\n", + cpu, + pstate[cpu]->local_tick, + pstate[cpu]->cur_tick, + pfair_time); + pstate[cpu]->missed_quanta++; + } + pstate[cpu]->cur_tick = time; + } + PTRACE(">>> Q %lu at %llu\n", + time, litmus_clock()); + spin_unlock(&pfair_lock); +} + +static noinline void wait_for_quantum(quanta_t q, struct pfair_state* state) +{ + quanta_t loc; + + goto first; /* skip mb() on first iteration */ + do { + cpu_relax(); + mb(); + first: loc = state->cur_tick; + /* FIXME: what if loc > cur? 
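+ * Note that time_before() is wrap-safe, so an observed cur_tick at or
+ * past q simply terminates the spin.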
*/ + } while (time_before(loc, q)); + PTRACE("observed cur_tick:%lu >= q:%lu\n", + loc, q); +} + +static quanta_t current_quantum(struct pfair_state* state) +{ + lt_t t = litmus_clock() - state->offset; + return time2quanta(t, FLOOR); +} + +static void catchup_quanta(quanta_t from, quanta_t target, + struct pfair_state* state) +{ + quanta_t cur = from, time; + TRACE("+++< BAD catching up quanta from %lu to %lu\n", + from, target); + while (time_before(cur, target)) { + wait_for_quantum(cur, state); + cur++; + time = cmpxchg(&pfair_time, + cur - 1, /* expected */ + cur /* next */ + ); + if (time == cur - 1) + schedule_next_quantum(cur); + } + TRACE("+++> catching up done\n"); +} + +/* pfair_tick - this function is called for every local timer + * interrupt. + */ +static void pfair_tick(struct task_struct* t) +{ + struct pfair_state* state = &__get_cpu_var(pfair_state); + quanta_t time, cur; + int retry = 10; + + do { + cur = current_quantum(state); + PTRACE("q %lu at %llu\n", cur, litmus_clock()); + + /* Attempt to advance time. First CPU to get here + * will prepare the next quantum. + */ + time = cmpxchg(&pfair_time, + cur - 1, /* expected */ + cur /* next */ + ); + if (time == cur - 1) { + /* exchange succeeded */ + wait_for_quantum(cur - 1, state); + schedule_next_quantum(cur); + retry = 0; + } else if (time_before(time, cur - 1)) { + /* the whole system missed a tick !? */ + catchup_quanta(time, cur, state); + retry--; + } else if (time_after(time, cur)) { + /* our timer lagging behind!? */ + TRACE("BAD pfair_time:%lu > cur:%lu\n", time, cur); + retry--; + } else { + /* Some other CPU already started scheduling + * this quantum. Let it do its job and then update. + */ + retry = 0; + } + } while (retry); + + /* Spin locally until time advances. */ + wait_for_quantum(cur, state); + + /* copy assignment */ + /* FIXME: what if we race with a future update? Corrupted state? 
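+ * state->linked is only written while pfair_lock is held (e.g. in
+ * schedule_next_quantum()); it is read here without the lock.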
*/ + state->local = state->linked; + /* signal that we are done */ + mb(); + state->local_tick = state->cur_tick; + + if (state->local != current + && (is_realtime(current) || is_present(state->local))) + set_tsk_need_resched(current); +} + +static int safe_to_schedule(struct task_struct* t, int cpu) +{ + int where = tsk_rt(t)->scheduled_on; + if (where != NO_CPU && where != cpu) { + TRACE_TASK(t, "BAD: can't be scheduled on %d, " + "scheduled already on %d.\n", cpu, where); + return 0; + } else + return tsk_pfair(t)->present && get_rt_flags(t) == RT_F_RUNNING; +} + +static struct task_struct* pfair_schedule(struct task_struct * prev) +{ + struct pfair_state* state = &__get_cpu_var(pfair_state); + int blocks; + struct task_struct* next = NULL; + + spin_lock(&pfair_lock); + + blocks = is_realtime(prev) && !is_running(prev); + + if (blocks) + tsk_pfair(prev)->present = 0; + + if (state->local && safe_to_schedule(state->local, state->cpu)) + next = state->local; + + if (prev != next) { + tsk_rt(prev)->scheduled_on = NO_CPU; + if (next) + tsk_rt(next)->scheduled_on = state->cpu; + } + + spin_unlock(&pfair_lock); + + if (next) + TRACE_TASK(next, "scheduled rel=%lu at %lu (%llu)\n", + tsk_pfair(next)->release, pfair_time, litmus_clock()); + else if (is_realtime(prev)) + TRACE("Becomes idle at %lu (%llu)\n", pfair_time, litmus_clock()); + + return next; +} + +static void pfair_task_new(struct task_struct * t, int on_rq, int running) +{ + unsigned long flags; + + TRACE("pfair: task new %d state:%d\n", t->pid, t->state); + + spin_lock_irqsave(&pfair_lock, flags); + if (running) + t->rt_param.scheduled_on = task_cpu(t); + else + t->rt_param.scheduled_on = NO_CPU; + + prepare_release(t, pfair_time + 1); + tsk_pfair(t)->present = running; + tsk_pfair(t)->sporadic_release = 0; + pfair_add_release(t); + check_preempt(t); + + spin_unlock_irqrestore(&pfair_lock, flags); +} + +static void pfair_task_wake_up(struct task_struct *t) +{ + unsigned long flags; + lt_t now; + + TRACE_TASK(t, "wakes at %llu, release=%lu, pfair_time:%lu\n", + litmus_clock(), cur_release(t), pfair_time); + + spin_lock_irqsave(&pfair_lock, flags); + + tsk_pfair(t)->present = 1; + + /* It is a little unclear how to deal with Pfair + * tasks that block for a while and then wake. For now, + * if a task blocks and wakes before its next job release, + * then it may resume if it is currently linked somewhere + * (as if it never blocked at all). Otherwise, we have a + * new sporadic job release. + */ + if (tsk_pfair(t)->sporadic_release) { + now = litmus_clock(); + release_at(t, now); + prepare_release(t, time2quanta(now, CEIL)); + sched_trace_task_release(t); + /* FIXME: race with pfair_time advancing */ + pfair_add_release(t); + tsk_pfair(t)->sporadic_release = 0; + } + + check_preempt(t); + + spin_unlock_irqrestore(&pfair_lock, flags); + TRACE_TASK(t, "wake up done at %llu\n", litmus_clock()); +} + +static void pfair_task_block(struct task_struct *t) +{ + BUG_ON(!is_realtime(t)); + TRACE_TASK(t, "blocks at %llu, state:%d\n", + litmus_clock(), t->state); +} + +static void pfair_task_exit(struct task_struct * t) +{ + unsigned long flags; + + BUG_ON(!is_realtime(t)); + + /* Remote task from release or ready queue, and ensure + * that it is not the scheduled task for ANY CPU. We + * do this blanket check because occassionally when + * tasks exit while blocked, the task_cpu of the task + * might not be the same as the CPU that the PFAIR scheduler + * has chosen for it. 
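+ * drop_all_references() below removes the task from the ready and
+ * release heaps and clears any linked/local/scheduled pointers that
+ * still refer to it.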
+ */ + spin_lock_irqsave(&pfair_lock, flags); + + TRACE_TASK(t, "RIP, state:%d\n", t->state); + drop_all_references(t); + + spin_unlock_irqrestore(&pfair_lock, flags); + + kfree(t->rt_param.pfair); + t->rt_param.pfair = NULL; +} + + +static void pfair_release_at(struct task_struct* task, lt_t start) +{ + unsigned long flags; + quanta_t release; + + BUG_ON(!is_realtime(task)); + + spin_lock_irqsave(&pfair_lock, flags); + release_at(task, start); + release = time2quanta(start, CEIL); + + if (release - pfair_time >= PFAIR_MAX_PERIOD) + release = pfair_time + PFAIR_MAX_PERIOD; + + TRACE_TASK(task, "sys release at %lu\n", release); + + drop_all_references(task); + prepare_release(task, release); + pfair_add_release(task); + + /* Clear sporadic release flag, since this release subsumes any + * sporadic release on wake. + */ + tsk_pfair(task)->sporadic_release = 0; + + spin_unlock_irqrestore(&pfair_lock, flags); +} + +static void init_subtask(struct subtask* sub, unsigned long i, + lt_t quanta, lt_t period) +{ + /* since i is zero-based, the formulas are shifted by one */ + lt_t tmp; + + /* release */ + tmp = period * i; + do_div(tmp, quanta); /* floor */ + sub->release = (quanta_t) tmp; + + /* deadline */ + tmp = period * (i + 1); + if (do_div(tmp, quanta)) /* ceil */ + tmp++; + sub->deadline = (quanta_t) tmp; + + /* next release */ + tmp = period * (i + 1); + do_div(tmp, quanta); /* floor */ + sub->overlap = sub->deadline - (quanta_t) tmp; + + /* Group deadline. + * Based on the formula given in Uma's thesis. + */ + if (2 * quanta >= period) { + /* heavy */ + tmp = (sub->deadline - (i + 1)) * period; + if (period > quanta && + do_div(tmp, (period - quanta))) /* ceil */ + tmp++; + sub->group_deadline = (quanta_t) tmp; + } else + sub->group_deadline = 0; +} + +static void dump_subtasks(struct task_struct* t) +{ + unsigned long i; + for (i = 0; i < t->rt_param.pfair->quanta; i++) + TRACE_TASK(t, "SUBTASK %lu: rel=%lu dl=%lu bbit:%lu gdl:%lu\n", + i + 1, + t->rt_param.pfair->subtasks[i].release, + t->rt_param.pfair->subtasks[i].deadline, + t->rt_param.pfair->subtasks[i].overlap, + t->rt_param.pfair->subtasks[i].group_deadline); +} + +static long pfair_admit_task(struct task_struct* t) +{ + lt_t quanta; + lt_t period; + s64 quantum_length = ktime_to_ns(tick_period); + struct pfair_param* param; + unsigned long i; + + /* Pfair is a tick-based method, so the time + * of interest is jiffies. Calculate tick-based + * times for everything. + * (Ceiling of exec cost, floor of period.) + */ + + quanta = get_exec_cost(t); + period = get_rt_period(t); + + quanta = time2quanta(get_exec_cost(t), CEIL); + + if (do_div(period, quantum_length)) + printk(KERN_WARNING + "The period of %s/%d is not a multiple of %llu.\n", + t->comm, t->pid, (unsigned long long) quantum_length); + + if (period >= PFAIR_MAX_PERIOD) { + printk(KERN_WARNING + "PFAIR: Rejecting task %s/%d; its period is too long.\n", + t->comm, t->pid); + return -EINVAL; + } + + if (quanta == period) { + /* special case: task has weight 1.0 */ + printk(KERN_INFO + "Admitting weight 1.0 task. 
(%s/%d, %llu, %llu).\n", + t->comm, t->pid, quanta, period); + quanta = 1; + period = 1; + } + + param = kmalloc(sizeof(struct pfair_param) + + quanta * sizeof(struct subtask), GFP_ATOMIC); + + if (!param) + return -ENOMEM; + + param->quanta = quanta; + param->cur = 0; + param->release = 0; + param->period = period; + + for (i = 0; i < quanta; i++) + init_subtask(param->subtasks + i, i, quanta, period); + + if (t->rt_param.pfair) + /* get rid of stale allocation */ + kfree(t->rt_param.pfair); + + t->rt_param.pfair = param; + + /* spew out some debug info */ + dump_subtasks(t); + + return 0; +} + +static long pfair_activate_plugin(void) +{ + int cpu; + struct pfair_state* state; + + state = &__get_cpu_var(pfair_state); + pfair_time = current_quantum(state); + + TRACE("Activating PFAIR at q=%lu\n", pfair_time); + + for (cpu = 0; cpu < NR_CPUS; cpu++) { + state = &per_cpu(pfair_state, cpu); + state->cur_tick = pfair_time; + state->local_tick = pfair_time; + state->missed_quanta = 0; + state->offset = cpu_stagger_offset(cpu); + } + + return 0; +} + +/* Plugin object */ +static struct sched_plugin pfair_plugin __cacheline_aligned_in_smp = { + .plugin_name = "PFAIR", + .tick = pfair_tick, + .task_new = pfair_task_new, + .task_exit = pfair_task_exit, + .schedule = pfair_schedule, + .task_wake_up = pfair_task_wake_up, + .task_block = pfair_task_block, + .admit_task = pfair_admit_task, + .release_at = pfair_release_at, + .complete_job = complete_job, + .activate_plugin = pfair_activate_plugin, +}; + +static int __init init_pfair(void) +{ + int cpu, i; + struct pfair_state *state; + + /* initialize release queue */ + for (i = 0; i < PFAIR_MAX_PERIOD; i++) + heap_init(&release_queue[i]); + + /* initialize CPU state */ + for (cpu = 0; cpu < NR_CPUS; cpu++) { + state = &per_cpu(pfair_state, cpu); + state->cpu = cpu; + state->cur_tick = 0; + state->local_tick = 0; + state->linked = NULL; + state->local = NULL; + state->scheduled = NULL; + state->missed_quanta = 0; + state->offset = cpu_stagger_offset(cpu); + pstate[cpu] = state; + } + + rt_domain_init(&pfair, pfair_ready_order, NULL, NULL); + return register_sched_plugin(&pfair_plugin); +} + +module_init(init_pfair); + diff --git a/litmus/sched_plugin.c b/litmus/sched_plugin.c new file mode 100644 index 0000000..0be091e --- /dev/null +++ b/litmus/sched_plugin.c @@ -0,0 +1,199 @@ +/* sched_plugin.c -- core infrastructure for the scheduler plugin system + * + * This file includes the initialization of the plugin system, the no-op Linux + * scheduler plugin and some dummy functions. 
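+ * Plugins register themselves via register_sched_plugin(), which
+ * substitutes the dummies defined here for any callbacks left NULL.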
+ */ + +#include +#include + +#include +#include + +#include + +/************************************************************* + * Dummy plugin functions * + *************************************************************/ + +static void litmus_dummy_finish_switch(struct task_struct * prev) +{ +} + +static struct task_struct* litmus_dummy_schedule(struct task_struct * prev) +{ + return NULL; +} + +static void litmus_dummy_tick(struct task_struct* tsk) +{ +} + +static long litmus_dummy_admit_task(struct task_struct* tsk) +{ + printk(KERN_CRIT "LITMUS^RT: Linux plugin rejects %s/%d.\n", + tsk->comm, tsk->pid); + return -EINVAL; +} + +static void litmus_dummy_task_new(struct task_struct *t, int on_rq, int running) +{ +} + +static void litmus_dummy_task_wake_up(struct task_struct *task) +{ +} + +static void litmus_dummy_task_block(struct task_struct *task) +{ +} + +static void litmus_dummy_task_exit(struct task_struct *task) +{ +} + +static long litmus_dummy_complete_job(void) +{ + return -ENOSYS; +} + +static long litmus_dummy_activate_plugin(void) +{ + return 0; +} + +static long litmus_dummy_deactivate_plugin(void) +{ + return 0; +} + +#ifdef CONFIG_FMLP + +static long litmus_dummy_inherit_priority(struct pi_semaphore *sem, + struct task_struct *new_owner) +{ + return -ENOSYS; +} + +static long litmus_dummy_return_priority(struct pi_semaphore *sem) +{ + return -ENOSYS; +} + +static long litmus_dummy_pi_block(struct pi_semaphore *sem, + struct task_struct *new_waiter) +{ + return -ENOSYS; +} + +#endif + + +/* The default scheduler plugin. It doesn't do anything and lets Linux do its + * job. + */ +struct sched_plugin linux_sched_plugin = { + .plugin_name = "Linux", + .tick = litmus_dummy_tick, + .task_new = litmus_dummy_task_new, + .task_exit = litmus_dummy_task_exit, + .task_wake_up = litmus_dummy_task_wake_up, + .task_block = litmus_dummy_task_block, + .complete_job = litmus_dummy_complete_job, + .schedule = litmus_dummy_schedule, + .finish_switch = litmus_dummy_finish_switch, + .activate_plugin = litmus_dummy_activate_plugin, + .deactivate_plugin = litmus_dummy_deactivate_plugin, +#ifdef CONFIG_FMLP + .inherit_priority = litmus_dummy_inherit_priority, + .return_priority = litmus_dummy_return_priority, + .pi_block = litmus_dummy_pi_block, +#endif + .admit_task = litmus_dummy_admit_task +}; + +/* + * The reference to current plugin that is used to schedule tasks within + * the system. 
It stores references to actual function implementations + * Should be initialized by calling "init_***_plugin()" + */ +struct sched_plugin *litmus = &linux_sched_plugin; + +/* the list of registered scheduling plugins */ +static LIST_HEAD(sched_plugins); +static DEFINE_SPINLOCK(sched_plugins_lock); + +#define CHECK(func) {\ + if (!plugin->func) \ + plugin->func = litmus_dummy_ ## func;} + +/* FIXME: get reference to module */ +int register_sched_plugin(struct sched_plugin* plugin) +{ + printk(KERN_INFO "Registering LITMUS^RT plugin %s.\n", + plugin->plugin_name); + + /* make sure we don't trip over null pointers later */ + CHECK(finish_switch); + CHECK(schedule); + CHECK(tick); + CHECK(task_wake_up); + CHECK(task_exit); + CHECK(task_block); + CHECK(task_new); + CHECK(complete_job); + CHECK(activate_plugin); + CHECK(deactivate_plugin); +#ifdef CONFIG_FMLP + CHECK(inherit_priority); + CHECK(return_priority); + CHECK(pi_block); +#endif + CHECK(admit_task); + + if (!plugin->release_at) + plugin->release_at = release_at; + + spin_lock(&sched_plugins_lock); + list_add(&plugin->list, &sched_plugins); + spin_unlock(&sched_plugins_lock); + + return 0; +} + + +/* FIXME: reference counting, etc. */ +struct sched_plugin* find_sched_plugin(const char* name) +{ + struct list_head *pos; + struct sched_plugin *plugin; + + spin_lock(&sched_plugins_lock); + list_for_each(pos, &sched_plugins) { + plugin = list_entry(pos, struct sched_plugin, list); + if (!strcmp(plugin->plugin_name, name)) + goto out_unlock; + } + plugin = NULL; + +out_unlock: + spin_unlock(&sched_plugins_lock); + return plugin; +} + +int print_sched_plugins(char* buf, int max) +{ + int count = 0; + struct list_head *pos; + struct sched_plugin *plugin; + + spin_lock(&sched_plugins_lock); + list_for_each(pos, &sched_plugins) { + plugin = list_entry(pos, struct sched_plugin, list); + count += snprintf(buf + count, max - count, "%s\n", plugin->plugin_name); + if (max - count <= 0) + break; + } + spin_unlock(&sched_plugins_lock); + return count; +} diff --git a/litmus/sched_psn_edf.c b/litmus/sched_psn_edf.c new file mode 100644 index 0000000..9a2bdfc --- /dev/null +++ b/litmus/sched_psn_edf.c @@ -0,0 +1,454 @@ + +/* + * kernel/sched_psn_edf.c + * + * Implementation of the PSN-EDF scheduler plugin. + * Based on kern/sched_part_edf.c and kern/sched_gsn_edf.c. + * + * Suspensions and non-preemptable sections are supported. + * Priority inheritance is not supported. 
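+ * With CONFIG_FMLP, semaphore holders are made non-preemptable via
+ * make_np() while they hold the lock; see psnedf_inherit_priority()
+ * and psnedf_return_priority().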
+ */ + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + + +typedef struct { + rt_domain_t domain; + int cpu; + struct task_struct* scheduled; /* only RT tasks */ + +/* scheduling lock + */ +#define slock domain.ready_lock +/* protects the domain and + * serializes scheduling decisions + */ +} psnedf_domain_t; + +DEFINE_PER_CPU(psnedf_domain_t, psnedf_domains); + +#define local_edf (&__get_cpu_var(psnedf_domains).domain) +#define local_pedf (&__get_cpu_var(psnedf_domains)) +#define remote_edf(cpu) (&per_cpu(psnedf_domains, cpu).domain) +#define remote_pedf(cpu) (&per_cpu(psnedf_domains, cpu)) +#define task_edf(task) remote_edf(get_partition(task)) +#define task_pedf(task) remote_pedf(get_partition(task)) + + +static void psnedf_domain_init(psnedf_domain_t* pedf, + check_resched_needed_t check, + release_jobs_t release, + int cpu) +{ + edf_domain_init(&pedf->domain, check, release); + pedf->cpu = cpu; + pedf->scheduled = NULL; +} + +static void requeue(struct task_struct* t, rt_domain_t *edf) +{ + if (t->state != TASK_RUNNING) + TRACE_TASK(t, "requeue: !TASK_RUNNING\n"); + + set_rt_flags(t, RT_F_RUNNING); + if (is_released(t, litmus_clock())) + __add_ready(edf, t); + else + add_release(edf, t); /* it has got to wait */ +} + +/* we assume the lock is being held */ +static void preempt(psnedf_domain_t *pedf) +{ + if (smp_processor_id() == pedf->cpu) { + if (pedf->scheduled && is_np(pedf->scheduled)) + request_exit_np(pedf->scheduled); + else + set_tsk_need_resched(current); + } else + /* in case that it is a remote CPU we have to defer the + * the decision to the remote CPU + */ + smp_send_reschedule(pedf->cpu); +} + +/* This check is trivial in partioned systems as we only have to consider + * the CPU of the partition. + */ +static int psnedf_check_resched(rt_domain_t *edf) +{ + psnedf_domain_t *pedf = container_of(edf, psnedf_domain_t, domain); + int ret = 0; + + /* because this is a callback from rt_domain_t we already hold + * the necessary lock for the ready queue + */ + if (edf_preemption_needed(edf, pedf->scheduled)) { + preempt(pedf); + ret = 1; + } + return ret; +} + +static void psnedf_tick(struct task_struct *t) +{ + psnedf_domain_t *pedf = local_pedf; + + /* Check for inconsistency. 
We don't need the lock for this since + * ->scheduled is only changed in schedule, which obviously is not + * executing in parallel on this CPU + */ + BUG_ON(is_realtime(t) && t != pedf->scheduled); + + if (is_realtime(t) && budget_exhausted(t)) { + if (!is_np(t)) + set_tsk_need_resched(t); + else { + TRACE("psnedf_scheduler_tick: " + "%d is non-preemptable, " + "preemption delayed.\n", t->pid); + request_exit_np(t); + } + } +} + +static void job_completion(struct task_struct* t) +{ + TRACE_TASK(t, "job_completion().\n"); + set_rt_flags(t, RT_F_SLEEP); + prepare_for_next_period(t); +} + +static struct task_struct* psnedf_schedule(struct task_struct * prev) +{ + psnedf_domain_t* pedf = local_pedf; + rt_domain_t* edf = &pedf->domain; + struct task_struct* next; + + int out_of_time, sleep, preempt, + np, exists, blocks, resched; + + spin_lock(&pedf->slock); + + /* sanity checking */ + BUG_ON(pedf->scheduled && pedf->scheduled != prev); + BUG_ON(pedf->scheduled && !is_realtime(prev)); + + /* (0) Determine state */ + exists = pedf->scheduled != NULL; + blocks = exists && !is_running(pedf->scheduled); + out_of_time = exists && budget_exhausted(pedf->scheduled); + np = exists && is_np(pedf->scheduled); + sleep = exists && get_rt_flags(pedf->scheduled) == RT_F_SLEEP; + preempt = edf_preemption_needed(edf, prev); + + /* If we need to preempt do so. + * The following checks set resched to 1 in case of special + * circumstances. + */ + resched = preempt; + + /* If a task blocks we have no choice but to reschedule. + */ + if (blocks) + resched = 1; + + /* Request a sys_exit_np() call if we would like to preempt but cannot. + * Multiple calls to request_exit_np() don't hurt. + */ + if (np && (out_of_time || preempt || sleep)) + request_exit_np(pedf->scheduled); + + /* Any task that is preemptable and either exhausts its execution + * budget or wants to sleep completes. We may have to reschedule after + * this. + */ + if (!np && (out_of_time || sleep) && !blocks) { + job_completion(pedf->scheduled); + resched = 1; + } + + /* The final scheduling decision. Do we need to switch for some reason? + * Switch if we are in RT mode and have no task or if we need to + * resched. + */ + next = NULL; + if ((!np || blocks) && (resched || !exists)) { + /* Take care of a previously scheduled + * job by taking it out of the Linux runqueue. + */ + if (pedf->scheduled && !blocks) + requeue(pedf->scheduled, edf); + next = __take_ready(edf); + } else + /* Only override Linux scheduler if we have a real-time task + * scheduled that needs to continue. + */ + if (exists) + next = prev; + + if (next) { + TRACE_TASK(next, " == next\n"); + set_rt_flags(next, RT_F_RUNNING); + } else { + TRACE("becoming idle.\n"); + } + + pedf->scheduled = next; + spin_unlock(&pedf->slock); + + return next; +} + + +/* Prepare a task for running in RT mode + */ +static void psnedf_task_new(struct task_struct * t, int on_rq, int running) +{ + rt_domain_t* edf = task_edf(t); + psnedf_domain_t* pedf = task_pedf(t); + unsigned long flags; + + TRACE_TASK(t, "new\n"); + + /* setup job parameters */ + release_at(t, litmus_clock()); + + /* The task should be running in the queue, otherwise signal + * code will try to wake it up with fatal consequences. 
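+ * If the task is already running we only record it as scheduled;
+ * otherwise it is queued in the EDF domain and a preemption check is
+ * triggered.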
+ */ + spin_lock_irqsave(&pedf->slock, flags); + if (running) { + /* there shouldn't be anything else running at the time */ + BUG_ON(pedf->scheduled); + pedf->scheduled = t; + } else { + requeue(t, edf); + /* maybe we have to reschedule */ + preempt(pedf); + } + spin_unlock_irqrestore(&pedf->slock, flags); +} + +static void psnedf_task_wake_up(struct task_struct *task) +{ + unsigned long flags; + psnedf_domain_t* pedf = task_pedf(task); + rt_domain_t* edf = task_edf(task); + lt_t now; + + TRACE_TASK(task, "wake up\n"); + spin_lock_irqsave(&pedf->slock, flags); + BUG_ON(is_queued(task)); + /* We need to take suspensions because of semaphores into + * account! If a job resumes after being suspended due to acquiring + * a semaphore, it should never be treated as a new job release. + * + * FIXME: This should be done in some more predictable and userspace-controlled way. + */ + now = litmus_clock(); + if (is_tardy(task, now) && + get_rt_flags(task) != RT_F_EXIT_SEM) { + /* new sporadic release */ + release_at(task, now); + sched_trace_task_release(task); + } + requeue(task, edf); + spin_unlock_irqrestore(&pedf->slock, flags); + TRACE_TASK(task, "wake up done\n"); +} + +static void psnedf_task_block(struct task_struct *t) +{ + /* only running tasks can block, thus t is in no queue */ + TRACE_TASK(t, "block, state=%d\n", t->state); + BUG_ON(!is_realtime(t)); + BUG_ON(is_queued(t)); +} + +static void psnedf_task_exit(struct task_struct * t) +{ + unsigned long flags; + psnedf_domain_t* pedf = task_pedf(t); + rt_domain_t* edf; + + spin_lock_irqsave(&pedf->slock, flags); + if (is_queued(t)) { + /* dequeue */ + edf = task_edf(t); + remove(edf, t); + } + if (pedf->scheduled == t) + pedf->scheduled = NULL; + preempt(pedf); + spin_unlock_irqrestore(&pedf->slock, flags); +} + +#ifdef CONFIG_FMLP +static long psnedf_pi_block(struct pi_semaphore *sem, + struct task_struct *new_waiter) +{ + psnedf_domain_t* pedf; + rt_domain_t* edf; + struct task_struct* t; + int cpu = get_partition(new_waiter); + + BUG_ON(!new_waiter); + + if (edf_higher_prio(new_waiter, sem->hp.cpu_task[cpu])) { + TRACE_TASK(new_waiter, " boosts priority\n"); + pedf = task_pedf(new_waiter); + edf = task_edf(new_waiter); + + /* interrupts already disabled */ + spin_lock(&pedf->slock); + + /* store new highest-priority task */ + sem->hp.cpu_task[cpu] = new_waiter; + if (sem->holder && + get_partition(sem->holder) == get_partition(new_waiter)) { + /* let holder inherit */ + sem->holder->rt_param.inh_task = new_waiter; + t = sem->holder; + if (is_queued(t)) { + /* queued in domain*/ + remove(edf, t); + /* readd to make priority change take place */ + /* FIXME: this looks outdated */ + if (is_released(t, litmus_clock())) + __add_ready(edf, t); + else + add_release(edf, t); + } + } + + /* check if we need to reschedule */ + if (edf_preemption_needed(edf, current)) + preempt(pedf); + + spin_unlock(&pedf->slock); + } + + return 0; +} + +static long psnedf_inherit_priority(struct pi_semaphore *sem, + struct task_struct *new_owner) +{ + int cpu = get_partition(new_owner); + + new_owner->rt_param.inh_task = sem->hp.cpu_task[cpu]; + if (sem->hp.cpu_task[cpu] && new_owner != sem->hp.cpu_task[cpu]) { + TRACE_TASK(new_owner, + "inherited priority from %s/%d\n", + sem->hp.cpu_task[cpu]->comm, + sem->hp.cpu_task[cpu]->pid); + } else + TRACE_TASK(new_owner, + "cannot inherit priority: " + "no higher priority job waits on this CPU!\n"); + /* make new owner non-preemptable as required by FMLP under + * PSN-EDF. 
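+ * The matching take_np() is performed in psnedf_return_priority()
+ * when the semaphore is released.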
+ */ + make_np(new_owner); + return 0; +} + + +/* This function is called on a semaphore release, and assumes that + * the current task is also the semaphore holder. + */ +static long psnedf_return_priority(struct pi_semaphore *sem) +{ + struct task_struct* t = current; + psnedf_domain_t* pedf = task_pedf(t); + rt_domain_t* edf = task_edf(t); + int ret = 0; + int cpu = get_partition(current); + + + /* Find new highest-priority semaphore task + * if holder task is the current hp.cpu_task[cpu]. + * + * Calling function holds sem->wait.lock. + */ + if (t == sem->hp.cpu_task[cpu]) + edf_set_hp_cpu_task(sem, cpu); + + take_np(t); + if (current->rt_param.inh_task) { + TRACE_CUR("return priority of %s/%d\n", + current->rt_param.inh_task->comm, + current->rt_param.inh_task->pid); + spin_lock(&pedf->slock); + + /* Reset inh_task to NULL. */ + current->rt_param.inh_task = NULL; + + /* check if we need to reschedule */ + if (edf_preemption_needed(edf, current)) + preempt(pedf); + + spin_unlock(&pedf->slock); + } else + TRACE_CUR(" no priority to return %p\n", sem); + + return ret; +} + +#endif + +static long psnedf_admit_task(struct task_struct* tsk) +{ + return task_cpu(tsk) == tsk->rt_param.task_params.cpu ? 0 : -EINVAL; +} + +/* Plugin object */ +static struct sched_plugin psn_edf_plugin __cacheline_aligned_in_smp = { + .plugin_name = "PSN-EDF", +#ifdef CONFIG_SRP + .srp_active = 1, +#endif + .tick = psnedf_tick, + .task_new = psnedf_task_new, + .complete_job = complete_job, + .task_exit = psnedf_task_exit, + .schedule = psnedf_schedule, + .task_wake_up = psnedf_task_wake_up, + .task_block = psnedf_task_block, +#ifdef CONFIG_FMLP + .fmlp_active = 1, + .pi_block = psnedf_pi_block, + .inherit_priority = psnedf_inherit_priority, + .return_priority = psnedf_return_priority, +#endif + .admit_task = psnedf_admit_task +}; + + +static int __init init_psn_edf(void) +{ + int i; + + for (i = 0; i < NR_CPUS; i++) + { + psnedf_domain_init(remote_pedf(i), + psnedf_check_resched, + NULL, i); + } + return register_sched_plugin(&psn_edf_plugin); +} + + + +module_init(init_psn_edf); diff --git a/litmus/sched_task_trace.c b/litmus/sched_task_trace.c new file mode 100644 index 0000000..913d999 --- /dev/null +++ b/litmus/sched_task_trace.c @@ -0,0 +1,192 @@ +/* sched_task_trace.c -- record scheduling events to a byte stream + * + */ + +#include +#include +#include + +#include +#include + +#include +#include +#include + +#define FT_TASK_TRACE_MAJOR 253 +#define NO_EVENTS 4096 /* this is a buffer of 12 4k pages per CPU */ + +#define now() litmus_clock() + +struct local_buffer { + struct st_event_record record[NO_EVENTS]; + char flag[NO_EVENTS]; + struct ft_buffer ftbuf; +}; + +DEFINE_PER_CPU(struct local_buffer, st_event_buffer); + +static struct ftdev st_dev; + +static int st_dev_can_open(struct ftdev *dev, unsigned int cpu) +{ + return cpu_online(cpu) ? 
0 : -ENODEV; +} + +static int __init init_sched_task_trace(void) +{ + struct local_buffer* buf; + int i, ok = 0; + ftdev_init(&st_dev, THIS_MODULE); + for (i = 0; i < NR_CPUS; i++) { + buf = &per_cpu(st_event_buffer, i); + ok += init_ft_buffer(&buf->ftbuf, NO_EVENTS, + sizeof(struct st_event_record), + buf->flag, + buf->record); + st_dev.minor[i].buf = &buf->ftbuf; + } + if (ok == NR_CPUS) { + st_dev.minor_cnt = NR_CPUS; + st_dev.can_open = st_dev_can_open; + return register_ftdev(&st_dev, "sched_trace", FT_TASK_TRACE_MAJOR); + } else + return -EINVAL; +} + +module_init(init_sched_task_trace); + + +static inline struct st_event_record* get_record(u8 type, struct task_struct* t) +{ + struct st_event_record* rec; + struct local_buffer* buf; + + buf = &get_cpu_var(st_event_buffer); + if (ft_buffer_start_write(&buf->ftbuf, (void**) &rec)) { + rec->hdr.type = type; + rec->hdr.cpu = smp_processor_id(); + rec->hdr.pid = t ? t->pid : 0; + rec->hdr.job = t ? t->rt_param.job_params.job_no : 0; + } else + put_cpu_var(st_event_buffer); + /* rec will be NULL if it failed */ + return rec; +} + +static inline void put_record(struct st_event_record* rec) +{ + struct local_buffer* buf; + buf = &__get_cpu_var(st_event_buffer); + ft_buffer_finish_write(&buf->ftbuf, rec); + put_cpu_var(st_event_buffer); +} + +feather_callback void do_sched_trace_task_name(unsigned long id, unsigned long _task) +{ + struct task_struct *t = (struct task_struct*) _task; + struct st_event_record* rec = get_record(ST_NAME, t); + int i; + if (rec) { + for (i = 0; i < min(TASK_COMM_LEN, ST_NAME_LEN); i++) + rec->data.name.cmd[i] = t->comm[i]; + put_record(rec); + } +} + +feather_callback void do_sched_trace_task_param(unsigned long id, unsigned long _task) +{ + struct task_struct *t = (struct task_struct*) _task; + struct st_event_record* rec = get_record(ST_PARAM, t); + if (rec) { + rec->data.param.wcet = get_exec_cost(t); + rec->data.param.period = get_rt_period(t); + rec->data.param.phase = get_rt_phase(t); + rec->data.param.partition = get_partition(t); + put_record(rec); + } +} + +feather_callback void do_sched_trace_task_release(unsigned long id, unsigned long _task) +{ + struct task_struct *t = (struct task_struct*) _task; + struct st_event_record* rec = get_record(ST_RELEASE, t); + if (rec) { + rec->data.release.release = get_release(t); + rec->data.release.deadline = get_deadline(t); + put_record(rec); + } +} + +/* skipped: st_assigned_data, we don't use it atm */ + +feather_callback void do_sched_trace_task_switch_to(unsigned long id, unsigned long _task) +{ + struct task_struct *t = (struct task_struct*) _task; + struct st_event_record* rec; + if (is_realtime(t)) { + rec = get_record(ST_SWITCH_TO, t); + if (rec) { + rec->data.switch_to.when = now(); + rec->data.switch_to.exec_time = get_exec_time(t); + put_record(rec); + } + } +} + +feather_callback void do_sched_trace_task_switch_away(unsigned long id, unsigned long _task) +{ + struct task_struct *t = (struct task_struct*) _task; + struct st_event_record* rec; + if (is_realtime(t)) { + rec = get_record(ST_SWITCH_AWAY, t); + if (rec) { + rec->data.switch_away.when = now(); + rec->data.switch_away.exec_time = get_exec_time(t); + put_record(rec); + } + } +} + +feather_callback void do_sched_trace_task_completion(unsigned long id, unsigned long _task, + unsigned long forced) +{ + struct task_struct *t = (struct task_struct*) _task; + struct st_event_record* rec = get_record(ST_COMPLETION, t); + if (rec) { + rec->data.completion.when = now(); + rec->data.completion.forced = 
forced; + put_record(rec); + } +} + +feather_callback void do_sched_trace_task_block(unsigned long id, unsigned long _task) +{ + struct task_struct *t = (struct task_struct*) _task; + struct st_event_record* rec = get_record(ST_BLOCK, t); + if (rec) { + rec->data.block.when = now(); + put_record(rec); + } +} + +feather_callback void do_sched_trace_task_resume(unsigned long id, unsigned long _task) +{ + struct task_struct *t = (struct task_struct*) _task; + struct st_event_record* rec = get_record(ST_RESUME, t); + if (rec) { + rec->data.resume.when = now(); + put_record(rec); + } +} + +feather_callback void do_sched_trace_sys_release(unsigned long id, unsigned long _start) +{ + lt_t *start = (lt_t*) _start; + struct st_event_record* rec = get_record(ST_SYS_RELEASE, NULL); + if (rec) { + rec->data.sys_release.when = now(); + rec->data.sys_release.release = *start; + put_record(rec); + } +} diff --git a/litmus/sched_trace.c b/litmus/sched_trace.c new file mode 100644 index 0000000..527a58b --- /dev/null +++ b/litmus/sched_trace.c @@ -0,0 +1,462 @@ +/* sched_trace.c -- record scheduling events to a byte stream. + * + * TODO: Move ring buffer to a lockfree implementation. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +typedef struct { + /* guard read and write pointers */ + spinlock_t lock; + /* guard against concurrent freeing of buffer */ + rwlock_t del_lock; + + /* memory allocated for ring buffer */ + unsigned long order; + char* buf; + char* end; + + /* Read/write pointer. May not cross. + * They point to the position of next write and + * last read. + */ + char* writep; + char* readp; + +} ring_buffer_t; + +#define EMPTY_RING_BUFFER { \ + .lock = SPIN_LOCK_UNLOCKED, \ + .del_lock = RW_LOCK_UNLOCKED, \ + .buf = NULL, \ + .end = NULL, \ + .writep = NULL, \ + .readp = NULL \ +} + +void rb_init(ring_buffer_t* buf) +{ + *buf = (ring_buffer_t) EMPTY_RING_BUFFER; +} + +int rb_alloc_buf(ring_buffer_t* buf, unsigned long order) +{ + unsigned long flags; + int error = 0; + char *mem; + + /* do memory allocation while not atomic */ + mem = (char *) __get_free_pages(GFP_KERNEL, order); + if (!mem) + return -ENOMEM; + write_lock_irqsave(&buf->del_lock, flags); + BUG_ON(buf->buf); + buf->buf = mem; + buf->end = buf->buf + PAGE_SIZE * (1 << order) - 1; + memset(buf->buf, 0xff, buf->end - buf->buf); + buf->order = order; + buf->writep = buf->buf + 1; + buf->readp = buf->buf; + write_unlock_irqrestore(&buf->del_lock, flags); + return error; +} + +int rb_free_buf(ring_buffer_t* buf) +{ + unsigned long flags; + int error = 0; + write_lock_irqsave(&buf->del_lock, flags); + BUG_ON(!buf->buf); + free_pages((unsigned long) buf->buf, buf->order); + buf->buf = NULL; + buf->end = NULL; + buf->writep = NULL; + buf->readp = NULL; + write_unlock_irqrestore(&buf->del_lock, flags); + return error; +} + +/* Assumption: concurrent writes are serialized externally + * + * Will only succeed if there is enough space for all len bytes. 
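+ * On success the write pointer is advanced; if the message does not
+ * fit, -ENOMEM is returned and the buffer is left unchanged.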
+ */ +int rb_put(ring_buffer_t* buf, char* mem, size_t len) +{ + unsigned long flags; + char* r , *w; + int error = 0; + read_lock_irqsave(&buf->del_lock, flags); + if (!buf->buf) { + error = -ENODEV; + goto out; + } + spin_lock(&buf->lock); + r = buf->readp; + w = buf->writep; + spin_unlock(&buf->lock); + if (r < w && buf->end - w >= len - 1) { + /* easy case: there is enough space in the buffer + * to write it in one continous chunk*/ + memcpy(w, mem, len); + w += len; + if (w > buf->end) + /* special case: fit exactly into buffer + * w is now buf->end + 1 + */ + w = buf->buf; + } else if (w < r && r - w >= len) { /* >= len because may not cross */ + /* we are constrained by the read pointer but we there + * is enough space + */ + memcpy(w, mem, len); + w += len; + } else if (r <= w && buf->end - w < len - 1) { + /* the wrap around case: there may or may not be space */ + if ((buf->end - w) + (r - buf->buf) >= len - 1) { + /* copy chunk that fits at the end */ + memcpy(w, mem, buf->end - w + 1); + mem += buf->end - w + 1; + len -= (buf->end - w + 1); + w = buf->buf; + /* copy the rest */ + memcpy(w, mem, len); + w += len; + } + else + error = -ENOMEM; + } else { + error = -ENOMEM; + } + if (!error) { + spin_lock(&buf->lock); + buf->writep = w; + spin_unlock(&buf->lock); + } + out: + read_unlock_irqrestore(&buf->del_lock, flags); + return error; +} + +/* Assumption: concurrent reads are serialized externally */ +int rb_get(ring_buffer_t* buf, char* mem, size_t len) +{ + unsigned long flags; + char* r , *w; + int error = 0; + read_lock_irqsave(&buf->del_lock, flags); + if (!buf->buf) { + error = -ENODEV; + goto out; + } + spin_lock(&buf->lock); + r = buf->readp; + w = buf->writep; + spin_unlock(&buf->lock); + + if (w <= r && buf->end - r >= len) { + /* easy case: there is enough data in the buffer + * to get it in one chunk*/ + memcpy(mem, r + 1, len); + r += len; + error = len; + + } else if (r + 1 < w && w - r - 1 >= len) { + /* we are constrained by the write pointer but + * there is enough data + */ + memcpy(mem, r + 1, len); + r += len; + error = len; + + } else if (r + 1 < w && w - r - 1 < len) { + /* we are constrained by the write pointer and there + * there is not enough data + */ + memcpy(mem, r + 1, w - r - 1); + error = w - r - 1; + r += w - r - 1; + + } else if (w <= r && buf->end - r < len) { + /* the wrap around case: there may or may not be enough data + * first let's get what is available + */ + memcpy(mem, r + 1, buf->end - r); + error += (buf->end - r); + mem += (buf->end - r); + len -= (buf->end - r); + r += (buf->end - r); + + if (w > buf->buf) { + /* there is more to get */ + r = buf->buf - 1; + if (w - r >= len) { + /* plenty */ + memcpy(mem, r + 1, len); + error += len; + r += len; + } else { + memcpy(mem, r + 1, w - r - 1); + error += w - r - 1; + r += w - r - 1; + } + } + } /* nothing available */ + + if (error > 0) { + spin_lock(&buf->lock); + buf->readp = r; + spin_unlock(&buf->lock); + } + out: + read_unlock_irqrestore(&buf->del_lock, flags); + return error; +} + + + +/******************************************************************************/ +/* DEVICE FILE DRIVER */ +/******************************************************************************/ + + + +/* Allocate a buffer of about 1 MB per CPU. + * + */ +#define BUFFER_ORDER 8 + +typedef struct { + ring_buffer_t buf; + atomic_t reader_cnt; + struct semaphore reader_mutex; +} trace_buffer_t; + + +/* This does not initialize the semaphore!! 
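+ * init_log_buffer() must be called (it is, from init_sched_trace())
+ * to set up reader_mutex before the buffer is used.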
+ */
+
+#define EMPTY_TRACE_BUFFER \
+	{ .buf = EMPTY_RING_BUFFER, .reader_cnt = ATOMIC_INIT(0)}
+
+static spinlock_t log_buffer_lock = SPIN_LOCK_UNLOCKED;
+static trace_buffer_t log_buffer = EMPTY_TRACE_BUFFER;
+
+static void init_log_buffer(void)
+{
+	/* only initialize the mutex, the rest was initialized as part
+	 * of the static initialization macro
+	 */
+	init_MUTEX(&log_buffer.reader_mutex);
+}
+
+static ssize_t log_read(struct file *filp, char __user *to, size_t len,
+			loff_t *f_pos)
+{
+	/* we ignore f_pos, this is strictly sequential */
+
+	ssize_t error = -EINVAL;
+	char* mem;
+	trace_buffer_t *buf = filp->private_data;
+
+	if (down_interruptible(&buf->reader_mutex)) {
+		error = -ERESTARTSYS;
+		goto out;
+	}
+
+	if (len > 64 * 1024)
+		len = 64 * 1024;
+	mem = kmalloc(len, GFP_KERNEL);
+	if (!mem) {
+		error = -ENOMEM;
+		goto out_unlock;
+	}
+
+	error = rb_get(&buf->buf, mem, len);
+	while (!error) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(110);
+		if (signal_pending(current))
+			error = -ERESTARTSYS;
+		else
+			error = rb_get(&buf->buf, mem, len);
+	}
+
+	if (error > 0 && copy_to_user(to, mem, error))
+		error = -EFAULT;
+
+	kfree(mem);
+ out_unlock:
+	up(&buf->reader_mutex);
+ out:
+	return error;
+}
+
+
+
+extern int trace_override;
+
+/* log_open - open the global log message ring buffer.
+ */
+static int log_open(struct inode *in, struct file *filp)
+{
+	int error = -EINVAL;
+	trace_buffer_t* buf;
+
+	buf = &log_buffer;
+
+	if (down_interruptible(&buf->reader_mutex)) {
+		error = -ERESTARTSYS;
+		goto out;
+	}
+
+	/* first open must allocate buffers */
+	if (atomic_inc_return(&buf->reader_cnt) == 1) {
+		if ((error = rb_alloc_buf(&buf->buf, BUFFER_ORDER)))
+		{
+			atomic_dec(&buf->reader_cnt);
+			goto out_unlock;
+		}
+	}
+
+	error = 0;
+	filp->private_data = buf;
+	printk(KERN_DEBUG "sched_trace buf: from 0x%p to 0x%p length: %x\n",
+	       buf->buf.buf, buf->buf.end, buf->buf.end - buf->buf.buf);
+	trace_override++;
+ out_unlock:
+	up(&buf->reader_mutex);
+ out:
+	return error;
+}
+
+static int log_release(struct inode *in, struct file *filp)
+{
+	int error = -EINVAL;
+	trace_buffer_t* buf = filp->private_data;
+
+	BUG_ON(!filp->private_data);
+
+	if (down_interruptible(&buf->reader_mutex)) {
+		error = -ERESTARTSYS;
+		goto out;
+	}
+
+	/* last release must deallocate buffers */
+	if (atomic_dec_return(&buf->reader_cnt) == 0) {
+		error = rb_free_buf(&buf->buf);
+	}
+
+	trace_override--;
+	up(&buf->reader_mutex);
+ out:
+	return error;
+}
+
+/******************************************************************************/
+/*                            Device Registration                             */
+/******************************************************************************/
+
+/* the major numbers are from the unassigned/local use block
+ *
+ * This should be converted to dynamic allocation at some point...
+ */
+#define LOG_MAJOR 251
+
+/* log_fops  - The file operations for accessing the global LITMUS log message
+ *             buffer.
+ *
+ * Except for opening the device file, it uses the same operations as trace_fops.
+ */
+struct file_operations log_fops = {
+	.owner   = THIS_MODULE,
+	.open    = log_open,
+	.release = log_release,
+	.read    = log_read,
+};
+
+static int __init register_buffer_dev(const char* name,
+				      struct file_operations* fops,
+				      int major, int count)
+{
+	dev_t trace_dev;
+	struct cdev *cdev;
+	int error = 0;
+
+	trace_dev = MKDEV(major, 0);
+	error = register_chrdev_region(trace_dev, count, name);
+	if (error)
+	{
+		printk(KERN_WARNING "sched trace: "
+		       "Could not register major/minor number %d\n", major);
+		return error;
+	}
+	cdev = cdev_alloc();
+	if (!cdev) {
+		printk(KERN_WARNING "sched trace: "
+		       "Could not get a cdev for %s.\n", name);
+		return -ENOMEM;
+	}
+	cdev->owner = THIS_MODULE;
+	cdev->ops = fops;
+	error = cdev_add(cdev, trace_dev, count);
+	if (error) {
+		printk(KERN_WARNING "sched trace: "
+		       "add_cdev failed for %s.\n", name);
+		return -ENOMEM;
+	}
+	return error;
+
+}
+
+static int __init init_sched_trace(void)
+{
+	printk("Initializing TRACE() device\n");
+	init_log_buffer();
+
+	return register_buffer_dev("litmus_log", &log_fops,
+				   LOG_MAJOR, 1);
+}
+
+module_init(init_sched_trace);
+
+#define MSG_SIZE 255
+static DEFINE_PER_CPU(char[MSG_SIZE], fmt_buffer);
+
+/* sched_trace_log_message - This is the only function that accesses the
+ *                           log buffer inside the kernel for writing.
+ *                           Concurrent access to it is serialized via the
+ *                           log_buffer_lock.
+ *
+ *                           The maximum length of a formatted message is 255.
+ */
+void sched_trace_log_message(const char* fmt, ...)
+{
+	unsigned long flags;
+	va_list args;
+	size_t len;
+	char* buf;
+
+	va_start(args, fmt);
+	local_irq_save(flags);
+
+	/* format message */
+	buf = __get_cpu_var(fmt_buffer);
+	len = vscnprintf(buf, MSG_SIZE, fmt, args);
+
+	spin_lock(&log_buffer_lock);
+	/* Don't copy the trailing null byte, we don't want null bytes
+	 * in a text file.
+	 */
+	rb_put(&log_buffer.buf, buf, len);
+	spin_unlock(&log_buffer_lock);
+
+	local_irq_restore(flags);
+	va_end(args);
+}
diff --git a/litmus/srp.c b/litmus/srp.c
new file mode 100644
index 0000000..71639b9
--- /dev/null
+++ b/litmus/srp.c
@@ -0,0 +1,318 @@
+/* ************************************************************************** */
+/*                          STACK RESOURCE POLICY                             */
+/* ************************************************************************** */
+
+#include
+#include
+#include
+#include
+
+#include
+
+#include
+
+
+#ifdef CONFIG_SRP
+
+struct srp_priority {
+	struct list_head	list;
+	unsigned int		period;
+	pid_t			pid;
+};
+
+#define list2prio(l) list_entry(l, struct srp_priority, list)
+
+/* SRP task priority comparison function. A smaller period means higher
+ * priority; ties are broken by PID. Special case: period == 0 <=> no priority.
+ */
+static int srp_higher_prio(struct srp_priority* first,
+			   struct srp_priority* second)
+{
+	if (!first->period)
+		return 0;
+	else
+		return !second->period ||
+			first->period < second->period || (
+			first->period == second->period &&
+			first->pid < second->pid);
+}
+
+struct srp {
+	struct list_head	ceiling;
+	wait_queue_head_t	ceiling_blocked;
+};
+
+
+atomic_t srp_objects_in_use = ATOMIC_INIT(0);
+
+DEFINE_PER_CPU(struct srp, srp);
+
+
+/* Initialize SRP semaphores at boot time.
*/ +static int __init srp_init(void) +{ + int i; + + printk("Initializing SRP per-CPU ceilings..."); + for (i = 0; i < NR_CPUS; i++) { + init_waitqueue_head(&per_cpu(srp, i).ceiling_blocked); + INIT_LIST_HEAD(&per_cpu(srp, i).ceiling); + } + printk(" done!\n"); + + return 0; +} +module_init(srp_init); + + +#define system_ceiling(srp) list2prio(srp->ceiling.next) + + +#define UNDEF_SEM -2 + + +/* struct for uniprocessor SRP "semaphore" */ +struct srp_semaphore { + struct srp_priority ceiling; + struct task_struct* owner; + int cpu; /* cpu associated with this "semaphore" and resource */ +}; + +#define ceiling2sem(c) container_of(c, struct srp_semaphore, ceiling) + +static int srp_exceeds_ceiling(struct task_struct* first, + struct srp* srp) +{ + return list_empty(&srp->ceiling) || + get_rt_period(first) < system_ceiling(srp)->period || + (get_rt_period(first) == system_ceiling(srp)->period && + first->pid < system_ceiling(srp)->pid) || + ceiling2sem(system_ceiling(srp))->owner == first; +} + +static void srp_add_prio(struct srp* srp, struct srp_priority* prio) +{ + struct list_head *pos; + if (in_list(&prio->list)) { + printk(KERN_CRIT "WARNING: SRP violation detected, prio is already in " + "ceiling list! cpu=%d, srp=%p\n", smp_processor_id(), ceiling2sem(prio)); + return; + } + list_for_each(pos, &srp->ceiling) + if (unlikely(srp_higher_prio(prio, list2prio(pos)))) { + __list_add(&prio->list, pos->prev, pos); + return; + } + + list_add_tail(&prio->list, &srp->ceiling); +} + + +static void* create_srp_semaphore(void) +{ + struct srp_semaphore* sem; + + sem = kmalloc(sizeof(*sem), GFP_KERNEL); + if (!sem) + return NULL; + + INIT_LIST_HEAD(&sem->ceiling.list); + sem->ceiling.period = 0; + sem->cpu = UNDEF_SEM; + sem->owner = NULL; + atomic_inc(&srp_objects_in_use); + return sem; +} + +static noinline int open_srp_semaphore(struct od_table_entry* entry, void* __user arg) +{ + struct srp_semaphore* sem = (struct srp_semaphore*) entry->obj->obj; + int ret = 0; + struct task_struct* t = current; + struct srp_priority t_prio; + + TRACE("opening SRP semaphore %p, cpu=%d\n", sem, sem->cpu); + if (!srp_active()) + return -EBUSY; + + if (sem->cpu == UNDEF_SEM) + sem->cpu = get_partition(t); + else if (sem->cpu != get_partition(t)) + ret = -EPERM; + + if (ret == 0) { + t_prio.period = get_rt_period(t); + t_prio.pid = t->pid; + if (srp_higher_prio(&t_prio, &sem->ceiling)) { + sem->ceiling.period = t_prio.period; + sem->ceiling.pid = t_prio.pid; + } + } + + return ret; +} + +static void destroy_srp_semaphore(void* sem) +{ + /* XXX invariants */ + atomic_dec(&srp_objects_in_use); + kfree(sem); +} + +struct fdso_ops srp_sem_ops = { + .create = create_srp_semaphore, + .open = open_srp_semaphore, + .destroy = destroy_srp_semaphore +}; + + +static void do_srp_down(struct srp_semaphore* sem) +{ + /* Update ceiling. */ + srp_add_prio(&__get_cpu_var(srp), &sem->ceiling); + WARN_ON(sem->owner != NULL); + sem->owner = current; + TRACE_CUR("acquired srp 0x%p\n", sem); +} + +static void do_srp_up(struct srp_semaphore* sem) +{ + /* Determine new system priority ceiling for this CPU. */ + WARN_ON(!in_list(&sem->ceiling.list)); + if (in_list(&sem->ceiling.list)) + list_del(&sem->ceiling.list); + + sem->owner = NULL; + + /* Wake tasks on this CPU, if they exceed current ceiling. */ + TRACE_CUR("released srp 0x%p\n", sem); + wake_up_all(&__get_cpu_var(srp).ceiling_blocked); +} + +/* Adjust the system-wide priority ceiling if resource is claimed. 
+ */
+asmlinkage long sys_srp_down(int sem_od)
+{
+	int cpu;
+	int ret = -EINVAL;
+	struct srp_semaphore* sem;
+
+	/* disabling preemptions is sufficient protection since
+	 * SRP is strictly per CPU and we don't interfere with any
+	 * interrupt handlers
+	 */
+	preempt_disable();
+	TS_SRP_DOWN_START;
+
+	cpu = smp_processor_id();
+	sem = lookup_srp_sem(sem_od);
+	if (sem && sem->cpu == cpu) {
+		do_srp_down(sem);
+		ret = 0;
+	}
+
+	TS_SRP_DOWN_END;
+	preempt_enable();
+	return ret;
+}
+
+/* Adjust the system-wide priority ceiling if resource is freed. */
+asmlinkage long sys_srp_up(int sem_od)
+{
+	int cpu;
+	int ret = -EINVAL;
+	struct srp_semaphore* sem;
+
+	preempt_disable();
+	TS_SRP_UP_START;
+
+	cpu = smp_processor_id();
+	sem = lookup_srp_sem(sem_od);
+
+	if (sem && sem->cpu == cpu) {
+		do_srp_up(sem);
+		ret = 0;
+	}
+
+	TS_SRP_UP_END;
+	preempt_enable();
+	return ret;
+}
+
+static int srp_wake_up(wait_queue_t *wait, unsigned mode, int sync,
+		       void *key)
+{
+	int cpu = smp_processor_id();
+	struct task_struct *tsk = wait->private;
+	if (cpu != get_partition(tsk))
+		TRACE_TASK(tsk, "srp_wake_up on wrong cpu, partition is %d\n",
+			   get_partition(tsk));
+	else if (srp_exceeds_ceiling(tsk, &__get_cpu_var(srp)))
+		return default_wake_function(wait, mode, sync, key);
+	return 0;
+}
+
+
+
+static void do_ceiling_block(struct task_struct *tsk)
+{
+	wait_queue_t wait = {
+		.private   = tsk,
+		.func      = srp_wake_up,
+		.task_list = {NULL, NULL}
+	};
+
+	tsk->state = TASK_UNINTERRUPTIBLE;
+	add_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait);
+	tsk->rt_param.srp_non_recurse = 1;
+	preempt_enable_no_resched();
+	schedule();
+	preempt_disable();
+	tsk->rt_param.srp_non_recurse = 0;
+	remove_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait);
+}
+
+/* Wait for current task priority to exceed system-wide priority ceiling.
+ */
+void srp_ceiling_block(void)
+{
+	struct task_struct *tsk = current;
+
+	/* Only applies to real-time tasks; the fast path assumes an RT caller. */
+	if (unlikely(!is_realtime(tsk)))
+		return;
+
+	/* Avoid recursive ceiling blocking. */
+	if (unlikely(tsk->rt_param.srp_non_recurse))
+		return;
+
+	/* Bail out early if there aren't any SRP resources around. */
+	if (likely(!atomic_read(&srp_objects_in_use)))
+		return;
+
+	preempt_disable();
+	if (!srp_exceeds_ceiling(tsk, &__get_cpu_var(srp))) {
+		TRACE_CUR("is priority ceiling blocked.\n");
+		while (!srp_exceeds_ceiling(tsk, &__get_cpu_var(srp)))
+			do_ceiling_block(tsk);
+		TRACE_CUR("finally exceeds system ceiling.\n");
+	} else
+		TRACE_CUR("is not priority ceiling blocked.\n");
+	preempt_enable();
+}
+
+
+#else
+
+asmlinkage long sys_srp_down(int sem_od)
+{
+	return -ENOSYS;
+}
+
+asmlinkage long sys_srp_up(int sem_od)
+{
+	return -ENOSYS;
+}
+
+struct fdso_ops srp_sem_ops = {};
+
+#endif
diff --git a/litmus/sync.c b/litmus/sync.c
new file mode 100644
index 0000000..d5069f9
--- /dev/null
+++ b/litmus/sync.c
@@ -0,0 +1,90 @@
+/* litmus/sync.c - Support for synchronous and asynchronous task system releases.
+ *
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+
+static DECLARE_COMPLETION(ts_release);
+
+static long do_wait_for_ts_release(void)
+{
+	long ret = 0;
+
+	/* If the interruption races with a release, the completion object
+	 * may have a non-zero counter. To avoid this problem, this should
+	 * be replaced by wait_for_completion().
+	 *
+	 * For debugging purposes, this is interruptible for now.
+ */ + ret = wait_for_completion_interruptible(&ts_release); + + return ret; +} + + +static long do_release_ts(lt_t start) +{ + int task_count = 0; + long flags; + struct list_head *pos; + struct task_struct *t; + + + spin_lock_irqsave(&ts_release.wait.lock, flags); + TRACE("<<<<<< synchronous task system release >>>>>>\n"); + + sched_trace_sys_release(&start); + list_for_each(pos, &ts_release.wait.task_list) { + t = (struct task_struct*) list_entry(pos, + struct __wait_queue, + task_list)->private; + task_count++; + litmus->release_at(t, start + t->rt_param.task_params.phase); + sched_trace_task_release(t); + } + + spin_unlock_irqrestore(&ts_release.wait.lock, flags); + + complete_n(&ts_release, task_count); + + return task_count; +} + + +asmlinkage long sys_wait_for_ts_release(void) +{ + long ret = -EPERM; + struct task_struct *t = current; + + if (is_realtime(t)) + ret = do_wait_for_ts_release(); + + return ret; +} + + +asmlinkage long sys_release_ts(lt_t __user *__delay) +{ + long ret; + lt_t delay; + + /* FIXME: check capabilities... */ + + ret = copy_from_user(&delay, __delay, sizeof(lt_t)); + if (ret == 0) + ret = do_release_ts(litmus_clock() + delay); + + return ret; +} diff --git a/litmus/trace.c b/litmus/trace.c new file mode 100644 index 0000000..8851198 --- /dev/null +++ b/litmus/trace.c @@ -0,0 +1,83 @@ +#include + +#include +#include +#include + +/******************************************************************************/ +/* Allocation */ +/******************************************************************************/ + +static struct ftdev overhead_dev; + +#define trace_ts_buf overhead_dev.minor[0].buf + +static unsigned int ts_seq_no = 0; + +static inline void __save_timestamp(unsigned long event, uint8_t type) +{ + unsigned int seq_no; + struct timestamp *ts; + seq_no = fetch_and_inc((int *) &ts_seq_no); + if (ft_buffer_start_write(trace_ts_buf, (void**) &ts)) { + ts->event = event; + ts->timestamp = ft_timestamp(); + ts->seq_no = seq_no; + ts->cpu = raw_smp_processor_id(); + ts->task_type = type; + ft_buffer_finish_write(trace_ts_buf, ts); + } +} + +feather_callback void save_timestamp(unsigned long event) +{ + __save_timestamp(event, TSK_UNKNOWN); +} + +feather_callback void save_timestamp_def(unsigned long event, unsigned long type) +{ + __save_timestamp(event, (uint8_t) type); +} + +feather_callback void save_timestamp_task(unsigned long event, unsigned long t_ptr) +{ + int rt = is_realtime((struct task_struct *) t_ptr); + __save_timestamp(event, rt ? TSK_RT : TSK_BE); +} + +/******************************************************************************/ +/* DEVICE FILE DRIVER */ +/******************************************************************************/ + +#define NO_TIMESTAMPS (2 << 19) /* that should be 8 megs of ram, we may not get + * as much */ +#define FT_TRACE_MAJOR 252 + +static int alloc_timestamp_buffer(struct ftdev* ftdev, unsigned int idx) +{ + unsigned int count = NO_TIMESTAMPS; + while (count && !trace_ts_buf) { + printk("time stamp buffer: trying to allocate %u time stamps.\n", count); + ftdev->minor[idx].buf = alloc_ft_buffer(count, sizeof(struct timestamp)); + count /= 2; + } + return ftdev->minor[idx].buf ? 
0 : -ENOMEM; +} + +static void free_timestamp_buffer(struct ftdev* ftdev, unsigned int idx) +{ + free_ft_buffer(ftdev->minor[idx].buf); + ftdev->minor[idx].buf = NULL; +} + +static int __init init_ft_overhead_trace(void) +{ + printk("Initializing Feather-Trace overhead tracing device.\n"); + ftdev_init(&overhead_dev, THIS_MODULE); + overhead_dev.minor_cnt = 1; /* only one buffer */ + overhead_dev.alloc = alloc_timestamp_buffer; + overhead_dev.free = free_timestamp_buffer; + return register_ftdev(&overhead_dev, "ft_trace", FT_TRACE_MAJOR); +} + +module_init(init_ft_overhead_trace);
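
The TRACE() log registered by init_sched_trace() is an ordinary character device, so no special tooling is needed to read it. Below is a minimal user-space reader sketch; it is not part of the patch, and it assumes the device node is created by hand (for example with mknod /dev/litmus_log c 251 0, matching LOG_MAJOR above; the path itself is arbitrary). Because log_read() blocks until messages arrive, the loop simply streams trace output to stdout until interrupted.

/* minimal sketch: stream the litmus_log text device to stdout */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	/* hypothetical node path; create it manually to match LOG_MAJOR (251) */
	const char *dev = argc > 1 ? argv[1] : "/dev/litmus_log";
	char buf[4096];
	ssize_t n;
	int fd = open(dev, O_RDONLY);

	if (fd < 0) {
		perror("open");
		return 1;
	}
	/* each read() blocks in log_read() until TRACE() messages are available */
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stdout);
	close(fd);
	return 0;
}

The ft_trace device set up by init_ft_overhead_trace() can be read the same way, but it delivers binary struct timestamp records rather than text, so its output needs to be post-processed rather than printed directly.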