From b1e1fea67bca3796d5f9133a92c300ec4fa93a4f Mon Sep 17 00:00:00 2001 From: Jeremy Erickson Date: Thu, 30 Aug 2012 21:01:47 -0400 Subject: Bjoern's Dissertation Code with Priority Donation --- Makefile | 4 +- arch/arm/Kconfig | 8 + arch/arm/include/asm/timex.h | 2 + arch/arm/include/asm/unistd.h | 3 + arch/arm/kernel/calls.S | 12 + arch/arm/kernel/smp.c | 4 + arch/arm/mach-realview/include/mach/timex.h | 27 + arch/x86/Kconfig | 8 + arch/x86/include/asm/entry_arch.h | 1 + arch/x86/include/asm/feather_trace.h | 17 + arch/x86/include/asm/feather_trace_32.h | 79 ++ arch/x86/include/asm/feather_trace_64.h | 67 ++ arch/x86/include/asm/hw_irq.h | 3 + arch/x86/include/asm/irq_vectors.h | 5 + arch/x86/include/asm/processor.h | 4 + arch/x86/include/asm/unistd_32.h | 6 +- arch/x86/include/asm/unistd_64.h | 4 + arch/x86/kernel/Makefile | 2 + arch/x86/kernel/cpu/intel_cacheinfo.c | 17 + arch/x86/kernel/entry_64.S | 2 + arch/x86/kernel/ft_event.c | 118 ++ arch/x86/kernel/irqinit.c | 3 + arch/x86/kernel/smp.c | 27 + arch/x86/kernel/syscall_table_32.S | 12 + drivers/tty/vt/consolemap_deftbl.c | 86 ++ drivers/tty/vt/defkeymap.c | 262 +++++ fs/exec.c | 13 +- fs/inode.c | 2 + include/linux/completion.h | 1 + include/linux/fs.h | 21 +- include/linux/hrtimer.h | 32 + include/linux/sched.h | 19 +- include/linux/smp.h | 5 + include/linux/tick.h | 5 + include/litmus/bheap.h | 77 ++ include/litmus/budget.h | 8 + include/litmus/clustered.h | 44 + include/litmus/debug_trace.h | 37 + include/litmus/edf_common.h | 33 + include/litmus/fdso.h | 77 ++ include/litmus/feather_buffer.h | 94 ++ include/litmus/feather_trace.h | 65 ++ include/litmus/fp_common.h | 105 ++ include/litmus/ftdev.h | 55 + include/litmus/jobs.h | 9 + include/litmus/litmus.h | 292 +++++ include/litmus/litmus_proc.h | 25 + include/litmus/locking.h | 28 + include/litmus/preempt.h | 165 +++ include/litmus/rt_domain.h | 182 ++++ include/litmus/rt_param.h | 228 ++++ include/litmus/sched_plugin.h | 117 ++ include/litmus/sched_plugin.h.rej | 22 + include/litmus/sched_trace.h | 200 ++++ include/litmus/srp.h | 28 + include/litmus/trace.h | 129 +++ include/litmus/unistd_32.h | 21 + include/litmus/unistd_64.h | 33 + include/litmus/wait.h | 57 + kernel/exit.c | 4 + kernel/fork.c | 7 + kernel/hrtimer.c | 95 ++ kernel/printk.c | 14 +- kernel/sched.c | 127 ++- kernel/sched_fair.c | 2 +- kernel/sched_rt.c | 2 +- kernel/time/tick-sched.c | 47 + litmus/Kconfig | 185 ++++ litmus/Makefile | 30 + litmus/bheap.c | 314 ++++++ litmus/budget.c | 111 ++ litmus/clustered.c | 111 ++ litmus/ctrldev.c | 150 +++ litmus/edf_common.c | 143 +++ litmus/fdso.c | 297 ++++++ litmus/fp_common.c | 119 +++ litmus/ft_event.c | 43 + litmus/ftdev.c | 446 ++++++++ litmus/jobs.c | 43 + litmus/litmus.c | 555 ++++++++++ litmus/litmus_proc.c | 347 ++++++ litmus/locking.c | 186 ++++ litmus/preempt.c | 131 +++ litmus/rt_domain.c | 357 +++++++ litmus/sched_cedf.c | 1526 ++++++++++++++++++++++++++ litmus/sched_cedf.c.rej | 53 + litmus/sched_gfl_split_namechange.c | 1149 ++++++++++++++++++++ litmus/sched_gsn_edf.c | 1286 ++++++++++++++++++++++ litmus/sched_gsn_edf_split_namechange.c | 1165 ++++++++++++++++++++ litmus/sched_litmus.c | 328 ++++++ litmus/sched_litmus.c.rej | 11 + litmus/sched_pfair.c | 1056 ++++++++++++++++++ litmus/sched_pfp.c | 1542 +++++++++++++++++++++++++++ litmus/sched_plugin.c | 233 ++++ litmus/sched_psn_edf.c | 917 ++++++++++++++++ litmus/sched_task_trace.c | 241 +++++ litmus/sched_trace.c | 252 +++++ litmus/srp.c | 295 +++++ litmus/sync.c | 104 ++ litmus/trace.c | 213 ++++ 100 
files changed, 17213 insertions(+), 36 deletions(-) create mode 100644 arch/x86/include/asm/feather_trace.h create mode 100644 arch/x86/include/asm/feather_trace_32.h create mode 100644 arch/x86/include/asm/feather_trace_64.h create mode 100644 arch/x86/kernel/ft_event.c create mode 100644 drivers/tty/vt/consolemap_deftbl.c create mode 100644 drivers/tty/vt/defkeymap.c create mode 100644 include/litmus/bheap.h create mode 100644 include/litmus/budget.h create mode 100644 include/litmus/clustered.h create mode 100644 include/litmus/debug_trace.h create mode 100644 include/litmus/edf_common.h create mode 100644 include/litmus/fdso.h create mode 100644 include/litmus/feather_buffer.h create mode 100644 include/litmus/feather_trace.h create mode 100644 include/litmus/fp_common.h create mode 100644 include/litmus/ftdev.h create mode 100644 include/litmus/jobs.h create mode 100644 include/litmus/litmus.h create mode 100644 include/litmus/litmus_proc.h create mode 100644 include/litmus/locking.h create mode 100644 include/litmus/preempt.h create mode 100644 include/litmus/rt_domain.h create mode 100644 include/litmus/rt_param.h create mode 100644 include/litmus/sched_plugin.h create mode 100644 include/litmus/sched_plugin.h.rej create mode 100644 include/litmus/sched_trace.h create mode 100644 include/litmus/srp.h create mode 100644 include/litmus/trace.h create mode 100644 include/litmus/unistd_32.h create mode 100644 include/litmus/unistd_64.h create mode 100644 include/litmus/wait.h create mode 100644 litmus/Kconfig create mode 100644 litmus/Makefile create mode 100644 litmus/bheap.c create mode 100644 litmus/budget.c create mode 100644 litmus/clustered.c create mode 100644 litmus/ctrldev.c create mode 100644 litmus/edf_common.c create mode 100644 litmus/fdso.c create mode 100644 litmus/fp_common.c create mode 100644 litmus/ft_event.c create mode 100644 litmus/ftdev.c create mode 100644 litmus/jobs.c create mode 100644 litmus/litmus.c create mode 100644 litmus/litmus_proc.c create mode 100644 litmus/locking.c create mode 100644 litmus/preempt.c create mode 100644 litmus/rt_domain.c create mode 100644 litmus/sched_cedf.c create mode 100644 litmus/sched_cedf.c.rej create mode 100644 litmus/sched_gfl_split_namechange.c create mode 100644 litmus/sched_gsn_edf.c create mode 100644 litmus/sched_gsn_edf_split_namechange.c create mode 100644 litmus/sched_litmus.c create mode 100644 litmus/sched_litmus.c.rej create mode 100644 litmus/sched_pfair.c create mode 100644 litmus/sched_pfp.c create mode 100644 litmus/sched_plugin.c create mode 100644 litmus/sched_psn_edf.c create mode 100644 litmus/sched_task_trace.c create mode 100644 litmus/sched_trace.c create mode 100644 litmus/srp.c create mode 100644 litmus/sync.c create mode 100644 litmus/trace.c diff --git a/Makefile b/Makefile index 860c26af52c3..8e53f47a311b 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 36 -EXTRAVERSION = +EXTRAVERSION =-litmus2010 NAME = Flesh-Eating Bats with Fangs # *DOCUMENTATION* @@ -659,7 +659,7 @@ export mod_strip_cmd ifeq ($(KBUILD_EXTMOD),) -core-y += kernel/ mm/ fs/ ipc/ security/ crypto/ block/ +core-y += kernel/ mm/ fs/ ipc/ security/ crypto/ block/ litmus/ vmlinux-dirs := $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \ $(core-y) $(core-m) $(drivers-y) $(drivers-m) \ diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 9c26ba7244fb..babad6d7681a 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1808,3 +1808,11 @@ source "security/Kconfig" source "crypto/Kconfig" 
source "lib/Kconfig" + +config ARCH_HAS_SEND_PULL_TIMERS + def_bool n + +config ARCH_HAS_FEATHER_TRACE + def_bool n + +source "litmus/Kconfig" diff --git a/arch/arm/include/asm/timex.h b/arch/arm/include/asm/timex.h index 3be8de3adaba..8a102a383a36 100644 --- a/arch/arm/include/asm/timex.h +++ b/arch/arm/include/asm/timex.h @@ -16,9 +16,11 @@ typedef unsigned long cycles_t; +#ifndef get_cycles static inline cycles_t get_cycles (void) { return 0; } +#endif #endif diff --git a/arch/arm/include/asm/unistd.h b/arch/arm/include/asm/unistd.h index c891eb76c0e3..625b30490624 100644 --- a/arch/arm/include/asm/unistd.h +++ b/arch/arm/include/asm/unistd.h @@ -397,6 +397,9 @@ #define __NR_fanotify_mark (__NR_SYSCALL_BASE+368) #define __NR_prlimit64 (__NR_SYSCALL_BASE+369) +#define __NR_LITMUS (__NR_SYSCALL_BASE+370) +#include + /* * The following SWIs are ARM private. */ diff --git a/arch/arm/kernel/calls.S b/arch/arm/kernel/calls.S index 5c26eccef998..b99087ac85b9 100644 --- a/arch/arm/kernel/calls.S +++ b/arch/arm/kernel/calls.S @@ -379,6 +379,18 @@ CALL(sys_fanotify_init) CALL(sys_fanotify_mark) CALL(sys_prlimit64) +/* 370 */ CALL(sys_set_rt_task_param) + CALL(sys_get_rt_task_param) + CALL(sys_complete_job) + CALL(sys_od_open) + CALL(sys_od_close) +/* 375 */ CALL(sys_litmus_lock) + CALL(sys_litmus_unlock) + CALL(sys_query_job_no) + CALL(sys_wait_for_job_release) + CALL(sys_wait_for_ts_release) +/* 380 */ CALL(sys_release_ts) + CALL(sys_null_call) #ifndef syscalls_counted .equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls #define syscalls_counted diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index 40dc74f2b27f..b72fbf3d043c 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -38,6 +38,8 @@ #include #include +#include + /* * as from 2.5, kernels no longer have an init_tasks structure * so we need some other way of telling a new secondary core @@ -533,6 +535,8 @@ asmlinkage void __exception do_IPI(struct pt_regs *regs) * nothing more to do - eveything is * done on the interrupt return path */ + /* LITMUS^RT: take action based on scheduler state */ + sched_state_ipi(); break; case IPI_CALL_FUNC: diff --git a/arch/arm/mach-realview/include/mach/timex.h b/arch/arm/mach-realview/include/mach/timex.h index 4eeb069373c2..e8bcc40d1f08 100644 --- a/arch/arm/mach-realview/include/mach/timex.h +++ b/arch/arm/mach-realview/include/mach/timex.h @@ -21,3 +21,30 @@ */ #define CLOCK_TICK_RATE (50000000 / 16) + +#if defined(CONFIG_MACH_REALVIEW_PB11MP) || defined(CONFIG_MACH_REALVIEW_PB1176) + +static inline unsigned long realview_get_arm11_cp15_ccnt(void) +{ + unsigned long cycles; + /* Read CP15 CCNT register. */ + asm volatile ("mrc p15, 0, %0, c15, c12, 1" : "=r" (cycles)); + return cycles; +} + +#define get_cycles realview_get_arm11_cp15_ccnt + +#elif defined(CONFIG_MACH_REALVIEW_PBA8) + + +static inline unsigned long realview_get_a8_cp15_ccnt(void) +{ + unsigned long cycles; + /* Read CP15 CCNT register. 
*/ + asm volatile ("mrc p15, 0, %0, c9, c13, 0" : "=r" (cycles)); + return cycles; +} + +#define get_cycles realview_get_a8_cp15_ccnt + +#endif diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cea0cd9a316f..5181ed3a211a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2142,3 +2142,11 @@ source "crypto/Kconfig" source "arch/x86/kvm/Kconfig" source "lib/Kconfig" + +config ARCH_HAS_FEATHER_TRACE + def_bool y + +config ARCH_HAS_SEND_PULL_TIMERS + def_bool y + +source "litmus/Kconfig" diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index 8e8ec663a98f..5d07dea2ebb8 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -13,6 +13,7 @@ BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR) BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR) BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR) +BUILD_INTERRUPT(pull_timers_interrupt,PULL_TIMERS_VECTOR) BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR) BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR) diff --git a/arch/x86/include/asm/feather_trace.h b/arch/x86/include/asm/feather_trace.h new file mode 100644 index 000000000000..4fd31633405d --- /dev/null +++ b/arch/x86/include/asm/feather_trace.h @@ -0,0 +1,17 @@ +#ifndef _ARCH_FEATHER_TRACE_H +#define _ARCH_FEATHER_TRACE_H + +#include + +static inline unsigned long long ft_timestamp(void) +{ + return __native_read_tsc(); +} + +#ifdef CONFIG_X86_32 +#include "feather_trace_32.h" +#else +#include "feather_trace_64.h" +#endif + +#endif diff --git a/arch/x86/include/asm/feather_trace_32.h b/arch/x86/include/asm/feather_trace_32.h new file mode 100644 index 000000000000..70202f90f169 --- /dev/null +++ b/arch/x86/include/asm/feather_trace_32.h @@ -0,0 +1,79 @@ +/* Do not directly include this file. 
Include feather_trace.h instead */ + +#define feather_callback __attribute__((regparm(0))) + +/* + * make the compiler reload any register that is not saved in + * a cdecl function call + */ +#define CLOBBER_LIST "memory", "cc", "eax", "ecx", "edx" + +#define ft_event(id, callback) \ + __asm__ __volatile__( \ + "1: jmp 2f \n\t" \ + " call " #callback " \n\t" \ + ".section __event_table, \"aw\" \n\t" \ + ".long " #id ", 0, 1b, 2f \n\t" \ + ".previous \n\t" \ + "2: \n\t" \ + : : : CLOBBER_LIST) + +#define ft_event0(id, callback) \ + __asm__ __volatile__( \ + "1: jmp 2f \n\t" \ + " subl $4, %%esp \n\t" \ + " movl $" #id ", (%%esp) \n\t" \ + " call " #callback " \n\t" \ + " addl $4, %%esp \n\t" \ + ".section __event_table, \"aw\" \n\t" \ + ".long " #id ", 0, 1b, 2f \n\t" \ + ".previous \n\t" \ + "2: \n\t" \ + : : : CLOBBER_LIST) + +#define ft_event1(id, callback, param) \ + __asm__ __volatile__( \ + "1: jmp 2f \n\t" \ + " subl $8, %%esp \n\t" \ + " movl %0, 4(%%esp) \n\t" \ + " movl $" #id ", (%%esp) \n\t" \ + " call " #callback " \n\t" \ + " addl $8, %%esp \n\t" \ + ".section __event_table, \"aw\" \n\t" \ + ".long " #id ", 0, 1b, 2f \n\t" \ + ".previous \n\t" \ + "2: \n\t" \ + : : "r" (param) : CLOBBER_LIST) + +#define ft_event2(id, callback, param, param2) \ + __asm__ __volatile__( \ + "1: jmp 2f \n\t" \ + " subl $12, %%esp \n\t" \ + " movl %1, 8(%%esp) \n\t" \ + " movl %0, 4(%%esp) \n\t" \ + " movl $" #id ", (%%esp) \n\t" \ + " call " #callback " \n\t" \ + " addl $12, %%esp \n\t" \ + ".section __event_table, \"aw\" \n\t" \ + ".long " #id ", 0, 1b, 2f \n\t" \ + ".previous \n\t" \ + "2: \n\t" \ + : : "r" (param), "r" (param2) : CLOBBER_LIST) + + +#define ft_event3(id, callback, p, p2, p3) \ + __asm__ __volatile__( \ + "1: jmp 2f \n\t" \ + " subl $16, %%esp \n\t" \ + " movl %2, 12(%%esp) \n\t" \ + " movl %1, 8(%%esp) \n\t" \ + " movl %0, 4(%%esp) \n\t" \ + " movl $" #id ", (%%esp) \n\t" \ + " call " #callback " \n\t" \ + " addl $16, %%esp \n\t" \ + ".section __event_table, \"aw\" \n\t" \ + ".long " #id ", 0, 1b, 2f \n\t" \ + ".previous \n\t" \ + "2: \n\t" \ + : : "r" (p), "r" (p2), "r" (p3) : CLOBBER_LIST) + diff --git a/arch/x86/include/asm/feather_trace_64.h b/arch/x86/include/asm/feather_trace_64.h new file mode 100644 index 000000000000..54ac2aeb3a28 --- /dev/null +++ b/arch/x86/include/asm/feather_trace_64.h @@ -0,0 +1,67 @@ +/* Do not directly include this file. 
Include feather_trace.h instead */ + +/* regparm is the default on x86_64 */ +#define feather_callback + +# define _EVENT_TABLE(id,from,to) \ + ".section __event_table, \"aw\"\n\t" \ + ".balign 8\n\t" \ + ".quad " #id ", 0, " #from ", " #to " \n\t" \ + ".previous \n\t" + +/* + * x86_64 callee only owns rbp, rbx, r12 -> r15 + * the called can freely modify the others + */ +#define CLOBBER_LIST "memory", "cc", "rdi", "rsi", "rdx", "rcx", \ + "r8", "r9", "r10", "r11", "rax" + +#define ft_event(id, callback) \ + __asm__ __volatile__( \ + "1: jmp 2f \n\t" \ + " call " #callback " \n\t" \ + _EVENT_TABLE(id,1b,2f) \ + "2: \n\t" \ + : : : CLOBBER_LIST) + +#define ft_event0(id, callback) \ + __asm__ __volatile__( \ + "1: jmp 2f \n\t" \ + " movq $" #id ", %%rdi \n\t" \ + " call " #callback " \n\t" \ + _EVENT_TABLE(id,1b,2f) \ + "2: \n\t" \ + : : : CLOBBER_LIST) + +#define ft_event1(id, callback, param) \ + __asm__ __volatile__( \ + "1: jmp 2f \n\t" \ + " movq %0, %%rsi \n\t" \ + " movq $" #id ", %%rdi \n\t" \ + " call " #callback " \n\t" \ + _EVENT_TABLE(id,1b,2f) \ + "2: \n\t" \ + : : "r" (param) : CLOBBER_LIST) + +#define ft_event2(id, callback, param, param2) \ + __asm__ __volatile__( \ + "1: jmp 2f \n\t" \ + " movq %1, %%rdx \n\t" \ + " movq %0, %%rsi \n\t" \ + " movq $" #id ", %%rdi \n\t" \ + " call " #callback " \n\t" \ + _EVENT_TABLE(id,1b,2f) \ + "2: \n\t" \ + : : "r" (param), "r" (param2) : CLOBBER_LIST) + +#define ft_event3(id, callback, p, p2, p3) \ + __asm__ __volatile__( \ + "1: jmp 2f \n\t" \ + " movq %2, %%rcx \n\t" \ + " movq %1, %%rdx \n\t" \ + " movq %0, %%rsi \n\t" \ + " movq $" #id ", %%rdi \n\t" \ + " call " #callback " \n\t" \ + _EVENT_TABLE(id,1b,2f) \ + "2: \n\t" \ + : : "r" (p), "r" (p2), "r" (p3) : CLOBBER_LIST) diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 46c0fe05f230..c17411503f28 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -53,6 +53,8 @@ extern void threshold_interrupt(void); extern void call_function_interrupt(void); extern void call_function_single_interrupt(void); +extern void pull_timers_interrupt(void); + /* IOAPIC */ #define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1<<(x)) & io_apic_irqs)) extern unsigned long io_apic_irqs; @@ -122,6 +124,7 @@ extern asmlinkage void smp_irq_move_cleanup_interrupt(void); extern void smp_reschedule_interrupt(struct pt_regs *); extern void smp_call_function_interrupt(struct pt_regs *); extern void smp_call_function_single_interrupt(struct pt_regs *); +extern void smp_pull_timers_interrupt(struct pt_regs *); #ifdef CONFIG_X86_32 extern void smp_invalidate_interrupt(struct pt_regs *); #else diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index e2ca30092557..6143ebeeebfa 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -108,6 +108,11 @@ */ #define LOCAL_TIMER_VECTOR 0xef +/* + * LITMUS^RT pull timers IRQ vector + */ +#define PULL_TIMERS_VECTOR 0xee + /* * Generic system vector for platform specific use */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 325b7bdbebaa..ebaa04a8d3af 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -169,6 +169,10 @@ extern void print_cpu_info(struct cpuinfo_x86 *); extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); extern unsigned short num_cache_leaves; +#ifdef CONFIG_SYSFS +extern int 
get_shared_cpu_map(cpumask_var_t mask, + unsigned int cpu, int index); +#endif extern void detect_extended_topology(struct cpuinfo_x86 *c); extern void detect_ht(struct cpuinfo_x86 *c); diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index b766a5e8ba0e..b7ba19acd3f8 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -347,9 +347,13 @@ #define __NR_fanotify_mark 339 #define __NR_prlimit64 340 +#define __NR_LITMUS 341 + +#include "litmus/unistd_32.h" + #ifdef __KERNEL__ -#define NR_syscalls 341 +#define NR_syscalls 341 + NR_litmus_syscalls #define __ARCH_WANT_IPC_PARSE_VERSION #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index 363e9b8a715b..332bf3c9c84c 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -670,6 +670,10 @@ __SYSCALL(__NR_fanotify_mark, sys_fanotify_mark) #define __NR_prlimit64 302 __SYSCALL(__NR_prlimit64, sys_prlimit64) +#define __NR_LITMUS 303 + +#include "litmus/unistd_64.h" + #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR #define __ARCH_WANT_OLD_STAT diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index fedf32a8c3ec..6890dbb9ac15 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -118,6 +118,8 @@ obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o +obj-$(CONFIG_FEATHER_TRACE) += ft_event.o + ### # 64 bit specific files ifeq ($(CONFIG_X86_64),y) diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 898c2f4eab88..3fec7d9bfd62 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -758,6 +758,23 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c) static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info); #define CPUID4_INFO_IDX(x, y) (&((per_cpu(ici_cpuid4_info, x))[y])) +/* returns CPUs that share the index cache with cpu */ +int get_shared_cpu_map(cpumask_var_t mask, unsigned int cpu, int index) +{ + int ret = 0; + struct _cpuid4_info *this_leaf; + + if (index >= num_cache_leaves) { + index = num_cache_leaves - 1; + ret = index; + } + + this_leaf = CPUID4_INFO_IDX(cpu,index); + cpumask_copy(mask, to_cpumask(this_leaf->shared_cpu_map)); + + return ret; +} + #ifdef CONFIG_SMP static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index) { diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 17be5ec7cbba..115e8951e8c8 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1016,6 +1016,8 @@ apicinterrupt CALL_FUNCTION_VECTOR \ call_function_interrupt smp_call_function_interrupt apicinterrupt RESCHEDULE_VECTOR \ reschedule_interrupt smp_reschedule_interrupt +apicinterrupt PULL_TIMERS_VECTOR \ + pull_timers_interrupt smp_pull_timers_interrupt #endif apicinterrupt ERROR_APIC_VECTOR \ diff --git a/arch/x86/kernel/ft_event.c b/arch/x86/kernel/ft_event.c new file mode 100644 index 000000000000..37cc33252713 --- /dev/null +++ b/arch/x86/kernel/ft_event.c @@ -0,0 +1,118 @@ +#include + +#include + +/* the feather trace management functions assume + * exclusive access to the event table + */ + +#ifndef CONFIG_DEBUG_RODATA + +#define BYTE_JUMP 0xeb +#define BYTE_JUMP_LEN 0x02 + +/* for each event, there is an entry in the event table */ +struct trace_event { + long id; + long count; + long start_addr; + long end_addr; +}; + +extern struct trace_event __start___event_table[]; 
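/*
 * Illustrative usage sketch (not part of the patch): how the event table
 * declared here lines up with the ft_event*() macros from
 * arch/x86/include/asm/feather_trace_{32,64}.h.  Each macro emits a short
 * "jmp 2f" over the call sequence and records <id, 0, start_addr, end_addr>
 * in the __event_table section; ft_enable_event() patches the jump
 * displacement to 0 when the first user enables the id, so the call runs,
 * and ft_disable_event() restores the displacement.  The event id (42) and
 * the callback below are made-up example names, not symbols from the patch.
 */
#include <litmus/feather_trace.h>	/* ft_event1(), feather_callback, ft_timestamp() */

static unsigned long long last_stamp;

/* callbacks use the calling convention selected by 'feather_callback' */
feather_callback void example_probe(unsigned long id, unsigned long arg)
{
	last_stamp = ft_timestamp();	/* TSC-based timestamp */
	(void) id;
	(void) arg;
}

static void example_hot_path(unsigned long job_no)
{
	/* compiles to a two-byte jump while event 42 is disabled */
	ft_event1(42, example_probe, job_no);
}

/* elsewhere: ft_enable_event(42) activates the probe, ft_disable_event(42)
 * deactivates it, and ft_is_event_enabled(42) reports the current count. */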
+extern struct trace_event __stop___event_table[]; + +/* Workaround: if no events are defined, then the event_table section does not + * exist and the above references cause linker errors. This could probably be + * fixed by adjusting the linker script, but it is easier to maintain for us if + * we simply create a dummy symbol in the event table section. + */ +int __event_table_dummy[0] __attribute__ ((section("__event_table"))); + +int ft_enable_event(unsigned long id) +{ + struct trace_event* te = __start___event_table; + int count = 0; + char* delta; + unsigned char* instr; + + while (te < __stop___event_table) { + if (te->id == id && ++te->count == 1) { + instr = (unsigned char*) te->start_addr; + /* make sure we don't clobber something wrong */ + if (*instr == BYTE_JUMP) { + delta = (((unsigned char*) te->start_addr) + 1); + *delta = 0; + } + } + if (te->id == id) + count++; + te++; + } + + printk(KERN_DEBUG "ft_enable_event: enabled %d events\n", count); + return count; +} + +int ft_disable_event(unsigned long id) +{ + struct trace_event* te = __start___event_table; + int count = 0; + char* delta; + unsigned char* instr; + + while (te < __stop___event_table) { + if (te->id == id && --te->count == 0) { + instr = (unsigned char*) te->start_addr; + if (*instr == BYTE_JUMP) { + delta = (((unsigned char*) te->start_addr) + 1); + *delta = te->end_addr - te->start_addr - + BYTE_JUMP_LEN; + } + } + if (te->id == id) + count++; + te++; + } + + printk(KERN_DEBUG "ft_disable_event: disabled %d events\n", count); + return count; +} + +int ft_disable_all_events(void) +{ + struct trace_event* te = __start___event_table; + int count = 0; + char* delta; + unsigned char* instr; + + while (te < __stop___event_table) { + if (te->count) { + instr = (unsigned char*) te->start_addr; + if (*instr == BYTE_JUMP) { + delta = (((unsigned char*) te->start_addr) + + 1); + *delta = te->end_addr - te->start_addr - + BYTE_JUMP_LEN; + te->count = 0; + count++; + } + } + te++; + } + return count; +} + +int ft_is_event_enabled(unsigned long id) +{ + struct trace_event* te = __start___event_table; + + while (te < __stop___event_table) { + if (te->id == id) + return te->count; + te++; + } + return 0; +} + +#endif diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 990ae7cfc578..9772b1a0f9a4 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -189,6 +189,9 @@ static void __init smp_intr_init(void) alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR, call_function_single_interrupt); + /* IPI for hrtimer pulling on remote cpus */ + alloc_intr_gate(PULL_TIMERS_VECTOR, pull_timers_interrupt); + /* Low priority IPI to cleanup after moving an irq */ set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c index d801210945d6..74cca6014c0e 100644 --- a/arch/x86/kernel/smp.c +++ b/arch/x86/kernel/smp.c @@ -23,6 +23,10 @@ #include #include +#include +#include +#include + #include #include #include @@ -118,6 +122,7 @@ static void native_smp_send_reschedule(int cpu) WARN_ON(1); return; } + TS_SEND_RESCHED_START(cpu); apic->send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR); } @@ -147,6 +152,16 @@ void native_send_call_func_ipi(const struct cpumask *mask) free_cpumask_var(allbutself); } +/* trigger timers on remote cpu */ +void smp_send_pull_timers(int cpu) +{ + if (unlikely(cpu_is_offline(cpu))) { + WARN_ON(1); + return; + } + apic->send_IPI_mask(cpumask_of(cpu), PULL_TIMERS_VECTOR); 
+} + /* * this function calls the 'stop' function on all other CPUs in the system. */ @@ -198,7 +213,10 @@ static void native_smp_send_stop(void) void smp_reschedule_interrupt(struct pt_regs *regs) { ack_APIC_irq(); + /* LITMUS^RT: this IPI might need to trigger the sched state machine. */ + sched_state_ipi(); inc_irq_stat(irq_resched_count); + TS_SEND_RESCHED_END; /* * KVM uses this interrupt to force a cpu out of guest mode */ @@ -222,6 +240,15 @@ void smp_call_function_single_interrupt(struct pt_regs *regs) irq_exit(); } +extern void hrtimer_pull(void); + +void smp_pull_timers_interrupt(struct pt_regs *regs) +{ + ack_APIC_irq(); + TRACE("pull timer interrupt\n"); + hrtimer_pull(); +} + struct smp_ops smp_ops = { .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu, .smp_prepare_cpus = native_smp_prepare_cpus, diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index b35786dc9b8f..37702905f658 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -340,3 +340,15 @@ ENTRY(sys_call_table) .long sys_fanotify_init .long sys_fanotify_mark .long sys_prlimit64 /* 340 */ + .long sys_set_rt_task_param /* LITMUS^RT 341 */ + .long sys_get_rt_task_param + .long sys_complete_job + .long sys_od_open + .long sys_od_close + .long sys_litmus_lock + .long sys_litmus_unlock + .long sys_query_job_no + .long sys_wait_for_job_release + .long sys_wait_for_ts_release + .long sys_release_ts + .long sys_null_call diff --git a/drivers/tty/vt/consolemap_deftbl.c b/drivers/tty/vt/consolemap_deftbl.c new file mode 100644 index 000000000000..5f141383566b --- /dev/null +++ b/drivers/tty/vt/consolemap_deftbl.c @@ -0,0 +1,86 @@ +/* + * Do not edit this file; it was automatically generated by + * + * conmakehash drivers/tty/vt/cp437.uni > [this file] + * + */ + +#include + +u8 dfont_unicount[256] = +{ + 1, 1, 1, 1, 2, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 2, + 2, 2, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 2, 1, 1, 1, 1, 2, + 1, 1, 1, 1, 2, 2, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 5, 1, 2, 2, 4, 1, 1, + 1, 5, 1, 2, 1, 1, 1, 5, + 1, 1, 2, 1, 1, 4, 1, 1, + 1, 2, 1, 1, 1, 1, 1, 3, + 1, 2, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 2, + 1, 1, 1, 1, 1, 1, 1, 1, + 2, 2, 1, 1, 2, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 2, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 1, 1, 1, 1, 2, 1, + 2, 1, 2, 2, 1, 2, 2, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 2, 1 +}; + +u16 dfont_unitable[303] = +{ + 0x0000, 0x263a, 0x263b, 0x2665, 0x2666, 0x25c6, 0x2663, 0x2660, + 0x2022, 0x25d8, 0x25cb, 0x25d9, 0x2642, 0x2640, 0x266a, 0x266b, + 0x263c, 0x00a4, 0x25b6, 0x25ba, 0x25c0, 0x25c4, 0x2195, 0x203c, + 0x00b6, 0x00a7, 0x25ac, 0x21a8, 0x2191, 0x2193, 0x2192, 0x2190, + 0x221f, 0x2194, 0x25b2, 0x25bc, 0x0020, 0x0021, 0x0022, 0x00a8, + 0x0023, 0x0024, 0x0025, 0x0026, 0x0027, 0x00b4, 0x0028, 0x0029, + 0x002a, 0x002b, 0x002c, 0x00b8, 0x002d, 0x00ad, 0x002e, 0x002f, + 0x0030, 0x0031, 0x0032, 0x0033, 0x0034, 0x0035, 0x0036, 0x0037, + 0x0038, 0x0039, 0x003a, 0x003b, 0x003c, 0x003d, 0x003e, 0x003f, + 0x0040, 0x0041, 0x00c0, 0x00c1, 0x00c2, 0x00c3, 0x0042, 0x0043, + 0x00a9, 0x0044, 0x00d0, 0x0045, 0x00c8, 0x00ca, 0x00cb, 0x0046, + 0x0047, 0x0048, 0x0049, 0x00cc, 0x00cd, 0x00ce, 0x00cf, 0x004a, + 0x004b, 0x212a, 0x004c, 
0x004d, 0x004e, 0x004f, 0x00d2, 0x00d3, + 0x00d4, 0x00d5, 0x0050, 0x0051, 0x0052, 0x00ae, 0x0053, 0x0054, + 0x0055, 0x00d9, 0x00da, 0x00db, 0x0056, 0x0057, 0x0058, 0x0059, + 0x00dd, 0x005a, 0x005b, 0x005c, 0x005d, 0x005e, 0x005f, 0x23bd, + 0xf804, 0x0060, 0x0061, 0x00e3, 0x0062, 0x0063, 0x0064, 0x0065, + 0x0066, 0x0067, 0x0068, 0x0069, 0x006a, 0x006b, 0x006c, 0x006d, + 0x006e, 0x006f, 0x00f5, 0x0070, 0x0071, 0x0072, 0x0073, 0x0074, + 0x0075, 0x0076, 0x0077, 0x0078, 0x00d7, 0x0079, 0x00fd, 0x007a, + 0x007b, 0x007c, 0x00a6, 0x007d, 0x007e, 0x2302, 0x00c7, 0x00fc, + 0x00e9, 0x00e2, 0x00e4, 0x00e0, 0x00e5, 0x00e7, 0x00ea, 0x00eb, + 0x00e8, 0x00ef, 0x00ee, 0x00ec, 0x00c4, 0x00c5, 0x212b, 0x00c9, + 0x00e6, 0x00c6, 0x00f4, 0x00f6, 0x00f2, 0x00fb, 0x00f9, 0x00ff, + 0x00d6, 0x00dc, 0x00a2, 0x00a3, 0x00a5, 0x20a7, 0x0192, 0x00e1, + 0x00ed, 0x00f3, 0x00fa, 0x00f1, 0x00d1, 0x00aa, 0x00ba, 0x00bf, + 0x2310, 0x00ac, 0x00bd, 0x00bc, 0x00a1, 0x00ab, 0x00bb, 0x2591, + 0x2592, 0x2593, 0x2502, 0x2524, 0x2561, 0x2562, 0x2556, 0x2555, + 0x2563, 0x2551, 0x2557, 0x255d, 0x255c, 0x255b, 0x2510, 0x2514, + 0x2534, 0x252c, 0x251c, 0x2500, 0x253c, 0x255e, 0x255f, 0x255a, + 0x2554, 0x2569, 0x2566, 0x2560, 0x2550, 0x256c, 0x2567, 0x2568, + 0x2564, 0x2565, 0x2559, 0x2558, 0x2552, 0x2553, 0x256b, 0x256a, + 0x2518, 0x250c, 0x2588, 0x2584, 0x258c, 0x2590, 0x2580, 0x03b1, + 0x03b2, 0x00df, 0x0393, 0x03c0, 0x03a3, 0x03c3, 0x00b5, 0x03bc, + 0x03c4, 0x03a6, 0x00d8, 0x0398, 0x03a9, 0x2126, 0x03b4, 0x00f0, + 0x221e, 0x03c6, 0x00f8, 0x03b5, 0x2208, 0x2229, 0x2261, 0x00b1, + 0x2265, 0x2264, 0x2320, 0x2321, 0x00f7, 0x2248, 0x00b0, 0x2219, + 0x00b7, 0x221a, 0x207f, 0x00b2, 0x25a0, 0xfffd, 0x00a0 +}; diff --git a/drivers/tty/vt/defkeymap.c b/drivers/tty/vt/defkeymap.c new file mode 100644 index 000000000000..d2208dfe3f67 --- /dev/null +++ b/drivers/tty/vt/defkeymap.c @@ -0,0 +1,262 @@ +/* Do not edit this file! 
It was automatically generated by */ +/* loadkeys --mktable defkeymap.map > defkeymap.c */ + +#include +#include +#include + +u_short plain_map[NR_KEYS] = { + 0xf200, 0xf01b, 0xf031, 0xf032, 0xf033, 0xf034, 0xf035, 0xf036, + 0xf037, 0xf038, 0xf039, 0xf030, 0xf02d, 0xf03d, 0xf07f, 0xf009, + 0xfb71, 0xfb77, 0xfb65, 0xfb72, 0xfb74, 0xfb79, 0xfb75, 0xfb69, + 0xfb6f, 0xfb70, 0xf05b, 0xf05d, 0xf201, 0xf702, 0xfb61, 0xfb73, + 0xfb64, 0xfb66, 0xfb67, 0xfb68, 0xfb6a, 0xfb6b, 0xfb6c, 0xf03b, + 0xf027, 0xf060, 0xf700, 0xf05c, 0xfb7a, 0xfb78, 0xfb63, 0xfb76, + 0xfb62, 0xfb6e, 0xfb6d, 0xf02c, 0xf02e, 0xf02f, 0xf700, 0xf30c, + 0xf703, 0xf020, 0xf207, 0xf100, 0xf101, 0xf102, 0xf103, 0xf104, + 0xf105, 0xf106, 0xf107, 0xf108, 0xf109, 0xf208, 0xf209, 0xf307, + 0xf308, 0xf309, 0xf30b, 0xf304, 0xf305, 0xf306, 0xf30a, 0xf301, + 0xf302, 0xf303, 0xf300, 0xf310, 0xf206, 0xf200, 0xf03c, 0xf10a, + 0xf10b, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, + 0xf30e, 0xf702, 0xf30d, 0xf01c, 0xf701, 0xf205, 0xf114, 0xf603, + 0xf118, 0xf601, 0xf602, 0xf117, 0xf600, 0xf119, 0xf115, 0xf116, + 0xf11a, 0xf10c, 0xf10d, 0xf11b, 0xf11c, 0xf110, 0xf311, 0xf11d, + 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, +}; + +u_short shift_map[NR_KEYS] = { + 0xf200, 0xf01b, 0xf021, 0xf040, 0xf023, 0xf024, 0xf025, 0xf05e, + 0xf026, 0xf02a, 0xf028, 0xf029, 0xf05f, 0xf02b, 0xf07f, 0xf009, + 0xfb51, 0xfb57, 0xfb45, 0xfb52, 0xfb54, 0xfb59, 0xfb55, 0xfb49, + 0xfb4f, 0xfb50, 0xf07b, 0xf07d, 0xf201, 0xf702, 0xfb41, 0xfb53, + 0xfb44, 0xfb46, 0xfb47, 0xfb48, 0xfb4a, 0xfb4b, 0xfb4c, 0xf03a, + 0xf022, 0xf07e, 0xf700, 0xf07c, 0xfb5a, 0xfb58, 0xfb43, 0xfb56, + 0xfb42, 0xfb4e, 0xfb4d, 0xf03c, 0xf03e, 0xf03f, 0xf700, 0xf30c, + 0xf703, 0xf020, 0xf207, 0xf10a, 0xf10b, 0xf10c, 0xf10d, 0xf10e, + 0xf10f, 0xf110, 0xf111, 0xf112, 0xf113, 0xf213, 0xf203, 0xf307, + 0xf308, 0xf309, 0xf30b, 0xf304, 0xf305, 0xf306, 0xf30a, 0xf301, + 0xf302, 0xf303, 0xf300, 0xf310, 0xf206, 0xf200, 0xf03e, 0xf10a, + 0xf10b, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, + 0xf30e, 0xf702, 0xf30d, 0xf200, 0xf701, 0xf205, 0xf114, 0xf603, + 0xf20b, 0xf601, 0xf602, 0xf117, 0xf600, 0xf20a, 0xf115, 0xf116, + 0xf11a, 0xf10c, 0xf10d, 0xf11b, 0xf11c, 0xf110, 0xf311, 0xf11d, + 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, +}; + +u_short altgr_map[NR_KEYS] = { + 0xf200, 0xf200, 0xf200, 0xf040, 0xf200, 0xf024, 0xf200, 0xf200, + 0xf07b, 0xf05b, 0xf05d, 0xf07d, 0xf05c, 0xf200, 0xf200, 0xf200, + 0xfb71, 0xfb77, 0xf918, 0xfb72, 0xfb74, 0xfb79, 0xfb75, 0xfb69, + 0xfb6f, 0xfb70, 0xf200, 0xf07e, 0xf201, 0xf702, 0xf914, 0xfb73, + 0xf917, 0xf919, 0xfb67, 0xfb68, 0xfb6a, 0xfb6b, 0xfb6c, 0xf200, + 0xf200, 0xf200, 0xf700, 0xf200, 0xfb7a, 0xfb78, 0xf916, 0xfb76, + 0xf915, 0xfb6e, 0xfb6d, 0xf200, 0xf200, 0xf200, 0xf700, 0xf30c, + 0xf703, 0xf200, 0xf207, 0xf50c, 0xf50d, 0xf50e, 0xf50f, 0xf510, + 0xf511, 0xf512, 0xf513, 0xf514, 0xf515, 0xf208, 0xf202, 0xf911, + 0xf912, 0xf913, 0xf30b, 0xf90e, 0xf90f, 0xf910, 0xf30a, 0xf90b, + 0xf90c, 0xf90d, 0xf90a, 0xf310, 0xf206, 0xf200, 0xf07c, 0xf516, + 0xf517, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, + 0xf30e, 0xf702, 0xf30d, 0xf200, 0xf701, 0xf205, 0xf114, 0xf603, + 0xf118, 0xf601, 0xf602, 0xf117, 0xf600, 0xf119, 0xf115, 0xf116, + 0xf11a, 0xf10c, 0xf10d, 0xf11b, 0xf11c, 0xf110, 0xf311, 0xf11d, + 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, +}; + +u_short ctrl_map[NR_KEYS] = { + 0xf200, 0xf200, 0xf200, 0xf000, 0xf01b, 0xf01c, 0xf01d, 0xf01e, + 0xf01f, 0xf07f, 0xf200, 0xf200, 0xf01f, 0xf200, 
0xf008, 0xf200, + 0xf011, 0xf017, 0xf005, 0xf012, 0xf014, 0xf019, 0xf015, 0xf009, + 0xf00f, 0xf010, 0xf01b, 0xf01d, 0xf201, 0xf702, 0xf001, 0xf013, + 0xf004, 0xf006, 0xf007, 0xf008, 0xf00a, 0xf00b, 0xf00c, 0xf200, + 0xf007, 0xf000, 0xf700, 0xf01c, 0xf01a, 0xf018, 0xf003, 0xf016, + 0xf002, 0xf00e, 0xf00d, 0xf200, 0xf20e, 0xf07f, 0xf700, 0xf30c, + 0xf703, 0xf000, 0xf207, 0xf100, 0xf101, 0xf102, 0xf103, 0xf104, + 0xf105, 0xf106, 0xf107, 0xf108, 0xf109, 0xf208, 0xf204, 0xf307, + 0xf308, 0xf309, 0xf30b, 0xf304, 0xf305, 0xf306, 0xf30a, 0xf301, + 0xf302, 0xf303, 0xf300, 0xf310, 0xf206, 0xf200, 0xf200, 0xf10a, + 0xf10b, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, + 0xf30e, 0xf702, 0xf30d, 0xf01c, 0xf701, 0xf205, 0xf114, 0xf603, + 0xf118, 0xf601, 0xf602, 0xf117, 0xf600, 0xf119, 0xf115, 0xf116, + 0xf11a, 0xf10c, 0xf10d, 0xf11b, 0xf11c, 0xf110, 0xf311, 0xf11d, + 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, +}; + +u_short shift_ctrl_map[NR_KEYS] = { + 0xf200, 0xf200, 0xf200, 0xf000, 0xf200, 0xf200, 0xf200, 0xf200, + 0xf200, 0xf200, 0xf200, 0xf200, 0xf01f, 0xf200, 0xf200, 0xf200, + 0xf011, 0xf017, 0xf005, 0xf012, 0xf014, 0xf019, 0xf015, 0xf009, + 0xf00f, 0xf010, 0xf200, 0xf200, 0xf201, 0xf702, 0xf001, 0xf013, + 0xf004, 0xf006, 0xf007, 0xf008, 0xf00a, 0xf00b, 0xf00c, 0xf200, + 0xf200, 0xf200, 0xf700, 0xf200, 0xf01a, 0xf018, 0xf003, 0xf016, + 0xf002, 0xf00e, 0xf00d, 0xf200, 0xf200, 0xf200, 0xf700, 0xf30c, + 0xf703, 0xf200, 0xf207, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, + 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf208, 0xf200, 0xf307, + 0xf308, 0xf309, 0xf30b, 0xf304, 0xf305, 0xf306, 0xf30a, 0xf301, + 0xf302, 0xf303, 0xf300, 0xf310, 0xf206, 0xf200, 0xf200, 0xf200, + 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, + 0xf30e, 0xf702, 0xf30d, 0xf200, 0xf701, 0xf205, 0xf114, 0xf603, + 0xf118, 0xf601, 0xf602, 0xf117, 0xf600, 0xf119, 0xf115, 0xf116, + 0xf11a, 0xf10c, 0xf10d, 0xf11b, 0xf11c, 0xf110, 0xf311, 0xf11d, + 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, +}; + +u_short alt_map[NR_KEYS] = { + 0xf200, 0xf81b, 0xf831, 0xf832, 0xf833, 0xf834, 0xf835, 0xf836, + 0xf837, 0xf838, 0xf839, 0xf830, 0xf82d, 0xf83d, 0xf87f, 0xf809, + 0xf871, 0xf877, 0xf865, 0xf872, 0xf874, 0xf879, 0xf875, 0xf869, + 0xf86f, 0xf870, 0xf85b, 0xf85d, 0xf80d, 0xf702, 0xf861, 0xf873, + 0xf864, 0xf866, 0xf867, 0xf868, 0xf86a, 0xf86b, 0xf86c, 0xf83b, + 0xf827, 0xf860, 0xf700, 0xf85c, 0xf87a, 0xf878, 0xf863, 0xf876, + 0xf862, 0xf86e, 0xf86d, 0xf82c, 0xf82e, 0xf82f, 0xf700, 0xf30c, + 0xf703, 0xf820, 0xf207, 0xf500, 0xf501, 0xf502, 0xf503, 0xf504, + 0xf505, 0xf506, 0xf507, 0xf508, 0xf509, 0xf208, 0xf209, 0xf907, + 0xf908, 0xf909, 0xf30b, 0xf904, 0xf905, 0xf906, 0xf30a, 0xf901, + 0xf902, 0xf903, 0xf900, 0xf310, 0xf206, 0xf200, 0xf83c, 0xf50a, + 0xf50b, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, + 0xf30e, 0xf702, 0xf30d, 0xf01c, 0xf701, 0xf205, 0xf114, 0xf603, + 0xf118, 0xf210, 0xf211, 0xf117, 0xf600, 0xf119, 0xf115, 0xf116, + 0xf11a, 0xf10c, 0xf10d, 0xf11b, 0xf11c, 0xf110, 0xf311, 0xf11d, + 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, +}; + +u_short ctrl_alt_map[NR_KEYS] = { + 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, + 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, + 0xf811, 0xf817, 0xf805, 0xf812, 0xf814, 0xf819, 0xf815, 0xf809, + 0xf80f, 0xf810, 0xf200, 0xf200, 0xf201, 0xf702, 0xf801, 0xf813, + 0xf804, 0xf806, 0xf807, 0xf808, 0xf80a, 0xf80b, 0xf80c, 0xf200, + 0xf200, 0xf200, 0xf700, 0xf200, 0xf81a, 0xf818, 
0xf803, 0xf816, + 0xf802, 0xf80e, 0xf80d, 0xf200, 0xf200, 0xf200, 0xf700, 0xf30c, + 0xf703, 0xf200, 0xf207, 0xf500, 0xf501, 0xf502, 0xf503, 0xf504, + 0xf505, 0xf506, 0xf507, 0xf508, 0xf509, 0xf208, 0xf200, 0xf307, + 0xf308, 0xf309, 0xf30b, 0xf304, 0xf305, 0xf306, 0xf30a, 0xf301, + 0xf302, 0xf303, 0xf300, 0xf20c, 0xf206, 0xf200, 0xf200, 0xf50a, + 0xf50b, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, + 0xf30e, 0xf702, 0xf30d, 0xf200, 0xf701, 0xf205, 0xf114, 0xf603, + 0xf118, 0xf601, 0xf602, 0xf117, 0xf600, 0xf119, 0xf115, 0xf20c, + 0xf11a, 0xf10c, 0xf10d, 0xf11b, 0xf11c, 0xf110, 0xf311, 0xf11d, + 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, 0xf200, +}; + +ushort *key_maps[MAX_NR_KEYMAPS] = { + plain_map, shift_map, altgr_map, NULL, + ctrl_map, shift_ctrl_map, NULL, NULL, + alt_map, NULL, NULL, NULL, + ctrl_alt_map, NULL +}; + +unsigned int keymap_count = 7; + +/* + * Philosophy: most people do not define more strings, but they who do + * often want quite a lot of string space. So, we statically allocate + * the default and allocate dynamically in chunks of 512 bytes. + */ + +char func_buf[] = { + '\033', '[', '[', 'A', 0, + '\033', '[', '[', 'B', 0, + '\033', '[', '[', 'C', 0, + '\033', '[', '[', 'D', 0, + '\033', '[', '[', 'E', 0, + '\033', '[', '1', '7', '~', 0, + '\033', '[', '1', '8', '~', 0, + '\033', '[', '1', '9', '~', 0, + '\033', '[', '2', '0', '~', 0, + '\033', '[', '2', '1', '~', 0, + '\033', '[', '2', '3', '~', 0, + '\033', '[', '2', '4', '~', 0, + '\033', '[', '2', '5', '~', 0, + '\033', '[', '2', '6', '~', 0, + '\033', '[', '2', '8', '~', 0, + '\033', '[', '2', '9', '~', 0, + '\033', '[', '3', '1', '~', 0, + '\033', '[', '3', '2', '~', 0, + '\033', '[', '3', '3', '~', 0, + '\033', '[', '3', '4', '~', 0, + '\033', '[', '1', '~', 0, + '\033', '[', '2', '~', 0, + '\033', '[', '3', '~', 0, + '\033', '[', '4', '~', 0, + '\033', '[', '5', '~', 0, + '\033', '[', '6', '~', 0, + '\033', '[', 'M', 0, + '\033', '[', 'P', 0, +}; + +char *funcbufptr = func_buf; +int funcbufsize = sizeof(func_buf); +int funcbufleft = 0; /* space left */ + +char *func_table[MAX_NR_FUNC] = { + func_buf + 0, + func_buf + 5, + func_buf + 10, + func_buf + 15, + func_buf + 20, + func_buf + 25, + func_buf + 31, + func_buf + 37, + func_buf + 43, + func_buf + 49, + func_buf + 55, + func_buf + 61, + func_buf + 67, + func_buf + 73, + func_buf + 79, + func_buf + 85, + func_buf + 91, + func_buf + 97, + func_buf + 103, + func_buf + 109, + func_buf + 115, + func_buf + 120, + func_buf + 125, + func_buf + 130, + func_buf + 135, + func_buf + 140, + func_buf + 145, + NULL, + NULL, + func_buf + 149, + NULL, +}; + +struct kbdiacruc accent_table[MAX_DIACR] = { + {'`', 'A', 0300}, {'`', 'a', 0340}, + {'\'', 'A', 0301}, {'\'', 'a', 0341}, + {'^', 'A', 0302}, {'^', 'a', 0342}, + {'~', 'A', 0303}, {'~', 'a', 0343}, + {'"', 'A', 0304}, {'"', 'a', 0344}, + {'O', 'A', 0305}, {'o', 'a', 0345}, + {'0', 'A', 0305}, {'0', 'a', 0345}, + {'A', 'A', 0305}, {'a', 'a', 0345}, + {'A', 'E', 0306}, {'a', 'e', 0346}, + {',', 'C', 0307}, {',', 'c', 0347}, + {'`', 'E', 0310}, {'`', 'e', 0350}, + {'\'', 'E', 0311}, {'\'', 'e', 0351}, + {'^', 'E', 0312}, {'^', 'e', 0352}, + {'"', 'E', 0313}, {'"', 'e', 0353}, + {'`', 'I', 0314}, {'`', 'i', 0354}, + {'\'', 'I', 0315}, {'\'', 'i', 0355}, + {'^', 'I', 0316}, {'^', 'i', 0356}, + {'"', 'I', 0317}, {'"', 'i', 0357}, + {'-', 'D', 0320}, {'-', 'd', 0360}, + {'~', 'N', 0321}, {'~', 'n', 0361}, + {'`', 'O', 0322}, {'`', 'o', 0362}, + {'\'', 'O', 0323}, {'\'', 'o', 0363}, + {'^', 'O', 0324}, 
{'^', 'o', 0364}, + {'~', 'O', 0325}, {'~', 'o', 0365}, + {'"', 'O', 0326}, {'"', 'o', 0366}, + {'/', 'O', 0330}, {'/', 'o', 0370}, + {'`', 'U', 0331}, {'`', 'u', 0371}, + {'\'', 'U', 0332}, {'\'', 'u', 0372}, + {'^', 'U', 0333}, {'^', 'u', 0373}, + {'"', 'U', 0334}, {'"', 'u', 0374}, + {'\'', 'Y', 0335}, {'\'', 'y', 0375}, + {'T', 'H', 0336}, {'t', 'h', 0376}, + {'s', 's', 0337}, {'"', 'y', 0377}, + {'s', 'z', 0337}, {'i', 'j', 0377}, +}; + +unsigned int accent_table_size = 68; diff --git a/fs/exec.c b/fs/exec.c index 6d2b6f936858..56536ad0e7cc 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -19,7 +19,7 @@ * current->executable is only used by the procfs. This allows a dispatch * table to check for several different types of binary formats. We keep * trying until we recognize the file or we run out of supported binary - * formats. + * formats. */ #include @@ -55,6 +55,8 @@ #include #include +#include + #include #include #include @@ -78,7 +80,7 @@ int __register_binfmt(struct linux_binfmt * fmt, int insert) insert ? list_add(&fmt->lh, &formats) : list_add_tail(&fmt->lh, &formats); write_unlock(&binfmt_lock); - return 0; + return 0; } EXPORT_SYMBOL(__register_binfmt); @@ -1064,7 +1066,7 @@ void setup_new_exec(struct linux_binprm * bprm) group */ current->self_exec_id++; - + flush_signal_handlers(current, 0); flush_old_files(current->files); } @@ -1154,8 +1156,8 @@ int check_unsafe_exec(struct linux_binprm *bprm) return res; } -/* - * Fill the binprm structure from the inode. +/* + * Fill the binprm structure from the inode. * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes * * This may be called multiple times for binary chains (scripts for example). @@ -1367,6 +1369,7 @@ int do_execve(const char * filename, goto out_unmark; sched_exec(); + litmus_exec(); bprm->file = file; bprm->filename = filename; diff --git a/fs/inode.c b/fs/inode.c index 86464332e590..d4fe9c031864 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -266,6 +266,8 @@ void inode_init_once(struct inode *inode) #ifdef CONFIG_FSNOTIFY INIT_HLIST_HEAD(&inode->i_fsnotify_marks); #endif + INIT_LIST_HEAD(&inode->i_obj_list); + mutex_init(&inode->i_obj_mutex); } EXPORT_SYMBOL(inode_init_once); diff --git a/include/linux/completion.h b/include/linux/completion.h index 51e3145196f6..c63950e8a863 100644 --- a/include/linux/completion.h +++ b/include/linux/completion.h @@ -90,6 +90,7 @@ extern bool completion_done(struct completion *x); extern void complete(struct completion *); extern void complete_all(struct completion *); +extern void complete_n(struct completion *, int n); /** * INIT_COMPLETION: - reinitialize a completion structure diff --git a/include/linux/fs.h b/include/linux/fs.h index 63d069bd80b7..29a672458d27 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -16,8 +16,8 @@ * nr_file rlimit, so it's safe to set up a ridiculously high absolute * upper limit on files-per-process. * - * Some programs (notably those using select()) may have to be - * recompiled to take full advantage of the new limits.. + * Some programs (notably those using select()) may have to be + * recompiled to take full advantage of the new limits.. */ /* Fixed constants first: */ @@ -172,7 +172,7 @@ struct inodes_stat_t { #define SEL_EX 4 /* public flags for file_system_type */ -#define FS_REQUIRES_DEV 1 +#define FS_REQUIRES_DEV 1 #define FS_BINARY_MOUNTDATA 2 #define FS_HAS_SUBTYPE 4 #define FS_REVAL_DOT 16384 /* Check the paths ".", ".." 
for staleness */ @@ -470,7 +470,7 @@ struct iattr { */ #include -/** +/** * enum positive_aop_returns - aop return codes with specific semantics * * @AOP_WRITEPAGE_ACTIVATE: Informs the caller that page writeback has @@ -480,7 +480,7 @@ struct iattr { * be a candidate for writeback again in the near * future. Other callers must be careful to unlock * the page if they get this return. Returned by - * writepage(); + * writepage(); * * @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked page has * unlocked it and the page might have been truncated. @@ -721,6 +721,7 @@ static inline int mapping_writably_mapped(struct address_space *mapping) struct posix_acl; #define ACL_NOT_CACHED ((void *)(-1)) +struct inode_obj_id_table; struct inode { struct hlist_node i_hash; @@ -784,6 +785,8 @@ struct inode { struct posix_acl *i_acl; struct posix_acl *i_default_acl; #endif + struct list_head i_obj_list; + struct mutex i_obj_mutex; void *i_private; /* fs or device private pointer */ }; @@ -997,10 +1000,10 @@ static inline int file_check_writeable(struct file *filp) #define MAX_NON_LFS ((1UL<<31) - 1) -/* Page cache limit. The filesystems should put that into their s_maxbytes - limits, otherwise bad things can happen in VM. */ +/* Page cache limit. The filesystems should put that into their s_maxbytes + limits, otherwise bad things can happen in VM. */ #if BITS_PER_LONG==32 -#define MAX_LFS_FILESIZE (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1) +#define MAX_LFS_FILESIZE (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1) #elif BITS_PER_LONG==64 #define MAX_LFS_FILESIZE 0x7fffffffffffffffUL #endif @@ -2145,7 +2148,7 @@ extern int may_open(struct path *, int, int); extern int kernel_read(struct file *, loff_t, char *, unsigned long); extern struct file * open_exec(const char *); - + /* fs/dcache.c -- generic fs support functions */ extern int is_subdir(struct dentry *, struct dentry *); extern int path_is_under(struct path *, struct path *); diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index fd0c1b857d3d..76da541c1f66 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -167,6 +167,7 @@ struct hrtimer_clock_base { * @nr_retries: Total number of hrtimer interrupt retries * @nr_hangs: Total number of hrtimer interrupt hangs * @max_hang_time: Maximum time spent in hrtimer_interrupt + * @to_pull: LITMUS^RT list of timers to be pulled on this cpu */ struct hrtimer_cpu_base { raw_spinlock_t lock; @@ -180,8 +181,32 @@ struct hrtimer_cpu_base { unsigned long nr_hangs; ktime_t max_hang_time; #endif + struct list_head to_pull; }; +#ifdef CONFIG_ARCH_HAS_SEND_PULL_TIMERS + +#define HRTIMER_START_ON_INACTIVE 0 +#define HRTIMER_START_ON_QUEUED 1 + +/* + * struct hrtimer_start_on_info - save timer info on remote cpu + * @list: list of hrtimer_start_on_info on remote cpu (to_pull) + * @timer: timer to be triggered on remote cpu + * @time: time event + * @mode: timer mode + * @state: activity flag + */ +struct hrtimer_start_on_info { + struct list_head list; + struct hrtimer *timer; + ktime_t time; + enum hrtimer_mode mode; + atomic_t state; +}; + +#endif + static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time) { timer->_expires = time; @@ -348,6 +373,13 @@ __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_ns, const enum hrtimer_mode mode, int wakeup); +#ifdef CONFIG_ARCH_HAS_SEND_PULL_TIMERS +extern void hrtimer_start_on_info_init(struct hrtimer_start_on_info *info); +extern int hrtimer_start_on(int cpu, struct 
hrtimer_start_on_info *info, + struct hrtimer *timer, ktime_t time, + const enum hrtimer_mode mode); +#endif + extern int hrtimer_cancel(struct hrtimer *timer); extern int hrtimer_try_to_cancel(struct hrtimer *timer); diff --git a/include/linux/sched.h b/include/linux/sched.h index 1e2a6db2d7dd..c9ac4fc837ba 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -38,6 +38,7 @@ #define SCHED_BATCH 3 /* SCHED_ISO: reserved but not implemented yet */ #define SCHED_IDLE 5 +#define SCHED_LITMUS 6 /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ #define SCHED_RESET_ON_FORK 0x40000000 @@ -94,6 +95,9 @@ struct sched_param { #include +#include +#include + struct exec_domain; struct futex_pi_state; struct robust_list_head; @@ -1159,6 +1163,7 @@ struct sched_rt_entity { }; struct rcu_node; +struct od_table_entry; struct task_struct { volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ @@ -1243,9 +1248,9 @@ struct task_struct { unsigned long stack_canary; #endif - /* + /* * pointers to (original) parent process, youngest child, younger sibling, - * older sibling, respectively. (p->father can be replaced with + * older sibling, respectively. (p->father can be replaced with * p->real_parent->pid) */ struct task_struct *real_parent; /* real parent process */ @@ -1453,6 +1458,13 @@ struct task_struct { int make_it_fail; #endif struct prop_local_single dirties; + + /* LITMUS RT parameters and state */ + struct rt_param rt_param; + + /* references to PI semaphores, etc. */ + struct od_table_entry *od_table; + #ifdef CONFIG_LATENCYTOP int latency_record_count; struct latency_record latency_record[LT_SAVECOUNT]; @@ -2014,7 +2026,7 @@ static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, s spin_unlock_irqrestore(&tsk->sighand->siglock, flags); return ret; -} +} extern void block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask); @@ -2290,6 +2302,7 @@ static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag) static inline void set_tsk_need_resched(struct task_struct *tsk) { set_tsk_thread_flag(tsk,TIF_NEED_RESCHED); + sched_state_will_schedule(tsk); } static inline void clear_tsk_need_resched(struct task_struct *tsk) diff --git a/include/linux/smp.h b/include/linux/smp.h index cfa2d20e35f1..f86d40768e7f 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -79,6 +79,11 @@ void __smp_call_function_single(int cpuid, struct call_single_data *data, int smp_call_function_any(const struct cpumask *mask, void (*func)(void *info), void *info, int wait); +/* + * sends a 'pull timer' event to a remote CPU + */ +extern void smp_send_pull_timers(int cpu); + /* * Generic and arch helpers */ diff --git a/include/linux/tick.h b/include/linux/tick.h index b232ccc0ee29..1e29bd5b18af 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -74,6 +74,11 @@ extern int tick_is_oneshot_available(void); extern struct tick_device *tick_get_device(int cpu); # ifdef CONFIG_HIGH_RES_TIMERS +/* LITMUS^RT tick alignment */ +#define LINUX_DEFAULT_TICKS 0 +#define LITMUS_ALIGNED_TICKS 1 +#define LITMUS_STAGGERED_TICKS 2 + extern int tick_init_highres(void); extern int tick_program_event(ktime_t expires, int force); extern void tick_setup_sched_timer(void); diff --git a/include/litmus/bheap.h b/include/litmus/bheap.h new file mode 100644 index 000000000000..cf4864a498d8 --- /dev/null +++ b/include/litmus/bheap.h @@ -0,0 +1,77 @@ +/* bheaps.h -- Binomial Heaps + * + * (c) 2008, 2009 Bjoern Brandenburg + */ + 
+#ifndef BHEAP_H +#define BHEAP_H + +#define NOT_IN_HEAP UINT_MAX + +struct bheap_node { + struct bheap_node* parent; + struct bheap_node* next; + struct bheap_node* child; + + unsigned int degree; + void* value; + struct bheap_node** ref; +}; + +struct bheap { + struct bheap_node* head; + /* We cache the minimum of the heap. + * This speeds up repeated peek operations. + */ + struct bheap_node* min; +}; + +typedef int (*bheap_prio_t)(struct bheap_node* a, struct bheap_node* b); + +void bheap_init(struct bheap* heap); +void bheap_node_init(struct bheap_node** ref_to_bheap_node_ptr, void* value); + +static inline int bheap_node_in_heap(struct bheap_node* h) +{ + return h->degree != NOT_IN_HEAP; +} + +static inline int bheap_empty(struct bheap* heap) +{ + return heap->head == NULL && heap->min == NULL; +} + +/* insert (and reinitialize) a node into the heap */ +void bheap_insert(bheap_prio_t higher_prio, + struct bheap* heap, + struct bheap_node* node); + +/* merge addition into target */ +void bheap_union(bheap_prio_t higher_prio, + struct bheap* target, + struct bheap* addition); + +struct bheap_node* bheap_peek(bheap_prio_t higher_prio, + struct bheap* heap); + +struct bheap_node* bheap_take(bheap_prio_t higher_prio, + struct bheap* heap); + +void bheap_uncache_min(bheap_prio_t higher_prio, struct bheap* heap); +int bheap_decrease(bheap_prio_t higher_prio, struct bheap_node* node); + +void bheap_delete(bheap_prio_t higher_prio, + struct bheap* heap, + struct bheap_node* node); + +/* allocate from memcache */ +struct bheap_node* bheap_node_alloc(int gfp_flags); +void bheap_node_free(struct bheap_node* hn); + +/* allocate a heap node for value and insert into the heap */ +int bheap_add(bheap_prio_t higher_prio, struct bheap* heap, + void* value, int gfp_flags); + +void* bheap_take_del(bheap_prio_t higher_prio, + struct bheap* heap); +#endif diff --git a/include/litmus/budget.h b/include/litmus/budget.h new file mode 100644 index 000000000000..732530e63491 --- /dev/null +++ b/include/litmus/budget.h @@ -0,0 +1,8 @@ +#ifndef _LITMUS_BUDGET_H_ +#define _LITMUS_BUDGET_H_ + +/* Update the per-processor enforcement timer (arm/reproram/cancel) for + * the next task. */ +void update_enforcement_timer(struct task_struct* t); + +#endif diff --git a/include/litmus/clustered.h b/include/litmus/clustered.h new file mode 100644 index 000000000000..0c18dcb15e6c --- /dev/null +++ b/include/litmus/clustered.h @@ -0,0 +1,44 @@ +#ifndef CLUSTERED_H +#define CLUSTERED_H + +/* Which cache level should be used to group CPUs into clusters? + * GLOBAL_CLUSTER means that all CPUs form a single cluster (just like under + * global scheduling). + */ +enum cache_level { + GLOBAL_CLUSTER = 0, + L1_CLUSTER = 1, + L2_CLUSTER = 2, + L3_CLUSTER = 3 +}; + +int parse_cache_level(const char *str, enum cache_level *level); +const char* cache_level_name(enum cache_level level); + +/* expose a cache level in a /proc dir */ +struct proc_dir_entry* create_cluster_file(struct proc_dir_entry* parent, + enum cache_level* level); + + + +struct scheduling_cluster { + unsigned int id; + /* list of CPUs that are part of this cluster */ + struct list_head cpus; +}; + +struct cluster_cpu { + unsigned int id; /* which CPU is this? */ + struct list_head cluster_list; /* List of the CPUs in this cluster. */ + struct scheduling_cluster* cluster; /* The cluster that this CPU belongs to. 
*/ +}; + +int get_cluster_size(enum cache_level level); + +int assign_cpus_to_clusters(enum cache_level level, + struct scheduling_cluster* clusters[], + unsigned int num_clusters, + struct cluster_cpu* cpus[], + unsigned int num_cpus); + +#endif diff --git a/include/litmus/debug_trace.h b/include/litmus/debug_trace.h new file mode 100644 index 000000000000..48d086d5a44c --- /dev/null +++ b/include/litmus/debug_trace.h @@ -0,0 +1,37 @@ +#ifndef LITMUS_DEBUG_TRACE_H +#define LITMUS_DEBUG_TRACE_H + +#ifdef CONFIG_SCHED_DEBUG_TRACE +void sched_trace_log_message(const char* fmt, ...); +void dump_trace_buffer(int max); +#else + +#define sched_trace_log_message(fmt, ...) + +#endif + +extern atomic_t __log_seq_no; + +#ifdef CONFIG_SCHED_DEBUG_TRACE_CALLER +#define TRACE_PREFIX "%d P%d [%s@%s:%d]: " +#define TRACE_ARGS atomic_add_return(1, &__log_seq_no), \ + raw_smp_processor_id(), \ + __FUNCTION__, __FILE__, __LINE__ +#else +#define TRACE_PREFIX "%d P%d: " +#define TRACE_ARGS atomic_add_return(1, &__log_seq_no), \ + raw_smp_processor_id() +#endif + +#define TRACE(fmt, args...) \ + sched_trace_log_message(TRACE_PREFIX fmt, \ + TRACE_ARGS, ## args) + +#define TRACE_TASK(t, fmt, args...) \ + TRACE("(%s/%d:%d) " fmt, (t)->comm, (t)->pid, \ + (t)->rt_param.job_params.job_no, ##args) + +#define TRACE_CUR(fmt, args...) \ + TRACE_TASK(current, fmt, ## args) + +#endif diff --git a/include/litmus/edf_common.h b/include/litmus/edf_common.h new file mode 100644 index 000000000000..2c4266f77c03 --- /dev/null +++ b/include/litmus/edf_common.h @@ -0,0 +1,33 @@ +/* + * EDF common data structures and utility functions shared by all EDF + * based scheduler plugins + */ + +/* CLEANUP: Add comments and make it less messy. + * + */ + +#ifndef __UNC_EDF_COMMON_H__ +#define __UNC_EDF_COMMON_H__ + +#include + +void edf_domain_init(rt_domain_t* rt, check_resched_needed_t resched, + release_jobs_t release); + +int edf_higher_prio(struct task_struct* first, + struct task_struct* second); + +#ifdef CONFIG_LITMUS_LOCKING +/* priority comparison without priority inheritance */ +int edf_higher_base_prio(struct task_struct* first, + struct task_struct* second); + +int edf_pending_order(struct bheap_node* a, struct bheap_node* b); +#endif + +int edf_ready_order(struct bheap_node* a, struct bheap_node* b); + +int edf_preemption_needed(rt_domain_t* rt, struct task_struct *t); + +#endif diff --git a/include/litmus/fdso.h b/include/litmus/fdso.h new file mode 100644 index 000000000000..d1ee0d1142d8 --- /dev/null +++ b/include/litmus/fdso.h @@ -0,0 +1,77 @@ +/* fdso.h - file descriptor attached shared objects + * + * (c) 2007--2011 B. 
Brandenburg, LITMUS^RT project + */ + +#ifndef _LINUX_FDSO_H_ +#define _LINUX_FDSO_H_ + +#include +#include + +#include +#include + +#define MAX_OBJECT_DESCRIPTORS 85 + +typedef enum { + MIN_OBJ_TYPE = 0, + + FMLP_SEM = 0, + SRP_SEM = 1, + + MPCP_SEM = 2, + MPCP_VS_SEM = 3, + DPCP_SEM = 4, + + OMLP_SEM = 5, + + MAX_OBJ_TYPE = 5 +} obj_type_t; + +struct inode_obj_id { + struct list_head list; + atomic_t count; + struct inode* inode; + + obj_type_t type; + void* obj; + unsigned int id; +}; + +struct fdso_ops; + +struct od_table_entry { + unsigned int used; + + struct inode_obj_id* obj; + const struct fdso_ops* class; +}; + +struct fdso_ops { + int (*create)(void** obj_ref, obj_type_t type, void* __user); + void (*destroy)(obj_type_t type, void*); + int (*open) (struct od_table_entry*, void* __user); + int (*close) (struct od_table_entry*); +}; + +/* translate a userspace supplied od into the raw table entry + * returns NULL if od is invalid + */ +struct od_table_entry* get_entry_for_od(int od); + +/* translate a userspace supplied od into the associated object + * returns NULL if od is invalid + */ +static inline void* od_lookup(int od, obj_type_t type) +{ + struct od_table_entry* e = get_entry_for_od(od); + return e && e->obj->type == type ? e->obj->obj : NULL; +} + +#define lookup_fmlp_sem(od)((struct pi_semaphore*) od_lookup(od, FMLP_SEM)) +#define lookup_srp_sem(od) ((struct srp_semaphore*) od_lookup(od, SRP_SEM)) +#define lookup_ics(od) ((struct ics*) od_lookup(od, ICS_ID)) + + +#endif diff --git a/include/litmus/feather_buffer.h b/include/litmus/feather_buffer.h new file mode 100644 index 000000000000..6c18277fdfc9 --- /dev/null +++ b/include/litmus/feather_buffer.h @@ -0,0 +1,94 @@ +#ifndef _FEATHER_BUFFER_H_ +#define _FEATHER_BUFFER_H_ + +/* requires UINT_MAX and memcpy */ + +#define SLOT_FREE 0 +#define SLOT_BUSY 1 +#define SLOT_READY 2 + +struct ft_buffer { + unsigned int slot_count; + unsigned int slot_size; + + int free_count; + unsigned int write_idx; + unsigned int read_idx; + + char* slots; + void* buffer_mem; + unsigned int failed_writes; +}; + +static inline int init_ft_buffer(struct ft_buffer* buf, + unsigned int slot_count, + unsigned int slot_size, + char* slots, + void* buffer_mem) +{ + int i = 0; + if (!slot_count || UINT_MAX % slot_count != slot_count - 1) { + /* The slot count must divide UNIT_MAX + 1 so that when it + * wraps around the index correctly points to 0. 
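+ * Worked example (32-bit unsigned indices assumed): UINT_MAX + 1 = 2^32,
+ * so any power-of-two slot_count such as 256 passes the check
+ * (UINT_MAX % 256 == 255 == slot_count - 1), whereas slot_count == 100
+ * fails it (UINT_MAX % 100 == 95) and the buffer is rejected.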
+ */ + return 0; + } else { + buf->slot_count = slot_count; + buf->slot_size = slot_size; + buf->slots = slots; + buf->buffer_mem = buffer_mem; + buf->free_count = slot_count; + buf->write_idx = 0; + buf->read_idx = 0; + buf->failed_writes = 0; + for (i = 0; i < slot_count; i++) + buf->slots[i] = SLOT_FREE; + return 1; + } +} + +static inline int ft_buffer_start_write(struct ft_buffer* buf, void **ptr) +{ + int free = fetch_and_dec(&buf->free_count); + unsigned int idx; + if (free <= 0) { + fetch_and_inc(&buf->free_count); + *ptr = 0; + fetch_and_inc(&buf->failed_writes); + return 0; + } else { + idx = fetch_and_inc((int*) &buf->write_idx) % buf->slot_count; + buf->slots[idx] = SLOT_BUSY; + *ptr = ((char*) buf->buffer_mem) + idx * buf->slot_size; + return 1; + } +} + +static inline void ft_buffer_finish_write(struct ft_buffer* buf, void *ptr) +{ + unsigned int idx = ((char*) ptr - (char*) buf->buffer_mem) / buf->slot_size; + buf->slots[idx] = SLOT_READY; +} + + +/* exclusive reader access is assumed */ +static inline int ft_buffer_read(struct ft_buffer* buf, void* dest) +{ + unsigned int idx; + if (buf->free_count == buf->slot_count) + /* nothing available */ + return 0; + idx = buf->read_idx % buf->slot_count; + if (buf->slots[idx] == SLOT_READY) { + memcpy(dest, ((char*) buf->buffer_mem) + idx * buf->slot_size, + buf->slot_size); + buf->slots[idx] = SLOT_FREE; + buf->read_idx++; + fetch_and_inc(&buf->free_count); + return 1; + } else + return 0; +} + + +#endif diff --git a/include/litmus/feather_trace.h b/include/litmus/feather_trace.h new file mode 100644 index 000000000000..028dfb206fb0 --- /dev/null +++ b/include/litmus/feather_trace.h @@ -0,0 +1,65 @@ +#ifndef _FEATHER_TRACE_H_ +#define _FEATHER_TRACE_H_ + +#include + +int ft_enable_event(unsigned long id); +int ft_disable_event(unsigned long id); +int ft_is_event_enabled(unsigned long id); +int ft_disable_all_events(void); + +/* atomic_* funcitons are inline anyway */ +static inline int fetch_and_inc(int *val) +{ + return atomic_add_return(1, (atomic_t*) val) - 1; +} + +static inline int fetch_and_dec(int *val) +{ + return atomic_sub_return(1, (atomic_t*) val) + 1; +} + +/* Don't use rewriting implementation if kernel text pages are read-only. + * Ftrace gets around this by using the identity mapping, but that's more + * effort that is warrented right now for Feather-Trace. + * Eventually, it may make sense to replace Feather-Trace with ftrace. + */ +#if defined(CONFIG_ARCH_HAS_FEATHER_TRACE) && !defined(CONFIG_DEBUG_RODATA) + +#include + +#else /* !__ARCH_HAS_FEATHER_TRACE */ + +/* provide default implementation */ + +#include /* for get_cycles() */ + +static inline unsigned long long ft_timestamp(void) +{ + return get_cycles(); +} + +#define feather_callback + +#define MAX_EVENTS 1024 + +extern int ft_events[MAX_EVENTS]; + +#define ft_event(id, callback) \ + if (ft_events[id]) callback(); + +#define ft_event0(id, callback) \ + if (ft_events[id]) callback(id); + +#define ft_event1(id, callback, param) \ + if (ft_events[id]) callback(id, param); + +#define ft_event2(id, callback, param, param2) \ + if (ft_events[id]) callback(id, param, param2); + +#define ft_event3(id, callback, p, p2, p3) \ + if (ft_events[id]) callback(id, p, p2, p3); + +#endif /* __ARCH_HAS_FEATHER_TRACE */ + +#endif diff --git a/include/litmus/fp_common.h b/include/litmus/fp_common.h new file mode 100644 index 000000000000..dd1f7bf1e347 --- /dev/null +++ b/include/litmus/fp_common.h @@ -0,0 +1,105 @@ +/* Fixed-priority scheduler support. 
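+ *
+ * Ready jobs are kept in one bheap per priority level plus a bitmask
+ * with one bit per level; a bit is set while the corresponding queue is
+ * non-empty. fpq_find() below scans the bitmask word by word and uses
+ * __ffs() to return the lowest non-empty index. Illustrative example
+ * (64-bit words assumed): priority 70 maps to bit 70 % 64 == 6 of
+ * bitmask word 70 / 64 == 1.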
+ */ + +#ifndef __FP_COMMON_H__ +#define __FP_COMMON_H__ + +#include + +#include + + +void fp_domain_init(rt_domain_t* rt, check_resched_needed_t resched, + release_jobs_t release); + +int fp_higher_prio(struct task_struct* first, + struct task_struct* second); + +int fp_ready_order(struct bheap_node* a, struct bheap_node* b); + +#define FP_PRIO_BIT_WORDS (LITMUS_MAX_PRIORITY / BITS_PER_LONG) + +#if (LITMUS_MAX_PRIORITY % BITS_PER_LONG) +#error LITMUS_MAX_PRIORITY must be a multiple of BITS_PER_LONG +#endif + +/* bitmask-inexed priority queue */ +struct fp_prio_queue { + unsigned long bitmask[FP_PRIO_BIT_WORDS]; + struct bheap queue[LITMUS_MAX_PRIORITY]; +}; + +void fp_prio_queue_init(struct fp_prio_queue* q); + +static inline void fpq_set(struct fp_prio_queue* q, unsigned int index) +{ + unsigned long *word = q->bitmask + (index / BITS_PER_LONG); + __set_bit(index % BITS_PER_LONG, word); +} + +static inline void fpq_clear(struct fp_prio_queue* q, unsigned int index) +{ + unsigned long *word = q->bitmask + (index / BITS_PER_LONG); + __clear_bit(index % BITS_PER_LONG, word); +} + +static inline unsigned int fpq_find(struct fp_prio_queue* q) +{ + int i; + + /* loop optimizer should unroll this */ + for (i = 0; i < FP_PRIO_BIT_WORDS; i++) + if (q->bitmask[i]) + return __ffs(q->bitmask[i]) + i * BITS_PER_LONG; + + return LITMUS_MAX_PRIORITY; /* nothing found */ +} + +static inline void fp_prio_add(struct fp_prio_queue* q, struct task_struct* t, unsigned int index) +{ + + BUG_ON(bheap_node_in_heap(tsk_rt(t)->heap_node)); + + fpq_set(q, index); + bheap_insert(fp_ready_order, &q->queue[index], tsk_rt(t)->heap_node); +} + +static inline void fp_prio_remove(struct fp_prio_queue* q, struct task_struct* t, unsigned int index) +{ + BUG_ON(!is_queued(t)); + + bheap_delete(fp_ready_order, &q->queue[index], tsk_rt(t)->heap_node); + if (likely(bheap_empty(&q->queue[index]))) + fpq_clear(q, index); +} + +static inline struct task_struct* fp_prio_peek(struct fp_prio_queue* q) +{ + unsigned int idx = fpq_find(q); + struct bheap_node* hn; + + if (idx < LITMUS_MAX_PRIORITY) { + hn = bheap_peek(fp_ready_order, &q->queue[idx]); + return bheap2task(hn); + } else + return NULL; +} + +static inline struct task_struct* fp_prio_take(struct fp_prio_queue* q) +{ + unsigned int idx = fpq_find(q); + struct bheap_node* hn; + + if (idx < LITMUS_MAX_PRIORITY) { + hn = bheap_take(fp_ready_order, &q->queue[idx]); + if (likely(bheap_empty(&q->queue[idx]))) + fpq_clear(q, idx); + return bheap2task(hn); + } else + return NULL; +} + +int fp_preemption_needed(struct fp_prio_queue* q, struct task_struct *t); + + +#endif diff --git a/include/litmus/ftdev.h b/include/litmus/ftdev.h new file mode 100644 index 000000000000..0b959874dd70 --- /dev/null +++ b/include/litmus/ftdev.h @@ -0,0 +1,55 @@ +#ifndef _LITMUS_FTDEV_H_ +#define _LITMUS_FTDEV_H_ + +#include +#include +#include +#include + +#define FTDEV_ENABLE_CMD 0 +#define FTDEV_DISABLE_CMD 1 + +struct ftdev; + +/* return 0 if buffer can be opened, otherwise -$REASON */ +typedef int (*ftdev_can_open_t)(struct ftdev* dev, unsigned int buf_no); +/* return 0 on success, otherwise -$REASON */ +typedef int (*ftdev_alloc_t)(struct ftdev* dev, unsigned int buf_no); +typedef void (*ftdev_free_t)(struct ftdev* dev, unsigned int buf_no); +/* Let devices handle writes from userspace. No synchronization provided. 
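+ * The callback receives the minor's ft_buffer, the requested length, and
+ * the raw user-space pointer; validating and copying the data (e.g. via
+ * copy_from_user()) is presumably left entirely to the device.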
*/ +typedef ssize_t (*ftdev_write_t)(struct ft_buffer* buf, size_t len, const char __user *from); + +struct ftdev_event; + +struct ftdev_minor { + struct ft_buffer* buf; + unsigned int readers; + struct mutex lock; + /* FIXME: filter for authorized events */ + struct ftdev_event* events; + struct device* device; + struct ftdev* ftdev; +}; + +struct ftdev { + dev_t major; + struct cdev cdev; + struct class* class; + const char* name; + struct ftdev_minor* minor; + unsigned int minor_cnt; + ftdev_alloc_t alloc; + ftdev_free_t free; + ftdev_can_open_t can_open; + ftdev_write_t write; +}; + +struct ft_buffer* alloc_ft_buffer(unsigned int count, size_t size); +void free_ft_buffer(struct ft_buffer* buf); + +int ftdev_init( struct ftdev* ftdev, struct module* owner, + const int minor_cnt, const char* name); +void ftdev_exit(struct ftdev* ftdev); +int register_ftdev(struct ftdev* ftdev); + +#endif diff --git a/include/litmus/jobs.h b/include/litmus/jobs.h new file mode 100644 index 000000000000..9bd361ef3943 --- /dev/null +++ b/include/litmus/jobs.h @@ -0,0 +1,9 @@ +#ifndef __LITMUS_JOBS_H__ +#define __LITMUS_JOBS_H__ + +void prepare_for_next_period(struct task_struct *t); +void release_at(struct task_struct *t, lt_t start); +long complete_job(void); + +#endif + diff --git a/include/litmus/litmus.h b/include/litmus/litmus.h new file mode 100644 index 000000000000..31ac72eddef7 --- /dev/null +++ b/include/litmus/litmus.h @@ -0,0 +1,292 @@ +/* + * Constant definitions related to + * scheduling policy. + */ + +#ifndef _LINUX_LITMUS_H_ +#define _LINUX_LITMUS_H_ + +#include + +#ifdef CONFIG_RELEASE_MASTER +extern atomic_t release_master_cpu; +#endif + +/* in_list - is a given list_head queued on some list? + */ +static inline int in_list(struct list_head* list) +{ + return !( /* case 1: deleted */ + (list->next == LIST_POISON1 && + list->prev == LIST_POISON2) + || + /* case 2: initialized */ + (list->next == list && + list->prev == list) + ); +} + +#define NO_CPU 0xffffffff + +void litmus_fork(struct task_struct *tsk); +void litmus_exec(void); +/* clean up real-time state of a task */ +void exit_litmus(struct task_struct *dead_tsk); + +long litmus_admit_task(struct task_struct *tsk); +void litmus_exit_task(struct task_struct *tsk); + +#define is_realtime(t) ((t)->policy == SCHED_LITMUS) +#define rt_transition_pending(t) \ + ((t)->rt_param.transition_pending) + +#define tsk_rt(t) (&(t)->rt_param) + +/* Realtime utility macros */ +#define get_rt_flags(t) (tsk_rt(t)->flags) +#define set_rt_flags(t,f) (tsk_rt(t)->flags=(f)) +#define get_exec_cost(t) (tsk_rt(t)->task_params.exec_cost) +#define get_exec_time(t) (tsk_rt(t)->job_params.exec_time) +#define get_rt_period(t) (tsk_rt(t)->task_params.period) +#define get_rt_phase(t) (tsk_rt(t)->task_params.phase) +#define get_partition(t) (tsk_rt(t)->task_params.cpu) +#define get_priority(t) (tsk_rt(t)->task_params.priority) +#define get_deadline(t) (tsk_rt(t)->job_params.deadline) +#define get_release(t) (tsk_rt(t)->job_params.release) +#define get_class(t) (tsk_rt(t)->task_params.cls) + +#define is_priority_boosted(t) (tsk_rt(t)->priority_boosted) +#define get_boost_start(t) (tsk_rt(t)->boost_start_time) + +inline static int budget_exhausted(struct task_struct* t) +{ + return get_exec_time(t) >= get_exec_cost(t); +} + +inline static lt_t budget_remaining(struct task_struct* t) +{ + if (!budget_exhausted(t)) + return get_exec_cost(t) - get_exec_time(t); + else + /* avoid overflow */ + return 0; +} + +#define budget_enforced(t) (tsk_rt(t)->task_params.budget_policy 
!= NO_ENFORCEMENT) + +#define budget_precisely_enforced(t) (tsk_rt(t)->task_params.budget_policy \ + == PRECISE_ENFORCEMENT) + +#define is_hrt(t) \ + (tsk_rt(t)->task_params.cls == RT_CLASS_HARD) +#define is_srt(t) \ + (tsk_rt(t)->task_params.cls == RT_CLASS_SOFT) +#define is_be(t) \ + (tsk_rt(t)->task_params.cls == RT_CLASS_BEST_EFFORT) + +/* Our notion of time within LITMUS: kernel monotonic time. */ +static inline lt_t litmus_clock(void) +{ + return ktime_to_ns(ktime_get()); +} + +/* A macro to convert from nanoseconds to ktime_t. */ +#define ns_to_ktime(t) ktime_add_ns(ktime_set(0, 0), t) + +#define get_domain(t) (tsk_rt(t)->domain) + +/* Honor the flag in the preempt_count variable that is set + * when scheduling is in progress. + */ +#define is_running(t) \ + ((t)->state == TASK_RUNNING || \ + task_thread_info(t)->preempt_count & PREEMPT_ACTIVE) + +#define is_blocked(t) \ + (!is_running(t)) +#define is_released(t, now) \ + (lt_before_eq(get_release(t), now)) +#define is_tardy(t, now) \ + (lt_before_eq(tsk_rt(t)->job_params.deadline, now)) + +/* real-time comparison macros */ +#define earlier_deadline(a, b) (lt_before(\ + (a)->rt_param.job_params.deadline,\ + (b)->rt_param.job_params.deadline)) +#define earlier_release(a, b) (lt_before(\ + (a)->rt_param.job_params.release,\ + (b)->rt_param.job_params.release)) + +void preempt_if_preemptable(struct task_struct* t, int on_cpu); + +#ifdef CONFIG_LITMUS_LOCKING +void srp_ceiling_block(void); +#else +#define srp_ceiling_block() /* nothing */ +#endif + +#define bheap2task(hn) ((struct task_struct*) hn->value) + +static inline struct control_page* get_control_page(struct task_struct *t) +{ + return tsk_rt(t)->ctrl_page; +} + +static inline int has_control_page(struct task_struct* t) +{ + return tsk_rt(t)->ctrl_page != NULL; +} + +#ifdef CONFIG_NP_SECTION + +static inline int is_kernel_np(struct task_struct *t) +{ + return tsk_rt(t)->kernel_np; +} + +static inline int is_user_np(struct task_struct *t) +{ + return tsk_rt(t)->ctrl_page ? tsk_rt(t)->ctrl_page->sched.np.flag : 0; +} + +static inline void request_exit_np(struct task_struct *t) +{ + if (is_user_np(t)) { + /* Set the flag that tells user space to call + * into the kernel at the end of a critical section. */ + if (likely(tsk_rt(t)->ctrl_page)) { + TRACE_TASK(t, "setting delayed_preemption flag\n"); + tsk_rt(t)->ctrl_page->sched.np.preempt = 1; + } + } +} + +static inline void make_np(struct task_struct *t) +{ + tsk_rt(t)->kernel_np++; +} + +/* Caller should check if preemption is necessary when + * the function return 0. + */ +static inline int take_np(struct task_struct *t) +{ + return --tsk_rt(t)->kernel_np; +} + +/* returns 0 if remote CPU needs an IPI to preempt, 1 if no IPI is required */ +static inline int request_exit_np_atomic(struct task_struct *t) +{ + union np_flag old, new; + int ok; + + if (tsk_rt(t)->ctrl_page) { + old.raw = tsk_rt(t)->ctrl_page->sched.raw; + if (old.np.flag == 0) { + /* no longer non-preemptive */ + return 0; + } else if (old.np.preempt) { + /* already set, nothing for us to do */ + TRACE_TASK(t, "not setting np.preempt flag again\n"); + return 1; + } else { + /* non preemptive and flag not set */ + new.raw = old.raw; + new.np.preempt = 1; + /* if we get old back, then we atomically set the flag */ + ok = cmpxchg(&tsk_rt(t)->ctrl_page->sched.raw, old.raw, new.raw) == old.raw; + /* If we raced with a concurrent change, then so be + * it. Deliver it by IPI. 
We don't want an unbounded + * retry loop here since tasks might exploit that to + * keep the kernel busy indefinitely. */ + TRACE_TASK(t, "request_exit_np => %d\n", ok); + return ok; + } + } else + return 0; +} + +#else + +static inline int is_kernel_np(struct task_struct* t) +{ + return 0; +} + +static inline int is_user_np(struct task_struct* t) +{ + return 0; +} + +static inline void request_exit_np(struct task_struct *t) +{ + /* request_exit_np() shouldn't be called if !CONFIG_NP_SECTION */ + BUG(); +} + +static inline int request_exist_np_atomic(struct task_struct *t) +{ + return 0; +} + +#endif + +static inline void clear_exit_np(struct task_struct *t) +{ + if (likely(tsk_rt(t)->ctrl_page)) + tsk_rt(t)->ctrl_page->sched.np.preempt = 0; +} + +static inline int is_np(struct task_struct *t) +{ +#ifdef CONFIG_SCHED_DEBUG_TRACE + int kernel, user; + kernel = is_kernel_np(t); + user = is_user_np(t); + if (kernel || user) + TRACE_TASK(t, " is non-preemptive: kernel=%d user=%d\n", + + kernel, user); + return kernel || user; +#else + return unlikely(is_kernel_np(t) || is_user_np(t)); +#endif +} + +static inline int is_present(struct task_struct* t) +{ + return t && tsk_rt(t)->present; +} + + +/* make the unit explicit */ +typedef unsigned long quanta_t; + +enum round { + FLOOR, + CEIL +}; + + +/* Tick period is used to convert ns-specified execution + * costs and periods into tick-based equivalents. + */ +extern ktime_t tick_period; + +static inline quanta_t time2quanta(lt_t time, enum round round) +{ + s64 quantum_length = ktime_to_ns(tick_period); + + if (do_div(time, quantum_length) && round == CEIL) + time++; + return (quanta_t) time; +} + +/* By how much is cpu staggered behind CPU 0? */ +u64 cpu_stagger_offset(int cpu); + +#define TS_SYSCALL_IN_START \ + if (has_control_page(current)) \ + __TS_SYSCALL_IN_START(&get_control_page(current)->ts_syscall_start); + +#endif diff --git a/include/litmus/litmus_proc.h b/include/litmus/litmus_proc.h new file mode 100644 index 000000000000..6800e725d48c --- /dev/null +++ b/include/litmus/litmus_proc.h @@ -0,0 +1,25 @@ +#include +#include + +int __init init_litmus_proc(void); +void exit_litmus_proc(void); + +/* + * On success, returns 0 and sets the pointer to the location of the new + * proc dir entry, otherwise returns an error code and sets pde to NULL. + */ +long make_plugin_proc_dir(struct sched_plugin* plugin, + struct proc_dir_entry** pde); + +/* + * Plugins should deallocate all child proc directory entries before + * calling this, to avoid memory leaks. + */ +void remove_plugin_proc_dir(struct sched_plugin* plugin); + + +/* Copy at most size-1 bytes from ubuf into kbuf, null-terminate buf, and + * remove a '\n' if present. Returns the number of bytes that were read or + * -EFAULT. */ +int copy_and_chomp(char *kbuf, unsigned long ksize, + __user const char* ubuf, unsigned long ulength); diff --git a/include/litmus/locking.h b/include/litmus/locking.h new file mode 100644 index 000000000000..4d7b870cb443 --- /dev/null +++ b/include/litmus/locking.h @@ -0,0 +1,28 @@ +#ifndef LITMUS_LOCKING_H +#define LITMUS_LOCKING_H + +struct litmus_lock_ops; + +/* Generic base struct for LITMUS^RT userspace semaphores. + * This structure should be embedded in protocol-specific semaphores. + */ +struct litmus_lock { + struct litmus_lock_ops *ops; + int type; +}; + +struct litmus_lock_ops { + /* Current task tries to obtain / drop a reference to a lock. + * Optional methods, allowed by default. 
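+ * A protocol implementation presumably embeds struct litmus_lock in its
+ * own semaphore type (struct srp_semaphore in litmus/srp.h does so) and
+ * fills in this ops table when the plugin's allocate_lock() callback
+ * creates the lock.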
*/ + int (*open)(struct litmus_lock*, void* __user); + int (*close)(struct litmus_lock*); + + /* Current tries to lock/unlock this lock (mandatory methods). */ + int (*lock)(struct litmus_lock*); + int (*unlock)(struct litmus_lock*); + + /* The lock is no longer being referenced (mandatory method). */ + void (*deallocate)(struct litmus_lock*); +}; + +#endif diff --git a/include/litmus/preempt.h b/include/litmus/preempt.h new file mode 100644 index 000000000000..f3cf29ad87ee --- /dev/null +++ b/include/litmus/preempt.h @@ -0,0 +1,165 @@ +#ifndef LITMUS_PREEMPT_H +#define LITMUS_PREEMPT_H + +#include +#include +#include +#include + +#include + +extern DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, resched_state); + +//#ifdef CONFIG_DEBUG_KERNEL +#if 0 +const char* sched_state_name(int s); +#define TRACE_STATE(fmt, args...) TRACE("SCHED_STATE " fmt, args) +#else +#define TRACE_STATE(fmt, args...) /* ignore */ +#endif + +#define VERIFY_SCHED_STATE(x) \ + do { int __s = get_sched_state(); \ + if ((__s & (x)) == 0) \ + TRACE_STATE("INVALID s=0x%x (%s) not " \ + "in 0x%x (%s) [%s]\n", \ + __s, sched_state_name(__s), \ + (x), #x, __FUNCTION__); \ + } while (0); + +#define TRACE_SCHED_STATE_CHANGE(x, y, cpu) \ + TRACE_STATE("[P%d] 0x%x (%s) -> 0x%x (%s)\n", \ + cpu, (x), sched_state_name(x), \ + (y), sched_state_name(y)) + + +typedef enum scheduling_state { + TASK_SCHEDULED = (1 << 0), /* The currently scheduled task is the one that + * should be scheduled, and the processor does not + * plan to invoke schedule(). */ + SHOULD_SCHEDULE = (1 << 1), /* A remote processor has determined that the + * processor should reschedule, but this has not + * been communicated yet (IPI still pending). */ + WILL_SCHEDULE = (1 << 2), /* The processor has noticed that it has to + * reschedule and will do so shortly. */ + TASK_PICKED = (1 << 3), /* The processor is currently executing schedule(), + * has selected a new task to schedule, but has not + * yet performed the actual context switch. */ + PICKED_WRONG_TASK = (1 << 4), /* The processor has not yet performed the context + * switch, but a remote processor has already + * determined that a higher-priority task became + * eligible after the task was picked. 
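+ * Taken together, the presumed common sequence is
+ *   TASK_SCHEDULED -> SHOULD_SCHEDULE (remote) or WILL_SCHEDULE (local)
+ *   -> TASK_PICKED -> TASK_SCHEDULED,
+ * while a late-arriving higher-priority job turns TASK_PICKED into
+ * PICKED_WRONG_TASK, which sched_state_validate_switch() below resolves
+ * by setting WILL_SCHEDULE and forcing another pass through schedule().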
*/ +} sched_state_t; + +static inline sched_state_t get_sched_state_on(int cpu) +{ + return atomic_read(&per_cpu(resched_state, cpu)); +} + +static inline sched_state_t get_sched_state(void) +{ + return atomic_read(&__get_cpu_var(resched_state)); +} + +static inline int is_in_sched_state(int possible_states) +{ + return get_sched_state() & possible_states; +} + +static inline int cpu_is_in_sched_state(int cpu, int possible_states) +{ + return get_sched_state_on(cpu) & possible_states; +} + +static inline void set_sched_state(sched_state_t s) +{ + TRACE_SCHED_STATE_CHANGE(get_sched_state(), s, smp_processor_id()); + atomic_set(&__get_cpu_var(resched_state), s); +} + +static inline int sched_state_transition(sched_state_t from, sched_state_t to) +{ + sched_state_t old_state; + + old_state = atomic_cmpxchg(&__get_cpu_var(resched_state), from, to); + if (old_state == from) { + TRACE_SCHED_STATE_CHANGE(from, to, smp_processor_id()); + return 1; + } else + return 0; +} + +static inline int sched_state_transition_on(int cpu, + sched_state_t from, + sched_state_t to) +{ + sched_state_t old_state; + + old_state = atomic_cmpxchg(&per_cpu(resched_state, cpu), from, to); + if (old_state == from) { + TRACE_SCHED_STATE_CHANGE(from, to, cpu); + return 1; + } else + return 0; +} + +/* Plugins must call this function after they have decided which job to + * schedule next. IMPORTANT: this function must be called while still holding + * the lock that is used to serialize scheduling decisions. + * + * (Ideally, we would like to use runqueue locks for this purpose, but that + * would lead to deadlocks with the migration code.) + */ +static inline void sched_state_task_picked(void) +{ + VERIFY_SCHED_STATE(WILL_SCHEDULE); + + /* WILL_SCHEDULE has only a local tansition => simple store is ok */ + set_sched_state(TASK_PICKED); +} + +static inline void sched_state_entered_schedule(void) +{ + /* Update state for the case that we entered schedule() not due to + * set_tsk_need_resched() */ + set_sched_state(WILL_SCHEDULE); +} + +/* Called by schedule() to check if the scheduling decision is still valid + * after a context switch. Returns 1 if the CPU needs to reschdule. */ +static inline int sched_state_validate_switch(void) +{ + int left_state_ok = 0; + + VERIFY_SCHED_STATE(PICKED_WRONG_TASK | TASK_PICKED); + + if (is_in_sched_state(TASK_PICKED)) { + /* Might be good; let's try to transition out of this + * state. This must be done atomically since remote processors + * may try to change the state, too. */ + left_state_ok = sched_state_transition(TASK_PICKED, TASK_SCHEDULED); + } + + if (!left_state_ok) { + /* We raced with a higher-priority task arrival => not + * valid. The CPU needs to reschedule. */ + set_sched_state(WILL_SCHEDULE); + return 1; + } else + return 0; +} + +/* State transition events. See litmus/preempt.c for details. */ +void sched_state_will_schedule(struct task_struct* tsk); +void sched_state_ipi(void); +/* Cause a CPU (remote or local) to reschedule. */ +void litmus_reschedule(int cpu); +void litmus_reschedule_local(void); + +#ifdef CONFIG_DEBUG_KERNEL +void sched_state_plugin_check(void); +#else +#define sched_state_plugin_check() /* no check */ +#endif + +#endif diff --git a/include/litmus/rt_domain.h b/include/litmus/rt_domain.h new file mode 100644 index 000000000000..ac249292e866 --- /dev/null +++ b/include/litmus/rt_domain.h @@ -0,0 +1,182 @@ +/* CLEANUP: Add comments and make it less messy. 
+ * + */ + +#ifndef __UNC_RT_DOMAIN_H__ +#define __UNC_RT_DOMAIN_H__ + +#include + +#define RELEASE_QUEUE_SLOTS 127 /* prime */ + +struct _rt_domain; + +typedef int (*check_resched_needed_t)(struct _rt_domain *rt); +typedef void (*release_jobs_t)(struct _rt_domain *rt, struct bheap* tasks); + +struct release_queue { + /* each slot maintains a list of release heaps sorted + * by release time */ + struct list_head slot[RELEASE_QUEUE_SLOTS]; +}; + +typedef struct _rt_domain { + /* runnable rt tasks are in here */ + raw_spinlock_t ready_lock; + struct bheap ready_queue; + + /* real-time tasks waiting for release are in here */ + raw_spinlock_t release_lock; + struct release_queue release_queue; + +#ifdef CONFIG_RELEASE_MASTER + int release_master; +#endif + + /* for moving tasks to the release queue */ + raw_spinlock_t tobe_lock; + struct list_head tobe_released; + + /* how do we check if we need to kick another CPU? */ + check_resched_needed_t check_resched; + + /* how do we release jobs? */ + release_jobs_t release_jobs; + + /* how are tasks ordered in the ready queue? */ + bheap_prio_t order; +} rt_domain_t; + +struct release_heap { + /* list_head for per-time-slot list */ + struct list_head list; + lt_t release_time; + /* all tasks to be released at release_time */ + struct bheap heap; + /* used to trigger the release */ + struct hrtimer timer; + +#ifdef CONFIG_RELEASE_MASTER + /* used to delegate releases */ + struct hrtimer_start_on_info info; +#endif + /* required for the timer callback */ + rt_domain_t* dom; +}; + + +static inline struct task_struct* __next_ready(rt_domain_t* rt) +{ + struct bheap_node *hn = bheap_peek(rt->order, &rt->ready_queue); + if (hn) + return bheap2task(hn); + else + return NULL; +} + +void rt_domain_init(rt_domain_t *rt, bheap_prio_t order, + check_resched_needed_t check, + release_jobs_t relase); + +void __add_ready(rt_domain_t* rt, struct task_struct *new); +void __merge_ready(rt_domain_t* rt, struct bheap *tasks); +void __add_release(rt_domain_t* rt, struct task_struct *task); + +static inline struct task_struct* __take_ready(rt_domain_t* rt) +{ + struct bheap_node* hn = bheap_take(rt->order, &rt->ready_queue); + if (hn) + return bheap2task(hn); + else + return NULL; +} + +static inline struct task_struct* __peek_ready(rt_domain_t* rt) +{ + struct bheap_node* hn = bheap_peek(rt->order, &rt->ready_queue); + if (hn) + return bheap2task(hn); + else + return NULL; +} + +static inline int is_queued(struct task_struct *t) +{ + BUG_ON(!tsk_rt(t)->heap_node); + return bheap_node_in_heap(tsk_rt(t)->heap_node); +} + +static inline void remove(rt_domain_t* rt, struct task_struct *t) +{ + bheap_delete(rt->order, &rt->ready_queue, tsk_rt(t)->heap_node); +} + +static inline void add_ready(rt_domain_t* rt, struct task_struct *new) +{ + unsigned long flags; + /* first we need the write lock for rt_ready_queue */ + raw_spin_lock_irqsave(&rt->ready_lock, flags); + __add_ready(rt, new); + raw_spin_unlock_irqrestore(&rt->ready_lock, flags); +} + +static inline void merge_ready(rt_domain_t* rt, struct bheap* tasks) +{ + unsigned long flags; + raw_spin_lock_irqsave(&rt->ready_lock, flags); + __merge_ready(rt, tasks); + raw_spin_unlock_irqrestore(&rt->ready_lock, flags); +} + +static inline struct task_struct* take_ready(rt_domain_t* rt) +{ + unsigned long flags; + struct task_struct* ret; + /* first we need the write lock for rt_ready_queue */ + raw_spin_lock_irqsave(&rt->ready_lock, flags); + ret = __take_ready(rt); + raw_spin_unlock_irqrestore(&rt->ready_lock, flags); + return 
ret; +} + + +static inline void add_release(rt_domain_t* rt, struct task_struct *task) +{ + unsigned long flags; + raw_spin_lock_irqsave(&rt->tobe_lock, flags); + __add_release(rt, task); + raw_spin_unlock_irqrestore(&rt->tobe_lock, flags); +} + +#ifdef CONFIG_RELEASE_MASTER +void __add_release_on(rt_domain_t* rt, struct task_struct *task, + int target_cpu); + +static inline void add_release_on(rt_domain_t* rt, + struct task_struct *task, + int target_cpu) +{ + unsigned long flags; + raw_spin_lock_irqsave(&rt->tobe_lock, flags); + __add_release_on(rt, task, target_cpu); + raw_spin_unlock_irqrestore(&rt->tobe_lock, flags); +} +#endif + +static inline int __jobs_pending(rt_domain_t* rt) +{ + return !bheap_empty(&rt->ready_queue); +} + +static inline int jobs_pending(rt_domain_t* rt) +{ + unsigned long flags; + int ret; + /* first we need the write lock for rt_ready_queue */ + raw_spin_lock_irqsave(&rt->ready_lock, flags); + ret = !bheap_empty(&rt->ready_queue); + raw_spin_unlock_irqrestore(&rt->ready_lock, flags); + return ret; +} + +#endif diff --git a/include/litmus/rt_param.h b/include/litmus/rt_param.h new file mode 100644 index 000000000000..a23ce1524051 --- /dev/null +++ b/include/litmus/rt_param.h @@ -0,0 +1,228 @@ +/* + * Definition of the scheduler plugin interface. + * + */ +#ifndef _LINUX_RT_PARAM_H_ +#define _LINUX_RT_PARAM_H_ + +/* Litmus time type. */ +typedef unsigned long long lt_t; + +static inline int lt_after(lt_t a, lt_t b) +{ + return ((long long) b) - ((long long) a) < 0; +} +#define lt_before(a, b) lt_after(b, a) + +static inline int lt_after_eq(lt_t a, lt_t b) +{ + return ((long long) a) - ((long long) b) >= 0; +} +#define lt_before_eq(a, b) lt_after_eq(b, a) + +/* different types of clients */ +typedef enum { + RT_CLASS_HARD, + RT_CLASS_SOFT, + RT_CLASS_BEST_EFFORT +} task_class_t; + +typedef enum { + NO_ENFORCEMENT, /* job may overrun unhindered */ + QUANTUM_ENFORCEMENT, /* budgets are only checked on quantum boundaries */ + PRECISE_ENFORCEMENT /* NOT IMPLEMENTED - enforced with hrtimers */ +} budget_policy_t; + +#define LITMUS_MAX_PRIORITY 512 + +struct rt_task { + lt_t exec_cost; + lt_t period; + lt_t phase; + unsigned int cpu; + unsigned int priority; + task_class_t cls; + budget_policy_t budget_policy; /* ignored by pfair */ +}; + +union np_flag { + uint32_t raw; + struct { + /* Is the task currently in a non-preemptive section? */ + uint32_t flag:31; + /* Should the task call into the scheduler? */ + uint32_t preempt:1; + } np; +}; + +/* The definition of the data that is shared between the kernel and real-time + * tasks via a shared page (see litmus/ctrldev.c). + * + * WARNING: User space can write to this, so don't trust + * the correctness of the fields! + * + * This servees two purposes: to enable efficient signaling + * of non-preemptive sections (user->kernel) and + * delayed preemptions (kernel->user), and to export + * some real-time relevant statistics such as preemption and + * migration data to user space. We can't use a device to export + * statistics because we want to avoid system call overhead when + * determining preemption/migration overheads). 
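+ *
+ * Presumed interplay of the np bits (cf. is_user_np() and
+ * request_exit_np() in litmus.h): user space sets sched.np.flag while it
+ * is inside a non-preemptive section; if the kernel wants to preempt the
+ * task in the meantime it sets sched.np.preempt, and user space checks
+ * that bit when leaving the section to decide whether it must call into
+ * the kernel right away.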
+ */ +struct control_page { + volatile union np_flag sched; + + /* locking overhead tracing: time stamp prior to system call */ + uint64_t ts_syscall_start; /* Feather-Trace cycles */ + + /* to be extended */ +}; + +/* don't export internal data structures to user space (liblitmus) */ +#ifdef __KERNEL__ + +struct _rt_domain; +struct bheap_node; +struct release_heap; + +struct rt_job { + /* Time instant the the job was or will be released. */ + lt_t release; + /* What is the current deadline? */ + lt_t deadline; + + /* How much service has this job received so far? */ + lt_t exec_time; + + /* Which job is this. This is used to let user space + * specify which job to wait for, which is important if jobs + * overrun. If we just call sys_sleep_next_period() then we + * will unintentionally miss jobs after an overrun. + * + * Increase this sequence number when a job is released. + */ + unsigned int job_no; +}; + +struct pfair_param; + +/* RT task parameters for scheduling extensions + * These parameters are inherited during clone and therefore must + * be explicitly set up before the task set is launched. + */ +struct rt_param { + /* is the task sleeping? */ + unsigned int flags:8; + + /* do we need to check for srp blocking? */ + unsigned int srp_non_recurse:1; + + /* is the task present? (true if it can be scheduled) */ + unsigned int present:1; + +#ifdef CONFIG_LITMUS_LOCKING + /* Is the task being priority-boosted by a locking protocol? */ + unsigned int priority_boosted:1; + /* If so, when did this start? */ + lt_t boost_start_time; +#endif + + /* user controlled parameters */ + struct rt_task task_params; + + /* timing parameters */ + struct rt_job job_params; + + /* task representing the current "inherited" task + * priority, assigned by inherit_priority and + * return priority in the scheduler plugins. + * could point to self if PI does not result in + * an increased task priority. + */ + struct task_struct* inh_task; + +#ifdef CONFIG_NP_SECTION + /* For the FMLP under PSN-EDF, it is required to make the task + * non-preemptive from kernel space. In order not to interfere with + * user space, this counter indicates the kernel space np setting. + * kernel_np > 0 => task is non-preemptive + */ + unsigned int kernel_np; +#endif + + /* This field can be used by plugins to store where the task + * is currently scheduled. It is the responsibility of the + * plugin to avoid race conditions. + * + * This used by GSN-EDF and PFAIR. + */ + volatile int scheduled_on; + + /* Is the stack of the task currently in use? This is updated by + * the LITMUS core. + * + * Be careful to avoid deadlocks! + */ + volatile int stack_in_use; + + /* This field can be used by plugins to store where the task + * is currently linked. It is the responsibility of the plugin + * to avoid race conditions. + * + * Used by GSN-EDF. + */ + volatile int linked_on; + + /* PFAIR/PD^2 state. Allocated on demand. */ + struct pfair_param* pfair; + + /* Fields saved before BE->RT transition. + */ + int old_policy; + int old_prio; + + /* ready queue for this task */ + struct _rt_domain* domain; + + /* heap element for this task + * + * Warning: Don't statically allocate this node. The heap + * implementation swaps these between tasks, thus after + * dequeuing from a heap you may end up with a different node + * then the one you had when enqueuing the task. For the same + * reason, don't obtain and store references to this node + * other than this pointer (which is updated by the heap + * implementation). 
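+ * (This is also why bheap_node_init() takes a struct bheap_node** and
+ * why struct bheap_node carries a ref back-pointer: the heap code keeps
+ * this field up to date as nodes are swapped.)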
+ */ + struct bheap_node* heap_node; + struct release_heap* rel_heap; + +#ifdef CONFIG_LITMUS_LOCKING + /* task in heap of pending jobs -- used by C-EDF for priority donation */ + struct bheap_node* pending_node; + /* is the job in a critical section or a wait queue?*/ + unsigned int request_incomplete; + /* is the job currently a donor? */ + unsigned int is_donor; + /* is this job suspended, waiting to become eligible? */ + unsigned int waiting_eligible; + + int pending_on; +#endif + + /* Used by rt_domain to queue task in release list. + */ + struct list_head list; + + /* Pointer to the page shared between userspace and kernel. */ + struct control_page * ctrl_page; +}; + +/* Possible RT flags */ +#define RT_F_RUNNING 0x00000000 +#define RT_F_SLEEP 0x00000001 +#define RT_F_EXIT_SEM 0x00000008 + +#endif + +#endif diff --git a/include/litmus/sched_plugin.h b/include/litmus/sched_plugin.h new file mode 100644 index 000000000000..b5d1ae7bc3b6 --- /dev/null +++ b/include/litmus/sched_plugin.h @@ -0,0 +1,117 @@ +/* + * Definition of the scheduler plugin interface. + * + */ +#ifndef _LINUX_SCHED_PLUGIN_H_ +#define _LINUX_SCHED_PLUGIN_H_ + +#include + +#ifdef CONFIG_LITMUS_LOCKING +#include +#endif + +/************************ setup/tear down ********************/ + +typedef long (*activate_plugin_t) (void); +typedef long (*deactivate_plugin_t) (void); + + + +/********************* scheduler invocation ******************/ + +/* Plugin-specific realtime tick handler */ +typedef void (*scheduler_tick_t) (struct task_struct *cur); +/* Novell make sched decision function */ +typedef struct task_struct* (*schedule_t)(struct task_struct * prev); +/* Clean up after the task switch has occured. + * This function is called after every (even non-rt) task switch. + */ +typedef void (*finish_switch_t)(struct task_struct *prev); + + +/********************* task state changes ********************/ + +/* Called to setup a new real-time task. + * Release the first job, enqueue, etc. + * Task may already be running. + */ +typedef void (*task_new_t) (struct task_struct *task, + int on_rq, + int running); + +/* Called to re-introduce a task after blocking. + * Can potentially be called multiple times. + */ +typedef void (*task_wake_up_t) (struct task_struct *task); +/* called to notify the plugin of a blocking real-time task + * it will only be called for real-time tasks and before schedule is called */ +typedef void (*task_block_t) (struct task_struct *task); +/* Called when a real-time task exits or changes to a different scheduling + * class. + * Free any allocated resources + */ +typedef void (*task_exit_t) (struct task_struct *); + +/* called early before the caller holds the runqueue lock */ +typedef void (*pre_setsched_t) (struct task_struct *, int policy); + + +/* Called when the current task attempts to create a new lock of a given + * protocol type. 
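+ * The type argument presumably corresponds to the obj_type_t constants
+ * in litmus/fdso.h (FMLP_SEM, SRP_SEM, MPCP_SEM, ...), and config is an
+ * uninterpreted user-space pointer whose meaning is protocol-specific.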
*/ +typedef long (*allocate_lock_t) (struct litmus_lock **lock, int type, + void* __user config); + + +/********************* sys call backends ********************/ +/* This function causes the caller to sleep until the next release */ +typedef long (*complete_job_t) (void); + +typedef long (*admit_task_t)(struct task_struct* tsk); + +typedef void (*release_at_t)(struct task_struct *t, lt_t start); + +struct sched_plugin { + struct list_head list; + /* basic info */ + char *plugin_name; + + /* setup */ + activate_plugin_t activate_plugin; + deactivate_plugin_t deactivate_plugin; + + /* scheduler invocation */ + scheduler_tick_t tick; + schedule_t schedule; + finish_switch_t finish_switch; + + /* syscall backend */ + complete_job_t complete_job; + release_at_t release_at; + + /* task state changes */ + admit_task_t admit_task; + + task_new_t task_new; + task_wake_up_t task_wake_up; + task_block_t task_block; + task_exit_t task_exit; + + pre_setsched_t pre_setsched; + +#ifdef CONFIG_LITMUS_LOCKING + /* locking protocols */ + allocate_lock_t allocate_lock; +#endif +} __attribute__ ((__aligned__(SMP_CACHE_BYTES))); + + +extern struct sched_plugin *litmus; + +int register_sched_plugin(struct sched_plugin* plugin); +struct sched_plugin* find_sched_plugin(const char* name); +int print_sched_plugins(char* buf, int max); + +extern struct sched_plugin linux_sched_plugin; + +#endif diff --git a/include/litmus/sched_plugin.h.rej b/include/litmus/sched_plugin.h.rej new file mode 100644 index 000000000000..47e0c27c5061 --- /dev/null +++ b/include/litmus/sched_plugin.h.rej @@ -0,0 +1,22 @@ +--- include/litmus/sched_plugin.h ++++ include/litmus/sched_plugin.h +@@ -53,6 +53,10 @@ + */ + typedef void (*task_exit_t) (struct task_struct *); + ++/* called early before the caller holds the runqueue lock */ ++typedef void (*pre_setsched_t) (struct task_struct *, int policy); ++ ++ + /* Called when the current task attempts to create a new lock of a given + * protocol type. */ + typedef long (*allocate_lock_t) (struct litmus_lock **lock, int type, +@@ -93,6 +97,8 @@ + task_block_t task_block; + task_exit_t task_exit; + ++ pre_setsched_t pre_setsched; ++ + #ifdef CONFIG_LITMUS_LOCKING + /* locking protocols */ + allocate_lock_t allocate_lock; diff --git a/include/litmus/sched_trace.h b/include/litmus/sched_trace.h new file mode 100644 index 000000000000..7ca34cb13881 --- /dev/null +++ b/include/litmus/sched_trace.h @@ -0,0 +1,200 @@ +/* + * sched_trace.h -- record scheduler events to a byte stream for offline analysis. + */ +#ifndef _LINUX_SCHED_TRACE_H_ +#define _LINUX_SCHED_TRACE_H_ + +/* all times in nanoseconds */ + +struct st_trace_header { + u8 type; /* Of what type is this record? */ + u8 cpu; /* On which CPU was it recorded? */ + u16 pid; /* PID of the task. */ + u32 job; /* The job sequence number. */ +}; + +#define ST_NAME_LEN 16 +struct st_name_data { + char cmd[ST_NAME_LEN];/* The name of the executable of this process. */ +}; + +struct st_param_data { /* regular params */ + u32 wcet; + u32 period; + u32 phase; + u8 partition; + u8 class; + u8 __unused[2]; +}; + +struct st_release_data { /* A job is was/is going to be released. */ + u64 release; /* What's the release time? */ + u64 deadline; /* By when must it finish? */ +}; + +struct st_assigned_data { /* A job was asigned to a CPU. */ + u64 when; + u8 target; /* Where should it execute? */ + u8 __unused[7]; +}; + +struct st_switch_to_data { /* A process was switched to on a given CPU. */ + u64 when; /* When did this occur? 
*/ + u32 exec_time; /* Time the current job has executed. */ + u8 __unused[4]; + +}; + +struct st_switch_away_data { /* A process was switched away from on a given CPU. */ + u64 when; + u64 exec_time; +}; + +struct st_completion_data { /* A job completed. */ + u64 when; + u8 forced:1; /* Set to 1 if job overran and kernel advanced to the + * next task automatically; set to 0 otherwise. + */ + u8 __uflags:7; + u8 __unused[7]; +}; + +struct st_block_data { /* A task blocks. */ + u64 when; + u64 __unused; +}; + +struct st_resume_data { /* A task resumes. */ + u64 when; + u64 __unused; +}; + +struct st_action_data { + u64 when; + u8 action; + u8 __unused[7]; +}; + +struct st_sys_release_data { + u64 when; + u64 release; +}; + +#define DATA(x) struct st_ ## x ## _data x; + +typedef enum { + ST_NAME = 1, /* Start at one, so that we can spot + * uninitialized records. */ + ST_PARAM, + ST_RELEASE, + ST_ASSIGNED, + ST_SWITCH_TO, + ST_SWITCH_AWAY, + ST_COMPLETION, + ST_BLOCK, + ST_RESUME, + ST_ACTION, + ST_SYS_RELEASE +} st_event_record_type_t; + +struct st_event_record { + struct st_trace_header hdr; + union { + u64 raw[2]; + + DATA(name); + DATA(param); + DATA(release); + DATA(assigned); + DATA(switch_to); + DATA(switch_away); + DATA(completion); + DATA(block); + DATA(resume); + DATA(action); + DATA(sys_release); + } data; +}; + +#undef DATA + +#ifdef __KERNEL__ + +#include +#include + +#ifdef CONFIG_SCHED_TASK_TRACE + +#define SCHED_TRACE(id, callback, task) \ + ft_event1(id, callback, task) +#define SCHED_TRACE2(id, callback, task, xtra) \ + ft_event2(id, callback, task, xtra) + +/* provide prototypes; needed on sparc64 */ +#ifndef NO_TASK_TRACE_DECLS +feather_callback void do_sched_trace_task_name(unsigned long id, + struct task_struct* task); +feather_callback void do_sched_trace_task_param(unsigned long id, + struct task_struct* task); +feather_callback void do_sched_trace_task_release(unsigned long id, + struct task_struct* task); +feather_callback void do_sched_trace_task_switch_to(unsigned long id, + struct task_struct* task); +feather_callback void do_sched_trace_task_switch_away(unsigned long id, + struct task_struct* task); +feather_callback void do_sched_trace_task_completion(unsigned long id, + struct task_struct* task, + unsigned long forced); +feather_callback void do_sched_trace_task_block(unsigned long id, + struct task_struct* task); +feather_callback void do_sched_trace_task_resume(unsigned long id, + struct task_struct* task); +feather_callback void do_sched_trace_action(unsigned long id, + struct task_struct* task, + unsigned long action); +feather_callback void do_sched_trace_sys_release(unsigned long id, + lt_t* start); + +#endif + +#else + +#define SCHED_TRACE(id, callback, task) /* no tracing */ +#define SCHED_TRACE2(id, callback, task, xtra) /* no tracing */ + +#endif + + +#define SCHED_TRACE_BASE_ID 500 + + +#define sched_trace_task_name(t) \ + SCHED_TRACE(SCHED_TRACE_BASE_ID + 1, do_sched_trace_task_name, t) +#define sched_trace_task_param(t) \ + SCHED_TRACE(SCHED_TRACE_BASE_ID + 2, do_sched_trace_task_param, t) +#define sched_trace_task_release(t) \ + SCHED_TRACE(SCHED_TRACE_BASE_ID + 3, do_sched_trace_task_release, t) +#define sched_trace_task_switch_to(t) \ + SCHED_TRACE(SCHED_TRACE_BASE_ID + 4, do_sched_trace_task_switch_to, t) +#define sched_trace_task_switch_away(t) \ + SCHED_TRACE(SCHED_TRACE_BASE_ID + 5, do_sched_trace_task_switch_away, t) +#define sched_trace_task_completion(t, forced) \ + SCHED_TRACE2(SCHED_TRACE_BASE_ID + 6, do_sched_trace_task_completion, t, 
\ + (unsigned long) forced) +#define sched_trace_task_block(t) \ + SCHED_TRACE(SCHED_TRACE_BASE_ID + 7, do_sched_trace_task_block, t) +#define sched_trace_task_resume(t) \ + SCHED_TRACE(SCHED_TRACE_BASE_ID + 8, do_sched_trace_task_resume, t) +#define sched_trace_action(t, action) \ + SCHED_TRACE2(SCHED_TRACE_BASE_ID + 9, do_sched_trace_action, t, \ + (unsigned long) action); +/* when is a pointer, it does not need an explicit cast to unsigned long */ +#define sched_trace_sys_release(when) \ + SCHED_TRACE(SCHED_TRACE_BASE_ID + 10, do_sched_trace_sys_release, when) + + +#define sched_trace_quantum_boundary() /* NOT IMPLEMENTED */ + +#endif /* __KERNEL__ */ + +#endif diff --git a/include/litmus/srp.h b/include/litmus/srp.h new file mode 100644 index 000000000000..c9a4552b2bf3 --- /dev/null +++ b/include/litmus/srp.h @@ -0,0 +1,28 @@ +#ifndef LITMUS_SRP_H +#define LITMUS_SRP_H + +struct srp_semaphore; + +struct srp_priority { + struct list_head list; + unsigned int priority; + pid_t pid; +}; +#define list2prio(l) list_entry(l, struct srp_priority, list) + +/* struct for uniprocessor SRP "semaphore" */ +struct srp_semaphore { + struct litmus_lock litmus_lock; + struct srp_priority ceiling; + struct task_struct* owner; + int cpu; /* cpu associated with this "semaphore" and resource */ +}; + +/* map a task to its SRP preemption level priority */ +typedef unsigned int (*srp_prioritization_t)(struct task_struct* t); +/* Must be updated by each plugin that uses SRP.*/ +extern srp_prioritization_t get_srp_prio; + +struct srp_semaphore* allocate_srp_semaphore(void); + +#endif diff --git a/include/litmus/trace.h b/include/litmus/trace.h new file mode 100644 index 000000000000..d6829c416912 --- /dev/null +++ b/include/litmus/trace.h @@ -0,0 +1,129 @@ +#ifndef _SYS_TRACE_H_ +#define _SYS_TRACE_H_ + +#ifdef CONFIG_SCHED_OVERHEAD_TRACE + +#include +#include + + +/*********************** TIMESTAMPS ************************/ + +enum task_type_marker { + TSK_BE, + TSK_RT, + TSK_UNKNOWN +}; + +struct timestamp { + uint64_t timestamp; + uint32_t seq_no; + uint8_t cpu; + uint8_t event; + uint8_t task_type; +}; + +/* tracing callbacks */ +feather_callback void save_timestamp(unsigned long event); +feather_callback void save_timestamp_def(unsigned long event, unsigned long type); +feather_callback void save_timestamp_task(unsigned long event, unsigned long t_ptr); +feather_callback void save_timestamp_cpu(unsigned long event, unsigned long cpu); +feather_callback void save_task_latency(unsigned long event, unsigned long when_ptr); +feather_callback void save_timestamp_time(unsigned long event, unsigned long time_ptr); + +#define TIMESTAMP(id) ft_event0(id, save_timestamp) + +#define DTIMESTAMP(id, def) ft_event1(id, save_timestamp_def, (unsigned long) def) + +#define TTIMESTAMP(id, task) \ + ft_event1(id, save_timestamp_task, (unsigned long) task) + +#define CTIMESTAMP(id, cpu) \ + ft_event1(id, save_timestamp_cpu, (unsigned long) cpu) + +#define LTIMESTAMP(id, task) \ + ft_event1(id, save_task_latency, (unsigned long) task) + +#define TIMESTAMP_TIME(id, time_ptr) \ + ft_event1(id, save_timestamp_time, (unsigned long) time_ptr) + +#define TIMESTAMP_PID(id) ft_event0(id, save_timestamp_pid) + +#else /* !CONFIG_SCHED_OVERHEAD_TRACE */ + +#define TIMESTAMP(id) /* no tracing */ + +#define DTIMESTAMP(id, def) /* no tracing */ + +#define TTIMESTAMP(id, task) /* no tracing */ + +#define CTIMESTAMP(id, cpu) /* no tracing */ + +#define LTIMESTAMP(id, when_ptr) /* no tracing */ + +#define TIMESTAMP_TIME(id, time_ptr) /* no 
tracing */ + +#define TIMESTAMP_PID(id) /* no tracing */ + +#endif + + +/* Convention for timestamps + * ========================= + * + * In order to process the trace files with a common tool, we use the following + * convention to measure execution times: The end time id of a code segment is + * always the next number after the start time event id. + */ + +#define __TS_SYSCALL_IN_START(p) TIMESTAMP_TIME(10, p) +#define TS_SYSCALL_IN_END TIMESTAMP_PID(11) + +#define TS_SYSCALL_OUT_START TIMESTAMP_PID(20) +#define TS_SYSCALL_OUT_END TIMESTAMP_PID(21) + +#define TS_LOCK_START TIMESTAMP_PID(30) +#define TS_LOCK_END TIMESTAMP_PID(31) + +#define TS_LOCK_SUSPEND TIMESTAMP_PID(38) +#define TS_LOCK_RESUME TIMESTAMP_PID(39) + +#define TS_UNLOCK_START TIMESTAMP_PID(40) +#define TS_UNLOCK_END TIMESTAMP_PID(41) + +#define TS_SCHED_START DTIMESTAMP(100, TSK_UNKNOWN) /* we only + * care + * about + * next */ +#define TS_SCHED_END(t) TTIMESTAMP(101, t) +#define TS_SCHED2_START(t) TTIMESTAMP(102, t) +#define TS_SCHED2_END(t) TTIMESTAMP(103, t) + +#define TS_CXS_START(t) TTIMESTAMP(104, t) +#define TS_CXS_END(t) TTIMESTAMP(105, t) + +#define TS_RELEASE_START DTIMESTAMP(106, TSK_RT) +#define TS_RELEASE_END DTIMESTAMP(107, TSK_RT) + +#define TS_TICK_START(t) TTIMESTAMP(110, t) +#define TS_TICK_END(t) TTIMESTAMP(111, t) + + +#define TS_PLUGIN_SCHED_START /* TIMESTAMP(120) */ /* currently unused */ +#define TS_PLUGIN_SCHED_END /* TIMESTAMP(121) */ + +#define TS_PLUGIN_TICK_START /* TIMESTAMP(130) */ +#define TS_PLUGIN_TICK_END /* TIMESTAMP(131) */ + +#define TS_ENTER_NP_START TIMESTAMP(140) +#define TS_ENTER_NP_END TIMESTAMP(141) + +#define TS_EXIT_NP_START TIMESTAMP(150) +#define TS_EXIT_NP_END TIMESTAMP(151) + +#define TS_SEND_RESCHED_START(c) CTIMESTAMP(190, c) +#define TS_SEND_RESCHED_END DTIMESTAMP(191, TSK_UNKNOWN) + +#define TS_RELEASE_LATENCY(when) LTIMESTAMP(208, &(when)) + +#endif /* !_SYS_TRACE_H_ */ diff --git a/include/litmus/unistd_32.h b/include/litmus/unistd_32.h new file mode 100644 index 000000000000..94264c27d9ac --- /dev/null +++ b/include/litmus/unistd_32.h @@ -0,0 +1,21 @@ +/* + * included from arch/x86/include/asm/unistd_32.h + * + * LITMUS^RT syscalls with "relative" numbers + */ +#define __LSC(x) (__NR_LITMUS + x) + +#define __NR_set_rt_task_param __LSC(0) +#define __NR_get_rt_task_param __LSC(1) +#define __NR_complete_job __LSC(2) +#define __NR_od_open __LSC(3) +#define __NR_od_close __LSC(4) +#define __NR_litmus_lock __LSC(5) +#define __NR_litmus_unlock __LSC(6) +#define __NR_query_job_no __LSC(7) +#define __NR_wait_for_job_release __LSC(8) +#define __NR_wait_for_ts_release __LSC(9) +#define __NR_release_ts __LSC(10) +#define __NR_null_call __LSC(11) + +#define NR_litmus_syscalls 12 diff --git a/include/litmus/unistd_64.h b/include/litmus/unistd_64.h new file mode 100644 index 000000000000..d5ced0d2642c --- /dev/null +++ b/include/litmus/unistd_64.h @@ -0,0 +1,33 @@ +/* + * included from arch/x86/include/asm/unistd_64.h + * + * LITMUS^RT syscalls with "relative" numbers + */ +#define __LSC(x) (__NR_LITMUS + x) + +#define __NR_set_rt_task_param __LSC(0) +__SYSCALL(__NR_set_rt_task_param, sys_set_rt_task_param) +#define __NR_get_rt_task_param __LSC(1) +__SYSCALL(__NR_get_rt_task_param, sys_get_rt_task_param) +#define __NR_complete_job __LSC(2) +__SYSCALL(__NR_complete_job, sys_complete_job) +#define __NR_od_open __LSC(3) +__SYSCALL(__NR_od_open, sys_od_open) +#define __NR_od_close __LSC(4) +__SYSCALL(__NR_od_close, sys_od_close) +#define __NR_litmus_lock __LSC(5) 
+__SYSCALL(__NR_litmus_lock, sys_litmus_lock) +#define __NR_litmus_unlock __LSC(6) +__SYSCALL(__NR_litmus_unlock, sys_litmus_unlock) +#define __NR_query_job_no __LSC(7) +__SYSCALL(__NR_query_job_no, sys_query_job_no) +#define __NR_wait_for_job_release __LSC(8) +__SYSCALL(__NR_wait_for_job_release, sys_wait_for_job_release) +#define __NR_wait_for_ts_release __LSC(9) +__SYSCALL(__NR_wait_for_ts_release, sys_wait_for_ts_release) +#define __NR_release_ts __LSC(10) +__SYSCALL(__NR_release_ts, sys_release_ts) +#define __NR_null_call __LSC(11) +__SYSCALL(__NR_null_call, sys_null_call) + +#define NR_litmus_syscalls 12 diff --git a/include/litmus/wait.h b/include/litmus/wait.h new file mode 100644 index 000000000000..ce1347c355f8 --- /dev/null +++ b/include/litmus/wait.h @@ -0,0 +1,57 @@ +#ifndef _LITMUS_WAIT_H_ +#define _LITMUS_WAIT_H_ + +struct task_struct* __waitqueue_remove_first(wait_queue_head_t *wq); + +/* wrap regular wait_queue_t head */ +struct __prio_wait_queue { + wait_queue_t wq; + + /* some priority point */ + lt_t priority; + /* break ties in priority by lower tie_breaker */ + unsigned int tie_breaker; +}; + +typedef struct __prio_wait_queue prio_wait_queue_t; + +static inline void init_prio_waitqueue_entry(prio_wait_queue_t *pwq, + struct task_struct* t, + lt_t priority) +{ + init_waitqueue_entry(&pwq->wq, t); + pwq->priority = priority; + pwq->tie_breaker = 0; +} + +static inline void init_prio_waitqueue_entry_tie(prio_wait_queue_t *pwq, + struct task_struct* t, + lt_t priority, + unsigned int tie_breaker) +{ + init_waitqueue_entry(&pwq->wq, t); + pwq->priority = priority; + pwq->tie_breaker = tie_breaker; +} + +unsigned int __add_wait_queue_prio_exclusive( + wait_queue_head_t* head, + prio_wait_queue_t *new); + +static inline unsigned int add_wait_queue_prio_exclusive( + wait_queue_head_t* head, + prio_wait_queue_t *new) +{ + unsigned long flags; + unsigned int passed; + + spin_lock_irqsave(&head->lock, flags); + passed = __add_wait_queue_prio_exclusive(head, new); + + spin_unlock_irqrestore(&head->lock, flags); + + return passed; +} + + +#endif diff --git a/kernel/exit.c b/kernel/exit.c index 03120229db28..b9d3bc6c21ec 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -56,6 +56,8 @@ #include #include +extern void exit_od_table(struct task_struct *t); + static void exit_mm(struct task_struct * tsk); static void __unhash_process(struct task_struct *p, bool group_dead) @@ -960,6 +962,8 @@ NORET_TYPE void do_exit(long code) if (unlikely(tsk->audit_context)) audit_free(tsk); + exit_od_table(tsk); + tsk->exit_code = code; taskstats_exit(tsk, group_dead); diff --git a/kernel/fork.c b/kernel/fork.c index c445f8cc408d..ab7f29d906c7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -75,6 +75,9 @@ #include +#include +#include + /* * Protected counters by write_lock_irq(&tasklist_lock) */ @@ -183,6 +186,7 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(atomic_read(&tsk->usage)); WARN_ON(tsk == current); + exit_litmus(tsk); exit_creds(tsk); delayacct_tsk_free(tsk); put_signal_struct(tsk->signal); @@ -266,6 +270,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) tsk->stack = ti; + /* Don't let the new task be a real-time task. 
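+ * (litmus_fork() presumably clears any real-time state inherited from a
+ * real-time parent so that the child starts out as an ordinary task.)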
*/ + litmus_fork(tsk); + err = prop_local_init_single(&tsk->dirties); if (err) goto out; diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c index 72206cf5c6cf..cb49883b64e5 100644 --- a/kernel/hrtimer.c +++ b/kernel/hrtimer.c @@ -46,6 +46,8 @@ #include #include +#include + #include #include @@ -1042,6 +1044,98 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode) } EXPORT_SYMBOL_GPL(hrtimer_start); +#ifdef CONFIG_ARCH_HAS_SEND_PULL_TIMERS + +/** + * hrtimer_start_on_info_init - Initialize hrtimer_start_on_info + */ +void hrtimer_start_on_info_init(struct hrtimer_start_on_info *info) +{ + memset(info, 0, sizeof(struct hrtimer_start_on_info)); + atomic_set(&info->state, HRTIMER_START_ON_INACTIVE); +} + +/** + * hrtimer_pull - PULL_TIMERS_VECTOR callback on remote cpu + */ +void hrtimer_pull(void) +{ + struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases); + struct hrtimer_start_on_info *info; + struct list_head *pos, *safe, list; + + raw_spin_lock(&base->lock); + list_replace_init(&base->to_pull, &list); + raw_spin_unlock(&base->lock); + + list_for_each_safe(pos, safe, &list) { + info = list_entry(pos, struct hrtimer_start_on_info, list); + TRACE("pulled timer 0x%x\n", info->timer); + list_del(pos); + hrtimer_start(info->timer, info->time, info->mode); + } +} + +/** + * hrtimer_start_on - trigger timer arming on remote cpu + * @cpu: remote cpu + * @info: save timer information for enqueuing on remote cpu + * @timer: timer to be pulled + * @time: expire time + * @mode: timer mode + */ +int hrtimer_start_on(int cpu, struct hrtimer_start_on_info* info, + struct hrtimer *timer, ktime_t time, + const enum hrtimer_mode mode) +{ + unsigned long flags; + struct hrtimer_cpu_base* base; + int in_use = 0, was_empty; + + /* serialize access to info through the timer base */ + lock_hrtimer_base(timer, &flags); + + in_use = (atomic_read(&info->state) != HRTIMER_START_ON_INACTIVE); + if (!in_use) { + INIT_LIST_HEAD(&info->list); + info->timer = timer; + info->time = time; + info->mode = mode; + /* mark as in use */ + atomic_set(&info->state, HRTIMER_START_ON_QUEUED); + } + + unlock_hrtimer_base(timer, &flags); + + if (!in_use) { + /* initiate pull */ + preempt_disable(); + if (cpu == smp_processor_id()) { + /* start timer locally; we may get called + * with rq->lock held, do not wake up anything + */ + TRACE("hrtimer_start_on: starting on local CPU\n"); + __hrtimer_start_range_ns(info->timer, info->time, + 0, info->mode, 0); + } else { + TRACE("hrtimer_start_on: pulling to remote CPU\n"); + base = &per_cpu(hrtimer_bases, cpu); + raw_spin_lock_irqsave(&base->lock, flags); + was_empty = list_empty(&base->to_pull); + list_add(&info->list, &base->to_pull); + raw_spin_unlock_irqrestore(&base->lock, flags); + if (was_empty) + /* only send IPI if other no else + * has done so already + */ + smp_send_pull_timers(cpu); + } + preempt_enable(); + } + return in_use; +} + +#endif /** * hrtimer_try_to_cancel - try to deactivate a timer @@ -1634,6 +1728,7 @@ static void __cpuinit init_hrtimers_cpu(int cpu) cpu_base->clock_base[i].cpu_base = cpu_base; hrtimer_init_hres(cpu_base); + INIT_LIST_HEAD(&cpu_base->to_pull); } #ifdef CONFIG_HOTPLUG_CPU diff --git a/kernel/printk.c b/kernel/printk.c index 8fe465ac008a..9dc8ea140426 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -73,6 +73,13 @@ int console_printk[4] = { DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ }; +/* + * divert printk() messages when there is a LITMUS^RT debug listener + */ +#include +int trace_override = 0; 
+int trace_recurse = 0; + /* * Low level drivers may need that to know if they can schedule in * their unblank() callback or not. So let's export it. @@ -735,6 +742,9 @@ asmlinkage int vprintk(const char *fmt, va_list args) /* Emit the output into the temporary buffer */ printed_len += vscnprintf(printk_buf + printed_len, sizeof(printk_buf) - printed_len, fmt, args); + /* if LITMUS^RT tracer is active divert printk() msgs */ + if (trace_override && !trace_recurse) + TRACE("%s", printk_buf); p = printk_buf; @@ -804,7 +814,7 @@ asmlinkage int vprintk(const char *fmt, va_list args) * Try to acquire and then immediately release the * console semaphore. The release will do all the * actual magic (print out buffers, wake up klogd, - * etc). + * etc). * * The acquire_console_semaphore_for_printk() function * will release 'logbuf_lock' regardless of whether it @@ -1067,7 +1077,7 @@ int printk_needs_cpu(int cpu) void wake_up_klogd(void) { - if (waitqueue_active(&log_wait)) + if (!trace_override && waitqueue_active(&log_wait)) __raw_get_cpu_var(printk_pending) = 1; } diff --git a/kernel/sched.c b/kernel/sched.c index dc85ceb90832..1f5327f8c012 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -79,6 +79,11 @@ #include "sched_cpupri.h" #include "workqueue_sched.h" +#include +#include + +static void litmus_tick(struct rq*, struct task_struct*); + #define CREATE_TRACE_POINTS #include @@ -405,6 +410,12 @@ struct rt_rq { #endif }; +/* Litmus related fields in a runqueue */ +struct litmus_rq { + unsigned long nr_running; + struct task_struct *prev; +}; + #ifdef CONFIG_SMP /* @@ -471,6 +482,7 @@ struct rq { struct cfs_rq cfs; struct rt_rq rt; + struct litmus_rq litmus; #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ @@ -566,8 +578,14 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) * A queue event has occurred, and we're going to schedule. In * this case, we can save a useless back to back clock update. */ + /* LITMUS^RT: turning off the clock update is buggy in Linux 2.6.36; + * the scheduler can "forget" to renable the runqueue clock in some + * cases. LITMUS^RT amplifies the effects of this problem. Hence, we + * turn it off to avoid stalling clocks. 
*/ + /* if (test_tsk_need_resched(p)) rq->skip_clock_update = 1; + */ } static inline int cpu_of(struct rq *rq) @@ -1042,6 +1060,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) raw_spin_lock(&rq->lock); update_rq_clock(rq); rq->curr->sched_class->task_tick(rq, rq->curr, 1); + litmus_tick(rq, rq->curr); raw_spin_unlock(&rq->lock); return HRTIMER_NORESTART; @@ -1840,7 +1859,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) static const struct sched_class rt_sched_class; -#define sched_class_highest (&rt_sched_class) +#define sched_class_highest (&litmus_sched_class) #define for_each_class(class) \ for (class = sched_class_highest; class; class = class->next) @@ -1920,6 +1939,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int flags) #include "sched_idletask.c" #include "sched_fair.c" #include "sched_rt.c" +#include "../litmus/sched_litmus.c" #ifdef CONFIG_SCHED_DEBUG # include "sched_debug.c" #endif @@ -2352,6 +2372,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, unsigned long en_flags = ENQUEUE_WAKEUP; struct rq *rq; + if (is_realtime(p)) + TRACE_TASK(p, "try_to_wake_up() state:%d\n", p->state); + this_cpu = get_cpu(); smp_wmb(); @@ -2366,7 +2389,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, orig_cpu = cpu; #ifdef CONFIG_SMP - if (unlikely(task_running(rq, p))) + if (unlikely(task_running(rq, p)) || is_realtime(p)) goto out_activate; /* @@ -2428,6 +2451,8 @@ out_activate: out_running: ttwu_post_activation(p, rq, wake_flags, success); out: + if (is_realtime(p)) + TRACE_TASK(p, "try_to_wake_up() done state:%d\n", p->state); task_rq_unlock(rq, &flags); put_cpu(); @@ -2532,7 +2557,8 @@ void sched_fork(struct task_struct *p, int clone_flags) * Revert to default priority/policy on fork if requested. 
*/ if (unlikely(p->sched_reset_on_fork)) { - if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { + if (p->policy == SCHED_FIFO || p->policy == SCHED_RR || + p->policy == SCHED_LITMUS) { p->policy = SCHED_NORMAL; p->normal_prio = p->static_prio; } @@ -2748,6 +2774,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) */ prev_state = prev->state; finish_arch_switch(prev); + litmus->finish_switch(prev); + prev->rt_param.stack_in_use = NO_CPU; #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW local_irq_disable(); #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */ @@ -2777,6 +2805,15 @@ static inline void pre_schedule(struct rq *rq, struct task_struct *prev) { if (prev->sched_class->pre_schedule) prev->sched_class->pre_schedule(rq, prev); + + /* LITMUS^RT: not a very clean hack. We need to save the prev task + * as our scheduling decisions rely on it (as we drop the rq lock + * something in prev can change...); there is no way to escape + * this hack apart from modifying pick_next_task(rq, _prev_) or + * falling back on the previous solution of decoupling + * scheduling decisions + */ + rq->litmus.prev = prev; } /* rq->lock is NOT held, but preemption is disabled */ @@ -3578,18 +3615,26 @@ void scheduler_tick(void) sched_clock_tick(); + TS_TICK_START(current); + raw_spin_lock(&rq->lock); update_rq_clock(rq); update_cpu_load_active(rq); curr->sched_class->task_tick(rq, curr, 0); + + /* litmus_tick may force current to resched */ + litmus_tick(rq, curr); + raw_spin_unlock(&rq->lock); perf_event_task_tick(curr); #ifdef CONFIG_SMP rq->idle_at_tick = idle_cpu(cpu); - trigger_load_balance(rq, cpu); + if (!is_realtime(current)) + trigger_load_balance(rq, cpu); #endif + TS_TICK_END(current); } notrace unsigned long get_parent_ip(unsigned long addr) @@ -3716,12 +3761,20 @@ pick_next_task(struct rq *rq) /* * Optimization: we know that if all tasks are in * the fair class we can call that function directly: - */ - if (likely(rq->nr_running == rq->cfs.nr_running)) { + + * NOT IN LITMUS^RT! + + * This breaks many assumptions in the plugins. + * Do not uncomment without thinking long and hard + * about how this affects global plugins such as GSN-EDF.
+ + if (rq->nr_running == rq->cfs.nr_running) { + TRACE("taking shortcut in pick_next_task()\n"); p = fair_sched_class.pick_next_task(rq); if (likely(p)) return p; } + */ class = sched_class_highest; for ( ; ; ) { @@ -3748,6 +3801,7 @@ asmlinkage void __sched schedule(void) need_resched: preempt_disable(); + sched_state_entered_schedule(); cpu = smp_processor_id(); rq = cpu_rq(cpu); rcu_note_context_switch(cpu); @@ -3755,6 +3809,8 @@ need_resched: release_kernel_lock(prev); need_resched_nonpreemptible: + TS_SCHED_START; + sched_trace_task_switch_away(prev); schedule_debug(prev); @@ -3803,7 +3859,10 @@ need_resched_nonpreemptible: rq->curr = next; ++*switch_count; + TS_SCHED_END(next); + TS_CXS_START(next); context_switch(rq, prev, next); /* unlocks the rq */ + TS_CXS_END(current); /* * The context switch have flipped the stack from under us * and restored the local variables which were saved when @@ -3812,17 +3871,23 @@ need_resched_nonpreemptible: */ cpu = smp_processor_id(); rq = cpu_rq(cpu); - } else + } else { + TS_SCHED_END(prev); raw_spin_unlock_irq(&rq->lock); + } + + sched_trace_task_switch_to(current); post_schedule(rq); - if (unlikely(reacquire_kernel_lock(prev))) + if (sched_state_validate_switch() || unlikely(reacquire_kernel_lock(prev))) goto need_resched_nonpreemptible; preempt_enable_no_resched(); if (need_resched()) goto need_resched; + + srp_ceiling_block(); } EXPORT_SYMBOL(schedule); @@ -4108,6 +4173,17 @@ void complete_all(struct completion *x) } EXPORT_SYMBOL(complete_all); +void complete_n(struct completion *x, int n) +{ + unsigned long flags; + + spin_lock_irqsave(&x->wait.lock, flags); + x->done += n; + __wake_up_common(&x->wait, TASK_NORMAL, n, 0, NULL); + spin_unlock_irqrestore(&x->wait.lock, flags); +} +EXPORT_SYMBOL(complete_n); + static inline long __sched do_wait_for_common(struct completion *x, long timeout, int state) { @@ -4550,7 +4626,9 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio) p->normal_prio = normal_prio(p); /* we are holding p->pi_lock already */ p->prio = rt_mutex_getprio(p); - if (rt_prio(p->prio)) + if (p->policy == SCHED_LITMUS) + p->sched_class = &litmus_sched_class; + else if (rt_prio(p->prio)) p->sched_class = &rt_sched_class; else p->sched_class = &fair_sched_class; @@ -4595,7 +4673,7 @@ recheck: if (policy != SCHED_FIFO && policy != SCHED_RR && policy != SCHED_NORMAL && policy != SCHED_BATCH && - policy != SCHED_IDLE) + policy != SCHED_IDLE && policy != SCHED_LITMUS) return -EINVAL; } @@ -4610,6 +4688,8 @@ recheck: return -EINVAL; if (rt_policy(policy) != (param->sched_priority != 0)) return -EINVAL; + if (policy == SCHED_LITMUS && policy == p->policy) + return -EINVAL; /* * Allow unprivileged RT tasks to decrease priority: @@ -4650,6 +4730,12 @@ recheck: return retval; } + if (policy == SCHED_LITMUS) { + retval = litmus_admit_task(p); + if (retval) + return retval; + } + /* * make sure no PI-waiters arrive (or leave) while we are * changing the priority of the task: @@ -4692,10 +4778,19 @@ recheck: p->sched_reset_on_fork = reset_on_fork; + if (p->policy == SCHED_LITMUS) + litmus_exit_task(p); + oldprio = p->prio; prev_class = p->sched_class; __setscheduler(rq, p, policy, param->sched_priority); + if (policy == SCHED_LITMUS) { + p->rt_param.stack_in_use = running ? 
rq->cpu : NO_CPU; + p->rt_param.present = running; + litmus->task_new(p, on_rq, running); + } + if (running) p->sched_class->set_curr_task(rq); if (on_rq) { @@ -4755,6 +4850,13 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) if (copy_from_user(&lparam, param, sizeof(struct sched_param))) return -EFAULT; +#ifdef CONFIG_LITMUS_LOCKING + /* Hack to allow plugin to call into schedule + * prio to a setscheduler() call. */ + if (is_realtime(current)) + litmus->pre_setsched(current, policy); +#endif + rcu_read_lock(); retval = -ESRCH; p = find_process_by_pid(pid); @@ -4865,10 +4967,11 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) rcu_read_lock(); p = find_process_by_pid(pid); - if (!p) { + /* Don't set affinity if task not found and for LITMUS tasks */ + if (!p || is_realtime(p)) { rcu_read_unlock(); put_online_cpus(); - return -ESRCH; + return p ? -EPERM : -ESRCH; } /* Prevent p going away */ diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index db3f674ca49d..e0e8d5ca3c98 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1654,7 +1654,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ struct cfs_rq *cfs_rq = task_cfs_rq(curr); int scale = cfs_rq->nr_running >= sched_nr_latency; - if (unlikely(rt_prio(p->prio))) + if (unlikely(rt_prio(p->prio)) || p->policy == SCHED_LITMUS) goto preempt; if (unlikely(p->sched_class != &fair_sched_class)) diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index d10c80ebb67a..e40e7fe43170 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -1013,7 +1013,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p) */ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags) { - if (p->prio < rq->curr->prio) { + if (p->prio < rq->curr->prio || p->policy == SCHED_LITMUS) { resched_task(rq->curr); return; } diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3e216e01bbd1..bb2d8b7850a3 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -767,6 +767,46 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer) return HRTIMER_RESTART; } +/** + * tick_set_quanta_type - get the quanta type as a boot option + * Default is standard setup with ticks staggered over first + * half of tick period. 
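 *
 * Worked example (annotation, not part of the original comment): with a
 * 1 ms tick period and four CPUs, "quanta=aligned" gives every CPU an
 * offset of 0, "quanta=staggered" gives offsets of 0, 250, 500 and 750 us,
 * and the default spreads ticks over the first half of the period, i.e.,
 * offsets of 0, 125, 250 and 375 us (see cpu_stagger_offset() below).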
+ */ +int quanta_type = LINUX_DEFAULT_TICKS; +static int __init tick_set_quanta_type(char *str) +{ + if (strcmp("aligned", str) == 0) { + quanta_type = LITMUS_ALIGNED_TICKS; + printk(KERN_INFO "LITMUS^RT: setting aligned quanta\n"); + } + else if (strcmp("staggered", str) == 0) { + quanta_type = LITMUS_STAGGERED_TICKS; + printk(KERN_INFO "LITMUS^RT: setting staggered quanta\n"); + } + return 1; +} +__setup("quanta=", tick_set_quanta_type); + +u64 cpu_stagger_offset(int cpu) +{ + u64 offset = 0; + switch (quanta_type) { + case LITMUS_ALIGNED_TICKS: + offset = 0; + break; + case LITMUS_STAGGERED_TICKS: + offset = ktime_to_ns(tick_period); + do_div(offset, num_possible_cpus()); + offset *= cpu; + break; + default: + offset = ktime_to_ns(tick_period) >> 1; + do_div(offset, num_possible_cpus()); + offset *= cpu; + } + return offset; +} + /** * tick_setup_sched_timer - setup the tick emulation timer */ @@ -774,6 +814,7 @@ void tick_setup_sched_timer(void) { struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); ktime_t now = ktime_get(); + u64 offset; /* * Emulate tick processing via per-CPU hrtimers: @@ -784,6 +825,12 @@ void tick_setup_sched_timer(void) /* Get the next period (per cpu) */ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update()); + /* Offset must be set correctly to achieve desired quanta type. */ + offset = cpu_stagger_offset(smp_processor_id()); + + /* Add the correct offset to expiration time */ + hrtimer_add_expires_ns(&ts->sched_timer, offset); + for (;;) { hrtimer_forward(&ts->sched_timer, now, tick_period); hrtimer_start_expires(&ts->sched_timer, diff --git a/litmus/Kconfig b/litmus/Kconfig new file mode 100644 index 000000000000..ad8dc8308cf0 --- /dev/null +++ b/litmus/Kconfig @@ -0,0 +1,185 @@ +menu "LITMUS^RT" + +menu "Scheduling" + +config PLUGIN_CEDF + bool "Clustered-EDF" + depends on X86 && SYSFS + default y + help + Include the Clustered EDF (C-EDF) plugin in the kernel. + This is appropriate for large platforms with shared caches. + On smaller platforms (e.g., ARM PB11MPCore), using C-EDF + makes little sense since there aren't any shared caches. + +config PLUGIN_PFAIR + bool "PFAIR" + depends on HIGH_RES_TIMERS && !NO_HZ + default y + help + Include the PFAIR plugin (i.e., the PD^2 scheduler) in the kernel. + The PFAIR plugin requires high resolution timers (for staggered quanta) + and does not support NO_HZ (quanta could be missed when the system is idle). + + If unsure, say Yes. + +config RELEASE_MASTER + bool "Release-master Support" + depends on ARCH_HAS_SEND_PULL_TIMERS + default n + help + Allow one processor to act as a dedicated interrupt processor + that services all timer interrupts, but that does not schedule + real-time tasks. See RTSS'09 paper for details + (http://www.cs.unc.edu/~anderson/papers.html). + Currently only supported by GSN-EDF. + +endmenu + +menu "Real-Time Synchronization" + +config NP_SECTION + bool "Non-preemptive section support" + default n + help + Allow tasks to become non-preemptable. + Note that plugins still need to explicitly support non-preemptivity. + Currently, only GSN-EDF and PSN-EDF have such support. + + This is required to support locking protocols such as the FMLP. + If disabled, all tasks will be considered preemptable at all times. + +config LITMUS_LOCKING + bool "Support for real-time locking protocols" + depends on NP_SECTION + default n + help + Enable LITMUS^RT's deterministic multiprocessor real-time + locking protocols. 
+ + Say Yes if you want to include locking protocols such as the FMLP and + Baker's SRP. + +endmenu + +menu "Tracing" + +config FEATHER_TRACE + bool "Feather-Trace Infrastructure" + default y + help + Feather-Trace basic tracing infrastructure. Includes device file + driver and instrumentation point support. + + There are actually two implementations of Feather-Trace. + 1) A slower, but portable, default implementation. + 2) Architecture-specific implementations that rewrite kernel .text at runtime. + + If enabled, Feather-Trace will be based on 2) if available (currently only for x86). + However, if DEBUG_RODATA=y, then Feather-Trace will choose option 1) in any case + to avoid problems with write-protected .text pages. + + Bottom line: to avoid increased overheads, choose DEBUG_RODATA=n. + + Note that this option only enables the basic Feather-Trace infrastructure; + you still need to enable SCHED_TASK_TRACE and/or SCHED_OVERHEAD_TRACE to + actually enable any events. + +config SCHED_TASK_TRACE + bool "Trace real-time tasks" + depends on FEATHER_TRACE + default y + help + Include support for the sched_trace_XXX() tracing functions. This + allows the collection of real-time task events such as job + completions, job releases, early completions, etc. This results in a + small overhead in the scheduling code. Disable if the overhead is not + acceptable (e.g., benchmarking). + + Say Yes for debugging. + Say No for overhead tracing. + +config SCHED_TASK_TRACE_SHIFT + int "Buffer size for sched_trace_xxx() events" + depends on SCHED_TASK_TRACE + range 8 13 + default 9 + help + + Select the buffer size of sched_trace_xxx() events as a power of two. + These buffers are statically allocated as per-CPU data. Each event + requires 24 bytes storage plus one additional flag byte. Too large + buffers can cause issues with the per-cpu allocator (and waste + memory). Too small buffers can cause scheduling events to be lost. The + "right" size is workload dependent and depends on the number of tasks, + each task's period, each task's number of suspensions, and how often + the buffer is flushed. + + Examples: 12 => 4k events + 10 => 1k events + 8 => 512 events + +config SCHED_OVERHEAD_TRACE + bool "Record timestamps for overhead measurements" + depends on FEATHER_TRACE + default n + help + Export event stream for overhead tracing. + Say Yes for overhead tracing. + +config SCHED_DEBUG_TRACE + bool "TRACE() debugging" + default y + help + Include support for sched_trace_log_message(), which is used to + implement TRACE(). If disabled, no TRACE() messages will be included + in the kernel, and no overheads due to debugging statements will be + incurred by the scheduler. Disable if the overhead is not acceptable + (e.g. benchmarking). + + Say Yes for debugging. + Say No for overhead tracing. + +config SCHED_DEBUG_TRACE_SHIFT + int "Buffer size for TRACE() buffer" + depends on SCHED_DEBUG_TRACE + range 14 22 + default 18 + help + + Select the amount of memory needed for the TRACE() buffer, as a + power of two. The TRACE() buffer is global and statically allocated. If + the buffer is too small, there will be holes in the TRACE() log if the + buffer-flushing task is starved. + + The default should be sufficient for most systems. Increase the buffer + size if the log contains holes. Reduce the buffer size when running on + a memory-constrained system. + + Examples: 14 => 16KB + 18 => 256KB + 20 => 1MB + + This buffer is exported to userspace using a misc device as + 'litmus/log'.
On a system with default udev rules, a corresponding + character device node should be created at /dev/litmus/log. The buffer + can be flushed using cat, e.g., 'cat /dev/litmus/log > my_log_file.txt'. + +config SCHED_DEBUG_TRACE_CALLER + bool "Include [function@file:line] tag in TRACE() log" + depends on SCHED_DEBUG_TRACE + default n + help + With this option enabled, TRACE() prepends + + "[@:]" + + to each message in the debug log. Enable this to aid in figuring out + what was called in which order. The downside is that it adds a lot of + clutter. + + If unsure, say No. + +endmenu + +endmenu diff --git a/litmus/Makefile b/litmus/Makefile new file mode 100644 index 000000000000..e86fad8c25ec --- /dev/null +++ b/litmus/Makefile @@ -0,0 +1,30 @@ +# +# Makefile for LITMUS^RT +# + +obj-y = sched_plugin.o litmus.o \ + preempt.o \ + litmus_proc.o \ + budget.o \ + clustered.o \ + jobs.o \ + sync.o \ + rt_domain.o \ + edf_common.o \ + fp_common.o \ + fdso.o \ + locking.o \ + srp.o \ + bheap.o \ + ctrldev.o \ + sched_gsn_edf.o \ + sched_psn_edf.o \ + sched_pfp.o + +obj-$(CONFIG_PLUGIN_CEDF) += sched_cedf.o +obj-$(CONFIG_PLUGIN_PFAIR) += sched_pfair.o + +obj-$(CONFIG_FEATHER_TRACE) += ft_event.o ftdev.o +obj-$(CONFIG_SCHED_TASK_TRACE) += sched_task_trace.o +obj-$(CONFIG_SCHED_DEBUG_TRACE) += sched_trace.o +obj-$(CONFIG_SCHED_OVERHEAD_TRACE) += trace.o diff --git a/litmus/bheap.c b/litmus/bheap.c new file mode 100644 index 000000000000..528af97f18a6 --- /dev/null +++ b/litmus/bheap.c @@ -0,0 +1,314 @@ +#include "linux/kernel.h" +#include "litmus/bheap.h" + +void bheap_init(struct bheap* heap) +{ + heap->head = NULL; + heap->min = NULL; +} + +void bheap_node_init(struct bheap_node** _h, void* value) +{ + struct bheap_node* h = *_h; + h->parent = NULL; + h->next = NULL; + h->child = NULL; + h->degree = NOT_IN_HEAP; + h->value = value; + h->ref = _h; +} + + +/* make child a subtree of root */ +static void __bheap_link(struct bheap_node* root, + struct bheap_node* child) +{ + child->parent = root; + child->next = root->child; + root->child = child; + root->degree++; +} + +/* merge root lists */ +static struct bheap_node* __bheap_merge(struct bheap_node* a, + struct bheap_node* b) +{ + struct bheap_node* head = NULL; + struct bheap_node** pos = &head; + + while (a && b) { + if (a->degree < b->degree) { + *pos = a; + a = a->next; + } else { + *pos = b; + b = b->next; + } + pos = &(*pos)->next; + } + if (a) + *pos = a; + else + *pos = b; + return head; +} + +/* reverse a linked list of nodes. 
also clears parent pointer */ +static struct bheap_node* __bheap_reverse(struct bheap_node* h) +{ + struct bheap_node* tail = NULL; + struct bheap_node* next; + + if (!h) + return h; + + h->parent = NULL; + while (h->next) { + next = h->next; + h->next = tail; + tail = h; + h = next; + h->parent = NULL; + } + h->next = tail; + return h; +} + +static void __bheap_min(bheap_prio_t higher_prio, struct bheap* heap, + struct bheap_node** prev, struct bheap_node** node) +{ + struct bheap_node *_prev, *cur; + *prev = NULL; + + if (!heap->head) { + *node = NULL; + return; + } + + *node = heap->head; + _prev = heap->head; + cur = heap->head->next; + while (cur) { + if (higher_prio(cur, *node)) { + *node = cur; + *prev = _prev; + } + _prev = cur; + cur = cur->next; + } +} + +static void __bheap_union(bheap_prio_t higher_prio, struct bheap* heap, + struct bheap_node* h2) +{ + struct bheap_node* h1; + struct bheap_node *prev, *x, *next; + if (!h2) + return; + h1 = heap->head; + if (!h1) { + heap->head = h2; + return; + } + h1 = __bheap_merge(h1, h2); + prev = NULL; + x = h1; + next = x->next; + while (next) { + if (x->degree != next->degree || + (next->next && next->next->degree == x->degree)) { + /* nothing to do, advance */ + prev = x; + x = next; + } else if (higher_prio(x, next)) { + /* x becomes the root of next */ + x->next = next->next; + __bheap_link(x, next); + } else { + /* next becomes the root of x */ + if (prev) + prev->next = next; + else + h1 = next; + __bheap_link(next, x); + x = next; + } + next = x->next; + } + heap->head = h1; +} + +static struct bheap_node* __bheap_extract_min(bheap_prio_t higher_prio, + struct bheap* heap) +{ + struct bheap_node *prev, *node; + __bheap_min(higher_prio, heap, &prev, &node); + if (!node) + return NULL; + if (prev) + prev->next = node->next; + else + heap->head = node->next; + __bheap_union(higher_prio, heap, __bheap_reverse(node->child)); + return node; +} + +/* insert (and reinitialize) a node into the heap */ +void bheap_insert(bheap_prio_t higher_prio, struct bheap* heap, + struct bheap_node* node) +{ + struct bheap_node *min; + node->child = NULL; + node->parent = NULL; + node->next = NULL; + node->degree = 0; + if (heap->min && higher_prio(node, heap->min)) { + /* swap min cache */ + min = heap->min; + min->child = NULL; + min->parent = NULL; + min->next = NULL; + min->degree = 0; + __bheap_union(higher_prio, heap, min); + heap->min = node; + } else + __bheap_union(higher_prio, heap, node); +} + +void bheap_uncache_min(bheap_prio_t higher_prio, struct bheap* heap) +{ + struct bheap_node* min; + if (heap->min) { + min = heap->min; + heap->min = NULL; + bheap_insert(higher_prio, heap, min); + } +} + +/* merge addition into target */ +void bheap_union(bheap_prio_t higher_prio, + struct bheap* target, struct bheap* addition) +{ + /* first insert any cached minima, if necessary */ + bheap_uncache_min(higher_prio, target); + bheap_uncache_min(higher_prio, addition); + __bheap_union(higher_prio, target, addition->head); + /* this is a destructive merge */ + addition->head = NULL; +} + +struct bheap_node* bheap_peek(bheap_prio_t higher_prio, + struct bheap* heap) +{ + if (!heap->min) + heap->min = __bheap_extract_min(higher_prio, heap); + return heap->min; +} + +struct bheap_node* bheap_take(bheap_prio_t higher_prio, + struct bheap* heap) +{ + struct bheap_node *node; + if (!heap->min) + heap->min = __bheap_extract_min(higher_prio, heap); + node = heap->min; + heap->min = NULL; + if (node) + node->degree = NOT_IN_HEAP; + return node; +} + +int 
bheap_decrease(bheap_prio_t higher_prio, struct bheap_node* node) +{ + struct bheap_node *parent; + struct bheap_node** tmp_ref; + void* tmp; + + /* bubble up */ + parent = node->parent; + while (parent && higher_prio(node, parent)) { + /* swap parent and node */ + tmp = parent->value; + parent->value = node->value; + node->value = tmp; + /* swap references */ + *(parent->ref) = node; + *(node->ref) = parent; + tmp_ref = parent->ref; + parent->ref = node->ref; + node->ref = tmp_ref; + /* step up */ + node = parent; + parent = node->parent; + } + + return parent != NULL; +} + +void bheap_delete(bheap_prio_t higher_prio, struct bheap* heap, + struct bheap_node* node) +{ + struct bheap_node *parent, *prev, *pos; + struct bheap_node** tmp_ref; + void* tmp; + + if (heap->min != node) { + /* bubble up */ + parent = node->parent; + while (parent) { + /* swap parent and node */ + tmp = parent->value; + parent->value = node->value; + node->value = tmp; + /* swap references */ + *(parent->ref) = node; + *(node->ref) = parent; + tmp_ref = parent->ref; + parent->ref = node->ref; + node->ref = tmp_ref; + /* step up */ + node = parent; + parent = node->parent; + } + /* now delete: + * first find prev */ + prev = NULL; + pos = heap->head; + while (pos != node) { + prev = pos; + pos = pos->next; + } + /* we have prev, now remove node */ + if (prev) + prev->next = node->next; + else + heap->head = node->next; + __bheap_union(higher_prio, heap, __bheap_reverse(node->child)); + } else + heap->min = NULL; + node->degree = NOT_IN_HEAP; +} + +/* allocate a heap node for value and insert into the heap */ +int bheap_add(bheap_prio_t higher_prio, struct bheap* heap, + void* value, int gfp_flags) +{ + struct bheap_node* hn = bheap_node_alloc(gfp_flags); + if (likely(hn)) { + bheap_node_init(&hn, value); + bheap_insert(higher_prio, heap, hn); + } + return hn != NULL; +} + +void* bheap_take_del(bheap_prio_t higher_prio, + struct bheap* heap) +{ + struct bheap_node* hn = bheap_take(higher_prio, heap); + void* ret = NULL; + if (hn) { + ret = hn->value; + bheap_node_free(hn); + } + return ret; +} diff --git a/litmus/budget.c b/litmus/budget.c new file mode 100644 index 000000000000..310e9a3d4172 --- /dev/null +++ b/litmus/budget.c @@ -0,0 +1,111 @@ +#include +#include +#include + +#include +#include + +struct enforcement_timer { + /* The enforcement timer is used to accurately police + * slice budgets. */ + struct hrtimer timer; + int armed; +}; + +DEFINE_PER_CPU(struct enforcement_timer, budget_timer); + +static enum hrtimer_restart on_enforcement_timeout(struct hrtimer *timer) +{ + struct enforcement_timer* et = container_of(timer, + struct enforcement_timer, + timer); + unsigned long flags; + + local_irq_save(flags); + TRACE("enforcement timer fired.\n"); + et->armed = 0; + /* activate scheduler */ + litmus_reschedule_local(); + local_irq_restore(flags); + + return HRTIMER_NORESTART; +} + +/* assumes called with IRQs off */ +static void cancel_enforcement_timer(struct enforcement_timer* et) +{ + int ret; + + TRACE("cancelling enforcement timer.\n"); + + /* Since interrupts are disabled and et->armed is only + * modified locally, we do not need any locks. + */ + + if (et->armed) { + ret = hrtimer_try_to_cancel(&et->timer); + /* Should never be inactive. */ + BUG_ON(ret == 0); + /* Should never be running concurrently. 
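 * (hrtimer_try_to_cancel() returns 0 if the timer was not queued, 1 if it
 * was deactivated, and -1 if its callback is currently executing; since
 * the enforcement timer is armed, fired, and cancelled only on the local
 * CPU with interrupts off, 0 and -1 would both indicate a bug, hence the
 * BUG_ON()s below. Annotation, not part of the original comment.)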
*/ + BUG_ON(ret == -1); + + et->armed = 0; + } +} + +/* assumes called with IRQs off */ +static void arm_enforcement_timer(struct enforcement_timer* et, + struct task_struct* t) +{ + lt_t when_to_fire; + TRACE_TASK(t, "arming enforcement timer.\n"); + + /* Calling this when there is no budget left for the task + * makes no sense, unless the task is non-preemptive. */ + BUG_ON(budget_exhausted(t) && (!is_np(t))); + + /* __hrtimer_start_range_ns() cancels the timer + * anyway, so we don't have to check whether it is still armed */ + + if (likely(!is_np(t))) { + when_to_fire = litmus_clock() + budget_remaining(t); + __hrtimer_start_range_ns(&et->timer, + ns_to_ktime(when_to_fire), + 0 /* delta */, + HRTIMER_MODE_ABS_PINNED, + 0 /* no wakeup */); + et->armed = 1; + } +} + + +/* expects to be called with IRQs off */ +void update_enforcement_timer(struct task_struct* t) +{ + struct enforcement_timer* et = &__get_cpu_var(budget_timer); + + if (t && budget_precisely_enforced(t)) { + /* Make sure we call into the scheduler when this budget + * expires. */ + arm_enforcement_timer(et, t); + } else if (et->armed) { + /* Make sure we don't cause unnecessary interrupts. */ + cancel_enforcement_timer(et); + } +} + + +static int __init init_budget_enforcement(void) +{ + int cpu; + struct enforcement_timer* et; + + for (cpu = 0; cpu < NR_CPUS; cpu++) { + et = &per_cpu(budget_timer, cpu); + hrtimer_init(&et->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + et->timer.function = on_enforcement_timeout; + } + return 0; +} + +module_init(init_budget_enforcement); diff --git a/litmus/clustered.c b/litmus/clustered.c new file mode 100644 index 000000000000..6fe1b512f628 --- /dev/null +++ b/litmus/clustered.c @@ -0,0 +1,111 @@ +#include +#include +#include + +#include + +#ifndef CONFIG_X86 +/* fake get_shared_cpu_map() on non-x86 architectures */ + +int get_shared_cpu_map(cpumask_var_t mask, unsigned int cpu, int index) +{ + if (index != 1) + return 1; + else { + /* Fake L1: CPU is all by itself. */ + cpumask_clear(mask); + cpumask_set_cpu(cpu, mask); + return 0; + } +} + +#endif + +int get_cluster_size(enum cache_level level) +{ + cpumask_var_t mask; + int ok; + int num_cpus; + + if (level == GLOBAL_CLUSTER) + return num_online_cpus(); + else { + if (!zalloc_cpumask_var(&mask, GFP_ATOMIC)) + return -ENOMEM; + /* assumes CPU 0 is representative of all CPUs */ + ok = get_shared_cpu_map(mask, 0, level); + /* ok == 0 means we got the map; otherwise it's an invalid cache level */ + if (ok == 0) + num_cpus = cpumask_weight(mask); + free_cpumask_var(mask); + + if (ok == 0) + return num_cpus; + else + return -EINVAL; + } +} + +int assign_cpus_to_clusters(enum cache_level level, + struct scheduling_cluster* clusters[], + unsigned int num_clusters, + struct cluster_cpu* cpus[], + unsigned int num_cpus) +{ + cpumask_var_t mask; + unsigned int i, free_cluster = 0, low_cpu; + int err = 0; + + if (!zalloc_cpumask_var(&mask, GFP_ATOMIC)) + return -ENOMEM; + + /* clear cluster pointers */ + for (i = 0; i < num_cpus; i++) { + cpus[i]->id = i; + cpus[i]->cluster = NULL; + } + + /* initialize clusters */ + for (i = 0; i < num_clusters; i++) { + clusters[i]->id = i; + INIT_LIST_HEAD(&clusters[i]->cpus); + } + + /* Assign each CPU. Two assumtions are made: + * 1) The index of a cpu in cpus corresponds to its processor id (i.e., the index in a cpu mask). + * 2) All cpus that belong to some cluster are online. 
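 *
 * Example (annotation, not part of the original comment): on a
 * hypothetical eight-CPU machine with two sockets that each share an L3
 * cache, clustering at the L3 level yields two clusters: the
 * lowest-numbered CPU of each shared-cache mask (say, CPU 0 and CPU 4)
 * creates a cluster, and every other CPU joins the cluster of the
 * lowest-numbered CPU in its mask.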
+ */ + for_each_online_cpu(i) { + /* get lowest-id CPU in cluster */ + if (level != GLOBAL_CLUSTER) { + err = get_shared_cpu_map(mask, cpus[i]->id, level); + if (err != 0) { + /* ugh... wrong cache level? Either caller screwed up + * or the CPU topology is weird. */ + printk(KERN_ERR "Could not set up clusters for L%d sharing (max: L%d).\n", + level, err); + err = -EINVAL; + goto out; + } + low_cpu = cpumask_first(mask); + } else + low_cpu = 0; + if (low_cpu == i) { + /* caller must provide an appropriate number of clusters */ + BUG_ON(free_cluster >= num_clusters); + + /* create new cluster */ + cpus[i]->cluster = clusters[free_cluster++]; + } else { + /* low_cpu points to the right cluster + * Assumption: low_cpu is actually online and was processed earlier. */ + cpus[i]->cluster = cpus[low_cpu]->cluster; + } + /* enqueue in cpus list */ + list_add_tail(&cpus[i]->cluster_list, &cpus[i]->cluster->cpus); + printk(KERN_INFO "Assigning CPU%u to cluster %u\n.", i, cpus[i]->cluster->id); + } +out: + free_cpumask_var(mask); + return err; +} diff --git a/litmus/ctrldev.c b/litmus/ctrldev.c new file mode 100644 index 000000000000..6677a67cc945 --- /dev/null +++ b/litmus/ctrldev.c @@ -0,0 +1,150 @@ +#include +#include +#include +#include +#include + +#include + +/* only one page for now, but we might want to add a RO version at some point */ + +#define CTRL_NAME "litmus/ctrl" + +/* allocate t->rt_param.ctrl_page*/ +static int alloc_ctrl_page(struct task_struct *t) +{ + int err = 0; + + /* only allocate if the task doesn't have one yet */ + if (!tsk_rt(t)->ctrl_page) { + tsk_rt(t)->ctrl_page = (void*) get_zeroed_page(GFP_KERNEL); + if (!tsk_rt(t)->ctrl_page) + err = -ENOMEM; + /* will get de-allocated in task teardown */ + TRACE_TASK(t, "%s ctrl_page = %p\n", __FUNCTION__, + tsk_rt(t)->ctrl_page); + } + return err; +} + +static int map_ctrl_page(struct task_struct *t, struct vm_area_struct* vma) +{ + int err; + unsigned long pfn; + + struct page* ctrl = virt_to_page(tsk_rt(t)->ctrl_page); + + /* Increase ref count. Is decreased when vma is destroyed. */ + get_page(ctrl); + + /* compute page frame number */ + pfn = page_to_pfn(ctrl); + + TRACE_CUR(CTRL_NAME + ": mapping %p (pfn:%lx, %lx) to 0x%lx (prot:%lx)\n", + tsk_rt(t)->ctrl_page, pfn, page_to_pfn(ctrl), vma->vm_start, + vma->vm_page_prot); + + /* Map it into the vma. Make sure to use PAGE_SHARED, otherwise + * userspace actually gets a copy-on-write page. */ + err = remap_pfn_range(vma, vma->vm_start, pfn, PAGE_SIZE, PAGE_SHARED); + + if (err) + TRACE_CUR(CTRL_NAME ": remap_pfn_range() failed (%d)\n", err); + + return err; +} + +static void litmus_ctrl_vm_close(struct vm_area_struct* vma) +{ + TRACE_CUR("%s flags=0x%x prot=0x%x\n", __FUNCTION__, + vma->vm_flags, vma->vm_page_prot); + + TRACE_CUR(CTRL_NAME + ": %p:%p vma:%p vma->vm_private_data:%p closed.\n", + (void*) vma->vm_start, (void*) vma->vm_end, vma, + vma->vm_private_data, current->comm, + current->pid); +} + +static int litmus_ctrl_vm_fault(struct vm_area_struct* vma, + struct vm_fault* vmf) +{ + /* This function should never be called, since + * all pages should have been mapped by mmap() + * already. 
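 * (Annotation, not part of the original comment: user space is expected
 * to map the page once, up front, by opening the litmus/ctrl misc device
 * and calling, e.g.,
 * mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
 * a fault here therefore means the control page was never mapped that
 * way.)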
*/ + TRACE_CUR("%s flags=0x%x\n", __FUNCTION__, vma->vm_flags); + + /* nope, you only get one page */ + return VM_FAULT_SIGBUS; +} + +static struct vm_operations_struct litmus_ctrl_vm_ops = { + .close = litmus_ctrl_vm_close, + .fault = litmus_ctrl_vm_fault, +}; + +static int litmus_ctrl_mmap(struct file* filp, struct vm_area_struct* vma) +{ + int err = 0; + + /* first make sure mapper knows what he's doing */ + + /* you can only get one page */ + if (vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EINVAL; + + /* you can only map the "first" page */ + if (vma->vm_pgoff != 0) + return -EINVAL; + + /* you can't share it with anyone */ + if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) + return -EINVAL; + + vma->vm_ops = &litmus_ctrl_vm_ops; + /* this mapping should not be kept across forks, + * and cannot be expanded */ + vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND; + + err = alloc_ctrl_page(current); + if (!err) + err = map_ctrl_page(current, vma); + + TRACE_CUR("%s flags=0x%x prot=0x%lx\n", + __FUNCTION__, vma->vm_flags, vma->vm_page_prot); + + return err; +} + +static struct file_operations litmus_ctrl_fops = { + .owner = THIS_MODULE, + .mmap = litmus_ctrl_mmap, +}; + +static struct miscdevice litmus_ctrl_dev = { + .name = CTRL_NAME, + .minor = MISC_DYNAMIC_MINOR, + .fops = &litmus_ctrl_fops, +}; + +static int __init init_litmus_ctrl_dev(void) +{ + int err; + + BUILD_BUG_ON(sizeof(struct control_page) > PAGE_SIZE); + + printk("Initializing LITMUS^RT control device.\n"); + err = misc_register(&litmus_ctrl_dev); + if (err) + printk("Could not allocate %s device (%d).\n", CTRL_NAME, err); + return err; +} + +static void __exit exit_litmus_ctrl_dev(void) +{ + misc_deregister(&litmus_ctrl_dev); +} + +module_init(init_litmus_ctrl_dev); +module_exit(exit_litmus_ctrl_dev); diff --git a/litmus/edf_common.c b/litmus/edf_common.c new file mode 100644 index 000000000000..c7d02ec2e15b --- /dev/null +++ b/litmus/edf_common.c @@ -0,0 +1,143 @@ +/* + * kernel/edf_common.c + * + * Common functions for EDF based scheduler. + */ + +#include +#include +#include + +#include +#include +#include + +#include + + +#ifdef CONFIG_LITMUS_LOCKING +int edf_higher_base_prio(struct task_struct* first, + struct task_struct* second) +{ + struct task_struct *first_task = first; + struct task_struct *second_task = second; + + /* check for NULL tasks */ + if (!first || !second) + return first && !second; + + return !is_realtime(second_task) || + earlier_deadline(first_task, second_task) || + (get_deadline(first_task) == get_deadline(second_task) && + first_task->pid < second_task->pid); +} + +int edf_pending_order(struct bheap_node* a, struct bheap_node* b) +{ + return edf_higher_base_prio(bheap2task(a), bheap2task(b)); +} + +#endif + +/* edf_higher_prio - returns true if first has a higher EDF priority + * than second. Deadline ties are broken by PID. + * + * both first and second may be NULL + */ +int edf_higher_prio(struct task_struct* first, + struct task_struct* second) +{ + struct task_struct *first_task = first; + struct task_struct *second_task = second; + + /* There is no point in comparing a task to itself. */ + if (first && first == second) { + TRACE_TASK(first, + "WARNING: pointless edf priority comparison.\n"); + return 0; + } + + + /* check for NULL tasks */ + if (!first || !second) + return first && !second; + +#ifdef CONFIG_LITMUS_LOCKING + + /* Check for inherited priorities. Change task + * used for comparison in such a case. 
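 * (Example, added as an annotation: if task A currently inherits from a
 * task with deadline 10 while task B inherits nothing and has deadline
 * 12, A is compared using deadline 10 and therefore has higher priority;
 * deadline ties are broken by the smaller PID further below.)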
+ */ + if (unlikely(first->rt_param.inh_task)) + first_task = first->rt_param.inh_task; + if (unlikely(second->rt_param.inh_task)) + second_task = second->rt_param.inh_task; + + /* Check for priority boosting. Tie-break by start of boosting. + */ + if (unlikely(is_priority_boosted(first_task))) { + /* first_task is boosted, how about second_task? */ + if (!is_priority_boosted(second_task) || + lt_before(get_boost_start(first_task), + get_boost_start(second_task))) + return 1; + else + return 0; + } else if (unlikely(is_priority_boosted(second_task))) + /* second_task is boosted, first is not*/ + return 0; + +#endif + + + return !is_realtime(second_task) || + + /* is the deadline of the first task earlier? + * Then it has higher priority. + */ + earlier_deadline(first_task, second_task) || + + /* Do we have a deadline tie? + * Then break by PID. + */ + (get_deadline(first_task) == get_deadline(second_task) && + (first_task->pid < second_task->pid || + + /* If the PIDs are the same then the task with the inherited + * priority wins. + */ + (first_task->pid == second_task->pid && + !second->rt_param.inh_task))); +} + +int edf_ready_order(struct bheap_node* a, struct bheap_node* b) +{ + return edf_higher_prio(bheap2task(a), bheap2task(b)); +} + +void edf_domain_init(rt_domain_t* rt, check_resched_needed_t resched, + release_jobs_t release) +{ + rt_domain_init(rt, edf_ready_order, resched, release); +} + +/* need_to_preempt - check whether the task t needs to be preempted + * call only with irqs disabled and with ready_lock acquired + * THIS DOES NOT TAKE NON-PREEMPTIVE SECTIONS INTO ACCOUNT! + */ +int edf_preemption_needed(rt_domain_t* rt, struct task_struct *t) +{ + /* we need the read lock for edf_ready_queue */ + /* no need to preempt if there is nothing pending */ + if (!__jobs_pending(rt)) + return 0; + /* we need to reschedule if t doesn't exist */ + if (!t) + return 1; + + /* NOTE: We cannot check for non-preemptibility since we + * don't know what address space we're currently in. + */ + + /* make sure to get non-rt stuff out of the way */ + return !is_realtime(t) || edf_higher_prio(__next_ready(rt), t); +} diff --git a/litmus/fdso.c b/litmus/fdso.c new file mode 100644 index 000000000000..2c629598e3c9 --- /dev/null +++ b/litmus/fdso.c @@ -0,0 +1,297 @@ +/* fdso.c - file descriptor attached shared objects + * + * (c) 2007 B. Brandenburg, LITMUS^RT project + * + * Notes: + * - objects descriptor (OD) tables are not cloned during a fork. + * - objects are created on-demand, and freed after the last reference + * is dropped. + * - for now, object types are hard coded. + * - As long as we have live objects, we keep a reference to the inode. 
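 * - (Annotation, not part of the original notes: the intended flow is
 *   that a task opens some file to name the object, calls
 *   sys_od_open(fd, type, id, config) to obtain an object descriptor --
 *   an index into its od_table -- passes that descriptor to the locking
 *   syscalls, and finally calls sys_od_close(od) to drop its reference.)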
+ */ + +#include +#include +#include +#include +#include + +#include + +extern struct fdso_ops generic_lock_ops; + +static const struct fdso_ops* fdso_ops[] = { + &generic_lock_ops, /* FMLP_SEM */ + &generic_lock_ops, /* SRP_SEM */ + &generic_lock_ops, /* MPCP_SEM */ + &generic_lock_ops, /* MPCP_VS_SEM */ + &generic_lock_ops, /* DPCP_SEM */ + &generic_lock_ops, /* OMLP_SEM */ +}; + +static int fdso_create(void** obj_ref, obj_type_t type, void* __user config) +{ + if (fdso_ops[type]->create) + return fdso_ops[type]->create(obj_ref, type, config); + else + return -EINVAL; +} + +static void fdso_destroy(obj_type_t type, void* obj) +{ + fdso_ops[type]->destroy(type, obj); +} + +static int fdso_open(struct od_table_entry* entry, void* __user config) +{ + if (fdso_ops[entry->obj->type]->open) + return fdso_ops[entry->obj->type]->open(entry, config); + else + return 0; +} + +static int fdso_close(struct od_table_entry* entry) +{ + if (fdso_ops[entry->obj->type]->close) + return fdso_ops[entry->obj->type]->close(entry); + else + return 0; +} + +/* inode must be locked already */ +static int alloc_inode_obj(struct inode_obj_id** obj_ref, + struct inode* inode, + obj_type_t type, + unsigned int id, + void* __user config) +{ + struct inode_obj_id* obj; + void* raw_obj; + int err; + + obj = kmalloc(sizeof(*obj), GFP_KERNEL); + if (!obj) { + return -ENOMEM; + } + + err = fdso_create(&raw_obj, type, config); + if (err != 0) { + kfree(obj); + return err; + } + + INIT_LIST_HEAD(&obj->list); + atomic_set(&obj->count, 1); + obj->type = type; + obj->id = id; + obj->obj = raw_obj; + obj->inode = inode; + + list_add(&obj->list, &inode->i_obj_list); + atomic_inc(&inode->i_count); + + printk(KERN_DEBUG "alloc_inode_obj(%p, %d, %d): object created\n", inode, type, id); + + *obj_ref = obj; + return 0; +} + +/* inode must be locked already */ +static struct inode_obj_id* get_inode_obj(struct inode* inode, + obj_type_t type, + unsigned int id) +{ + struct list_head* pos; + struct inode_obj_id* obj = NULL; + + list_for_each(pos, &inode->i_obj_list) { + obj = list_entry(pos, struct inode_obj_id, list); + if (obj->id == id && obj->type == type) { + atomic_inc(&obj->count); + return obj; + } + } + printk(KERN_DEBUG "get_inode_obj(%p, %d, %d): couldn't find object\n", inode, type, id); + return NULL; +} + + +static void put_inode_obj(struct inode_obj_id* obj) +{ + struct inode* inode; + int let_go = 0; + + inode = obj->inode; + if (atomic_dec_and_test(&obj->count)) { + + mutex_lock(&inode->i_obj_mutex); + /* no new references can be obtained */ + if (!atomic_read(&obj->count)) { + list_del(&obj->list); + fdso_destroy(obj->type, obj->obj); + kfree(obj); + let_go = 1; + } + mutex_unlock(&inode->i_obj_mutex); + if (let_go) + iput(inode); + } +} + +static struct od_table_entry* get_od_entry(struct task_struct* t) +{ + struct od_table_entry* table; + int i; + + + table = t->od_table; + if (!table) { + table = kzalloc(sizeof(*table) * MAX_OBJECT_DESCRIPTORS, + GFP_KERNEL); + t->od_table = table; + } + + for (i = 0; table && i < MAX_OBJECT_DESCRIPTORS; i++) + if (!table[i].used) { + table[i].used = 1; + return table + i; + } + return NULL; +} + +static int put_od_entry(struct od_table_entry* od) +{ + put_inode_obj(od->obj); + od->used = 0; + return 0; +} + +void exit_od_table(struct task_struct* t) +{ + int i; + + if (t->od_table) { + for (i = 0; i < MAX_OBJECT_DESCRIPTORS; i++) + if (t->od_table[i].used) + put_od_entry(t->od_table + i); + kfree(t->od_table); + t->od_table = NULL; + } +} + +static int do_sys_od_open(struct 
file* file, obj_type_t type, int id, + void* __user config) +{ + int idx = 0, err = 0; + struct inode* inode; + struct inode_obj_id* obj = NULL; + struct od_table_entry* entry; + + inode = file->f_dentry->d_inode; + + entry = get_od_entry(current); + if (!entry) + return -ENOMEM; + + mutex_lock(&inode->i_obj_mutex); + obj = get_inode_obj(inode, type, id); + if (!obj) + err = alloc_inode_obj(&obj, inode, type, id, config); + if (err != 0) { + obj = NULL; + idx = err; + entry->used = 0; + } else { + entry->obj = obj; + entry->class = fdso_ops[type]; + idx = entry - current->od_table; + } + + mutex_unlock(&inode->i_obj_mutex); + + /* open only if creation succeeded */ + if (!err) + err = fdso_open(entry, config); + if (err < 0) { + /* The class rejected the open call. + * We need to clean up and tell user space. + */ + if (obj) + put_od_entry(entry); + idx = err; + } + + return idx; +} + + +struct od_table_entry* get_entry_for_od(int od) +{ + struct task_struct *t = current; + + if (!t->od_table) + return NULL; + if (od < 0 || od >= MAX_OBJECT_DESCRIPTORS) + return NULL; + if (!t->od_table[od].used) + return NULL; + return t->od_table + od; +} + + +asmlinkage long sys_od_open(int fd, int type, int obj_id, void* __user config) +{ + int ret = 0; + struct file* file; + + /* + 1) get file from fd, get inode from file + 2) lock inode + 3) try to lookup object + 4) if not present create and enqueue object, inc inode refcnt + 5) increment refcnt of object + 6) alloc od_table_entry, setup ptrs + 7) unlock inode + 8) return offset in od_table as OD + */ + + if (type < MIN_OBJ_TYPE || type > MAX_OBJ_TYPE) { + ret = -EINVAL; + goto out; + } + + file = fget(fd); + if (!file) { + ret = -EBADF; + goto out; + } + + ret = do_sys_od_open(file, type, obj_id, config); + + fput(file); + +out: + return ret; +} + + +asmlinkage long sys_od_close(int od) +{ + int ret = -EINVAL; + struct task_struct *t = current; + + if (od < 0 || od >= MAX_OBJECT_DESCRIPTORS) + return ret; + + if (!t->od_table || !t->od_table[od].used) + return ret; + + + /* give the class a chance to reject the close + */ + ret = fdso_close(t->od_table + od); + if (ret == 0) + ret = put_od_entry(t->od_table + od); + + return ret; +} diff --git a/litmus/fp_common.c b/litmus/fp_common.c new file mode 100644 index 000000000000..31fc2db20adf --- /dev/null +++ b/litmus/fp_common.c @@ -0,0 +1,119 @@ +/* + * litmus/fp_common.c + * + * Common functions for fixed-priority scheduler. + */ + +#include +#include +#include + +#include +#include +#include + +#include + +/* fp_higher_prio - returns true if first has a higher static priority + * than second. Deadline ties are broken by PID. + * + * both first and second may be NULL + */ +int fp_higher_prio(struct task_struct* first, + struct task_struct* second) +{ + struct task_struct *first_task = first; + struct task_struct *second_task = second; + + /* There is no point in comparing a task to itself. */ + if (unlikely(first && first == second)) { + TRACE_TASK(first, + "WARNING: pointless FP priority comparison.\n"); + return 0; + } + + + /* check for NULL tasks */ + if (!first || !second) + return first && !second; + +#ifdef CONFIG_LITMUS_LOCKING + + /* Check for inherited priorities. Change task + * used for comparison in such a case. + */ + if (unlikely(first->rt_param.inh_task)) + first_task = first->rt_param.inh_task; + if (unlikely(second->rt_param.inh_task)) + second_task = second->rt_param.inh_task; + + /* Check for priority boosting. Tie-break by start of boosting. 
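 * (Example, added as an annotation: a task that is priority-boosted while
 * holding a lock beats any non-boosted task regardless of static
 * priority, and of two boosted tasks the one whose boosting started
 * earlier wins.)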
+ */ + if (unlikely(is_priority_boosted(first_task))) { + /* first_task is boosted, how about second_task? */ + if (!is_priority_boosted(second_task) || + lt_before(get_boost_start(first_task), + get_boost_start(second_task))) + return 1; + else + return 0; + } else if (unlikely(is_priority_boosted(second_task))) + /* second_task is boosted, first is not*/ + return 0; + +#endif + + + return !is_realtime(second_task) || + + get_priority(first_task) < get_priority(second_task) || + + /* Break by PID. + */ + (get_priority(first_task) == get_priority(second_task) && + (first_task->pid < second_task->pid || + + /* If the PIDs are the same then the task with the inherited + * priority wins. + */ + (first_task->pid == second_task->pid && + !second->rt_param.inh_task))); +} + +int fp_ready_order(struct bheap_node* a, struct bheap_node* b) +{ + return fp_higher_prio(bheap2task(a), bheap2task(b)); +} + +void fp_domain_init(rt_domain_t* rt, check_resched_needed_t resched, + release_jobs_t release) +{ + rt_domain_init(rt, fp_ready_order, resched, release); +} + +/* need_to_preempt - check whether the task t needs to be preempted + */ +int fp_preemption_needed(struct fp_prio_queue *q, struct task_struct *t) +{ + struct task_struct *pending; + + pending = fp_prio_peek(q); + + if (!pending) + return 0; + if (!t) + return 1; + + /* make sure to get non-rt stuff out of the way */ + return !is_realtime(t) || fp_higher_prio(pending, t); +} + +void fp_prio_queue_init(struct fp_prio_queue* q) +{ + int i; + + for (i = 0; i < FP_PRIO_BIT_WORDS; i++) + q->bitmask[i] = 0; + for (i = 0; i < LITMUS_MAX_PRIORITY; i++) + bheap_init(&q->queue[i]); +} diff --git a/litmus/ft_event.c b/litmus/ft_event.c new file mode 100644 index 000000000000..399a07becca5 --- /dev/null +++ b/litmus/ft_event.c @@ -0,0 +1,43 @@ +#include + +#include + +#if !defined(CONFIG_ARCH_HAS_FEATHER_TRACE) || defined(CONFIG_DEBUG_RODATA) +/* provide dummy implementation */ + +int ft_events[MAX_EVENTS]; + +int ft_enable_event(unsigned long id) +{ + if (id < MAX_EVENTS) { + ft_events[id]++; + return 1; + } else + return 0; +} + +int ft_disable_event(unsigned long id) +{ + if (id < MAX_EVENTS && ft_events[id]) { + ft_events[id]--; + return 1; + } else + return 0; +} + +int ft_disable_all_events(void) +{ + int i; + + for (i = 0; i < MAX_EVENTS; i++) + ft_events[i] = 0; + + return MAX_EVENTS; +} + +int ft_is_event_enabled(unsigned long id) +{ + return id < MAX_EVENTS && ft_events[id]; +} + +#endif diff --git a/litmus/ftdev.c b/litmus/ftdev.c new file mode 100644 index 000000000000..99bc39ffbcef --- /dev/null +++ b/litmus/ftdev.c @@ -0,0 +1,446 @@ +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +struct ft_buffer* alloc_ft_buffer(unsigned int count, size_t size) +{ + struct ft_buffer* buf; + size_t total = (size + 1) * count; + char* mem; + int order = 0, pages = 1; + + buf = kmalloc(sizeof(*buf), GFP_KERNEL); + if (!buf) + return NULL; + + total = (total / PAGE_SIZE) + (total % PAGE_SIZE != 0); + while (pages < total) { + order++; + pages *= 2; + } + + mem = (char*) __get_free_pages(GFP_KERNEL, order); + if (!mem) { + kfree(buf); + return NULL; + } + + if (!init_ft_buffer(buf, count, size, + mem + (count * size), /* markers at the end */ + mem)) { /* buffer objects */ + free_pages((unsigned long) mem, order); + kfree(buf); + return NULL; + } + return buf; +} + +void free_ft_buffer(struct ft_buffer* buf) +{ + int order = 0, pages = 1; + size_t total; + + if (buf) { + total = (buf->slot_size + 1) * 
buf->slot_count; + total = (total / PAGE_SIZE) + (total % PAGE_SIZE != 0); + while (pages < total) { + order++; + pages *= 2; + } + free_pages((unsigned long) buf->buffer_mem, order); + kfree(buf); + } +} + +struct ftdev_event { + int id; + struct ftdev_event* next; +}; + +static int activate(struct ftdev_event** chain, int id) +{ + struct ftdev_event* ev = kmalloc(sizeof(*ev), GFP_KERNEL); + if (ev) { + printk(KERN_INFO + "Enabling feather-trace event %d.\n", (int) id); + ft_enable_event(id); + ev->id = id; + ev->next = *chain; + *chain = ev; + } + return ev ? 0 : -ENOMEM; +} + +static void deactivate(struct ftdev_event** chain, int id) +{ + struct ftdev_event **cur = chain; + struct ftdev_event *nxt; + while (*cur) { + if ((*cur)->id == id) { + nxt = (*cur)->next; + kfree(*cur); + *cur = nxt; + printk(KERN_INFO + "Disabling feather-trace event %d.\n", (int) id); + ft_disable_event(id); + break; + } + cur = &(*cur)->next; + } +} + +static int ftdev_open(struct inode *in, struct file *filp) +{ + struct ftdev* ftdev; + struct ftdev_minor* ftdm; + unsigned int buf_idx = iminor(in); + int err = 0; + + ftdev = container_of(in->i_cdev, struct ftdev, cdev); + + if (buf_idx >= ftdev->minor_cnt) { + err = -ENODEV; + goto out; + } + if (ftdev->can_open && (err = ftdev->can_open(ftdev, buf_idx))) + goto out; + + ftdm = ftdev->minor + buf_idx; + ftdm->ftdev = ftdev; + filp->private_data = ftdm; + + if (mutex_lock_interruptible(&ftdm->lock)) { + err = -ERESTARTSYS; + goto out; + } + + if (!ftdm->readers && ftdev->alloc) + err = ftdev->alloc(ftdev, buf_idx); + if (0 == err) + ftdm->readers++; + + mutex_unlock(&ftdm->lock); +out: + return err; +} + +static int ftdev_release(struct inode *in, struct file *filp) +{ + struct ftdev* ftdev; + struct ftdev_minor* ftdm; + unsigned int buf_idx = iminor(in); + int err = 0; + + ftdev = container_of(in->i_cdev, struct ftdev, cdev); + + if (buf_idx >= ftdev->minor_cnt) { + err = -ENODEV; + goto out; + } + ftdm = ftdev->minor + buf_idx; + + if (mutex_lock_interruptible(&ftdm->lock)) { + err = -ERESTARTSYS; + goto out; + } + + if (ftdm->readers == 1) { + while (ftdm->events) + deactivate(&ftdm->events, ftdm->events->id); + + /* wait for any pending events to complete */ + set_current_state(TASK_UNINTERRUPTIBLE); + schedule_timeout(HZ); + + printk(KERN_ALERT "Failed trace writes: %u\n", + ftdm->buf->failed_writes); + + if (ftdev->free) + ftdev->free(ftdev, buf_idx); + } + + ftdm->readers--; + mutex_unlock(&ftdm->lock); +out: + return err; +} + +/* based on ft_buffer_read + * @returns < 0 : page fault + * = 0 : no data available + * = 1 : one slot copied + */ +static int ft_buffer_copy_to_user(struct ft_buffer* buf, char __user *dest) +{ + unsigned int idx; + int err = 0; + if (buf->free_count != buf->slot_count) { + /* data available */ + idx = buf->read_idx % buf->slot_count; + if (buf->slots[idx] == SLOT_READY) { + err = copy_to_user(dest, ((char*) buf->buffer_mem) + + idx * buf->slot_size, + buf->slot_size); + if (err == 0) { + /* copy ok */ + buf->slots[idx] = SLOT_FREE; + buf->read_idx++; + fetch_and_inc(&buf->free_count); + err = 1; + } + } + } + return err; +} + +static ssize_t ftdev_read(struct file *filp, + char __user *to, size_t len, loff_t *f_pos) +{ + /* we ignore f_pos, this is strictly sequential */ + + ssize_t err = 0; + size_t chunk; + int copied; + struct ftdev_minor* ftdm = filp->private_data; + + if (mutex_lock_interruptible(&ftdm->lock)) { + err = -ERESTARTSYS; + goto out; + } + + + chunk = ftdm->buf->slot_size; + while (len >= chunk) { + copied = 
ft_buffer_copy_to_user(ftdm->buf, to); + if (copied == 1) { + len -= chunk; + to += chunk; + err += chunk; + } else if (err == 0 && copied == 0 && ftdm->events) { + /* Only wait if there are any events enabled and only + * if we haven't copied some data yet. We cannot wait + * here with copied data because that data would get + * lost if the task is interrupted (e.g., killed). + */ + mutex_unlock(&ftdm->lock); + set_current_state(TASK_INTERRUPTIBLE); + + schedule_timeout(50); + + if (signal_pending(current)) { + if (err == 0) + /* nothing read yet, signal problem */ + err = -ERESTARTSYS; + goto out; + } + if (mutex_lock_interruptible(&ftdm->lock)) { + err = -ERESTARTSYS; + goto out; + } + } else if (copied < 0) { + /* page fault */ + err = copied; + break; + } else + /* nothing left to get, return to user space */ + break; + } + mutex_unlock(&ftdm->lock); +out: + return err; +} + +static long ftdev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + long err = -ENOIOCTLCMD; + struct ftdev_minor* ftdm = filp->private_data; + + if (mutex_lock_interruptible(&ftdm->lock)) { + err = -ERESTARTSYS; + goto out; + } + + /* FIXME: check id against list of acceptable events */ + + switch (cmd) { + case FTDEV_ENABLE_CMD: + if (activate(&ftdm->events, arg)) + err = -ENOMEM; + else + err = 0; + break; + + case FTDEV_DISABLE_CMD: + deactivate(&ftdm->events, arg); + err = 0; + break; + + default: + printk(KERN_DEBUG "ftdev: strange ioctl (%u, %lu)\n", cmd, arg); + }; + + mutex_unlock(&ftdm->lock); +out: + return err; +} + +static ssize_t ftdev_write(struct file *filp, const char __user *from, + size_t len, loff_t *f_pos) +{ + struct ftdev_minor* ftdm = filp->private_data; + ssize_t err = -EINVAL; + struct ftdev* ftdev = ftdm->ftdev; + + /* dispatch write to buffer-specific code, if available */ + if (ftdev->write) + err = ftdev->write(ftdm->buf, len, from); + + return err; +} + +struct file_operations ftdev_fops = { + .owner = THIS_MODULE, + .open = ftdev_open, + .release = ftdev_release, + .write = ftdev_write, + .read = ftdev_read, + .unlocked_ioctl = ftdev_ioctl, +}; + +int ftdev_init( struct ftdev* ftdev, struct module* owner, + const int minor_cnt, const char* name) +{ + int i, err; + + BUG_ON(minor_cnt < 1); + + cdev_init(&ftdev->cdev, &ftdev_fops); + ftdev->name = name; + ftdev->minor_cnt = minor_cnt; + ftdev->cdev.owner = owner; + ftdev->cdev.ops = &ftdev_fops; + ftdev->alloc = NULL; + ftdev->free = NULL; + ftdev->can_open = NULL; + ftdev->write = NULL; + + ftdev->minor = kcalloc(ftdev->minor_cnt, sizeof(*ftdev->minor), + GFP_KERNEL); + if (!ftdev->minor) { + printk(KERN_WARNING "ftdev(%s): Could not allocate memory\n", + ftdev->name); + err = -ENOMEM; + goto err_out; + } + + for (i = 0; i < ftdev->minor_cnt; i++) { + mutex_init(&ftdev->minor[i].lock); + ftdev->minor[i].readers = 0; + ftdev->minor[i].buf = NULL; + ftdev->minor[i].events = NULL; + } + + ftdev->class = class_create(owner, ftdev->name); + if (IS_ERR(ftdev->class)) { + err = PTR_ERR(ftdev->class); + printk(KERN_WARNING "ftdev(%s): " + "Could not create device class.\n", ftdev->name); + goto err_dealloc; + } + + return 0; + +err_dealloc: + kfree(ftdev->minor); +err_out: + return err; +} + +/* + * Destroy minor devices up to, but not including, up_to. + */ +static void ftdev_device_destroy(struct ftdev* ftdev, unsigned int up_to) +{ + dev_t minor_cntr; + + if (up_to < 1) + up_to = (ftdev->minor_cnt < 1) ? 
0 : ftdev->minor_cnt; + + for (minor_cntr = 0; minor_cntr < up_to; ++minor_cntr) + device_destroy(ftdev->class, MKDEV(ftdev->major, minor_cntr)); +} + +void ftdev_exit(struct ftdev* ftdev) +{ + printk("ftdev(%s): Exiting\n", ftdev->name); + ftdev_device_destroy(ftdev, -1); + cdev_del(&ftdev->cdev); + unregister_chrdev_region(MKDEV(ftdev->major, 0), ftdev->minor_cnt); + class_destroy(ftdev->class); + kfree(ftdev->minor); +} + +int register_ftdev(struct ftdev* ftdev) +{ + struct device **device; + dev_t trace_dev_tmp, minor_cntr; + int err; + + err = alloc_chrdev_region(&trace_dev_tmp, 0, ftdev->minor_cnt, + ftdev->name); + if (err) { + printk(KERN_WARNING "ftdev(%s): " + "Could not allocate char. device region (%d minors)\n", + ftdev->name, ftdev->minor_cnt); + goto err_out; + } + + ftdev->major = MAJOR(trace_dev_tmp); + + err = cdev_add(&ftdev->cdev, trace_dev_tmp, ftdev->minor_cnt); + if (err) { + printk(KERN_WARNING "ftdev(%s): " + "Could not add cdev for major %u with %u minor(s).\n", + ftdev->name, ftdev->major, ftdev->minor_cnt); + goto err_unregister; + } + + /* create the minor device(s) */ + for (minor_cntr = 0; minor_cntr < ftdev->minor_cnt; ++minor_cntr) + { + trace_dev_tmp = MKDEV(ftdev->major, minor_cntr); + device = &ftdev->minor[minor_cntr].device; + + *device = device_create(ftdev->class, NULL, trace_dev_tmp, NULL, + "litmus/%s%d", ftdev->name, minor_cntr); + if (IS_ERR(*device)) { + err = PTR_ERR(*device); + printk(KERN_WARNING "ftdev(%s): " + "Could not create device major/minor number " + "%u/%u\n", ftdev->name, ftdev->major, + minor_cntr); + printk(KERN_WARNING "ftdev(%s): " + "will attempt deletion of allocated devices.\n", + ftdev->name); + goto err_minors; + } + } + + return 0; + +err_minors: + ftdev_device_destroy(ftdev, minor_cntr); + cdev_del(&ftdev->cdev); +err_unregister: + unregister_chrdev_region(MKDEV(ftdev->major, 0), ftdev->minor_cnt); +err_out: + return err; +} diff --git a/litmus/jobs.c b/litmus/jobs.c new file mode 100644 index 000000000000..36e314625d86 --- /dev/null +++ b/litmus/jobs.c @@ -0,0 +1,43 @@ +/* litmus/jobs.c - common job control code + */ + +#include + +#include +#include + +void prepare_for_next_period(struct task_struct *t) +{ + BUG_ON(!t); + /* prepare next release */ + t->rt_param.job_params.release = t->rt_param.job_params.deadline; + t->rt_param.job_params.deadline += get_rt_period(t); + t->rt_param.job_params.exec_time = 0; + /* update job sequence number */ + t->rt_param.job_params.job_no++; + + /* don't confuse Linux */ + t->rt.time_slice = 1; +} + +void release_at(struct task_struct *t, lt_t start) +{ + t->rt_param.job_params.deadline = start; + prepare_for_next_period(t); + set_rt_flags(t, RT_F_RUNNING); +} + + +/* + * Deactivate current task until the beginning of the next period. + */ +long complete_job(void) +{ + /* Mark that we do not excute anymore */ + set_rt_flags(current, RT_F_SLEEP); + /* call schedule, this will return when a new job arrives + * it also takes care of preparing for the next release + */ + schedule(); + return 0; +} diff --git a/litmus/litmus.c b/litmus/litmus.c new file mode 100644 index 000000000000..b22f84a02010 --- /dev/null +++ b/litmus/litmus.c @@ -0,0 +1,555 @@ +/* + * litmus.c -- Implementation of the LITMUS syscalls, + * the LITMUS intialization code, + * and the procfs interface.. 
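+ *
+ * Rough user-space usage sketch (hedged: only struct rt_task, its fields,
+ * and the syscalls implemented below are defined by this patch; the wrapper
+ * names set_rt_task_param(), task_mode(), sleep_next_period() and the
+ * LITMUS_RT_TASK constant follow the usual liblitmus conventions and are
+ * assumptions for illustration, as are have_work()/do_job()):
+ *
+ *     struct rt_task param = {
+ *             .exec_cost     =  10000000ULL, /* 10 ms WCET, in ns        */
+ *             .period        = 100000000ULL, /* 100 ms period, in ns     */
+ *             .cpu           = 0,            /* partition, if applicable */
+ *             .budget_policy = NO_ENFORCEMENT,
+ *             .priority      = 0,            /* < LITMUS_MAX_PRIORITY    */
+ *     };
+ *     set_rt_task_param(getpid(), &param);   /* -> sys_set_rt_task_param() */
+ *     task_mode(LITMUS_RT_TASK);             /* become a real-time task    */
+ *     while (have_work()) {
+ *             do_job();
+ *             sleep_next_period();           /* -> sys_complete_job()      */
+ *     }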
+ */ +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +/* Number of RT tasks that exist in the system */ +atomic_t rt_task_count = ATOMIC_INIT(0); +static DEFINE_RAW_SPINLOCK(task_transition_lock); +/* synchronize plugin switching */ +atomic_t cannot_use_plugin = ATOMIC_INIT(0); + +/* Give log messages sequential IDs. */ +atomic_t __log_seq_no = ATOMIC_INIT(0); + +#ifdef CONFIG_RELEASE_MASTER +/* current master CPU for handling timer IRQs */ +atomic_t release_master_cpu = ATOMIC_INIT(NO_CPU); +#endif + +static struct kmem_cache * bheap_node_cache; +extern struct kmem_cache * release_heap_cache; + +struct bheap_node* bheap_node_alloc(int gfp_flags) +{ + return kmem_cache_alloc(bheap_node_cache, gfp_flags); +} + +void bheap_node_free(struct bheap_node* hn) +{ + kmem_cache_free(bheap_node_cache, hn); +} + +struct release_heap* release_heap_alloc(int gfp_flags); +void release_heap_free(struct release_heap* rh); + +/* + * sys_set_task_rt_param + * @pid: Pid of the task which scheduling parameters must be changed + * @param: New real-time extension parameters such as the execution cost and + * period + * Syscall for manipulating with task rt extension params + * Returns EFAULT if param is NULL. + * ESRCH if pid is not corrsponding + * to a valid task. + * EINVAL if either period or execution cost is <=0 + * EPERM if pid is a real-time task + * 0 if success + * + * Only non-real-time tasks may be configured with this system call + * to avoid races with the scheduler. In practice, this means that a + * task's parameters must be set _before_ calling sys_prepare_rt_task() + * + * find_task_by_vpid() assumes that we are in the same namespace of the + * target. + */ +asmlinkage long sys_set_rt_task_param(pid_t pid, struct rt_task __user * param) +{ + struct rt_task tp; + struct task_struct *target; + int retval = -EINVAL; + + printk("Setting up rt task parameters for process %d.\n", pid); + + if (pid < 0 || param == 0) { + goto out; + } + if (copy_from_user(&tp, param, sizeof(tp))) { + retval = -EFAULT; + goto out; + } + + /* Task search and manipulation must be protected */ + read_lock_irq(&tasklist_lock); + if (!(target = find_task_by_vpid(pid))) { + retval = -ESRCH; + goto out_unlock; + } + + if (is_realtime(target)) { + /* The task is already a real-time task. + * We cannot not allow parameter changes at this point. 
+ */ + retval = -EBUSY; + goto out_unlock; + } + + if (tp.exec_cost <= 0) + goto out_unlock; + if (tp.period <= 0) + goto out_unlock; + if (!cpu_online(tp.cpu)) + goto out_unlock; + if (tp.period < tp.exec_cost) + { + printk(KERN_INFO "litmus: real-time task %d rejected " + "because wcet > period\n", pid); + goto out_unlock; + } + if (tp.budget_policy != NO_ENFORCEMENT && + tp.budget_policy != QUANTUM_ENFORCEMENT && + tp.budget_policy != PRECISE_ENFORCEMENT) + { + printk(KERN_INFO "litmus: real-time task %d rejected " + "because unsupported budget enforcement policy " + "specified (%d)\n", + pid, tp.budget_policy); + goto out_unlock; + } + + if (tp.priority >= LITMUS_MAX_PRIORITY) { + printk(KERN_INFO "litmus: invalid priority (%u); " + "task %s/%d rejected\n", + tp.priority, target->comm, target->pid); + goto out_unlock; + } + + target->rt_param.task_params = tp; + + retval = 0; + out_unlock: + read_unlock_irq(&tasklist_lock); + out: + return retval; +} + +/* + * Getter of task's RT params + * returns EINVAL if param or pid is NULL + * returns ESRCH if pid does not correspond to a valid task + * returns EFAULT if copying of parameters has failed. + * + * find_task_by_vpid() assumes that we are in the same namespace of the + * target. + */ +asmlinkage long sys_get_rt_task_param(pid_t pid, struct rt_task __user * param) +{ + int retval = -EINVAL; + struct task_struct *source; + struct rt_task lp; + if (param == 0 || pid < 0) + goto out; + read_lock(&tasklist_lock); + if (!(source = find_task_by_vpid(pid))) { + retval = -ESRCH; + goto out_unlock; + } + lp = source->rt_param.task_params; + read_unlock(&tasklist_lock); + /* Do copying outside the lock */ + retval = + copy_to_user(param, &lp, sizeof(lp)) ? -EFAULT : 0; + return retval; + out_unlock: + read_unlock(&tasklist_lock); + out: + return retval; + +} + +/* + * This is the crucial function for periodic task implementation, + * It checks if a task is periodic, checks if such kind of sleep + * is permitted and calls plugin-specific sleep, which puts the + * task into a wait array. + * returns 0 on successful wakeup + * returns EPERM if current conditions do not permit such sleep + * returns EINVAL if current task is not able to go to sleep + */ +asmlinkage long sys_complete_job(void) +{ + int retval = -EPERM; + if (!is_realtime(current)) { + retval = -EINVAL; + goto out; + } + /* Task with negative or zero period cannot sleep */ + if (get_rt_period(current) <= 0) { + retval = -EINVAL; + goto out; + } + /* The plugin has to put the task into an + * appropriate queue and call schedule + */ + retval = litmus->complete_job(); + out: + return retval; +} + +/* This is an "improved" version of sys_complete_job that + * addresses the problem of unintentionally missing a job after + * an overrun. + * + * returns 0 on successful wakeup + * returns EPERM if current conditions do not permit such sleep + * returns EINVAL if current task is not able to go to sleep + */ +asmlinkage long sys_wait_for_job_release(unsigned int job) +{ + int retval = -EPERM; + if (!is_realtime(current)) { + retval = -EINVAL; + goto out; + } + + /* Task with negative or zero period cannot sleep */ + if (get_rt_period(current) <= 0) { + retval = -EINVAL; + goto out; + } + + retval = 0; + + /* first wait until we have "reached" the desired job + * + * This implementation has at least two problems: + * + * 1) It doesn't gracefully handle the wrap around of + * job_no. Since LITMUS is a prototype, this is not much + * of a problem right now. 
+ * + * 2) It is theoretically racy if a job release occurs + * between checking job_no and calling sleep_next_period(). + * A proper solution would requiring adding another callback + * in the plugin structure and testing the condition with + * interrupts disabled. + * + * FIXME: At least problem 2 should be taken care of eventually. + */ + while (!retval && job > current->rt_param.job_params.job_no) + /* If the last job overran then job <= job_no and we + * don't send the task to sleep. + */ + retval = litmus->complete_job(); + out: + return retval; +} + +/* This is a helper syscall to query the current job sequence number. + * + * returns 0 on successful query + * returns EPERM if task is not a real-time task. + * returns EFAULT if &job is not a valid pointer. + */ +asmlinkage long sys_query_job_no(unsigned int __user *job) +{ + int retval = -EPERM; + if (is_realtime(current)) + retval = put_user(current->rt_param.job_params.job_no, job); + + return retval; +} + +/* sys_null_call() is only used for determining raw system call + * overheads (kernel entry, kernel exit). It has no useful side effects. + * If ts is non-NULL, then the current Feather-Trace time is recorded. + */ +asmlinkage long sys_null_call(cycles_t __user *ts) +{ + long ret = 0; + cycles_t now; + + if (ts) { + now = get_cycles(); + ret = put_user(now, ts); + } + + return ret; +} + +/* p is a real-time task. Re-init its state as a best-effort task. */ +static void reinit_litmus_state(struct task_struct* p, int restore) +{ + struct rt_task user_config = {}; + void* ctrl_page = NULL; + + if (restore) { + /* Safe user-space provided configuration data. + * and allocated page. */ + user_config = p->rt_param.task_params; + ctrl_page = p->rt_param.ctrl_page; + } + + /* We probably should not be inheriting any task's priority + * at this point in time. + */ + WARN_ON(p->rt_param.inh_task); + + /* Cleanup everything else. */ + memset(&p->rt_param, 0, sizeof(p->rt_param)); + + /* Restore preserved fields. 
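+	 * Only task_params (the user-supplied configuration) and ctrl_page
+	 * (the already-allocated control page) survive the memset() above;
+	 * every other rt_param field starts out zeroed again.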
*/ + if (restore) { + p->rt_param.task_params = user_config; + p->rt_param.ctrl_page = ctrl_page; + } +} + +long litmus_admit_task(struct task_struct* tsk) +{ + long retval = 0; + unsigned long flags; + + BUG_ON(is_realtime(tsk)); + + if (get_rt_period(tsk) == 0 || + get_exec_cost(tsk) > get_rt_period(tsk)) { + TRACE_TASK(tsk, "litmus admit: invalid task parameters " + "(%lu, %lu)\n", + get_exec_cost(tsk), get_rt_period(tsk)); + retval = -EINVAL; + goto out; + } + + if (!cpu_online(get_partition(tsk))) { + TRACE_TASK(tsk, "litmus admit: cpu %d is not online\n", + get_partition(tsk)); + retval = -EINVAL; + goto out; + } + + INIT_LIST_HEAD(&tsk_rt(tsk)->list); + + /* avoid scheduler plugin changing underneath us */ + raw_spin_lock_irqsave(&task_transition_lock, flags); + + /* allocate heap node for this task */ + tsk_rt(tsk)->heap_node = bheap_node_alloc(GFP_ATOMIC); + tsk_rt(tsk)->rel_heap = release_heap_alloc(GFP_ATOMIC); + + if (!tsk_rt(tsk)->heap_node || !tsk_rt(tsk)->rel_heap) { + printk(KERN_WARNING "litmus: no more heap node memory!?\n"); + + bheap_node_free(tsk_rt(tsk)->heap_node); + release_heap_free(tsk_rt(tsk)->rel_heap); + + retval = -ENOMEM; + goto out_unlock; + } else { + bheap_node_init(&tsk_rt(tsk)->heap_node, tsk); + } + + retval = litmus->admit_task(tsk); + + if (!retval) { + sched_trace_task_name(tsk); + sched_trace_task_param(tsk); + atomic_inc(&rt_task_count); + } + +out_unlock: + raw_spin_unlock_irqrestore(&task_transition_lock, flags); +out: + return retval; +} + +void litmus_exit_task(struct task_struct* tsk) +{ + if (is_realtime(tsk)) { + sched_trace_task_completion(tsk, 1); + + litmus->task_exit(tsk); + + BUG_ON(bheap_node_in_heap(tsk_rt(tsk)->heap_node)); + bheap_node_free(tsk_rt(tsk)->heap_node); + release_heap_free(tsk_rt(tsk)->rel_heap); + + atomic_dec(&rt_task_count); + reinit_litmus_state(tsk, 1); + } +} + +/* IPI callback to synchronize plugin switching */ +static void synch_on_plugin_switch(void* info) +{ + atomic_inc(&cannot_use_plugin); + while (atomic_read(&cannot_use_plugin) > 0) + cpu_relax(); +} + +/* Switching a plugin in use is tricky. + * We must watch out that no real-time tasks exists + * (and that none is created in parallel) and that the plugin is not + * currently in use on any processor (in theory). + */ +int switch_sched_plugin(struct sched_plugin* plugin) +{ + unsigned long flags; + int ret = 0; + + BUG_ON(!plugin); + + /* forbid other cpus to use the plugin */ + atomic_set(&cannot_use_plugin, 1); + /* send IPI to force other CPUs to synch with us */ + smp_call_function(synch_on_plugin_switch, NULL, 0); + + /* wait until all other CPUs have started synch */ + while (atomic_read(&cannot_use_plugin) < num_online_cpus()) + cpu_relax(); + + /* stop task transitions */ + raw_spin_lock_irqsave(&task_transition_lock, flags); + + /* don't switch if there are active real-time tasks */ + if (atomic_read(&rt_task_count) == 0) { + ret = litmus->deactivate_plugin(); + if (0 != ret) + goto out; + ret = plugin->activate_plugin(); + if (0 != ret) { + printk(KERN_INFO "Can't activate %s (%d).\n", + plugin->plugin_name, ret); + plugin = &linux_sched_plugin; + } + printk(KERN_INFO "Switching to LITMUS^RT plugin %s.\n", plugin->plugin_name); + litmus = plugin; + } else + ret = -EBUSY; +out: + raw_spin_unlock_irqrestore(&task_transition_lock, flags); + atomic_set(&cannot_use_plugin, 0); + return ret; +} + +/* Called upon fork. + * p is the newly forked task. 
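+ *
+ * A child forked from a real-time task does not stay real-time: its LITMUS
+ * state is wiped (nothing is preserved) and sched_reset_on_fork is set so
+ * that the child is not treated as a real-time task. If the child should
+ * run under LITMUS, it has to configure and admit itself explicitly via the
+ * LITMUS syscalls.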
+ */ +void litmus_fork(struct task_struct* p) +{ + if (is_realtime(p)) { + /* clean out any litmus related state, don't preserve anything */ + reinit_litmus_state(p, 0); + /* Don't let the child be a real-time task. */ + p->sched_reset_on_fork = 1; + } else + /* non-rt tasks might have ctrl_page set */ + tsk_rt(p)->ctrl_page = NULL; + + /* od tables are never inherited across a fork */ + p->od_table = NULL; +} + +/* Called upon execve(). + * current is doing the exec. + * Don't let address space specific stuff leak. + */ +void litmus_exec(void) +{ + struct task_struct* p = current; + + if (is_realtime(p)) { + WARN_ON(p->rt_param.inh_task); + if (tsk_rt(p)->ctrl_page) { + free_page((unsigned long) tsk_rt(p)->ctrl_page); + tsk_rt(p)->ctrl_page = NULL; + } + } +} + +void exit_litmus(struct task_struct *dead_tsk) +{ + /* We also allow non-RT tasks to + * allocate control pages to allow + * measurements with non-RT tasks. + * So check if we need to free the page + * in any case. + */ + if (tsk_rt(dead_tsk)->ctrl_page) { + TRACE_TASK(dead_tsk, + "freeing ctrl_page %p\n", + tsk_rt(dead_tsk)->ctrl_page); + free_page((unsigned long) tsk_rt(dead_tsk)->ctrl_page); + } + + /* main cleanup only for RT tasks */ + if (is_realtime(dead_tsk)) + litmus_exit_task(dead_tsk); +} + + +#ifdef CONFIG_MAGIC_SYSRQ +int sys_kill(int pid, int sig); + +static void sysrq_handle_kill_rt_tasks(int key) +{ + struct task_struct *t; + read_lock(&tasklist_lock); + for_each_process(t) { + if (is_realtime(t)) { + sys_kill(t->pid, SIGKILL); + } + } + read_unlock(&tasklist_lock); +} + +static struct sysrq_key_op sysrq_kill_rt_tasks_op = { + .handler = sysrq_handle_kill_rt_tasks, + .help_msg = "quit-rt-tasks(X)", + .action_msg = "sent SIGKILL to all LITMUS^RT real-time tasks", +}; +#endif + +extern struct sched_plugin linux_sched_plugin; + +static int __init _init_litmus(void) +{ + /* Common initializers, + * mode change lock is used to enforce single mode change + * operation. + */ + printk("Starting LITMUS^RT kernel\n"); + + BUILD_BUG_ON(sizeof(union np_flag) != sizeof(uint32_t)); + + register_sched_plugin(&linux_sched_plugin); + + bheap_node_cache = KMEM_CACHE(bheap_node, SLAB_PANIC); + release_heap_cache = KMEM_CACHE(release_heap, SLAB_PANIC); + +#ifdef CONFIG_MAGIC_SYSRQ + /* offer some debugging help */ + if (!register_sysrq_key('x', &sysrq_kill_rt_tasks_op)) + printk("Registered kill rt tasks magic sysrq.\n"); + else + printk("Could not register kill rt tasks magic sysrq.\n"); +#endif + + init_litmus_proc(); + + return 0; +} + +static void _exit_litmus(void) +{ + exit_litmus_proc(); + kmem_cache_destroy(bheap_node_cache); + kmem_cache_destroy(release_heap_cache); +} + +module_init(_init_litmus); +module_exit(_exit_litmus); diff --git a/litmus/litmus_proc.c b/litmus/litmus_proc.c new file mode 100644 index 000000000000..4bf725a36c9c --- /dev/null +++ b/litmus/litmus_proc.c @@ -0,0 +1,347 @@ +/* + * litmus_proc.c -- Implementation of the /proc/litmus directory tree. 
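+ *
+ * Layout created by init_litmus_proc() and the plugin helpers below:
+ *
+ *	/proc/litmus/active_plugin	(rw) name of the active plugin; writing
+ *					     a plugin name here invokes
+ *					     switch_sched_plugin()
+ *	/proc/litmus/stats		(ro) number of real-time tasks and of
+ *					     tasks waiting to be released
+ *	/proc/litmus/release_master	(rw) release-master CPU or "NO_CPU"
+ *					     (only with CONFIG_RELEASE_MASTER)
+ *	/proc/litmus/plugins/loaded	(ro) list of registered plugins
+ *	/proc/litmus/plugins/<plugin>/	     per-plugin directory; clustered
+ *					     plugins may add a "cluster" file
+ *					     here via create_cluster_file()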
+ */ + +#include +#include + +#include +#include + +#include + +/* in litmus/litmus.c */ +extern atomic_t rt_task_count; + +static struct proc_dir_entry *litmus_dir = NULL, + *curr_file = NULL, + *stat_file = NULL, + *plugs_dir = NULL, +#ifdef CONFIG_RELEASE_MASTER + *release_master_file = NULL, +#endif + *plugs_file = NULL; + +/* in litmus/sync.c */ +int count_tasks_waiting_for_release(void); + +static int proc_read_stats(char *page, char **start, + off_t off, int count, + int *eof, void *data) +{ + int len; + + len = snprintf(page, PAGE_SIZE, + "real-time tasks = %d\n" + "ready for release = %d\n", + atomic_read(&rt_task_count), + count_tasks_waiting_for_release()); + return len; +} + +static int proc_read_plugins(char *page, char **start, + off_t off, int count, + int *eof, void *data) +{ + int len; + + len = print_sched_plugins(page, PAGE_SIZE); + return len; +} + +static int proc_read_curr(char *page, char **start, + off_t off, int count, + int *eof, void *data) +{ + int len; + + len = snprintf(page, PAGE_SIZE, "%s\n", litmus->plugin_name); + return len; +} + +/* in litmus/litmus.c */ +int switch_sched_plugin(struct sched_plugin*); + +static int proc_write_curr(struct file *file, + const char *buffer, + unsigned long count, + void *data) +{ + int len, ret; + char name[65]; + struct sched_plugin* found; + + len = copy_and_chomp(name, sizeof(name), buffer, count); + if (len < 0) + return len; + + found = find_sched_plugin(name); + + if (found) { + ret = switch_sched_plugin(found); + if (ret != 0) + printk(KERN_INFO "Could not switch plugin: %d\n", ret); + } else + printk(KERN_INFO "Plugin '%s' is unknown.\n", name); + + return len; +} + +#ifdef CONFIG_RELEASE_MASTER +static int proc_read_release_master(char *page, char **start, + off_t off, int count, + int *eof, void *data) +{ + int len, master; + master = atomic_read(&release_master_cpu); + if (master == NO_CPU) + len = snprintf(page, PAGE_SIZE, "NO_CPU\n"); + else + len = snprintf(page, PAGE_SIZE, "%d\n", master); + return len; +} + +static int proc_write_release_master(struct file *file, + const char *buffer, + unsigned long count, + void *data) +{ + int cpu, err, len, online = 0; + char msg[64]; + + len = copy_and_chomp(msg, sizeof(msg), buffer, count); + + if (len < 0) + return len; + + if (strcmp(msg, "NO_CPU") == 0) + atomic_set(&release_master_cpu, NO_CPU); + else { + err = sscanf(msg, "%d", &cpu); + if (err == 1 && cpu >= 0 && (online = cpu_online(cpu))) { + atomic_set(&release_master_cpu, cpu); + } else { + TRACE("invalid release master: '%s' " + "(err:%d cpu:%d online:%d)\n", + msg, err, cpu, online); + len = -EINVAL; + } + } + return len; +} +#endif + +int __init init_litmus_proc(void) +{ + litmus_dir = proc_mkdir("litmus", NULL); + if (!litmus_dir) { + printk(KERN_ERR "Could not allocate LITMUS^RT procfs entry.\n"); + return -ENOMEM; + } + + curr_file = create_proc_entry("active_plugin", + 0644, litmus_dir); + if (!curr_file) { + printk(KERN_ERR "Could not allocate active_plugin " + "procfs entry.\n"); + return -ENOMEM; + } + curr_file->read_proc = proc_read_curr; + curr_file->write_proc = proc_write_curr; + +#ifdef CONFIG_RELEASE_MASTER + release_master_file = create_proc_entry("release_master", + 0644, litmus_dir); + if (!release_master_file) { + printk(KERN_ERR "Could not allocate release_master " + "procfs entry.\n"); + return -ENOMEM; + } + release_master_file->read_proc = proc_read_release_master; + release_master_file->write_proc = proc_write_release_master; +#endif + + stat_file = create_proc_read_entry("stats", 
0444, litmus_dir, + proc_read_stats, NULL); + + plugs_dir = proc_mkdir("plugins", litmus_dir); + if (!plugs_dir){ + printk(KERN_ERR "Could not allocate plugins directory " + "procfs entry.\n"); + return -ENOMEM; + } + + plugs_file = create_proc_read_entry("loaded", 0444, plugs_dir, + proc_read_plugins, NULL); + + return 0; +} + +void exit_litmus_proc(void) +{ + if (plugs_file) + remove_proc_entry("loaded", plugs_dir); + if (plugs_dir) + remove_proc_entry("plugins", litmus_dir); + if (stat_file) + remove_proc_entry("stats", litmus_dir); + if (curr_file) + remove_proc_entry("active_plugin", litmus_dir); +#ifdef CONFIG_RELEASE_MASTER + if (release_master_file) + remove_proc_entry("release_master", litmus_dir); +#endif + if (litmus_dir) + remove_proc_entry("litmus", NULL); +} + +long make_plugin_proc_dir(struct sched_plugin* plugin, + struct proc_dir_entry** pde_in) +{ + struct proc_dir_entry *pde_new = NULL; + long rv; + + if (!plugin || !plugin->plugin_name){ + printk(KERN_ERR "Invalid plugin struct passed to %s.\n", + __func__); + rv = -EINVAL; + goto out_no_pde; + } + + if (!plugs_dir){ + printk(KERN_ERR "Could not make plugin sub-directory, because " + "/proc/litmus/plugins does not exist.\n"); + rv = -ENOENT; + goto out_no_pde; + } + + pde_new = proc_mkdir(plugin->plugin_name, plugs_dir); + if (!pde_new){ + printk(KERN_ERR "Could not make plugin sub-directory: " + "out of memory?.\n"); + rv = -ENOMEM; + goto out_no_pde; + } + + rv = 0; + *pde_in = pde_new; + goto out_ok; + +out_no_pde: + *pde_in = NULL; +out_ok: + return rv; +} + +void remove_plugin_proc_dir(struct sched_plugin* plugin) +{ + if (!plugin || !plugin->plugin_name){ + printk(KERN_ERR "Invalid plugin struct passed to %s.\n", + __func__); + return; + } + remove_proc_entry(plugin->plugin_name, plugs_dir); +} + + + +/* misc. 
I/O helper functions */ + +int copy_and_chomp(char *kbuf, unsigned long ksize, + __user const char* ubuf, unsigned long ulength) +{ + /* caller must provide buffer space */ + BUG_ON(!ksize); + + ksize--; /* leave space for null byte */ + + if (ksize > ulength) + ksize = ulength; + + if(copy_from_user(kbuf, ubuf, ksize)) + return -EFAULT; + + kbuf[ksize] = '\0'; + + /* chomp kbuf */ + if (ksize > 0 && kbuf[ksize - 1] == '\n') + kbuf[ksize - 1] = '\0'; + + return ksize; +} + +/* helper functions for clustered plugins */ +static const char* cache_level_names[] = { + "ALL", + "L1", + "L2", + "L3", +}; + +int parse_cache_level(const char *cache_name, enum cache_level *level) +{ + int err = -EINVAL; + int i; + /* do a quick and dirty comparison to find the cluster size */ + for (i = GLOBAL_CLUSTER; i <= L3_CLUSTER; i++) + if (!strcmp(cache_name, cache_level_names[i])) { + *level = (enum cache_level) i; + err = 0; + break; + } + return err; +} + +const char* cache_level_name(enum cache_level level) +{ + int idx = level; + + if (idx >= GLOBAL_CLUSTER && idx <= L3_CLUSTER) + return cache_level_names[idx]; + else + return "INVALID"; +} + + +/* proc file interface to configure the cluster size */ +static int proc_read_cluster_size(char *page, char **start, + off_t off, int count, + int *eof, void *data) +{ + return snprintf(page, PAGE_SIZE, "%s\n", + cache_level_name(*((enum cache_level*) data)));; +} + +static int proc_write_cluster_size(struct file *file, + const char *buffer, + unsigned long count, + void *data) +{ + int len; + char cache_name[8]; + + len = copy_and_chomp(cache_name, sizeof(cache_name), buffer, count); + + if (len > 0 && parse_cache_level(cache_name, (enum cache_level*) data)) + printk(KERN_INFO "Cluster '%s' is unknown.\n", cache_name); + + return len; +} + +struct proc_dir_entry* create_cluster_file(struct proc_dir_entry* parent, + enum cache_level* level) +{ + struct proc_dir_entry* cluster_file; + + cluster_file = create_proc_entry("cluster", 0644, parent); + if (!cluster_file) { + printk(KERN_ERR "Could not allocate %s/cluster " + "procfs entry.\n", parent->name); + } else { + cluster_file->read_proc = proc_read_cluster_size; + cluster_file->write_proc = proc_write_cluster_size; + cluster_file->data = level; + } + return cluster_file; +} + diff --git a/litmus/locking.c b/litmus/locking.c new file mode 100644 index 000000000000..84a1d8309699 --- /dev/null +++ b/litmus/locking.c @@ -0,0 +1,186 @@ +#include +#include +#include + +#ifdef CONFIG_LITMUS_LOCKING + +#include +#include +#include + +static int create_generic_lock(void** obj_ref, obj_type_t type, void* __user arg); +static int open_generic_lock(struct od_table_entry* entry, void* __user arg); +static int close_generic_lock(struct od_table_entry* entry); +static void destroy_generic_lock(obj_type_t type, void* sem); + +struct fdso_ops generic_lock_ops = { + .create = create_generic_lock, + .open = open_generic_lock, + .close = close_generic_lock, + .destroy = destroy_generic_lock +}; + +static inline bool is_lock(struct od_table_entry* entry) +{ + return entry->class == &generic_lock_ops; +} + +static inline struct litmus_lock* get_lock(struct od_table_entry* entry) +{ + BUG_ON(!is_lock(entry)); + return (struct litmus_lock*) entry->obj->obj; +} + +static int create_generic_lock(void** obj_ref, obj_type_t type, void* __user arg) +{ + struct litmus_lock* lock; + int err; + + err = litmus->allocate_lock(&lock, type, arg); + if (err == 0) + *obj_ref = lock; + return err; +} + +static int open_generic_lock(struct 
od_table_entry* entry, void* __user arg) +{ + struct litmus_lock* lock = get_lock(entry); + if (lock->ops->open) + return lock->ops->open(lock, arg); + else + return 0; /* default: any task can open it */ +} + +static int close_generic_lock(struct od_table_entry* entry) +{ + struct litmus_lock* lock = get_lock(entry); + if (lock->ops->close) + return lock->ops->close(lock); + else + return 0; /* default: closing succeeds */ +} + +static void destroy_generic_lock(obj_type_t type, void* obj) +{ + struct litmus_lock* lock = (struct litmus_lock*) obj; + lock->ops->deallocate(lock); +} + +asmlinkage long sys_litmus_lock(int lock_od) +{ + long err = -EINVAL; + struct od_table_entry* entry; + struct litmus_lock* l; + + TS_SYSCALL_IN_START; + + TS_SYSCALL_IN_END; + + TS_LOCK_START; + + entry = get_entry_for_od(lock_od); + if (entry && is_lock(entry)) { + l = get_lock(entry); + TRACE_CUR("attempts to lock 0x%p\n", l); + err = l->ops->lock(l); + } + + /* Note: task my have been suspended or preempted in between! Take + * this into account when computing overheads. */ + TS_LOCK_END; + + TS_SYSCALL_OUT_START; + + return err; +} + +asmlinkage long sys_litmus_unlock(int lock_od) +{ + long err = -EINVAL; + struct od_table_entry* entry; + struct litmus_lock* l; + + TS_SYSCALL_IN_START; + + TS_SYSCALL_IN_END; + + TS_UNLOCK_START; + + entry = get_entry_for_od(lock_od); + if (entry && is_lock(entry)) { + l = get_lock(entry); + TRACE_CUR("attempts to unlock 0x%p\n", l); + err = l->ops->unlock(l); + } + + /* Note: task my have been preempted in between! Take this into + * account when computing overheads. */ + TS_UNLOCK_END; + + TS_SYSCALL_OUT_START; + + return err; +} + +struct task_struct* __waitqueue_remove_first(wait_queue_head_t *wq) +{ + wait_queue_t* q; + struct task_struct* t = NULL; + + if (waitqueue_active(wq)) { + q = list_entry(wq->task_list.next, + wait_queue_t, task_list); + t = (struct task_struct*) q->private; + __remove_wait_queue(wq, q); + } + return(t); +} + +unsigned int __add_wait_queue_prio_exclusive( + wait_queue_head_t* head, + prio_wait_queue_t *new) +{ + struct list_head *pos; + unsigned int passed = 0; + + new->wq.flags |= WQ_FLAG_EXCLUSIVE; + + /* find a spot where the new entry is less than the next */ + list_for_each(pos, &head->task_list) { + prio_wait_queue_t* queued = list_entry(pos, prio_wait_queue_t, + wq.task_list); + + if (unlikely(lt_before(new->priority, queued->priority) || + (new->priority == queued->priority && + new->tie_breaker < queued->tie_breaker))) { + /* pos is not less than new, thus insert here */ + __list_add(&new->wq.task_list, pos->prev, pos); + goto out; + } + passed++; + } + + /* if we get to this point either the list is empty or every entry + * queued element is less than new. + * Let's add new to the end. */ + list_add_tail(&new->wq.task_list, &head->task_list); +out: + return passed; +} + + +#else + +struct fdso_ops generic_lock_ops = {}; + +asmlinkage long sys_litmus_lock(int sem_od) +{ + return -ENOSYS; +} + +asmlinkage long sys_litmus_unlock(int sem_od) +{ + return -ENOSYS; +} + +#endif diff --git a/litmus/preempt.c b/litmus/preempt.c new file mode 100644 index 000000000000..90e09d091e30 --- /dev/null +++ b/litmus/preempt.c @@ -0,0 +1,131 @@ +#include + +#include +#include + +/* The rescheduling state of each processor. + */ +DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, resched_state); + +void sched_state_will_schedule(struct task_struct* tsk) +{ + /* Litmus hack: we only care about processor-local invocations of + * set_tsk_need_resched(). 
We can't reliably set the flag remotely + * since it might race with other updates to the scheduling state. We + * can't rely on the runqueue lock protecting updates to the sched + * state since processors do not acquire the runqueue locks for all + * updates to the sched state (to avoid acquiring two runqueue locks at + * the same time). Further, if tsk is residing on a remote processor, + * then that processor doesn't actually know yet that it is going to + * reschedule; it still must receive an IPI (unless a local invocation + * races). + */ + if (likely(task_cpu(tsk) == smp_processor_id())) { + VERIFY_SCHED_STATE(TASK_SCHEDULED | SHOULD_SCHEDULE | TASK_PICKED | WILL_SCHEDULE); + if (is_in_sched_state(TASK_PICKED | PICKED_WRONG_TASK)) + set_sched_state(PICKED_WRONG_TASK); + else + set_sched_state(WILL_SCHEDULE); + } else + /* Litmus tasks should never be subject to a remote + * set_tsk_need_resched(). */ + BUG_ON(is_realtime(tsk)); +// TRACE_TASK(tsk, "set_tsk_need_resched() ret:%p\n", +// __builtin_return_address(0)); +} + +/* Called by the IPI handler after another CPU called smp_send_resched(). */ +void sched_state_ipi(void) +{ + /* If the IPI was slow, we might be in any state right now. The IPI is + * only meaningful if we are in SHOULD_SCHEDULE. */ + if (is_in_sched_state(SHOULD_SCHEDULE)) { + /* Cause scheduler to be invoked. + * This will cause a transition to WILL_SCHEDULE. */ + set_tsk_need_resched(current); + TRACE_STATE("IPI -> set_tsk_need_resched(%s/%d)\n", + current->comm, current->pid); + } else { + /* ignore */ + TRACE_STATE("ignoring IPI in state %x (%s)\n", + get_sched_state(), + sched_state_name(get_sched_state())); + } +} + +/* Called by plugins to cause a CPU to reschedule. IMPORTANT: the caller must + * hold the lock that is used to serialize scheduling decisions. */ +void litmus_reschedule(int cpu) +{ + int picked_transition_ok = 0; + int scheduled_transition_ok = 0; + + /* The (remote) CPU could be in any state. */ + + /* The critical states are TASK_PICKED and TASK_SCHEDULED, as the CPU + * is not aware of the need to reschedule at this point. */ + + /* is a context switch in progress? */ + if (cpu_is_in_sched_state(cpu, TASK_PICKED)) + picked_transition_ok = sched_state_transition_on( + cpu, TASK_PICKED, PICKED_WRONG_TASK); + + if (!picked_transition_ok && + cpu_is_in_sched_state(cpu, TASK_SCHEDULED)) { + /* We either raced with the end of the context switch, or the + * CPU was in TASK_SCHEDULED anyway. */ + scheduled_transition_ok = sched_state_transition_on( + cpu, TASK_SCHEDULED, SHOULD_SCHEDULE); + } + + /* If the CPU was in state TASK_SCHEDULED, then we need to cause the + * scheduler to be invoked. */ + if (scheduled_transition_ok) { + if (smp_processor_id() == cpu) + set_tsk_need_resched(current); + else + smp_send_reschedule(cpu); + } + + TRACE_STATE("%s picked-ok:%d sched-ok:%d\n", + __FUNCTION__, + picked_transition_ok, + scheduled_transition_ok); +} + +void litmus_reschedule_local(void) +{ + if (is_in_sched_state(TASK_PICKED)) + set_sched_state(PICKED_WRONG_TASK); + else if (is_in_sched_state(TASK_SCHEDULED | SHOULD_SCHEDULE)) { + set_sched_state(WILL_SCHEDULE); + set_tsk_need_resched(current); + } +} + +#ifdef CONFIG_DEBUG_KERNEL + +void sched_state_plugin_check(void) +{ + if (!is_in_sched_state(TASK_PICKED | PICKED_WRONG_TASK)) { + TRACE("!!!! plugin did not call sched_state_task_picked()!" 
+ "Calling sched_state_task_picked() is mandatory---fix this.\n"); + set_sched_state(TASK_PICKED); + } +} + +#define NAME_CHECK(x) case x: return #x +const char* sched_state_name(int s) +{ + switch (s) { + NAME_CHECK(TASK_SCHEDULED); + NAME_CHECK(SHOULD_SCHEDULE); + NAME_CHECK(WILL_SCHEDULE); + NAME_CHECK(TASK_PICKED); + NAME_CHECK(PICKED_WRONG_TASK); + default: + return "UNKNOWN"; + }; +} + +#endif diff --git a/litmus/rt_domain.c b/litmus/rt_domain.c new file mode 100644 index 000000000000..d405854cd39c --- /dev/null +++ b/litmus/rt_domain.c @@ -0,0 +1,357 @@ +/* + * litmus/rt_domain.c + * + * LITMUS real-time infrastructure. This file contains the + * functions that manipulate RT domains. RT domains are an abstraction + * of a ready queue and a release queue. + */ + +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include + +#include + +/* Uncomment when debugging timer races... */ +#if 0 +#define VTRACE_TASK TRACE_TASK +#define VTRACE TRACE +#else +#define VTRACE_TASK(t, fmt, args...) /* shut up */ +#define VTRACE(fmt, args...) /* be quiet already */ +#endif + +static int dummy_resched(rt_domain_t *rt) +{ + return 0; +} + +static int dummy_order(struct bheap_node* a, struct bheap_node* b) +{ + return 0; +} + +/* default implementation: use default lock */ +static void default_release_jobs(rt_domain_t* rt, struct bheap* tasks) +{ + merge_ready(rt, tasks); +} + +static unsigned int time2slot(lt_t time) +{ + return (unsigned int) time2quanta(time, FLOOR) % RELEASE_QUEUE_SLOTS; +} + +static enum hrtimer_restart on_release_timer(struct hrtimer *timer) +{ + unsigned long flags; + struct release_heap* rh; + rh = container_of(timer, struct release_heap, timer); + + TS_RELEASE_LATENCY(rh->release_time); + + VTRACE("on_release_timer(0x%p) starts.\n", timer); + + TS_RELEASE_START; + + + raw_spin_lock_irqsave(&rh->dom->release_lock, flags); + VTRACE("CB has the release_lock 0x%p\n", &rh->dom->release_lock); + /* remove from release queue */ + list_del(&rh->list); + raw_spin_unlock_irqrestore(&rh->dom->release_lock, flags); + VTRACE("CB returned release_lock 0x%p\n", &rh->dom->release_lock); + + /* call release callback */ + rh->dom->release_jobs(rh->dom, &rh->heap); + /* WARNING: rh can be referenced from other CPUs from now on. */ + + TS_RELEASE_END; + + VTRACE("on_release_timer(0x%p) ends.\n", timer); + + return HRTIMER_NORESTART; +} + +/* allocated in litmus.c */ +struct kmem_cache * release_heap_cache; + +struct release_heap* release_heap_alloc(int gfp_flags) +{ + struct release_heap* rh; + rh= kmem_cache_alloc(release_heap_cache, gfp_flags); + if (rh) { + /* initialize timer */ + hrtimer_init(&rh->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + rh->timer.function = on_release_timer; + } + return rh; +} + +void release_heap_free(struct release_heap* rh) +{ + /* make sure timer is no longer in use */ + hrtimer_cancel(&rh->timer); + kmem_cache_free(release_heap_cache, rh); +} + +/* Caller must hold release lock. + * Will return heap for given time. If no such heap exists prior to + * the invocation it will be created. 
+ */ +static struct release_heap* get_release_heap(rt_domain_t *rt, + struct task_struct* t, + int use_task_heap) +{ + struct list_head* pos; + struct release_heap* heap = NULL; + struct release_heap* rh; + lt_t release_time = get_release(t); + unsigned int slot = time2slot(release_time); + + /* initialize pos for the case that the list is empty */ + pos = rt->release_queue.slot[slot].next; + list_for_each(pos, &rt->release_queue.slot[slot]) { + rh = list_entry(pos, struct release_heap, list); + if (release_time == rh->release_time) { + /* perfect match -- this happens on hyperperiod + * boundaries + */ + heap = rh; + break; + } else if (lt_before(release_time, rh->release_time)) { + /* we need to insert a new node since rh is + * already in the future + */ + break; + } + } + if (!heap && use_task_heap) { + /* use pre-allocated release heap */ + rh = tsk_rt(t)->rel_heap; + + rh->dom = rt; + rh->release_time = release_time; + + /* add to release queue */ + list_add(&rh->list, pos->prev); + heap = rh; + } + return heap; +} + +static void reinit_release_heap(struct task_struct* t) +{ + struct release_heap* rh; + + /* use pre-allocated release heap */ + rh = tsk_rt(t)->rel_heap; + + /* Make sure it is safe to use. The timer callback could still + * be executing on another CPU; hrtimer_cancel() will wait + * until the timer callback has completed. However, under no + * circumstances should the timer be active (= yet to be + * triggered). + * + * WARNING: If the CPU still holds the release_lock at this point, + * deadlock may occur! + */ + BUG_ON(hrtimer_cancel(&rh->timer)); + + /* initialize */ + bheap_init(&rh->heap); +#ifdef CONFIG_RELEASE_MASTER + atomic_set(&rh->info.state, HRTIMER_START_ON_INACTIVE); +#endif +} +/* arm_release_timer() - start local release timer or trigger + * remote timer (pull timer) + * + * Called by add_release() with: + * - tobe_lock taken + * - IRQ disabled + */ +#ifdef CONFIG_RELEASE_MASTER +#define arm_release_timer(t) arm_release_timer_on((t), NO_CPU) +static void arm_release_timer_on(rt_domain_t *_rt , int target_cpu) +#else +static void arm_release_timer(rt_domain_t *_rt) +#endif +{ + rt_domain_t *rt = _rt; + struct list_head list; + struct list_head *pos, *safe; + struct task_struct* t; + struct release_heap* rh; + + VTRACE("arm_release_timer() at %llu\n", litmus_clock()); + list_replace_init(&rt->tobe_released, &list); + + list_for_each_safe(pos, safe, &list) { + /* pick task of work list */ + t = list_entry(pos, struct task_struct, rt_param.list); + sched_trace_task_release(t); + list_del(pos); + + /* put into release heap while holding release_lock */ + raw_spin_lock(&rt->release_lock); + VTRACE_TASK(t, "I have the release_lock 0x%p\n", &rt->release_lock); + + rh = get_release_heap(rt, t, 0); + if (!rh) { + /* need to use our own, but drop lock first */ + raw_spin_unlock(&rt->release_lock); + VTRACE_TASK(t, "Dropped release_lock 0x%p\n", + &rt->release_lock); + + reinit_release_heap(t); + VTRACE_TASK(t, "release_heap ready\n"); + + raw_spin_lock(&rt->release_lock); + VTRACE_TASK(t, "Re-acquired release_lock 0x%p\n", + &rt->release_lock); + + rh = get_release_heap(rt, t, 1); + } + bheap_insert(rt->order, &rh->heap, tsk_rt(t)->heap_node); + VTRACE_TASK(t, "arm_release_timer(): added to release heap\n"); + + raw_spin_unlock(&rt->release_lock); + VTRACE_TASK(t, "Returned the release_lock 0x%p\n", &rt->release_lock); + + /* To avoid arming the timer multiple times, we only let the + * owner do the arming (which is the "first" task to reference + * this 
release_heap anyway). + */ + if (rh == tsk_rt(t)->rel_heap) { + VTRACE_TASK(t, "arming timer 0x%p\n", &rh->timer); + /* we cannot arm the timer using hrtimer_start() + * as it may deadlock on rq->lock + * + * PINNED mode is ok on both local and remote CPU + */ +#ifdef CONFIG_RELEASE_MASTER + if (rt->release_master == NO_CPU && + target_cpu == NO_CPU) +#endif + __hrtimer_start_range_ns(&rh->timer, + ns_to_ktime(rh->release_time), + 0, HRTIMER_MODE_ABS_PINNED, 0); +#ifdef CONFIG_RELEASE_MASTER + else + hrtimer_start_on( + /* target_cpu overrides release master */ + (target_cpu != NO_CPU ? + target_cpu : rt->release_master), + &rh->info, &rh->timer, + ns_to_ktime(rh->release_time), + HRTIMER_MODE_ABS_PINNED); +#endif + } else + VTRACE_TASK(t, "0x%p is not my timer\n", &rh->timer); + } +} + +void rt_domain_init(rt_domain_t *rt, + bheap_prio_t order, + check_resched_needed_t check, + release_jobs_t release + ) +{ + int i; + + BUG_ON(!rt); + if (!check) + check = dummy_resched; + if (!release) + release = default_release_jobs; + if (!order) + order = dummy_order; + +#ifdef CONFIG_RELEASE_MASTER + rt->release_master = NO_CPU; +#endif + + bheap_init(&rt->ready_queue); + INIT_LIST_HEAD(&rt->tobe_released); + for (i = 0; i < RELEASE_QUEUE_SLOTS; i++) + INIT_LIST_HEAD(&rt->release_queue.slot[i]); + + raw_spin_lock_init(&rt->ready_lock); + raw_spin_lock_init(&rt->release_lock); + raw_spin_lock_init(&rt->tobe_lock); + + rt->check_resched = check; + rt->release_jobs = release; + rt->order = order; +} + +/* add_ready - add a real-time task to the rt ready queue. It must be runnable. + * @new: the newly released task + */ +void __add_ready(rt_domain_t* rt, struct task_struct *new) +{ + TRACE("rt: adding %s/%d (%llu, %llu) rel=%llu to ready queue at %llu\n", + new->comm, new->pid, get_exec_cost(new), get_rt_period(new), + get_release(new), litmus_clock()); + + BUG_ON(bheap_node_in_heap(tsk_rt(new)->heap_node)); + + bheap_insert(rt->order, &rt->ready_queue, tsk_rt(new)->heap_node); + rt->check_resched(rt); +} + +/* merge_ready - Add a sorted set of tasks to the rt ready queue. They must be runnable. + * @tasks - the newly released tasks + */ +void __merge_ready(rt_domain_t* rt, struct bheap* tasks) +{ + bheap_union(rt->order, &rt->ready_queue, tasks); + rt->check_resched(rt); +} + + +#ifdef CONFIG_RELEASE_MASTER +void __add_release_on(rt_domain_t* rt, struct task_struct *task, + int target_cpu) +{ + TRACE_TASK(task, "add_release_on(), rel=%llu, target=%d\n", + get_release(task), target_cpu); + list_add(&tsk_rt(task)->list, &rt->tobe_released); + task->rt_param.domain = rt; + + /* start release timer */ + TS_SCHED2_START(task); + + arm_release_timer_on(rt, target_cpu); + + TS_SCHED2_END(task); +} +#endif + +/* add_release - add a real-time task to the rt release queue. + * @task: the sleeping task + */ +void __add_release(rt_domain_t* rt, struct task_struct *task) +{ + TRACE_TASK(task, "add_release(), rel=%llu\n", get_release(task)); + list_add(&tsk_rt(task)->list, &rt->tobe_released); + task->rt_param.domain = rt; + + /* start release timer */ + TS_SCHED2_START(task); + + arm_release_timer(rt); + + TS_SCHED2_END(task); +} + diff --git a/litmus/sched_cedf.c b/litmus/sched_cedf.c new file mode 100644 index 000000000000..4f5bb26b339b --- /dev/null +++ b/litmus/sched_cedf.c @@ -0,0 +1,1526 @@ +/* + * litmus/sched_cedf.c + * + * Implementation of the C-EDF scheduling algorithm. + * + * This implementation is based on G-EDF: + * - CPUs are clustered around L2 or L3 caches. 
+ * - Clusters topology is automatically detected (this is arch dependent + * and is working only on x86 at the moment --- and only with modern + * cpus that exports cpuid4 information) + * - The plugins _does not_ attempt to put tasks in the right cluster i.e. + * the programmer needs to be aware of the topology to place tasks + * in the desired cluster + * - default clustering is around L2 cache (cache index = 2) + * supported clusters are: L1 (private cache: pedf), L2, L3, ALL (all + * online_cpus are placed in a single cluster). + * + * For details on functions, take a look at sched_gsn_edf.c + * + * Currently, we do not support changes in the number of online cpus. + * If the num_online_cpus() dynamically changes, the plugin is broken. + * + * This version uses the simple approach and serializes all scheduling + * decisions by the use of a queue lock. This is probably not the + * best way to do it, but it should suffice for now. + */ + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +/* to configure the cluster size */ +#include +#include + +/* Reference configuration variable. Determines which cache level is used to + * group CPUs into clusters. GLOBAL_CLUSTER, which is the default, means that + * all CPUs form a single cluster (just like GSN-EDF). + */ +static enum cache_level cluster_config = GLOBAL_CLUSTER; + +struct clusterdomain; + +/* cpu_entry_t - maintain the linked and scheduled state + * + * A cpu also contains a pointer to the cedf_domain_t cluster + * that owns it (struct clusterdomain*) + */ +typedef struct { + int cpu; + struct clusterdomain* cluster; /* owning cluster */ + struct task_struct* linked; /* only RT tasks */ + struct task_struct* scheduled; /* only RT tasks */ + atomic_t will_schedule; /* prevent unneeded IPIs */ + struct bheap_node* hn; +#ifdef CONFIG_LITMUS_LOCKING + struct bheap_node* pending_hn; + struct task_struct* pending; +#endif +} cpu_entry_t; + +/* one cpu_entry_t per CPU */ +DEFINE_PER_CPU(cpu_entry_t, cedf_cpu_entries); + + +static struct bheap_node cpu_nodes[NR_CPUS]; +#ifdef CONFIG_LITMUS_LOCKING +static struct bheap_node pending_nodes[NR_CPUS]; +#endif + +/* + * In C-EDF there is a cedf domain _per_ cluster + * The number of clusters is dynamically determined accordingly to the + * total cpu number and the cluster size + */ +typedef struct clusterdomain { + /* rt_domain for this cluster */ + rt_domain_t domain; + /* map of this cluster cpus */ + cpumask_var_t cpu_map; + unsigned int num_cpus; + /* the cpus queue themselves according to priority in here */ + struct bheap cpu_heap; +#ifdef CONFIG_LITMUS_LOCKING + struct bheap pending_jobs; + struct bheap pending_cpus; +#endif + /* lock for this cluster */ +#define cluster_lock domain.ready_lock +} cedf_domain_t; + +/* a cedf_domain per cluster; allocation is done at init/activation time */ +cedf_domain_t *cedf; + +#define remote_cpu(cpu) (&per_cpu(cedf_cpu_entries, cpu)) +#define remote_cluster(cpu) ((cedf_domain_t *) per_cpu(cedf_cpu_entries, cpu).cluster) +#define task_cpu_cluster(task) remote_cluster(get_partition(task)) + +/* Uncomment WANT_ALL_SCHED_EVENTS if you want to see all scheduling + * decisions in the TRACE() log; uncomment VERBOSE_INIT for verbose + * information during the initialization of the plugin (e.g., topology) +#define WANT_ALL_SCHED_EVENTS + */ +#define VERBOSE_INIT + +static int cpu_lower_prio(struct bheap_node *_a, struct bheap_node *_b) +{ + cpu_entry_t 
*a, *b; + a = _a->value; + b = _b->value; + /* Note that a and b are inverted: we want the lowest-priority CPU at + * the top of the heap. + */ + return edf_higher_prio(b->linked, a->linked); +} + +/* update_cpu_position - Move the cpu entry to the correct place to maintain + * order in the cpu queue. Caller must hold cedf lock. + */ +static void update_cpu_position(cpu_entry_t *entry) +{ + cedf_domain_t *cluster = entry->cluster; + + if (likely(bheap_node_in_heap(entry->hn))) + bheap_delete(cpu_lower_prio, + &cluster->cpu_heap, + entry->hn); + + bheap_insert(cpu_lower_prio, &cluster->cpu_heap, entry->hn); +} + +/* caller must hold cedf lock */ +static cpu_entry_t* lowest_prio_cpu(cedf_domain_t *cluster) +{ + struct bheap_node* hn; + hn = bheap_peek(cpu_lower_prio, &cluster->cpu_heap); + return hn->value; +} + + +/* link_task_to_cpu - Update the link of a CPU. + * Handles the case where the to-be-linked task is already + * scheduled on a different CPU. + */ +static noinline void link_task_to_cpu(struct task_struct* linked, + cpu_entry_t *entry) +{ + cpu_entry_t *sched; + struct task_struct* tmp; + int on_cpu; + + BUG_ON(linked && !is_realtime(linked)); + + /* Currently linked task is set to be unlinked. */ + if (entry->linked) { + entry->linked->rt_param.linked_on = NO_CPU; + } + + /* Link new task to CPU. */ + if (linked) { + /* handle task is already scheduled somewhere! */ + on_cpu = linked->rt_param.scheduled_on; + if (on_cpu != NO_CPU) { + sched = &per_cpu(cedf_cpu_entries, on_cpu); + /* this should only happen if not linked already */ + BUG_ON(sched->linked == linked); + + /* If we are already scheduled on the CPU to which we + * wanted to link, we don't need to do the swap -- + * we just link ourselves to the CPU and depend on + * the caller to get things right. + */ + if (entry != sched) { + TRACE_TASK(linked, + "already scheduled on %d, updating link.\n", + sched->cpu); + tmp = sched->linked; + linked->rt_param.linked_on = sched->cpu; + sched->linked = linked; + update_cpu_position(sched); + linked = tmp; + } + } + if (linked) /* might be NULL due to swap */ + linked->rt_param.linked_on = entry->cpu; + } + entry->linked = linked; +#ifdef WANT_ALL_SCHED_EVENTS + if (linked) + TRACE_TASK(linked, "linked to %d.\n", entry->cpu); + else + TRACE("NULL linked to %d.\n", entry->cpu); +#endif + update_cpu_position(entry); +} + +/* unlink - Make sure a task is not linked any longer to an entry + * where it was linked before. Must hold cedf_lock. + */ +static noinline void unlink(struct task_struct* t) +{ + cpu_entry_t *entry; + + if (t->rt_param.linked_on != NO_CPU) { + /* unlink */ + entry = &per_cpu(cedf_cpu_entries, t->rt_param.linked_on); + t->rt_param.linked_on = NO_CPU; + link_task_to_cpu(NULL, entry); + } else if (is_queued(t)) { + /* This is an interesting situation: t is scheduled, + * but was just recently unlinked. It cannot be + * linked anywhere else (because then it would have + * been relinked to this CPU), thus it must be in some + * queue. We must remove it from the list in this + * case. 
+ * + * in C-EDF case is should be somewhere in the queue for + * its domain, therefore and we can get the domain using + * task_cpu_cluster + */ + remove(&(task_cpu_cluster(t))->domain, t); + } +} + + +/* preempt - force a CPU to reschedule + */ +static void preempt(cpu_entry_t *entry) +{ + preempt_if_preemptable(entry->scheduled, entry->cpu); +} + +#ifdef CONFIG_LITMUS_LOCKING +static int update_pending_job(cedf_domain_t* cluster, struct task_struct* t); +static void priodon_become_eligible(void); +static void priodon_complete_request(void); + +static inline int in_pending_heap(struct task_struct* t) +{ + return bheap_node_in_heap(tsk_rt(t)->pending_node); +} + +/* has this task already been processed for pending */ +static inline int is_pending(struct task_struct* t) +{ + return tsk_rt(t)->pending_on != NO_CPU || + in_pending_heap(t); +} + +#endif + +/* requeue - Put an unlinked task into gsn-edf domain. + * Caller must hold cedf_lock. + */ +static noinline void requeue(struct task_struct* task) +{ + cedf_domain_t *cluster = task_cpu_cluster(task); + BUG_ON(!task); + /* sanity check before insertion */ + BUG_ON(is_queued(task)); + + if (is_released(task, litmus_clock())) { +#ifdef CONFIG_LITMUS_LOCKING + if (!is_pending(task)) + update_pending_job(cluster, task); +#endif + __add_ready(&cluster->domain, task); + } else { + /* it has got to wait */ + add_release(&cluster->domain, task); + } +} + +/* check for any necessary preemptions */ +static void check_for_preemptions(cedf_domain_t *cluster) +{ + struct task_struct *task; + cpu_entry_t* last; + + for(last = lowest_prio_cpu(cluster); + edf_preemption_needed(&cluster->domain, last->linked); + last = lowest_prio_cpu(cluster)) { + /* preemption necessary */ + +#ifdef CONFIG_LITMUS_LOCKING + task = __peek_ready(&cluster->domain); + if (update_pending_job(cluster, task)) { + /* Something changed, re-evaluate priorites to + * see if we still need to preempt. + * */ + TRACE_TASK(task, "hitting continue\n"); + continue; + } +#endif + task = __take_ready(&cluster->domain); + TRACE_TASK(task, "attempting to link task to P%d\n", + last->cpu); + if (last->linked) + requeue(last->linked); + link_task_to_cpu(task, last); + preempt(last); + } +} + +#ifdef CONFIG_LITMUS_LOCKING + +static int pending_lower_prio(struct bheap_node *_a, struct bheap_node *_b) +{ + cpu_entry_t *a, *b; + a = _a->value; + b = _b->value; + /* Note that a and b are inverted: we want the lowest-priority CPU at + * the top of the heap. + */ + return edf_higher_base_prio(b->pending, a->pending); +} + +/* update_cpu_position - Move the cpu entry to the correct place to maintain + * order in the cpu queue. Caller must hold cedf lock. + */ +static void update_pending_position(cpu_entry_t *entry) +{ + cedf_domain_t *cluster = entry->cluster; + + if (likely(bheap_node_in_heap(entry->pending_hn))) + bheap_delete(pending_lower_prio, + &cluster->pending_cpus, + entry->pending_hn); + + bheap_insert(pending_lower_prio, &cluster->pending_cpus, entry->pending_hn); +} + +/* caller must hold cedf lock */ +static cpu_entry_t* lowest_pending_cpu(cedf_domain_t *cluster) +{ + struct bheap_node* hn; + hn = bheap_peek(pending_lower_prio, &cluster->pending_cpus); + return hn->value; +} + +static void priority_raised(struct task_struct* t) +{ + cedf_domain_t *cluster = task_cpu_cluster(t); + int linked_on; + + linked_on = tsk_rt(t)->linked_on; + + /* If it is scheduled, then we need to reorder the CPU heap. 
*/ + if (linked_on != NO_CPU) { + TRACE_TASK(t, "%s: linked on %d\n", + __FUNCTION__, linked_on); + /* Holder is scheduled; need to re-order CPUs. + * We can't use heap_decrease() here since + * the cpu_heap is ordered in reverse direction, so + * it is actually an increase. */ + bheap_delete(cpu_lower_prio, &cluster->cpu_heap, + remote_cpu(linked_on)->hn); + bheap_insert(cpu_lower_prio, &cluster->cpu_heap, + remote_cpu(linked_on)->hn); + } else { + /* holder may be queued: first stop queue changes */ + raw_spin_lock(&cluster->domain.release_lock); + if (is_queued(t)) { + TRACE_TASK(t, "%s: is queued\n", + __FUNCTION__); + bheap_decrease(edf_ready_order, + tsk_rt(t)->heap_node); + } else { + /* Nothing to do: if it is not queued and not linked + * then it is either sleeping or currently being moved + * by other code (e.g., a timer interrupt handler) that + * will use the correct priority when enqueuing the + * task. */ + TRACE_TASK(t, "%s: is NOT queued => Done.\n", + __FUNCTION__); + } + raw_spin_unlock(&cluster->domain.release_lock); + } +} + +static void priority_lowered(struct task_struct* t) +{ + /* assumption: t is not in a release heap */ + if (is_queued(t) || tsk_rt(t)->linked_on != NO_CPU) { + unlink(t); + requeue(t); + } +} + +static void donate_priority(struct task_struct* recipient, struct task_struct* donor) +{ + cedf_domain_t *cluster = task_cpu_cluster(donor); + + BUG_ON(task_cpu_cluster(recipient) != task_cpu_cluster(donor)); + BUG_ON(tsk_rt(donor)->is_donor); + BUG_ON(tsk_rt(recipient)->is_donor); + BUG_ON(tsk_rt(donor)->inh_task); + BUG_ON(tsk_rt(recipient)->inh_task); + + TRACE_TASK(donor, "priodon: becomes priority donor for %s/%d\n", + recipient->comm, recipient->pid); + + /* swap priorities */ + tsk_rt(recipient)->inh_task = donor; + tsk_rt(donor)->inh_task = recipient; + tsk_rt(donor)->is_donor = 1; + + priority_lowered(donor); + priority_raised(recipient); + + bheap_uncache_min(edf_ready_order, + &cluster->domain.ready_queue); +} + +/* assumption: new_donor has a higher priority than old_donor */ +static void switch_donor(struct task_struct* recipient, + struct task_struct* old_donor, + struct task_struct* new_donor) +{ + TRACE_TASK(new_donor, "becomes donor for %s/%d instead of %s/%d\n", + recipient->comm, recipient->pid, old_donor->comm, old_donor->pid); + + BUG_ON(tsk_rt(recipient)->inh_task != old_donor); + BUG_ON(tsk_rt(old_donor)->inh_task != recipient); + BUG_ON(tsk_rt(new_donor)->inh_task != NULL); + BUG_ON(tsk_rt(new_donor)->is_donor); + + tsk_rt(old_donor)->inh_task = NULL; + tsk_rt(old_donor)->is_donor = 0; + + tsk_rt(recipient)->inh_task = new_donor; + tsk_rt(new_donor)->inh_task = recipient; + tsk_rt(new_donor)->is_donor = 1; + + priority_raised(recipient); + priority_raised(old_donor); + priority_lowered(new_donor); +} + +static void undonate_priority(struct task_struct* recipient, struct task_struct* donor) +{ + cedf_domain_t *cluster = task_cpu_cluster(donor); + + BUG_ON(tsk_rt(recipient)->inh_task != donor); + BUG_ON(tsk_rt(donor)->inh_task != recipient); + + TRACE_TASK(donor, "priodon: is no longer priority donor of %s/%d\n", + recipient->comm, recipient->pid); + + tsk_rt(recipient)->inh_task = NULL; + tsk_rt(donor)->inh_task = NULL; + tsk_rt(donor)->is_donor = 0; + + priority_lowered(recipient); + priority_raised(donor); + + bheap_uncache_min(edf_ready_order, + &cluster->domain.ready_queue); +} + +static inline void add_to_pending(cedf_domain_t* cluster, struct task_struct* t) +{ + TRACE_TASK(t, "priodon: adding to pending heap wait:%u donor:%u 
req:%u pend:%d\n", + tsk_rt(t)->waiting_eligible, + tsk_rt(t)->is_donor, tsk_rt(t)->request_incomplete, + tsk_rt(t)->pending_on); + bheap_insert(edf_pending_order, + &cluster->pending_jobs, + tsk_rt(t)->pending_node); +} + +static inline struct task_struct* take_pending(cedf_domain_t* cluster) +{ + struct bheap_node* node; + node = bheap_take(edf_pending_order, &cluster->pending_jobs); + return node ? (struct task_struct*) node->value : NULL; +} + +static inline struct task_struct* peek_pending(cedf_domain_t* cluster) +{ + struct bheap_node* node; + node = bheap_peek(edf_pending_order, &cluster->pending_jobs); + return node ? (struct task_struct*) node->value : NULL; +} + +static inline int fake_resume(struct task_struct* t) +{ + TRACE_TASK(t, "priodon: fake resume wait:%u donor:%u\n", + tsk_rt(t)->waiting_eligible, tsk_rt(t)->is_donor); + /* Fake suspended. Let's resume it. */ + if (tsk_rt(t)->waiting_eligible) { + tsk_rt(t)->waiting_eligible = 0; + if (tsk_rt(t)->scheduled_on == NO_CPU) { + /* it was removed from the queue */ + requeue(t); + return 1; + } + } + return 0; +} + + +/* Lazily update set of highest-priority pending jobs. + * Returns 1 if priority recheck is required. + */ +static int update_pending_job(cedf_domain_t* cluster, + struct task_struct* to_be_linked) +{ + cpu_entry_t* entry; + struct task_struct* lowest_hp; /* lowest-priority high-priority task */ + struct task_struct* highest_lp; /* highest-priority low-priority task */ + int reeval = 0; + + entry = lowest_pending_cpu(cluster); + lowest_hp = entry->pending; + + if (to_be_linked && !is_pending(to_be_linked)) + /* not yet accounted for, stick in heap */ + add_to_pending(cluster, to_be_linked); + + highest_lp = peek_pending(cluster); + if (edf_higher_base_prio(highest_lp, lowest_hp)) { + /* yep, should be become of the c highest-prior pending jobs */ + + TRACE_TASK(highest_lp, + "priodon: became one of the %u highest-prio tasks (P%d, req:%u) X\n", + cluster->num_cpus, + entry->cpu, + tsk_rt(highest_lp)->request_incomplete); + + /* get it out of the heap */ + highest_lp = take_pending(cluster); + + BUG_ON(highest_lp == lowest_hp); + + /* it should never be a priority donor at this point */ + BUG_ON(tsk_rt(highest_lp)->is_donor); + + entry->pending = highest_lp; + update_pending_position(entry); + tsk_rt(highest_lp)->pending_on = entry->cpu; + + /* things that could happen: + * + * 1) lowest_hp has no donor, but is in a request => highest_lp becomes donor + * 2) lowest_hp is donor => highest_lp becomes new donor, old donor is resumed if suspended + * 3) lowest_hp is not in a request, and highest_lp is waiting => highest_lp is resumed + * 4) lowest_hp is not in a request, and highest_lp is not waiting => nothing to do + * 5) highest_lp has a priority donor => resume its donor + */ + + /* do we need to put it back? 
*/ + if (lowest_hp) { + TRACE_TASK(lowest_hp, + "priodon: no longer among %u highest-prio tasks req:%u\n", + cluster->num_cpus, + tsk_rt(lowest_hp)->request_incomplete); + tsk_rt(lowest_hp)->pending_on = NO_CPU; + add_to_pending(cluster, lowest_hp); + + + if (tsk_rt(lowest_hp)->request_incomplete) { + /* case 1) */ + donate_priority(lowest_hp, highest_lp); + reeval = 1; + } else if (tsk_rt(lowest_hp)->inh_task) { + /* case 2) */ + switch_donor(tsk_rt(lowest_hp)->inh_task, + lowest_hp, highest_lp); + fake_resume(lowest_hp); + reeval = 1; + } + } + + + if (!tsk_rt(highest_lp)->is_donor) { + if (tsk_rt(highest_lp)->waiting_eligible) { + /* case 3) */ + reeval = fake_resume(highest_lp); + BUG_ON(tsk_rt(highest_lp)->inh_task); + } else if (tsk_rt(highest_lp)->inh_task) { + /* case 5 */ + struct task_struct* donor = tsk_rt(highest_lp)->inh_task; + undonate_priority(highest_lp, donor); + reeval = fake_resume(donor); + } + } + } + + return reeval; +} + +/* job has exited => no longer pending */ + +static void job_pending_exit(struct task_struct* t) +{ + cedf_domain_t *cluster; + cpu_entry_t* entry; + + TRACE_TASK(t, "priodon: is no longer pending (pending_on:%d, queued:%d)\n", + tsk_rt(t)->pending_on, in_pending_heap(t)); + + cluster = task_cpu_cluster(t); + + if (tsk_rt(t)->pending_on != NO_CPU) { + entry = &per_cpu(cedf_cpu_entries, tsk_rt(t)->pending_on); + tsk_rt(t)->pending_on = NO_CPU; + entry->pending = NULL; + update_pending_position(entry); + + /* let's see if anything changed */ + update_pending_job(cluster, NULL); + } else if (in_pending_heap(t)) { + bheap_delete(edf_pending_order, &cluster->pending_jobs, + tsk_rt(t)->pending_node); + } +} + +#endif + + +/* cedf_job_arrival: task is either resumed or released */ +static noinline void cedf_job_arrival(struct task_struct* task) +{ + cedf_domain_t *cluster = task_cpu_cluster(task); + BUG_ON(!task); + + requeue(task); + check_for_preemptions(cluster); +} + + +static void cedf_release_jobs(rt_domain_t* rt, struct bheap* tasks) +{ + cedf_domain_t* cluster = container_of(rt, cedf_domain_t, domain); + unsigned long flags; + + raw_spin_lock_irqsave(&cluster->cluster_lock, flags); + + __merge_ready(&cluster->domain, tasks); + check_for_preemptions(cluster); + + raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags); +} + +/* caller holds cedf_lock */ +static noinline void job_completion(struct task_struct *t, int forced) +{ + BUG_ON(!t); + + sched_trace_task_completion(t, forced); + + TRACE_TASK(t, "job_completion().\n"); + +#ifdef CONFIG_LITMUS_LOCKING + job_pending_exit(t); +#endif + + /* prepare for next period */ + prepare_for_next_period(t); + if (is_released(t, litmus_clock())) + sched_trace_task_release(t); + /* unlink */ + unlink(t); + /* requeue + * But don't requeue a blocking task. */ + set_rt_flags(t, RT_F_RUNNING); + if (is_running(t)) + cedf_job_arrival(t); +} + +/* cedf_tick - this function is called for every local timer + * interrupt. 
+ * + * checks whether the current task has expired and checks + * whether we need to preempt it if it has not expired + */ +static void cedf_tick(struct task_struct* t) +{ + if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) { + if (!is_np(t)) { + /* np tasks will be preempted when they become + * preemptable again + */ + litmus_reschedule_local(); + TRACE("cedf_scheduler_tick: " + "%d is preemptable " + " => FORCE_RESCHED\n", t->pid); + } else if (is_user_np(t)) { + TRACE("cedf_scheduler_tick: " + "%d is non-preemptable, " + "preemption delayed.\n", t->pid); + request_exit_np(t); + } + } +} + +/* Getting schedule() right is a bit tricky. schedule() may not make any + * assumptions on the state of the current task since it may be called for a + * number of reasons. The reasons include a scheduler_tick() determined that it + * was necessary, because sys_exit_np() was called, because some Linux + * subsystem determined so, or even (in the worst case) because there is a bug + * hidden somewhere. Thus, we must take extreme care to determine what the + * current state is. + * + * The CPU could currently be scheduling a task (or not), be linked (or not). + * + * The following assertions for the scheduled task could hold: + * + * - !is_running(scheduled) // the job blocks + * - scheduled->timeslice == 0 // the job completed (forcefully) + * - get_rt_flag() == RT_F_SLEEP // the job completed (by syscall) + * - linked != scheduled // we need to reschedule (for any reason) + * - is_np(scheduled) // rescheduling must be delayed, + * sys_exit_np must be requested + * + * Any of these can occur together. + */ +static struct task_struct* cedf_schedule(struct task_struct * prev) +{ + cpu_entry_t* entry = &__get_cpu_var(cedf_cpu_entries); + cedf_domain_t *cluster = entry->cluster; + int out_of_time, sleep, preempt, np, exists, blocks; + struct task_struct* next = NULL; + +#ifdef CONFIG_LITMUS_LOCKING + int priodon; +#else +#define priodon 0 +#endif + +#ifdef CONFIG_RELEASE_MASTER + /* Bail out early if we are the release master. + * The release master never schedules any real-time tasks. 
+ */ + if (cluster->domain.release_master == entry->cpu) { + sched_state_task_picked(); + return NULL; + } +#endif + + raw_spin_lock(&cluster->cluster_lock); + + /* sanity checking */ + BUG_ON(entry->scheduled && entry->scheduled != prev); + BUG_ON(entry->scheduled && !is_realtime(prev)); + BUG_ON(is_realtime(prev) && !entry->scheduled); + + /* (0) Determine state */ + exists = entry->scheduled != NULL; + blocks = exists && !is_running(entry->scheduled); + out_of_time = exists && + budget_enforced(entry->scheduled) && + budget_exhausted(entry->scheduled); + np = exists && is_np(entry->scheduled); + sleep = exists && get_rt_flags(entry->scheduled) == RT_F_SLEEP; + preempt = entry->scheduled != entry->linked; + +#ifdef CONFIG_LITMUS_LOCKING + priodon = exists && (tsk_rt(entry->scheduled)->waiting_eligible || + /* can't allow job to exit until request is over */ + (tsk_rt(entry->scheduled)->is_donor && sleep)); + + /* this should never happend together (at least we don't handle it atm) */ + BUG_ON(priodon && blocks); +#endif + +#ifdef WANT_ALL_SCHED_EVENTS + TRACE_TASK(prev, "invoked cedf_schedule.\n"); +#endif + + if (exists) + TRACE_TASK(prev, + "blocks:%d out_of_time:%d np:%d sleep:%d preempt:%d " + "state:%d sig:%d priodon:%d\n", + blocks, out_of_time, np, sleep, preempt, + prev->state, signal_pending(prev), priodon); + if (entry->linked && preempt) + TRACE_TASK(prev, "will be preempted by %s/%d\n", + entry->linked->comm, entry->linked->pid); + + + /* If a task blocks we have no choice but to reschedule. + */ + if (blocks || priodon) + unlink(entry->scheduled); + + /* Request a sys_exit_np() call if we would like to preempt but cannot. + * Do not unlink since entry->scheduled is currently in the ready queue. + * We don't process out_of_time and sleep until the job is preemptive again. + */ + if (np && (out_of_time || preempt || sleep)) { + request_exit_np(entry->scheduled); + } + + /* Any task that is preemptable and either exhausts its execution + * budget or wants to sleep completes. We may have to reschedule after + * this. Don't do a job completion if we block (can't have timers running + * for blocked jobs). Preemption go first for the same reason. + */ + if (!np && (out_of_time || sleep) && !blocks && !preempt + && !priodon) + /* note: priority donation prevents job completion */ + job_completion(entry->scheduled, !sleep); + + /* Link pending task if we became unlinked. + */ + + if (!entry->linked) { +#ifdef CONFIG_LITMUS_LOCKING + struct task_struct *pulled; + int reeval; + do { + pulled = __take_ready(&cluster->domain); + reeval = 0; + if (pulled && !is_pending(pulled)) { + /* Pulled an un-processed task from the ready queue. */ + TRACE_TASK(pulled, "pulled unprocessed\n"); + reeval = update_pending_job(cluster, pulled); + if (reeval) + /* priority may have changed --- try again */ + requeue(pulled); + } + } while (reeval); + link_task_to_cpu(pulled, entry); +#else + link_task_to_cpu(__take_ready(&cluster->domain), entry); +#endif + } + + /* The final scheduling decision. Do we need to switch for some reason? + * If linked is different from scheduled, then select linked as next. + */ + if ((!np || blocks || priodon) && + entry->linked != entry->scheduled) { + /* Schedule a linked job? 
*/ + if (entry->linked) { + entry->linked->rt_param.scheduled_on = entry->cpu; + next = entry->linked; + } + if (entry->scheduled) { + /* not gonna be scheduled soon */ + entry->scheduled->rt_param.scheduled_on = NO_CPU; + TRACE_TASK(entry->scheduled, "scheduled_on = NO_CPU\n"); + } + } else + /* Only override Linux scheduler if we have a real-time task + * scheduled that needs to continue. + */ + if (exists) + next = prev; + + sched_state_task_picked(); + raw_spin_unlock(&cluster->cluster_lock); + +#ifdef WANT_ALL_SCHED_EVENTS + TRACE("cedf_lock released, next=0x%p\n", next); + + if (next) + TRACE_TASK(next, "scheduled at %llu\n", litmus_clock()); + else if (exists && !next) + TRACE("becomes idle at %llu.\n", litmus_clock()); +#endif + + + return next; +} + + +/* _finish_switch - we just finished the switch away from prev + */ +static void cedf_finish_switch(struct task_struct *prev) +{ + cpu_entry_t* entry = &__get_cpu_var(cedf_cpu_entries); + + entry->scheduled = is_realtime(current) ? current : NULL; +#ifdef WANT_ALL_SCHED_EVENTS + TRACE_TASK(prev, "switched away from\n"); +#endif +} + + +/* Prepare a task for running in RT mode + */ +static void cedf_task_new(struct task_struct * t, int on_rq, int running) +{ + unsigned long flags; + cpu_entry_t* entry; + cedf_domain_t* cluster; + + TRACE("gsn edf: task new %d\n", t->pid); + + /* the cluster doesn't change even if t is running */ + cluster = task_cpu_cluster(t); + + raw_spin_lock_irqsave(&cluster->cluster_lock, flags); + + /* setup job params */ + release_at(t, litmus_clock()); + +#ifdef CONFIG_LITMUS_LOCKING + tsk_rt(t)->pending_node = bheap_node_alloc(GFP_ATOMIC | __GFP_NOFAIL); + bheap_node_init(&tsk_rt(t)->pending_node, t); + tsk_rt(t)->pending_on = NO_CPU; + add_to_pending(cluster, t); +#endif + + if (running) { + entry = &per_cpu(cedf_cpu_entries, task_cpu(t)); + BUG_ON(entry->scheduled); + +#ifdef CONFIG_RELEASE_MASTER + if (entry->cpu != cluster->domain.release_master) { +#endif + entry->scheduled = t; + tsk_rt(t)->scheduled_on = task_cpu(t); +#ifdef CONFIG_RELEASE_MASTER + } else { + /* do not schedule on release master */ + preempt(entry); /* force resched */ + tsk_rt(t)->scheduled_on = NO_CPU; + } +#endif + } else { + t->rt_param.scheduled_on = NO_CPU; + } + t->rt_param.linked_on = NO_CPU; + + cedf_job_arrival(t); + raw_spin_unlock_irqrestore(&(cluster->cluster_lock), flags); +} + +static void cedf_task_wake_up(struct task_struct *task) +{ + unsigned long flags; + lt_t now; + cedf_domain_t *cluster; + + TRACE_TASK(task, "wake_up at %llu\n", litmus_clock()); + + cluster = task_cpu_cluster(task); + + raw_spin_lock_irqsave(&cluster->cluster_lock, flags); + /* We need to take suspensions because of semaphores into + * account! If a job resumes after being suspended due to acquiring + * a semaphore, it should never be treated as a new job release. 
+ */ + if (get_rt_flags(task) == RT_F_EXIT_SEM) { + set_rt_flags(task, RT_F_RUNNING); + } else { + now = litmus_clock(); + if (is_tardy(task, now)) { + /* new sporadic release */ + release_at(task, now); + sched_trace_task_release(task); + } + else { + if (task->rt.time_slice) { + /* came back in time before deadline + */ + set_rt_flags(task, RT_F_RUNNING); + } + } + } + cedf_job_arrival(task); + raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags); +} + +static void cedf_task_block(struct task_struct *t) +{ + unsigned long flags; + cedf_domain_t *cluster; + + TRACE_TASK(t, "block at %llu\n", litmus_clock()); + + cluster = task_cpu_cluster(t); + + /* unlink if necessary */ + raw_spin_lock_irqsave(&cluster->cluster_lock, flags); + unlink(t); + raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags); + + BUG_ON(!is_realtime(t)); +} + +#ifdef CONFIG_LITMUS_LOCKING +static void cedf_pre_setsched(struct task_struct *t, int policy) +{ + + unsigned long flags; + cedf_domain_t *cluster = task_cpu_cluster(t); + + int delay_donor_exit = 0; + + while (1) { + raw_spin_lock_irqsave(&cluster->cluster_lock, flags); + + TRACE_CUR("cedf_pre_setsched wait:%u pend:%d donor:%u req:%u\n", + tsk_rt(t)->waiting_eligible, + tsk_rt(t)->pending_on, tsk_rt(t)->is_donor, + tsk_rt(t)->request_incomplete); + + delay_donor_exit = tsk_rt(current)->is_donor; + + raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags); + + if (!delay_donor_exit) + break; + + TRACE_CUR("donor exit delay\n"); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ); + } +} +#endif + +static void cedf_task_exit(struct task_struct * t) +{ + unsigned long flags; + cedf_domain_t *cluster = task_cpu_cluster(t); + + /* unlink if necessary */ + raw_spin_lock_irqsave(&cluster->cluster_lock, flags); + + unlink(t); + +#ifdef CONFIG_LITMUS_LOCKING + /* make sure it's not pending anymore */ + job_pending_exit(t); + bheap_node_free(tsk_rt(t)->pending_node); +#endif + + if (tsk_rt(t)->scheduled_on != NO_CPU) { + cpu_entry_t *cpu; + cpu = &per_cpu(cedf_cpu_entries, tsk_rt(t)->scheduled_on); + cpu->scheduled = NULL; + tsk_rt(t)->scheduled_on = NO_CPU; + } + raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags); + + + BUG_ON(!is_realtime(t)); + TRACE_TASK(t, "RIP\n"); +} + +#ifdef CONFIG_LITMUS_LOCKING + +#include +#include + +/* NOTE: we use fake suspensions because we must wake the task from within the + * scheduler */ + +/* suspend until the current task becomes eligible to issue a lock request */ +static void priodon_become_eligible(void) +{ + struct task_struct* t = current; + unsigned long flags; + cedf_domain_t *cluster; + + cluster = task_cpu_cluster(t); + + do { + TRACE_CUR("priodon: checking whether request may be issued\n"); + raw_spin_lock_irqsave(&cluster->cluster_lock, flags); + + if (tsk_rt(t)->pending_on == NO_CPU || + tsk_rt(t)->is_donor) { + /* nope, gotta wait */ + tsk_rt(t)->waiting_eligible = 1; + TRACE_CUR("priodon: not eligible pend:%u donor:%u\n", + tsk_rt(t)->pending_on, tsk_rt(t)->is_donor); + } else { + /* alright! we are good to go! 
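+ * (t is currently pending on a CPU and is not acting as a priority
+ * donor, so it may enter its critical section. Setting
+ * request_incomplete below records the outstanding request: should t
+ * later lose its slot among the c highest-priority pending jobs,
+ * update_pending_job() will give it a priority donor until
+ * priodon_complete_request() clears the flag.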
*/ + tsk_rt(t)->request_incomplete = 1; + TRACE_CUR("priodon: request issued\n"); + } + + raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags); + + if (tsk_rt(t)->waiting_eligible) { + TRACE_CUR("priodon: fake suspending\n"); + TS_LOCK_SUSPEND; + schedule(); + TS_LOCK_RESUME; + } + + } while (!tsk_rt(t)->request_incomplete); +} + +/* current task has completed its request */ +static void priodon_complete_request(void) +{ + struct task_struct* t = current; + struct task_struct* donor; + unsigned long flags; + cedf_domain_t *cluster; + + cluster = task_cpu_cluster(t); + + preempt_disable(); + + raw_spin_lock_irqsave(&cluster->cluster_lock, flags); + + TRACE_CUR("priodon: completing request\n"); + + if (tsk_rt(t)->inh_task) { + /* we have a donor job --- see if we need to wake it */ + donor = tsk_rt(t)->inh_task; + undonate_priority(t, donor); + + if (fake_resume(donor)) + check_for_preemptions(cluster); + } + + tsk_rt(t)->request_incomplete = 0; + + raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags); + + preempt_enable(); +} + +/* struct for semaphore with priority inheritance */ +struct omlp_semaphore { + struct litmus_lock litmus_lock; + + /* current resource holder */ + struct task_struct *owner; + + /* FIFO queue of waiting tasks */ + wait_queue_head_t fifo_wait; +}; + +static inline struct omlp_semaphore* omlp_from_lock(struct litmus_lock* lock) +{ + return container_of(lock, struct omlp_semaphore, litmus_lock); +} + +static int cedf_omlp_lock(struct litmus_lock* l) +{ + struct task_struct* t = current; + struct omlp_semaphore *sem = omlp_from_lock(l); + wait_queue_t wait; + unsigned long flags; + + if (!is_realtime(t)) + return -EPERM; + + priodon_become_eligible(); + + spin_lock_irqsave(&sem->fifo_wait.lock, flags); + + if (sem->owner) { + /* resource is not free => must suspend and wait */ + + init_waitqueue_entry(&wait, t); + + set_task_state(t, TASK_UNINTERRUPTIBLE); + + __add_wait_queue_tail_exclusive(&sem->fifo_wait, &wait); + + TS_LOCK_SUSPEND; + + spin_unlock_irqrestore(&sem->fifo_wait.lock, flags); + + schedule(); + + TS_LOCK_RESUME; + + BUG_ON(sem->owner != t); + } else { + /* it's ours now */ + sem->owner = t; + + spin_unlock_irqrestore(&sem->fifo_wait.lock, flags); + } + + return 0; +} + +static int cedf_omlp_unlock(struct litmus_lock* l) +{ + struct task_struct *t = current, *next; + struct omlp_semaphore *sem = omlp_from_lock(l); + unsigned long flags; + int err = 0; + + spin_lock_irqsave(&sem->fifo_wait.lock, flags); + + if (sem->owner != t) { + err = -EINVAL; + spin_unlock_irqrestore(&sem->fifo_wait.lock, flags); + goto out; + } + + /* check if there are jobs waiting for this resource */ + next = __waitqueue_remove_first(&sem->fifo_wait); + if (next) { + /* next becomes the resouce holder */ + sem->owner = next; + TRACE_CUR("lock ownership passed to %s/%d\n", next->comm, next->pid); + + /* wake up next */ + wake_up_process(next); + } else + /* becomes available */ + sem->owner = NULL; + + spin_unlock_irqrestore(&sem->fifo_wait.lock, flags); + + priodon_complete_request(); + +out: + return err; +} + +static int cedf_omlp_close(struct litmus_lock* l) +{ + struct task_struct *t = current; + struct omlp_semaphore *sem = omlp_from_lock(l); + unsigned long flags; + + int owner; + + spin_lock_irqsave(&sem->fifo_wait.lock, flags); + + owner = sem->owner == t; + + spin_unlock_irqrestore(&sem->fifo_wait.lock, flags); + + if (owner) + cedf_omlp_unlock(l); + + return 0; +} + +static void cedf_omlp_free(struct litmus_lock* lock) +{ + kfree(omlp_from_lock(lock)); +} + 
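+/* Illustrative summary (not part of the algorithm itself): a job's critical
+ * section under this OMLP variant with priority donation proceeds roughly as
+ * follows.
+ *
+ *   1. cedf_omlp_lock(): priodon_become_eligible() fake-suspends the job
+ *      until it is one of the cluster's c highest-priority pending jobs and
+ *      not itself a donor; only then is the request issued.
+ *   2. If the semaphore is already held, the job waits in FIFO order on
+ *      fifo_wait; otherwise it becomes the owner immediately.
+ *   3. cedf_omlp_unlock(): ownership is passed to the first FIFO waiter (if
+ *      any), then priodon_complete_request() returns any donated priority
+ *      and wakes a fake-suspended donor.
+ *
+ * From user space this would be driven through the usual liblitmus object
+ * calls; the helper names below are assumptions given for illustration only:
+ *
+ *   int od = od_open(fd, OMLP_SEM, 0);   // assumed liblitmus helper
+ *   litmus_lock(od);                     // ends up in cedf_omlp_lock()
+ *   ... critical section ...
+ *   litmus_unlock(od);                   // ends up in cedf_omlp_unlock()
+ */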
+static struct litmus_lock_ops cedf_omlp_lock_ops = { + .close = cedf_omlp_close, + .lock = cedf_omlp_lock, + .unlock = cedf_omlp_unlock, + .deallocate = cedf_omlp_free, +}; + +static struct litmus_lock* cedf_new_omlp(void) +{ + struct omlp_semaphore* sem; + + sem = kmalloc(sizeof(*sem), GFP_KERNEL); + if (!sem) + return NULL; + + sem->owner = NULL; + init_waitqueue_head(&sem->fifo_wait); + sem->litmus_lock.ops = &cedf_omlp_lock_ops; + + return &sem->litmus_lock; +} + +static long cedf_allocate_lock(struct litmus_lock **lock, int type, + void* __user unused) +{ + int err = -ENXIO; + + switch (type) { + + case OMLP_SEM: + /* O(m) Multiprocessor Locking Protocol */ + *lock = cedf_new_omlp(); + if (*lock) + err = 0; + else + err = -ENOMEM; + break; + + }; + + return err; +} + + +#endif + +static long cedf_admit_task(struct task_struct* tsk) +{ + if (task_cpu(tsk) == tsk->rt_param.task_params.cpu) { +#ifdef CONFIG_LITMUS_LOCKING + +#endif + return 0; + } + else + return -EINVAL; +} + +/* total number of cluster */ +static int num_clusters; +/* we do not support cluster of different sizes */ +static unsigned int cluster_size; + +#ifdef VERBOSE_INIT +static void print_cluster_topology(cpumask_var_t mask, int cpu) +{ + int chk; + char buf[255]; + + chk = cpulist_scnprintf(buf, 254, mask); + buf[chk] = '\0'; + printk(KERN_INFO "CPU = %d, shared cpu(s) = %s\n", cpu, buf); + +} +#endif + +static int clusters_allocated = 0; + +static void cleanup_cedf(void) +{ + int i; + + if (clusters_allocated) { + for (i = 0; i < num_clusters; i++) { + free_cpumask_var(cedf[i].cpu_map); + } + + kfree(cedf); + } +} + +static long cedf_activate_plugin(void) +{ + int i, j, cpu, ccpu, cpu_count; + cpu_entry_t *entry; + + cpumask_var_t mask; + int chk = 0; + + /* de-allocate old clusters, if any */ + cleanup_cedf(); + + printk(KERN_INFO "C-EDF: Activate Plugin, cluster configuration = %d\n", + cluster_config); + + /* need to get cluster_size first */ + if(!zalloc_cpumask_var(&mask, GFP_ATOMIC)) + return -ENOMEM; + + if (unlikely(cluster_config == GLOBAL_CLUSTER)) { + cluster_size = num_online_cpus(); + } else { + chk = get_shared_cpu_map(mask, 0, cluster_config); + if (chk) { + /* if chk != 0 then it is the max allowed index */ + printk(KERN_INFO "C-EDF: Cluster configuration = %d " + "is not supported on this hardware.\n", + cluster_config); + /* User should notice that the configuration failed, so + * let's bail out. 
*/ + return -EINVAL; + } + + cluster_size = cpumask_weight(mask); + } + + if ((num_online_cpus() % cluster_size) != 0) { + /* this can't be right, some cpus are left out */ + printk(KERN_ERR "C-EDF: Trying to group %d cpus in %d!\n", + num_online_cpus(), cluster_size); + return -1; + } + + num_clusters = num_online_cpus() / cluster_size; + printk(KERN_INFO "C-EDF: %d cluster(s) of size = %d\n", + num_clusters, cluster_size); + + /* initialize clusters */ + cedf = kmalloc(num_clusters * sizeof(cedf_domain_t), GFP_ATOMIC); + for (i = 0; i < num_clusters; i++) { + bheap_init(&(cedf[i].cpu_heap)); +#ifdef CONFIG_LITMUS_LOCKING + bheap_init(&(cedf[i].pending_jobs)); + bheap_init(&(cedf[i].pending_cpus)); +#endif + edf_domain_init(&(cedf[i].domain), NULL, cedf_release_jobs); + + if(!zalloc_cpumask_var(&cedf[i].cpu_map, GFP_ATOMIC)) + return -ENOMEM; +#ifdef CONFIG_RELEASE_MASTER + cedf[i].domain.release_master = atomic_read(&release_master_cpu); +#endif + } + + /* cycle through cluster and add cpus to them */ + for (i = 0; i < num_clusters; i++) { + + for_each_online_cpu(cpu) { + /* check if the cpu is already in a cluster */ + for (j = 0; j < num_clusters; j++) + if (cpumask_test_cpu(cpu, cedf[j].cpu_map)) + break; + /* if it is in a cluster go to next cpu */ + if (j < num_clusters && + cpumask_test_cpu(cpu, cedf[j].cpu_map)) + continue; + + /* this cpu isn't in any cluster */ + /* get the shared cpus */ + if (unlikely(cluster_config == GLOBAL_CLUSTER)) + cpumask_copy(mask, cpu_online_mask); + else + get_shared_cpu_map(mask, cpu, cluster_config); + + cpumask_copy(cedf[i].cpu_map, mask); +#ifdef VERBOSE_INIT + print_cluster_topology(mask, cpu); +#endif + /* add cpus to current cluster and init cpu_entry_t */ + cpu_count = 0; + cedf[i].num_cpus = 0; + for_each_cpu(ccpu, cedf[i].cpu_map) { + + entry = &per_cpu(cedf_cpu_entries, ccpu); + atomic_set(&entry->will_schedule, 0); + entry->cpu = ccpu; + entry->cluster = &cedf[i]; + entry->hn = cpu_nodes + ccpu; + bheap_node_init(&entry->hn, entry); + +#ifdef CONFIG_LITMUS_LOCKING + entry->pending_hn = pending_nodes + ccpu; + bheap_node_init(&entry->pending_hn, entry); + entry->pending = NULL; +#endif + + cpu_count++; + + entry->linked = NULL; + entry->scheduled = NULL; +#ifdef CONFIG_RELEASE_MASTER + /* only add CPUs that should schedule jobs */ + if (entry->cpu != entry->cluster->domain.release_master) +#endif + { + cedf[i].num_cpus++; + update_cpu_position(entry); +#ifdef CONFIG_LITMUS_LOCKING + update_pending_position(entry); +#endif + } + } + /* done with this cluster */ + break; + } + } + + free_cpumask_var(mask); + clusters_allocated = 1; + return 0; +} + +/* Plugin object */ +static struct sched_plugin cedf_plugin __cacheline_aligned_in_smp = { + .plugin_name = "C-EDF", + .finish_switch = cedf_finish_switch, + .tick = cedf_tick, + .task_new = cedf_task_new, + .complete_job = complete_job, + .task_exit = cedf_task_exit, + .schedule = cedf_schedule, + .task_wake_up = cedf_task_wake_up, + .task_block = cedf_task_block, + .admit_task = cedf_admit_task, + .activate_plugin = cedf_activate_plugin, +#ifdef CONFIG_LITMUS_LOCKING + .allocate_lock = cedf_allocate_lock, + .pre_setsched = cedf_pre_setsched, +#endif +}; + +static struct proc_dir_entry *cluster_file = NULL, *cedf_dir = NULL; + +static int __init init_cedf(void) +{ + int err, fs; + + err = register_sched_plugin(&cedf_plugin); + if (!err) { + fs = make_plugin_proc_dir(&cedf_plugin, &cedf_dir); + if (!fs) + cluster_file = create_cluster_file(cedf_dir, &cluster_config); + else + printk(KERN_ERR 
"Could not allocate C-EDF procfs dir.\n"); + } + return err; +} + +static void clean_cedf(void) +{ + cleanup_cedf(); + if (cluster_file) + remove_proc_entry("cluster", cedf_dir); + if (cedf_dir) + remove_plugin_proc_dir(&cedf_plugin); +} + +module_init(init_cedf); +module_exit(clean_cedf); diff --git a/litmus/sched_cedf.c.rej b/litmus/sched_cedf.c.rej new file mode 100644 index 000000000000..ec74da6c4a64 --- /dev/null +++ b/litmus/sched_cedf.c.rej @@ -0,0 +1,53 @@ +--- litmus/sched_cedf.c ++++ litmus/sched_cedf.c +@@ -739,6 +1100,12 @@ + int out_of_time, sleep, preempt, np, exists, blocks; + struct task_struct* next = NULL; + ++#ifdef CONFIG_LITMUS_LOCKING ++ int priodon; ++#else ++#define priodon 0 ++#endif ++ + #ifdef CONFIG_RELEASE_MASTER + /* Bail out early if we are the release master. + * The release master never schedules any real-time tasks. +@@ -750,7 +1117,6 @@ + #endif + + raw_spin_lock(&cluster->cluster_lock); +- clear_will_schedule(); + + /* sanity checking */ + BUG_ON(entry->scheduled && entry->scheduled != prev); +@@ -1032,7 +1466,15 @@ + + /* unlink if necessary */ + raw_spin_lock_irqsave(&cluster->cluster_lock, flags); ++ + unlink(t); ++ ++#ifdef CONFIG_LITMUS_LOCKING ++ /* make sure it's not pending anymore */ ++ job_pending_exit(t); ++ bheap_node_free(tsk_rt(t)->pending_node); ++#endif ++ + if (tsk_rt(t)->scheduled_on != NO_CPU) { + cpu_entry_t *cpu; + cpu = &per_cpu(cedf_cpu_entries, tsk_rt(t)->scheduled_on); +@@ -1446,7 +2140,13 @@ + /* only add CPUs that should schedule jobs */ + if (entry->cpu != entry->cluster->domain.release_master) + #endif ++ { ++ cedf[i].num_cpus++; + update_cpu_position(entry); ++#ifdef CONFIG_LITMUS_LOCKING ++ update_pending_position(entry); ++#endif ++ } + } + /* done with this cluster */ + break; diff --git a/litmus/sched_gfl_split_namechange.c b/litmus/sched_gfl_split_namechange.c new file mode 100644 index 000000000000..c154b115a00e --- /dev/null +++ b/litmus/sched_gfl_split_namechange.c @@ -0,0 +1,1149 @@ +/* + * litmus/sched_gfl_split.c + * + * Implementation of the G-FL with job splitting. See the Erickson/Anderson + * paper at ECRTS 2012 for a description of G-FL. + * + * This plugin is a modified version of the prior GSN-EDF-split plugin in + * litmus/sched_gsn_edf_split.c. Job splitting works the same way as in that + * plugin. The subjob "deadlines" (really priorities) are computed according + * to G-FL with respect to the post-split (smaller) jobs. 
+ * + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include + +#ifdef CONFIG_SCHED_CPU_AFFINITY +#include +#endif + +#include + +/* cpu_entry_t - maintain the linked and scheduled state + */ +typedef struct { + int cpu; + struct task_struct* linked; /* only RT tasks */ + struct task_struct* scheduled; /* only RT tasks */ + struct bheap_node* hn; + struct hrtimer split_timer; + int timer_armed; +} cpu_entry_t; +DEFINE_PER_CPU(cpu_entry_t, gsnedf_cpu_entries); + +cpu_entry_t* gsnedf_cpus[NR_CPUS]; + +/* the cpus queue themselves according to priority in here */ +static struct bheap_node gsnedf_heap_node[NR_CPUS]; +static struct bheap gsnedf_cpu_heap; + +static rt_domain_t gsnedf; +#define gsnedf_lock (gsnedf.ready_lock) + +inline static int get_slice_num(struct task_struct* t) +{ + int basic = ((t->rt_param.job_params.exec_time * + t->rt_param.task_params.split) / + t->rt_param.task_params.exec_cost) + 1; + if (basic <= t->rt_param.task_params.split){ + return basic; + } + else{ + /*Since we don't police budget, just leave where it's at.*/ + return t->rt_param.task_params.split; + } +} + +/* Returns the appropriate subjob deadline.*/ +inline static lt_t get_proper_deadline(struct task_struct* t) +{ + unsigned int num_cpus = num_online_cpus(); + return t->rt_param.job_params.release + + ((t->rt_param.task_params.period * get_slice_num(t)) + / t->rt_param.task_params.split) + /* G-FL correction */ + - (((num_cpus - 1) * t->rt_param.task_params.exec_cost) + / (num_cpus * t->rt_param.task_params.split)); +} + +/* Tells us if the current deadline is too small.*/ +inline static int needs_deadline_move(struct task_struct* t) +{ + BUG_ON(get_proper_deadline(t) < t->rt_param.job_params.subjob_deadline); +#ifdef CONFIG_LITMUS_LOCKING + return !is_in_crit_section(t) && + (get_proper_deadline(t) != + tsk_rt(t)->job_params.subjob_deadline); +#else + return get_proper_deadline(t) != tsk_rt(t)->job_params.subjob_deadline; +#endif +} + +/*Returns execution time until the next deadline move. + * 0 means the task has no more deadline moves + */ +inline static lt_t time_to_next_move(struct task_struct* t) +{ + if (get_slice_num(t) == t->rt_param.task_params.split){ + return 0; + } + /* +1 upper bounds ceiling, since integer division is floor*/ + return ((get_slice_num(t) * t->rt_param.task_params.exec_cost) + / t->rt_param.task_params.split) + 1 + - t->rt_param.job_params.exec_time; +} + +/* Timer stuff - similar to budget.c. */ +static enum hrtimer_restart on_split_timeout(struct hrtimer *timer) +{ + cpu_entry_t* st = container_of(timer, + cpu_entry_t, + split_timer); + + unsigned long flags; + + local_irq_save(flags); + TRACE("split timer fired.\n"); + st->timer_armed = 0; + /* Activate scheduler */ + litmus_reschedule_local(); + local_irq_restore(flags); + + return HRTIMER_NORESTART; +} + +static void cancel_split_timer(cpu_entry_t* ce) +{ + int ret; + + TRACE("cancelling split time.\n"); + + /* Since interrupts are disabled and et->timer_armed is only + * modified locally, we do not need any locks. + */ + + if (ce->timer_armed) { + ret = hrtimer_try_to_cancel(&ce->split_timer); + /* Should never be inactive. 
*/ + BUG_ON(ret == 0); + /* Should never be running concurrently.*/ + BUG_ON(ret == -1); + + ce->timer_armed = 0; + } +} + +/* assumes called with IRQs off */ +static void arm_split_timer(cpu_entry_t *ce, + struct task_struct* t) +{ + lt_t when_to_fire; + lt_t time_to_move; + TRACE_TASK(t, "arming split timer.\n"); + + /* __hrtimer_start_range_ns() cancels the timer + * anyway, so we don't have to check whether it is still armed */ + + /*We won't do any new deadline moves if the budget has been exhausted*/ + if (likely(!is_np(t) && (time_to_move = time_to_next_move(t)))) { + when_to_fire = litmus_clock() + time_to_move; + TRACE_TASK(t, "actually arming for %llu into the future\n", + time_to_move); + __hrtimer_start_range_ns(&ce->split_timer, + ns_to_ktime(when_to_fire), + 0 /* delta */, + HRTIMER_MODE_ABS_PINNED, + 0 /* no wakeup */); + ce->timer_armed = 1; + } +} + +/* Uncomment this if you want to see all scheduling decisions in the + * TRACE() log. +#define WANT_ALL_SCHED_EVENTS + */ + +static int cpu_lower_prio(struct bheap_node *_a, struct bheap_node *_b) +{ + cpu_entry_t *a, *b; + a = _a->value; + b = _b->value; + /* Note that a and b are inverted: we want the lowest-priority CPU at + * the top of the heap. + */ + return edf_split_higher_prio(b->linked, a->linked); +} + +/* update_cpu_position - Move the cpu entry to the correct place to maintain + * order in the cpu queue. Caller must hold gsnedf lock. + */ +static void update_cpu_position(cpu_entry_t *entry) +{ + if (likely(bheap_node_in_heap(entry->hn))) + bheap_delete(cpu_lower_prio, &gsnedf_cpu_heap, entry->hn); + bheap_insert(cpu_lower_prio, &gsnedf_cpu_heap, entry->hn); +} + +/* caller must hold gsnedf lock */ +static cpu_entry_t* lowest_prio_cpu(void) +{ + struct bheap_node* hn; + hn = bheap_peek(cpu_lower_prio, &gsnedf_cpu_heap); + return hn->value; +} + + +/* link_task_to_cpu - Update the link of a CPU. + * Handles the case where the to-be-linked task is already + * scheduled on a different CPU. + */ +static noinline void link_task_to_cpu(struct task_struct* linked, + cpu_entry_t *entry) +{ + cpu_entry_t *sched; + struct task_struct* tmp; + int on_cpu; + + BUG_ON(linked && !is_realtime(linked)); + + /* Currently linked task is set to be unlinked. */ + if (entry->linked) { + entry->linked->rt_param.linked_on = NO_CPU; + } + + /* Link new task to CPU. */ + if (linked) { + set_rt_flags(linked, RT_F_RUNNING); + /* handle task is already scheduled somewhere! */ + on_cpu = linked->rt_param.scheduled_on; + if (on_cpu != NO_CPU) { + sched = &per_cpu(gsnedf_cpu_entries, on_cpu); + /* this should only happen if not linked already */ + BUG_ON(sched->linked == linked); + + /* If we are already scheduled on the CPU to which we + * wanted to link, we don't need to do the swap -- + * we just link ourselves to the CPU and depend on + * the caller to get things right. + */ + if (entry != sched) { + TRACE_TASK(linked, + "already scheduled on %d, updating link.\n", + sched->cpu); + tmp = sched->linked; + linked->rt_param.linked_on = sched->cpu; + sched->linked = linked; + update_cpu_position(sched); + linked = tmp; + } + } + if (linked) /* might be NULL due to swap */ + linked->rt_param.linked_on = entry->cpu; + } + entry->linked = linked; +#ifdef WANT_ALL_SCHED_EVENTS + if (linked) + TRACE_TASK(linked, "linked to %d.\n", entry->cpu); + else + TRACE("NULL linked to %d.\n", entry->cpu); +#endif + update_cpu_position(entry); +} + +/* unlink - Make sure a task is not linked any longer to an entry + * where it was linked before. 
Must hold gsnedf_lock. + */ +static noinline void unlink(struct task_struct* t) +{ + cpu_entry_t *entry; + + if (t->rt_param.linked_on != NO_CPU) { + /* unlink */ + entry = &per_cpu(gsnedf_cpu_entries, t->rt_param.linked_on); + t->rt_param.linked_on = NO_CPU; + link_task_to_cpu(NULL, entry); + } else if (is_queued(t)) { + /* This is an interesting situation: t is scheduled, + * but was just recently unlinked. It cannot be + * linked anywhere else (because then it would have + * been relinked to this CPU), thus it must be in some + * queue. We must remove it from the list in this + * case. + */ + remove(&gsnedf, t); + } +} + + +/* preempt - force a CPU to reschedule + */ +static void preempt(cpu_entry_t *entry) +{ + preempt_if_preemptable(entry->scheduled, entry->cpu); +} + +/* requeue - Put an unlinked task into gsn-edf domain. + * Caller must hold gsnedf_lock. + */ +static noinline void requeue(struct task_struct* task) +{ + BUG_ON(!task); + /* sanity check before insertion */ + BUG_ON(is_queued(task)); + + if (is_released(task, litmus_clock())) + __add_ready(&gsnedf, task); + else { + /* it has got to wait */ + add_release(&gsnedf, task); + } +} + +#ifdef CONFIG_SCHED_CPU_AFFINITY +static cpu_entry_t* gsnedf_get_nearest_available_cpu(cpu_entry_t *start) +{ + cpu_entry_t *affinity; + + get_nearest_available_cpu(affinity, start, gsnedf_cpu_entries, +#ifdef CONFIG_RELEASE_MASTER + gsnedf.release_master +#else + NO_CPU +#endif + ); + + return(affinity); +} +#endif + +/* check for any necessary preemptions */ +static void check_for_preemptions(void) +{ + struct task_struct *task; + cpu_entry_t *last; + + for (last = lowest_prio_cpu(); + edf_split_preemption_needed(&gsnedf, last->linked); + last = lowest_prio_cpu()) { + /* preemption necessary */ + task = __take_ready(&gsnedf); + TRACE("check_for_preemptions: attempting to link task %d to %d\n", + task->pid, last->cpu); + +#ifdef CONFIG_SCHED_CPU_AFFINITY + { + cpu_entry_t *affinity = + gsnedf_get_nearest_available_cpu( + &per_cpu(gsnedf_cpu_entries, + task_cpu(task))); + if (affinity) + last = affinity; + else if (last->linked) + requeue(last->linked); + } +#else + if (last->linked) + requeue(last->linked); +#endif + + link_task_to_cpu(task, last); + preempt(last); + } +} + +/* gsnedf_job_arrival: task is either resumed or released */ +static noinline void gsnedf_job_arrival(struct task_struct* task) +{ + BUG_ON(!task); + + requeue(task); + check_for_preemptions(); +} + +static void gsnedf_release_jobs(rt_domain_t* rt, struct bheap* tasks) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&gsnedf_lock, flags); + + __merge_ready(rt, tasks); + check_for_preemptions(); + + raw_spin_unlock_irqrestore(&gsnedf_lock, flags); +} + +/* caller holds gsnedf_lock */ +static noinline void job_completion(struct task_struct *t, int forced) +{ + BUG_ON(!t); + + sched_trace_task_completion(t, forced); + + TRACE_TASK(t, "job_completion().\n"); + + /* set flags */ + set_rt_flags(t, RT_F_SLEEP); + /* prepare for next period */ + /* prepare_for_next_period assumes implicit deadlines and no splitting, + * so we call it with the job deadline it expects. + */ + t->rt_param.job_params.deadline = t->rt_param.job_params.release + + t->rt_param.task_params.period; + prepare_for_next_period(t); + /* We now set the subjob deadline to what it should be for scheduling + * priority. 
+ */ + t->rt_param.job_params.subjob_deadline = get_proper_deadline(t); + if (is_released(t, litmus_clock())) + sched_trace_task_release(t); + /* unlink */ + unlink(t); + /* requeue + * But don't requeue a blocking task. */ + if (is_running(t)) + gsnedf_job_arrival(t); +} + +static void move_deadline(struct task_struct *t) +{ + tsk_rt(t)->job_params.subjob_deadline = get_proper_deadline(t); + /* Check if rescheduling needed with lower priority. */ + unlink(t); + gsnedf_job_arrival(t); +} + +/* gsnedf_tick - this function is called for every local timer + * interrupt. + * + * checks whether the current task has expired and checks + * whether we need to preempt it if it has not expired + */ +static void gsnedf_tick(struct task_struct* t) +{ + if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) { + if (!is_np(t)) { + /* np tasks will be preempted when they become + * preemptable again + */ + litmus_reschedule_local(); + TRACE("gsnedf_scheduler_tick: " + "%d is preemptable " + " => FORCE_RESCHED\n", t->pid); + } else if (is_user_np(t)) { + TRACE("gsnedf_scheduler_tick: " + "%d is non-preemptable, " + "preemption delayed.\n", t->pid); + request_exit_np(t); + } + } +} + +/* Getting schedule() right is a bit tricky. schedule() may not make any + * assumptions on the state of the current task since it may be called for a + * number of reasons. The reasons include a scheduler_tick() determined that it + * was necessary, because sys_exit_np() was called, because some Linux + * subsystem determined so, or even (in the worst case) because there is a bug + * hidden somewhere. Thus, we must take extreme care to determine what the + * current state is. + * + * The CPU could currently be scheduling a task (or not), be linked (or not). + * + * The following assertions for the scheduled task could hold: + * + * - !is_running(scheduled) // the job blocks + * - scheduled->timeslice == 0 // the job completed (forcefully) + * - get_rt_flag() == RT_F_SLEEP // the job completed (by syscall) + * - linked != scheduled // we need to reschedule (for any reason) + * - is_np(scheduled) // rescheduling must be delayed, + * sys_exit_np must be requested + * + * Any of these can occur together. + */ +static struct task_struct* gsnedf_schedule(struct task_struct * prev) +{ + cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries); + int out_of_time, sleep, preempt, np, exists, blocks, needs_move; + struct task_struct* next = NULL; + +#ifdef CONFIG_RELEASE_MASTER + /* Bail out early if we are the release master. + * The release master never schedules any real-time tasks. 
+ */ + if (unlikely(gsnedf.release_master == entry->cpu)) { + sched_state_task_picked(); + return NULL; + } +#endif + + raw_spin_lock(&gsnedf_lock); + + /* sanity checking */ + BUG_ON(entry->scheduled && entry->scheduled != prev); + BUG_ON(entry->scheduled && !is_realtime(prev)); + BUG_ON(is_realtime(prev) && !entry->scheduled); + + /* (0) Determine state */ + exists = entry->scheduled != NULL; + blocks = exists && !is_running(entry->scheduled); + out_of_time = exists && + budget_enforced(entry->scheduled) && + budget_exhausted(entry->scheduled); + needs_move = exists && needs_deadline_move(entry->scheduled); + np = exists && is_np(entry->scheduled); + sleep = exists && get_rt_flags(entry->scheduled) == RT_F_SLEEP; + preempt = entry->scheduled != entry->linked; + +#ifdef WANT_ALL_SCHED_EVENTS + TRACE_TASK(prev, "invoked gsnedf_schedule.\n"); +#endif + + if (exists) + TRACE_TASK(prev, + "blocks:%d out_of_time:%d needs_move:%d np:%d" + " sleep:%d preempt:%d state:%d sig:%d\n", + blocks, out_of_time, needs_move, np, sleep, preempt, + prev->state, signal_pending(prev)); + if (entry->linked && preempt) + TRACE_TASK(prev, "will be preempted by %s/%d\n", + entry->linked->comm, entry->linked->pid); + + + /* If a task blocks we have no choice but to reschedule. + */ + if (blocks) + unlink(entry->scheduled); + + /* Request a sys_exit_np() call if we would like to preempt but cannot. + * We need to make sure to update the link structure anyway in case + * that we are still linked. Multiple calls to request_exit_np() don't + * hurt. + * + * Job deadline moves handled similarly + */ + if (np && (out_of_time || preempt || sleep)) { + unlink(entry->scheduled); + request_exit_np(entry->scheduled); + } + else if (np && needs_move) { + move_deadline(entry->scheduled); + } + + /* Any task that is preemptable and either exhausts its execution + * budget or wants to sleep completes. We may have to reschedule after + * this. Don't do a job completion if we block (can't have timers running + * for blocked jobs). Preemption go first for the same reason. + */ + if (!np && (out_of_time || sleep) && !blocks && !preempt) + job_completion(entry->scheduled, !sleep); + else if (!np && needs_move && !blocks && !preempt) { + move_deadline(entry->scheduled); + } + + /* Link pending task if we became unlinked. + */ + if (!entry->linked) + link_task_to_cpu(__take_ready(&gsnedf), entry); + + /* The final scheduling decision. Do we need to switch for some reason? + * If linked is different from scheduled, then select linked as next. + */ + if ((!np || blocks) && + entry->linked != entry->scheduled) { + /* Schedule a linked job? */ + if (entry->linked) { + entry->linked->rt_param.scheduled_on = entry->cpu; + next = entry->linked; + TRACE_TASK(next, "scheduled_on = P%d\n", smp_processor_id()); + } + if (entry->scheduled) { + /* not gonna be scheduled soon */ + entry->scheduled->rt_param.scheduled_on = NO_CPU; + TRACE_TASK(entry->scheduled, "scheduled_on = NO_CPU\n"); + } + } else + /* Only override Linux scheduler if we have a real-time task + * scheduled that needs to continue. 
+ */ + if (exists) + next = prev; + + sched_state_task_picked(); + + raw_spin_unlock(&gsnedf_lock); + + if (next) { + arm_split_timer(entry, next); + } + else if (entry->timer_armed) { + cancel_split_timer(entry); + } + +#ifdef WANT_ALL_SCHED_EVENTS + TRACE("gsnedf_lock released, next=0x%p\n", next); + + if (next) + TRACE_TASK(next, "scheduled at %llu\n", litmus_clock()); + else if (exists && !next) + TRACE("becomes idle at %llu.\n", litmus_clock()); +#endif + + + return next; +} + + +/* _finish_switch - we just finished the switch away from prev + */ +static void gsnedf_finish_switch(struct task_struct *prev) +{ + cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries); + + entry->scheduled = is_realtime(current) ? current : NULL; +#ifdef WANT_ALL_SCHED_EVENTS + TRACE_TASK(prev, "switched away from\n"); +#endif +} + +static void gsnedf_release_at(struct task_struct *t, lt_t start) +{ + t->rt_param.job_params.deadline = start; + prepare_for_next_period(t); + t->rt_param.job_params.subjob_deadline = get_proper_deadline(t); + set_rt_flags(t, RT_F_RUNNING); +} + +/* Prepare a task for running in RT mode + */ +static void gsnedf_task_new(struct task_struct * t, int on_rq, int running) +{ + unsigned long flags; + cpu_entry_t* entry; + + TRACE("gsn edf: task new %d\n", t->pid); + + raw_spin_lock_irqsave(&gsnedf_lock, flags); + + /* setup job params */ + gsnedf_release_at(t, litmus_clock()); + + if (running) { + entry = &per_cpu(gsnedf_cpu_entries, task_cpu(t)); + BUG_ON(entry->scheduled); + +#ifdef CONFIG_RELEASE_MASTER + if (entry->cpu != gsnedf.release_master) { +#endif + entry->scheduled = t; + tsk_rt(t)->scheduled_on = task_cpu(t); +#ifdef CONFIG_RELEASE_MASTER + } else { + /* do not schedule on release master */ + preempt(entry); /* force resched */ + tsk_rt(t)->scheduled_on = NO_CPU; + } +#endif + } else { + t->rt_param.scheduled_on = NO_CPU; + } + t->rt_param.linked_on = NO_CPU; + + gsnedf_job_arrival(t); + raw_spin_unlock_irqrestore(&gsnedf_lock, flags); +} + +static void gsnedf_task_wake_up(struct task_struct *task) +{ + unsigned long flags; + lt_t now; + + TRACE_TASK(task, "wake_up at %llu\n", litmus_clock()); + + raw_spin_lock_irqsave(&gsnedf_lock, flags); + /* We need to take suspensions because of semaphores into + * account! If a job resumes after being suspended due to acquiring + * a semaphore, it should never be treated as a new job release. 
+ */ + if (get_rt_flags(task) == RT_F_EXIT_SEM) { + set_rt_flags(task, RT_F_RUNNING); + } else { + now = litmus_clock(); + if (is_tardy(task, now)) { + /* new sporadic release */ + gsnedf_release_at(task, now); + sched_trace_task_release(task); + } + else { + if (task->rt.time_slice) { + /* came back in time before deadline + */ + set_rt_flags(task, RT_F_RUNNING); + } + } + } + gsnedf_job_arrival(task); + raw_spin_unlock_irqrestore(&gsnedf_lock, flags); +} + +static void gsnedf_task_block(struct task_struct *t) +{ + unsigned long flags; + + TRACE_TASK(t, "block at %llu\n", litmus_clock()); + + /* unlink if necessary */ + raw_spin_lock_irqsave(&gsnedf_lock, flags); + unlink(t); + raw_spin_unlock_irqrestore(&gsnedf_lock, flags); + + BUG_ON(!is_realtime(t)); +} + + +static void gsnedf_task_exit(struct task_struct * t) +{ + unsigned long flags; + + /* unlink if necessary */ + raw_spin_lock_irqsave(&gsnedf_lock, flags); + unlink(t); + if (tsk_rt(t)->scheduled_on != NO_CPU) { + gsnedf_cpus[tsk_rt(t)->scheduled_on]->scheduled = NULL; + tsk_rt(t)->scheduled_on = NO_CPU; + } + raw_spin_unlock_irqrestore(&gsnedf_lock, flags); + + BUG_ON(!is_realtime(t)); + TRACE_TASK(t, "RIP\n"); +} + + +static long gsnedf_admit_task(struct task_struct* tsk) +{ + return 0; +} + +#ifdef CONFIG_LITMUS_LOCKING + +#include + +/* called with IRQs off */ +static void set_priority_inheritance(struct task_struct* t, struct task_struct* prio_inh) +{ + int linked_on; + int check_preempt = 0; + + raw_spin_lock(&gsnedf_lock); + + TRACE_TASK(t, "inherits priority from %s/%d\n", prio_inh->comm, prio_inh->pid); + tsk_rt(t)->inh_task = prio_inh; + + linked_on = tsk_rt(t)->linked_on; + + /* If it is scheduled, then we need to reorder the CPU heap. */ + if (linked_on != NO_CPU) { + TRACE_TASK(t, "%s: linked on %d\n", + __FUNCTION__, linked_on); + /* Holder is scheduled; need to re-order CPUs. + * We can't use heap_decrease() here since + * the cpu_heap is ordered in reverse direction, so + * it is actually an increase. */ + bheap_delete(cpu_lower_prio, &gsnedf_cpu_heap, + gsnedf_cpus[linked_on]->hn); + bheap_insert(cpu_lower_prio, &gsnedf_cpu_heap, + gsnedf_cpus[linked_on]->hn); + } else { + /* holder may be queued: first stop queue changes */ + raw_spin_lock(&gsnedf.release_lock); + if (is_queued(t)) { + TRACE_TASK(t, "%s: is queued\n", + __FUNCTION__); + /* We need to update the position of holder in some + * heap. Note that this could be a release heap if we + * budget enforcement is used and this job overran. */ + check_preempt = + !bheap_decrease(edf_split_ready_order, + tsk_rt(t)->heap_node); + } else { + /* Nothing to do: if it is not queued and not linked + * then it is either sleeping or currently being moved + * by other code (e.g., a timer interrupt handler) that + * will use the correct priority when enqueuing the + * task. */ + TRACE_TASK(t, "%s: is NOT queued => Done.\n", + __FUNCTION__); + } + raw_spin_unlock(&gsnedf.release_lock); + + /* If holder was enqueued in a release heap, then the following + * preemption check is pointless, but we can't easily detect + * that case. If you want to fix this, then consider that + * simply adding a state flag requires O(n) time to update when + * releasing n tasks, which conflicts with the goal to have + * O(log n) merges. */ + if (check_preempt) { + /* heap_decrease() hit the top level of the heap: make + * sure preemption checks get the right task, not the + * potentially stale cache. 
*/ + bheap_uncache_min(edf_split_ready_order, + &gsnedf.ready_queue); + check_for_preemptions(); + } + } + + raw_spin_unlock(&gsnedf_lock); +} + +/* called with IRQs off */ +static void update_unlocked_priority(struct task_struct* t) +{ + raw_spin_lock(&gsnedf_lock); + + /* A job only stops inheriting a priority when it releases a + * resource. Thus we can make the following assumption.*/ + BUG_ON(tsk_rt(t)->scheduled_on == NO_CPU); + + /* Clear priority inheritance */ + TRACE_TASK(t, "priority restored\n"); + tsk_rt(t)->inh_task = NULL; + + /* Update splitting deadline */ + tsk_rt(t)->job_params.subjob_deadline = get_proper_deadline(t); + + /* Check if rescheduling is necessary. We can't use heap_decrease() + * since the priority was effectively lowered. */ + unlink(t); + gsnedf_job_arrival(t); + + raw_spin_unlock(&gsnedf_lock); +} + + +/* ******************** FMLP support ********************** */ + +/* struct for semaphore with priority inheritance */ +struct fmlp_semaphore { + struct litmus_lock litmus_lock; + + /* current resource holder */ + struct task_struct *owner; + + /* highest-priority waiter */ + struct task_struct *hp_waiter; + + /* FIFO queue of waiting tasks */ + wait_queue_head_t wait; +}; + +static inline struct fmlp_semaphore* fmlp_from_lock(struct litmus_lock* lock) +{ + return container_of(lock, struct fmlp_semaphore, litmus_lock); +} + +/* caller is responsible for locking */ +static struct task_struct* find_hp_waiter(struct fmlp_semaphore *sem, + struct task_struct* skip) +{ + struct list_head *pos; + struct task_struct *queued, *found = NULL; + + list_for_each(pos, &sem->wait.task_list) { + queued = (struct task_struct*) list_entry(pos, wait_queue_t, + task_list)->private; + + /* Compare task prios, find high prio task. */ + if (queued != skip && edf_split_higher_prio(queued, found)) + found = queued; + } + return found; +} + +int gsnedf_fmlp_lock(struct litmus_lock* l) +{ + struct task_struct* t = current; + struct fmlp_semaphore *sem = fmlp_from_lock(l); + cpu_entry_t* entry; + wait_queue_t wait; + unsigned long flags; + + if (!is_realtime(t)) + return -EPERM; + + spin_lock_irqsave(&sem->wait.lock, flags); + entry = &__get_cpu_var(gsnedf_cpu_entries); + + tsk_rt(t)->in_crit_section = 1; + if (entry->timer_armed) { + cancel_split_timer(entry); + } + + if (sem->owner) { + /* resource is not free => must suspend and wait */ + + init_waitqueue_entry(&wait, t); + + /* FIXME: interruptible would be nice some day */ + set_task_state(t, TASK_UNINTERRUPTIBLE); + + __add_wait_queue_tail_exclusive(&sem->wait, &wait); + + /* check if we need to activate priority inheritance */ + if (edf_split_higher_prio(t, sem->hp_waiter)) { + sem->hp_waiter = t; + if (edf_split_higher_prio(t, sem->owner)) + set_priority_inheritance(sem->owner, sem->hp_waiter); + } + + TS_LOCK_SUSPEND; + + /* release lock before sleeping */ + spin_unlock_irqrestore(&sem->wait.lock, flags); + + /* We depend on the FIFO order. Thus, we don't need to recheck + * when we wake up; we are guaranteed to have the lock since + * there is only one wake up per release. + */ + + schedule(); + + TS_LOCK_RESUME; + + /* Since we hold the lock, no other task will change + * ->owner. We can thus check it without acquiring the spin + * lock. 
*/ + BUG_ON(sem->owner != t); + } else { + /* it's ours now */ + sem->owner = t; + + spin_unlock_irqrestore(&sem->wait.lock, flags); + } + + return 0; +} + +int gsnedf_fmlp_unlock(struct litmus_lock* l) +{ + struct task_struct *t = current, *next; + struct fmlp_semaphore *sem = fmlp_from_lock(l); + unsigned long flags; + int err = 0; + + spin_lock_irqsave(&sem->wait.lock, flags); + + if (sem->owner != t) { + err = -EINVAL; + goto out; + } + + /* check if there are jobs waiting for this resource */ + next = __waitqueue_remove_first(&sem->wait); + if (next) { + /* next becomes the resouce holder */ + sem->owner = next; + TRACE_CUR("lock ownership passed to %s/%d\n", next->comm, next->pid); + + /* determine new hp_waiter if necessary */ + if (next == sem->hp_waiter) { + TRACE_TASK(next, "was highest-prio waiter\n"); + /* next has the highest priority --- it doesn't need to + * inherit. However, we need to make sure that the + * next-highest priority in the queue is reflected in + * hp_waiter. */ + sem->hp_waiter = find_hp_waiter(sem, next); + if (sem->hp_waiter) + TRACE_TASK(sem->hp_waiter, "is new highest-prio waiter\n"); + else + TRACE("no further waiters\n"); + } else { + /* Well, if next is not the highest-priority waiter, + * then it ought to inherit the highest-priority + * waiter's priority. */ + set_priority_inheritance(next, sem->hp_waiter); + } + + /* wake up next */ + wake_up_process(next); + } else + /* becomes available */ + sem->owner = NULL; + + /* We are no longer in the critical section */ + tsk_rt(t)->in_crit_section = 0; + + /* we lose the benefit of priority inheritance (if any) and may need + * to move the deadline. In either case, may need to reschedule + * due to reduced priority. */ + if (tsk_rt(t)->inh_task || needs_deadline_move(t)) + update_unlocked_priority(t); + /* TODO: Check that schedule() gets called - it needs to arm the + * enforcement timer. Otherwise we should do it here or in + * update_unlocked_priority. */ + +out: + spin_unlock_irqrestore(&sem->wait.lock, flags); + + return err; +} + +int gsnedf_fmlp_close(struct litmus_lock* l) +{ + struct task_struct *t = current; + struct fmlp_semaphore *sem = fmlp_from_lock(l); + unsigned long flags; + + int owner; + + spin_lock_irqsave(&sem->wait.lock, flags); + + owner = sem->owner == t; + + spin_unlock_irqrestore(&sem->wait.lock, flags); + + if (owner) + gsnedf_fmlp_unlock(l); + + return 0; +} + +void gsnedf_fmlp_free(struct litmus_lock* lock) +{ + kfree(fmlp_from_lock(lock)); +} + +static struct litmus_lock_ops gsnedf_fmlp_lock_ops = { + .close = gsnedf_fmlp_close, + .lock = gsnedf_fmlp_lock, + .unlock = gsnedf_fmlp_unlock, + .deallocate = gsnedf_fmlp_free, +}; + +static struct litmus_lock* gsnedf_new_fmlp(void) +{ + struct fmlp_semaphore* sem; + + sem = kmalloc(sizeof(*sem), GFP_KERNEL); + if (!sem) + return NULL; + + sem->owner = NULL; + sem->hp_waiter = NULL; + init_waitqueue_head(&sem->wait); + sem->litmus_lock.ops = &gsnedf_fmlp_lock_ops; + + return &sem->litmus_lock; +} + +/* **** lock constructor **** */ + + +static long gsnedf_allocate_lock(struct litmus_lock **lock, int type, + void* __user unused) +{ + int err = -ENXIO; + + /* GSN-EDF currently only supports the FMLP for global resources. 
*/ + switch (type) { + + case FMLP_SEM: + /* Flexible Multiprocessor Locking Protocol */ + *lock = gsnedf_new_fmlp(); + if (*lock) + err = 0; + else + err = -ENOMEM; + break; + + }; + + return err; +} + +#endif + + +static long gsnedf_activate_plugin(void) +{ + int cpu; + cpu_entry_t *entry; + + bheap_init(&gsnedf_cpu_heap); +#ifdef CONFIG_RELEASE_MASTER + gsnedf.release_master = atomic_read(&release_master_cpu); +#endif + + for_each_online_cpu(cpu) { + entry = &per_cpu(gsnedf_cpu_entries, cpu); + bheap_node_init(&entry->hn, entry); + entry->linked = NULL; + entry->scheduled = NULL; +#ifdef CONFIG_RELEASE_MASTER + if (cpu != gsnedf.release_master) { +#endif + TRACE("GSN-EDF: Initializing CPU #%d.\n", cpu); + update_cpu_position(entry); +#ifdef CONFIG_RELEASE_MASTER + } else { + TRACE("GSN-EDF: CPU %d is release master.\n", cpu); + } +#endif + } + return 0; +} + +/* Plugin object */ +static struct sched_plugin gfl_plugin __cacheline_aligned_in_smp = { + .plugin_name = "GSN-EDF", + .finish_switch = gsnedf_finish_switch, + .tick = gsnedf_tick, + .task_new = gsnedf_task_new, + .complete_job = complete_job, + .task_exit = gsnedf_task_exit, + .schedule = gsnedf_schedule, + .release_at = gsnedf_release_at, + .task_wake_up = gsnedf_task_wake_up, + .task_block = gsnedf_task_block, + .admit_task = gsnedf_admit_task, + .activate_plugin = gsnedf_activate_plugin, +#ifdef CONFIG_LITMUS_LOCKING + .allocate_lock = gsnedf_allocate_lock, +#endif +}; + + +static int __init init_gfl(void) +{ + int cpu; + cpu_entry_t *entry; + + bheap_init(&gsnedf_cpu_heap); + /* initialize CPU state */ + for (cpu = 0; cpu < NR_CPUS; cpu++) { + entry = &per_cpu(gsnedf_cpu_entries, cpu); + gsnedf_cpus[cpu] = entry; + entry->cpu = cpu; + entry->hn = &gsnedf_heap_node[cpu]; + hrtimer_init(&entry->split_timer, + CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + entry->split_timer.function = on_split_timeout; + bheap_node_init(&entry->hn, entry); + } + edf_split_domain_init(&gsnedf, NULL, gsnedf_release_jobs); + return register_sched_plugin(&gfl_plugin); +} + + +module_init(init_gfl); diff --git a/litmus/sched_gsn_edf.c b/litmus/sched_gsn_edf.c new file mode 100644 index 000000000000..9debea981419 --- /dev/null +++ b/litmus/sched_gsn_edf.c @@ -0,0 +1,1286 @@ +/* + * litmus/sched_gsn_edf.c + * + * Implementation of the GSN-EDF scheduling algorithm. + * + * This version uses the simple approach and serializes all scheduling + * decisions by the use of a queue lock. This is probably not the + * best way to do it, but it should suffice for now. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +/* Overview of GSN-EDF operations. + * + * For a detailed explanation of GSN-EDF have a look at the FMLP paper. This + * description only covers how the individual operations are implemented in + * LITMUS. + * + * link_task_to_cpu(T, cpu) - Low-level operation to update the linkage + * structure (NOT the actually scheduled + * task). If there is another linked task To + * already it will set To->linked_on = NO_CPU + * (thereby removing its association with this + * CPU). However, it will not requeue the + * previously linked task (if any). It will set + * T's state to RT_F_RUNNING and check whether + * it is already running somewhere else. If T + * is scheduled somewhere else it will link + * it to that CPU instead (and pull the linked + * task to cpu). T may be NULL. + * + * unlink(T) - Unlink removes T from all scheduler data + * structures. 
If it is linked to some CPU it + * will link NULL to that CPU. If it is + * currently queued in the gsnedf queue it will + * be removed from the rt_domain. It is safe to + * call unlink(T) if T is not linked. T may not + * be NULL. + * + * requeue(T) - Requeue will insert T into the appropriate + * queue. If the system is in real-time mode and + * the T is released already, it will go into the + * ready queue. If the system is not in + * real-time mode is T, then T will go into the + * release queue. If T's release time is in the + * future, it will go into the release + * queue. That means that T's release time/job + * no/etc. has to be updated before requeu(T) is + * called. It is not safe to call requeue(T) + * when T is already queued. T may not be NULL. + * + * gsnedf_job_arrival(T) - This is the catch all function when T enters + * the system after either a suspension or at a + * job release. It will queue T (which means it + * is not safe to call gsnedf_job_arrival(T) if + * T is already queued) and then check whether a + * preemption is necessary. If a preemption is + * necessary it will update the linkage + * accordingly and cause scheduled to be called + * (either with an IPI or need_resched). It is + * safe to call gsnedf_job_arrival(T) if T's + * next job has not been actually released yet + * (releast time in the future). T will be put + * on the release queue in that case. + * + * job_completion(T) - Take care of everything that needs to be done + * to prepare T for its next release and place + * it in the right queue with + * gsnedf_job_arrival(). + * + * + * When we now that T is linked to CPU then link_task_to_cpu(NULL, CPU) is + * equivalent to unlink(T). Note that if you unlink a task from a CPU none of + * the functions will automatically propagate pending task from the ready queue + * to a linked task. This is the job of the calling function ( by means of + * __take_ready). + */ + + +/* cpu_entry_t - maintain the linked and scheduled state + */ +typedef struct { + int cpu; + struct task_struct* linked; /* only RT tasks */ + struct task_struct* scheduled; /* only RT tasks */ + struct bheap_node* hn; +} cpu_entry_t; +DEFINE_PER_CPU(cpu_entry_t, gsnedf_cpu_entries); + +cpu_entry_t* gsnedf_cpus[NR_CPUS]; + +/* the cpus queue themselves according to priority in here */ +static struct bheap_node gsnedf_heap_node[NR_CPUS]; +static struct bheap gsnedf_cpu_heap; + +static rt_domain_t gsnedf; +#define gsnedf_lock (gsnedf.ready_lock) + + +/* Uncomment this if you want to see all scheduling decisions in the + * TRACE() log. +#define WANT_ALL_SCHED_EVENTS + */ + +static int cpu_lower_prio(struct bheap_node *_a, struct bheap_node *_b) +{ + cpu_entry_t *a, *b; + a = _a->value; + b = _b->value; + /* Note that a and b are inverted: we want the lowest-priority CPU at + * the top of the heap. + */ + return edf_higher_prio(b->linked, a->linked); +} + +/* update_cpu_position - Move the cpu entry to the correct place to maintain + * order in the cpu queue. Caller must hold gsnedf lock. + */ +static void update_cpu_position(cpu_entry_t *entry) +{ + if (likely(bheap_node_in_heap(entry->hn))) + bheap_delete(cpu_lower_prio, &gsnedf_cpu_heap, entry->hn); + bheap_insert(cpu_lower_prio, &gsnedf_cpu_heap, entry->hn); +} + +/* caller must hold gsnedf lock */ +static cpu_entry_t* lowest_prio_cpu(void) +{ + struct bheap_node* hn; + hn = bheap_peek(cpu_lower_prio, &gsnedf_cpu_heap); + return hn->value; +} + + +/* link_task_to_cpu - Update the link of a CPU. 
+ * Handles the case where the to-be-linked task is already + * scheduled on a different CPU. + */ +static noinline void link_task_to_cpu(struct task_struct* linked, + cpu_entry_t *entry) +{ + cpu_entry_t *sched; + struct task_struct* tmp; + int on_cpu; + + BUG_ON(linked && !is_realtime(linked)); + + /* Currently linked task is set to be unlinked. */ + if (entry->linked) { + entry->linked->rt_param.linked_on = NO_CPU; + } + + /* Link new task to CPU. */ + if (linked) { + set_rt_flags(linked, RT_F_RUNNING); + /* handle task is already scheduled somewhere! */ + on_cpu = linked->rt_param.scheduled_on; + if (on_cpu != NO_CPU) { + sched = &per_cpu(gsnedf_cpu_entries, on_cpu); + /* this should only happen if not linked already */ + BUG_ON(sched->linked == linked); + + /* If we are already scheduled on the CPU to which we + * wanted to link, we don't need to do the swap -- + * we just link ourselves to the CPU and depend on + * the caller to get things right. + */ + if (entry != sched) { + TRACE_TASK(linked, + "already scheduled on %d, updating link.\n", + sched->cpu); + tmp = sched->linked; + linked->rt_param.linked_on = sched->cpu; + sched->linked = linked; + update_cpu_position(sched); + linked = tmp; + } + } + if (linked) /* might be NULL due to swap */ + linked->rt_param.linked_on = entry->cpu; + } + entry->linked = linked; +#ifdef WANT_ALL_SCHED_EVENTS + if (linked) + TRACE_TASK(linked, "linked to %d.\n", entry->cpu); + else + TRACE("NULL linked to %d.\n", entry->cpu); +#endif + update_cpu_position(entry); +} + +/* unlink - Make sure a task is not linked any longer to an entry + * where it was linked before. Must hold gsnedf_lock. + */ +static noinline void unlink(struct task_struct* t) +{ + cpu_entry_t *entry; + + if (t->rt_param.linked_on != NO_CPU) { + /* unlink */ + entry = &per_cpu(gsnedf_cpu_entries, t->rt_param.linked_on); + t->rt_param.linked_on = NO_CPU; + link_task_to_cpu(NULL, entry); + } else if (is_queued(t)) { + /* This is an interesting situation: t is scheduled, + * but was just recently unlinked. It cannot be + * linked anywhere else (because then it would have + * been relinked to this CPU), thus it must be in some + * queue. We must remove it from the list in this + * case. + */ + remove(&gsnedf, t); + } +} + + +/* preempt - force a CPU to reschedule + */ +static void preempt(cpu_entry_t *entry) +{ + preempt_if_preemptable(entry->scheduled, entry->cpu); +} + +/* requeue - Put an unlinked task into gsn-edf domain. + * Caller must hold gsnedf_lock. 
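/* Illustration (not part of the patch): a stand-alone replay of the swap in
 * link_task_to_cpu().  If the task we want to link to some CPU is still
 * scheduled on a different CPU, it is linked there instead and that CPU's
 * previously linked task is taken over.  The structs below are simplified
 * user-space stand-ins; RT_F_RUNNING and the heap update are omitted.
 */
#include <stdio.h>

#define NO_CPU (-1)

struct task {
	const char *name;
	int linked_on;		/* CPU this task is linked to, or NO_CPU */
	int scheduled_on;	/* CPU this task is running on, or NO_CPU */
};

struct cpu {
	int id;
	struct task *linked;
};

static void link_to(struct task *t, struct cpu *entry, struct cpu cpus[])
{
	if (entry->linked)
		entry->linked->linked_on = NO_CPU;

	if (t && t->scheduled_on != NO_CPU && t->scheduled_on != entry->id) {
		/* t is still running elsewhere: link it there and take
		 * over that CPU's old linked task (may be NULL). */
		struct cpu *sched = &cpus[t->scheduled_on];
		struct task *tmp = sched->linked;

		t->linked_on = sched->id;
		sched->linked = t;
		t = tmp;
	}
	if (t)
		t->linked_on = entry->id;
	entry->linked = t;
}

int main(void)
{
	struct cpu cpus[2] = { { 0, NULL }, { 1, NULL } };
	/* A was preempted on CPU 1: still scheduled there, no longer linked. */
	struct task a = { "A", NO_CPU, 1 };
	/* B preempted A and is currently linked to CPU 1. */
	struct task b = { "B", 1, NO_CPU };

	cpus[1].linked = &b;

	/* check_for_preemptions() now wants to link A to CPU 0.  The swap
	 * keeps A on the CPU it is still running on and moves B to CPU 0. */
	link_to(&a, &cpus[0], cpus);
	printf("CPU0 linked: %s\n", cpus[0].linked->name);
	printf("CPU1 linked: %s (A linked_on=%d, B linked_on=%d)\n",
	       cpus[1].linked->name, a.linked_on, b.linked_on);
	return 0;
}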
+ */ +static noinline void requeue(struct task_struct* task) +{ + BUG_ON(!task); + /* sanity check before insertion */ + BUG_ON(is_queued(task)); + + if (is_released(task, litmus_clock())) + __add_ready(&gsnedf, task); + else { + /* it has got to wait */ + add_release(&gsnedf, task); + } +} + +/* check for any necessary preemptions */ +static void check_for_preemptions(void) +{ + struct task_struct *task; + cpu_entry_t* last; + + for(last = lowest_prio_cpu(); + edf_preemption_needed(&gsnedf, last->linked); + last = lowest_prio_cpu()) { + /* preemption necessary */ + task = __take_ready(&gsnedf); + TRACE_TASK(task, "attempting to link to P%d\n", + last->cpu); + if (last->linked) + requeue(last->linked); + link_task_to_cpu(task, last); + preempt(last); + } +} + +/* gsnedf_job_arrival: task is either resumed or released */ +static noinline void gsnedf_job_arrival(struct task_struct* task) +{ + BUG_ON(!task); + + requeue(task); + check_for_preemptions(); +} + +static void gsnedf_release_jobs(rt_domain_t* rt, struct bheap* tasks) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&gsnedf_lock, flags); + + __merge_ready(rt, tasks); + check_for_preemptions(); + + raw_spin_unlock_irqrestore(&gsnedf_lock, flags); +} + +/* caller holds gsnedf_lock */ +static noinline void job_completion(struct task_struct *t, int forced) +{ + BUG_ON(!t); + + sched_trace_task_completion(t, forced); + + TRACE_TASK(t, "job_completion().\n"); + + /* set flags */ + set_rt_flags(t, RT_F_SLEEP); + /* prepare for next period */ + prepare_for_next_period(t); + if (is_released(t, litmus_clock())) + sched_trace_task_release(t); + /* unlink */ + unlink(t); + /* requeue + * But don't requeue a blocking task. */ + if (is_running(t)) + gsnedf_job_arrival(t); +} + +/* gsnedf_tick - this function is called for every local timer + * interrupt. + * + * checks whether the current task has expired and checks + * whether we need to preempt it if it has not expired + */ +static void gsnedf_tick(struct task_struct* t) +{ + if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) { + if (!is_np(t)) { + /* np tasks will be preempted when they become + * preemptable again + */ + litmus_reschedule_local(); + TRACE("gsnedf_scheduler_tick: " + "%d is preemptable " + " => FORCE_RESCHED\n", t->pid); + } else if (is_user_np(t)) { + TRACE("gsnedf_scheduler_tick: " + "%d is non-preemptable, " + "preemption delayed.\n", t->pid); + request_exit_np(t); + } + } +} + +/* Getting schedule() right is a bit tricky. schedule() may not make any + * assumptions on the state of the current task since it may be called for a + * number of reasons. The reasons include a scheduler_tick() determined that it + * was necessary, because sys_exit_np() was called, because some Linux + * subsystem determined so, or even (in the worst case) because there is a bug + * hidden somewhere. Thus, we must take extreme care to determine what the + * current state is. + * + * The CPU could currently be scheduling a task (or not), be linked (or not). + * + * The following assertions for the scheduled task could hold: + * + * - !is_running(scheduled) // the job blocks + * - scheduled->timeslice == 0 // the job completed (forcefully) + * - get_rt_flag() == RT_F_SLEEP // the job completed (by syscall) + * - linked != scheduled // we need to reschedule (for any reason) + * - is_np(scheduled) // rescheduling must be delayed, + * sys_exit_np must be requested + * + * Any of these can occur together. 
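/* Illustration (not part of the patch): the check_for_preemptions() loop in
 * stand-alone form, with plain arrays standing in for the bheap-based CPU
 * heap and ready queue.  Smaller deadline = higher priority (EDF).  No
 * locking, and the preempted job is not requeued here; purely illustrative.
 */
#include <stdio.h>
#include <limits.h>

#define NCPUS 2

/* Linked job per CPU, expressed as its absolute deadline; 0 = idle.
 * An idle CPU counts as lowest priority, i.e. it is always preemptable. */
static unsigned long long linked[NCPUS];

/* Ready queue as a sorted array of deadlines (head = earliest). */
static unsigned long long ready[] = { 10, 25, 40 };
static int ready_head;

static int lowest_prio_cpu(void)
{
	int worst = 0;

	for (int c = 1; c < NCPUS; c++) {
		unsigned long long a = linked[c] ? linked[c] : ULLONG_MAX;
		unsigned long long w = linked[worst] ? linked[worst] : ULLONG_MAX;

		if (a > w)	/* later deadline (or idle) = lower priority */
			worst = c;
	}
	return worst;
}

static int preemption_needed(int cpu)
{
	if (ready_head >= (int)(sizeof(ready) / sizeof(ready[0])))
		return 0;			/* nothing ready */
	if (!linked[cpu])
		return 1;			/* idle CPU: always take work */
	return ready[ready_head] < linked[cpu];	/* earlier deadline wins */
}

/* Mirrors check_for_preemptions(): keep handing the earliest-deadline ready
 * job to the currently lowest-priority CPU until no preemption is needed. */
static void check_preemptions(void)
{
	for (int cpu = lowest_prio_cpu(); preemption_needed(cpu);
	     cpu = lowest_prio_cpu()) {
		printf("link job d=%llu to CPU %d (was d=%llu)\n",
		       ready[ready_head], cpu, linked[cpu]);
		linked[cpu] = ready[ready_head++];
		/* the real code would requeue the previously linked job */
	}
}

int main(void)
{
	linked[0] = 30;		/* CPU 0 runs a job with deadline 30 */
	linked[1] = 0;		/* CPU 1 is idle */
	check_preemptions();
	return 0;
}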
+ */ +static struct task_struct* gsnedf_schedule(struct task_struct * prev) +{ + cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries); + int out_of_time, sleep, preempt, np, exists, blocks; + struct task_struct* next = NULL; + +#ifdef CONFIG_RELEASE_MASTER + /* Bail out early if we are the release master. + * The release master never schedules any real-time tasks. + */ + if (gsnedf.release_master == entry->cpu) { + sched_state_task_picked(); + return NULL; + } +#endif + + raw_spin_lock(&gsnedf_lock); + + /* sanity checking */ + BUG_ON(entry->scheduled && entry->scheduled != prev); + BUG_ON(entry->scheduled && !is_realtime(prev)); + BUG_ON(is_realtime(prev) && !entry->scheduled); + + /* (0) Determine state */ + exists = entry->scheduled != NULL; + blocks = exists && !is_running(entry->scheduled); + out_of_time = exists && + budget_enforced(entry->scheduled) && + budget_exhausted(entry->scheduled); + np = exists && is_np(entry->scheduled); + sleep = exists && get_rt_flags(entry->scheduled) == RT_F_SLEEP; + preempt = entry->scheduled != entry->linked; + +#ifdef WANT_ALL_SCHED_EVENTS + TRACE_TASK(prev, "invoked gsnedf_schedule.\n"); +#endif + + if (exists) + TRACE_TASK(prev, + "blocks:%d out_of_time:%d np:%d sleep:%d preempt:%d " + "state:%d sig:%d\n", + blocks, out_of_time, np, sleep, preempt, + prev->state, signal_pending(prev)); + if (entry->linked && preempt && !np) + TRACE_TASK(prev, "will be preempted by %s/%d\n", + entry->linked->comm, entry->linked->pid); + + + /* If a task blocks we have no choice but to reschedule. + */ + if (blocks) + unlink(entry->scheduled); + + /* Request a sys_exit_np() call if we would like to preempt but cannot. + * Do not unlink since entry->scheduled is currently in the ready queue. + * We don't process out_of_time and sleep until the job is preemptive again. + */ + if (np && (out_of_time || preempt || sleep)) { + request_exit_np(entry->scheduled); + } + + /* Any task that is preemptable and either exhausts its execution + * budget or wants to sleep completes. We may have to reschedule after + * this. Don't do a job completion if we block (can't have timers running + * for blocked jobs). Preemption go first for the same reason. + */ + if (!np && (out_of_time || sleep) && !blocks && !preempt) + job_completion(entry->scheduled, !sleep); + + /* Link pending task if we became unlinked. + */ + if (!entry->linked) + link_task_to_cpu(__take_ready(&gsnedf), entry); + + /* The final scheduling decision. Do we need to switch for some reason? + * If linked is different from scheduled, then select linked as next. + */ + if ((!np || blocks) && + entry->linked != entry->scheduled) { + /* Schedule a linked job? */ + if (entry->linked) { + entry->linked->rt_param.scheduled_on = entry->cpu; + next = entry->linked; + TRACE_TASK(next, "scheduled_on = P%d\n", smp_processor_id()); + } + if (entry->scheduled) { + /* not gonna be scheduled soon */ + entry->scheduled->rt_param.scheduled_on = NO_CPU; + TRACE_TASK(entry->scheduled, "scheduled_on = NO_CPU\n"); + } + } else + /* Only override Linux scheduler if we have a real-time task + * scheduled that needs to continue. 
+ */ + if (exists) + next = prev; + + sched_state_task_picked(); + + raw_spin_unlock(&gsnedf_lock); + +#ifdef WANT_ALL_SCHED_EVENTS + TRACE("gsnedf_lock released, next=0x%p\n", next); + + if (next) + TRACE_TASK(next, "scheduled at %llu\n", litmus_clock()); + else if (exists && !next) + TRACE("becomes idle at %llu.\n", litmus_clock()); +#endif + + + return next; +} + + +/* _finish_switch - we just finished the switch away from prev + */ +static void gsnedf_finish_switch(struct task_struct *prev) +{ + cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries); + + entry->scheduled = is_realtime(current) ? current : NULL; +#ifdef WANT_ALL_SCHED_EVENTS + TRACE_TASK(prev, "switched away from\n"); +#endif +} + + +/* Prepare a task for running in RT mode + */ +static void gsnedf_task_new(struct task_struct * t, int on_rq, int running) +{ + unsigned long flags; + cpu_entry_t* entry; + + TRACE("gsn edf: task new %d\n", t->pid); + + raw_spin_lock_irqsave(&gsnedf_lock, flags); + + /* setup job params */ + release_at(t, litmus_clock()); + + if (running) { + entry = &per_cpu(gsnedf_cpu_entries, task_cpu(t)); + BUG_ON(entry->scheduled); + +#ifdef CONFIG_RELEASE_MASTER + if (entry->cpu != gsnedf.release_master) { +#endif + entry->scheduled = t; + tsk_rt(t)->scheduled_on = task_cpu(t); +#ifdef CONFIG_RELEASE_MASTER + } else { + /* do not schedule on release master */ + preempt(entry); /* force resched */ + tsk_rt(t)->scheduled_on = NO_CPU; + } +#endif + } else { + t->rt_param.scheduled_on = NO_CPU; + } + t->rt_param.linked_on = NO_CPU; + + gsnedf_job_arrival(t); + raw_spin_unlock_irqrestore(&gsnedf_lock, flags); +} + +static void gsnedf_task_wake_up(struct task_struct *task) +{ + unsigned long flags; + lt_t now; + + TRACE_TASK(task, "wake_up at %llu\n", litmus_clock()); + + raw_spin_lock_irqsave(&gsnedf_lock, flags); + /* We need to take suspensions because of semaphores into + * account! If a job resumes after being suspended due to acquiring + * a semaphore, it should never be treated as a new job release. 
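/* Illustration (not part of the patch): the wake-up policy above in a
 * stand-alone form.  A job that resumes after blocking on a semaphore keeps
 * its current parameters; a job that wakes up past its deadline is treated
 * as a fresh sporadic release.  The flag and field names are simplified
 * stand-ins for the rt_param machinery.
 */
#include <stdio.h>

enum rt_flag { RT_RUNNING, RT_SLEEP, RT_EXIT_SEM };

struct job {
	enum rt_flag flag;
	unsigned long long release;	/* current job release time */
	unsigned long long deadline;	/* current job absolute deadline */
	unsigned long long period;
};

static void wake_up(struct job *j, unsigned long long now)
{
	if (j->flag == RT_EXIT_SEM) {
		/* woke up because a lock was handed over: same job continues */
		j->flag = RT_RUNNING;
	} else if (now >= j->deadline) {
		/* tardy wake-up: new sporadic release anchored at "now" */
		j->release = now;
		j->deadline = now + j->period;
		j->flag = RT_RUNNING;
	} else {
		/* came back before the deadline: keep the current job */
		j->flag = RT_RUNNING;
	}
}

int main(void)
{
	struct job j = { RT_SLEEP, 0, 10, 10 };

	wake_up(&j, 25);	/* tardy: re-released at 25, deadline 35 */
	printf("release=%llu deadline=%llu\n", j.release, j.deadline);
	return 0;
}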
+ */ + if (get_rt_flags(task) == RT_F_EXIT_SEM) { + set_rt_flags(task, RT_F_RUNNING); + } else { + now = litmus_clock(); + if (is_tardy(task, now)) { + /* new sporadic release */ + release_at(task, now); + sched_trace_task_release(task); + } + else { + if (task->rt.time_slice) { + /* came back in time before deadline + */ + set_rt_flags(task, RT_F_RUNNING); + } + } + } + gsnedf_job_arrival(task); + raw_spin_unlock_irqrestore(&gsnedf_lock, flags); +} + +static void gsnedf_task_block(struct task_struct *t) +{ + unsigned long flags; + + TRACE_TASK(t, "block at %llu\n", litmus_clock()); + + /* unlink if necessary */ + raw_spin_lock_irqsave(&gsnedf_lock, flags); + unlink(t); + raw_spin_unlock_irqrestore(&gsnedf_lock, flags); + + BUG_ON(!is_realtime(t)); +} + + +static void gsnedf_task_exit(struct task_struct * t) +{ + unsigned long flags; + + /* unlink if necessary */ + raw_spin_lock_irqsave(&gsnedf_lock, flags); + unlink(t); + if (tsk_rt(t)->scheduled_on != NO_CPU) { + gsnedf_cpus[tsk_rt(t)->scheduled_on]->scheduled = NULL; + tsk_rt(t)->scheduled_on = NO_CPU; + } + raw_spin_unlock_irqrestore(&gsnedf_lock, flags); + + BUG_ON(!is_realtime(t)); + TRACE_TASK(t, "RIP\n"); +} + + +static long gsnedf_admit_task(struct task_struct* tsk) +{ + return 0; +} + +#ifdef CONFIG_LITMUS_LOCKING + +#include + + + +/* called with IRQs off */ +static void __set_priority_inheritance(struct task_struct* t, struct task_struct* prio_inh) +{ + int linked_on; + int check_preempt = 0; + + TRACE_TASK(t, "inherits priority from %s/%d\n", prio_inh->comm, prio_inh->pid); + tsk_rt(t)->inh_task = prio_inh; + + linked_on = tsk_rt(t)->linked_on; + + /* If it is scheduled, then we need to reorder the CPU heap. */ + if (linked_on != NO_CPU) { + TRACE_TASK(t, "%s: linked on %d\n", + __FUNCTION__, linked_on); + /* Holder is scheduled; need to re-order CPUs. + * We can't use heap_decrease() here since + * the cpu_heap is ordered in reverse direction, so + * it is actually an increase. */ + bheap_delete(cpu_lower_prio, &gsnedf_cpu_heap, + gsnedf_cpus[linked_on]->hn); + bheap_insert(cpu_lower_prio, &gsnedf_cpu_heap, + gsnedf_cpus[linked_on]->hn); + } else { + /* holder may be queued: first stop queue changes */ + raw_spin_lock(&gsnedf.release_lock); + if (is_queued(t)) { + TRACE_TASK(t, "%s: is queued\n", + __FUNCTION__); + /* We need to update the position of holder in some + * heap. Note that this could be a release heap if + * budget enforcement is used and this job overran. */ + check_preempt = + !bheap_decrease(edf_ready_order, + tsk_rt(t)->heap_node); + } else { + /* Nothing to do: if it is not queued and not linked + * then it is either sleeping or currently being moved + * by other code (e.g., a timer interrupt handler) that + * will use the correct priority when enqueuing the + * task. */ + TRACE_TASK(t, "%s: is NOT queued => Done.\n", + __FUNCTION__); + } + raw_spin_unlock(&gsnedf.release_lock); + + /* If holder was enqueued in a release heap, then the following + * preemption check is pointless, but we can't easily detect + * that case. If you want to fix this, then consider that + * simply adding a state flag requires O(n) time to update when + * releasing n tasks, which conflicts with the goal to have + * O(log n) merges. */ + if (check_preempt) { + /* heap_decrease() hit the top level of the heap: make + * sure preemption checks get the right task, not the + * potentially stale cache. 
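/* Illustration (not part of the patch): the asymmetry discussed in the
 * comments above.  Raising a job's priority (a smaller key) only needs a
 * sift-up, which is what bheap_decrease() provides; lowering it is not
 * supported by the heap, so the task is unlinked and re-inserted instead.
 * Plain array-backed min-heap, purely for illustration.
 */
#include <stdio.h>

static unsigned long long heap[16];
static int size;

static void swap(int i, int j)
{
	unsigned long long t = heap[i]; heap[i] = heap[j]; heap[j] = t;
}

static void sift_up(int i)
{
	while (i > 0 && heap[i] < heap[(i - 1) / 2]) {
		swap(i, (i - 1) / 2);
		i = (i - 1) / 2;
	}
}

static void sift_down(int i)
{
	for (;;) {
		int l = 2 * i + 1, r = 2 * i + 2, m = i;

		if (l < size && heap[l] < heap[m]) m = l;
		if (r < size && heap[r] < heap[m]) m = r;
		if (m == i) break;
		swap(i, m);
		i = m;
	}
}

static void insert(unsigned long long key)
{
	heap[size++] = key;
	sift_up(size - 1);
}

/* "priority raised": new key is smaller, a sift-up suffices (bheap_decrease) */
static void decrease_key(int i, unsigned long long key)
{
	heap[i] = key;
	sift_up(i);
}

/* "priority lowered": remove the element and re-insert it with its new key,
 * which is what unlink() + gsnedf_job_arrival() accomplish above. */
static void remove_and_reinsert(int i, unsigned long long key)
{
	heap[i] = heap[--size];
	sift_down(i);
	sift_up(i);
	insert(key);
}

int main(void)
{
	insert(30); insert(50); insert(40);
	decrease_key(2, 10);		/* inherit a much earlier deadline */
	printf("min after inheritance: %llu\n", heap[0]);
	remove_and_reinsert(0, 60);	/* inheritance cleared, deadline 60 */
	printf("min after restore: %llu\n", heap[0]);
	return 0;
}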
*/ + bheap_uncache_min(edf_ready_order, + &gsnedf.ready_queue); + check_for_preemptions(); + } + } +} + +static void set_priority_inheritance(struct task_struct* t, struct task_struct* prio_inh) +{ + raw_spin_lock(&gsnedf_lock); + __set_priority_inheritance(t, prio_inh); + raw_spin_unlock(&gsnedf_lock); +} + +static void __clear_priority_inheritance(struct task_struct* t) +{ + /* A job only stops inheriting a priority when it releases a + * resource. Thus we can make the following assumption.*/ + BUG_ON(tsk_rt(t)->scheduled_on == NO_CPU); + + TRACE_TASK(t, "priority restored\n"); + tsk_rt(t)->inh_task = NULL; + + /* Check if rescheduling is necessary. We can't use heap_decrease() + * since the priority was effectively lowered. */ + unlink(t); + gsnedf_job_arrival(t); +} + +/* set and clear at the same time to avoid having to + * acquire the runqueue lock twice */ +static void update_priority_inheritance( + struct task_struct* deprived, + struct task_struct* blocker, + struct task_struct* blocked) +{ + /* things to do: + * 1) deprived no longer inherits anything. + * 2) blocker gets blocked's priority. + */ + + raw_spin_lock(&gsnedf_lock); + + if (tsk_rt(deprived)->inh_task) + __clear_priority_inheritance(deprived); + + if (blocked) + __set_priority_inheritance(blocker, blocked); + + raw_spin_unlock(&gsnedf_lock); +} + + +/* ******************** FMLP support ********************** */ + +/* struct for semaphore with priority inheritance */ +struct fmlp_semaphore { + struct litmus_lock litmus_lock; + + /* current resource holder */ + struct task_struct *owner; + + /* highest-priority waiter */ + struct task_struct *hp_waiter; + + /* FIFO queue of waiting tasks */ + wait_queue_head_t wait; +}; + +static inline struct fmlp_semaphore* fmlp_from_lock(struct litmus_lock* lock) +{ + return container_of(lock, struct fmlp_semaphore, litmus_lock); +} + +/* caller is responsible for locking */ +struct task_struct* find_hp_waiter(struct fmlp_semaphore *sem, + struct task_struct* skip) +{ + struct list_head *pos; + struct task_struct *queued, *found = NULL; + + list_for_each(pos, &sem->wait.task_list) { + queued = (struct task_struct*) list_entry(pos, wait_queue_t, + task_list)->private; + + /* Compare task prios, find high prio task. */ + if (queued != skip && edf_higher_prio(queued, found)) + found = queued; + } + return found; +} + +int gsnedf_fmlp_lock(struct litmus_lock* l) +{ + struct task_struct* t = current; + struct fmlp_semaphore *sem = fmlp_from_lock(l); + wait_queue_t wait; + unsigned long flags; + + if (!is_realtime(t)) + return -EPERM; + + spin_lock_irqsave(&sem->wait.lock, flags); + + if (sem->owner) { + /* resource is not free => must suspend and wait */ + + init_waitqueue_entry(&wait, t); + + /* FIXME: interruptible would be nice some day */ + set_task_state(t, TASK_UNINTERRUPTIBLE); + + __add_wait_queue_tail_exclusive(&sem->wait, &wait); + + /* check if we need to activate priority inheritance */ + if (edf_higher_prio(t, sem->hp_waiter)) { + sem->hp_waiter = t; + if (edf_higher_prio(t, sem->owner)) + set_priority_inheritance(sem->owner, sem->hp_waiter); + } + + TS_LOCK_SUSPEND; + + /* release lock before sleeping */ + spin_unlock_irqrestore(&sem->wait.lock, flags); + + /* We depend on the FIFO order. Thus, we don't need to recheck + * when we wake up; we are guaranteed to have the lock since + * there is only one wake up per release. + */ + + schedule(); + + TS_LOCK_RESUME; + + /* Since we hold the lock, no other task will change + * ->owner. 
We can thus check it without acquiring the spin + * lock. */ + BUG_ON(sem->owner != t); + } else { + /* it's ours now */ + sem->owner = t; + + spin_unlock_irqrestore(&sem->wait.lock, flags); + } + + return 0; +} + +int gsnedf_fmlp_unlock(struct litmus_lock* l) +{ + struct task_struct *t = current, *next, *blocked = NULL; + struct fmlp_semaphore *sem = fmlp_from_lock(l); + unsigned long flags; + int err = 0; + + spin_lock_irqsave(&sem->wait.lock, flags); + + if (sem->owner != t) { + err = -EINVAL; + goto out; + } + + /* check if there are jobs waiting for this resource */ + next = __waitqueue_remove_first(&sem->wait); + if (next) { + /* next becomes the resouce holder */ + sem->owner = next; + TRACE_CUR("lock ownership passed to %s/%d\n", next->comm, next->pid); + + /* determine new hp_waiter if necessary */ + if (next == sem->hp_waiter) { + TRACE_TASK(next, "was highest-prio waiter\n"); + /* next has the highest priority --- it doesn't need to + * inherit. However, we need to make sure that the + * next-highest priority in the queue is reflected in + * hp_waiter. */ + sem->hp_waiter = find_hp_waiter(sem, next); + if (sem->hp_waiter) + TRACE_TASK(sem->hp_waiter, "is new highest-prio waiter\n"); + else + TRACE("no further waiters\n"); + } else { + /* Well, if next is not the highest-priority waiter, + * then it ought to inherit the highest-priority + * waiter's priority. */ + blocked = sem->hp_waiter; + } + + /* wake up next */ + wake_up_process(next); + } else + /* becomes available */ + sem->owner = NULL; + + /* we lose the benefit of priority inheritance (if any) */ + if (tsk_rt(t)->inh_task || blocked) + update_priority_inheritance(t, next, blocked); + +out: + spin_unlock_irqrestore(&sem->wait.lock, flags); + + return err; +} + +int gsnedf_fmlp_close(struct litmus_lock* l) +{ + struct task_struct *t = current; + struct fmlp_semaphore *sem = fmlp_from_lock(l); + unsigned long flags; + + int owner; + + spin_lock_irqsave(&sem->wait.lock, flags); + + owner = sem->owner == t; + + spin_unlock_irqrestore(&sem->wait.lock, flags); + + if (owner) + gsnedf_fmlp_unlock(l); + + return 0; +} + +void gsnedf_fmlp_free(struct litmus_lock* lock) +{ + kfree(fmlp_from_lock(lock)); +} + +static struct litmus_lock_ops gsnedf_fmlp_lock_ops = { + .close = gsnedf_fmlp_close, + .lock = gsnedf_fmlp_lock, + .unlock = gsnedf_fmlp_unlock, + .deallocate = gsnedf_fmlp_free, +}; + +static struct litmus_lock* gsnedf_new_fmlp(void) +{ + struct fmlp_semaphore* sem; + + sem = kmalloc(sizeof(*sem), GFP_KERNEL); + if (!sem) + return NULL; + + sem->owner = NULL; + sem->hp_waiter = NULL; + init_waitqueue_head(&sem->wait); + sem->litmus_lock.ops = &gsnedf_fmlp_lock_ops; + + return &sem->litmus_lock; +} + + +/* ******************** OMLP support ********************** */ + +/* struct for semaphore with priority inheritance */ +struct omlp_semaphore { + struct litmus_lock litmus_lock; + + /* current resource holder */ + struct task_struct *owner; + + /* highest-priority waiter */ + struct task_struct *hp_waiter; + + /* FIFO queue of waiting tasks */ + wait_queue_head_t fifo_wait; + /* Priority queue of waiting tasks */ + wait_queue_head_t prio_wait; + + /* How many slots remaining in FIFO queue? 
*/ + unsigned int num_free; +}; + +static inline struct omlp_semaphore* omlp_from_lock(struct litmus_lock* lock) +{ + return container_of(lock, struct omlp_semaphore, litmus_lock); +} + +/* already locked */ +static void omlp_enqueue(struct omlp_semaphore *sem, prio_wait_queue_t* wait) +{ + if (sem->num_free) { + /* there is space in the FIFO queue */ + sem->num_free--; + __add_wait_queue_tail_exclusive(&sem->fifo_wait, &wait->wq); + } else { + /* nope, gotta go to the priority queue */ + __add_wait_queue_prio_exclusive(&sem->prio_wait, wait); + } +} + +/* already locked */ +static int omlp_move(struct omlp_semaphore *sem) +{ + struct list_head* first; + + if (waitqueue_active(&sem->prio_wait)) { + first = sem->prio_wait.task_list.next; + list_move_tail(first, &sem->fifo_wait.task_list); + return 1; + } + else + return 0; +} + +static struct task_struct* omlp_dequeue(struct omlp_semaphore *sem) +{ + struct task_struct* first = __waitqueue_remove_first(&sem->fifo_wait); + + if (first && !omlp_move(sem)) + sem->num_free++; + + return first; +} + +/* caller is responsible for locking */ +static struct task_struct* omlp_find_hp_waiter(struct omlp_semaphore *sem, + struct task_struct* skip) +{ + struct list_head *pos; + struct task_struct *queued, *found = NULL; + + /* check FIFO queue first */ + list_for_each(pos, &sem->fifo_wait.task_list) { + queued = (struct task_struct*) list_entry(pos, wait_queue_t, + task_list)->private; + + /* Compare task prios, find high prio task. */ + if (queued != skip && edf_higher_prio(queued, found)) + found = queued; + } + + /* check priority queue next */ + if (waitqueue_active(&sem->prio_wait)) { + /* first has highest priority */ + pos = sem->prio_wait.task_list.next; + queued = (struct task_struct*) list_entry(pos, wait_queue_t, + task_list)->private; + if (edf_higher_prio(queued, found)) + found = queued; + } + + return found; +} + +int gsnedf_omlp_lock(struct litmus_lock* l) +{ + struct task_struct* t = current; + struct omlp_semaphore *sem = omlp_from_lock(l); + prio_wait_queue_t wait; + unsigned long flags; + + if (!is_realtime(t)) + return -EPERM; + + spin_lock_irqsave(&sem->fifo_wait.lock, flags); + + if (sem->owner) { + /* resource is not free => must suspend and wait */ + + init_prio_waitqueue_entry(&wait, t, get_deadline(t)); + + set_task_state(t, TASK_UNINTERRUPTIBLE); + + omlp_enqueue(sem, &wait); + + /* check if we need to activate priority inheritance */ + if (edf_higher_prio(t, sem->hp_waiter)) { + sem->hp_waiter = t; + if (edf_higher_prio(t, sem->owner)) + set_priority_inheritance(sem->owner, sem->hp_waiter); + } + + TS_LOCK_SUSPEND; + + /* release lock before sleeping */ + spin_unlock_irqrestore(&sem->fifo_wait.lock, flags); + + schedule(); + + TS_LOCK_RESUME; + + /* Since we hold the lock, no other task will change + * ->owner. We can thus check it without acquiring the spin + * lock. 
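/* Illustration (not part of the patch): a stand-alone model of the OMLP
 * queueing discipline implemented by omlp_enqueue()/omlp_move()/
 * omlp_dequeue() above.  At most "num_free" additional waiters sit in a
 * FIFO (the owner already occupies one of the m slots, hence m - 1 below);
 * everyone else waits in a priority queue, ordered by deadline, and is
 * promoted into the FIFO as slots open up.  Arrays stand in for the wait
 * queues; all names are invented for the example.
 */
#include <stdio.h>
#include <string.h>

#define MAX 8

struct q { unsigned long long dl[MAX]; int n; };

static struct q fifo, prio;
static int num_free;			/* free FIFO slots */

static void prio_insert(unsigned long long d)
{
	int i = prio.n++;

	while (i > 0 && prio.dl[i - 1] > d) {	/* keep earliest deadline first */
		prio.dl[i] = prio.dl[i - 1];
		i--;
	}
	prio.dl[i] = d;
}

static void enqueue(unsigned long long d)
{
	if (num_free) {				/* omlp_enqueue(): FIFO slot left */
		num_free--;
		fifo.dl[fifo.n++] = d;
	} else {				/* overflow into the priority queue */
		prio_insert(d);
	}
}

/* omlp_dequeue(): pop the FIFO head; if the priority queue is non-empty,
 * promote its head into the FIFO (omlp_move()), else a slot becomes free. */
static unsigned long long dequeue(void)
{
	unsigned long long head = fifo.dl[0];

	memmove(fifo.dl, fifo.dl + 1, (--fifo.n) * sizeof(fifo.dl[0]));
	if (prio.n) {
		fifo.dl[fifo.n++] = prio.dl[0];
		memmove(prio.dl, prio.dl + 1, (--prio.n) * sizeof(prio.dl[0]));
	} else {
		num_free++;
	}
	return head;
}

int main(void)
{
	num_free = 3 - 1;	/* e.g. m = 3 CPUs: owner + 2 FIFO slots */
	enqueue(40); enqueue(10); enqueue(5); enqueue(20);

	/* FIFO keeps arrival order (40, 10); 5 and 20 wait by priority
	 * and are promoted in deadline order. */
	while (fifo.n)
		printf("granted to job with deadline %llu\n", dequeue());
	return 0;
}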
*/ + BUG_ON(sem->owner != t); + } else { + /* it's ours now */ + sem->owner = t; + + spin_unlock_irqrestore(&sem->fifo_wait.lock, flags); + } + + return 0; +} + +static int gsnedf_omlp_unlock(struct litmus_lock* l) +{ + struct task_struct *t = current, *next, *blocked = NULL; + struct omlp_semaphore *sem = omlp_from_lock(l); + unsigned long flags; + int err = 0; + + spin_lock_irqsave(&sem->fifo_wait.lock, flags); + + if (sem->owner != t) { + err = -EINVAL; + goto out; + } + + /* check if there are jobs waiting for this resource */ + next = omlp_dequeue(sem); + if (next) { + /* next becomes the resouce holder */ + sem->owner = next; + TRACE_CUR("lock ownership passed to %s/%d\n", next->comm, next->pid); + + /* determine new hp_waiter if necessary */ + if (next == sem->hp_waiter) { + TRACE_TASK(next, "was highest-prio waiter\n"); + /* next has the highest priority --- it doesn't need to + * inherit. However, we need to make sure that the + * next-highest priority in the queue is reflected in + * hp_waiter. */ + sem->hp_waiter = omlp_find_hp_waiter(sem, next); + if (sem->hp_waiter) + TRACE_TASK(sem->hp_waiter, "is new highest-prio waiter\n"); + else + TRACE("no further waiters\n"); + } else { + /* Well, if next is not the highest-priority waiter, + * then it ought to inherit the highest-priority + * waiter's priority. */ + blocked = sem->hp_waiter; + } + + /* wake up next */ + wake_up_process(next); + } else + /* becomes available */ + sem->owner = NULL; + + /* we lose the benefit of priority inheritance (if any) */ + if (tsk_rt(t)->inh_task || blocked) + update_priority_inheritance(t, next, blocked); + +out: + spin_unlock_irqrestore(&sem->fifo_wait.lock, flags); + + return err; +} + +static int gsnedf_omlp_close(struct litmus_lock* l) +{ + struct task_struct *t = current; + struct omlp_semaphore *sem = omlp_from_lock(l); + unsigned long flags; + + int owner; + + spin_lock_irqsave(&sem->fifo_wait.lock, flags); + + owner = sem->owner == t; + + spin_unlock_irqrestore(&sem->fifo_wait.lock, flags); + + if (owner) + gsnedf_omlp_unlock(l); + + return 0; +} + +static void gsnedf_omlp_free(struct litmus_lock* lock) +{ + kfree(omlp_from_lock(lock)); +} + +static struct litmus_lock_ops gsnedf_omlp_lock_ops = { + .close = gsnedf_omlp_close, + .lock = gsnedf_omlp_lock, + .unlock = gsnedf_omlp_unlock, + .deallocate = gsnedf_omlp_free, +}; + +static struct litmus_lock* gsnedf_new_omlp(void) +{ + struct omlp_semaphore* sem; + + sem = kmalloc(sizeof(*sem), GFP_KERNEL); + if (!sem) + return NULL; + + sem->owner = NULL; + sem->hp_waiter = NULL; + init_waitqueue_head(&sem->fifo_wait); + init_waitqueue_head(&sem->prio_wait); + sem->litmus_lock.ops = &gsnedf_omlp_lock_ops; + /* free = cpus -1 since ->owner is the head and also counted */ + sem->num_free = num_online_cpus() - 1; + +#ifdef CONFIG_RELEASE_MASTER + /* If we use dedicated interrupt handling, then there are actually + * only m - 1 CPUs around. */ + if (gsnedf.release_master != NO_CPU) + sem->num_free -= 1; +#endif + + return &sem->litmus_lock; +} + + +/* **** lock constructor **** */ + + +static long gsnedf_allocate_lock(struct litmus_lock **lock, int type, + void* __user unused) +{ + int err = -ENXIO; + + /* GSN-EDF currently only supports the FMLP for global resources. 
*/ + switch (type) { + + case FMLP_SEM: + /* Flexible Multiprocessor Locking Protocol */ + *lock = gsnedf_new_fmlp(); + if (*lock) + err = 0; + else + err = -ENOMEM; + break; + + case OMLP_SEM: + /* O(m) Multiprocessor Locking Protocol */ + *lock = gsnedf_new_omlp(); + if (*lock) + err = 0; + else + err = -ENOMEM; + break; + + }; + + return err; +} + +#endif + + +static long gsnedf_activate_plugin(void) +{ + int cpu; + cpu_entry_t *entry; + + bheap_init(&gsnedf_cpu_heap); +#ifdef CONFIG_RELEASE_MASTER + gsnedf.release_master = atomic_read(&release_master_cpu); +#endif + + for_each_online_cpu(cpu) { + entry = &per_cpu(gsnedf_cpu_entries, cpu); + bheap_node_init(&entry->hn, entry); + entry->linked = NULL; + entry->scheduled = NULL; +#ifdef CONFIG_RELEASE_MASTER + if (cpu != gsnedf.release_master) { +#endif + TRACE("GSN-EDF: Initializing CPU #%d.\n", cpu); + update_cpu_position(entry); +#ifdef CONFIG_RELEASE_MASTER + } else { + TRACE("GSN-EDF: CPU %d is release master.\n", cpu); + } +#endif + } + return 0; +} + +/* Plugin object */ +static struct sched_plugin gsn_edf_plugin __cacheline_aligned_in_smp = { + .plugin_name = "GSN-EDF", + .finish_switch = gsnedf_finish_switch, + .tick = gsnedf_tick, + .task_new = gsnedf_task_new, + .complete_job = complete_job, + .task_exit = gsnedf_task_exit, + .schedule = gsnedf_schedule, + .task_wake_up = gsnedf_task_wake_up, + .task_block = gsnedf_task_block, + .admit_task = gsnedf_admit_task, + .activate_plugin = gsnedf_activate_plugin, +#ifdef CONFIG_LITMUS_LOCKING + .allocate_lock = gsnedf_allocate_lock, +#endif +}; + + +static int __init init_gsn_edf(void) +{ + int cpu; + cpu_entry_t *entry; + + bheap_init(&gsnedf_cpu_heap); + /* initialize CPU state */ + for (cpu = 0; cpu < NR_CPUS; cpu++) { + entry = &per_cpu(gsnedf_cpu_entries, cpu); + gsnedf_cpus[cpu] = entry; + entry->cpu = cpu; + entry->hn = &gsnedf_heap_node[cpu]; + bheap_node_init(&entry->hn, entry); + } + edf_domain_init(&gsnedf, NULL, gsnedf_release_jobs); + return register_sched_plugin(&gsn_edf_plugin); +} + + +module_init(init_gsn_edf); diff --git a/litmus/sched_gsn_edf_split_namechange.c b/litmus/sched_gsn_edf_split_namechange.c new file mode 100644 index 000000000000..6839ae642b3a --- /dev/null +++ b/litmus/sched_gsn_edf_split_namechange.c @@ -0,0 +1,1165 @@ +/* + * litmus/sched_gsn_edf.c + * + * Implementation of the GSN-EDF scheduling algorithm with job splitting, i.e. + * GSN-EDF. + * + * This plugin is a modified version of the prior GSN-EDF plugin in + * litmus/sched_gsn_edf.c + * + * Splitting an implicit-deadline job simply means splitting each job into an + * integral number of subjobs. For example, a task with a period of 10 ms and + * a runtime of 4 ms could be re-organized as a task with a period of 5 ms and + * a runtime of 2 ms, with analytical benefit for bounded tardiness (ignoring + * overheads and assuming no critical sections). This would have a "splitting + * factor" of 2. + * + * Because our analysis works with early releasing, we actually only release + * each job once, but move the subjob deadline back when the appropriate amount + * of execution has been completed. (In the example above, a job released at + * time 0 would intially have a subjob deadline at time 5, but this deadline + * would be moved to time 10 as soon as 2 ms of execution had completed.) 
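/* Illustration (not part of the patch): the splitting arithmetic described
 * above, runnable in user space.  get_slice_num() maps the execution time
 * consumed so far to a subjob index (1..split), and get_proper_deadline()
 * places that subjob's deadline at release + slice * period / split.  The
 * values are in milliseconds here purely for readability; the field names
 * are simplified stand-ins for rt_param.
 */
#include <stdio.h>

struct split_task {
	unsigned long long exec_cost;	/* total budget per job */
	unsigned long long period;	/* = relative deadline (implicit) */
	unsigned long long split;	/* number of subjobs per job */
};

static unsigned long long slice_num(const struct split_task *t,
				    unsigned long long exec_time)
{
	unsigned long long basic = exec_time * t->split / t->exec_cost + 1;

	/* budget is not policed, so never go past the last subjob */
	return basic <= t->split ? basic : t->split;
}

static unsigned long long proper_deadline(const struct split_task *t,
					  unsigned long long release,
					  unsigned long long exec_time)
{
	return release + t->period * slice_num(t, exec_time) / t->split;
}

int main(void)
{
	/* the example from the comment above: period 10 ms, cost 4 ms, split 2 */
	struct split_task t = { 4, 10, 2 };
	unsigned long long release = 0;

	for (unsigned long long done = 0; done <= 4; done++)
		printf("exec=%llu ms -> subjob %llu, deadline %llu ms\n",
		       done, slice_num(&t, done),
		       proper_deadline(&t, release, done));
	/* prints deadline 5 until 2 ms have executed, then deadline 10 */
	return 0;
}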
+ */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include + +#ifdef CONFIG_SCHED_CPU_AFFINITY +#include +#endif + +#include + +/* cpu_entry_t - maintain the linked and scheduled state + */ +typedef struct { + int cpu; + struct task_struct* linked; /* only RT tasks */ + struct task_struct* scheduled; /* only RT tasks */ + struct bheap_node* hn; + struct hrtimer split_timer; + int timer_armed; +} cpu_entry_t; +DEFINE_PER_CPU(cpu_entry_t, gsnedf_cpu_entries); + +cpu_entry_t* gsnedf_cpus[NR_CPUS]; + +/* the cpus queue themselves according to priority in here */ +static struct bheap_node gsnedf_heap_node[NR_CPUS]; +static struct bheap gsnedf_cpu_heap; + +static rt_domain_t gsnedf; +#define gsnedf_lock (gsnedf.ready_lock) + +inline static int get_slice_num(struct task_struct* t) +{ + int basic = ((t->rt_param.job_params.exec_time * + t->rt_param.task_params.split) / + t->rt_param.task_params.exec_cost) + 1; + if (basic <= t->rt_param.task_params.split){ + return basic; + } + else{ + /*Since we don't police budget, just leave where it's at.*/ + return t->rt_param.task_params.split; + } +} + +/* Returns the appropriate subjob deadline.*/ +inline static lt_t get_proper_deadline(struct task_struct* t) +{ + return t->rt_param.job_params.release + + ((t->rt_param.task_params.period * get_slice_num(t)) + / t->rt_param.task_params.split); +} + +/* Tells us if the current deadline is too small.*/ +inline static int needs_deadline_move(struct task_struct* t) +{ + BUG_ON(get_proper_deadline(t) < t->rt_param.job_params.subjob_deadline); +#ifdef CONFIG_LITMUS_LOCKING + return !is_in_crit_section(t) && + (get_proper_deadline(t) != + tsk_rt(t)->job_params.subjob_deadline); +#else + return get_proper_deadline(t) != tsk_rt(t)->job_params.subjob_deadline; +#endif +} + +/*Returns execution time until the next deadline move. + * 0 means the task has no more deadline moves + */ +inline static lt_t time_to_next_move(struct task_struct* t) +{ + if (get_slice_num(t) == t->rt_param.task_params.split){ + return 0; + } + /* +1 upper bounds ceiling, since integer division is floor*/ + return ((get_slice_num(t) * t->rt_param.task_params.exec_cost) + / t->rt_param.task_params.split) + 1 + - t->rt_param.job_params.exec_time; +} + +/* Timer stuff - similar to budget.c. */ +static enum hrtimer_restart on_split_timeout(struct hrtimer *timer) +{ + cpu_entry_t* st = container_of(timer, + cpu_entry_t, + split_timer); + + unsigned long flags; + + local_irq_save(flags); + TRACE("split timer fired.\n"); + st->timer_armed = 0; + /* Activate scheduler */ + litmus_reschedule_local(); + local_irq_restore(flags); + + return HRTIMER_NORESTART; +} + +static void cancel_split_timer(cpu_entry_t* ce) +{ + int ret; + + TRACE("cancelling split time.\n"); + + /* Since interrupts are disabled and et->timer_armed is only + * modified locally, we do not need any locks. + */ + + if (ce->timer_armed) { + ret = hrtimer_try_to_cancel(&ce->split_timer); + /* Should never be inactive. 
*/ + BUG_ON(ret == 0); + /* Should never be running concurrently.*/ + BUG_ON(ret == -1); + + ce->timer_armed = 0; + } +} + +/* assumes called with IRQs off */ +static void arm_split_timer(cpu_entry_t *ce, + struct task_struct* t) +{ + lt_t when_to_fire; + lt_t time_to_move; + TRACE_TASK(t, "arming split timer.\n"); + + /* __hrtimer_start_range_ns() cancels the timer + * anyway, so we don't have to check whether it is still armed */ + + /*We won't do any new deadline moves if the budget has been exhausted*/ + if (likely(!is_np(t) && (time_to_move = time_to_next_move(t)))) { + when_to_fire = litmus_clock() + time_to_move; + TRACE_TASK(t, "actually arming for %llu into the future\n", + time_to_move); + __hrtimer_start_range_ns(&ce->split_timer, + ns_to_ktime(when_to_fire), + 0 /* delta */, + HRTIMER_MODE_ABS_PINNED, + 0 /* no wakeup */); + ce->timer_armed = 1; + } +} + +/* Uncomment this if you want to see all scheduling decisions in the + * TRACE() log. +#define WANT_ALL_SCHED_EVENTS + */ + +static int cpu_lower_prio(struct bheap_node *_a, struct bheap_node *_b) +{ + cpu_entry_t *a, *b; + a = _a->value; + b = _b->value; + /* Note that a and b are inverted: we want the lowest-priority CPU at + * the top of the heap. + */ + return edf_higher_prio(b->linked, a->linked); +} + +/* update_cpu_position - Move the cpu entry to the correct place to maintain + * order in the cpu queue. Caller must hold gsnedf lock. + */ +static void update_cpu_position(cpu_entry_t *entry) +{ + if (likely(bheap_node_in_heap(entry->hn))) + bheap_delete(cpu_lower_prio, &gsnedf_cpu_heap, entry->hn); + bheap_insert(cpu_lower_prio, &gsnedf_cpu_heap, entry->hn); +} + +/* caller must hold gsnedf lock */ +static cpu_entry_t* lowest_prio_cpu(void) +{ + struct bheap_node* hn; + hn = bheap_peek(cpu_lower_prio, &gsnedf_cpu_heap); + return hn->value; +} + + +/* link_task_to_cpu - Update the link of a CPU. + * Handles the case where the to-be-linked task is already + * scheduled on a different CPU. + */ +static noinline void link_task_to_cpu(struct task_struct* linked, + cpu_entry_t *entry) +{ + cpu_entry_t *sched; + struct task_struct* tmp; + int on_cpu; + + BUG_ON(linked && !is_realtime(linked)); + + /* Currently linked task is set to be unlinked. */ + if (entry->linked) { + entry->linked->rt_param.linked_on = NO_CPU; + } + + /* Link new task to CPU. */ + if (linked) { + set_rt_flags(linked, RT_F_RUNNING); + /* handle task is already scheduled somewhere! */ + on_cpu = linked->rt_param.scheduled_on; + if (on_cpu != NO_CPU) { + sched = &per_cpu(gsnedf_cpu_entries, on_cpu); + /* this should only happen if not linked already */ + BUG_ON(sched->linked == linked); + + /* If we are already scheduled on the CPU to which we + * wanted to link, we don't need to do the swap -- + * we just link ourselves to the CPU and depend on + * the caller to get things right. + */ + if (entry != sched) { + TRACE_TASK(linked, + "already scheduled on %d, updating link.\n", + sched->cpu); + tmp = sched->linked; + linked->rt_param.linked_on = sched->cpu; + sched->linked = linked; + update_cpu_position(sched); + linked = tmp; + } + } + if (linked) /* might be NULL due to swap */ + linked->rt_param.linked_on = entry->cpu; + } + entry->linked = linked; +#ifdef WANT_ALL_SCHED_EVENTS + if (linked) + TRACE_TASK(linked, "linked to %d.\n", entry->cpu); + else + TRACE("NULL linked to %d.\n", entry->cpu); +#endif + update_cpu_position(entry); +} + +/* unlink - Make sure a task is not linked any longer to an entry + * where it was linked before. Must hold gsnedf_lock. 
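/* Illustration (not part of the patch): the inversion in cpu_lower_prio()
 * above, shown on its own.  The comparator swaps its arguments so that the
 * heap root is the CPU whose linked job has the *lowest* priority, i.e. the
 * natural preemption victim; an idle CPU (NULL linked) beats everything.
 * A linear scan stands in for the bheap; names are invented for the example.
 */
#include <stdio.h>

struct job { unsigned long long deadline; };

struct cpu { int id; const struct job *linked; };

/* EDF: a beats b if a exists and has the earlier deadline (NULL loses). */
static int higher_prio(const struct job *a, const struct job *b)
{
	if (!a)
		return 0;
	if (!b)
		return 1;
	return a->deadline < b->deadline;
}

/* cpu_lower_prio(a, b): note the swapped arguments -- a orders before b
 * exactly when b's linked job has higher priority than a's. */
static int cpu_lower_prio(const struct cpu *a, const struct cpu *b)
{
	return higher_prio(b->linked, a->linked);
}

static const struct cpu *victim(const struct cpu *cpus, int n)
{
	const struct cpu *worst = &cpus[0];

	for (int i = 1; i < n; i++)
		if (cpu_lower_prio(&cpus[i], worst))
			worst = &cpus[i];
	return worst;
}

int main(void)
{
	struct job j1 = { 100 }, j2 = { 50 };
	struct cpu cpus[3] = { { 0, &j1 }, { 1, &j2 }, { 2, NULL } };

	/* CPU 2 is idle, so it is the first preemption victim. */
	printf("victim: CPU %d\n", victim(cpus, 3)->id);
	return 0;
}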
+ */ +static noinline void unlink(struct task_struct* t) +{ + cpu_entry_t *entry; + + if (t->rt_param.linked_on != NO_CPU) { + /* unlink */ + entry = &per_cpu(gsnedf_cpu_entries, t->rt_param.linked_on); + t->rt_param.linked_on = NO_CPU; + link_task_to_cpu(NULL, entry); + } else if (is_queued(t)) { + /* This is an interesting situation: t is scheduled, + * but was just recently unlinked. It cannot be + * linked anywhere else (because then it would have + * been relinked to this CPU), thus it must be in some + * queue. We must remove it from the list in this + * case. + */ + remove(&gsnedf, t); + } +} + + +/* preempt - force a CPU to reschedule + */ +static void preempt(cpu_entry_t *entry) +{ + preempt_if_preemptable(entry->scheduled, entry->cpu); +} + +/* requeue - Put an unlinked task into gsn-edf domain. + * Caller must hold gsnedf_lock. + */ +static noinline void requeue(struct task_struct* task) +{ + BUG_ON(!task); + /* sanity check before insertion */ + BUG_ON(is_queued(task)); + + if (is_released(task, litmus_clock())) + __add_ready(&gsnedf, task); + else { + /* it has got to wait */ + add_release(&gsnedf, task); + } +} + +#ifdef CONFIG_SCHED_CPU_AFFINITY +static cpu_entry_t* gsnedf_get_nearest_available_cpu(cpu_entry_t *start) +{ + cpu_entry_t *affinity; + + get_nearest_available_cpu(affinity, start, gsnedf_cpu_entries, +#ifdef CONFIG_RELEASE_MASTER + gsnedf.release_master +#else + NO_CPU +#endif + ); + + return(affinity); +} +#endif + +/* check for any necessary preemptions */ +static void check_for_preemptions(void) +{ + struct task_struct *task; + cpu_entry_t *last; + + for (last = lowest_prio_cpu(); + edf_preemption_needed(&gsnedf, last->linked); + last = lowest_prio_cpu()) { + /* preemption necessary */ + task = __take_ready(&gsnedf); + TRACE("check_for_preemptions: attempting to link task %d to %d\n", + task->pid, last->cpu); + +#ifdef CONFIG_SCHED_CPU_AFFINITY + { + cpu_entry_t *affinity = + gsnedf_get_nearest_available_cpu( + &per_cpu(gsnedf_cpu_entries, + task_cpu(task))); + if (affinity) + last = affinity; + else if (last->linked) + requeue(last->linked); + } +#else + if (last->linked) + requeue(last->linked); +#endif + + link_task_to_cpu(task, last); + preempt(last); + } +} + +/* gsnedf_job_arrival: task is either resumed or released */ +static noinline void gsnedf_job_arrival(struct task_struct* task) +{ + BUG_ON(!task); + + requeue(task); + check_for_preemptions(); +} + +static void gsnedf_release_jobs(rt_domain_t* rt, struct bheap* tasks) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&gsnedf_lock, flags); + + __merge_ready(rt, tasks); + check_for_preemptions(); + + raw_spin_unlock_irqrestore(&gsnedf_lock, flags); +} + +/* caller holds gsnedf_lock */ +static noinline void job_completion(struct task_struct *t, int forced) +{ + BUG_ON(!t); + + sched_trace_task_completion(t, forced); + + TRACE_TASK(t, "job_completion().\n"); + + /* set flags */ + set_rt_flags(t, RT_F_SLEEP); + /* prepare for next period */ + /* prepare_for_next_period assumes implicit deadlines and no splitting, + * so we call it with the job deadline it expects. + */ + t->rt_param.job_params.deadline = t->rt_param.job_params.release + + t->rt_param.task_params.period; + prepare_for_next_period(t); + /* We now set the subjob deadline to what it should be for scheduling + * priority. + */ + t->rt_param.job_params.subjob_deadline = get_proper_deadline(t); + if (is_released(t, litmus_clock())) + sched_trace_task_release(t); + /* unlink */ + unlink(t); + /* requeue + * But don't requeue a blocking task. 
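/* Illustration (not part of the patch): the two-step deadline handling in
 * job_completion() above, in isolation.  prepare_for_next_period() only
 * understands whole implicit-deadline jobs, so the real job deadline is
 * restored before calling it, and the subjob deadline of the new job is
 * recomputed afterwards.  Types and names are simplified stand-ins.
 */
#include <stdio.h>

struct tsk {
	unsigned long long period, split;
	unsigned long long release, deadline, subjob_deadline;
};

/* stand-in for prepare_for_next_period(): advance by one whole period */
static void prepare_next(struct tsk *t)
{
	t->release = t->deadline;	/* next job released at old deadline */
	t->deadline = t->release + t->period;
}

static void job_completion(struct tsk *t)
{
	/* 1) give prepare_next() the implicit deadline it expects */
	t->deadline = t->release + t->period;
	prepare_next(t);
	/* 2) first subjob of the new job: deadline one split-length in */
	t->subjob_deadline = t->release + t->period / t->split;
}

int main(void)
{
	struct tsk t = { 10, 2, 0, 10, 10 };	/* just finished its last subjob */

	job_completion(&t);
	printf("new release=%llu deadline=%llu subjob deadline=%llu\n",
	       t.release, t.deadline, t.subjob_deadline);
	return 0;
}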
*/ + if (is_running(t)) + gsnedf_job_arrival(t); +} + +static void move_deadline(struct task_struct *t) +{ + tsk_rt(t)->job_params.subjob_deadline = get_proper_deadline(t); + TRACE_TASK(t, "move_deadline called\nRelease: %llu\nPeriod: %llu" + "\nRelease + Period: %llu\nDeadline: %llu" + "\nDeadline - Release: %llu\n", + t->rt_param.job_params.release, + t->rt_param.task_params.period, + t->rt_param.job_params.release + + t->rt_param.task_params.period, + t->rt_param.job_params.subjob_deadline, + t->rt_param.job_params.subjob_deadline + - t->rt_param.job_params.release); + /* Check if rescheduling needed with lower priority. */ + unlink(t); + gsnedf_job_arrival(t); +} + +/* gsnedf_tick - this function is called for every local timer + * interrupt. + * + * checks whether the current task has expired and checks + * whether we need to preempt it if it has not expired + */ +static void gsnedf_tick(struct task_struct* t) +{ + if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) { + if (!is_np(t)) { + /* np tasks will be preempted when they become + * preemptable again + */ + litmus_reschedule_local(); + TRACE("gsnedf_scheduler_tick: " + "%d is preemptable " + " => FORCE_RESCHED\n", t->pid); + } else if (is_user_np(t)) { + TRACE("gsnedf_scheduler_tick: " + "%d is non-preemptable, " + "preemption delayed.\n", t->pid); + request_exit_np(t); + } + } +} + +/* Getting schedule() right is a bit tricky. schedule() may not make any + * assumptions on the state of the current task since it may be called for a + * number of reasons. The reasons include a scheduler_tick() determined that it + * was necessary, because sys_exit_np() was called, because some Linux + * subsystem determined so, or even (in the worst case) because there is a bug + * hidden somewhere. Thus, we must take extreme care to determine what the + * current state is. + * + * The CPU could currently be scheduling a task (or not), be linked (or not). + * + * The following assertions for the scheduled task could hold: + * + * - !is_running(scheduled) // the job blocks + * - scheduled->timeslice == 0 // the job completed (forcefully) + * - get_rt_flag() == RT_F_SLEEP // the job completed (by syscall) + * - linked != scheduled // we need to reschedule (for any reason) + * - is_np(scheduled) // rescheduling must be delayed, + * sys_exit_np must be requested + * + * Any of these can occur together. + */ +static struct task_struct* gsnedf_schedule(struct task_struct * prev) +{ + cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries); + int out_of_time, sleep, preempt, np, exists, blocks, needs_move; + struct task_struct* next = NULL; + +#ifdef CONFIG_RELEASE_MASTER + /* Bail out early if we are the release master. + * The release master never schedules any real-time tasks. 
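/* Illustration (not part of the patch): the decision order of
 * gsnedf_schedule() above, reduced to the boolean flags it derives.  This is
 * only a trace aid that prints which of the documented actions would be
 * taken for a given state; the flag names match the local variables in the
 * function, everything else is invented for the example.
 */
#include <stdio.h>

struct state {
	int blocks, out_of_time, needs_move, np, sleep, preempt;
};

static void decide(struct state s)
{
	if (s.blocks)
		printf("unlink the blocked job\n");

	if (s.np && (s.out_of_time || s.preempt || s.sleep))
		printf("non-preemptive: unlink + request_exit_np()\n");
	else if (s.np && s.needs_move)
		printf("non-preemptive: move the subjob deadline\n");

	if (!s.np && (s.out_of_time || s.sleep) && !s.blocks && !s.preempt)
		printf("job_completion(forced=%d)\n", !s.sleep);
	else if (!s.np && s.needs_move && !s.blocks && !s.preempt)
		printf("move the subjob deadline\n");

	printf("then: link a pending job if unlinked; switch if linked != scheduled\n");
}

int main(void)
{
	/* a preemptable job that has used up the current subjob's share of
	 * the budget but not the whole job: only the deadline moves */
	struct state s = { .needs_move = 1 };

	decide(s);
	return 0;
}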
+ */ + if (unlikely(gsnedf.release_master == entry->cpu)) { + sched_state_task_picked(); + return NULL; + } +#endif + + raw_spin_lock(&gsnedf_lock); + + /* sanity checking */ + BUG_ON(entry->scheduled && entry->scheduled != prev); + BUG_ON(entry->scheduled && !is_realtime(prev)); + BUG_ON(is_realtime(prev) && !entry->scheduled); + + /* (0) Determine state */ + exists = entry->scheduled != NULL; + blocks = exists && !is_running(entry->scheduled); + out_of_time = exists && + budget_enforced(entry->scheduled) && + budget_exhausted(entry->scheduled); + needs_move = exists && needs_deadline_move(entry->scheduled); + np = exists && is_np(entry->scheduled); + sleep = exists && get_rt_flags(entry->scheduled) == RT_F_SLEEP; + preempt = entry->scheduled != entry->linked; + +#ifdef WANT_ALL_SCHED_EVENTS + TRACE_TASK(prev, "invoked gsnedf_schedule.\n"); +#endif + + if (exists) + TRACE_TASK(prev, + "blocks:%d out_of_time:%d needs_move:%d np:%d" + " sleep:%d preempt:%d state:%d sig:%d\n", + blocks, out_of_time, needs_move, np, sleep, preempt, + prev->state, signal_pending(prev)); + if (entry->linked && preempt) + TRACE_TASK(prev, "will be preempted by %s/%d\n", + entry->linked->comm, entry->linked->pid); + + + /* If a task blocks we have no choice but to reschedule. + */ + if (blocks) + unlink(entry->scheduled); + + /* Request a sys_exit_np() call if we would like to preempt but cannot. + * We need to make sure to update the link structure anyway in case + * that we are still linked. Multiple calls to request_exit_np() don't + * hurt. + * + * Job deadline moves handled similarly + */ + if (np && (out_of_time || preempt || sleep)) { + unlink(entry->scheduled); + request_exit_np(entry->scheduled); + } + else if (np && needs_move) { + move_deadline(entry->scheduled); + } + + /* Any task that is preemptable and either exhausts its execution + * budget or wants to sleep completes. We may have to reschedule after + * this. Don't do a job completion if we block (can't have timers running + * for blocked jobs). Preemption go first for the same reason. + */ + if (!np && (out_of_time || sleep) && !blocks && !preempt) + job_completion(entry->scheduled, !sleep); + else if (!np && needs_move && !blocks && !preempt) { + move_deadline(entry->scheduled); + } + + /* Link pending task if we became unlinked. + */ + if (!entry->linked) + link_task_to_cpu(__take_ready(&gsnedf), entry); + + /* The final scheduling decision. Do we need to switch for some reason? + * If linked is different from scheduled, then select linked as next. + */ + if ((!np || blocks) && + entry->linked != entry->scheduled) { + /* Schedule a linked job? */ + if (entry->linked) { + entry->linked->rt_param.scheduled_on = entry->cpu; + next = entry->linked; + TRACE_TASK(next, "scheduled_on = P%d\n", smp_processor_id()); + } + if (entry->scheduled) { + /* not gonna be scheduled soon */ + entry->scheduled->rt_param.scheduled_on = NO_CPU; + TRACE_TASK(entry->scheduled, "scheduled_on = NO_CPU\n"); + } + } else + /* Only override Linux scheduler if we have a real-time task + * scheduled that needs to continue. 
+ */ + if (exists) + next = prev; + + sched_state_task_picked(); + + raw_spin_unlock(&gsnedf_lock); + + if (next) { + arm_split_timer(entry, next); + } + else if (entry->timer_armed) { + cancel_split_timer(entry); + } + +#ifdef WANT_ALL_SCHED_EVENTS + TRACE("gsnedf_lock released, next=0x%p\n", next); + + if (next) + TRACE_TASK(next, "scheduled at %llu\n", litmus_clock()); + else if (exists && !next) + TRACE("becomes idle at %llu.\n", litmus_clock()); +#endif + + + return next; +} + + +/* _finish_switch - we just finished the switch away from prev + */ +static void gsnedf_finish_switch(struct task_struct *prev) +{ + cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries); + + entry->scheduled = is_realtime(current) ? current : NULL; +#ifdef WANT_ALL_SCHED_EVENTS + TRACE_TASK(prev, "switched away from\n"); +#endif +} + +static void gsnedf_release_at(struct task_struct *t, lt_t start) +{ + t->rt_param.job_params.deadline = start; + prepare_for_next_period(t); + t->rt_param.job_params.subjob_deadline = get_proper_deadline(t); + set_rt_flags(t, RT_F_RUNNING); +} + +/* Prepare a task for running in RT mode + */ +static void gsnedf_task_new(struct task_struct * t, int on_rq, int running) +{ + unsigned long flags; + cpu_entry_t* entry; + + TRACE("gsn edf: task new %d\n", t->pid); + + raw_spin_lock_irqsave(&gsnedf_lock, flags); + + /* setup job params */ + gsnedf_release_at(t, litmus_clock()); + + if (running) { + entry = &per_cpu(gsnedf_cpu_entries, task_cpu(t)); + BUG_ON(entry->scheduled); + +#ifdef CONFIG_RELEASE_MASTER + if (entry->cpu != gsnedf.release_master) { +#endif + entry->scheduled = t; + tsk_rt(t)->scheduled_on = task_cpu(t); +#ifdef CONFIG_RELEASE_MASTER + } else { + /* do not schedule on release master */ + preempt(entry); /* force resched */ + tsk_rt(t)->scheduled_on = NO_CPU; + } +#endif + } else { + t->rt_param.scheduled_on = NO_CPU; + } + t->rt_param.linked_on = NO_CPU; + + gsnedf_job_arrival(t); + raw_spin_unlock_irqrestore(&gsnedf_lock, flags); +} + +static void gsnedf_task_wake_up(struct task_struct *task) +{ + unsigned long flags; + lt_t now; + + TRACE_TASK(task, "wake_up at %llu\n", litmus_clock()); + + raw_spin_lock_irqsave(&gsnedf_lock, flags); + /* We need to take suspensions because of semaphores into + * account! If a job resumes after being suspended due to acquiring + * a semaphore, it should never be treated as a new job release. 
+ */ + if (get_rt_flags(task) == RT_F_EXIT_SEM) { + set_rt_flags(task, RT_F_RUNNING); + } else { + now = litmus_clock(); + if (is_tardy(task, now)) { + /* new sporadic release */ + gsnedf_release_at(task, now); + sched_trace_task_release(task); + } + else { + if (task->rt.time_slice) { + /* came back in time before deadline + */ + set_rt_flags(task, RT_F_RUNNING); + } + } + } + gsnedf_job_arrival(task); + raw_spin_unlock_irqrestore(&gsnedf_lock, flags); +} + +static void gsnedf_task_block(struct task_struct *t) +{ + unsigned long flags; + + TRACE_TASK(t, "block at %llu\n", litmus_clock()); + + /* unlink if necessary */ + raw_spin_lock_irqsave(&gsnedf_lock, flags); + unlink(t); + raw_spin_unlock_irqrestore(&gsnedf_lock, flags); + + BUG_ON(!is_realtime(t)); +} + + +static void gsnedf_task_exit(struct task_struct * t) +{ + unsigned long flags; + + /* unlink if necessary */ + raw_spin_lock_irqsave(&gsnedf_lock, flags); + unlink(t); + if (tsk_rt(t)->scheduled_on != NO_CPU) { + gsnedf_cpus[tsk_rt(t)->scheduled_on]->scheduled = NULL; + tsk_rt(t)->scheduled_on = NO_CPU; + } + raw_spin_unlock_irqrestore(&gsnedf_lock, flags); + + BUG_ON(!is_realtime(t)); + TRACE_TASK(t, "RIP\n"); +} + + +static long gsnedf_admit_task(struct task_struct* tsk) +{ + return 0; +} + +#ifdef CONFIG_LITMUS_LOCKING + +#include + +/* called with IRQs off */ +static void set_priority_inheritance(struct task_struct* t, struct task_struct* prio_inh) +{ + int linked_on; + int check_preempt = 0; + + raw_spin_lock(&gsnedf_lock); + + TRACE_TASK(t, "inherits priority from %s/%d\n", prio_inh->comm, prio_inh->pid); + tsk_rt(t)->inh_task = prio_inh; + + linked_on = tsk_rt(t)->linked_on; + + /* If it is scheduled, then we need to reorder the CPU heap. */ + if (linked_on != NO_CPU) { + TRACE_TASK(t, "%s: linked on %d\n", + __FUNCTION__, linked_on); + /* Holder is scheduled; need to re-order CPUs. + * We can't use heap_decrease() here since + * the cpu_heap is ordered in reverse direction, so + * it is actually an increase. */ + bheap_delete(cpu_lower_prio, &gsnedf_cpu_heap, + gsnedf_cpus[linked_on]->hn); + bheap_insert(cpu_lower_prio, &gsnedf_cpu_heap, + gsnedf_cpus[linked_on]->hn); + } else { + /* holder may be queued: first stop queue changes */ + raw_spin_lock(&gsnedf.release_lock); + if (is_queued(t)) { + TRACE_TASK(t, "%s: is queued\n", + __FUNCTION__); + /* We need to update the position of holder in some + * heap. Note that this could be a release heap if we + * budget enforcement is used and this job overran. */ + check_preempt = + !bheap_decrease(edf_ready_order, + tsk_rt(t)->heap_node); + } else { + /* Nothing to do: if it is not queued and not linked + * then it is either sleeping or currently being moved + * by other code (e.g., a timer interrupt handler) that + * will use the correct priority when enqueuing the + * task. */ + TRACE_TASK(t, "%s: is NOT queued => Done.\n", + __FUNCTION__); + } + raw_spin_unlock(&gsnedf.release_lock); + + /* If holder was enqueued in a release heap, then the following + * preemption check is pointless, but we can't easily detect + * that case. If you want to fix this, then consider that + * simply adding a state flag requires O(n) time to update when + * releasing n tasks, which conflicts with the goal to have + * O(log n) merges. */ + if (check_preempt) { + /* heap_decrease() hit the top level of the heap: make + * sure preemption checks get the right task, not the + * potentially stale cache. 
*/ + bheap_uncache_min(edf_ready_order, + &gsnedf.ready_queue); + check_for_preemptions(); + } + } + + raw_spin_unlock(&gsnedf_lock); +} + +/* called with IRQs off */ +static void update_unlocked_priority(struct task_struct* t) +{ + raw_spin_lock(&gsnedf_lock); + + /* A job only stops inheriting a priority when it releases a + * resource. Thus we can make the following assumption.*/ + BUG_ON(tsk_rt(t)->scheduled_on == NO_CPU); + + /* Clear priority inheritance */ + TRACE_TASK(t, "priority restored\n"); + tsk_rt(t)->inh_task = NULL; + + /* Update splitting deadline */ + tsk_rt(t)->job_params.subjob_deadline = get_proper_deadline(t); + + /* Check if rescheduling is necessary. We can't use heap_decrease() + * since the priority was effectively lowered. */ + unlink(t); + gsnedf_job_arrival(t); + + raw_spin_unlock(&gsnedf_lock); +} + + +/* ******************** FMLP support ********************** */ + +/* struct for semaphore with priority inheritance */ +struct fmlp_semaphore { + struct litmus_lock litmus_lock; + + /* current resource holder */ + struct task_struct *owner; + + /* highest-priority waiter */ + struct task_struct *hp_waiter; + + /* FIFO queue of waiting tasks */ + wait_queue_head_t wait; +}; + +static inline struct fmlp_semaphore* fmlp_from_lock(struct litmus_lock* lock) +{ + return container_of(lock, struct fmlp_semaphore, litmus_lock); +} + +/* caller is responsible for locking */ +static struct task_struct* find_hp_waiter(struct fmlp_semaphore *sem, + struct task_struct* skip) +{ + struct list_head *pos; + struct task_struct *queued, *found = NULL; + + list_for_each(pos, &sem->wait.task_list) { + queued = (struct task_struct*) list_entry(pos, wait_queue_t, + task_list)->private; + + /* Compare task prios, find high prio task. */ + if (queued != skip && edf_higher_prio(queued, found)) + found = queued; + } + return found; +} + +int gsnedf_fmlp_lock(struct litmus_lock* l) +{ + struct task_struct* t = current; + struct fmlp_semaphore *sem = fmlp_from_lock(l); + cpu_entry_t* entry; + wait_queue_t wait; + unsigned long flags; + + if (!is_realtime(t)) + return -EPERM; + + spin_lock_irqsave(&sem->wait.lock, flags); + entry = &__get_cpu_var(gsnedf_cpu_entries); + + tsk_rt(t)->in_crit_section = 1; + if (entry->timer_armed) { + cancel_split_timer(entry); + } + + if (sem->owner) { + /* resource is not free => must suspend and wait */ + + init_waitqueue_entry(&wait, t); + + /* FIXME: interruptible would be nice some day */ + set_task_state(t, TASK_UNINTERRUPTIBLE); + + __add_wait_queue_tail_exclusive(&sem->wait, &wait); + + /* check if we need to activate priority inheritance */ + if (edf_higher_prio(t, sem->hp_waiter)) { + sem->hp_waiter = t; + if (edf_higher_prio(t, sem->owner)) + set_priority_inheritance(sem->owner, sem->hp_waiter); + } + + TS_LOCK_SUSPEND; + + /* release lock before sleeping */ + spin_unlock_irqrestore(&sem->wait.lock, flags); + + /* We depend on the FIFO order. Thus, we don't need to recheck + * when we wake up; we are guaranteed to have the lock since + * there is only one wake up per release. + */ + + schedule(); + + TS_LOCK_RESUME; + + /* Since we hold the lock, no other task will change + * ->owner. We can thus check it without acquiring the spin + * lock. 
*/ + BUG_ON(sem->owner != t); + } else { + /* it's ours now */ + sem->owner = t; + + spin_unlock_irqrestore(&sem->wait.lock, flags); + } + + return 0; +} + +int gsnedf_fmlp_unlock(struct litmus_lock* l) +{ + struct task_struct *t = current, *next; + struct fmlp_semaphore *sem = fmlp_from_lock(l); + unsigned long flags; + int err = 0; + + spin_lock_irqsave(&sem->wait.lock, flags); + + if (sem->owner != t) { + err = -EINVAL; + goto out; + } + + /* check if there are jobs waiting for this resource */ + next = __waitqueue_remove_first(&sem->wait); + if (next) { + /* next becomes the resouce holder */ + sem->owner = next; + TRACE_CUR("lock ownership passed to %s/%d\n", next->comm, next->pid); + + /* determine new hp_waiter if necessary */ + if (next == sem->hp_waiter) { + TRACE_TASK(next, "was highest-prio waiter\n"); + /* next has the highest priority --- it doesn't need to + * inherit. However, we need to make sure that the + * next-highest priority in the queue is reflected in + * hp_waiter. */ + sem->hp_waiter = find_hp_waiter(sem, next); + if (sem->hp_waiter) + TRACE_TASK(sem->hp_waiter, "is new highest-prio waiter\n"); + else + TRACE("no further waiters\n"); + } else { + /* Well, if next is not the highest-priority waiter, + * then it ought to inherit the highest-priority + * waiter's priority. */ + set_priority_inheritance(next, sem->hp_waiter); + } + + /* wake up next */ + wake_up_process(next); + } else + /* becomes available */ + sem->owner = NULL; + + /* We are no longer in a critical section */ + tsk_rt(t)->in_crit_section = 0; + + /* we lose the benefit of priority inheritance (if any) and may need + * to move the deadline. In either case, may need to reschedule + * due to reduced priority. */ + if (tsk_rt(t)->inh_task || needs_deadline_move(t)) + update_unlocked_priority(t); + /* TODO: Check that schedule() gets called - it needs to arm the + * enforcement timer. Otherwise we should do it here or in + * update_unlocked_priority. */ + +out: + spin_unlock_irqrestore(&sem->wait.lock, flags); + + return err; +} + +int gsnedf_fmlp_close(struct litmus_lock* l) +{ + struct task_struct *t = current; + struct fmlp_semaphore *sem = fmlp_from_lock(l); + unsigned long flags; + + int owner; + + spin_lock_irqsave(&sem->wait.lock, flags); + + owner = sem->owner == t; + + spin_unlock_irqrestore(&sem->wait.lock, flags); + + if (owner) + gsnedf_fmlp_unlock(l); + + return 0; +} + +void gsnedf_fmlp_free(struct litmus_lock* lock) +{ + kfree(fmlp_from_lock(lock)); +} + +static struct litmus_lock_ops gsnedf_fmlp_lock_ops = { + .close = gsnedf_fmlp_close, + .lock = gsnedf_fmlp_lock, + .unlock = gsnedf_fmlp_unlock, + .deallocate = gsnedf_fmlp_free, +}; + +static struct litmus_lock* gsnedf_new_fmlp(void) +{ + struct fmlp_semaphore* sem; + + sem = kmalloc(sizeof(*sem), GFP_KERNEL); + if (!sem) + return NULL; + + sem->owner = NULL; + sem->hp_waiter = NULL; + init_waitqueue_head(&sem->wait); + sem->litmus_lock.ops = &gsnedf_fmlp_lock_ops; + + return &sem->litmus_lock; +} + +/* **** lock constructor **** */ + + +static long gsnedf_allocate_lock(struct litmus_lock **lock, int type, + void* __user unused) +{ + int err = -ENXIO; + + /* GSN-EDF currently only supports the FMLP for global resources. 
*/ + switch (type) { + + case FMLP_SEM: + /* Flexible Multiprocessor Locking Protocol */ + *lock = gsnedf_new_fmlp(); + if (*lock) + err = 0; + else + err = -ENOMEM; + break; + + }; + + return err; +} + +#endif + + +static long gsnedf_activate_plugin(void) +{ + int cpu; + cpu_entry_t *entry; + + bheap_init(&gsnedf_cpu_heap); +#ifdef CONFIG_RELEASE_MASTER + gsnedf.release_master = atomic_read(&release_master_cpu); +#endif + + for_each_online_cpu(cpu) { + entry = &per_cpu(gsnedf_cpu_entries, cpu); + bheap_node_init(&entry->hn, entry); + entry->linked = NULL; + entry->scheduled = NULL; +#ifdef CONFIG_RELEASE_MASTER + if (cpu != gsnedf.release_master) { +#endif + TRACE("GSN-EDF: Initializing CPU #%d.\n", cpu); + update_cpu_position(entry); +#ifdef CONFIG_RELEASE_MASTER + } else { + TRACE("GSN-EDF: CPU %d is release master.\n", cpu); + } +#endif + } + return 0; +} + +/* Plugin object */ +static struct sched_plugin gsn_edf_plugin __cacheline_aligned_in_smp = { + .plugin_name = "GSN-EDF", + .finish_switch = gsnedf_finish_switch, + .tick = gsnedf_tick, + .task_new = gsnedf_task_new, + .complete_job = complete_job, + .task_exit = gsnedf_task_exit, + .schedule = gsnedf_schedule, + .release_at = gsnedf_release_at, + .task_wake_up = gsnedf_task_wake_up, + .task_block = gsnedf_task_block, + .admit_task = gsnedf_admit_task, + .activate_plugin = gsnedf_activate_plugin, +#ifdef CONFIG_LITMUS_LOCKING + .allocate_lock = gsnedf_allocate_lock, +#endif +}; + + +static int __init init_gsn_edf(void) +{ + int cpu; + cpu_entry_t *entry; + + bheap_init(&gsnedf_cpu_heap); + /* initialize CPU state */ + for (cpu = 0; cpu < NR_CPUS; cpu++) { + entry = &per_cpu(gsnedf_cpu_entries, cpu); + gsnedf_cpus[cpu] = entry; + entry->cpu = cpu; + entry->hn = &gsnedf_heap_node[cpu]; + hrtimer_init(&entry->split_timer, + CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + entry->split_timer.function = on_split_timeout; + bheap_node_init(&entry->hn, entry); + } + edf_domain_init(&gsnedf, NULL, gsnedf_release_jobs); + return register_sched_plugin(&gsn_edf_plugin); +} + + +module_init(init_gsn_edf); diff --git a/litmus/sched_litmus.c b/litmus/sched_litmus.c new file mode 100644 index 000000000000..5877307a996b --- /dev/null +++ b/litmus/sched_litmus.c @@ -0,0 +1,328 @@ +/* This file is included from kernel/sched.c */ + +#include +#include +#include +#include + +static void update_time_litmus(struct rq *rq, struct task_struct *p) +{ + u64 delta = rq->clock - p->se.exec_start; + if (unlikely((s64)delta < 0)) + delta = 0; + /* per job counter */ + p->rt_param.job_params.exec_time += delta; + /* task counter */ + p->se.sum_exec_runtime += delta; + /* sched_clock() */ + p->se.exec_start = rq->clock; + cpuacct_charge(p, delta); +} + +static void double_rq_lock(struct rq *rq1, struct rq *rq2); +static void double_rq_unlock(struct rq *rq1, struct rq *rq2); + +/* + * litmus_tick gets called by scheduler_tick() with HZ freq + * Interrupts are disabled + */ +static void litmus_tick(struct rq *rq, struct task_struct *p) +{ + TS_PLUGIN_TICK_START; + + if (is_realtime(p)) + update_time_litmus(rq, p); + + /* plugin tick */ + litmus->tick(p); + + TS_PLUGIN_TICK_END; + + return; +} + +static struct task_struct * +litmus_schedule(struct rq *rq, struct task_struct *prev) +{ + struct rq* other_rq; + struct task_struct *next; + + long was_running; + lt_t _maybe_deadlock = 0; + + /* let the plugin schedule */ + next = litmus->schedule(prev); + + sched_state_plugin_check(); + + /* check if a global plugin pulled a task from a different RQ */ + if (next && task_rq(next) 
!= rq) { + /* we need to migrate the task */ + other_rq = task_rq(next); + TRACE_TASK(next, "migrate from %d\n", other_rq->cpu); + + /* while we drop the lock, the prev task could change its + * state + */ + was_running = is_running(prev); + mb(); + raw_spin_unlock(&rq->lock); + + /* Don't race with a concurrent switch. This could deadlock in + * the case of cross or circular migrations. It's the job of + * the plugin to make sure that doesn't happen. + */ + TRACE_TASK(next, "stack_in_use=%d\n", + next->rt_param.stack_in_use); + if (next->rt_param.stack_in_use != NO_CPU) { + TRACE_TASK(next, "waiting to deschedule\n"); + _maybe_deadlock = litmus_clock(); + } + while (next->rt_param.stack_in_use != NO_CPU) { + cpu_relax(); + mb(); + if (next->rt_param.stack_in_use == NO_CPU) + TRACE_TASK(next,"descheduled. Proceeding.\n"); + + if (lt_before(_maybe_deadlock + 10000000, + litmus_clock())) { + /* We've been spinning for 10ms. + * Something can't be right! + * Let's abandon the task and bail out; at least + * we will have debug info instead of a hard + * deadlock. + */ + TRACE_TASK(next,"stack too long in use. " + "Deadlock?\n"); + next = NULL; + + /* bail out */ + raw_spin_lock(&rq->lock); + return next; + } + } +#ifdef __ARCH_WANT_UNLOCKED_CTXSW + if (next->oncpu) + TRACE_TASK(next, "waiting for !oncpu"); + while (next->oncpu) { + cpu_relax(); + mb(); + } +#endif + double_rq_lock(rq, other_rq); + mb(); + if (is_realtime(prev) && is_running(prev) != was_running) { + TRACE_TASK(prev, + "state changed while we dropped" + " the lock: is_running=%d, was_running=%d\n", + is_running(prev), was_running); + if (is_running(prev) && !was_running) { + /* prev task became unblocked + * we need to simulate normal sequence of events + * to scheduler plugins. + */ + litmus->task_block(prev); + litmus->task_wake_up(prev); + } + } + + set_task_cpu(next, smp_processor_id()); + + /* DEBUG: now that we have the lock we need to make sure a + * couple of things still hold: + * - it is still a real-time task + * - it is still runnable (could have been stopped) + * If either is violated, then the active plugin is + * doing something wrong. + */ + if (!is_realtime(next) || !is_running(next)) { + /* BAD BAD BAD */ + TRACE_TASK(next,"BAD: migration invariant FAILED: " + "rt=%d running=%d\n", + is_realtime(next), + is_running(next)); + /* drop the task */ + next = NULL; + } + /* release the other CPU's runqueue, but keep ours */ + raw_spin_unlock(&other_rq->lock); + } + if (next) { + next->rt_param.stack_in_use = rq->cpu; + next->se.exec_start = rq->clock; + } + + update_enforcement_timer(next); + return next; +} + +static void enqueue_task_litmus(struct rq *rq, struct task_struct *p, + int flags) +{ + if (flags & ENQUEUE_WAKEUP) { + sched_trace_task_resume(p); + tsk_rt(p)->present = 1; + /* LITMUS^RT plugins need to update the state + * _before_ making it available in global structures. + * Linux gets away with being lazy about the task state + * update. We can't do that, hence we update the task + * state already here. + * + * WARNING: this needs to be re-evaluated when porting + * to newer kernel versions. 
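+ *
+ * (A plausible reason why the early update matters here: the
+ * plugin's task_wake_up() callback below may immediately link
+ * the task to another CPU, and the remote scheduling path checks
+ * is_running(), i.e. p->state, through these global structures,
+ * so the state must already read as TASK_RUNNING at that point.)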
+ */ + p->state = TASK_RUNNING; + litmus->task_wake_up(p); + + rq->litmus.nr_running++; + } else + TRACE_TASK(p, "ignoring an enqueue, not a wake up.\n"); +} + +static void dequeue_task_litmus(struct rq *rq, struct task_struct *p, + int flags) +{ + if (flags & DEQUEUE_SLEEP) { + litmus->task_block(p); + tsk_rt(p)->present = 0; + sched_trace_task_block(p); + + rq->litmus.nr_running--; + } else + TRACE_TASK(p, "ignoring a dequeue, not going to sleep.\n"); +} + +static void yield_task_litmus(struct rq *rq) +{ + TS_SYSCALL_IN_START; + + TS_SYSCALL_IN_END; + + TRACE_CUR("yields\n"); + + BUG_ON(rq->curr != current); + /* sched_yield() is called to trigger delayed preemptions. + * Thus, mark the current task as needing to be rescheduled. + * This will cause the scheduler plugin to be invoked, which can + * then determine if a preemption is still required. + */ + clear_exit_np(current); + litmus_reschedule_local(); + + TS_SYSCALL_OUT_START; +} + +/* Plugins are responsible for this. + */ +static void check_preempt_curr_litmus(struct rq *rq, struct task_struct *p, int flags) +{ +} + +static void put_prev_task_litmus(struct rq *rq, struct task_struct *p) +{ +} + +static void pre_schedule_litmus(struct rq *rq, struct task_struct *prev) +{ + update_time_litmus(rq, prev); + if (!is_running(prev)) + tsk_rt(prev)->present = 0; +} + +/* pick_next_task_litmus() - litmus_schedule() function + * + * return the next task to be scheduled + */ +static struct task_struct *pick_next_task_litmus(struct rq *rq) +{ + /* get the to-be-switched-out task (prev) */ + struct task_struct *prev = rq->litmus.prev; + struct task_struct *next; + + /* if not called from schedule() but from somewhere + * else (e.g., migration), return now! + */ + if(!rq->litmus.prev) + return NULL; + + rq->litmus.prev = NULL; + + TS_PLUGIN_SCHED_START; + next = litmus_schedule(rq, prev); + TS_PLUGIN_SCHED_END; + + return next; +} + +static void task_tick_litmus(struct rq *rq, struct task_struct *p, int queued) +{ + /* nothing to do; tick related tasks are done by litmus_tick() */ + return; +} + +static void switched_to_litmus(struct rq *rq, struct task_struct *p, int running) +{ +} + +static void prio_changed_litmus(struct rq *rq, struct task_struct *p, + int oldprio, int running) +{ +} + +unsigned int get_rr_interval_litmus(struct rq *rq, struct task_struct *p) +{ + /* return infinity */ + return 0; +} + +/* This is called when a task became a real-time task, either due to a SCHED_* + * class transition or due to PI mutex inheritance. We don't handle Linux PI + * mutex inheritance yet (and probably never will). Use LITMUS provided + * synchronization primitives instead. + */ +static void set_curr_task_litmus(struct rq *rq) +{ + rq->curr->se.exec_start = rq->clock; +} + + +#ifdef CONFIG_SMP +/* execve tries to rebalance task in this scheduling domain. + * We don't care about the scheduling domain; can gets called from + * exec, fork, wakeup. + */ +static int select_task_rq_litmus(struct rq *rq, struct task_struct *p, + int sd_flag, int flags) +{ + /* preemption is already disabled. 
+ * We don't want to change cpu here + */ + return task_cpu(p); +} +#endif + +static const struct sched_class litmus_sched_class = { + .next = &rt_sched_class, + .enqueue_task = enqueue_task_litmus, + .dequeue_task = dequeue_task_litmus, + .yield_task = yield_task_litmus, + + .check_preempt_curr = check_preempt_curr_litmus, + + .pick_next_task = pick_next_task_litmus, + .put_prev_task = put_prev_task_litmus, + +#ifdef CONFIG_SMP + .select_task_rq = select_task_rq_litmus, + + .pre_schedule = pre_schedule_litmus, +#endif + + .set_curr_task = set_curr_task_litmus, + .task_tick = task_tick_litmus, + + .get_rr_interval = get_rr_interval_litmus, + + .prio_changed = prio_changed_litmus, + .switched_to = switched_to_litmus, +}; diff --git a/litmus/sched_litmus.c.rej b/litmus/sched_litmus.c.rej new file mode 100644 index 000000000000..e0750ecbe7a2 --- /dev/null +++ b/litmus/sched_litmus.c.rej @@ -0,0 +1,11 @@ +--- litmus/sched_litmus.c ++++ litmus/sched_litmus.c +@@ -196,7 +196,7 @@ + { + TS_SYSCALL_IN_START; + +- TS_SYSCALL_OUT_END; ++ TS_SYSCALL_IN_END; + + TRACE_CUR("yields\n"); + diff --git a/litmus/sched_pfair.c b/litmus/sched_pfair.c new file mode 100644 index 000000000000..c95bde87b5d7 --- /dev/null +++ b/litmus/sched_pfair.c @@ -0,0 +1,1056 @@ +/* + * kernel/sched_pfair.c + * + * Implementation of the PD^2 pfair scheduling algorithm. This + * implementation realizes "early releasing," i.e., it is work-conserving. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +/* to configure the cluster size */ +#include + +#include + +static enum cache_level pfair_cluster_level = GLOBAL_CLUSTER; + +struct subtask { + /* measured in quanta relative to job release */ + quanta_t release; + quanta_t deadline; + quanta_t overlap; /* called "b bit" by PD^2 */ + quanta_t group_deadline; +}; + +struct pfair_param { + quanta_t quanta; /* number of subtasks */ + quanta_t cur; /* index of current subtask */ + + quanta_t release; /* in quanta */ + quanta_t period; /* in quanta */ + + quanta_t last_quantum; /* when scheduled last */ + int last_cpu; /* where scheduled last */ + + struct pfair_cluster* cluster; /* where this task is scheduled */ + + struct subtask subtasks[0]; /* allocate together with pfair_param */ +}; + +#define tsk_pfair(tsk) ((tsk)->rt_param.pfair) + +struct pfair_state { + struct cluster_cpu topology; + + volatile quanta_t cur_tick; /* updated by the CPU that is advancing + * the time */ + volatile quanta_t local_tick; /* What tick is the local CPU currently + * executing? Updated only by the local + * CPU. In QEMU, this may lag behind the + * current tick. In a real system, with + * proper timers and aligned quanta, + * that should only be the case for a + * very short time after the time + * advanced. With staggered quanta, it + * will lag for the duration of the + * offset. + */ + + struct task_struct* linked; /* the task that should be executing */ + struct task_struct* local; /* the local copy of linked */ + struct task_struct* scheduled; /* what is actually scheduled */ + + lt_t offset; /* stagger offset */ + unsigned int missed_updates; + unsigned int missed_quanta; +}; + +struct pfair_cluster { + struct scheduling_cluster topology; + + /* The "global" time in this cluster. */ + quanta_t pfair_time; /* the "official" PFAIR clock */ + + /* The ready queue for this cluster. 
*/ + rt_domain_t pfair; + + /* The set of jobs that should have their release enacted at the next + * quantum boundary. + */ + struct bheap release_queue; + raw_spinlock_t release_lock; +}; + +static inline struct pfair_cluster* cpu_cluster(struct pfair_state* state) +{ + return container_of(state->topology.cluster, struct pfair_cluster, topology); +} + +static inline int cpu_id(struct pfair_state* state) +{ + return state->topology.id; +} + +static inline struct pfair_state* from_cluster_list(struct list_head* pos) +{ + return list_entry(pos, struct pfair_state, topology.cluster_list); +} + +static inline struct pfair_cluster* from_domain(rt_domain_t* rt) +{ + return container_of(rt, struct pfair_cluster, pfair); +} + +static inline raw_spinlock_t* cluster_lock(struct pfair_cluster* cluster) +{ + /* The ready_lock is used to serialize all scheduling events. */ + return &cluster->pfair.ready_lock; +} + +static inline raw_spinlock_t* cpu_lock(struct pfair_state* state) +{ + return cluster_lock(cpu_cluster(state)); +} + +DEFINE_PER_CPU(struct pfair_state, pfair_state); +struct pfair_state* *pstate; /* short cut */ + +static struct pfair_cluster* pfair_clusters; +static int num_pfair_clusters; + +/* Enable for lots of trace info. + * #define PFAIR_DEBUG + */ + +#ifdef PFAIR_DEBUG +#define PTRACE_TASK(t, f, args...) TRACE_TASK(t, f, ## args) +#define PTRACE(f, args...) TRACE(f, ## args) +#else +#define PTRACE_TASK(t, f, args...) +#define PTRACE(f, args...) +#endif + +/* gcc will inline all of these accessor functions... */ +static struct subtask* cur_subtask(struct task_struct* t) +{ + return tsk_pfair(t)->subtasks + tsk_pfair(t)->cur; +} + +static quanta_t cur_deadline(struct task_struct* t) +{ + return cur_subtask(t)->deadline + tsk_pfair(t)->release; +} + +static quanta_t cur_release(struct task_struct* t) +{ + /* This is early releasing: only the release of the first subtask + * counts. */ + return tsk_pfair(t)->release; +} + +static quanta_t cur_overlap(struct task_struct* t) +{ + return cur_subtask(t)->overlap; +} + +static quanta_t cur_group_deadline(struct task_struct* t) +{ + quanta_t gdl = cur_subtask(t)->group_deadline; + if (gdl) + return gdl + tsk_pfair(t)->release; + else + return gdl; +} + + +static int pfair_higher_prio(struct task_struct* first, + struct task_struct* second) +{ + return /* first task must exist */ + first && ( + /* Does the second task exist and is it a real-time task? If + * not, the first task (which is a RT task) has higher + * priority. + */ + !second || !is_realtime(second) || + + /* Is the (subtask) deadline of the first task earlier? + * Then it has higher priority. + */ + time_before(cur_deadline(first), cur_deadline(second)) || + + /* Do we have a deadline tie? + * Then break by B-bit. + */ + (cur_deadline(first) == cur_deadline(second) && + (cur_overlap(first) > cur_overlap(second) || + + /* Do we have a B-bit tie? + * Then break by group deadline. + */ + (cur_overlap(first) == cur_overlap(second) && + (time_after(cur_group_deadline(first), + cur_group_deadline(second)) || + + /* Do we have a group deadline tie? + * Then break by PID, which are unique. 
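+ *
+ * To summarize the tie-breaking chain implemented by this
+ * expression: the earlier subtask deadline wins; on a deadline
+ * tie, the larger b-bit wins; on a b-bit tie, the *later* group
+ * deadline wins (note the time_after() above); only if all three
+ * tie does the lower PID decide.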
+ */ + (cur_group_deadline(first) == + cur_group_deadline(second) && + first->pid < second->pid)))))); +} + +int pfair_ready_order(struct bheap_node* a, struct bheap_node* b) +{ + return pfair_higher_prio(bheap2task(a), bheap2task(b)); +} + +static void pfair_release_jobs(rt_domain_t* rt, struct bheap* tasks) +{ + struct pfair_cluster* cluster = from_domain(rt); + unsigned long flags; + + raw_spin_lock_irqsave(&cluster->release_lock, flags); + + bheap_union(pfair_ready_order, &cluster->release_queue, tasks); + + raw_spin_unlock_irqrestore(&cluster->release_lock, flags); +} + +static void prepare_release(struct task_struct* t, quanta_t at) +{ + tsk_pfair(t)->release = at; + tsk_pfair(t)->cur = 0; +} + +/* pull released tasks from the release queue */ +static void poll_releases(struct pfair_cluster* cluster) +{ + raw_spin_lock(&cluster->release_lock); + __merge_ready(&cluster->pfair, &cluster->release_queue); + raw_spin_unlock(&cluster->release_lock); +} + +static void check_preempt(struct task_struct* t) +{ + int cpu = NO_CPU; + if (tsk_rt(t)->linked_on != tsk_rt(t)->scheduled_on && + tsk_rt(t)->present) { + /* the task can be scheduled and + * is not scheduled where it ought to be scheduled + */ + cpu = tsk_rt(t)->linked_on != NO_CPU ? + tsk_rt(t)->linked_on : + tsk_rt(t)->scheduled_on; + PTRACE_TASK(t, "linked_on:%d, scheduled_on:%d\n", + tsk_rt(t)->linked_on, tsk_rt(t)->scheduled_on); + /* preempt */ + litmus_reschedule(cpu); + } +} + +/* caller must hold pfair.ready_lock */ +static void drop_all_references(struct task_struct *t) +{ + int cpu; + struct pfair_state* s; + struct pfair_cluster* cluster; + if (bheap_node_in_heap(tsk_rt(t)->heap_node)) { + /* It must be in the ready queue; drop references isn't called + * when the job is in a release queue. */ + cluster = tsk_pfair(t)->cluster; + bheap_delete(pfair_ready_order, &cluster->pfair.ready_queue, + tsk_rt(t)->heap_node); + } + for (cpu = 0; cpu < num_online_cpus(); cpu++) { + s = &per_cpu(pfair_state, cpu); + if (s->linked == t) + s->linked = NULL; + if (s->local == t) + s->local = NULL; + if (s->scheduled == t) + s->scheduled = NULL; + } +} + +static void pfair_prepare_next_period(struct task_struct* t) +{ + struct pfair_param* p = tsk_pfair(t); + + prepare_for_next_period(t); + get_rt_flags(t) = RT_F_RUNNING; + p->release += p->period; +} + +/* returns 1 if the task needs to go the release queue */ +static int advance_subtask(quanta_t time, struct task_struct* t, int cpu) +{ + struct pfair_param* p = tsk_pfair(t); + int to_relq; + p->cur = (p->cur + 1) % p->quanta; + if (!p->cur) { + if (tsk_rt(t)->present) { + /* The job overran; we start a new budget allocation. 
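+ * In other words, all quanta of the current job have been
+ * consumed, so, in line with the early-releasing/work-conserving
+ * design noted at the top of this file, the next job's
+ * parameters are set up right away instead of waiting for the
+ * period boundary.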
*/ + pfair_prepare_next_period(t); + } else { + /* remove task from system until it wakes */ + drop_all_references(t); + TRACE_TASK(t, "on %d advanced to subtask %lu (not present)\n", + cpu, p->cur); + return 0; + } + } + to_relq = time_after(cur_release(t), time); + TRACE_TASK(t, "on %d advanced to subtask %lu -> to_relq=%d (cur_release:%lu time:%lu)\n", + cpu, p->cur, to_relq, cur_release(t), time); + return to_relq; +} + +static void advance_subtasks(struct pfair_cluster *cluster, quanta_t time) +{ + struct task_struct* l; + struct pfair_param* p; + struct list_head* pos; + struct pfair_state* cpu; + + list_for_each(pos, &cluster->topology.cpus) { + cpu = from_cluster_list(pos); + l = cpu->linked; + cpu->missed_updates += cpu->linked != cpu->local; + if (l) { + p = tsk_pfair(l); + p->last_quantum = time; + p->last_cpu = cpu_id(cpu); + if (advance_subtask(time, l, cpu_id(cpu))) { + //cpu->linked = NULL; + PTRACE_TASK(l, "should go to release queue. " + "scheduled_on=%d present=%d\n", + tsk_rt(l)->scheduled_on, + tsk_rt(l)->present); + } + } + } +} + +static int target_cpu(quanta_t time, struct task_struct* t, int default_cpu) +{ + int cpu; + if (tsk_rt(t)->scheduled_on != NO_CPU) { + /* always observe scheduled_on linkage */ + default_cpu = tsk_rt(t)->scheduled_on; + } else if (tsk_pfair(t)->last_quantum == time - 1) { + /* back2back quanta */ + /* Only observe last_quantum if no scheduled_on is in the way. + * This should only kick in if a CPU missed quanta, and that + * *should* only happen in QEMU. + */ + cpu = tsk_pfair(t)->last_cpu; + if (!pstate[cpu]->linked || + tsk_rt(pstate[cpu]->linked)->scheduled_on != cpu) { + default_cpu = cpu; + } + } + return default_cpu; +} + +/* returns one if linking was redirected */ +static int pfair_link(quanta_t time, int cpu, + struct task_struct* t) +{ + int target = target_cpu(time, t, cpu); + struct task_struct* prev = pstate[cpu]->linked; + struct task_struct* other; + struct pfair_cluster* cluster = cpu_cluster(pstate[cpu]); + + if (target != cpu) { + BUG_ON(pstate[target]->topology.cluster != pstate[cpu]->topology.cluster); + other = pstate[target]->linked; + pstate[target]->linked = t; + tsk_rt(t)->linked_on = target; + if (!other) + /* linked ok, but reschedule this CPU */ + return 1; + if (target < cpu) { + /* link other to cpu instead */ + tsk_rt(other)->linked_on = cpu; + pstate[cpu]->linked = other; + if (prev) { + /* prev got pushed back into the ready queue */ + tsk_rt(prev)->linked_on = NO_CPU; + __add_ready(&cluster->pfair, prev); + } + /* we are done with this cpu */ + return 0; + } else { + /* re-add other, it's original CPU was not considered yet */ + tsk_rt(other)->linked_on = NO_CPU; + __add_ready(&cluster->pfair, other); + /* reschedule this CPU */ + return 1; + } + } else { + pstate[cpu]->linked = t; + tsk_rt(t)->linked_on = cpu; + if (prev) { + /* prev got pushed back into the ready queue */ + tsk_rt(prev)->linked_on = NO_CPU; + __add_ready(&cluster->pfair, prev); + } + /* we are done with this CPU */ + return 0; + } +} + +static void schedule_subtasks(struct pfair_cluster *cluster, quanta_t time) +{ + int retry; + struct list_head *pos; + struct pfair_state *cpu_state; + + list_for_each(pos, &cluster->topology.cpus) { + cpu_state = from_cluster_list(pos); + retry = 1; +#ifdef CONFIG_RELEASE_MASTER + /* skip release master */ + if (cluster->pfair.release_master == cpu_id(cpu_state)) + continue; +#endif + while (retry) { + if (pfair_higher_prio(__peek_ready(&cluster->pfair), + cpu_state->linked)) + retry = pfair_link(time, 
cpu_id(cpu_state), + __take_ready(&cluster->pfair)); + else + retry = 0; + } + } +} + +static void schedule_next_quantum(struct pfair_cluster *cluster, quanta_t time) +{ + struct pfair_state *cpu; + struct list_head* pos; + + /* called with interrupts disabled */ + PTRACE("--- Q %lu at %llu PRE-SPIN\n", + time, litmus_clock()); + raw_spin_lock(cluster_lock(cluster)); + PTRACE("<<< Q %lu at %llu\n", + time, litmus_clock()); + + sched_trace_quantum_boundary(); + + advance_subtasks(cluster, time); + poll_releases(cluster); + schedule_subtasks(cluster, time); + + list_for_each(pos, &cluster->topology.cpus) { + cpu = from_cluster_list(pos); + if (cpu->linked) + PTRACE_TASK(cpu->linked, + " linked on %d.\n", cpu_id(cpu)); + else + PTRACE("(null) linked on %d.\n", cpu_id(cpu)); + } + /* We are done. Advance time. */ + mb(); + list_for_each(pos, &cluster->topology.cpus) { + cpu = from_cluster_list(pos); + if (cpu->local_tick != cpu->cur_tick) { + TRACE("BAD Quantum not acked on %d " + "(l:%lu c:%lu p:%lu)\n", + cpu_id(cpu), + cpu->local_tick, + cpu->cur_tick, + cluster->pfair_time); + cpu->missed_quanta++; + } + cpu->cur_tick = time; + } + PTRACE(">>> Q %lu at %llu\n", + time, litmus_clock()); + raw_spin_unlock(cluster_lock(cluster)); +} + +static noinline void wait_for_quantum(quanta_t q, struct pfair_state* state) +{ + quanta_t loc; + + goto first; /* skip mb() on first iteration */ + do { + cpu_relax(); + mb(); + first: loc = state->cur_tick; + /* FIXME: what if loc > cur? */ + } while (time_before(loc, q)); + PTRACE("observed cur_tick:%lu >= q:%lu\n", + loc, q); +} + +static quanta_t current_quantum(struct pfair_state* state) +{ + lt_t t = litmus_clock() - state->offset; + return time2quanta(t, FLOOR); +} + +static void catchup_quanta(quanta_t from, quanta_t target, + struct pfair_state* state) +{ + quanta_t cur = from, time; + TRACE("+++< BAD catching up quanta from %lu to %lu\n", + from, target); + while (time_before(cur, target)) { + wait_for_quantum(cur, state); + cur++; + time = cmpxchg(&cpu_cluster(state)->pfair_time, + cur - 1, /* expected */ + cur /* next */ + ); + if (time == cur - 1) + schedule_next_quantum(cpu_cluster(state), cur); + } + TRACE("+++> catching up done\n"); +} + +/* pfair_tick - this function is called for every local timer + * interrupt. + */ +static void pfair_tick(struct task_struct* t) +{ + struct pfair_state* state = &__get_cpu_var(pfair_state); + quanta_t time, cur; + int retry = 10; + + do { + cur = current_quantum(state); + PTRACE("q %lu at %llu\n", cur, litmus_clock()); + + /* Attempt to advance time. First CPU to get here + * will prepare the next quantum. + */ + time = cmpxchg(&cpu_cluster(state)->pfair_time, + cur - 1, /* expected */ + cur /* next */ + ); + if (time == cur - 1) { + /* exchange succeeded */ + wait_for_quantum(cur - 1, state); + schedule_next_quantum(cpu_cluster(state), cur); + retry = 0; + } else if (time_before(time, cur - 1)) { + /* the whole system missed a tick !? */ + catchup_quanta(time, cur, state); + retry--; + } else if (time_after(time, cur)) { + /* our timer lagging behind!? */ + TRACE("BAD pfair_time:%lu > cur:%lu\n", time, cur); + retry--; + } else { + /* Some other CPU already started scheduling + * this quantum. Let it do its job and then update. + */ + retry = 0; + } + } while (retry); + + /* Spin locally until time advances. */ + wait_for_quantum(cur, state); + + /* copy assignment */ + /* FIXME: what if we race with a future update? Corrupted state? 
*/ + state->local = state->linked; + /* signal that we are done */ + mb(); + state->local_tick = state->cur_tick; + + if (state->local != current + && (is_realtime(current) || is_present(state->local))) + litmus_reschedule_local(); +} + +static int safe_to_schedule(struct task_struct* t, int cpu) +{ + int where = tsk_rt(t)->scheduled_on; + if (where != NO_CPU && where != cpu) { + TRACE_TASK(t, "BAD: can't be scheduled on %d, " + "scheduled already on %d.\n", cpu, where); + return 0; + } else + return tsk_rt(t)->present && get_rt_flags(t) == RT_F_RUNNING; +} + +static struct task_struct* pfair_schedule(struct task_struct * prev) +{ + struct pfair_state* state = &__get_cpu_var(pfair_state); + struct pfair_cluster* cluster = cpu_cluster(state); + int blocks, completion, out_of_time; + struct task_struct* next = NULL; + +#ifdef CONFIG_RELEASE_MASTER + /* Bail out early if we are the release master. + * The release master never schedules any real-time tasks. + */ + if (unlikely(cluster->pfair.release_master == cpu_id(state))) { + sched_state_task_picked(); + return NULL; + } +#endif + + raw_spin_lock(cpu_lock(state)); + + blocks = is_realtime(prev) && !is_running(prev); + completion = is_realtime(prev) && get_rt_flags(prev) == RT_F_SLEEP; + out_of_time = is_realtime(prev) && time_after(cur_release(prev), + state->local_tick); + + if (is_realtime(prev)) + PTRACE_TASK(prev, "blocks:%d completion:%d out_of_time:%d\n", + blocks, completion, out_of_time); + + if (completion) { + sched_trace_task_completion(prev, 0); + pfair_prepare_next_period(prev); + prepare_release(prev, cur_release(prev)); + } + + if (!blocks && (completion || out_of_time)) { + drop_all_references(prev); + sched_trace_task_release(prev); + add_release(&cluster->pfair, prev); + } + + if (state->local && safe_to_schedule(state->local, cpu_id(state))) + next = state->local; + + if (prev != next) { + tsk_rt(prev)->scheduled_on = NO_CPU; + if (next) + tsk_rt(next)->scheduled_on = cpu_id(state); + } + sched_state_task_picked(); + raw_spin_unlock(cpu_lock(state)); + + if (next) + TRACE_TASK(next, "scheduled rel=%lu at %lu (%llu)\n", + tsk_pfair(next)->release, cpu_cluster(state)->pfair_time, litmus_clock()); + else if (is_realtime(prev)) + TRACE("Becomes idle at %lu (%llu)\n", cpu_cluster(state)->pfair_time, litmus_clock()); + + return next; +} + +static void pfair_task_new(struct task_struct * t, int on_rq, int running) +{ + unsigned long flags; + struct pfair_cluster* cluster; + + TRACE("pfair: task new %d state:%d\n", t->pid, t->state); + + cluster = tsk_pfair(t)->cluster; + + raw_spin_lock_irqsave(cluster_lock(cluster), flags); + + prepare_release(t, cluster->pfair_time + 1); + + t->rt_param.scheduled_on = NO_CPU; + + if (running) { +#ifdef CONFIG_RELEASE_MASTER + if (task_cpu(t) != cluster->pfair.release_master) +#endif + t->rt_param.scheduled_on = task_cpu(t); + __add_ready(&cluster->pfair, t); + } + + check_preempt(t); + + raw_spin_unlock_irqrestore(cluster_lock(cluster), flags); +} + +static void pfair_task_wake_up(struct task_struct *t) +{ + unsigned long flags; + lt_t now; + struct pfair_cluster* cluster; + + cluster = tsk_pfair(t)->cluster; + + TRACE_TASK(t, "wakes at %llu, release=%lu, pfair_time:%lu\n", + litmus_clock(), cur_release(t), cluster->pfair_time); + + raw_spin_lock_irqsave(cluster_lock(cluster), flags); + + /* If a task blocks and wakes before its next job release, + * then it may resume if it is currently linked somewhere + * (as if it never blocked at all). Otherwise, we have a + * new sporadic job release. 
+ */ + now = litmus_clock(); + if (lt_before(get_deadline(t), now)) { + release_at(t, now); + prepare_release(t, time2quanta(now, CEIL)); + sched_trace_task_release(t); + } + + /* only add to ready queue if the task isn't still linked somewhere */ + if (tsk_rt(t)->linked_on == NO_CPU) + __add_ready(&cluster->pfair, t); + + check_preempt(t); + + raw_spin_unlock_irqrestore(cluster_lock(cluster), flags); + TRACE_TASK(t, "wake up done at %llu\n", litmus_clock()); +} + +static void pfair_task_block(struct task_struct *t) +{ + BUG_ON(!is_realtime(t)); + TRACE_TASK(t, "blocks at %llu, state:%d\n", + litmus_clock(), t->state); +} + +static void pfair_task_exit(struct task_struct * t) +{ + unsigned long flags; + struct pfair_cluster *cluster; + + BUG_ON(!is_realtime(t)); + + cluster = tsk_pfair(t)->cluster; + + /* Remote task from release or ready queue, and ensure + * that it is not the scheduled task for ANY CPU. We + * do this blanket check because occassionally when + * tasks exit while blocked, the task_cpu of the task + * might not be the same as the CPU that the PFAIR scheduler + * has chosen for it. + */ + raw_spin_lock_irqsave(cluster_lock(cluster), flags); + + TRACE_TASK(t, "RIP, state:%d\n", t->state); + drop_all_references(t); + + raw_spin_unlock_irqrestore(cluster_lock(cluster), flags); + + kfree(t->rt_param.pfair); + t->rt_param.pfair = NULL; +} + + +static void pfair_release_at(struct task_struct* task, lt_t start) +{ + unsigned long flags; + quanta_t release; + + struct pfair_cluster *cluster; + + cluster = tsk_pfair(task)->cluster; + + BUG_ON(!is_realtime(task)); + + raw_spin_lock_irqsave(cluster_lock(cluster), flags); + release_at(task, start); + release = time2quanta(start, CEIL); + + TRACE_TASK(task, "sys release at %lu\n", release); + + drop_all_references(task); + prepare_release(task, release); + add_release(&cluster->pfair, task); + + raw_spin_unlock_irqrestore(cluster_lock(cluster), flags); +} + +static void init_subtask(struct subtask* sub, unsigned long i, + lt_t quanta, lt_t period) +{ + /* since i is zero-based, the formulas are shifted by one */ + lt_t tmp; + + /* release */ + tmp = period * i; + do_div(tmp, quanta); /* floor */ + sub->release = (quanta_t) tmp; + + /* deadline */ + tmp = period * (i + 1); + if (do_div(tmp, quanta)) /* ceil */ + tmp++; + sub->deadline = (quanta_t) tmp; + + /* next release */ + tmp = period * (i + 1); + do_div(tmp, quanta); /* floor */ + sub->overlap = sub->deadline - (quanta_t) tmp; + + /* Group deadline. + * Based on the formula given in Uma's thesis. 
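+ *
+ * Worked example (hypothetical task, not taken from the original
+ * code): for quanta = 3 and period = 5 -- a "heavy" task, since
+ * 2 * 3 >= 5 -- the formulas in this function yield, for
+ * i = 0, 1, 2:
+ *   release         = 0, 1, 3
+ *   deadline        = 2, 4, 5
+ *   overlap (b-bit) = 1, 1, 0
+ *   group_deadline  = 3, 5, 5
+ * (all values in quanta, relative to the job release).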
+ */ + if (2 * quanta >= period) { + /* heavy */ + tmp = (sub->deadline - (i + 1)) * period; + if (period > quanta && + do_div(tmp, (period - quanta))) /* ceil */ + tmp++; + sub->group_deadline = (quanta_t) tmp; + } else + sub->group_deadline = 0; +} + +static void dump_subtasks(struct task_struct* t) +{ + unsigned long i; + for (i = 0; i < t->rt_param.pfair->quanta; i++) + TRACE_TASK(t, "SUBTASK %lu: rel=%lu dl=%lu bbit:%lu gdl:%lu\n", + i + 1, + t->rt_param.pfair->subtasks[i].release, + t->rt_param.pfair->subtasks[i].deadline, + t->rt_param.pfair->subtasks[i].overlap, + t->rt_param.pfair->subtasks[i].group_deadline); +} + +static long pfair_admit_task(struct task_struct* t) +{ + lt_t quanta; + lt_t period; + s64 quantum_length = ktime_to_ns(tick_period); + struct pfair_param* param; + unsigned long i; + + /* first check that the task is in the right cluster */ + if (cpu_cluster(pstate[tsk_rt(t)->task_params.cpu]) != + cpu_cluster(pstate[task_cpu(t)])) + return -EINVAL; + + /* Pfair is a tick-based method, so the time + * of interest is jiffies. Calculate tick-based + * times for everything. + * (Ceiling of exec cost, floor of period.) + */ + + quanta = get_exec_cost(t); + period = get_rt_period(t); + + quanta = time2quanta(get_exec_cost(t), CEIL); + + if (do_div(period, quantum_length)) + printk(KERN_WARNING + "The period of %s/%d is not a multiple of %llu.\n", + t->comm, t->pid, (unsigned long long) quantum_length); + + if (quanta == period) { + /* special case: task has weight 1.0 */ + printk(KERN_INFO + "Admitting weight 1.0 task. (%s/%d, %llu, %llu).\n", + t->comm, t->pid, quanta, period); + quanta = 1; + period = 1; + } + + param = kmalloc(sizeof(*param) + + quanta * sizeof(struct subtask), GFP_ATOMIC); + + if (!param) + return -ENOMEM; + + param->quanta = quanta; + param->cur = 0; + param->release = 0; + param->period = period; + + param->cluster = cpu_cluster(pstate[tsk_rt(t)->task_params.cpu]); + + for (i = 0; i < quanta; i++) + init_subtask(param->subtasks + i, i, quanta, period); + + if (t->rt_param.pfair) + /* get rid of stale allocation */ + kfree(t->rt_param.pfair); + + t->rt_param.pfair = param; + + /* spew out some debug info */ + dump_subtasks(t); + + return 0; +} + +static void pfair_init_cluster(struct pfair_cluster* cluster) +{ + rt_domain_init(&cluster->pfair, pfair_ready_order, NULL, pfair_release_jobs); + bheap_init(&cluster->release_queue); + raw_spin_lock_init(&cluster->release_lock); + INIT_LIST_HEAD(&cluster->topology.cpus); +} + +static void cleanup_clusters(void) +{ + int i; + + if (num_pfair_clusters) + kfree(pfair_clusters); + pfair_clusters = NULL; + num_pfair_clusters = 0; + + /* avoid stale pointers */ + for (i = 0; i < num_online_cpus(); i++) { + pstate[i]->topology.cluster = NULL; + printk("P%d missed %u updates and %u quanta.\n", cpu_id(pstate[i]), + pstate[i]->missed_updates, pstate[i]->missed_quanta); + } +} + +static long pfair_activate_plugin(void) +{ + int err, i; + struct pfair_state* state; + struct pfair_cluster* cluster ; + quanta_t now; + int cluster_size; + struct cluster_cpu* cpus[NR_CPUS]; + struct scheduling_cluster* clust[NR_CPUS]; + + cluster_size = get_cluster_size(pfair_cluster_level); + + if (cluster_size <= 0 || num_online_cpus() % cluster_size != 0) + return -EINVAL; + + num_pfair_clusters = num_online_cpus() / cluster_size; + + pfair_clusters = kzalloc(num_pfair_clusters * sizeof(struct pfair_cluster), GFP_ATOMIC); + if (!pfair_clusters) { + num_pfair_clusters = 0; + printk(KERN_ERR "Could not allocate Pfair clusters!\n"); + 
return -ENOMEM; + } + + state = &__get_cpu_var(pfair_state); + now = current_quantum(state); + TRACE("Activating PFAIR at q=%lu\n", now); + + for (i = 0; i < num_pfair_clusters; i++) { + cluster = &pfair_clusters[i]; + pfair_init_cluster(cluster); + cluster->pfair_time = now; + clust[i] = &cluster->topology; +#ifdef CONFIG_RELEASE_MASTER + cluster->pfair.release_master = atomic_read(&release_master_cpu); +#endif + } + + for (i = 0; i < num_online_cpus(); i++) { + state = &per_cpu(pfair_state, i); + state->cur_tick = now; + state->local_tick = now; + state->missed_quanta = 0; + state->missed_updates = 0; + state->offset = cpu_stagger_offset(i); + printk(KERN_ERR "cpus[%d] set; %d\n", i, num_online_cpus()); + cpus[i] = &state->topology; + } + + err = assign_cpus_to_clusters(pfair_cluster_level, clust, num_pfair_clusters, + cpus, num_online_cpus()); + + if (err < 0) + cleanup_clusters(); + + return err; +} + +static long pfair_deactivate_plugin(void) +{ + cleanup_clusters(); + return 0; +} + +/* Plugin object */ +static struct sched_plugin pfair_plugin __cacheline_aligned_in_smp = { + .plugin_name = "PFAIR", + .tick = pfair_tick, + .task_new = pfair_task_new, + .task_exit = pfair_task_exit, + .schedule = pfair_schedule, + .task_wake_up = pfair_task_wake_up, + .task_block = pfair_task_block, + .admit_task = pfair_admit_task, + .release_at = pfair_release_at, + .complete_job = complete_job, + .activate_plugin = pfair_activate_plugin, + .deactivate_plugin = pfair_deactivate_plugin, +}; + + +static struct proc_dir_entry *cluster_file = NULL, *pfair_dir = NULL; + +static int __init init_pfair(void) +{ + int cpu, err, fs; + struct pfair_state *state; + + /* + * initialize short_cut for per-cpu pfair state; + * there may be a problem here if someone removes a cpu + * while we are doing this initialization... and if cpus + * are added / removed later... but we don't support CPU hotplug atm anyway. + */ + pstate = kmalloc(sizeof(struct pfair_state*) * num_online_cpus(), GFP_KERNEL); + + /* initialize CPU state */ + for (cpu = 0; cpu < num_online_cpus(); cpu++) { + state = &per_cpu(pfair_state, cpu); + state->topology.id = cpu; + state->cur_tick = 0; + state->local_tick = 0; + state->linked = NULL; + state->local = NULL; + state->scheduled = NULL; + state->missed_quanta = 0; + state->offset = cpu_stagger_offset(cpu); + pstate[cpu] = state; + } + + pfair_clusters = NULL; + num_pfair_clusters = 0; + + err = register_sched_plugin(&pfair_plugin); + if (!err) { + fs = make_plugin_proc_dir(&pfair_plugin, &pfair_dir); + if (!fs) + cluster_file = create_cluster_file(pfair_dir, &pfair_cluster_level); + else + printk(KERN_ERR "Could not allocate PFAIR procfs dir.\n"); + } + + return err; +} + +static void __exit clean_pfair(void) +{ + kfree(pstate); + + if (cluster_file) + remove_proc_entry("cluster", pfair_dir); + if (pfair_dir) + remove_plugin_proc_dir(&pfair_plugin); +} + +module_init(init_pfair); +module_exit(clean_pfair); diff --git a/litmus/sched_pfp.c b/litmus/sched_pfp.c new file mode 100644 index 000000000000..74a77e7a4959 --- /dev/null +++ b/litmus/sched_pfp.c @@ -0,0 +1,1542 @@ +/* + * litmus/sched_pfp.c + * + * Implementation of partitioned fixed-priority scheduling. + * Based on PSN-EDF. 
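+ *
+ * Besides the scheduler itself, this file also contains the
+ * locking support implemented further below: FMLP and MPCP
+ * semaphores, priority boosting, and the priority query used
+ * for SRP (pfp_get_srp_prio()).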
+ */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + + +typedef struct { + rt_domain_t domain; + struct fp_prio_queue ready_queue; + int cpu; + struct task_struct* scheduled; /* only RT tasks */ +/* + * scheduling lock slock + * protects the domain and serializes scheduling decisions + */ +#define slock domain.ready_lock + +} pfp_domain_t; + +DEFINE_PER_CPU(pfp_domain_t, pfp_domains); + +pfp_domain_t* pfp_doms[NR_CPUS]; + +#define local_pfp (&__get_cpu_var(pfp_domains)) +#define remote_dom(cpu) (&per_cpu(pfp_domains, cpu).domain) +#define remote_pfp(cpu) (&per_cpu(pfp_domains, cpu)) +#define task_dom(task) remote_dom(get_partition(task)) +#define task_pfp(task) remote_pfp(get_partition(task)) + +/* we assume the lock is being held */ +static void preempt(pfp_domain_t *pfp) +{ + preempt_if_preemptable(pfp->scheduled, pfp->cpu); +} + +static unsigned int priority_index(struct task_struct* t) +{ +#ifdef CONFIG_LOCKING + if (unlikely(t->rt_param.inh_task)) + /* use effective priority */ + t = t->rt_param.inh_task; + + if (is_priority_boosted(t)) { + /* zero is reserved for priority-boosted tasks */ + return 0; + } else +#endif + return get_priority(t); +} + + +static void pfp_release_jobs(rt_domain_t* rt, struct bheap* tasks) +{ + pfp_domain_t *pfp = container_of(rt, pfp_domain_t, domain); + unsigned long flags; + struct task_struct* t; + struct bheap_node* hn; + + raw_spin_lock_irqsave(&pfp->slock, flags); + + while (!bheap_empty(tasks)) { + hn = bheap_take(fp_ready_order, tasks); + t = bheap2task(hn); + TRACE_TASK(t, "released (part:%d prio:%d)\n", + get_partition(t), get_priority(t)); + fp_prio_add(&pfp->ready_queue, t, priority_index(t)); + } + + /* do we need to preempt? */ + if (fp_higher_prio(fp_prio_peek(&pfp->ready_queue), pfp->scheduled)) { + TRACE_CUR("preempted by new release\n"); + preempt(pfp); + } + + raw_spin_unlock_irqrestore(&pfp->slock, flags); +} + +static void pfp_domain_init(pfp_domain_t* pfp, + int cpu) +{ + fp_domain_init(&pfp->domain, NULL, pfp_release_jobs); + pfp->cpu = cpu; + pfp->scheduled = NULL; + fp_prio_queue_init(&pfp->ready_queue); +} + +static void requeue(struct task_struct* t, pfp_domain_t *pfp) +{ + if (t->state != TASK_RUNNING) + TRACE_TASK(t, "requeue: !TASK_RUNNING\n"); + + set_rt_flags(t, RT_F_RUNNING); + if (is_released(t, litmus_clock())) + fp_prio_add(&pfp->ready_queue, t, priority_index(t)); + else + add_release(&pfp->domain, t); /* it has got to wait */ +} + +static void job_completion(struct task_struct* t, int forced) +{ + sched_trace_task_completion(t,forced); + TRACE_TASK(t, "job_completion().\n"); + + set_rt_flags(t, RT_F_SLEEP); + prepare_for_next_period(t); +} + +static void pfp_tick(struct task_struct *t) +{ + pfp_domain_t *pfp = local_pfp; + + /* Check for inconsistency. 
We don't need the lock for this since + * ->scheduled is only changed in schedule, which obviously is not + * executing in parallel on this CPU + */ + BUG_ON(is_realtime(t) && t != pfp->scheduled); + + if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) { + if (!is_np(t)) { + litmus_reschedule_local(); + TRACE("pfp_scheduler_tick: " + "%d is preemptable " + " => FORCE_RESCHED\n", t->pid); + } else if (is_user_np(t)) { + TRACE("pfp_scheduler_tick: " + "%d is non-preemptable, " + "preemption delayed.\n", t->pid); + request_exit_np(t); + } + } +} + +static struct task_struct* pfp_schedule(struct task_struct * prev) +{ + pfp_domain_t* pfp = local_pfp; + struct task_struct* next; + + int out_of_time, sleep, preempt, np, exists, blocks, resched, migrate; + + raw_spin_lock(&pfp->slock); + + /* sanity checking + * differently from gedf, when a task exits (dead) + * pfp->schedule may be null and prev _is_ realtime + */ + BUG_ON(pfp->scheduled && pfp->scheduled != prev); + BUG_ON(pfp->scheduled && !is_realtime(prev)); + + /* (0) Determine state */ + exists = pfp->scheduled != NULL; + blocks = exists && !is_running(pfp->scheduled); + out_of_time = exists && + budget_enforced(pfp->scheduled) && + budget_exhausted(pfp->scheduled); + np = exists && is_np(pfp->scheduled); + sleep = exists && get_rt_flags(pfp->scheduled) == RT_F_SLEEP; + migrate = exists && get_partition(pfp->scheduled) != pfp->cpu; + preempt = migrate || fp_preemption_needed(&pfp->ready_queue, prev); + + /* If we need to preempt do so. + * The following checks set resched to 1 in case of special + * circumstances. + */ + resched = preempt; + + /* If a task blocks we have no choice but to reschedule. + */ + if (blocks) + resched = 1; + + /* Request a sys_exit_np() call if we would like to preempt but cannot. + * Multiple calls to request_exit_np() don't hurt. + */ + if (np && (out_of_time || preempt || sleep)) + request_exit_np(pfp->scheduled); + + /* Any task that is preemptable and either exhausts its execution + * budget or wants to sleep completes. We may have to reschedule after + * this. + */ + if (!np && (out_of_time || sleep) && !blocks && !migrate) { + job_completion(pfp->scheduled, !sleep); + resched = 1; + } + + /* The final scheduling decision. Do we need to switch for some reason? + * Switch if we are in RT mode and have no task or if we need to + * resched. + */ + next = NULL; + if ((!np || blocks) && (resched || !exists)) { + /* When preempting a task that does not block, then + * re-insert it into either the ready queue or the + * release queue (if it completed). requeue() picks + * the appropriate queue. + */ + if (pfp->scheduled && !blocks && !migrate) + requeue(pfp->scheduled, pfp); + next = fp_prio_take(&pfp->ready_queue); + } else + /* Only override Linux scheduler if we have a real-time task + * scheduled that needs to continue. 
+ */ + if (exists) + next = prev; + + if (next) { + TRACE_TASK(next, "scheduled at %llu\n", litmus_clock()); + set_rt_flags(next, RT_F_RUNNING); + } else { + TRACE("becoming idle at %llu\n", litmus_clock()); + } + + pfp->scheduled = next; + sched_state_task_picked(); + raw_spin_unlock(&pfp->slock); + + return next; +} + +#ifdef CONFIG_LITMUS_LOCKING + +/* prev is no longer scheduled --- see if it needs to migrate */ +static void pfp_finish_switch(struct task_struct *prev) +{ + pfp_domain_t *to; + + if (is_realtime(prev) && + is_running(prev) && + get_partition(prev) != smp_processor_id()) { + TRACE_TASK(prev, "needs to migrate from P%d to P%d\n", + smp_processor_id(), get_partition(prev)); + + to = task_pfp(prev); + + raw_spin_lock(&to->slock); + + TRACE_TASK(prev, "adding to queue on P%d\n", to->cpu); + requeue(prev, to); + if (fp_preemption_needed(&to->ready_queue, to->scheduled)) + preempt(to); + + raw_spin_unlock(&to->slock); + + } +} + +#endif + +/* Prepare a task for running in RT mode + */ +static void pfp_task_new(struct task_struct * t, int on_rq, int running) +{ + pfp_domain_t* pfp = task_pfp(t); + unsigned long flags; + + TRACE_TASK(t, "P-FP: task new, cpu = %d\n", + t->rt_param.task_params.cpu); + + /* setup job parameters */ + release_at(t, litmus_clock()); + + /* The task should be running in the queue, otherwise signal + * code will try to wake it up with fatal consequences. + */ + raw_spin_lock_irqsave(&pfp->slock, flags); + if (running) { + /* there shouldn't be anything else running at the time */ + BUG_ON(pfp->scheduled); + pfp->scheduled = t; + } else { + requeue(t, pfp); + /* maybe we have to reschedule */ + preempt(pfp); + } + raw_spin_unlock_irqrestore(&pfp->slock, flags); +} + +static void pfp_task_wake_up(struct task_struct *task) +{ + unsigned long flags; + pfp_domain_t* pfp = task_pfp(task); + lt_t now; + + TRACE_TASK(task, "wake_up at %llu\n", litmus_clock()); + raw_spin_lock_irqsave(&pfp->slock, flags); + +#ifdef CONFIG_LITMUS_LOCKING + /* Should only be queued when processing a fake-wake up due to a + * migration-related state change. */ + if (unlikely(is_queued(task))) { + TRACE_TASK(task, "WARNING: waking task still queued. Is this right?\n"); + goto out_unlock; + } +#else + BUG_ON(is_queued(task)); +#endif + now = litmus_clock(); + if (is_tardy(task, now) +#ifdef CONFIG_LITMUS_LOCKING + /* We need to take suspensions because of semaphores into + * account! If a job resumes after being suspended due to acquiring + * a semaphore, it should never be treated as a new job release. + */ + && !is_priority_boosted(task) +#endif + ) { + /* new sporadic release */ + release_at(task, now); + sched_trace_task_release(task); + } + + /* Only add to ready queue if it is not the currently-scheduled + * task. This could be the case if a task was woken up concurrently + * on a remote CPU before the executing CPU got around to actually + * de-scheduling the task, i.e., wake_up() raced with schedule() + * and won. Also, don't requeue if it is still queued, which can + * happen under the DPCP due wake-ups racing with migrations. + */ + if (pfp->scheduled != task) + requeue(task, pfp); + +out_unlock: + raw_spin_unlock_irqrestore(&pfp->slock, flags); + TRACE_TASK(task, "wake up done\n"); +} + +static void pfp_task_block(struct task_struct *t) +{ + /* only running tasks can block, thus t is in no queue */ + TRACE_TASK(t, "block at %llu, state=%d\n", litmus_clock(), t->state); + + BUG_ON(!is_realtime(t)); + + /* If this task blocked normally, it shouldn't be queued. 
The exception is + * if this is a simulated block()/wakeup() pair from the pull-migration code path. + * This should only happen if the DPCP is being used. + */ +#ifdef CONFIG_LITMUS_LOCKING + if (unlikely(is_queued(t))) + TRACE_TASK(t, "WARNING: blocking task still queued. Is this right?\n"); +#else + BUG_ON(is_queued(t)); +#endif +} + +static void pfp_task_exit(struct task_struct * t) +{ + unsigned long flags; + pfp_domain_t* pfp = task_pfp(t); + rt_domain_t* dom; + + raw_spin_lock_irqsave(&pfp->slock, flags); + if (is_queued(t)) { + BUG(); /* This currently doesn't work. */ + /* dequeue */ + dom = task_dom(t); + remove(dom, t); + } + if (pfp->scheduled == t) { + pfp->scheduled = NULL; + preempt(pfp); + } + TRACE_TASK(t, "RIP, now reschedule\n"); + + raw_spin_unlock_irqrestore(&pfp->slock, flags); +} + +#ifdef CONFIG_LITMUS_LOCKING + +#include +#include + +static void fp_dequeue(pfp_domain_t* pfp, struct task_struct* t) +{ + BUG_ON(pfp->scheduled == t && is_queued(t)); + if (is_queued(t)) + fp_prio_remove(&pfp->ready_queue, t, priority_index(t)); +} + +static void fp_set_prio_inh(pfp_domain_t* pfp, struct task_struct* t, + struct task_struct* prio_inh) +{ + int requeue; + + if (!t || t->rt_param.inh_task == prio_inh) { + /* no update required */ + if (t) + TRACE_TASK(t, "no prio-inh update required\n"); + return; + } + + requeue = is_queued(t); + TRACE_TASK(t, "prio-inh: is_queued:%d\n", requeue); + + if (requeue) + /* first remove */ + fp_dequeue(pfp, t); + + t->rt_param.inh_task = prio_inh; + + if (requeue) + /* add again to the right queue */ + fp_prio_add(&pfp->ready_queue, t, priority_index(t)); +} + +static int effective_agent_priority(int prio) +{ + /* make sure agents have higher priority */ + return prio - LITMUS_MAX_PRIORITY; +} + +static lt_t prio_point(int eprio) +{ + /* make sure we have non-negative prio points */ + return eprio + LITMUS_MAX_PRIORITY; +} + +static int prio_from_point(lt_t prio_point) +{ + return ((int) prio_point) - LITMUS_MAX_PRIORITY; +} + +static void boost_priority(struct task_struct* t, lt_t priority_point) +{ + unsigned long flags; + pfp_domain_t* pfp = task_pfp(t); + + raw_spin_lock_irqsave(&pfp->slock, flags); + + + TRACE_TASK(t, "priority boosted at %llu\n", litmus_clock()); + + tsk_rt(t)->priority_boosted = 1; + /* tie-break by protocol-specific priority point */ + tsk_rt(t)->boost_start_time = priority_point; + + if (pfp->scheduled != t) { + /* holder may be queued: first stop queue changes */ + raw_spin_lock(&pfp->domain.release_lock); + if (is_queued(t) && + /* If it is queued, then we need to re-order. */ + bheap_decrease(fp_ready_order, tsk_rt(t)->heap_node) && + /* If we bubbled to the top, then we need to check for preemptions. 
*/ + fp_preemption_needed(&pfp->ready_queue, pfp->scheduled)) + preempt(pfp); + raw_spin_unlock(&pfp->domain.release_lock); + } /* else: nothing to do since the job is not queued while scheduled */ + + raw_spin_unlock_irqrestore(&pfp->slock, flags); +} + +static void unboost_priority(struct task_struct* t) +{ + unsigned long flags; + pfp_domain_t* pfp = task_pfp(t); + lt_t now; + + raw_spin_lock_irqsave(&pfp->slock, flags); + now = litmus_clock(); + + /* assumption: this only happens when the job is scheduled */ + BUG_ON(pfp->scheduled != t); + + TRACE_TASK(t, "priority restored at %llu\n", now); + + /* priority boosted jobs must be scheduled */ + BUG_ON(pfp->scheduled != t); + + tsk_rt(t)->priority_boosted = 0; + tsk_rt(t)->boost_start_time = 0; + + /* check if this changes anything */ + if (fp_preemption_needed(&pfp->ready_queue, pfp->scheduled)) + preempt(pfp); + + raw_spin_unlock_irqrestore(&pfp->slock, flags); +} + +/* ******************** SRP support ************************ */ + +static unsigned int pfp_get_srp_prio(struct task_struct* t) +{ + return get_priority(t); +} + +/* ******************** FMLP support ********************** */ + +struct fmlp_semaphore { + struct litmus_lock litmus_lock; + + /* current resource holder */ + struct task_struct *owner; + + /* FIFO queue of waiting tasks */ + wait_queue_head_t wait; +}; + +static inline struct fmlp_semaphore* fmlp_from_lock(struct litmus_lock* lock) +{ + return container_of(lock, struct fmlp_semaphore, litmus_lock); +} +int pfp_fmlp_lock(struct litmus_lock* l) +{ + struct task_struct* t = current; + struct fmlp_semaphore *sem = fmlp_from_lock(l); + wait_queue_t wait; + unsigned long flags; + lt_t time_of_request; + + if (!is_realtime(t)) + return -EPERM; + + spin_lock_irqsave(&sem->wait.lock, flags); + + /* tie-break by this point in time */ + time_of_request = litmus_clock(); + + /* Priority-boost ourself *before* we suspend so that + * our priority is boosted when we resume. */ + boost_priority(t, time_of_request); + + if (sem->owner) { + /* resource is not free => must suspend and wait */ + + init_waitqueue_entry(&wait, t); + + /* FIXME: interruptible would be nice some day */ + set_task_state(t, TASK_UNINTERRUPTIBLE); + + __add_wait_queue_tail_exclusive(&sem->wait, &wait); + + TS_LOCK_SUSPEND; + + /* release lock before sleeping */ + spin_unlock_irqrestore(&sem->wait.lock, flags); + + /* We depend on the FIFO order. Thus, we don't need to recheck + * when we wake up; we are guaranteed to have the lock since + * there is only one wake up per release. + */ + + schedule(); + + TS_LOCK_RESUME; + + /* Since we hold the lock, no other task will change + * ->owner. We can thus check it without acquiring the spin + * lock. */ + BUG_ON(sem->owner != t); + } else { + /* it's ours now */ + sem->owner = t; + + spin_unlock_irqrestore(&sem->wait.lock, flags); + } + + return 0; +} + +int pfp_fmlp_unlock(struct litmus_lock* l) +{ + struct task_struct *t = current, *next; + struct fmlp_semaphore *sem = fmlp_from_lock(l); + unsigned long flags; + int err = 0; + + spin_lock_irqsave(&sem->wait.lock, flags); + + if (sem->owner != t) { + err = -EINVAL; + goto out; + } + + /* we lose the benefit of priority boosting */ + + unboost_priority(t); + + /* check if there are jobs waiting for this resource */ + next = __waitqueue_remove_first(&sem->wait); + if (next) { + /* next becomes the resouce holder */ + sem->owner = next; + + /* Wake up next. The waiting job is already priority-boosted. 
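 *
 * ("Already priority-boosted" because pfp_fmlp_lock() above boosts the
 * requester with boost_priority(t, time_of_request) *before* it suspends.
 * The request timestamp serves as the priority point, so -- assuming, as
 * the tie-break comment in boost_priority() suggests, that a numerically
 * smaller priority point wins among boosted jobs -- local jobs holding or
 * waiting for FMLP resources are effectively served in request order.)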
*/ + wake_up_process(next); + } else + /* resource becomes available */ + sem->owner = NULL; + +out: + spin_unlock_irqrestore(&sem->wait.lock, flags); + return err; +} + +int pfp_fmlp_close(struct litmus_lock* l) +{ + struct task_struct *t = current; + struct fmlp_semaphore *sem = fmlp_from_lock(l); + unsigned long flags; + + int owner; + + spin_lock_irqsave(&sem->wait.lock, flags); + + owner = sem->owner == t; + + spin_unlock_irqrestore(&sem->wait.lock, flags); + + if (owner) + pfp_fmlp_unlock(l); + + return 0; +} + +void pfp_fmlp_free(struct litmus_lock* lock) +{ + kfree(fmlp_from_lock(lock)); +} + +static struct litmus_lock_ops pfp_fmlp_lock_ops = { + .close = pfp_fmlp_close, + .lock = pfp_fmlp_lock, + .unlock = pfp_fmlp_unlock, + .deallocate = pfp_fmlp_free, +}; + +static struct litmus_lock* pfp_new_fmlp(void) +{ + struct fmlp_semaphore* sem; + + sem = kmalloc(sizeof(*sem), GFP_KERNEL); + if (!sem) + return NULL; + + sem->owner = NULL; + init_waitqueue_head(&sem->wait); + sem->litmus_lock.ops = &pfp_fmlp_lock_ops; + + return &sem->litmus_lock; +} + +/* ******************** MPCP support ********************** */ + +struct mpcp_semaphore { + struct litmus_lock litmus_lock; + + /* current resource holder */ + struct task_struct *owner; + + /* priority queue of waiting tasks */ + wait_queue_head_t wait; + + /* priority ceiling per cpu */ + unsigned int prio_ceiling[NR_CPUS]; + + /* should jobs spin "virtually" for this resource? */ + int vspin; +}; + +#define OMEGA_CEILING UINT_MAX + +/* Since jobs spin "virtually" while waiting to acquire a lock, + * they first must aquire a local per-cpu resource. + */ +static DEFINE_PER_CPU(wait_queue_head_t, mpcpvs_vspin_wait); +static DEFINE_PER_CPU(struct task_struct*, mpcpvs_vspin); + +/* called with preemptions off <=> no local modifications */ +static void mpcp_vspin_enter(void) +{ + struct task_struct* t = current; + + while (1) { + if (__get_cpu_var(mpcpvs_vspin) == NULL) { + /* good, we get to issue our request */ + __get_cpu_var(mpcpvs_vspin) = t; + break; + } else { + /* some job is spinning => enqueue in request queue */ + prio_wait_queue_t wait; + wait_queue_head_t* vspin = &__get_cpu_var(mpcpvs_vspin_wait); + unsigned long flags; + + /* ordered by regular priority */ + init_prio_waitqueue_entry(&wait, t, prio_point(get_priority(t))); + + spin_lock_irqsave(&vspin->lock, flags); + + set_task_state(t, TASK_UNINTERRUPTIBLE); + + __add_wait_queue_prio_exclusive(vspin, &wait); + + spin_unlock_irqrestore(&vspin->lock, flags); + + TS_LOCK_SUSPEND; + + preempt_enable_no_resched(); + + schedule(); + + preempt_disable(); + + TS_LOCK_RESUME; + /* Recheck if we got it --- some higher-priority process might + * have swooped in. 
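 *
 * A small scenario for why the loop must re-check (editorial
 * illustration, not additional code):
 *
 *      1. The current token holder finishes and mpcp_vspin_exit() sets
 *         mpcpvs_vspin = NULL, then wakes the highest-priority waiter T_b.
 *      2. Before T_b gets to run, another local job T_c calls
 *         mpcp_vspin_enter(), finds the token free, and takes it.
 *      3. When T_b finally runs, mpcpvs_vspin != NULL again, so it
 *         re-enqueues itself and suspends once more -- hence the
 *         while(1) loop.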
*/ + } + } + /* ok, now it is ours */ +} + +/* called with preemptions off */ +static void mpcp_vspin_exit(void) +{ + struct task_struct* t = current, *next; + unsigned long flags; + wait_queue_head_t* vspin = &__get_cpu_var(mpcpvs_vspin_wait); + + BUG_ON(__get_cpu_var(mpcpvs_vspin) != t); + + /* no spinning job */ + __get_cpu_var(mpcpvs_vspin) = NULL; + + /* see if anyone is waiting for us to stop "spinning" */ + spin_lock_irqsave(&vspin->lock, flags); + next = __waitqueue_remove_first(vspin); + + if (next) + wake_up_process(next); + + spin_unlock_irqrestore(&vspin->lock, flags); +} + +static inline struct mpcp_semaphore* mpcp_from_lock(struct litmus_lock* lock) +{ + return container_of(lock, struct mpcp_semaphore, litmus_lock); +} + +int pfp_mpcp_lock(struct litmus_lock* l) +{ + struct task_struct* t = current; + struct mpcp_semaphore *sem = mpcp_from_lock(l); + prio_wait_queue_t wait; + unsigned long flags; + + if (!is_realtime(t)) + return -EPERM; + + preempt_disable(); + + if (sem->vspin) + mpcp_vspin_enter(); + + /* Priority-boost ourself *before* we suspend so that + * our priority is boosted when we resume. Use the priority + * ceiling for the local partition. */ + boost_priority(t, sem->prio_ceiling[get_partition(t)]); + + spin_lock_irqsave(&sem->wait.lock, flags); + + preempt_enable_no_resched(); + + if (sem->owner) { + /* resource is not free => must suspend and wait */ + + /* ordered by regular priority */ + init_prio_waitqueue_entry(&wait, t, prio_point(get_priority(t))); + + /* FIXME: interruptible would be nice some day */ + set_task_state(t, TASK_UNINTERRUPTIBLE); + + __add_wait_queue_prio_exclusive(&sem->wait, &wait); + + TS_LOCK_SUSPEND; + + /* release lock before sleeping */ + spin_unlock_irqrestore(&sem->wait.lock, flags); + + /* We depend on the FIFO order. Thus, we don't need to recheck + * when we wake up; we are guaranteed to have the lock since + * there is only one wake up per release. + */ + + schedule(); + + TS_LOCK_RESUME; + + /* Since we hold the lock, no other task will change + * ->owner. We can thus check it without acquiring the spin + * lock. */ + BUG_ON(sem->owner != t); + } else { + /* it's ours now */ + sem->owner = t; + + spin_unlock_irqrestore(&sem->wait.lock, flags); + } + + return 0; +} + +int pfp_mpcp_unlock(struct litmus_lock* l) +{ + struct task_struct *t = current, *next; + struct mpcp_semaphore *sem = mpcp_from_lock(l); + unsigned long flags; + int err = 0; + + spin_lock_irqsave(&sem->wait.lock, flags); + + if (sem->owner != t) { + err = -EINVAL; + goto out; + } + + /* we lose the benefit of priority boosting */ + + unboost_priority(t); + + /* check if there are jobs waiting for this resource */ + next = __waitqueue_remove_first(&sem->wait); + if (next) { + /* next becomes the resouce holder */ + sem->owner = next; + + /* Wake up next. The waiting job is already priority-boosted. 
*/ + wake_up_process(next); + } else + /* resource becomes available */ + sem->owner = NULL; + +out: + spin_unlock_irqrestore(&sem->wait.lock, flags); + + if (sem->vspin && err == 0) { + preempt_disable(); + mpcp_vspin_exit(); + preempt_enable(); + } + + return err; +} + +int pfp_mpcp_open(struct litmus_lock* l, void* config) +{ + struct task_struct *t = current; + struct mpcp_semaphore *sem = mpcp_from_lock(l); + int cpu, local_cpu; + unsigned long flags; + + if (!is_realtime(t)) + /* we need to know the real-time priority */ + return -EPERM; + + local_cpu = get_partition(t); + + spin_lock_irqsave(&sem->wait.lock, flags); + + for (cpu = 0; cpu < NR_CPUS; cpu++) + if (cpu != local_cpu) + { + sem->prio_ceiling[cpu] = min(sem->prio_ceiling[cpu], + get_priority(t)); + TRACE_CUR("priority ceiling for sem %p is now %d on cpu %d\n", + sem, sem->prio_ceiling[cpu], cpu); + } + + spin_unlock_irqrestore(&sem->wait.lock, flags); + + return 0; +} + +int pfp_mpcp_close(struct litmus_lock* l) +{ + struct task_struct *t = current; + struct mpcp_semaphore *sem = mpcp_from_lock(l); + unsigned long flags; + + int owner; + + spin_lock_irqsave(&sem->wait.lock, flags); + + owner = sem->owner == t; + + spin_unlock_irqrestore(&sem->wait.lock, flags); + + if (owner) + pfp_mpcp_unlock(l); + + return 0; +} + +void pfp_mpcp_free(struct litmus_lock* lock) +{ + kfree(mpcp_from_lock(lock)); +} + +static struct litmus_lock_ops pfp_mpcp_lock_ops = { + .close = pfp_mpcp_close, + .lock = pfp_mpcp_lock, + .open = pfp_mpcp_open, + .unlock = pfp_mpcp_unlock, + .deallocate = pfp_mpcp_free, +}; + +static struct litmus_lock* pfp_new_mpcp(int vspin) +{ + struct mpcp_semaphore* sem; + int cpu; + + sem = kmalloc(sizeof(*sem), GFP_KERNEL); + if (!sem) + return NULL; + + sem->owner = NULL; + init_waitqueue_head(&sem->wait); + sem->litmus_lock.ops = &pfp_mpcp_lock_ops; + + for (cpu = 0; cpu < NR_CPUS; cpu++) + sem->prio_ceiling[cpu] = OMEGA_CEILING; + + /* mark as virtual spinning */ + sem->vspin = vspin; + + return &sem->litmus_lock; +} + + +/* ******************** PCP support ********************** */ + + +struct pcp_semaphore { + struct list_head ceiling; + + /* current resource holder */ + struct task_struct *owner; + + /* priority ceiling --- can be negative due to DPCP support */ + int prio_ceiling; + + /* on which processor is this PCP semaphore allocated? 
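 *
 * A worked example of the per-CPU ceiling rule enforced by
 * pcp_exceeds_ceiling() below (editorial illustration; priorities are
 * made up, and as elsewhere in this file a numerically smaller value
 * means higher priority):
 *
 *      On this CPU:  S1 (prio_ceiling 2) is held by T3 (prio 6),
 *                    S2 (prio_ceiling 5) is free.
 *      The system ceiling is S1, the held semaphore with the smallest
 *      (= highest) ceiling.
 *
 *      T2 (prio 4) requests S2:  2 > 4 is false and T2 does not own S1,
 *          so T2 ceiling-blocks even though S2 itself is free.
 *      T1 (prio 1) requests S2:  2 > 1 holds, so T1 gets S2 immediately.
 *      T3 requests S2 while holding S1:  allowed, since T3 owns the
 *          semaphore that defines the current ceiling.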
*/ + int on_cpu; +}; + +struct pcp_state { + struct list_head system_ceiling; + + /* highest-priority waiting task */ + struct task_struct* hp_waiter; + + /* list of jobs waiting to get past the system ceiling */ + wait_queue_head_t ceiling_blocked; +}; + +static void pcp_init_state(struct pcp_state* s) +{ + INIT_LIST_HEAD(&s->system_ceiling); + s->hp_waiter = NULL; + init_waitqueue_head(&s->ceiling_blocked); +} + +static DEFINE_PER_CPU(struct pcp_state, pcp_state); + +/* assumes preemptions are off */ +static struct pcp_semaphore* pcp_get_ceiling(void) +{ + struct list_head* top = __get_cpu_var(pcp_state).system_ceiling.next; + + if (top) + return list_entry(top, struct pcp_semaphore, ceiling); + else + return NULL; +} + +/* assumes preempt off */ +static void pcp_add_ceiling(struct pcp_semaphore* sem) +{ + struct list_head *pos; + struct list_head *in_use = &__get_cpu_var(pcp_state).system_ceiling; + struct pcp_semaphore* held; + + BUG_ON(sem->on_cpu != smp_processor_id()); + BUG_ON(in_list(&sem->ceiling)); + + list_for_each(pos, in_use) { + held = list_entry(pos, struct pcp_semaphore, ceiling); + if (held->prio_ceiling >= sem->prio_ceiling) { + __list_add(&sem->ceiling, pos->prev, pos); + return; + } + } + + /* we hit the end of the list */ + + list_add_tail(&sem->ceiling, in_use); +} + +/* assumes preempt off */ +static int pcp_exceeds_ceiling(struct pcp_semaphore* ceiling, + struct task_struct* task, + int effective_prio) +{ + return ceiling == NULL || + ceiling->prio_ceiling > effective_prio || + ceiling->owner == task; +} + +/* assumes preempt off */ +static void pcp_priority_inheritance(void) +{ + unsigned long flags; + pfp_domain_t* pfp = local_pfp; + + struct pcp_semaphore* ceiling = pcp_get_ceiling(); + struct task_struct *blocker, *blocked; + + blocker = ceiling ? ceiling->owner : NULL; + blocked = __get_cpu_var(pcp_state).hp_waiter; + + raw_spin_lock_irqsave(&pfp->slock, flags); + + /* Current is no longer inheriting anything by default. This should be + * the currently scheduled job, and hence not currently queued. */ + BUG_ON(current != pfp->scheduled); + + fp_set_prio_inh(pfp, current, NULL); + fp_set_prio_inh(pfp, blocked, NULL); + fp_set_prio_inh(pfp, blocker, NULL); + + + /* Let blocking job inherit priority of blocked job, if required. 
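 *
 * For instance (editorial illustration): if T3 (prio 6) owns the
 * semaphore that currently defines the system ceiling and T1 (prio 1) is
 * the recorded hp_waiter, then the call below executes
 * fp_set_prio_inh(pfp, T3, T1), i.e. T1 is stored in T3's
 * rt_param.inh_task and T3 is re-queued (if queued) so that it competes
 * with T1's priority, until pcp_lower_ceiling() triggers the next
 * pcp_priority_inheritance() and the inheritance is cleared above.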
*/ + if (blocker && blocked && + fp_higher_prio(blocked, blocker)) { + TRACE_TASK(blocker, "PCP inherits from %s/%d (prio %u -> %u) \n", + blocked->comm, blocked->pid, + get_priority(blocker), get_priority(blocked)); + fp_set_prio_inh(pfp, blocker, blocked); + } + + /* check if anything changed */ + if (fp_higher_prio(fp_prio_peek(&pfp->ready_queue), pfp->scheduled)) + preempt(pfp); + + raw_spin_unlock_irqrestore(&pfp->slock, flags); +} + +/* called with preemptions off */ +static void pcp_raise_ceiling(struct pcp_semaphore* sem, + int effective_prio) +{ + struct task_struct* t = current; + struct pcp_semaphore* ceiling; + prio_wait_queue_t wait; + unsigned int waiting_higher_prio; + + do { + ceiling = pcp_get_ceiling(); + if (pcp_exceeds_ceiling(ceiling, t, effective_prio)) + break; + + TRACE_CUR("PCP ceiling-blocked, wanted sem %p, but %s/%d has the ceiling \n", + sem, ceiling->owner->comm, ceiling->owner->pid); + + /* we need to wait until the ceiling is lowered */ + + /* enqueue in priority order */ + init_prio_waitqueue_entry(&wait, t, prio_point(effective_prio)); + set_task_state(t, TASK_UNINTERRUPTIBLE); + waiting_higher_prio = add_wait_queue_prio_exclusive( + &__get_cpu_var(pcp_state).ceiling_blocked, &wait); + + if (waiting_higher_prio == 0) { + TRACE_CUR("PCP new highest-prio waiter => prio inheritance\n"); + + /* we are the new highest-priority waiting job + * => update inheritance */ + __get_cpu_var(pcp_state).hp_waiter = t; + pcp_priority_inheritance(); + } + + TS_LOCK_SUSPEND; + + preempt_enable_no_resched(); + schedule(); + preempt_disable(); + + /* pcp_resume_unblocked() removed us from wait queue */ + + TS_LOCK_RESUME; + } while(1); + + TRACE_CUR("PCP got the ceiling and sem %p\n", sem); + + /* We are good to go. The semaphore should be available. */ + BUG_ON(sem->owner != NULL); + + sem->owner = t; + + pcp_add_ceiling(sem); +} + +static void pcp_resume_unblocked(void) +{ + wait_queue_head_t *blocked = &__get_cpu_var(pcp_state).ceiling_blocked; + unsigned long flags; + prio_wait_queue_t* q; + struct task_struct* t = NULL; + + struct pcp_semaphore* ceiling = pcp_get_ceiling(); + + spin_lock_irqsave(&blocked->lock, flags); + + while (waitqueue_active(blocked)) { + /* check first == highest-priority waiting job */ + q = list_entry(blocked->task_list.next, + prio_wait_queue_t, wq.task_list); + t = (struct task_struct*) q->wq.private; + + /* can it proceed now? => let it go */ + if (pcp_exceeds_ceiling(ceiling, t, + prio_from_point(q->priority))) { + __remove_wait_queue(blocked, &q->wq); + wake_up_process(t); + } else { + /* We are done. Update highest-priority waiter. */ + __get_cpu_var(pcp_state).hp_waiter = t; + goto out; + } + } + /* If we get here, then there are no more waiting + * jobs. */ + __get_cpu_var(pcp_state).hp_waiter = NULL; +out: + spin_unlock_irqrestore(&blocked->lock, flags); +} + +/* assumes preempt off */ +static void pcp_lower_ceiling(struct pcp_semaphore* sem) +{ + BUG_ON(!in_list(&sem->ceiling)); + BUG_ON(sem->owner != current); + BUG_ON(sem->on_cpu != smp_processor_id()); + + /* remove from ceiling list */ + list_del(&sem->ceiling); + + /* release */ + sem->owner = NULL; + + TRACE_CUR("PCP released sem %p\n", sem); + + /* Wake up all ceiling-blocked jobs that now pass the ceiling. */ + pcp_resume_unblocked(); + + pcp_priority_inheritance(); +} + +static void pcp_update_prio_ceiling(struct pcp_semaphore* sem, + int effective_prio) +{ + /* This needs to be synchronized on something. + * Might as well use waitqueue lock for the processor. 
+ * We assume this happens only before the task set starts execution, + * (i.e., during initialization), but it may happen on multiple processors + * at the same time. + */ + unsigned long flags; + + struct pcp_state* s = &per_cpu(pcp_state, sem->on_cpu); + + spin_lock_irqsave(&s->ceiling_blocked.lock, flags); + + sem->prio_ceiling = min(sem->prio_ceiling, effective_prio); + + spin_unlock_irqrestore(&s->ceiling_blocked.lock, flags); +} + +static void pcp_init_semaphore(struct pcp_semaphore* sem, int cpu) +{ + sem->owner = NULL; + INIT_LIST_HEAD(&sem->ceiling); + sem->prio_ceiling = INT_MAX; + sem->on_cpu = cpu; +} + + +/* ******************** DPCP support ********************** */ + +struct dpcp_semaphore { + struct litmus_lock litmus_lock; + struct pcp_semaphore pcp; + int owner_cpu; +}; + +static inline struct dpcp_semaphore* dpcp_from_lock(struct litmus_lock* lock) +{ + return container_of(lock, struct dpcp_semaphore, litmus_lock); +} + +/* called with preemptions disabled */ +static void pfp_migrate_to(int target_cpu) +{ + struct task_struct* t = current; + pfp_domain_t *from; + + if (get_partition(t) == target_cpu) + return; + + /* make sure target_cpu makes sense */ + BUG_ON(!cpu_online(target_cpu)); + + local_irq_disable(); + + /* scheduled task should not be in any ready or release queue */ + BUG_ON(is_queued(t)); + + /* lock both pfp domains in order of address */ + from = task_pfp(t); + + raw_spin_lock(&from->slock); + + /* switch partitions */ + tsk_rt(t)->task_params.cpu = target_cpu; + + raw_spin_unlock(&from->slock); + + /* Don't trace scheduler costs as part of + * locking overhead. Scheduling costs are accounted for + * explicitly. */ + TS_LOCK_SUSPEND; + + local_irq_enable(); + preempt_enable_no_resched(); + + /* deschedule to be migrated */ + schedule(); + + /* we are now on the target processor */ + preempt_disable(); + + /* start recording costs again */ + TS_LOCK_RESUME; + + BUG_ON(smp_processor_id() != target_cpu); +} + +int pfp_dpcp_lock(struct litmus_lock* l) +{ + struct task_struct* t = current; + struct dpcp_semaphore *sem = dpcp_from_lock(l); + int eprio = effective_agent_priority(get_priority(t)); + int from = get_partition(t); + int to = sem->pcp.on_cpu; + + if (!is_realtime(t)) + return -EPERM; + + preempt_disable(); + + /* Priority-boost ourself *before* we suspend so that + * our priority is boosted when we resume. 
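 *
 * In outline, the complete DPCP request below and its matching
 * pfp_dpcp_unlock() perform (editorial summary of the code in this file):
 *
 *      1. boost_priority(t, get_priority(t))    on the home partition
 *      2. pfp_migrate_to(sem->pcp.on_cpu)       move to the resource CPU
 *      3. pcp_raise_ceiling(&sem->pcp, eprio)   acquire under the local PCP
 *      4. ... critical section on the resource CPU ...
 *      5. pcp_lower_ceiling(&sem->pcp)          release
 *      6. unboost_priority(t)
 *      7. pfp_migrate_to(owner_cpu)             return to the home partition
 *
 * Here eprio = effective_agent_priority(get_priority(t)) =
 * get_priority(t) - LITMUS_MAX_PRIORITY, so -- assuming regular
 * priorities lie in 1..LITMUS_MAX_PRIORITY -- agents always rank above
 * local non-agent jobs; prio_point()/prio_from_point() merely shift such
 * values back into a non-negative range for storage in the wait queues.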
*/ + + boost_priority(t, get_priority(t)); + + pfp_migrate_to(to); + + pcp_raise_ceiling(&sem->pcp, eprio); + + /* yep, we got it => execute request */ + sem->owner_cpu = from; + + preempt_enable(); + + return 0; +} + +int pfp_dpcp_unlock(struct litmus_lock* l) +{ + struct task_struct *t = current; + struct dpcp_semaphore *sem = dpcp_from_lock(l); + int err = 0; + int home; + + preempt_disable(); + + if (sem->pcp.on_cpu != smp_processor_id() || sem->pcp.owner != t) { + err = -EINVAL; + goto out; + } + + home = sem->owner_cpu; + + /* give it back */ + pcp_lower_ceiling(&sem->pcp); + + /* we lose the benefit of priority boosting */ + unboost_priority(t); + + pfp_migrate_to(home); + +out: + preempt_enable(); + + return err; +} + +int pfp_dpcp_open(struct litmus_lock* l, void* __user config) +{ + struct task_struct *t = current; + struct dpcp_semaphore *sem = dpcp_from_lock(l); + int cpu, eprio; + + if (!is_realtime(t)) + /* we need to know the real-time priority */ + return -EPERM; + + if (get_user(cpu, (int*) config)) + return -EFAULT; + + /* make sure the resource location matches */ + if (cpu != sem->pcp.on_cpu) + return -EINVAL; + + eprio = effective_agent_priority(get_priority(t)); + + pcp_update_prio_ceiling(&sem->pcp, eprio); + + return 0; +} + +int pfp_dpcp_close(struct litmus_lock* l) +{ + struct task_struct *t = current; + struct dpcp_semaphore *sem = dpcp_from_lock(l); + int owner = 0; + + preempt_disable(); + + if (sem->pcp.on_cpu == smp_processor_id()) + owner = sem->pcp.owner == t; + + preempt_enable(); + + if (owner) + pfp_dpcp_unlock(l); + + return 0; +} + +void pfp_dpcp_free(struct litmus_lock* lock) +{ + kfree(dpcp_from_lock(lock)); +} + +static struct litmus_lock_ops pfp_dpcp_lock_ops = { + .close = pfp_dpcp_close, + .lock = pfp_dpcp_lock, + .open = pfp_dpcp_open, + .unlock = pfp_dpcp_unlock, + .deallocate = pfp_dpcp_free, +}; + +static struct litmus_lock* pfp_new_dpcp(int on_cpu) +{ + struct dpcp_semaphore* sem; + + sem = kmalloc(sizeof(*sem), GFP_KERNEL); + if (!sem) + return NULL; + + sem->litmus_lock.ops = &pfp_dpcp_lock_ops; + sem->owner_cpu = NO_CPU; + pcp_init_semaphore(&sem->pcp, on_cpu); + + return &sem->litmus_lock; +} + + +/* **** lock constructor **** */ + + +static long pfp_allocate_lock(struct litmus_lock **lock, int type, + void* __user config) +{ + int err = -ENXIO, cpu; + struct srp_semaphore* srp; + + /* P-FP currently supports the SRP for local resources and the FMLP + * for global resources. 
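 *
 * (The switch below in fact accepts FMLP_SEM, MPCP_SEM, MPCP_VS_SEM,
 * DPCP_SEM and SRP_SEM, i.e. more than the two protocols named above.)
 *
 * A minimal user-space sketch of allocating and using a DPCP semaphore,
 * assuming the liblitmus helpers od_openx(), od_close(), litmus_lock()
 * and litmus_unlock() -- these names are liblitmus conventions and are
 * not defined in this patch; the cpu argument is what pfp_dpcp_open()
 * reads via get_user():
 *
 *      int cpu = 2;                                // CPU hosting the resource
 *      int od  = od_openx(fd, DPCP_SEM, 0, &cpu);  // 0 = shared object id
 *      if (od < 0)
 *              exit(1);                            // allocation failed
 *      litmus_lock(od);                  // boost, migrate, raise ceiling
 *      // ... critical section, executed on CPU 2 ...
 *      litmus_unlock(od);                // lower ceiling, return home
 *      od_close(od);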
*/ + switch (type) { + case FMLP_SEM: + /* FIFO Mutex Locking Protocol */ + *lock = pfp_new_fmlp(); + if (*lock) + err = 0; + else + err = -ENOMEM; + break; + + case MPCP_SEM: + /* Multiprocesor Priority Ceiling Protocol */ + *lock = pfp_new_mpcp(0); + if (*lock) + err = 0; + else + err = -ENOMEM; + break; + + case MPCP_VS_SEM: + /* Multiprocesor Priority Ceiling Protocol with virtual spinning */ + *lock = pfp_new_mpcp(1); + if (*lock) + err = 0; + else + err = -ENOMEM; + break; + + case DPCP_SEM: + /* Distributed Priority Ceiling Protocol */ + if (get_user(cpu, (int*) config)) + return -EFAULT; + + if (!cpu_online(cpu)) + return -EINVAL; + + *lock = pfp_new_dpcp(cpu); + if (*lock) + err = 0; + else + err = -ENOMEM; + break; + + case SRP_SEM: + /* Baker's Stack Resource Policy */ + srp = allocate_srp_semaphore(); + if (srp) { + *lock = &srp->litmus_lock; + err = 0; + } else + err = -ENOMEM; + break; + }; + + return err; +} + +#endif + +static long pfp_admit_task(struct task_struct* tsk) +{ + if (task_cpu(tsk) == tsk->rt_param.task_params.cpu && +#ifdef CONFIG_RELEASE_MASTER + /* don't allow tasks on release master CPU */ + task_cpu(tsk) != remote_dom(task_cpu(tsk))->release_master && +#endif + get_priority(tsk) > 0) + return 0; + else + return -EINVAL; +} + +static long pfp_activate_plugin(void) +{ +#ifdef CONFIG_RELEASE_MASTER + int cpu; + + for_each_online_cpu(cpu) { + remote_dom(cpu)->release_master = atomic_read(&release_master_cpu); + } +#endif + +#ifdef CONFIG_LITMUS_LOCKING + get_srp_prio = pfp_get_srp_prio; + + for_each_online_cpu(cpu) { + init_waitqueue_head(&per_cpu(mpcpvs_vspin_wait, cpu)); + per_cpu(mpcpvs_vspin, cpu) = NULL; + + pcp_init_state(&per_cpu(pcp_state, cpu)); + pfp_doms[cpu] = remote_pfp(cpu); + } + +#endif + + return 0; +} + + +/* Plugin object */ +static struct sched_plugin pfp_plugin __cacheline_aligned_in_smp = { + .plugin_name = "P-FP", + .tick = pfp_tick, + .task_new = pfp_task_new, + .complete_job = complete_job, + .task_exit = pfp_task_exit, + .schedule = pfp_schedule, + .task_wake_up = pfp_task_wake_up, + .task_block = pfp_task_block, + .admit_task = pfp_admit_task, + .activate_plugin = pfp_activate_plugin, +#ifdef CONFIG_LITMUS_LOCKING + .allocate_lock = pfp_allocate_lock, + .finish_switch = pfp_finish_switch, +#endif +}; + + +static int __init init_pfp(void) +{ + int i; + + /* We do not really want to support cpu hotplug, do we? ;) + * However, if we are so crazy to do so, + * we cannot use num_online_cpu() + */ + for (i = 0; i < num_online_cpus(); i++) { + pfp_domain_init(remote_pfp(i), i); + } + return register_sched_plugin(&pfp_plugin); +} + +module_init(init_pfp); + diff --git a/litmus/sched_plugin.c b/litmus/sched_plugin.c new file mode 100644 index 000000000000..950fe5e6a1ab --- /dev/null +++ b/litmus/sched_plugin.c @@ -0,0 +1,233 @@ +/* sched_plugin.c -- core infrastructure for the scheduler plugin system + * + * This file includes the initialization of the plugin system, the no-op Linux + * scheduler plugin, some dummy functions, and some helper functions. + */ + +#include +#include +#include + +#include +#include +#include +#include + +/* + * Generic function to trigger preemption on either local or remote cpu + * from scheduler plugins. The key feature is that this function is + * non-preemptive section aware and does not invoke the scheduler / send + * IPIs if the to-be-preempted task is actually non-preemptive. 
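 *
 * A typical caller is the per-domain preempt() helper of the partitioned
 * plugins in this patch, invoked with the domain lock held, e.g. in
 * sched_psn_edf.c:
 *
 *      static void preempt(psnedf_domain_t *pedf)
 *      {
 *              preempt_if_preemptable(pedf->scheduled, pedf->cpu);
 *      }
 *
 * Three cases are handled below: with no real-time task on the CPU the
 * reschedule is always requested; on the local CPU a user-space
 * non-preemptive section is asked to exit via request_exit_np(), and a
 * reschedule happens only outside kernel non-preemptive sections; for a
 * remote CPU the reschedule IPI is sent only if the task is neither in a
 * kernel non-preemptive section nor had its exit-request flag set by
 * request_exit_np_atomic().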
+ */ +void preempt_if_preemptable(struct task_struct* t, int cpu) +{ + /* t is the real-time task executing on CPU on_cpu If t is NULL, then + * on_cpu is currently scheduling background work. + */ + + int reschedule = 0; + + if (!t) + /* move non-real-time task out of the way */ + reschedule = 1; + else { + if (smp_processor_id() == cpu) { + /* local CPU case */ + /* check if we need to poke userspace */ + if (is_user_np(t)) + /* Yes, poke it. This doesn't have to be atomic since + * the task is definitely not executing. */ + request_exit_np(t); + else if (!is_kernel_np(t)) + /* only if we are allowed to preempt the + * currently-executing task */ + reschedule = 1; + } else { + /* Remote CPU case. Only notify if it's not a kernel + * NP section and if we didn't set the userspace + * flag. */ + reschedule = !(is_kernel_np(t) || request_exit_np_atomic(t)); + } + } + if (likely(reschedule)) + litmus_reschedule(cpu); +} + + +/************************************************************* + * Dummy plugin functions * + *************************************************************/ + +static void litmus_dummy_finish_switch(struct task_struct * prev) +{ +} + +static struct task_struct* litmus_dummy_schedule(struct task_struct * prev) +{ + sched_state_task_picked(); + return NULL; +} + +static void litmus_dummy_tick(struct task_struct* tsk) +{ +} + +static long litmus_dummy_admit_task(struct task_struct* tsk) +{ + printk(KERN_CRIT "LITMUS^RT: Linux plugin rejects %s/%d.\n", + tsk->comm, tsk->pid); + return -EINVAL; +} + +static void litmus_dummy_task_new(struct task_struct *t, int on_rq, int running) +{ +} + +static void litmus_dummy_task_wake_up(struct task_struct *task) +{ +} + +static void litmus_dummy_task_block(struct task_struct *task) +{ +} + +static void litmus_dummy_task_exit(struct task_struct *task) +{ +} + +static void litmus_dummy_pre_setsched(struct task_struct *task, int policy) +{ +} + + +static long litmus_dummy_complete_job(void) +{ + return -ENOSYS; +} + +static long litmus_dummy_activate_plugin(void) +{ + return 0; +} + +static long litmus_dummy_deactivate_plugin(void) +{ + return 0; +} + +#ifdef CONFIG_LITMUS_LOCKING + +static long litmus_dummy_allocate_lock(struct litmus_lock **lock, int type, + void* __user config) +{ + return -ENXIO; +} + +#endif + + +/* The default scheduler plugin. It doesn't do anything and lets Linux do its + * job. + */ +struct sched_plugin linux_sched_plugin = { + .plugin_name = "Linux", + .tick = litmus_dummy_tick, + .task_new = litmus_dummy_task_new, + .task_exit = litmus_dummy_task_exit, + .task_wake_up = litmus_dummy_task_wake_up, + .task_block = litmus_dummy_task_block, + .complete_job = litmus_dummy_complete_job, + .schedule = litmus_dummy_schedule, + .finish_switch = litmus_dummy_finish_switch, + .activate_plugin = litmus_dummy_activate_plugin, + .deactivate_plugin = litmus_dummy_deactivate_plugin, +#ifdef CONFIG_LITMUS_LOCKING + .allocate_lock = litmus_dummy_allocate_lock, +#endif + .admit_task = litmus_dummy_admit_task +}; + +/* + * The reference to current plugin that is used to schedule tasks within + * the system. 
It stores references to actual function implementations + * Should be initialized by calling "init_***_plugin()" + */ +struct sched_plugin *litmus = &linux_sched_plugin; + +/* the list of registered scheduling plugins */ +static LIST_HEAD(sched_plugins); +static DEFINE_RAW_SPINLOCK(sched_plugins_lock); + +#define CHECK(func) {\ + if (!plugin->func) \ + plugin->func = litmus_dummy_ ## func;} + +/* FIXME: get reference to module */ +int register_sched_plugin(struct sched_plugin* plugin) +{ + printk(KERN_INFO "Registering LITMUS^RT plugin %s.\n", + plugin->plugin_name); + + /* make sure we don't trip over null pointers later */ + CHECK(finish_switch); + CHECK(schedule); + CHECK(tick); + CHECK(task_wake_up); + CHECK(task_exit); + CHECK(task_block); + CHECK(task_new); + CHECK(complete_job); + CHECK(activate_plugin); + CHECK(deactivate_plugin); +#ifdef CONFIG_LITMUS_LOCKING + CHECK(allocate_lock); +#endif + CHECK(admit_task); + CHECK(pre_setsched); + + if (!plugin->release_at) + plugin->release_at = release_at; + + raw_spin_lock(&sched_plugins_lock); + list_add(&plugin->list, &sched_plugins); + raw_spin_unlock(&sched_plugins_lock); + + return 0; +} + + +/* FIXME: reference counting, etc. */ +struct sched_plugin* find_sched_plugin(const char* name) +{ + struct list_head *pos; + struct sched_plugin *plugin; + + raw_spin_lock(&sched_plugins_lock); + list_for_each(pos, &sched_plugins) { + plugin = list_entry(pos, struct sched_plugin, list); + if (!strcmp(plugin->plugin_name, name)) + goto out_unlock; + } + plugin = NULL; + +out_unlock: + raw_spin_unlock(&sched_plugins_lock); + return plugin; +} + +int print_sched_plugins(char* buf, int max) +{ + int count = 0; + struct list_head *pos; + struct sched_plugin *plugin; + + raw_spin_lock(&sched_plugins_lock); + list_for_each(pos, &sched_plugins) { + plugin = list_entry(pos, struct sched_plugin, list); + count += snprintf(buf + count, max - count, "%s\n", plugin->plugin_name); + if (max - count <= 0) + break; + } + raw_spin_unlock(&sched_plugins_lock); + return count; +} diff --git a/litmus/sched_psn_edf.c b/litmus/sched_psn_edf.c new file mode 100644 index 000000000000..7b12689ab61a --- /dev/null +++ b/litmus/sched_psn_edf.c @@ -0,0 +1,917 @@ +/* + * kernel/sched_psn_edf.c + * + * Implementation of the PSN-EDF scheduler plugin. + * Based on kern/sched_part_edf.c and kern/sched_gsn_edf.c. + * + * Suspensions and non-preemptable sections are supported. + * Priority inheritance is not supported. 
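 *
 * Non-preemptable sections are driven from user space.  A rough usage
 * sketch, assuming the liblitmus helpers enter_np() and exit_np() (the
 * helpers and the control-page flag they toggle are not defined in this
 * file):
 *
 *      enter_np();        // is_np() / is_user_np() become true
 *      // ... short critical section, must not suspend ...
 *      exit_np();         // if the scheduler called request_exit_np()
 *                         // in the meantime, this yields the CPU
 *
 * psnedf_tick() and psnedf_schedule() below only request the end of such
 * a section; the actual preemption is deferred until exit_np().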
+ */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef struct { + rt_domain_t domain; + int cpu; + struct task_struct* scheduled; /* only RT tasks */ +/* + * scheduling lock slock + * protects the domain and serializes scheduling decisions + */ +#define slock domain.ready_lock + +} psnedf_domain_t; + +DEFINE_PER_CPU(psnedf_domain_t, psnedf_domains); + +#define local_edf (&__get_cpu_var(psnedf_domains).domain) +#define local_pedf (&__get_cpu_var(psnedf_domains)) +#define remote_edf(cpu) (&per_cpu(psnedf_domains, cpu).domain) +#define remote_pedf(cpu) (&per_cpu(psnedf_domains, cpu)) +#define task_edf(task) remote_edf(get_partition(task)) +#define task_pedf(task) remote_pedf(get_partition(task)) + + +static void psnedf_domain_init(psnedf_domain_t* pedf, + check_resched_needed_t check, + release_jobs_t release, + int cpu) +{ + edf_domain_init(&pedf->domain, check, release); + pedf->cpu = cpu; + pedf->scheduled = NULL; +} + +static void requeue(struct task_struct* t, rt_domain_t *edf) +{ + if (t->state != TASK_RUNNING) + TRACE_TASK(t, "requeue: !TASK_RUNNING\n"); + + set_rt_flags(t, RT_F_RUNNING); + if (is_released(t, litmus_clock())) + __add_ready(edf, t); + else + add_release(edf, t); /* it has got to wait */ +} + +/* we assume the lock is being held */ +static void preempt(psnedf_domain_t *pedf) +{ + preempt_if_preemptable(pedf->scheduled, pedf->cpu); +} + +#ifdef CONFIG_LITMUS_LOCKING + +static void boost_priority(struct task_struct* t) +{ + unsigned long flags; + psnedf_domain_t* pedf = task_pedf(t); + lt_t now; + + raw_spin_lock_irqsave(&pedf->slock, flags); + now = litmus_clock(); + + TRACE_TASK(t, "priority boosted at %llu\n", now); + + tsk_rt(t)->priority_boosted = 1; + tsk_rt(t)->boost_start_time = now; + + if (pedf->scheduled != t) { + /* holder may be queued: first stop queue changes */ + raw_spin_lock(&pedf->domain.release_lock); + if (is_queued(t) && + /* If it is queued, then we need to re-order. */ + bheap_decrease(edf_ready_order, tsk_rt(t)->heap_node) && + /* If we bubbled to the top, then we need to check for preemptions. */ + edf_preemption_needed(&pedf->domain, pedf->scheduled)) + preempt(pedf); + raw_spin_unlock(&pedf->domain.release_lock); + } /* else: nothing to do since the job is not queued while scheduled */ + + raw_spin_unlock_irqrestore(&pedf->slock, flags); +} + +static void unboost_priority(struct task_struct* t) +{ + unsigned long flags; + psnedf_domain_t* pedf = task_pedf(t); + lt_t now; + + raw_spin_lock_irqsave(&pedf->slock, flags); + now = litmus_clock(); + + /* assumption: this only happens when the job is scheduled */ + BUG_ON(pedf->scheduled != t); + + TRACE_TASK(t, "priority restored at %llu\n", now); + + /* priority boosted jobs must be scheduled */ + BUG_ON(pedf->scheduled != t); + + tsk_rt(t)->priority_boosted = 0; + tsk_rt(t)->boost_start_time = 0; + + /* check if this changes anything */ + if (edf_preemption_needed(&pedf->domain, pedf->scheduled)) + preempt(pedf); + + raw_spin_unlock_irqrestore(&pedf->slock, flags); +} + +#endif + +/* This check is trivial in partioned systems as we only have to consider + * the CPU of the partition. 
+ */ +static int psnedf_check_resched(rt_domain_t *edf) +{ + psnedf_domain_t *pedf = container_of(edf, psnedf_domain_t, domain); + + /* because this is a callback from rt_domain_t we already hold + * the necessary lock for the ready queue + */ + if (edf_preemption_needed(edf, pedf->scheduled)) { + preempt(pedf); + return 1; + } else + return 0; +} + +static void job_completion(struct task_struct* t, int forced) +{ + sched_trace_task_completion(t,forced); + TRACE_TASK(t, "job_completion().\n"); + + set_rt_flags(t, RT_F_SLEEP); + prepare_for_next_period(t); +} + +static void psnedf_tick(struct task_struct *t) +{ + psnedf_domain_t *pedf = local_pedf; + + /* Check for inconsistency. We don't need the lock for this since + * ->scheduled is only changed in schedule, which obviously is not + * executing in parallel on this CPU + */ + BUG_ON(is_realtime(t) && t != pedf->scheduled); + + if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) { + if (!is_np(t)) { + litmus_reschedule_local(); + TRACE("psnedf_scheduler_tick: " + "%d is preemptable " + " => FORCE_RESCHED\n", t->pid); + } else if (is_user_np(t)) { + TRACE("psnedf_scheduler_tick: " + "%d is non-preemptable, " + "preemption delayed.\n", t->pid); + request_exit_np(t); + } + } +} + +static struct task_struct* psnedf_schedule(struct task_struct * prev) +{ + psnedf_domain_t* pedf = local_pedf; + rt_domain_t* edf = &pedf->domain; + struct task_struct* next; + + int out_of_time, sleep, preempt, + np, exists, blocks, resched; + + raw_spin_lock(&pedf->slock); + + /* sanity checking + * differently from gedf, when a task exits (dead) + * pedf->schedule may be null and prev _is_ realtime + */ + BUG_ON(pedf->scheduled && pedf->scheduled != prev); + BUG_ON(pedf->scheduled && !is_realtime(prev)); + + /* (0) Determine state */ + exists = pedf->scheduled != NULL; + blocks = exists && !is_running(pedf->scheduled); + out_of_time = exists && + budget_enforced(pedf->scheduled) && + budget_exhausted(pedf->scheduled); + np = exists && is_np(pedf->scheduled); + sleep = exists && get_rt_flags(pedf->scheduled) == RT_F_SLEEP; + preempt = edf_preemption_needed(edf, prev); + + /* If we need to preempt do so. + * The following checks set resched to 1 in case of special + * circumstances. + */ + resched = preempt; + + /* If a task blocks we have no choice but to reschedule. + */ + if (blocks) + resched = 1; + + /* Request a sys_exit_np() call if we would like to preempt but cannot. + * Multiple calls to request_exit_np() don't hurt. + */ + if (np && (out_of_time || preempt || sleep)) + request_exit_np(pedf->scheduled); + + /* Any task that is preemptable and either exhausts its execution + * budget or wants to sleep completes. We may have to reschedule after + * this. + */ + if (!np && (out_of_time || sleep) && !blocks) { + job_completion(pedf->scheduled, !sleep); + resched = 1; + } + + /* The final scheduling decision. Do we need to switch for some reason? + * Switch if we are in RT mode and have no task or if we need to + * resched. + */ + next = NULL; + if ((!np || blocks) && (resched || !exists)) { + /* When preempting a task that does not block, then + * re-insert it into either the ready queue or the + * release queue (if it completed). requeue() picks + * the appropriate queue. + */ + if (pedf->scheduled && !blocks) + requeue(pedf->scheduled, edf); + next = __take_ready(edf); + } else + /* Only override Linux scheduler if we have a real-time task + * scheduled that needs to continue. 
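 *
 * Recap of the decision logic above (editorial summary, using the flags
 * computed at the top of this function):
 *
 *      blocks                        -> prev is not requeued; pick
 *                                       __take_ready()
 *      !np && (out_of_time || sleep) && !blocks
 *                                    -> job completed: requeue prev,
 *                                       pick __take_ready()
 *      preempt && !np && !blocks     -> requeue prev, pick the
 *                                       higher-priority ready task
 *      np && (out_of_time || sleep || preempt)
 *                                    -> keep prev; request_exit_np()
 *                                       was called above
 *      otherwise                     -> keep prev (the branch right
 *                                       below)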
+ */ + if (exists) + next = prev; + + if (next) { + TRACE_TASK(next, "scheduled at %llu\n", litmus_clock()); + set_rt_flags(next, RT_F_RUNNING); + } else { + TRACE("becoming idle at %llu\n", litmus_clock()); + } + + pedf->scheduled = next; + sched_state_task_picked(); + raw_spin_unlock(&pedf->slock); + + return next; +} + + +/* Prepare a task for running in RT mode + */ +static void psnedf_task_new(struct task_struct * t, int on_rq, int running) +{ + rt_domain_t* edf = task_edf(t); + psnedf_domain_t* pedf = task_pedf(t); + unsigned long flags; + + TRACE_TASK(t, "psn edf: task new, cpu = %d\n", + t->rt_param.task_params.cpu); + + /* setup job parameters */ + release_at(t, litmus_clock()); + + /* The task should be running in the queue, otherwise signal + * code will try to wake it up with fatal consequences. + */ + raw_spin_lock_irqsave(&pedf->slock, flags); + if (running) { + /* there shouldn't be anything else running at the time */ + BUG_ON(pedf->scheduled); + pedf->scheduled = t; + } else { + requeue(t, edf); + /* maybe we have to reschedule */ + preempt(pedf); + } + raw_spin_unlock_irqrestore(&pedf->slock, flags); +} + +static void psnedf_task_wake_up(struct task_struct *task) +{ + unsigned long flags; + psnedf_domain_t* pedf = task_pedf(task); + rt_domain_t* edf = task_edf(task); + lt_t now; + + TRACE_TASK(task, "wake_up at %llu\n", litmus_clock()); + raw_spin_lock_irqsave(&pedf->slock, flags); + BUG_ON(is_queued(task)); + now = litmus_clock(); + if (is_tardy(task, now) +#ifdef CONFIG_LITMUS_LOCKING + /* We need to take suspensions because of semaphores into + * account! If a job resumes after being suspended due to acquiring + * a semaphore, it should never be treated as a new job release. + */ + && !is_priority_boosted(task) +#endif + ) { + /* new sporadic release */ + release_at(task, now); + sched_trace_task_release(task); + } + + /* Only add to ready queue if it is not the currently-scheduled + * task. This could be the case if a task was woken up concurrently + * on a remote CPU before the executing CPU got around to actually + * de-scheduling the task, i.e., wake_up() raced with schedule() + * and won. 
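 *
 * Concretely (editorial illustration of the race described above):
 *
 *      partition CPU:  T blocks and enters psnedf_schedule(), but has not
 *                      yet taken pedf->slock.
 *      remote CPU:     wakes T; psnedf_task_wake_up() wins the race for
 *                      pedf->slock and finds pedf->scheduled == T, so it
 *                      does not requeue T here.
 *      partition CPU:  psnedf_schedule() then sees T as running again
 *                      (blocks == 0) and keeps or requeues it itself.
 *
 * Requeueing T in the second step would have placed it on the ready
 * queue while it is still pedf->scheduled, violating the invariant that
 * the scheduled job is never queued.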
+ */ + if (pedf->scheduled != task) + requeue(task, edf); + + raw_spin_unlock_irqrestore(&pedf->slock, flags); + TRACE_TASK(task, "wake up done\n"); +} + +static void psnedf_task_block(struct task_struct *t) +{ + /* only running tasks can block, thus t is in no queue */ + TRACE_TASK(t, "block at %llu, state=%d\n", litmus_clock(), t->state); + + BUG_ON(!is_realtime(t)); + BUG_ON(is_queued(t)); +} + +static void psnedf_task_exit(struct task_struct * t) +{ + unsigned long flags; + psnedf_domain_t* pedf = task_pedf(t); + rt_domain_t* edf; + + raw_spin_lock_irqsave(&pedf->slock, flags); + if (is_queued(t)) { + /* dequeue */ + edf = task_edf(t); + remove(edf, t); + } + if (pedf->scheduled == t) + pedf->scheduled = NULL; + + TRACE_TASK(t, "RIP, now reschedule\n"); + + preempt(pedf); + raw_spin_unlock_irqrestore(&pedf->slock, flags); +} + +#ifdef CONFIG_LITMUS_LOCKING + +#include +#include + +/* ******************** SRP support ************************ */ + +static unsigned int psnedf_get_srp_prio(struct task_struct* t) +{ + /* assumes implicit deadlines */ + return get_rt_period(t); +} + +/* ******************** FMLP support ********************** */ + +/* struct for semaphore with priority inheritance */ +struct fmlp_semaphore { + struct litmus_lock litmus_lock; + + /* current resource holder */ + struct task_struct *owner; + + /* FIFO queue of waiting tasks */ + wait_queue_head_t wait; +}; + +static inline struct fmlp_semaphore* fmlp_from_lock(struct litmus_lock* lock) +{ + return container_of(lock, struct fmlp_semaphore, litmus_lock); +} +int psnedf_fmlp_lock(struct litmus_lock* l) +{ + struct task_struct* t = current; + struct fmlp_semaphore *sem = fmlp_from_lock(l); + wait_queue_t wait; + unsigned long flags; + + if (!is_realtime(t)) + return -EPERM; + + preempt_disable(); + + TRACE_CUR("want FMLP sem %p\n", sem); + + boost_priority(t); + + spin_lock_irqsave(&sem->wait.lock, flags); + + if (sem->owner) { + /* resource is not free => must suspend and wait */ + + init_waitqueue_entry(&wait, t); + + /* FIXME: interruptible would be nice some day */ + set_task_state(t, TASK_UNINTERRUPTIBLE); + + TRACE_CUR("blocking on FMLP sem %p\n", sem); + __add_wait_queue_tail_exclusive(&sem->wait, &wait); + + /* release lock before sleeping */ + spin_unlock_irqrestore(&sem->wait.lock, flags); + + /* We depend on the FIFO order. Thus, we don't need to recheck + * when we wake up; we are guaranteed to have the lock since + * there is only one wake up per release. + */ + + TS_LOCK_SUSPEND; + + preempt_enable_no_resched(); + + schedule(); + + preempt_disable(); + + TS_LOCK_RESUME; + + /* Since we hold the lock, no other task will change + * ->owner. We can thus check it without acquiring the spin + * lock. 
*/ + BUG_ON(sem->owner != t); + } else { + /* it's ours now */ + sem->owner = t; + + spin_unlock_irqrestore(&sem->wait.lock, flags); + } + + TRACE_CUR("got FMLP sem %p\n", sem); + + preempt_enable(); + + return 0; +} + +int psnedf_fmlp_unlock(struct litmus_lock* l) +{ + struct task_struct *t = current, *next; + struct fmlp_semaphore *sem = fmlp_from_lock(l); + unsigned long flags; + int err = 0; + + spin_lock_irqsave(&sem->wait.lock, flags); + + if (sem->owner != t) { + err = -EINVAL; + goto out; + } + + TRACE_CUR("releasing FMLP sem %p\n", sem); + + /* we lose the benefit of priority boosting */ + + unboost_priority(t); + + /* check if there are jobs waiting for this resource */ + next = __waitqueue_remove_first(&sem->wait); + if (next) { + /* next becomes the resouce holder */ + sem->owner = next; + + /* wake up next */ + wake_up_process(next); + } else + /* resource becomes available */ + sem->owner = NULL; + +out: + spin_unlock_irqrestore(&sem->wait.lock, flags); + return err; +} + +int psnedf_fmlp_close(struct litmus_lock* l) +{ + struct task_struct *t = current; + struct fmlp_semaphore *sem = fmlp_from_lock(l); + unsigned long flags; + + int owner; + + spin_lock_irqsave(&sem->wait.lock, flags); + + owner = sem->owner == t; + + spin_unlock_irqrestore(&sem->wait.lock, flags); + + if (owner) + psnedf_fmlp_unlock(l); + + return 0; +} + +void psnedf_fmlp_free(struct litmus_lock* lock) +{ + kfree(fmlp_from_lock(lock)); +} + +static struct litmus_lock_ops psnedf_fmlp_lock_ops = { + .close = psnedf_fmlp_close, + .lock = psnedf_fmlp_lock, + .unlock = psnedf_fmlp_unlock, + .deallocate = psnedf_fmlp_free, +}; + +static struct litmus_lock* psnedf_new_fmlp(void) +{ + struct fmlp_semaphore* sem; + + sem = kmalloc(sizeof(*sem), GFP_KERNEL); + if (!sem) + return NULL; + + sem->owner = NULL; + init_waitqueue_head(&sem->wait); + sem->litmus_lock.ops = &psnedf_fmlp_lock_ops; + + return &sem->litmus_lock; +} + + + +/* ******************** OMLP support **********************/ + +/* Since jobs spin "virtually" while waiting to acquire a lock, + * they first must aquire a local per-cpu resource. 
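 *
 * Acquisition is therefore two-staged (editorial outline of the code
 * below):
 *
 *      psnedf_omlp_lock():
 *              omlp_grab_token();    // per-CPU token; waiters ordered by
 *                                    // deadline, PID breaks ties
 *              boost_priority(t);
 *              // enqueue FIFO on sem->wait if sem->owner != NULL
 *
 *      psnedf_omlp_unlock():
 *              unboost_priority(t);
 *              // hand sem->owner to the first FIFO waiter, if any
 *              omlp_release_token();
 *
 * At most one job per CPU thus contends in the global FIFO queue of each
 * semaphore at any point in time.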
+ */ +static DEFINE_PER_CPU(wait_queue_head_t, omlp_token_wait); +static DEFINE_PER_CPU(struct task_struct*, omlp_token); + +/* called with preemptions off <=> no local modifications */ +static void omlp_grab_token(void) +{ + struct task_struct* t = current; + + while (1) { + if (__get_cpu_var(omlp_token) == NULL) { + /* take it */ + __get_cpu_var(omlp_token) = t; + break; + } else { + /* some job is spinning => enqueue in request queue */ + prio_wait_queue_t wait; + wait_queue_head_t* token_waiters = &__get_cpu_var(omlp_token_wait); + unsigned long flags; + + /* ordered by regular priority; break by lower PID */ + init_prio_waitqueue_entry_tie(&wait, t, get_deadline(t), t->pid); + + spin_lock_irqsave(&token_waiters->lock, flags); + + set_task_state(t, TASK_UNINTERRUPTIBLE); + + __add_wait_queue_prio_exclusive(token_waiters, &wait); + + TRACE_CUR("waiting for OMLP token\n"); + + spin_unlock_irqrestore(&token_waiters->lock, flags); + + TS_LOCK_SUSPEND; + + preempt_enable_no_resched(); + + schedule(); + + preempt_disable(); + + TS_LOCK_RESUME; + /* Recheck if we got it */ + } + } + /* ok, now it is ours */ + TRACE_CUR("got OMLP token\n"); +} + +/* called with preemptions off */ +static void omlp_release_token(void) +{ + struct task_struct* t = current, *next; + unsigned long flags; + wait_queue_head_t* token_waiters = &__get_cpu_var(omlp_token_wait); + + BUG_ON(__get_cpu_var(omlp_token) != t); + + __get_cpu_var(omlp_token) = NULL; + + TRACE_CUR("released OMLP token\n"); + + spin_lock_irqsave(&token_waiters->lock, flags); + next = __waitqueue_remove_first(token_waiters); + + if (next) + wake_up_process(next); + + spin_unlock_irqrestore(&token_waiters->lock, flags); +} + + +struct omlp_semaphore { + struct litmus_lock litmus_lock; + + /* current resource holder */ + struct task_struct *owner; + + /* FIFO queue of waiting tasks */ + wait_queue_head_t wait; +}; + +static inline struct omlp_semaphore* omlp_from_lock(struct litmus_lock* lock) +{ + return container_of(lock, struct omlp_semaphore, litmus_lock); +} +int psnedf_omlp_lock(struct litmus_lock* l) +{ + struct task_struct* t = current; + struct omlp_semaphore *sem = omlp_from_lock(l); + wait_queue_t wait; + unsigned long flags; + + if (!is_realtime(t)) + return -EPERM; + + preempt_disable(); + + omlp_grab_token(); + + /* Priority-boost ourself *before* we suspend so that + * our priority is boosted when we resume. */ + boost_priority(t); + + spin_lock_irqsave(&sem->wait.lock, flags); + + if (sem->owner) { + /* resource is not free => must suspend and wait */ + + init_waitqueue_entry(&wait, t); + + /* FIXME: interruptible would be nice some day */ + set_task_state(t, TASK_UNINTERRUPTIBLE); + + __add_wait_queue_tail_exclusive(&sem->wait, &wait); + + /* release lock before sleeping */ + spin_unlock_irqrestore(&sem->wait.lock, flags); + + /* We depend on the FIFO order. Thus, we don't need to recheck + * when we wake up; we are guaranteed to have the lock since + * there is only one wake up per release. + */ + TS_LOCK_SUSPEND; + + preempt_enable_no_resched(); + + schedule(); + + preempt_disable(); + + TS_LOCK_RESUME; + + /* Since we hold the lock, no other task will change + * ->owner. We can thus check it without acquiring the spin + * lock. 
*/ + BUG_ON(sem->owner != t); + } else { + /* it's ours now */ + sem->owner = t; + + spin_unlock_irqrestore(&sem->wait.lock, flags); + } + + preempt_enable(); + + return 0; +} + +int psnedf_omlp_unlock(struct litmus_lock* l) +{ + struct task_struct *t = current, *next; + struct omlp_semaphore *sem = omlp_from_lock(l); + unsigned long flags; + int err = 0; + + preempt_disable(); + + spin_lock_irqsave(&sem->wait.lock, flags); + + if (sem->owner != t) { + err = -EINVAL; + spin_unlock_irqrestore(&sem->wait.lock, flags); + goto out; + } + + /* we lose the benefit of priority boosting */ + + unboost_priority(t); + + /* check if there are jobs waiting for this resource */ + next = __waitqueue_remove_first(&sem->wait); + if (next) { + /* next becomes the resouce holder */ + sem->owner = next; + + /* Wake up next. The waiting job is already priority-boosted. */ + wake_up_process(next); + } else + /* resource becomes available */ + sem->owner = NULL; + + spin_unlock_irqrestore(&sem->wait.lock, flags); + + omlp_release_token(); + +out: + preempt_enable(); + return err; +} + +int psnedf_omlp_close(struct litmus_lock* l) +{ + struct task_struct *t = current; + struct omlp_semaphore *sem = omlp_from_lock(l); + unsigned long flags; + + int owner; + + spin_lock_irqsave(&sem->wait.lock, flags); + + owner = sem->owner == t; + + spin_unlock_irqrestore(&sem->wait.lock, flags); + + if (owner) + psnedf_omlp_unlock(l); + + return 0; +} + +void psnedf_omlp_free(struct litmus_lock* lock) +{ + kfree(omlp_from_lock(lock)); +} + +static struct litmus_lock_ops psnedf_omlp_lock_ops = { + .close = psnedf_omlp_close, + .lock = psnedf_omlp_lock, + .unlock = psnedf_omlp_unlock, + .deallocate = psnedf_omlp_free, +}; + +static struct litmus_lock* psnedf_new_omlp(void) +{ + struct omlp_semaphore* sem; + + sem = kmalloc(sizeof(*sem), GFP_KERNEL); + if (!sem) + return NULL; + + sem->owner = NULL; + init_waitqueue_head(&sem->wait); + sem->litmus_lock.ops = &psnedf_omlp_lock_ops; + + return &sem->litmus_lock; +} + + +/* **** lock constructor **** */ + + +static long psnedf_allocate_lock(struct litmus_lock **lock, int type, + void* __user unused) +{ + int err = -ENXIO; + struct srp_semaphore* srp; + + /* PSN-EDF currently supports the SRP for local resources and the FMLP + * for global resources. 
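 *
 * (The switch below also accepts OMLP_SEM; unlike the P-FP plugin above,
 * the configuration argument is unused here.)  From user space these
 * locks are obtained and used just like in the sketch given for
 * pfp_allocate_lock(), except that plain od_open() without a config
 * pointer suffices -- again assuming the liblitmus helper names, which
 * are not defined in this patch.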
*/ + switch (type) { + case FMLP_SEM: + /* Flexible Multiprocessor Locking Protocol */ + *lock = psnedf_new_fmlp(); + if (*lock) + err = 0; + else + err = -ENOMEM; + break; + + case OMLP_SEM: + /* O(m) Locking Protocol */ + *lock = psnedf_new_omlp(); + if (*lock) + err = 0; + else + err = -ENOMEM; + break; + + case SRP_SEM: + /* Baker's Stack Resource Policy */ + srp = allocate_srp_semaphore(); + if (srp) { + *lock = &srp->litmus_lock; + err = 0; + } else + err = -ENOMEM; + break; + }; + + return err; +} + +#endif + + +static long psnedf_activate_plugin(void) +{ + + int cpu; + + for_each_online_cpu(cpu) { +#ifdef CONFIG_RELEASE_MASTER + remote_edf(cpu)->release_master = atomic_read(&release_master_cpu); +#endif +#ifdef CONFIG_LITMUS_LOCKING + init_waitqueue_head(&per_cpu(omlp_token_wait, cpu)); + per_cpu(omlp_token, cpu) = NULL; +#endif + } + + +#ifdef CONFIG_LITMUS_LOCKING + get_srp_prio = psnedf_get_srp_prio; +#endif + + return 0; +} + +static long psnedf_admit_task(struct task_struct* tsk) +{ + if (task_cpu(tsk) == tsk->rt_param.task_params.cpu +#ifdef CONFIG_RELEASE_MASTER + /* don't allow tasks on release master CPU */ + && task_cpu(tsk) != remote_edf(task_cpu(tsk))->release_master +#endif + ) + return 0; + else + return -EINVAL; +} + +/* Plugin object */ +static struct sched_plugin psn_edf_plugin __cacheline_aligned_in_smp = { + .plugin_name = "PSN-EDF", + .tick = psnedf_tick, + .task_new = psnedf_task_new, + .complete_job = complete_job, + .task_exit = psnedf_task_exit, + .schedule = psnedf_schedule, + .task_wake_up = psnedf_task_wake_up, + .task_block = psnedf_task_block, + .admit_task = psnedf_admit_task, + .activate_plugin = psnedf_activate_plugin, +#ifdef CONFIG_LITMUS_LOCKING + .allocate_lock = psnedf_allocate_lock, +#endif +}; + + +static int __init init_psn_edf(void) +{ + int i; + + /* We do not really want to support cpu hotplug, do we? ;) + * However, if we are so crazy to do so, + * we cannot use num_online_cpu() + */ + for (i = 0; i < num_online_cpus(); i++) { + psnedf_domain_init(remote_pedf(i), + psnedf_check_resched, + NULL, i); + } + return register_sched_plugin(&psn_edf_plugin); +} + +module_init(init_psn_edf); + diff --git a/litmus/sched_task_trace.c b/litmus/sched_task_trace.c new file mode 100644 index 000000000000..5ef8d09ab41f --- /dev/null +++ b/litmus/sched_task_trace.c @@ -0,0 +1,241 @@ +/* + * sched_task_trace.c -- record scheduling events to a byte stream + */ + +#define NO_TASK_TRACE_DECLS + +#include +#include +#include + +#include +#include + +#include +#include +#include + + +#define NO_EVENTS (1 << CONFIG_SCHED_TASK_TRACE_SHIFT) + +#define now() litmus_clock() + +struct local_buffer { + struct st_event_record record[NO_EVENTS]; + char flag[NO_EVENTS]; + struct ft_buffer ftbuf; +}; + +DEFINE_PER_CPU(struct local_buffer, st_event_buffer); + +static struct ftdev st_dev; + +static int st_dev_can_open(struct ftdev *dev, unsigned int cpu) +{ + return cpu_online(cpu) ? 
0 : -ENODEV; +} + +static int __init init_sched_task_trace(void) +{ + struct local_buffer* buf; + int i, ok = 0, err; + printk("Allocated %u sched_trace_xxx() events per CPU " + "(buffer size: %d bytes)\n", + NO_EVENTS, (int) sizeof(struct local_buffer)); + + err = ftdev_init(&st_dev, THIS_MODULE, + num_online_cpus(), "sched_trace"); + if (err) + goto err_out; + + for (i = 0; i < st_dev.minor_cnt; i++) { + buf = &per_cpu(st_event_buffer, i); + ok += init_ft_buffer(&buf->ftbuf, NO_EVENTS, + sizeof(struct st_event_record), + buf->flag, + buf->record); + st_dev.minor[i].buf = &buf->ftbuf; + } + if (ok == st_dev.minor_cnt) { + st_dev.can_open = st_dev_can_open; + err = register_ftdev(&st_dev); + if (err) + goto err_dealloc; + } else { + err = -EINVAL; + goto err_dealloc; + } + + return 0; + +err_dealloc: + ftdev_exit(&st_dev); +err_out: + printk(KERN_WARNING "Could not register sched_trace module\n"); + return err; +} + +static void __exit exit_sched_task_trace(void) +{ + ftdev_exit(&st_dev); +} + +module_init(init_sched_task_trace); +module_exit(exit_sched_task_trace); + + +static inline struct st_event_record* get_record(u8 type, struct task_struct* t) +{ + struct st_event_record* rec = NULL; + struct local_buffer* buf; + + buf = &get_cpu_var(st_event_buffer); + if (ft_buffer_start_write(&buf->ftbuf, (void**) &rec)) { + rec->hdr.type = type; + rec->hdr.cpu = smp_processor_id(); + rec->hdr.pid = t ? t->pid : 0; + rec->hdr.job = t ? t->rt_param.job_params.job_no : 0; + } else { + put_cpu_var(st_event_buffer); + } + /* rec will be NULL if it failed */ + return rec; +} + +static inline void put_record(struct st_event_record* rec) +{ + struct local_buffer* buf; + buf = &__get_cpu_var(st_event_buffer); + ft_buffer_finish_write(&buf->ftbuf, rec); + put_cpu_var(st_event_buffer); +} + +feather_callback void do_sched_trace_task_name(unsigned long id, unsigned long _task) +{ + struct task_struct *t = (struct task_struct*) _task; + struct st_event_record* rec = get_record(ST_NAME, t); + int i; + if (rec) { + for (i = 0; i < min(TASK_COMM_LEN, ST_NAME_LEN); i++) + rec->data.name.cmd[i] = t->comm[i]; + put_record(rec); + } +} + +feather_callback void do_sched_trace_task_param(unsigned long id, unsigned long _task) +{ + struct task_struct *t = (struct task_struct*) _task; + struct st_event_record* rec = get_record(ST_PARAM, t); + if (rec) { + rec->data.param.wcet = get_exec_cost(t); + rec->data.param.period = get_rt_period(t); + rec->data.param.phase = get_rt_phase(t); + rec->data.param.partition = get_partition(t); + rec->data.param.class = get_class(t); + put_record(rec); + } +} + +feather_callback void do_sched_trace_task_release(unsigned long id, unsigned long _task) +{ + struct task_struct *t = (struct task_struct*) _task; + struct st_event_record* rec = get_record(ST_RELEASE, t); + if (rec) { + rec->data.release.release = get_release(t); + rec->data.release.deadline = get_deadline(t); + put_record(rec); + } +} + +/* skipped: st_assigned_data, we don't use it atm */ + +feather_callback void do_sched_trace_task_switch_to(unsigned long id, + unsigned long _task) +{ + struct task_struct *t = (struct task_struct*) _task; + struct st_event_record* rec; + if (is_realtime(t)) { + rec = get_record(ST_SWITCH_TO, t); + if (rec) { + rec->data.switch_to.when = now(); + rec->data.switch_to.exec_time = get_exec_time(t); + put_record(rec); + } + } +} + +feather_callback void do_sched_trace_task_switch_away(unsigned long id, + unsigned long _task) +{ + struct task_struct *t = (struct task_struct*) _task; + struct 
st_event_record* rec; + if (is_realtime(t)) { + rec = get_record(ST_SWITCH_AWAY, t); + if (rec) { + rec->data.switch_away.when = now(); + rec->data.switch_away.exec_time = get_exec_time(t); + put_record(rec); + } + } +} + +feather_callback void do_sched_trace_task_completion(unsigned long id, + unsigned long _task, + unsigned long forced) +{ + struct task_struct *t = (struct task_struct*) _task; + struct st_event_record* rec = get_record(ST_COMPLETION, t); + if (rec) { + rec->data.completion.when = now(); + rec->data.completion.forced = forced; + put_record(rec); + } +} + +feather_callback void do_sched_trace_task_block(unsigned long id, + unsigned long _task) +{ + struct task_struct *t = (struct task_struct*) _task; + struct st_event_record* rec = get_record(ST_BLOCK, t); + if (rec) { + rec->data.block.when = now(); + put_record(rec); + } +} + +feather_callback void do_sched_trace_task_resume(unsigned long id, + unsigned long _task) +{ + struct task_struct *t = (struct task_struct*) _task; + struct st_event_record* rec = get_record(ST_RESUME, t); + if (rec) { + rec->data.resume.when = now(); + put_record(rec); + } +} + +feather_callback void do_sched_trace_sys_release(unsigned long id, + unsigned long _start) +{ + lt_t *start = (lt_t*) _start; + struct st_event_record* rec = get_record(ST_SYS_RELEASE, NULL); + if (rec) { + rec->data.sys_release.when = now(); + rec->data.sys_release.release = *start; + put_record(rec); + } +} + +feather_callback void do_sched_trace_action(unsigned long id, + unsigned long _task, + unsigned long action) +{ + struct task_struct *t = (struct task_struct*) _task; + struct st_event_record* rec = get_record(ST_ACTION, t); + + if (rec) { + rec->data.action.when = now(); + rec->data.action.action = action; + put_record(rec); + } +} diff --git a/litmus/sched_trace.c b/litmus/sched_trace.c new file mode 100644 index 000000000000..f4171fddbbb1 --- /dev/null +++ b/litmus/sched_trace.c @@ -0,0 +1,252 @@ +/* + * sched_trace.c -- record scheduling events to a byte stream. + */ +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#define SCHED_TRACE_NAME "litmus/log" + +/* Compute size of TRACE() buffer */ +#define LITMUS_TRACE_BUF_SIZE (1 << CONFIG_SCHED_DEBUG_TRACE_SHIFT) + +/* Max length of one read from the buffer */ +#define MAX_READ_LEN (64 * 1024) + +/* Max length for one write --- by TRACE() --- to the buffer. This is used to + * allocate a per-cpu buffer for printf() formatting. */ +#define MSG_SIZE 255 + + +static DEFINE_MUTEX(reader_mutex); +static atomic_t reader_cnt = ATOMIC_INIT(0); +static DEFINE_KFIFO(debug_buffer, char, LITMUS_TRACE_BUF_SIZE); + + +static DEFINE_RAW_SPINLOCK(log_buffer_lock); +static DEFINE_PER_CPU(char[MSG_SIZE], fmt_buffer); + +/* + * sched_trace_log_message - Write to the trace buffer (log_buffer) + * + * This is the only function accessing the log_buffer from inside the + * kernel for writing. + * Concurrent access to sched_trace_log_message must be serialized using + * log_buffer_lock + * The maximum length of a formatted message is 255 + */ +void sched_trace_log_message(const char* fmt, ...) 
+{ + unsigned long flags; + va_list args; + size_t len; + char* buf; + + if (!atomic_read(&reader_cnt)) + /* early exit if nobody is listening */ + return; + + va_start(args, fmt); + local_irq_save(flags); + + /* format message */ + buf = __get_cpu_var(fmt_buffer); + len = vscnprintf(buf, MSG_SIZE, fmt, args); + + raw_spin_lock(&log_buffer_lock); + /* Don't copy the trailing null byte, we don't want null bytes in a + * text file. + */ + kfifo_in(&debug_buffer, buf, len); + raw_spin_unlock(&log_buffer_lock); + + local_irq_restore(flags); + va_end(args); +} + + +/* + * log_read - Read the trace buffer + * + * This function is called as a file operation from userspace. + * Readers can sleep. Access is serialized through reader_mutex + */ +static ssize_t log_read(struct file *filp, + char __user *to, size_t len, + loff_t *f_pos) +{ + /* we ignore f_pos, this is strictly sequential */ + + ssize_t error = -EINVAL; + char* mem; + + if (mutex_lock_interruptible(&reader_mutex)) { + error = -ERESTARTSYS; + goto out; + } + + if (len > MAX_READ_LEN) + len = MAX_READ_LEN; + + mem = kmalloc(len, GFP_KERNEL); + if (!mem) { + error = -ENOMEM; + goto out_unlock; + } + + error = kfifo_out(&debug_buffer, mem, len); + while (!error) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(110); + if (signal_pending(current)) + error = -ERESTARTSYS; + else + error = kfifo_out(&debug_buffer, mem, len); + } + + if (error > 0 && copy_to_user(to, mem, error)) + error = -EFAULT; + + kfree(mem); + out_unlock: + mutex_unlock(&reader_mutex); + out: + return error; +} + +/* + * Enable redirection of printk() messages to the trace buffer. + * Defined in kernel/printk.c + */ +extern int trace_override; +extern int trace_recurse; + +/* + * log_open - open the global log message ring buffer. + */ +static int log_open(struct inode *in, struct file *filp) +{ + int error = -EINVAL; + + if (mutex_lock_interruptible(&reader_mutex)) { + error = -ERESTARTSYS; + goto out; + } + + atomic_inc(&reader_cnt); + error = 0; + + printk(KERN_DEBUG + "sched_trace kfifo with buffer starting at: 0x%p\n", + debug_buffer.buf); + + /* override printk() */ + trace_override++; + + mutex_unlock(&reader_mutex); + out: + return error; +} + +static int log_release(struct inode *in, struct file *filp) +{ + int error = -EINVAL; + + if (mutex_lock_interruptible(&reader_mutex)) { + error = -ERESTARTSYS; + goto out; + } + + atomic_dec(&reader_cnt); + + /* release printk() overriding */ + trace_override--; + + printk(KERN_DEBUG "sched_trace kfifo released\n"); + + mutex_unlock(&reader_mutex); + out: + return error; +} + +/* + * log_fops - The file operations for accessing the global LITMUS log message + * buffer. + * + * Except for opening the device file it uses the same operations as trace_fops. + */ +static struct file_operations log_fops = { + .owner = THIS_MODULE, + .open = log_open, + .release = log_release, + .read = log_read, +}; + +static struct miscdevice litmus_log_dev = { + .name = SCHED_TRACE_NAME, + .minor = MISC_DYNAMIC_MINOR, + .fops = &log_fops, +}; + +#ifdef CONFIG_MAGIC_SYSRQ +void dump_trace_buffer(int max) +{ + char line[80]; + int len; + int count = 0; + + /* potential, but very unlikely, race... 
*/ + trace_recurse = 1; + while ((max == 0 || count++ < max) && + (len = kfifo_out(&debug_buffer, line, sizeof(line) - 1)) > 0) { + line[len] = '\0'; + printk("%s", line); + } + trace_recurse = 0; +} + +static void sysrq_dump_trace_buffer(int key) +{ + dump_trace_buffer(100); +} + +static struct sysrq_key_op sysrq_dump_trace_buffer_op = { + .handler = sysrq_dump_trace_buffer, + .help_msg = "dump-trace-buffer(Y)", + .action_msg = "writing content of TRACE() buffer", +}; +#endif + +static int __init init_sched_trace(void) +{ + printk("Initializing TRACE() device\n"); + +#ifdef CONFIG_MAGIC_SYSRQ + /* offer some debugging help */ + if (!register_sysrq_key('y', &sysrq_dump_trace_buffer_op)) + printk("Registered dump-trace-buffer(Y) magic sysrq.\n"); + else + printk("Could not register dump-trace-buffer(Y) magic sysrq.\n"); +#endif + + return misc_register(&litmus_log_dev); +} + +static void __exit exit_sched_trace(void) +{ + misc_deregister(&litmus_log_dev); +} + +module_init(init_sched_trace); +module_exit(exit_sched_trace); diff --git a/litmus/srp.c b/litmus/srp.c new file mode 100644 index 000000000000..2ed4ec12a9d3 --- /dev/null +++ b/litmus/srp.c @@ -0,0 +1,295 @@ +/* ************************************************************************** */ +/* STACK RESOURCE POLICY */ +/* ************************************************************************** */ + +#include +#include +#include + +#include +#include +#include +#include + + +#ifdef CONFIG_LITMUS_LOCKING + +#include + +srp_prioritization_t get_srp_prio; + +struct srp { + struct list_head ceiling; + wait_queue_head_t ceiling_blocked; +}; +#define system_ceiling(srp) list2prio(srp->ceiling.next) +#define ceiling2sem(c) container_of(c, struct srp_semaphore, ceiling) + +#define UNDEF_SEM -2 + +atomic_t srp_objects_in_use = ATOMIC_INIT(0); + +DEFINE_PER_CPU(struct srp, srp); + +/* Initialize SRP semaphores at boot time. */ +static int __init srp_init(void) +{ + int i; + + printk("Initializing SRP per-CPU ceilings..."); + for (i = 0; i < NR_CPUS; i++) { + init_waitqueue_head(&per_cpu(srp, i).ceiling_blocked); + INIT_LIST_HEAD(&per_cpu(srp, i).ceiling); + } + printk(" done!\n"); + + return 0; +} +module_init(srp_init); + +/* SRP task priority comparison function. Smaller numeric values have higher + * priority, tie-break is PID. Special case: priority == 0 <=> no priority + */ +static int srp_higher_prio(struct srp_priority* first, + struct srp_priority* second) +{ + if (!first->priority) + return 0; + else + return !second->priority || + first->priority < second->priority || ( + first->priority == second->priority && + first->pid < second->pid); +} + + +static int srp_exceeds_ceiling(struct task_struct* first, + struct srp* srp) +{ + struct srp_priority prio; + + if (list_empty(&srp->ceiling)) + return 1; + else { + prio.pid = first->pid; + prio.priority = get_srp_prio(first); + return srp_higher_prio(&prio, system_ceiling(srp)) || + ceiling2sem(system_ceiling(srp))->owner == first; + } +} + +static void srp_add_prio(struct srp* srp, struct srp_priority* prio) +{ + struct list_head *pos; + if (in_list(&prio->list)) { + printk(KERN_CRIT "WARNING: SRP violation detected, prio is already in " + "ceiling list!
cpu=%d, srp=%p\n", smp_processor_id(), ceiling2sem(prio)); + return; + } + list_for_each(pos, &srp->ceiling) + if (unlikely(srp_higher_prio(prio, list2prio(pos)))) { + __list_add(&prio->list, pos->prev, pos); + return; + } + + list_add_tail(&prio->list, &srp->ceiling); +} + + +static int lock_srp_semaphore(struct litmus_lock* l) +{ + struct srp_semaphore* sem = container_of(l, struct srp_semaphore, litmus_lock); + + if (!is_realtime(current)) + return -EPERM; + + preempt_disable(); + + /* Update ceiling. */ + srp_add_prio(&__get_cpu_var(srp), &sem->ceiling); + + /* SRP invariant: all resources available */ + BUG_ON(sem->owner != NULL); + + sem->owner = current; + TRACE_CUR("acquired srp 0x%p\n", sem); + + preempt_enable(); + + return 0; +} + +static int unlock_srp_semaphore(struct litmus_lock* l) +{ + struct srp_semaphore* sem = container_of(l, struct srp_semaphore, litmus_lock); + int err = 0; + + preempt_disable(); + + if (sem->owner != current) { + err = -EINVAL; + } else { + /* Determine new system priority ceiling for this CPU. */ + BUG_ON(!in_list(&sem->ceiling.list)); + + list_del(&sem->ceiling.list); + sem->owner = NULL; + + /* Wake tasks on this CPU, if they exceed current ceiling. */ + TRACE_CUR("released srp 0x%p\n", sem); + wake_up_all(&__get_cpu_var(srp).ceiling_blocked); + } + + preempt_enable(); + return err; +} + +static int open_srp_semaphore(struct litmus_lock* l, void* __user arg) +{ + struct srp_semaphore* sem = container_of(l, struct srp_semaphore, litmus_lock); + int err = 0; + struct task_struct* t = current; + struct srp_priority t_prio; + + if (!is_realtime(t)) + return -EPERM; + + TRACE_CUR("opening SRP semaphore %p, cpu=%d\n", sem, sem->cpu); + + preempt_disable(); + + if (sem->owner != NULL) + err = -EBUSY; + + if (err == 0) { + if (sem->cpu == UNDEF_SEM) + sem->cpu = get_partition(t); + else if (sem->cpu != get_partition(t)) + err = -EPERM; + } + + if (err == 0) { + t_prio.priority = get_srp_prio(t); + t_prio.pid = t->pid; + if (srp_higher_prio(&t_prio, &sem->ceiling)) { + sem->ceiling.priority = t_prio.priority; + sem->ceiling.pid = t_prio.pid; + } + } + + preempt_enable(); + + return err; +} + +static int close_srp_semaphore(struct litmus_lock* l) +{ + struct srp_semaphore* sem = container_of(l, struct srp_semaphore, litmus_lock); + int err = 0; + + preempt_disable(); + + if (sem->owner == current) + unlock_srp_semaphore(l); + + preempt_enable(); + + return err; +} + +static void deallocate_srp_semaphore(struct litmus_lock* l) +{ + struct srp_semaphore* sem = container_of(l, struct srp_semaphore, litmus_lock); + atomic_dec(&srp_objects_in_use); + kfree(sem); +} + +static struct litmus_lock_ops srp_lock_ops = { + .open = open_srp_semaphore, + .close = close_srp_semaphore, + .lock = lock_srp_semaphore, + .unlock = unlock_srp_semaphore, + .deallocate = deallocate_srp_semaphore, +}; + +struct srp_semaphore* allocate_srp_semaphore(void) +{ + struct srp_semaphore* sem; + + sem = kmalloc(sizeof(*sem), GFP_KERNEL); + if (!sem) + return NULL; + + INIT_LIST_HEAD(&sem->ceiling.list); + sem->ceiling.priority = 0; + sem->cpu = UNDEF_SEM; + sem->owner = NULL; + + sem->litmus_lock.ops = &srp_lock_ops; + + atomic_inc(&srp_objects_in_use); + return sem; +} + +static int srp_wake_up(wait_queue_t *wait, unsigned mode, int sync, + void *key) +{ + int cpu = smp_processor_id(); + struct task_struct *tsk = wait->private; + if (cpu != get_partition(tsk)) + TRACE_TASK(tsk, "srp_wake_up on wrong cpu, partition is %d\b", + get_partition(tsk)); + else if (srp_exceeds_ceiling(tsk, 
&__get_cpu_var(srp))) + return default_wake_function(wait, mode, sync, key); + return 0; +} + +static void do_ceiling_block(struct task_struct *tsk) +{ + wait_queue_t wait = { + .private = tsk, + .func = srp_wake_up, + .task_list = {NULL, NULL} + }; + + tsk->state = TASK_UNINTERRUPTIBLE; + add_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait); + tsk->rt_param.srp_non_recurse = 1; + preempt_enable_no_resched(); + schedule(); + preempt_disable(); + tsk->rt_param.srp_non_recurse = 0; + remove_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait); +} + +/* Wait for current task priority to exceed system-wide priority ceiling. + * FIXME: the hotpath should be inline. + */ +void srp_ceiling_block(void) +{ + struct task_struct *tsk = current; + + /* Only applies to real-time tasks, but optimize for RT tasks. */ + if (unlikely(!is_realtime(tsk))) + return; + + /* Avoid recursive ceiling blocking. */ + if (unlikely(tsk->rt_param.srp_non_recurse)) + return; + + /* Bail out early if there aren't any SRP resources around. */ + if (likely(!atomic_read(&srp_objects_in_use))) + return; + + preempt_disable(); + if (!srp_exceeds_ceiling(tsk, &__get_cpu_var(srp))) { + TRACE_CUR("is priority ceiling blocked.\n"); + while (!srp_exceeds_ceiling(tsk, &__get_cpu_var(srp))) + do_ceiling_block(tsk); + TRACE_CUR("finally exceeds system ceiling.\n"); + } else + TRACE_CUR("is not priority ceiling blocked\n"); + preempt_enable(); +} + +#endif diff --git a/litmus/sync.c b/litmus/sync.c new file mode 100644 index 000000000000..bf75fde5450b --- /dev/null +++ b/litmus/sync.c @@ -0,0 +1,104 @@ +/* litmus/sync.c - Support for synchronous and asynchronous task system releases. + * + * + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +static DECLARE_COMPLETION(ts_release); + +static long do_wait_for_ts_release(void) +{ + long ret = 0; + + /* If the interruption races with a release, the completion object + * may have a non-zero counter. To avoid this problem, this should + * be replaced by wait_for_completion(). + * + * For debugging purposes, this is interruptible for now. + */ + ret = wait_for_completion_interruptible(&ts_release); + + return ret; +} + +int count_tasks_waiting_for_release(void) +{ + unsigned long flags; + int task_count = 0; + struct list_head *pos; + + spin_lock_irqsave(&ts_release.wait.lock, flags); + list_for_each(pos, &ts_release.wait.task_list) { + task_count++; + } + spin_unlock_irqrestore(&ts_release.wait.lock, flags); + + return task_count; +} + +static long do_release_ts(lt_t start) +{ + int task_count = 0; + unsigned long flags; + struct list_head *pos; + struct task_struct *t; + + + spin_lock_irqsave(&ts_release.wait.lock, flags); + TRACE("<<<<<< synchronous task system release >>>>>>\n"); + + sched_trace_sys_release(&start); + list_for_each(pos, &ts_release.wait.task_list) { + t = (struct task_struct*) list_entry(pos, + struct __wait_queue, + task_list)->private; + task_count++; + litmus->release_at(t, start + t->rt_param.task_params.phase); + sched_trace_task_release(t); + } + + spin_unlock_irqrestore(&ts_release.wait.lock, flags); + + complete_n(&ts_release, task_count); + + return task_count; +} + + +asmlinkage long sys_wait_for_ts_release(void) +{ + long ret = -EPERM; + struct task_struct *t = current; + + if (is_realtime(t)) + ret = do_wait_for_ts_release(); + + return ret; +} + + +asmlinkage long sys_release_ts(lt_t __user *__delay) +{ + long ret; + lt_t delay; + + /* FIXME: check capabilities... 
*/ + + ret = copy_from_user(&delay, __delay, sizeof(delay)); + if (ret == 0) + ret = do_release_ts(litmus_clock() + delay); + + return ret; +} diff --git a/litmus/trace.c b/litmus/trace.c new file mode 100644 index 000000000000..39200c8ff74e --- /dev/null +++ b/litmus/trace.c @@ -0,0 +1,213 @@ +#include +#include +#include + +#include +#include +#include + +/******************************************************************************/ +/* Allocation */ +/******************************************************************************/ + +static struct ftdev overhead_dev; + +#define trace_ts_buf overhead_dev.minor[0].buf + +static unsigned int ts_seq_no = 0; + +static inline void __save_timestamp_cpu(unsigned long event, + uint8_t type, uint8_t cpu) +{ + unsigned int seq_no; + struct timestamp *ts; + seq_no = fetch_and_inc((int *) &ts_seq_no); + if (ft_buffer_start_write(trace_ts_buf, (void**) &ts)) { + ts->event = event; + ts->timestamp = ft_timestamp(); + ts->seq_no = seq_no; + ts->cpu = cpu; + ts->task_type = type; + ft_buffer_finish_write(trace_ts_buf, ts); + } +} + +static void __add_timestamp_user(struct timestamp *pre_recorded) +{ + unsigned int seq_no; + struct timestamp *ts; + seq_no = fetch_and_inc((int *) &ts_seq_no); + + if (ft_buffer_start_write(trace_ts_buf, (void**) &ts)) { + *ts = *pre_recorded; + ts->seq_no = seq_no; + ft_buffer_finish_write(trace_ts_buf, ts); + } +} + +static inline void __save_timestamp(unsigned long event, + uint8_t type) +{ + __save_timestamp_cpu(event, type, raw_smp_processor_id()); +} + +/* hack: fake timestamp to user-reported time, and record parts of the PID */ +feather_callback void save_timestamp_time(unsigned long event, unsigned long ptr) +{ + uint64_t* time = (uint64_t*) ptr; + unsigned int seq_no; + struct timestamp *ts; + seq_no = fetch_and_inc((int *) &ts_seq_no); + if (ft_buffer_start_write(trace_ts_buf, (void**) &ts)) { + ts->event = event; + ts->timestamp = *time; + ts->seq_no = seq_no; + /* type takes lowest byte of PID */ + ts->task_type = (uint8_t) current->pid; + /* cpu takes second-lowest byte of PID*/ + ts->cpu = (uint8_t) (current->pid >> 8); + + ft_buffer_finish_write(trace_ts_buf, ts); + } +} + +feather_callback void save_timestamp_pid(unsigned long event) +{ + /* Abuse existing fields to partially export PID. */ + __save_timestamp_cpu(event, + /* type takes lowest byte of PID */ + (uint8_t) current->pid, + /* cpu takes second-lowest byte of PID*/ + (uint8_t) (current->pid >> 8)); +} + +feather_callback void save_timestamp(unsigned long event) +{ + __save_timestamp(event, TSK_UNKNOWN); +} + +feather_callback void save_timestamp_def(unsigned long event, + unsigned long type) +{ + __save_timestamp(event, (uint8_t) type); +} + +feather_callback void save_timestamp_task(unsigned long event, + unsigned long t_ptr) +{ + int rt = is_realtime((struct task_struct *) t_ptr); + __save_timestamp(event, rt ? 
TSK_RT : TSK_BE); +} + +feather_callback void save_timestamp_cpu(unsigned long event, + unsigned long cpu) +{ + __save_timestamp_cpu(event, TSK_UNKNOWN, cpu); +} + +feather_callback void save_task_latency(unsigned long event, + unsigned long when_ptr) +{ + lt_t now = litmus_clock(); + lt_t *when = (lt_t*) when_ptr; + unsigned int seq_no; + int cpu = raw_smp_processor_id(); + struct timestamp *ts; + + seq_no = fetch_and_inc((int *) &ts_seq_no); + if (ft_buffer_start_write(trace_ts_buf, (void**) &ts)) { + ts->event = event; + ts->timestamp = now - *when; + ts->seq_no = seq_no; + ts->cpu = cpu; + ts->task_type = TSK_RT; + ft_buffer_finish_write(trace_ts_buf, ts); + } +} + +/******************************************************************************/ +/* DEVICE FILE DRIVER */ +/******************************************************************************/ + +/* + * should be 8M; it is the max we can ask to buddy system allocator (MAX_ORDER) + * and we might not get as much + */ +#define NO_TIMESTAMPS (2 << 16) + +static int alloc_timestamp_buffer(struct ftdev* ftdev, unsigned int idx) +{ + unsigned int count = NO_TIMESTAMPS; + while (count && !trace_ts_buf) { + printk("time stamp buffer: trying to allocate %u time stamps.\n", count); + ftdev->minor[idx].buf = alloc_ft_buffer(count, sizeof(struct timestamp)); + count /= 2; + } + return ftdev->minor[idx].buf ? 0 : -ENOMEM; +} + +static void free_timestamp_buffer(struct ftdev* ftdev, unsigned int idx) +{ + free_ft_buffer(ftdev->minor[idx].buf); + ftdev->minor[idx].buf = NULL; +} + +static ssize_t write_timestamp_from_user(struct ft_buffer* buf, size_t len, + const char __user *from) +{ + ssize_t consumed = 0; + struct timestamp ts; + + /* don't give us partial timestamps */ + if (len % sizeof(ts)) + return -EINVAL; + + while (len >= sizeof(ts)) { + if (copy_from_user(&ts, from, sizeof(ts))) { + consumed = -EFAULT; + goto out; + } + len -= sizeof(ts); + from += sizeof(ts); + consumed += sizeof(ts); + + __add_timestamp_user(&ts); + } + +out: + return consumed; +} + +static int __init init_ft_overhead_trace(void) +{ + int err; + + printk("Initializing Feather-Trace overhead tracing device.\n"); + err = ftdev_init(&overhead_dev, THIS_MODULE, 1, "ft_trace"); + if (err) + goto err_out; + + overhead_dev.alloc = alloc_timestamp_buffer; + overhead_dev.free = free_timestamp_buffer; + overhead_dev.write = write_timestamp_from_user; + + err = register_ftdev(&overhead_dev); + if (err) + goto err_dealloc; + + return 0; + +err_dealloc: + ftdev_exit(&overhead_dev); +err_out: + printk(KERN_WARNING "Could not register ft_trace module.\n"); + return err; +} + +static void __exit exit_ft_overhead_trace(void) +{ + ftdev_exit(&overhead_dev); +} + +module_init(init_ft_overhead_trace); +module_exit(exit_ft_overhead_trace); -- cgit v1.2.2