From 9aaa23e28a41fb579ef33ecb845e22cd717195c9 Mon Sep 17 00:00:00 2001 From: "Bjoern B. Brandenburg" Date: Mon, 29 Oct 2007 04:30:48 -0400 Subject: Added LITMUS release 2007.2. Also some text changes. --- download/MD5SUM | 3 + download/liblitmus-2007.2.tgz | Bin 0 -> 10825 bytes download/libso-2007.2.tgz | Bin 0 -> 15836 bytes download/litmus-rt-2007.2.patch | 12100 ++++++++++++++++++++++++++++++++++++++ index.html | 83 +- 5 files changed, 12166 insertions(+), 20 deletions(-) create mode 100644 download/liblitmus-2007.2.tgz create mode 100644 download/libso-2007.2.tgz create mode 100644 download/litmus-rt-2007.2.patch diff --git a/download/MD5SUM b/download/MD5SUM index 4d34aa9..4876c6d 100644 --- a/download/MD5SUM +++ b/download/MD5SUM @@ -1,3 +1,6 @@ 991469b3a8c9b6a0caa4cedfb663e9be liblitmus-2007.1.tgz +eddf0c80b0942f792ad8323cb62c9234 liblitmus-2007.2.tgz 6a80c8bb52af8f38dc1bbd874fa2e44f libso-2007.1.tgz +f3cb1e78f38dd22c4cca84a03fab3bbd libso-2007.2.tgz c6ef29d2e198c2fbc08e47d6f2f404bb litmus-rt-2007.1.patch +f4a1888b942a82ccce9daa55fce98202 litmus-rt-2007.2.patch diff --git a/download/liblitmus-2007.2.tgz b/download/liblitmus-2007.2.tgz new file mode 100644 index 0000000..616f345 Binary files /dev/null and b/download/liblitmus-2007.2.tgz differ diff --git a/download/libso-2007.2.tgz b/download/libso-2007.2.tgz new file mode 100644 index 0000000..394665f Binary files /dev/null and b/download/libso-2007.2.tgz differ diff --git a/download/litmus-rt-2007.2.patch b/download/litmus-rt-2007.2.patch new file mode 100644 index 0000000..deea27d --- /dev/null +++ b/download/litmus-rt-2007.2.patch @@ -0,0 +1,12100 @@ +diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig +index 0dfee81..da6f1e9 100644 +--- a/arch/i386/Kconfig ++++ b/arch/i386/Kconfig +@@ -1210,6 +1210,7 @@ config KPROBES + a probepoint and specifies the callback. Kprobes is useful + for kernel debugging, non-intrusive instrumentation and testing. + If in doubt, say "N". ++ + endmenu + + source "arch/i386/Kconfig.debug" +@@ -1259,3 +1260,30 @@ config X86_TRAMPOLINE + config KTIME_SCALAR + bool + default y ++ ++ ++menu "LITMUS^RT" ++ ++ ++config SCHED_TASK_TRACE ++ bool "Trace real-time tasks" ++ default y ++ help ++ Include support for the sched_trace_XXX() tracing functions. This ++ allows the collection of real-time task events such as job ++ completions, job releases, early completions, etc. This results in a ++ small overhead in the scheduling code. Disable if the overhead is not ++ acceptable (e.g., benchmarking). ++ ++config SCHED_DEBUG_TRACE ++ bool "TRACE() debugging" ++ default y ++ help ++ Include support for sched_trace_log_messageg(), which is used to ++ implement TRACE(). If disabled, no TRACE() messages will be included ++ in the kernel, and no overheads due to debugging statements will be ++ incurred by the scheduler. Disable if the overhead is not acceptable ++ (e.g. benchmarking). 
++ ++ ++endmenu +diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c +index 776d9be..2e8909f 100644 +--- a/arch/i386/kernel/apic.c ++++ b/arch/i386/kernel/apic.c +@@ -26,6 +26,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -43,6 +44,8 @@ + + #include "io_ports.h" + ++#include ++ + /* + * cpu_mask that denotes the CPUs that needs timer interrupt coming in as + * IPIs in place of local APIC timers +@@ -54,6 +57,15 @@ static cpumask_t timer_bcast_ipi; + */ + static int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */ + ++/* ++ * Definitions and variables related to quantum synchronization. ++ */ ++#define WAIT_TO_SYNC 30000 /* time after boot until sync */ ++static int stagger = 0; /* are we using staggered quanta? */ ++static atomic_t qsync_time = ATOMIC_INIT(INITIAL_JIFFIES); ++static atomic_t quantum_sync_barrier = ATOMIC_INIT(0); ++static atomic_t sync_done = ATOMIC_INIT(0); ++ + static inline void lapic_disable(void) + { + enable_local_apic = -1; +@@ -786,6 +798,23 @@ static int __init apic_set_verbosity(char *str) + + __setup("apic=", apic_set_verbosity); + ++/* ++ * Determine whether to use aligned or staggerd quanta. ++ */ ++ ++static int __init apic_synch_type(char *str) ++{ ++ if (strcmp("aligned", str) == 0) ++ stagger = 0; ++ else if (strcmp("staggered", str) == 0) ++ stagger = 1; ++ else ++ stagger = 0; /* aligned quanta by default */ ++ return 1; ++} ++ ++__setup("quanta=", apic_synch_type); ++ + static int __init detect_init_APIC (void) + { + u32 h, l, features; +@@ -1198,6 +1227,47 @@ EXPORT_SYMBOL(switch_ipi_to_APIC_timer); + #undef APIC_DIVISOR + + /* ++ * This function is called to align all quanta, and to stagger quanta if ++ * necessary. It relies on a barrier to synchronize all processors, so ++ * that they all reset their APIC timers at the same time. If quanta ++ * should be staggered, the appropriate stagger delay is then added at ++ * each processor. ++ */ ++ ++void synchronize_quanta(void) ++{ ++ int cpu = smp_processor_id(); ++ int total_cpus = num_online_cpus(); ++ int stagger_interval = jiffies_to_usecs(1) / total_cpus; ++ ++ /* ++ * Disable APIC timer, wait for all other processors to reach barrier, ++ * and re-enable all timers concurrently. ++ */ ++ disable_APIC_timer(); ++ atomic_inc(&quantum_sync_barrier); ++ while (atomic_read(&quantum_sync_barrier) < total_cpus) { ++ /* Delay, otherwise atomic_inc's cannot occur. */ ++ udelay(1); ++ } ++ ++ /* Add necessary stagger for this CPU, if required. */ ++ if (stagger) { ++ int stagger_us = cpu * stagger_interval; ++ udelay(stagger_us); ++ } ++ ++ /* Re-enable all timers. */ ++ __setup_APIC_LVTT(calibration_result); ++ enable_APIC_timer(); ++ ++ /* The first CPU signals that quantum sync is complete. */ ++ if (cpu == 0) ++ atomic_inc(&sync_done); ++} ++ ++ ++/* + * Local timer interrupt handler. It does both profiling and + * process statistics/rescheduling. + * +@@ -1209,11 +1279,32 @@ EXPORT_SYMBOL(switch_ipi_to_APIC_timer); + + inline void smp_local_timer_interrupt(void) + { ++/* s64 offset; */ ++ ++ TS_TICK_START; ++ + profile_tick(CPU_PROFILING); + #ifdef CONFIG_SMP + update_process_times(user_mode_vm(get_irq_regs())); + #endif + ++ /* Print out timing data - can be commented out if necessary. */ ++/* offset = get_nsec_offset(); */ ++/* TRACE("%d\n", offset); */ ++ ++ /* ++ * Synchronize quanta if we have reached qsync_time plus wait ++ * interval. 
The synchronization code itself is placed in its own ++ * (non-inline) function, to avoid issues with creating an inline ++ * function that is too large. ++ */ ++ if (unlikely(!atomic_read(&sync_done) && ++ time_after(jiffies, ++ (unsigned long)(atomic_read(&qsync_time) + ++ msecs_to_jiffies(WAIT_TO_SYNC))))) { ++ synchronize_quanta(); ++ } ++ + /* + * We take the 'long' return path, and there every subsystem + * grabs the apropriate locks (kernel lock/ irq lock). +@@ -1224,6 +1315,7 @@ inline void smp_local_timer_interrupt(void) + * Currently this isn't too much of an issue (performance wise), + * we can take more than 100K local irqs per second on a 100 MHz P5. + */ ++ TS_TICK_END; + } + + /* +diff --git a/arch/i386/kernel/i386_ksyms.c b/arch/i386/kernel/i386_ksyms.c +index e3d4b73..9670f77 100644 +--- a/arch/i386/kernel/i386_ksyms.c ++++ b/arch/i386/kernel/i386_ksyms.c +@@ -6,6 +6,7 @@ EXPORT_SYMBOL(__down_failed); + EXPORT_SYMBOL(__down_failed_interruptible); + EXPORT_SYMBOL(__down_failed_trylock); + EXPORT_SYMBOL(__up_wakeup); ++ + /* Networking helper routines. */ + EXPORT_SYMBOL(csum_partial_copy_generic); + +diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S +index 2697e92..9a633ea 100644 +--- a/arch/i386/kernel/syscall_table.S ++++ b/arch/i386/kernel/syscall_table.S +@@ -319,3 +319,32 @@ ENTRY(sys_call_table) + .long sys_move_pages + .long sys_getcpu + .long sys_epoll_pwait ++ /* LITMUS syscalls */ ++ .long sys_sched_setpolicy /* 320 */ ++ .long sys_sched_getpolicy ++ .long sys_set_rt_mode ++ .long sys_set_rt_task_param ++ .long sys_get_rt_task_param ++ .long sys_prepare_rt_task /* 325 */ ++ .long sys_ni_syscall /* CLEANUP: sys_reset_stat */ ++ .long sys_sleep_next_period ++ .long sys_scheduler_setup ++ .long sys_register_np_flag ++ .long sys_exit_np /* 330 */ ++ .long sys_pi_sema_init ++ .long sys_pi_down ++ .long sys_pi_up ++ .long sys_pi_sema_free ++ .long sys_sema_init /* 335 */ ++ .long sys_down ++ .long sys_up ++ .long sys_sema_free ++ .long sys_srp_sema_init ++ .long sys_srp_down /* 340 */ ++ .long sys_srp_up ++ .long sys_reg_task_srp_sem ++ .long sys_srp_sema_free ++ .long sys_query_job_no ++ .long sys_wait_for_job_release /* 345 */ ++ .long sys_set_service_levels ++ .long sys_get_cur_service_level +\ No newline at end of file +diff --git a/include/asm-i386/semaphore.h b/include/asm-i386/semaphore.h +index 4e34a46..7212f4b 100644 +--- a/include/asm-i386/semaphore.h ++++ b/include/asm-i386/semaphore.h +@@ -45,6 +45,7 @@ struct semaphore { + atomic_t count; + int sleepers; + wait_queue_head_t wait; ++ int used; /* allows semaphores to allocated to user space processes */ + }; + + +diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h +index 833fa17..8a5d47c 100644 +--- a/include/asm-i386/unistd.h ++++ b/include/asm-i386/unistd.h +@@ -325,10 +325,40 @@ + #define __NR_move_pages 317 + #define __NR_getcpu 318 + #define __NR_epoll_pwait 319 ++/* LITMUS */ ++#define __NR_sched_setpolicy 320 ++#define __NR_sched_getpolicy 321 ++/* Syscall definitions for mode change and task creation-manipulation */ ++#define __NR_set_rt_mode 322 ++#define __NR_set_rt_task_param 323 ++#define __NR_get_rt_task_param 324 ++#define __NR_prepare_rt_task 325 ++#define __NR_reset_stat 326 ++#define __NR_sleep_next_period 327 ++#define __NR_scheduler_setup 328 ++#define __NR_enter_np 329 ++#define __NR_exit_np 330 ++#define __NR_pi_sema_init 331 ++#define __NR_pi_down 332 ++#define __NR_pi_up 333 ++#define __NR_pi_sema_free 334 ++#define __NR_sema_init 335 
++#define __NR_down 336 ++#define __NR_up 337 ++#define __NR_sema_free 338 ++#define __NR_srp_sema_init 339 ++#define __NR_srp_down 340 ++#define __NR_srp_up 341 ++#define __NR_reg_task_srp_sem 342 ++#define __NR_srp_sema_free 343 ++#define __NR_query_job_no 344 ++#define __NR_wait_for_job_release 345 ++#define __NR_set_service_levels 346 ++#define __NR_get_cur_service_level 347 + + #ifdef __KERNEL__ + +-#define NR_syscalls 320 ++#define NR_syscalls 343 + + #define __ARCH_WANT_IPC_PARSE_VERSION + #define __ARCH_WANT_OLD_READDIR +diff --git a/include/linux/edf_common.h b/include/linux/edf_common.h +new file mode 100644 +index 0000000..f940308 +--- /dev/null ++++ b/include/linux/edf_common.h +@@ -0,0 +1,36 @@ ++/* EDF common data structures and utility functions shared by all EDF ++ * based scheduler plugins ++ */ ++ ++/* CLEANUP: Add comments and make it less messy. ++ * ++ */ ++ ++#ifndef __UNC_EDF_COMMON_H__ ++#define __UNC_EDF_COMMON_H__ ++ ++#include ++ ++ ++void edf_domain_init(rt_domain_t* rt, check_resched_needed_t resched); ++ ++int edf_higher_prio(struct task_struct* first, ++ struct task_struct* second); ++ ++int edf_ready_order(struct list_head* a, struct list_head* b); ++ ++void edf_release_at(struct task_struct *t, jiffie_t start); ++#define edf_release_now(t) edf_release_at(t, jiffies) ++ ++int edf_preemption_needed(rt_domain_t* rt, struct task_struct *t); ++long edf_sleep_next_period(void); ++ ++void edf_prepare_for_next_period(struct task_struct *t); ++ ++#define job_completed(t) (!is_be(t) && \ ++ (t)->rt_param.times.exec_time == (t)->rt_param.basic_params.exec_cost) ++ ++int edf_set_hp_task(struct pi_semaphore *sem); ++int edf_set_hp_cpu_task(struct pi_semaphore *sem, int cpu); ++ ++#endif +diff --git a/include/linux/feather_buffer.h b/include/linux/feather_buffer.h +new file mode 100644 +index 0000000..c477772 +--- /dev/null ++++ b/include/linux/feather_buffer.h +@@ -0,0 +1,108 @@ ++#ifndef _FEATHER_BUFFER_H_ ++#define _FEATHER_BUFFER_H_ ++ ++/* requires UINT_MAX and memcpy */ ++ ++static inline int fetch_and_inc(int *val) ++{ ++ int ret = 1; ++ __asm__ __volatile__("lock; xaddl %0, %1" : "+r" (ret), "+m" (*val) : : "memory" ); ++ return ret; ++} ++ ++static inline int fetch_and_dec(int *val) ++{ ++ int ret = -1; ++ __asm__ __volatile__("lock; xaddl %0, %1" : "+r" (ret), "+m" (*val) : : "memory" ); ++ return ret; ++} ++ ++#define SLOT_FREE 0 ++#define SLOT_BUSY 1 ++#define SLOT_READY 2 ++ ++struct ft_buffer { ++ unsigned int slot_count; ++ unsigned int slot_size; ++ ++ int free_count; ++ unsigned int write_idx; ++ unsigned int read_idx; ++ ++ char* slots; ++ void* buffer_mem; ++ unsigned int failed_writes; ++}; ++ ++static inline int init_ft_buffer(struct ft_buffer* buf, ++ unsigned int slot_count, ++ unsigned int slot_size, ++ char* slots, ++ void* buffer_mem) ++{ ++ int i = 0; ++ if (!slot_count || UINT_MAX % slot_count != slot_count - 1) { ++ /* The slot count must divide UNIT_MAX + 1 so that when it ++ * wraps around the index correctly points to 0. 
++ */ ++ return 0; ++ } else { ++ buf->slot_count = slot_count; ++ buf->slot_size = slot_size; ++ buf->slots = slots; ++ buf->buffer_mem = buffer_mem; ++ buf->free_count = slot_count; ++ buf->write_idx = 0; ++ buf->read_idx = 0; ++ buf->failed_writes = 0; ++ for (i = 0; i < slot_count; i++) ++ buf->slots[i] = SLOT_FREE; ++ return 1; ++ } ++} ++ ++static inline int ft_buffer_start_write(struct ft_buffer* buf, void **ptr) ++{ ++ int free = fetch_and_dec(&buf->free_count); ++ unsigned int idx; ++ if (free <= 0) { ++ fetch_and_inc(&buf->free_count); ++ *ptr = 0; ++ fetch_and_inc(&buf->failed_writes); ++ return 0; ++ } else { ++ idx = fetch_and_inc((int*) &buf->write_idx) % buf->slot_count; ++ buf->slots[idx] = SLOT_BUSY; ++ *ptr = ((char*) buf->buffer_mem) + idx * buf->slot_size; ++ return 1; ++ } ++} ++ ++static inline void ft_buffer_finish_write(struct ft_buffer* buf, void *ptr) ++{ ++ unsigned int idx = ((char*) ptr - (char*) buf->buffer_mem) / buf->slot_size; ++ buf->slots[idx] = SLOT_READY; ++} ++ ++ ++/* exclusive reader access is assumed */ ++static inline int ft_buffer_read(struct ft_buffer* buf, void* dest) ++{ ++ unsigned int idx; ++ if (buf->free_count == buf->slot_count) ++ /* nothing available */ ++ return 0; ++ idx = buf->read_idx % buf->slot_count; ++ if (buf->slots[idx] == SLOT_READY) { ++ memcpy(dest, ((char*) buf->buffer_mem) + idx * buf->slot_size, ++ buf->slot_size); ++ buf->slots[idx] = SLOT_FREE; ++ buf->read_idx++; ++ fetch_and_inc(&buf->free_count); ++ return 1; ++ } else ++ return 0; ++} ++ ++ ++#endif +diff --git a/include/linux/feather_trace.h b/include/linux/feather_trace.h +new file mode 100644 +index 0000000..57a21a5 +--- /dev/null ++++ b/include/linux/feather_trace.h +@@ -0,0 +1,93 @@ ++#ifndef _FEATHER_TRACE_H_ ++#define _FEATHER_TRACE_H_ ++ ++#define feather_callback __attribute__((regparm(0))) ++ ++/* make the compiler reload any register that is not saved in ++ * a cdecl function call ++ */ ++#define CLOBBER_LIST "memory", "cc", "eax", "ecx", "edx" ++ ++#define ft_event(id, callback) \ ++ __asm__ __volatile__( \ ++ "1: jmp 2f \n\t" \ ++ " call " #callback " \n\t" \ ++ ".section __event_table, \"aw\" \n\t" \ ++ ".long " #id ", 0, 1b, 2f \n\t" \ ++ ".previous \n\t" \ ++ "2: \n\t" \ ++ : : : CLOBBER_LIST) ++ ++#define ft_event0(id, callback) \ ++ __asm__ __volatile__( \ ++ "1: jmp 2f \n\t" \ ++ " subl $4, %%esp \n\t" \ ++ " movl $" #id ", (%%esp) \n\t" \ ++ " call " #callback " \n\t" \ ++ " addl $4, %%esp \n\t" \ ++ ".section __event_table, \"aw\" \n\t" \ ++ ".long " #id ", 0, 1b, 2f \n\t" \ ++ ".previous \n\t" \ ++ "2: \n\t" \ ++ : : : CLOBBER_LIST) ++ ++#define ft_event1(id, callback, param) \ ++ __asm__ __volatile__( \ ++ "1: jmp 2f \n\t" \ ++ " subl $8, %%esp \n\t" \ ++ " movl %0, 4(%%esp) \n\t" \ ++ " movl $" #id ", (%%esp) \n\t" \ ++ " call " #callback " \n\t" \ ++ " addl $8, %%esp \n\t" \ ++ ".section __event_table, \"aw\" \n\t" \ ++ ".long " #id ", 0, 1b, 2f \n\t" \ ++ ".previous \n\t" \ ++ "2: \n\t" \ ++ : : "r" (param) : CLOBBER_LIST) ++ ++#define ft_event2(id, callback, param, param2) \ ++ __asm__ __volatile__( \ ++ "1: jmp 2f \n\t" \ ++ " subl $12, %%esp \n\t" \ ++ " movl %1, 8(%%esp) \n\t" \ ++ " movl %0, 4(%%esp) \n\t" \ ++ " movl $" #id ", (%%esp) \n\t" \ ++ " call " #callback " \n\t" \ ++ " addl $12, %%esp \n\t" \ ++ ".section __event_table, \"aw\" \n\t" \ ++ ".long " #id ", 0, 1b, 2f \n\t" \ ++ ".previous \n\t" \ ++ "2: \n\t" \ ++ : : "r" (param), "r" (param2) : CLOBBER_LIST) ++ ++ ++#define ft_event3(id, callback, p, p2, p3) \ ++ __asm__ 
__volatile__( \ ++ "1: jmp 2f \n\t" \ ++ " subl $16, %%esp \n\t" \ ++ " movl %1, 12(%%esp) \n\t" \ ++ " movl %1, 8(%%esp) \n\t" \ ++ " movl %0, 4(%%esp) \n\t" \ ++ " movl $" #id ", (%%esp) \n\t" \ ++ " call " #callback " \n\t" \ ++ " addl $16, %%esp \n\t" \ ++ ".section __event_table, \"aw\" \n\t" \ ++ ".long " #id ", 0, 1b, 2f \n\t" \ ++ ".previous \n\t" \ ++ "2: \n\t" \ ++ : : "r" (p), "r" (p2), "r" (p3) : CLOBBER_LIST) ++ ++ ++static inline unsigned long long ft_read_tsc(void) ++{ ++ unsigned long long ret; ++ __asm__ __volatile__("rdtsc" : "=A" (ret)); ++ return ret; ++} ++ ++int ft_enable_event(unsigned long id); ++int ft_disable_event(unsigned long id); ++int ft_is_event_enabled(unsigned long id); ++int ft_disable_all_events(void); ++ ++#endif +diff --git a/include/linux/fifo_common.h b/include/linux/fifo_common.h +new file mode 100644 +index 0000000..0883226 +--- /dev/null ++++ b/include/linux/fifo_common.h +@@ -0,0 +1,18 @@ ++/* FIFO common definitions and utility functions. ++ */ ++#ifndef __UNC_SCHED_FIFO_H__ ++#define __UNC_SCHED_FIFO_H__ ++ ++#include ++ ++ ++int fifo_higher_prio(struct task_struct* first, ++ struct task_struct* second); ++ ++int fifo_ready_order(struct list_head* a, struct list_head* b); ++ ++ ++void fifo_domain_init(rt_domain_t* fifo, check_resched_needed_t resched); ++ ++ ++#endif +diff --git a/include/linux/fpmath.h b/include/linux/fpmath.h +new file mode 100644 +index 0000000..a15c239 +--- /dev/null ++++ b/include/linux/fpmath.h +@@ -0,0 +1,111 @@ ++#ifndef __FP_MATH_H__ ++#define __FP_MATH_H__ ++ ++#define FP_SHIFT 10 ++#define ROUND_BIT (FP_SHIFT - 1) ++#define ONE FP(1) ++ ++#define _fp(x) ((fp_t) {x}) ++ ++static inline long _point(fp_t x) ++{ ++ return (x.val % (1 << FP_SHIFT)); ++ ++} ++ ++#define fp2str(x) x.val ++/*(x.val >> FP_SHIFT), (x.val % (1 << FP_SHIFT)) */ ++#define _FP_ "%ld/1024" ++ ++ ++static inline fp_t FP(long x) ++{ ++ return _fp(((long) x) << FP_SHIFT); ++} ++ ++static inline long _floor(fp_t x) ++{ ++ return x.val >> FP_SHIFT; ++} ++ ++/* FIXME: negative rounding */ ++static inline long _round(fp_t x) ++{ ++ return _floor(x) + ((x.val >> ROUND_BIT) & 1); ++} ++ ++/* divide two integers to obtain a fixed point value */ ++static inline fp_t _frac(long a, long b) ++{ ++ return _fp(FP(a).val / (b)); ++} ++ ++/* multiply two fixed point values */ ++static inline fp_t _mul(fp_t a, fp_t b) ++{ ++ return _fp((a.val * b.val) >> FP_SHIFT); ++} ++ ++static inline fp_t _div(fp_t a, fp_t b) ++{ ++ /* try not to overflow */ ++ if (unlikely(a.val > 2 << (BITS_PER_LONG - FP_SHIFT))) ++ return _fp((a.val / b.val) << FP_SHIFT); ++ else ++ return _fp((a.val << FP_SHIFT) / b.val); ++} ++ ++static inline fp_t _add(fp_t a, fp_t b) ++{ ++ return _fp(a.val + b.val); ++} ++ ++static inline fp_t _sub(fp_t a, fp_t b) ++{ ++ return _fp(a.val - b.val); ++} ++ ++static inline fp_t _neg(fp_t x) ++{ ++ return _fp(-x.val); ++} ++ ++static inline fp_t _abs(fp_t x) ++{ ++ return _fp(abs(x.val)); ++} ++ ++static inline int _leq(fp_t a, fp_t b) ++{ ++ return a.val <= b.val; ++} ++ ++static inline int _geq(fp_t a, fp_t b) ++{ ++ return a.val >= b.val; ++} ++ ++static inline int _lt(fp_t a, fp_t b) ++{ ++ return a.val < b.val; ++} ++ ++static inline int _gt(fp_t a, fp_t b) ++{ ++ return a.val > b.val; ++} ++ ++static inline int _eq(fp_t a, fp_t b) ++{ ++ return a.val == b.val; ++} ++ ++static inline fp_t _max(fp_t a, fp_t b) ++{ ++ if (a.val < b.val) ++ return b; ++ else ++ return a; ++} ++ ++#endif +diff --git a/include/linux/list.h b/include/linux/list.h +index 
611059d..319c5ed 100644 +--- a/include/linux/list.h ++++ b/include/linux/list.h +@@ -898,6 +898,36 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev, + ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \ + pos = pos->next) + ++ ++typedef int (*list_cmp_t)(struct list_head*, struct list_head*); ++ ++static inline unsigned int list_insert(struct list_head* new, ++ struct list_head* head, ++ list_cmp_t order_before) ++{ ++ struct list_head *pos; ++ unsigned int passed = 0; ++ ++ BUG_ON(!new); ++ ++ /* find a spot where the new entry is less than the next */ ++ list_for_each(pos, head) { ++ if (unlikely(order_before(new, pos))) { ++ /* pos is not less than new, thus insert here */ ++ __list_add(new, pos->prev, pos); ++ goto out; ++ } ++ passed++; ++ } ++ /* if we get to this point either the list is empty or every entry ++ * queued element is less than new. ++ * Let's add new to the end. */ ++ list_add_tail(new, head); ++ out: ++ return passed; ++} ++ ++ + #else + #warning "don't include kernel headers in userspace" + #endif /* __KERNEL__ */ +diff --git a/include/linux/litmus.h b/include/linux/litmus.h +new file mode 100644 +index 0000000..259594e +--- /dev/null ++++ b/include/linux/litmus.h +@@ -0,0 +1,128 @@ ++/* ++ * Constant definitions related to ++ * scheduling policy. ++ */ ++ ++#ifndef _LINUX_LITMUS_H_ ++#define _LINUX_LITMUS_H_ ++ ++#include ++#include ++ ++typedef enum { ++ SCHED_BEG = 0, ++ SCHED_LINUX = 0, ++ SCHED_PFAIR = 1, ++ SCHED_PFAIR_STAGGER = 2, ++ SCHED_PART_EDF = 3, ++ SCHED_PART_EEVDF = 4, ++ SCHED_GLOBAL_EDF = 5, ++ SCHED_PFAIR_DESYNC = 6, ++ SCHED_GLOBAL_EDF_NP = 7, ++ SCHED_CUSTOM = 8, ++ SCHED_EDF_HSB = 9, ++ SCHED_GSN_EDF = 10, ++ SCHED_PSN_EDF = 11, ++ SCHED_ADAPTIVE = 12, ++ /* Add your scheduling policy here */ ++ ++ SCHED_END = 12, ++ SCHED_DEFAULT = 0, ++ SCHED_INVALID = -1, ++} spolicy; ++ ++ ++typedef enum { ++ LITMUS_RESERVED_RANGE = 1024, ++ ++} sched_setup_cmd_t; ++ ++/* Runtime modes */ ++enum rt_mode_t { ++ MODE_NON_RT = 0, ++ MODE_RT_RUN = 1 ++}; ++ ++/* Plugin boot options, for convenience */ ++#define PLUGIN_LINUX "linux" ++#define PLUGIN_PFAIR "pfair" ++#define PLUGIN_PART_EDF "part_edf" ++#define PLUGIN_GLOBAL_EDF "global_edf" ++#define PLUGIN_GLOBAL_EDF_NP "global_edf_np" ++#define PLUGIN_EDF_HSB "edf_hsb" ++#define PLUGIN_GSN_EDF "gsn_edf" ++#define PLUGIN_PSN_EDF "psn_edf" ++#define PLUGIN_ADAPTIVE "adaptive" ++ ++extern spolicy sched_policy; ++ ++/* RT mode start time */ ++extern volatile unsigned long rt_start_time; ++ ++/* Here we store the current mode of the system */ ++extern atomic_t rt_mode; ++ ++#define get_rt_mode() (atomic_read(&rt_mode)) ++#define set_rt_mode(a) atomic_set(&rt_mode,(a)) ++ ++#define TRACE(fmt, args...) \ ++ sched_trace_log_message("%d: " fmt, raw_smp_processor_id(), ## args) ++ ++#define TRACE_TASK(t, fmt, args...) \ ++ TRACE("(%s/%d) " fmt, (t)->comm, (t)->pid, ##args) ++ ++#define TRACE_CUR(fmt, args...) 
\ ++ TRACE_TASK(current, fmt, ## args) ++ ++#define TRACE_BUG_ON(cond) \ ++ do { if (cond) TRACE("BUG_ON(%s) at %s:%d " \ ++ "called from %p current=%s/%d state=%d " \ ++ "flags=%x mode=%d partition=%d cpu=%d rtflags=%d"\ ++ " job=%u knp=%d timeslice=%u\n", \ ++ #cond, __FILE__, __LINE__, __builtin_return_address(0), current->comm, \ ++ current->pid, current->state, current->flags, get_rt_mode(), \ ++ get_partition(current), smp_processor_id(), get_rt_flags(current), \ ++ current->rt_param.times.job_no, current->rt_param.kernel_np, \ ++ current->time_slice\ ++ ); } while(0); ++ ++ ++/* in_list - is a given list_head queued on some list? ++ */ ++static inline int in_list(struct list_head* list) ++{ ++ return !( /* case 1: deleted */ ++ (list->next == LIST_POISON1 && ++ list->prev == LIST_POISON2) ++ || ++ /* case 2: initialized */ ++ (list->next == list && ++ list->prev == list) ++ ); ++} ++ ++void list_qsort(struct list_head* list, list_cmp_t less_than); ++ ++ ++#define RT_PREEMPTIVE 0x2050 /* = NP */ ++#define RT_NON_PREEMPTIVE 0x4e50 /* = P */ ++#define RT_EXIT_NP_REQUESTED 0x5251 /* = RQ */ ++ ++/* returns 1 if task t has registered np flag and set it to RT_NON_PREEMPTIVE ++ */ ++int is_np(struct task_struct *t); ++ ++/* request that the task should call sys_exit_np() ++ */ ++void request_exit_np(struct task_struct *t); ++ ++/* kill naughty tasks ++ */ ++void scheduler_signal(struct task_struct *t, unsigned int signal); ++void send_scheduler_signals(void); ++void np_mem_kill(struct task_struct *t); ++ ++/* clean up real-time state of a task */ ++void exit_litmus(struct task_struct *dead_tsk); ++ ++#endif +diff --git a/include/linux/pfair_common.h b/include/linux/pfair_common.h +new file mode 100644 +index 0000000..67e18c6 +--- /dev/null ++++ b/include/linux/pfair_common.h +@@ -0,0 +1,40 @@ ++/* PFAIR common data structures and utility functions shared by all PFAIR ++ * based scheduler plugins ++ */ ++ ++#ifndef __UNC_PFAIR_COMMON_H__ ++#define __UNC_PFAIR_COMMON_H__ ++ ++#include ++#include ++ ++typedef struct _pfair_domain { ++ /* Global lock to protect the data structures */ ++ queuelock_t pfair_lock; ++ /* runnable rt tasks are in here */ ++ struct list_head ready_queue; ++ ++ /* real-time tasks waiting for release are in here */ ++ struct list_head release_queue; ++ ++ /* CPU's in the domain */ ++ cpumask_t domain_cpus; ++ ++} pfair_domain_t; ++ ++#define next_ready(pfair) \ ++ (list_entry((pfair)->ready_queue.next, struct task_struct, rt_list)) ++void pfair_domain_init(pfair_domain_t *pfair); ++void pfair_add_ready(pfair_domain_t* pfair, struct task_struct *new); ++struct task_struct* __pfair_take_ready(pfair_domain_t* pfair); ++void pfair_add_release(pfair_domain_t* pfair, struct task_struct *task); ++void pfair_try_release_pending(pfair_domain_t* pfair); ++void __pfair_prepare_new_release(struct task_struct *t, jiffie_t start); ++ ++void pfair_prepare_next_job(struct task_struct *t); ++void pfair_prepare_next_subtask(struct task_struct *t); ++ ++void pfair_prepare_new_releases(pfair_domain_t *pfair, jiffie_t start); ++ ++#endif ++ +diff --git a/include/linux/pfair_math.h b/include/linux/pfair_math.h +new file mode 100644 +index 0000000..b2a14e4 +--- /dev/null ++++ b/include/linux/pfair_math.h +@@ -0,0 +1,80 @@ ++/* PFAIR Mathematical functions */ ++#ifndef __UNC_PFAIR_MATH_H__ ++#define __UNC_PFAIR_MATH_H__ ++ ++#include ++#include ++#include ++#include ++ ++/* Type definition for our quantums */ ++typedef unsigned long long quantum_t; ++ ++/* ++* This file defines mathematical 
functions "ceiling", "floor", ++* and PFAIR specific functions for computing the release and ++* the deadline of a subtask, as well as tie breakers: ++* b-bit and group deadline. ++*/ ++static inline quantum_t FLOOR(quantum_t a, unsigned long b) ++{ ++ BUG_ON( b == 0); ++ do_div(a, b); ++ return a; ++} ++static inline quantum_t CEIL(quantum_t a, unsigned long b) ++{ ++ quantum_t t = FLOOR(a, b); ++ return (quantum_t)((t * b == a) ? t : (t + 1)); ++} ++ ++ ++/* ++* invariant - i-1=get_passed_quanta(t) ++* ++* release time of i-th subtask of j-th job is ++* r_{ij}+\lfloor i-1/wt(T) \rfloor ++* This operation should be robust to wrap-around ++* so we can compare the result with jiffies safely ++*/ ++static inline quantum_t release_time(struct task_struct * t) ++{ ++ quantum_t e = get_exec_cost(t); ++ quantum_t p = get_rt_period(t); ++ return FLOOR((get_passed_quanta(t)) * p, e); ++} ++/* ++* deadline time of i-th subtask of j-th job is ++* r_{ij}+\lceil i/wt(T) \rceil ++* This operation should be robust to wrap-around ++* so we can compare the result with jiffies safely ++*/ ++static inline quantum_t pfair_deadline(struct task_struct * t) ++{ ++ quantum_t e = get_exec_cost(t); ++ quantum_t p = get_rt_period(t); ++ return CEIL((get_passed_quanta(t) + 1) * p, e); ++} ++/* In PFAIR b-bit is defined as ++* \lceil i/wt(T) \rceil-\lfloor i/wt(T) \rfloor ++*/ ++static inline int b_bit(struct task_struct *t) ++{ ++ quantum_t e = get_exec_cost(t); ++ quantum_t p = get_rt_period(t); ++ return CEIL((get_passed_quanta(t) + 1) * p, e)- ++ FLOOR((get_passed_quanta(t) + 1) * p, e); ++} ++/* ++* Group deadline ++*/ ++static inline quantum_t group_deadline(struct task_struct * t) ++{ ++ quantum_t p = get_rt_period(t); ++ quantum_t e = get_exec_cost(t); ++ quantum_t stage1 = CEIL((get_passed_quanta(t) + 1) * p, e); ++ quantum_t stage2 = CEIL(stage1 * (p - e), p); ++ return CEIL(stage2 * p, p - e); ++} ++ ++#endif /* __UNC_PFAIR_MATH_H__ */ +diff --git a/include/linux/queuelock.h b/include/linux/queuelock.h +new file mode 100644 +index 0000000..454ff81 +--- /dev/null ++++ b/include/linux/queuelock.h +@@ -0,0 +1,98 @@ ++#ifndef _UNC_QUEUELOCK_H_ ++#define _UNC_QUEUELOCK_H_ ++/** ++* Queue lock ++* ++* This is an implementation of T. Anderson's queue lock. ++* It strives to follow the normal Linux locking conventions ++* as much as possible. The rules for acquiring a lock are: ++* ++* 1) The caller must ensure interrupts and preemptions are disabled. ++* ++* 2) The caller _cannot_ recursively acquire the lock. ++* ++* 3) The caller may not sleep while holding the lock. This is currently ++* not enforced, but it will not work. ++*/ ++ ++#include ++#include ++#include ++ ++typedef struct { ++ /* pad the values being spun on to make sure ++ that they are cache local ++ */ ++ union { ++ volatile enum { ++ MUST_WAIT, ++ HAS_LOCK ++ } val; ++ char padding[SMP_CACHE_BYTES]; ++ } slots[NR_CPUS]; ++ ++ /* since spin_slot is not being spun on it can be ++ * in a shared cache line. next_slot will be evicted ++ * anyway on every attempt to acquire the lock. ++ */ ++ int spin_slot[NR_CPUS]; ++ ++ /* The next slot that will be available. 
++ */ ++ atomic_t next_slot; ++} queuelock_t; ++ ++ ++static inline void queue_lock_init(queuelock_t *lock) ++{ ++ int i; ++ for (i = 0; i < NR_CPUS; i++) { ++ lock->slots[i].val = MUST_WAIT; ++ lock->spin_slot[i] = i; ++ } ++ lock->slots[0].val = HAS_LOCK; ++ atomic_set(&lock->next_slot, 0); ++} ++ ++ ++static inline void queue_lock(queuelock_t *lock) ++{ ++ int me = smp_processor_id(); ++ volatile int* spin_var; ++ /* Get slot to spin on. atomic_inc_return() returns the incremented ++ * value, so take one of again ++ */ ++ lock->spin_slot[me] = atomic_inc_return(&lock->next_slot) - 1; ++ /* check for wrap-around ++ * This could probably optimized away if we ensure that NR_CPUS divides ++ * INT_MAX... ++ */ ++ if (unlikely(lock->spin_slot[me] == NR_CPUS - 1)) ++ atomic_add(-NR_CPUS, &lock->next_slot); ++ /* range limit*/ ++ lock->spin_slot[me] %= NR_CPUS; ++ /* spin until you acquire the lock */ ++ spin_var = (int*) &lock->slots[lock->spin_slot[me]].val; ++ while (*spin_var == MUST_WAIT) ++ cpu_relax(); ++ ++ /* reset the lock */ ++ lock->slots[lock->spin_slot[me]].val = MUST_WAIT; ++ barrier(); ++} ++ ++ ++static inline void queue_unlock(queuelock_t *lock) ++{ ++ int me = smp_processor_id(); ++ barrier(); ++ lock->slots[(lock->spin_slot[me] + 1) % NR_CPUS].val = HAS_LOCK; ++} ++ ++#define queue_lock_irqsave(lock, flags) \ ++ do { local_irq_save(flags); queue_lock(lock); } while (0); ++ ++#define queue_unlock_irqrestore(lock, flags) \ ++ do { queue_unlock(lock); local_irq_restore(flags); } while (0); ++ ++#endif /* _UNC_QUEUELOCK_H_ */ +diff --git a/include/linux/rt_domain.h b/include/linux/rt_domain.h +new file mode 100644 +index 0000000..237eac7 +--- /dev/null ++++ b/include/linux/rt_domain.h +@@ -0,0 +1,98 @@ ++/* CLEANUP: Add comments and make it less messy. ++ * ++ */ ++ ++#ifndef __UNC_RT_DOMAIN_H__ ++#define __UNC_RT_DOMAIN_H__ ++ ++struct _rt_domain; ++ ++typedef int (*check_resched_needed_t)(struct _rt_domain *rt); ++typedef void (*release_at_t)(struct task_struct *t, jiffie_t start); ++ ++typedef struct _rt_domain { ++ /* runnable rt tasks are in here */ ++ rwlock_t ready_lock; ++ struct list_head ready_queue; ++ ++ /* real-time tasks waiting for release are in here */ ++ spinlock_t release_lock; ++ struct list_head release_queue; ++ ++ /* how do we check if we need to kick another CPU? */ ++ check_resched_needed_t check_resched; ++ ++ /* how are tasks ordered in the ready queue? 
*/ ++ list_cmp_t order; ++} rt_domain_t; ++ ++#define next_ready(rt) \ ++ (list_entry((rt)->ready_queue.next, struct task_struct, rt_list)) ++ ++#define ready_jobs_pending(rt) \ ++ (!list_empty(&(rt)->ready_queue)) ++ ++void rt_domain_init(rt_domain_t *rt, check_resched_needed_t f, ++ list_cmp_t order); ++ ++void __add_ready(rt_domain_t* rt, struct task_struct *new); ++void __add_release(rt_domain_t* rt, struct task_struct *task); ++ ++struct task_struct* __take_ready_rq(rt_domain_t* rt, runqueue_t* rq, int cpu); ++struct task_struct* __take_ready(rt_domain_t* rt); ++struct task_struct* __peek_ready(rt_domain_t* rt); ++ ++void try_release_pending(rt_domain_t* rt); ++void __release_pending(rt_domain_t* rt); ++ ++void rerelease_all(rt_domain_t *rt, release_at_t release); ++void __rerelease_all(rt_domain_t *rt, release_at_t release); ++ ++static inline void add_ready(rt_domain_t* rt, struct task_struct *new) ++{ ++ unsigned long flags; ++ /* first we need the write lock for rt_ready_queue */ ++ write_lock_irqsave(&rt->ready_lock, flags); ++ __add_ready(rt, new); ++ write_unlock_irqrestore(&rt->ready_lock, flags); ++} ++ ++static inline struct task_struct* take_ready(rt_domain_t* rt) ++{ ++ unsigned long flags; ++ struct task_struct* ret; ++ /* first we need the write lock for rt_ready_queue */ ++ write_lock_irqsave(&rt->ready_lock, flags); ++ ret = __take_ready(rt); ++ write_unlock_irqrestore(&rt->ready_lock, flags); ++ return ret; ++} ++ ++ ++static inline void add_release(rt_domain_t* rt, struct task_struct *task) ++{ ++ unsigned long flags; ++ /* first we need the write lock for rt_ready_queue */ ++ spin_lock_irqsave(&rt->release_lock, flags); ++ __add_release(rt, task); ++ spin_unlock_irqrestore(&rt->release_lock, flags); ++} ++ ++static inline int __jobs_pending(rt_domain_t* rt) ++{ ++ return !list_empty(&rt->ready_queue); ++} ++ ++static inline int jobs_pending(rt_domain_t* rt) ++{ ++ unsigned long flags; ++ int ret; ++ /* first we need the write lock for rt_ready_queue */ ++ read_lock_irqsave(&rt->ready_lock, flags); ++ ret = __jobs_pending(rt); ++ read_unlock_irqrestore(&rt->ready_lock, flags); ++ return ret; ++} ++ ++ ++#endif +diff --git a/include/linux/rt_param.h b/include/linux/rt_param.h +new file mode 100644 +index 0000000..426a929 +--- /dev/null ++++ b/include/linux/rt_param.h +@@ -0,0 +1,264 @@ ++/* ++ * Definition of the scheduler plugin interface. ++ * ++ */ ++#ifndef _LINUX_RT_PARAM_H_ ++#define _LINUX_RT_PARAM_H_ ++ ++#include ++ ++typedef unsigned long jiffie_t; ++ ++/* different types of clients */ ++typedef enum { ++ RT_CLASS_HARD, ++ RT_CLASS_SOFT, ++ RT_CLASS_BEST_EFFORT ++} task_class_t; ++ ++typedef struct rt_param { ++ unsigned long exec_cost; ++ unsigned long period; ++ unsigned int cpu; ++ task_class_t class; ++} rt_param_t; ++ ++/* fixed point wrapper to force compiler ++ * errors in case of misuse of a fixed point value ++ */ ++typedef struct ++{ ++ long val; ++} fp_t; ++ ++typedef struct { ++ fp_t weight; ++ unsigned long period; ++ fp_t value; ++} service_level_t; ++ ++typedef struct { ++ fp_t estimate; ++ fp_t accumulated; ++} predictor_state_t; ++ ++typedef struct { ++ /* when will this task be release the next time? */ ++ jiffie_t release; ++ /* time instant the last job was released */ ++ jiffie_t last_release; ++ /* what is the current deadline? 
*/ ++ jiffie_t deadline; ++ /* b-bit tie breaker for PFAIR, it is ignored in EDF */ ++ int b_bit; ++ /* group deadline tie breaker, it is ignored in EDF */ ++ jiffie_t group_deadline; ++ /* how long has this task executed so far? ++ * In case of capacity sharing a job completion cannot be ++ * detected by checking time_slice == 0 as the job may have ++ * executed while using another capacity. Use this counter ++ * to keep track of the time spent on a CPU by a job. ++ * ++ * In other words: The number of consumed quanta since the ++ * last job release. ++ */ ++ unsigned int exec_time; ++ ++ /* Which job is this. This is used to let user space ++ * specify which job to wait for, which is important if jobs ++ * overrun. If we just call sys_sleep_next_period() then we ++ * will unintentionally miss jobs after an overrun. ++ * ++ * Increase this sequence number when a job is released. ++ */ ++ unsigned int job_no; ++} rt_times_t; ++ ++ ++/* RT task parameters for scheduling extensions ++ * These parameters are inherited during clone and therefore must ++ * be explicitly set up before the task set is launched. ++ */ ++typedef struct task_rt_param { ++ /* is the task sleeping? */ ++ unsigned int flags:8; ++ ++ /* Real-time marker: 1 iff it is a LITMUS real-time task. ++ */ ++ unsigned int is_realtime:1; ++ ++ /* is this task under control of litmus? ++` * ++ * this is necessary because otherwise signal delivery code ++ * may try to wake up a task that is already queued in plugin ++ * data structures. ++ */ ++ unsigned int litmus_controlled:1; ++ ++ /* Did this task register any SRP controlled resource accesses? ++ * This, of course, should only ever be true under partitioning. ++ * However, this limitation is not currently enforced. ++ */ ++ unsigned int subject_to_srp:1; ++ ++ /* user controlled parameters */ ++ rt_param_t basic_params; ++ ++ /* task representing the current "inherited" task ++ * priority, assigned by inherit_priority and ++ * return priority in the scheduler plugins. ++ * could point to self if PI does not result in ++ * an increased task priority. ++ */ ++ struct task_struct* inh_task; ++ ++ /* Don't just dereference this pointer in kernel space! ++ * It might very well point to junk or nothing at all. ++ * NULL indicates that the task has not requested any non-preemptable ++ * section support. ++ * TODO: What happens on fork? ++ */ ++ __user short* np_flag; ++ ++ /* For the FMLP under PSN-EDF, it is required to make the task ++ * non-preemptive from kernel space. In order not to interfere with ++ * user space, this counter indicates the kernel space np setting. ++ * kernel_np > 0 => task is non-preemptive ++ */ ++ unsigned int kernel_np; ++ ++ /* timing parameters */ ++ rt_times_t times; ++ ++ /* This is currently only used by the PFAIR code ++ * and a prime candidate for cleanup. ++ */ ++ rt_times_t backup; ++ ++ /* This field can be used by plugins to store where the task ++ * is currently scheduled. It is the responsibility of the ++ * plugin to avoid race conditions. ++ * ++ * Used by GSN-EDF. ++ */ ++ int scheduled_on; ++ ++ /* This field can be used by plugins to store where the task ++ * is currently linked. It is the responsibility of the plugin ++ * to avoid race conditions. ++ * ++ * Used by GSN-EDF. ++ */ ++ int linked_on; ++ ++ /* Adaptive support. Adaptive tasks will store service levels ++ * in this (dynamically allocated) structure. 
++ */ ++ service_level_t* service_level; ++ unsigned int no_service_levels; ++ unsigned int cur_service_level; ++ ++ /* Adaptive support. Store state for weight estimation. ++ */ ++ predictor_state_t predictor_state; ++ ++ /* Adaptive support. Optimizer fields. ++ */ ++ struct list_head opt_list; ++ fp_t opt_order; ++ fp_t opt_dw; ++ fp_t opt_nw; ++ unsigned int opt_level; ++ jiffie_t opt_change; ++} task_rt_param_t; ++ ++/* Possible RT flags */ ++#define RT_F_RUNNING 0x00000000 ++#define RT_F_SLEEP 0x00000001 ++#define RT_F_EXP_QUANTA 0x00000002 ++#define RT_F_NON_PREEMTABLE 0x00000004 ++#define RT_F_EXIT_SEM 0x00000008 ++ ++#define is_realtime(t) ((t)->rt_param.is_realtime) ++ ++/* Realtime utility macros */ ++#define get_passed_quanta(t) ((t)->rt_param.times.exec_time) ++#define inc_passed_quanta(t) ((t)->rt_param.times.exec_time += 1) ++#define get_rt_flags(t) ((t)->rt_param.flags) ++#define set_rt_flags(t,f) (t)->rt_param.flags=(f) ++#define get_exec_cost(t) ((t)->rt_param.basic_params.exec_cost) ++#define get_rt_period(t) ((t)->rt_param.basic_params.period) ++#define set_rt_period(t,p) (t)->rt_param.basic_params.period=(p) ++#define set_exec_cost(t,e) (t)->rt_param.basic_params.exec_cost=(e) ++#define get_partition(t) (t)->rt_param.basic_params.cpu ++#define get_deadline(t) ((t)->rt_param.times.deadline) ++#define get_last_release(t) ((t)->rt_param.times.last_release) ++#define get_class(t) ((t)->rt_param.basic_params.class) ++ ++#define has_active_job(t) \ ++ (time_before(get_last_release(t), jiffies) \ ++ && time_before_eq(jiffies, get_deadline(t))) ++ ++#define get_est_weight(t) ((t)->rt_param.predictor_state.estimate) ++#define get_sl(t, l) \ ++ ((t)->rt_param.service_level[l]) ++#define get_cur_sl(t) ((t)->rt_param.cur_service_level) ++#define get_max_sl(t) ((t)->rt_param.no_service_levels - 1) ++#define get_opt_sl(t) ((t)->rt_param.opt_level) ++ ++ ++#define is_subject_to_srp(t) ((t)->rt_param.subject_to_srp) ++#define is_hrt(t) \ ++ ((t)->rt_param.basic_params.class == RT_CLASS_HARD) ++#define is_srt(t) \ ++ ((t)->rt_param.basic_params.class == RT_CLASS_SOFT) ++#define is_be(t) \ ++ ((t)->rt_param.basic_params.class == RT_CLASS_BEST_EFFORT) ++ ++#define clear_rt_params(t) \ ++memset(&(t)->rt_param,0, sizeof(struct task_rt_param)) ++ ++#define get_release(t) ((t)->rt_param.times.release) ++#define set_release(t,r) ((t)->rt_param.times.release=(r)) ++ ++/* honor the flag that is set when scheduling is in progress ++ * This is some dirty hack in Linux that creates race conditions in our code ++ * if we don't pay attention to it. 
++ */ ++#define is_running(t) \ ++ ((t)->state == TASK_RUNNING || \ ++ (t)->thread_info->preempt_count & PREEMPT_ACTIVE) ++ ++#define is_blocked(t) (!is_running(t)) ++#define is_released(t) (time_before_eq((t)->rt_param.times.release, jiffies)) ++#define is_tardy(t) (time_before_eq((t)->rt_param.times.deadline, jiffies)) ++#define task_slack(t) ( (int) (t)->rt_param.times.deadline - (int) jiffies - \ ++ (int) ((t)->rt_param.basic_params.exec_cost - \ ++ (t)->rt_param.times.exec_time)) ++ ++ ++/* real-time comparison macros */ ++#define earlier_deadline(a, b) (time_before(\ ++ (a)->rt_param.times.deadline,\ ++ (b)->rt_param.times.deadline)) ++#define earlier_release(a, b) (time_before(\ ++ (a)->rt_param.times.release,\ ++ (b)->rt_param.times.release)) ++ ++#define earlier_last_release(a, b) (time_before(\ ++ (a)->rt_param.times.last_release,\ ++ (b)->rt_param.times.last_release)) ++ ++ ++#define make_np(t) do {t->rt_param.kernel_np++;} while(0); ++#define take_np(t) do {t->rt_param.kernel_np--;} while(0); ++ ++#define backup_times(t) do { (t)->rt_param.backup=(t)->rt_param.times; \ ++ } while(0); ++#define restore_times(t) do { (t)->rt_param.times=(t)->rt_param.backup; \ ++ } while(0); ++ ++ ++#define rt_list2task(p) list_entry(p, struct task_struct, rt_list) ++ ++#endif +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 4463735..f533ae3 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -3,6 +3,8 @@ + + #include /* For AT_VECTOR_SIZE */ + ++#include ++ + /* + * cloning flags: + */ +@@ -26,6 +28,8 @@ + #define CLONE_STOPPED 0x02000000 /* Start in stopped state */ + #define CLONE_NEWUTS 0x04000000 /* New utsname group? */ + #define CLONE_NEWIPC 0x08000000 /* New ipcs */ ++#define CLONE_REALTIME 0x10000000 /* LITMUS real-time task creation */ ++ + + /* + * Scheduling policies +@@ -1051,6 +1055,12 @@ struct task_struct { + #ifdef CONFIG_FAULT_INJECTION + int make_it_fail; + #endif ++ /* litmus parameters and state */ ++ task_rt_param_t rt_param; ++ ++ /* allow scheduler plugins to queue in release lists, etc. */ ++ struct list_head rt_list; ++ + }; + + static inline pid_t process_group(struct task_struct *tsk) +diff --git a/include/linux/sched_plugin.h b/include/linux/sched_plugin.h +new file mode 100644 +index 0000000..1ea8178 +--- /dev/null ++++ b/include/linux/sched_plugin.h +@@ -0,0 +1,149 @@ ++/* ++ * Definition of the scheduler plugin interface. ++ * ++ */ ++#ifndef _LINUX_SCHED_PLUGIN_H_ ++#define _LINUX_SCHED_PLUGIN_H_ ++ ++#include ++ ++/* struct for semaphore with priority inheritance */ ++struct pi_semaphore { ++ atomic_t count; ++ int sleepers; ++ wait_queue_head_t wait; ++ union { ++ /* highest-prio holder/waiter */ ++ struct task_struct *task; ++ struct task_struct* cpu_task[NR_CPUS]; ++ } hp; ++ /* current lock holder */ ++ struct task_struct *holder; ++ /* is the semaphore being used? */ ++ int used; ++}; ++ ++ ++/* Enforce runqueues to be opaque objects. ++ * ++ * This allows us to pass around pointers to runqueues, ++ * without actually having to rip it out of sched.c. It ++ * also discourages plugins from trying to be ++ * overly clever. 
++ */ ++typedef void runqueue_t; ++ ++ ++/********************* scheduler invocation ******************/ ++ ++typedef enum { ++ NO_RESCHED = 0, ++ FORCE_RESCHED = 1 ++} reschedule_check_t; ++ ++ ++/* Plugin-specific realtime tick handler */ ++typedef reschedule_check_t (*scheduler_tick_t) (void); ++/* Novell make sched decision function */ ++typedef int (*schedule_t) (struct task_struct * prev, ++ struct task_struct ** next, ++ runqueue_t * rq); ++/* Clean up after the task switch has occured. ++ * This function is called after every (even non-rt) task switch. ++ */ ++typedef void (*finish_switch_t)(struct task_struct *prev); ++ ++ ++/********************* task state changes ********************/ ++ ++/* called to setup a new real-time task */ ++typedef long (*prepare_task_t) (struct task_struct *task); ++/* called to re-introduce a task after blocking */ ++typedef void (*wake_up_task_t) (struct task_struct *task); ++/* called to notify the plugin of a blocking real-time task ++ * it will only be called for real-time tasks and before schedule is called */ ++typedef void (*task_blocks_t) (struct task_struct *task); ++/* called when a real-time task exits. Free any allocated resources */ ++typedef long (*tear_down_t) (struct task_struct *); ++ ++/* Called when the new_owner is released from the wait queue ++ * it should now inherit the priority from sem, _before_ it gets readded ++ * to any queue ++ */ ++typedef long (*inherit_priority_t) (struct pi_semaphore *sem, ++ struct task_struct *new_owner); ++ ++/* Called when the current task releases a semahpore where it might have ++ * inherited a piority from ++ */ ++typedef long (*return_priority_t) (struct pi_semaphore *sem); ++ ++/* Called when a task tries to acquire a semaphore and fails. Check if its ++ * priority is higher than that of the current holder. ++ */ ++typedef long (*pi_block_t) (struct pi_semaphore *sem, struct task_struct *t); ++ ++ ++/********************* sys call backends ********************/ ++/* This function causes the caller to sleep until the next release */ ++typedef long (*sleep_next_period_t) (void); ++ ++typedef int (*scheduler_setup_t) (int cmd, void __user *parameter); ++ ++typedef int (*mode_change_t) (int); ++ ++struct sched_plugin { ++ /* basic info */ ++ char *plugin_name; ++ int ready_to_use; ++ ++ /* management interface */ ++ mode_change_t mode_change; ++ ++ /* scheduler invocation */ ++ scheduler_tick_t scheduler_tick; ++ schedule_t schedule; ++ finish_switch_t finish_switch; ++ ++ /* syscall backend */ ++ sleep_next_period_t sleep_next_period; ++ scheduler_setup_t scheduler_setup; ++ ++ /* task state changes */ ++ prepare_task_t prepare_task; ++ wake_up_task_t wake_up_task; ++ task_blocks_t task_blocks; ++ tear_down_t tear_down; ++ ++ /* priority inheritance */ ++ inherit_priority_t inherit_priority; ++ return_priority_t return_priority; ++ pi_block_t pi_block; ++} __attribute__ ((__aligned__(SMP_CACHE_BYTES))); ++ ++typedef struct sched_plugin sched_plugin_t; ++ ++extern sched_plugin_t *curr_sched_plugin; ++ ++ ++/* common scheduler tick */ ++reschedule_check_t rt_scheduler_tick(void); ++ ++ ++/* Don't pull in our definitions on top of the real ones ++ * in sched.c! ++ */ ++#ifndef __SCHED_C__ ++ ++/* External linux scheduler facilities */ ++void deactivate_task(struct task_struct *, runqueue_t *); ++/* This function is defined in sched.c. We need acces to it for ++ * indirect switching. 
++ */ ++void __activate_task(struct task_struct *, runqueue_t *); ++void __setscheduler(struct task_struct *, int, int); ++ ++#endif ++ ++extern int get_sched_options(void); ++#endif +diff --git a/include/linux/sched_trace.h b/include/linux/sched_trace.h +new file mode 100644 +index 0000000..308cc7d +--- /dev/null ++++ b/include/linux/sched_trace.h +@@ -0,0 +1,182 @@ ++/* sched_trace.h -- record scheduler events to a byte stream for offline analysis. ++ */ ++#ifndef _LINUX_SCHED_TRACE_H_ ++#define _LINUX_SCHED_TRACE_H_ ++ ++#include ++ ++typedef enum { ++ ST_INVOCATION = 0, ++ ST_ARRIVAL = 1, ++ ST_DEPARTURE = 2, ++ ST_PREEMPTION = 3, ++ ST_SCHEDULED = 4, ++ ST_JOB_RELEASE = 5, ++ ST_JOB_COMPLETION = 6, ++ ST_CAPACITY_RELEASE = 7, ++ ST_CAPACITY_ALLOCATION = 8, ++ ST_SERVICE_LEVEL_CHANGE = 9, ++ ST_WEIGHT_ERROR = 10, ++} trace_type_t; ++ ++typedef struct { ++ trace_type_t trace:8; ++ unsigned int size:24; ++ unsigned long long timestamp; ++} trace_header_t; ++ ++ ++typedef struct { ++ unsigned int is_rt:1; ++ unsigned int is_server:1; ++ task_class_t class:4; ++ unsigned int budget:24; ++ u32 deadline; ++ ++ pid_t pid; ++} task_info_t; ++ ++typedef struct { ++ trace_header_t header; ++ unsigned long flags; ++} invocation_record_t; ++ ++typedef struct { ++ trace_header_t header; ++ task_info_t task; ++} arrival_record_t; ++ ++typedef struct { ++ trace_header_t header; ++ task_info_t task; ++} departure_record_t; ++ ++typedef struct { ++ trace_header_t header; ++ task_info_t task; ++ task_info_t by; ++} preemption_record_t; ++ ++typedef struct { ++ trace_header_t header; ++ task_info_t task; ++} scheduled_record_t; ++ ++typedef struct { ++ trace_header_t header; ++ task_info_t task; ++ u16 period; ++ u16 wcet; ++} release_record_t; ++ ++typedef struct { ++ trace_header_t header; ++ task_info_t task; ++ u16 period; ++ u16 wcet; ++ int tardiness; ++ unsigned int job_no; ++} completion_record_t; ++ ++typedef struct { ++ trace_header_t header; ++ task_info_t task; ++} cap_release_record_t; ++ ++typedef struct { ++ trace_header_t header; ++ task_info_t task; ++ u16 budget; ++ u32 deadline; ++ pid_t donor; ++} cap_allocation_record_t; ++ ++typedef struct { ++ trace_header_t header; ++ task_info_t task; ++ unsigned int from:16; ++ unsigned int to:16; ++ service_level_t new_level; ++ service_level_t old_level; ++} service_level_change_record_t; ++ ++typedef struct { ++ trace_header_t header; ++ pid_t task; ++ fp_t estimate; ++ fp_t actual; ++} weight_error_record_t; ++ ++#ifdef CONFIG_SCHED_TASK_TRACE ++void sched_trace_scheduler_invocation(void); ++ ++void sched_trace_task_arrival(struct task_struct *t); ++void sched_trace_task_departure(struct task_struct *t); ++void sched_trace_task_preemption(struct task_struct *t, ++ struct task_struct* by); ++void sched_trace_task_scheduled(struct task_struct *); ++ ++void sched_trace_job_release(struct task_struct *t); ++void sched_trace_job_completion(struct task_struct *t); ++ ++void sched_trace_capacity_release(struct task_struct *t); ++void sched_trace_capacity_allocation(struct task_struct *t, ++ u16 budget, u32 deadline, pid_t donor); ++ ++void sched_trace_capacity_alloc_srv(pid_t srv, u32 srv_dl, task_class_t cls, ++ u16 srv_budget, ++ u16 budget, u32 deadline, pid_t donor); ++ ++void sched_trace_server_release(int id, unsigned int wcet, ++ unsigned int period, ++ task_class_t class); ++ ++void sched_trace_server_completion(int id, unsigned int budget, ++ jiffie_t deadline, ++ task_class_t class); ++ ++void sched_trace_server_scheduled(int id, 
task_class_t class, ++ unsigned int budget, jiffie_t deadline); ++ ++void sched_trace_service_level_change(struct task_struct* t, ++ unsigned int from, ++ unsigned int to); ++ ++void sched_trace_weight_error(struct task_struct* t, fp_t actual); ++ ++#else ++#define sched_trace_scheduler_invocation(x) ++ ++#define sched_trace_task_arrival(t) ++#define sched_trace_task_departure(t) ++#define sched_trace_task_preemption(t, by) ++#define sched_trace_task_scheduled(t) ++#define sched_trace_job_release(t) ++#define sched_trace_job_completion(t) ++#define sched_trace_capacity_release(t) ++#define sched_trace_capacity_allocation(t, budget, deadline, donor) ++#define sched_trace_capacity_alloc_srv(srv, srv_dl, cls, srv_budget,\ ++ budget, deadline, donor) ++#define sched_trace_server_release(id, wcet, period, class) ++#define sched_trace_server_completion(id, budget, deadline, class) ++#define sched_trace_server_scheduled(id, class, budget, deadline) ++ ++#define sched_trace_service_level_change(t, a, b) ++ ++#define sched_trace_weight_error(x, y) ++ ++ ++#endif ++ ++ ++#ifdef CONFIG_SCHED_DEBUG_TRACE ++void sched_trace_log_message(const char* fmt, ...); ++ ++#else ++ ++#define sched_trace_log_message(fmt, ...) ++ ++#endif ++ ++ ++#endif +diff --git a/include/linux/trace.h b/include/linux/trace.h +new file mode 100644 +index 0000000..9e457aa +--- /dev/null ++++ b/include/linux/trace.h +@@ -0,0 +1,74 @@ ++ ++#ifndef _SYS_TRACE_H_ ++#define _SYS_TRACE_H_ ++ ++#include ++#include ++ ++ ++/*********************** TIMESTAMPS ************************/ ++ ++struct timestamp { ++ unsigned long event; ++ unsigned long long timestamp; ++ unsigned int seq_no; ++ int cpu; ++}; ++ ++ ++/* buffer holding time stamps - will be provided by driver */ ++extern struct ft_buffer* trace_ts_buf; ++ ++/* save_timestamp: stores current time as struct timestamp ++ * in trace_ts_buf ++ */ ++asmlinkage void save_timestamp(unsigned long event); ++ ++#define TIMESTAMP(id) ft_event0(id, save_timestamp) ++ ++/* Convention for timestamps ++ * ========================= ++ * ++ * In order to process the trace files with a common tool, we use the following ++ * convention to measure execution times: The end time id of a code segment is ++ * always the next number after the start time event id. 
++ */ ++ ++#define TS_SCHED_START TIMESTAMP(100) ++#define TS_SCHED_END TIMESTAMP(101) ++#define TS_CXS_START TIMESTAMP(102) ++#define TS_CXS_END TIMESTAMP(103) ++ ++#define TS_TICK_START TIMESTAMP(110) ++#define TS_TICK_END TIMESTAMP(111) ++ ++#define TS_PLUGIN_SCHED_START TIMESTAMP(120) ++#define TS_PLUGIN_SCHED_END TIMESTAMP(121) ++ ++#define TS_PLUGIN_TICK_START TIMESTAMP(130) ++#define TS_PLUGIN_TICK_END TIMESTAMP(131) ++ ++#define TS_ENTER_NP_START TIMESTAMP(140) ++#define TS_ENTER_NP_END TIMESTAMP(141) ++ ++#define TS_EXIT_NP_START TIMESTAMP(150) ++#define TS_EXIT_NP_END TIMESTAMP(151) ++ ++#define TS_SRP_UP_START TIMESTAMP(160) ++#define TS_SRP_UP_END TIMESTAMP(161) ++#define TS_SRP_DOWN_START TIMESTAMP(162) ++#define TS_SRP_DOWN_END TIMESTAMP(163) ++ ++#define TS_PI_UP_START TIMESTAMP(170) ++#define TS_PI_UP_END TIMESTAMP(171) ++#define TS_PI_DOWN_START TIMESTAMP(172) ++#define TS_PI_DOWN_END TIMESTAMP(173) ++ ++#define TS_FIFO_UP_START TIMESTAMP(180) ++#define TS_FIFO_UP_END TIMESTAMP(181) ++#define TS_FIFO_DOWN_START TIMESTAMP(182) ++#define TS_FIFO_DOWN_END TIMESTAMP(183) ++ ++ ++ ++#endif /* !_SYS_TRACE_H_ */ +diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h +index 975c963..6ae0ff9 100644 +--- a/include/linux/uaccess.h ++++ b/include/linux/uaccess.h +@@ -84,4 +84,20 @@ static inline unsigned long __copy_from_user_nocache(void *to, + ret; \ + }) + ++/* This is a naive attempt at a write version of the above native Linux macro. ++ */ ++#define poke_kernel_address(val, addr) \ ++ ({ \ ++ long ret; \ ++ mm_segment_t old_fs = get_fs(); \ ++ \ ++ set_fs(KERNEL_DS); \ ++ pagefault_disable(); \ ++ ret = __put_user(val, (__force typeof(val) __user *)(addr)); \ ++ pagefault_enable(); \ ++ set_fs(old_fs); \ ++ ret; \ ++ }) ++ ++ + #endif /* __LINUX_UACCESS_H__ */ +diff --git a/include/linux/wait.h b/include/linux/wait.h +index e820d00..c7e96b6 100644 +--- a/include/linux/wait.h ++++ b/include/linux/wait.h +@@ -161,6 +161,8 @@ wait_queue_head_t *FASTCALL(bit_waitqueue(void *, int)); + #define wake_up_locked(x) __wake_up_locked((x), TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE) + #define wake_up_interruptible_sync(x) __wake_up_sync((x),TASK_INTERRUPTIBLE, 1) + ++#define pi_wake_up(x) __pi_wake_up(x, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1, NULL) ++ + #define __wait_event(wq, condition) \ + do { \ + DEFINE_WAIT(__wait); \ +diff --git a/kernel/Makefile b/kernel/Makefile +index 14f4d45..55acc93 100644 +--- a/kernel/Makefile ++++ b/kernel/Makefile +@@ -8,7 +8,12 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \ + signal.o sys.o kmod.o workqueue.o pid.o \ + rcupdate.o extable.o params.o posix-timers.o \ + kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \ +- hrtimer.o rwsem.o latency.o nsproxy.o srcu.o ++ hrtimer.o rwsem.o latency.o nsproxy.o srcu.o \ ++ sched_plugin.o litmus.o sched_trace.o \ ++ edf_common.o fifo_common.o pfair_common.o\ ++ sched_global_edf.o sched_part_edf.o sched_edf_hsb.o sched_pfair.o \ ++ sched_gsn_edf.o sched_psn_edf.o litmus_sem.o \ ++ trace.o ft_event.o rt_domain.o sched_adaptive.o + + obj-$(CONFIG_STACKTRACE) += stacktrace.o + obj-y += time/ +diff --git a/kernel/edf_common.c b/kernel/edf_common.c +new file mode 100644 +index 0000000..4746c66 +--- /dev/null ++++ b/kernel/edf_common.c +@@ -0,0 +1,135 @@ ++/* ++ * kernel/edf_common.c ++ * ++ * Common functions for EDF based scheduler. 
++ */ ++ ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++ ++#include ++ ++/* edf_higher_prio - returns true if first has a higher EDF priority ++ * than second. Deadline ties are broken by PID. ++ * ++ * first first must not be NULL and a real-time task. ++ * second may be NULL or a non-rt task. ++ */ ++int edf_higher_prio(struct task_struct* first, ++ struct task_struct* second) ++{ ++ struct task_struct *first_task = first; ++ struct task_struct *second_task = second; ++ ++ /* Check for inherited priorities. Change task ++ * used for comparison in such a case. ++ */ ++ if (first && first->rt_param.inh_task) ++ first_task = first->rt_param.inh_task; ++ if (second && second->rt_param.inh_task) ++ second_task = second->rt_param.inh_task; ++ ++ return ++ /* does the second task exist and is it a real-time task? If ++ * not, the first task (which is a RT task) has higher ++ * priority. ++ */ ++ !second_task || !is_realtime(second_task) || ++ ++ /* is the deadline of the first task earlier? ++ * Then it has higher priority. ++ */ ++ earlier_deadline(first_task, second_task) || ++ ++ /* Do we have a deadline tie? ++ * Then break by PID. ++ */ ++ (get_deadline(first_task) == get_deadline(second_task) && ++ (first_task->pid < second_task->pid || ++ ++ /* If the PIDs are the same then the task with the inherited ++ * priority wins. ++ */ ++ (first_task->pid == second_task->pid && ++ !second->rt_param.inh_task))); ++} ++ ++int edf_ready_order(struct list_head* a, struct list_head* b) ++{ ++ return edf_higher_prio( ++ list_entry(a, struct task_struct, rt_list), ++ list_entry(b, struct task_struct, rt_list)); ++} ++ ++void edf_release_at(struct task_struct *t, jiffie_t start) ++{ ++ t->rt_param.times.deadline = start; ++ edf_prepare_for_next_period(t); ++ t->rt_param.times.last_release = start; ++ set_rt_flags(t, RT_F_RUNNING); ++} ++ ++void edf_domain_init(rt_domain_t* rt, check_resched_needed_t resched) ++{ ++ rt_domain_init(rt, resched, edf_ready_order); ++} ++ ++void edf_prepare_for_next_period(struct task_struct *t) ++{ ++ BUG_ON(!t); ++ /* prepare next release */ ++ t->rt_param.times.release = t->rt_param.times.deadline; ++ t->rt_param.times.deadline += get_rt_period(t); ++ t->rt_param.times.exec_time = 0; ++ /* update job sequence number */ ++ t->rt_param.times.job_no++; ++ ++ t->time_slice = get_exec_cost(t); ++ ++ /* who uses this? statistics? */ ++ t->first_time_slice = 0; ++} ++ ++/* need_to_preempt - check whether the task t needs to be preempted ++ * call only with irqs disabled and with ready_lock acquired ++ * THIS DOES NOT TAKE NON-PREEMPTIVE SECTIONS INTO ACCOUNT! ++ */ ++int edf_preemption_needed(rt_domain_t* rt, struct task_struct *t) ++{ ++ /* we need the read lock for edf_ready_queue */ ++ /* no need to preempt if there is nothing pending */ ++ if (!ready_jobs_pending(rt)) ++ return 0; ++ /* we need to reschedule if t doesn't exist */ ++ if (!t) ++ return 1; ++ ++ /* NOTE: We cannot check for non-preemptibility since we ++ * don't know what address space we're currently in. ++ */ ++ ++ /* make sure to get non-rt stuff out of the way */ ++ return !is_realtime(t) || edf_higher_prio(next_ready(rt), t); ++} ++ ++ ++/* ++ * Deactivate current task until the beginning of the next period. 
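/* A stripped-down, user-space illustration of the deadline/PID tie-break
 * implemented by edf_higher_prio() above. The struct and sample tasks are
 * hypothetical; priority inheritance and the is_realtime() checks are omitted
 * so that only the ordering rule itself is visible.
 */
#include <stdio.h>

struct fake_task {
	unsigned long deadline;
	int pid;
};

/* returns nonzero if a has higher EDF priority than b */
static int edf_higher(const struct fake_task *a, const struct fake_task *b)
{
	return a->deadline < b->deadline ||
	       (a->deadline == b->deadline && a->pid < b->pid);
}

int main(void)
{
	struct fake_task a = { .deadline = 100, .pid = 42 };
	struct fake_task b = { .deadline = 100, .pid = 17 };

	/* equal deadlines, so the lower PID (b) wins the tie-break */
	printf("a over b: %d, b over a: %d\n",
	       edf_higher(&a, &b), edf_higher(&b, &a));
	return 0;
}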
++ */
++long edf_sleep_next_period(void)
++{
++	/* Mark that we do not execute anymore */
++	set_rt_flags(current, RT_F_SLEEP);
++	/* call schedule, this will return when a new job arrives
++	 * it also takes care of preparing for the next release
++	 */
++	schedule();
++	return 0;
++}
++
+diff --git a/kernel/fifo_common.c b/kernel/fifo_common.c
+new file mode 100644
+index 0000000..c1641a1
+--- /dev/null
++++ b/kernel/fifo_common.c
+@@ -0,0 +1,86 @@
++/*
++ * kernel/fifo_common.c
++ *
++ * FIFO helper functions. Could one day be a FIFO plugin if someone
++ * is interested.
++ *
++ * The current FIFO implementation automatically chops Linux tasks into
++ * smaller jobs by assigning a fixed time slice. Once that time slice expires,
++ * it is treated as a new job release (that is queued in the back).
++ *
++ * The result is that it provides FIFO properties on a job level and round-robin
++ * on a task level if the tasks execute continuously.
++ */
++
++#include 
++#include 
++#include 
++#include 
++
++#include 
++#include 
++#include 
++#include 
++
++/* This function is defined in sched.c. We need to access it for
++ * indirect switching.
++ */
++void __activate_task(struct task_struct *p, runqueue_t *rq);
++
++/* fifo_higher_prio - returns true if first has a higher FIFO priority
++ * than second. Release time ties are broken by PID.
++ *
++ * first must not be NULL and must be a real-time task.
++ * second may be NULL or a non-rt task.
++ */
++int fifo_higher_prio(struct task_struct* first,
++		     struct task_struct* second)
++{
++	struct task_struct *first_task = first;
++	struct task_struct *second_task = second;
++
++	/* Check for inherited priorities. Change task
++	 * used for comparison in such a case.
++	 */
++	if (first && first->rt_param.inh_task)
++		first_task = first->rt_param.inh_task;
++	if (second && second->rt_param.inh_task)
++		second_task = second->rt_param.inh_task;
++
++	return
++		/* does the second task exist and is it a real-time task? If
++		 * not, the first task (which is an RT task) has higher
++		 * priority.
++		 */
++		!second_task || !is_realtime(second_task) ||
++
++		/* is the release of the first task earlier?
++		 * Then it has higher priority.
++		 */
++		earlier_last_release(first_task, second_task) ||
++
++		/* Do we have a release time tie?
++		 * Then break by PID.
++		 */
++		(get_last_release(first_task) ==
++		 get_last_release(second_task) &&
++		 (first_task->pid < second_task->pid ||
++
++		/* If the PIDs are the same then the task with the inherited
++		 * priority wins.
++ */ ++ (first_task->pid == second_task->pid && ++ !second->rt_param.inh_task))); ++} ++ ++int fifo_ready_order(struct list_head* a, struct list_head* b) ++{ ++ return fifo_higher_prio( ++ list_entry(a, struct task_struct, rt_list), ++ list_entry(b, struct task_struct, rt_list)); ++} ++ ++void fifo_domain_init(rt_domain_t* rt, check_resched_needed_t resched) ++{ ++ rt_domain_init(rt, resched, fifo_ready_order); ++} +diff --git a/kernel/fork.c b/kernel/fork.c +index d57118d..d786dcf 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -57,6 +57,9 @@ + #include + #include + ++#include ++#include ++ + /* + * Protected counters by write_lock_irq(&tasklist_lock) + */ +@@ -118,6 +121,9 @@ void __put_task_struct(struct task_struct *tsk) + WARN_ON(atomic_read(&tsk->usage)); + WARN_ON(tsk == current); + ++ if (is_realtime(tsk)) ++ exit_litmus(tsk); ++ + security_task_free(tsk); + free_uid(tsk->user); + put_group_info(tsk->group_info); +diff --git a/kernel/ft_event.c b/kernel/ft_event.c +new file mode 100644 +index 0000000..10318ee +--- /dev/null ++++ b/kernel/ft_event.c +@@ -0,0 +1,104 @@ ++#include ++ ++#include ++ ++/* the feather trace management functions assume ++ * exclusive access to the event table ++ */ ++ ++ ++#define BYTE_JUMP 0xeb ++#define BYTE_JUMP_LEN 0x02 ++ ++/* for each event, there is an entry in the event table */ ++struct trace_event { ++ long id; ++ long count; ++ long start_addr; ++ long end_addr; ++}; ++ ++extern struct trace_event __start___event_table[]; ++extern struct trace_event __stop___event_table[]; ++ ++int ft_enable_event(unsigned long id) ++{ ++ struct trace_event* te = __start___event_table; ++ int count = 0; ++ char* delta; ++ unsigned char* instr; ++ ++ while (te < __stop___event_table) { ++ if (te->id == id && ++te->count == 1) { ++ instr = (unsigned char*) te->start_addr; ++ /* make sure we don't clobber something wrong */ ++ if (*instr == BYTE_JUMP) { ++ delta = (((unsigned char*) te->start_addr) + 1); ++ *delta = 0; ++ } ++ } ++ if (te->id == id) ++ count++; ++ te++; ++ } ++ return count; ++} ++ ++int ft_disable_event(unsigned long id) ++{ ++ struct trace_event* te = __start___event_table; ++ int count = 0; ++ char* delta; ++ unsigned char* instr; ++ ++ while (te < __stop___event_table) { ++ if (te->id == id && --te->count == 0) { ++ instr = (unsigned char*) te->start_addr; ++ if (*instr == BYTE_JUMP) { ++ delta = (((unsigned char*) te->start_addr) + 1); ++ *delta = te->end_addr - te->start_addr - ++ BYTE_JUMP_LEN; ++ } ++ } ++ if (te->id == id) ++ count++; ++ te++; ++ } ++ return count; ++} ++ ++int ft_disable_all_events(void) ++{ ++ struct trace_event* te = __start___event_table; ++ int count = 0; ++ char* delta; ++ unsigned char* instr; ++ ++ while (te < __stop___event_table) { ++ if (te->count) { ++ instr = (unsigned char*) te->start_addr; ++ if (*instr == BYTE_JUMP) { ++ delta = (((unsigned char*) te->start_addr) ++ + 1); ++ *delta = te->end_addr - te->start_addr - ++ BYTE_JUMP_LEN; ++ te->count = 0; ++ count++; ++ } ++ } ++ te++; ++ } ++ return count; ++} ++ ++int ft_is_event_enabled(unsigned long id) ++{ ++ struct trace_event* te = __start___event_table; ++ ++ while (te < __stop___event_table) { ++ if (te->id == id) ++ return te->count; ++ te++; ++ } ++ return 0; ++} +diff --git a/kernel/litmus.c b/kernel/litmus.c +new file mode 100644 +index 0000000..8f238ba +--- /dev/null ++++ b/kernel/litmus.c +@@ -0,0 +1,953 @@ ++/* litmus.c -- Implementation of the LITMUS syscalls, the LITMUS intialization, ++ * and the common tick function. 
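/* A user-space sketch of the byte-patching idea used by ft_enable_event() and
 * ft_disable_event() above: each event site starts with a short jump (0xeb,
 * followed by an 8-bit displacement). Disabling an event sets the displacement
 * so that the jump skips the instrumentation; enabling it sets the displacement
 * to zero so execution falls through. The buffer below stands in for the code
 * bytes between start_addr and end_addr and is purely hypothetical.
 */
#include <stdio.h>

#define BYTE_JUMP     0xeb
#define BYTE_JUMP_LEN 0x02

static void set_event(unsigned char *site, long start, long end, int enable)
{
	if (site[0] != BYTE_JUMP)      /* same sanity check as the kernel code */
		return;
	site[1] = enable ? 0 : (unsigned char)(end - start - BYTE_JUMP_LEN);
}

int main(void)
{
	/* 2-byte jump followed by 4 bytes of (pretend) instrumentation code */
	unsigned char site[6] = { BYTE_JUMP, 0x04, 0x90, 0x90, 0x90, 0x90 };
	long start = 0, end = sizeof(site);

	set_event(site, start, end, 1);
	printf("enabled: displacement = %d (fall through)\n", (int)site[1]);
	set_event(site, start, end, 0);
	printf("disabled: displacement = %d (skip %d bytes)\n",
	       (int)site[1], (int)(end - start - BYTE_JUMP_LEN));
	return 0;
}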
++ */
++#include 
++#include 
++#include 
++
++#include 
++#include 
++#include 
++#include 
++#include 
++
++#include 
++
++#define MAX_SERVICE_LEVELS 10
++
++/* Variables that govern the scheduling process */
++spolicy sched_policy = SCHED_DEFAULT;
++int sched_options = 0;
++
++
++/* This is a flag for switching the system into RT mode when it is booted up.
++ * In RT-mode non-realtime tasks are scheduled as background tasks.
++ */
++
++/* The system is booting in non-realtime mode */
++atomic_t rt_mode = ATOMIC_INIT(MODE_NON_RT);
++/* Here we specify a mode change to be made */
++atomic_t new_mode = ATOMIC_INIT(MODE_NON_RT);
++/* Number of RT tasks that exist in the system */
++atomic_t n_rt_tasks = ATOMIC_INIT(0);
++
++/* Only one CPU may perform a mode change. */
++static queuelock_t mode_change_lock;
++
++/* The time instant when we switched to RT mode */
++volatile jiffie_t rt_start_time = 0;
++
++/* To send signals from the scheduler
++ * Must drop locks first.
++ */
++static LIST_HEAD(sched_sig_list);
++static DEFINE_SPINLOCK(sched_sig_list_lock);
++
++/**
++ * sys_set_rt_mode
++ * @newmode: new mode the scheduler must be switched to
++ * External syscall for setting the RT mode flag
++ * Returns EINVAL if mode is not recognized or mode transition is
++ * not permitted
++ * On success 0 is returned
++ *
++ * FIXME: In a "real" OS we cannot just let any user switch the mode...
++ */
++asmlinkage long sys_set_rt_mode(int newmode)
++{
++	if ((newmode == MODE_NON_RT) || (newmode == MODE_RT_RUN)) {
++		printk(KERN_INFO "real-time mode switch to %s\n",
++		       (newmode == MODE_RT_RUN ? "rt" : "non-rt"));
++		atomic_set(&new_mode, newmode);
++		return 0;
++	}
++	return -EINVAL;
++}
++
++/*
++ * sys_set_rt_task_param
++ * @pid: Pid of the task whose scheduling parameters must be changed
++ * @param: New real-time extension parameters such as the execution cost and
++ * period
++ * Syscall for manipulating a task's rt extension params
++ * Returns EFAULT if param is NULL.
++ * ESRCH if pid does not correspond
++ * to a valid task.
++ * EINVAL if either period or execution cost is <=0
++ * EPERM if pid is a real-time task
++ * 0 if success
++ *
++ * Only non-real-time tasks may be configured with this system call
++ * to avoid races with the scheduler. In practice, this means that a
++ * task's parameters must be set _before_ calling sys_prepare_rt_task().
++ */
++asmlinkage long sys_set_rt_task_param(pid_t pid, rt_param_t __user * param)
++{
++	rt_param_t tp;
++	struct task_struct *target;
++	int retval = -EINVAL;
++
++	printk("Setting up rt task parameters for process %d.\n", pid);
++
++	if (pid < 0 || param == 0) {
++		goto out;
++	}
++	if (copy_from_user(&tp, param, sizeof(tp))) {
++		retval = -EFAULT;
++		goto out;
++	}
++
++	/* Task search and manipulation must be protected */
++	read_lock_irq(&tasklist_lock);
++	if (!(target = find_task_by_pid(pid))) {
++		retval = -ESRCH;
++		goto out_unlock;
++	}
++
++	if (is_realtime(target)) {
++		/* The task is already a real-time task.
++		 * We cannot allow parameter changes at this point.
++ */ ++ retval = -EPERM; ++ goto out_unlock; ++ } ++ ++ if (tp.exec_cost <= 0) ++ goto out_unlock; ++ if (tp.period <= 0) ++ goto out_unlock; ++ if (!cpu_online(tp.cpu)) ++ goto out_unlock; ++ if (tp.period < tp.exec_cost) ++ { ++ printk(KERN_INFO "litmus: real-time task %d rejected " ++ "because wcet > period\n", pid); ++ goto out_unlock; ++ } ++ ++ /* Assign params */ ++ target->rt_param.basic_params = tp; ++ ++ retval = 0; ++ out_unlock: ++ read_unlock_irq(&tasklist_lock); ++ out: ++ return retval; ++} ++ ++/* Getter of task's RT params ++ * returns EINVAL if param or pid is NULL ++ * returns ESRCH if pid does not correspond to a valid task ++ * returns EFAULT if copying of parameters has failed. ++ */ ++asmlinkage long sys_get_rt_task_param(pid_t pid, rt_param_t __user * param) ++{ ++ int retval = -EINVAL; ++ struct task_struct *source; ++ rt_param_t lp; ++ if (param == 0 || pid < 0) ++ goto out; ++ read_lock(&tasklist_lock); ++ if (!(source = find_task_by_pid(pid))) { ++ retval = -ESRCH; ++ goto out_unlock; ++ } ++ lp = source->rt_param.basic_params; ++ read_unlock(&tasklist_lock); ++ /* Do copying outside the lock */ ++ retval = ++ copy_to_user(param, &lp, sizeof(lp)) ? -EFAULT : 0; ++ return retval; ++ out_unlock: ++ read_unlock(&tasklist_lock); ++ out: ++ return retval; ++ ++} ++ ++/* ++ * sys_set_service_levels ++ * @pid: Pid of the task that is to be configured ++ * @count: The number of service levels ++ * @levels: The new service levels. ++ * ++ * Returns EFAULT if levels is not a valid address. ++ * ESRCH if pid is not corrsponding ++ * to a valid task. ++ * EINVAL if either period or execution cost is <=0 for any level, ++ * of if utility is not incresing. ++ * EPERM if pid is a real-time task ++ * ENOMEM if there is insufficient memory available ++ * 0 if success ++ * ++ * May not be used on RT tasks to avoid races. ++ */ ++asmlinkage long sys_set_service_levels(pid_t pid, ++ unsigned int count, ++ service_level_t __user *levels) ++{ ++ struct task_struct *target; ++ service_level_t level, *klevels; ++ int retval = -EINVAL, i; ++ fp_t last_value = FP(0); ++ fp_t last_weight = FP(0); ++ ++ TRACE("Setting up service levels for process %d.\n", pid); ++ ++ if (pid < 0 || count > MAX_SERVICE_LEVELS) { ++ goto out; ++ } ++ ++ /* Task search and manipulation must be protected */ ++ read_lock_irq(&tasklist_lock); ++ if (!(target = find_task_by_pid(pid))) { ++ retval = -ESRCH; ++ read_unlock_irq(&tasklist_lock); ++ goto out; ++ } ++ read_unlock_irq(&tasklist_lock); ++ ++ if (is_realtime(target)) { ++ /* The task is already a real-time task. ++ * We cannot not allow parameter changes at this point. 
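/* A plain-C sketch of the admission rules enforced by sys_set_service_levels()
 * above: both the weight (utilization) and the value of successive service
 * levels must be strictly increasing, and the task initially runs at level 0,
 * whose execution cost is that level's weight times its period. The kernel
 * does this in fixed point via _mul()/_round(); doubles are used here only for
 * illustration, and the sample numbers are made up. Compile with -lm.
 */
#include <math.h>
#include <stdio.h>

struct level { double weight; double value; unsigned int period; };

static int check_levels(const struct level *lv, int n, unsigned int *exec_cost)
{
	double last_w = 0.0, last_v = 0.0;
	int i;

	for (i = 0; i < n; i++) {
		if (lv[i].period == 0 || lv[i].weight <= last_w ||
		    lv[i].value <= last_v)
			return -1;              /* would be rejected (-EINVAL) */
		last_w = lv[i].weight;
		last_v = lv[i].value;
	}
	*exec_cost = (unsigned int)round(lv[0].weight * lv[0].period);
	return 0;
}

int main(void)
{
	struct level lv[] = { { 0.10, 1.0, 100 }, { 0.25, 2.0, 100 } };
	unsigned int exec_cost;

	if (check_levels(lv, 2, &exec_cost) == 0)
		printf("accepted, level-0 exec_cost = %u\n", exec_cost);
	return 0;
}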
++ */ ++ retval = -EPERM; ++ goto out; ++ } ++ ++ /* get rid of old service levels, if any */ ++ kfree(target->rt_param.service_level); ++ target->rt_param.service_level = NULL; ++ target->rt_param.no_service_levels = 0; ++ ++ /* count == 0 means tear down service levels*/ ++ if (count == 0) { ++ retval = 0; ++ goto out; ++ } ++ ++ klevels = kmalloc(sizeof(service_level_t) * count, GFP_KERNEL); ++ if (!klevels) { ++ retval = -ENOMEM; ++ goto out; ++ } ++ ++ for (i = 0; i < count; i++) { ++ if (copy_from_user(&level, levels + i, sizeof(level))) { ++ retval = -EFAULT; ++ kfree(klevels); ++ goto out; ++ } ++ if (level.period <= 0) { ++ TRACE("service level %d period <= 0\n", i); ++ goto out; ++ } ++ if (_leq(level.weight, last_weight)) { ++ TRACE("service level %d weight non-increase\n", i); ++ goto out; ++ } ++ if (_leq(level.value, last_value)) { ++ TRACE("service level %d value non-increase\n", i); ++ goto out; ++ } ++ last_value = level.value; ++ last_weight = level.weight; ++ klevels[i] = level; ++ } ++ target->rt_param.basic_params.exec_cost = _round(_mul(klevels[0].weight, ++ FP(klevels[0].period))); ++ target->rt_param.basic_params.period = klevels[0].period; ++ target->rt_param.service_level = klevels; ++ target->rt_param.no_service_levels = count; ++ retval = 0; ++ ++ out: ++ return retval; ++} ++ ++asmlinkage long sys_get_cur_service_level(void) ++{ ++ long level; ++ ++ if (!is_realtime(current)) ++ return -EINVAL; ++ ++ /* block scheduler that might cause reweighting to happen */ ++ local_irq_disable(); ++ level = current->rt_param.cur_service_level; ++ local_irq_enable(); ++ return level; ++} ++ ++ ++/* ++ * sys_prepare_rt_task ++ * @pid: Pid of the task we want to prepare for RT mode ++ * Syscall for adding a task to RT queue, plugin dependent. ++ * Must be called before RT tasks are going to start up. ++ * Returns EPERM if current plugin does not define prepare operation ++ * or scheduling policy does not allow the operation. ++ * ESRCH if pid does not correspond to a valid task. ++ * EINVAL if a task is non-realtime or in invalid state ++ * from underlying plugin function ++ * EAGAIN if a task is not in the right state ++ * ENOMEM if there is no memory space to handle this task ++ * 0 if success ++ */ ++asmlinkage long sys_prepare_rt_task(pid_t pid) ++{ ++ int retval = -EINVAL; ++ struct task_struct *target = 0; ++ /* If a plugin does not define preparation mode then nothing to do */ ++ if (curr_sched_plugin->prepare_task == 0 ++ || sched_policy == SCHED_DEFAULT) { ++ retval = -EPERM; ++ goto out_prepare; ++ } ++ read_lock_irq(&tasklist_lock); ++ if (!(target = find_task_by_pid(pid))) { ++ retval = -ESRCH; ++ goto out_prepare_unlock; ++ } ++ if (!cpu_online(get_partition(target))) ++ { ++ printk(KERN_WARNING "litmus prepare: cpu %d is not online\n", ++ get_partition(target)); ++ goto out_prepare_unlock; ++ } ++ retval = curr_sched_plugin->prepare_task(target); ++ if (!retval) { ++ atomic_inc(&n_rt_tasks); ++ target->rt_param.is_realtime = 1; ++ target->rt_param.litmus_controlled = 1; ++ } ++ out_prepare_unlock: ++ read_unlock_irq(&tasklist_lock); ++ out_prepare: ++ return retval; ++} ++ ++ ++/* implemented in kernel/litmus_sem.c */ ++void srp_ceiling_block(void); ++ ++/* ++ * This is the crucial function for periodic task implementation, ++ * It checks if a task is periodic, checks if such kind of sleep ++ * is permitted and calls plugin-specific sleep, which puts the ++ * task into a wait array. 
++ * returns 0 on successful wakeup ++ * returns EPERM if current conditions do not permit such sleep ++ * returns EINVAL if current task is not able to go to sleep ++ */ ++asmlinkage long sys_sleep_next_period(void) ++{ ++ int retval = -EPERM; ++ if (!is_realtime(current)) { ++ retval = -EINVAL; ++ goto out; ++ } ++ /* Task with negative or zero period cannot sleep */ ++ if (get_rt_period(current) <= 0) { ++ retval = -EINVAL; ++ goto out; ++ } ++ /* The plugin has to put the task into an ++ * appropriate queue and call schedule ++ */ ++ retval = curr_sched_plugin->sleep_next_period(); ++ if (!retval && is_subject_to_srp(current)) ++ srp_ceiling_block(); ++ out: ++ return retval; ++} ++ ++/* This is an "improved" version of sys_sleep_next_period() that ++ * addresses the problem of unintentionally missing a job after ++ * an overrun. ++ * ++ * returns 0 on successful wakeup ++ * returns EPERM if current conditions do not permit such sleep ++ * returns EINVAL if current task is not able to go to sleep ++ */ ++asmlinkage long sys_wait_for_job_release(unsigned int job) ++{ ++ int retval = -EPERM; ++ if (!is_realtime(current)) { ++ retval = -EINVAL; ++ goto out; ++ } ++ ++ /* Task with negative or zero period cannot sleep */ ++ if (get_rt_period(current) <= 0) { ++ retval = -EINVAL; ++ goto out; ++ } ++ ++ retval = 0; ++ ++ /* first wait until we have "reached" the desired job ++ * ++ * This implementation has at least two problems: ++ * ++ * 1) It doesn't gracefully handle the wrap around of ++ * job_no. Since LITMUS is a prototype, this is not much ++ * of a problem right now. ++ * ++ * 2) It is theoretically racy if a job release occurs ++ * between checking job_no and calling sleep_next_period(). ++ * A proper solution would requiring adding another callback ++ * in the plugin structure and testing the condition with ++ * interrupts disabled. ++ * ++ * FIXME: At least problem 2 should be taken care of eventually. ++ */ ++ while (!retval && job > current->rt_param.times.job_no) ++ /* If the last job overran then job <= job_no and we ++ * don't send the task to sleep. ++ */ ++ retval = curr_sched_plugin->sleep_next_period(); ++ ++ /* We still have to honor the SRP after the actual release. ++ */ ++ if (!retval && is_subject_to_srp(current)) ++ srp_ceiling_block(); ++ out: ++ return retval; ++} ++ ++/* This is a helper syscall to query the current job sequence number. ++ * ++ * returns 0 on successful query ++ * returns EPERM if task is not a real-time task. ++ * returns EFAULT if &job is not a valid pointer. ++ */ ++asmlinkage long sys_query_job_no(unsigned int __user *job) ++{ ++ int retval = -EPERM; ++ if (is_realtime(current)) ++ retval = put_user(current->rt_param.times.job_no, job); ++ ++ return retval; ++} ++ ++ ++/* The LITMUS tick function. It manages the change to and from real-time mode ++ * and then calls the plugin's tick function. ++ */ ++reschedule_check_t __sched rt_scheduler_tick(void) ++{ ++ /* Check for mode change */ ++ if ((get_rt_mode() != atomic_read(&new_mode))) { ++ queue_lock(&mode_change_lock); ++ // If the mode is already changed, proceed ++ if (get_rt_mode() == atomic_read(&new_mode)) { ++ queue_unlock(&mode_change_lock); ++ goto proceed; ++ } ++ // change the mode ++ if ((atomic_read(&new_mode) == MODE_RT_RUN)) { ++ /* The deferral of entering real-time mode should be ++ * handled by deferring task releases in the plugin. ++ * The plugin interface does not really need to know ++ * about quanta, that is the plugin's job. 
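/* A sketch of the user-space job loop that sys_sleep_next_period() and
 * sys_wait_for_job_release() above are meant to support: a periodic task
 * processes one job, then suspends until its next release, using the job
 * number so that an overrunning job does not accidentally skip a release.
 * The two wrappers are stand-ins for whatever entry points liblitmus exports
 * for these syscalls; they are stubbed so the sketch compiles on its own and
 * only the control flow is real.
 */
#include <stdio.h>

static long query_job_no(unsigned int *job) { *job = 0; return 0; }       /* stub */
static long wait_for_job_release(unsigned int job) { (void)job; return 0; } /* stub */

static void do_one_job(unsigned int job) { printf("job %u\n", job); }

int main(void)
{
	unsigned int job;
	int i;

	query_job_no(&job);
	for (i = 0; i < 3; i++) {
		do_one_job(job);
		/* wait for the release after the job we just ran; if we
		 * overran, job + 1 is already released and the call returns
		 * immediately instead of sleeping a full period
		 */
		job++;
		wait_for_job_release(job);
	}
	return 0;
}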
++ */ ++ ++ /* update rt start time */ ++ rt_start_time = jiffies; ++ printk(KERN_INFO "Real-Time mode enabled at %ld " ++ "on %d\n", ++ jiffies, smp_processor_id()); ++ } else ++ printk(KERN_INFO "Real-Time mode disabled at %ld " ++ "on %d\n", ++ jiffies, smp_processor_id()); ++ if (curr_sched_plugin->mode_change) ++ curr_sched_plugin-> ++ mode_change(atomic_read(&new_mode)); ++ printk(KERN_INFO "Plugin mode change done at %ld\n", ++ jiffies); ++ set_rt_mode(atomic_read(&new_mode)); ++ queue_unlock(&mode_change_lock); ++ } ++ ++ proceed: ++ /* Call plugin-defined tick handler ++ * ++ * It is the plugin's tick handler' job to detect quantum ++ * boundaries in pfair. ++ */ ++ return curr_sched_plugin->scheduler_tick(); ++} ++ ++asmlinkage spolicy sys_sched_setpolicy(spolicy newpolicy) ++{ ++ /* Dynamic policy change is disabled at the moment */ ++ return SCHED_INVALID; ++} ++ ++asmlinkage spolicy sys_sched_getpolicy(void) ++{ ++ return sched_policy; ++} ++ ++ ++asmlinkage int sys_scheduler_setup(int cmd, void __user *parameter) ++{ ++ int ret = -EINVAL; ++ ++ ret = curr_sched_plugin->scheduler_setup(cmd, parameter); ++ ++ return ret; ++} ++ ++struct sched_sig { ++ struct list_head list; ++ struct task_struct* task; ++ unsigned int signal:31; ++ int force:1; ++}; ++ ++static void __scheduler_signal(struct task_struct *t, unsigned int signo, ++ int force) ++{ ++ struct sched_sig* sig; ++ ++ sig = kmalloc(GFP_ATOMIC, sizeof(struct sched_sig)); ++ if (!sig) { ++ TRACE_TASK(t, "dropping signal: %u\n", t); ++ return; ++ } ++ ++ spin_lock(&sched_sig_list_lock); ++ ++ sig->signal = signo; ++ sig->force = force; ++ sig->task = t; ++ get_task_struct(t); ++ list_add(&sig->list, &sched_sig_list); ++ ++ spin_unlock(&sched_sig_list_lock); ++} ++ ++void scheduler_signal(struct task_struct *t, unsigned int signo) ++{ ++ __scheduler_signal(t, signo, 0); ++} ++ ++void force_scheduler_signal(struct task_struct *t, unsigned int signo) ++{ ++ __scheduler_signal(t, signo, 1); ++} ++ ++void send_scheduler_signals(void) ++{ ++ unsigned long flags; ++ struct list_head *p, *extra; ++ struct siginfo info; ++ struct sched_sig* sig; ++ struct task_struct* t; ++ struct list_head claimed; ++ ++ if (spin_trylock_irqsave(&sched_sig_list_lock, flags)) { ++ if (list_empty(&sched_sig_list)) ++ p = NULL; ++ else { ++ p = sched_sig_list.next; ++ list_del(&sched_sig_list); ++ INIT_LIST_HEAD(&sched_sig_list); ++ } ++ spin_unlock_irqrestore(&sched_sig_list_lock, flags); ++ ++ /* abort if there are no signals */ ++ if (!p) ++ return; ++ ++ /* take signal list we just obtained */ ++ list_add(&claimed, p); ++ ++ list_for_each_safe(p, extra, &claimed) { ++ list_del(p); ++ sig = list_entry(p, struct sched_sig, list); ++ t = sig->task; ++ info.si_signo = sig->signal; ++ info.si_errno = 0; ++ info.si_code = SI_KERNEL; ++ info.si_pid = 1; ++ info.si_uid = 0; ++ TRACE("sending signal %d to %d\n", info.si_signo, ++ t->pid); ++ if (sig->force) ++ force_sig_info(sig->signal, &info, t); ++ else ++ send_sig_info(sig->signal, &info, t); ++ put_task_struct(t); ++ kfree(sig); ++ } ++ } ++ ++} ++ ++static inline void np_mem_error(struct task_struct* t, const char* reason) ++{ ++ if (t->state != TASK_DEAD && !(t->flags & PF_EXITING)) { ++ TRACE("np section: %s => %s/%d killed\n", ++ reason, t->comm, t->pid); ++ force_scheduler_signal(t, SIGKILL); ++ } ++} ++ ++/* sys_register_np_flag() allows real-time tasks to register an ++ * np section indicator. 
++ * returns 0 if the flag was successfully registered ++ * returns EINVAL if current task is not a real-time task ++ * returns EFAULT if *flag couldn't be written ++ */ ++asmlinkage long sys_register_np_flag(short __user *flag) ++{ ++ int retval = -EINVAL; ++ short test_val = RT_PREEMPTIVE; ++ ++ /* avoid races with the scheduler */ ++ preempt_disable(); ++ TRACE("reg_np_flag(%p) for %s/%d\n", flag, ++ current->comm, current->pid); ++ if (!is_realtime(current)) ++ goto out; ++ ++ /* Let's first try to write to the address. ++ * That way it is initialized and any bugs ++ * involving dangling pointers will caught ++ * early. ++ * NULL indicates disabling np section support ++ * and should not be tested. ++ */ ++ if (flag) ++ retval = poke_kernel_address(test_val, flag); ++ else ++ retval = 0; ++ TRACE("reg_np_flag: retval=%d\n", retval); ++ if (unlikely(0 != retval)) ++ np_mem_error(current, "np flag: not writable"); ++ else ++ /* the pointer is ok */ ++ current->rt_param.np_flag = flag; ++ ++ out: ++ preempt_enable(); ++ /* force rescheduling so that we can be preempted */ ++ return retval; ++} ++ ++ ++void request_exit_np(struct task_struct *t) ++{ ++ int ret; ++ short flag; ++ ++ /* We can only do this if t is actually currently scheduled on this CPU ++ * because otherwise we are in the wrong address space. Thus make sure ++ * to check. ++ */ ++ BUG_ON(t != current); ++ ++ if (unlikely(!is_realtime(t) || !t->rt_param.np_flag)) { ++ TRACE_TASK(t, "request_exit_np(): BAD TASK!\n"); ++ return; ++ } ++ ++ flag = RT_EXIT_NP_REQUESTED; ++ ret = poke_kernel_address(flag, t->rt_param.np_flag + 1); ++ TRACE("request_exit_np(%s/%d)\n", t->comm, t->pid); ++ if (unlikely(0 != ret)) ++ np_mem_error(current, "request_exit_np(): flag not writable"); ++ ++} ++ ++ ++int is_np(struct task_struct* t) ++{ ++ int ret; ++ unsigned short flag = 0x5858; /* = XX, looks nicer in debug*/ ++ ++ BUG_ON(t != current); ++ ++ if (unlikely(t->rt_param.kernel_np)) ++ return 1; ++ else if (unlikely(t->rt_param.np_flag == NULL) || ++ t->flags & PF_EXITING || ++ t->state == TASK_DEAD) ++ return 0; ++ else { ++ /* This is the tricky part. The process has registered a ++ * non-preemptive section marker. We now need to check whether ++ * it is set to to NON_PREEMPTIVE. Along the way we could ++ * discover that the pointer points to an unmapped region (=> ++ * kill the task) or that the location contains some garbage ++ * value (=> also kill the task). Killing the task in any case ++ * forces userspace to play nicely. Any bugs will be discovered ++ * immediately. ++ */ ++ ret = probe_kernel_address(t->rt_param.np_flag, flag); ++ if (0 == ret && (flag == RT_NON_PREEMPTIVE || ++ flag == RT_PREEMPTIVE)) ++ return flag != RT_PREEMPTIVE; ++ else { ++ /* either we could not read from the address or ++ * it contained garbage => kill the process ++ * FIXME: Should we cause a SEGFAULT instead? ++ */ ++ TRACE("is_np: ret=%d flag=%c%c (%x)\n", ret, ++ flag & 0xff, (flag >> 8) & 0xff, flag); ++ np_mem_error(t, "is_np() could not read"); ++ return 0; ++ } ++ } ++} ++ ++/* ++ * sys_exit_np() allows real-time tasks to signal that it left a ++ * non-preemptable section. It will be called after the kernel requested a ++ * callback in the preemption indicator flag. ++ * returns 0 if the signal was valid and processed. 
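/* A user-space sketch of the non-preemptive-section protocol implemented by
 * sys_register_np_flag(), is_np(), request_exit_np() and sys_exit_np(): the
 * task registers a small flag area, writes RT_NON_PREEMPTIVE into the first
 * short while it must not be preempted, and the kernel writes an exit request
 * into the short right after it (np_flag + 1). The constant values and the
 * syscall stubs below are placeholders; the real definitions live in the
 * LITMUS headers and liblitmus.
 */
#include <stdio.h>

#define RT_PREEMPTIVE        1   /* hypothetical values */
#define RT_NON_PREEMPTIVE    2
#define RT_EXIT_NP_REQUESTED 3

static short np_flag[2];                                   /* [0] state, [1] request */
static long register_np_flag(short *f) { (void)f; return 0; } /* stub for the syscall */
static long exit_np(void)              { return 0; }          /* stub for sys_exit_np */

static void enter_np(void) { np_flag[0] = RT_NON_PREEMPTIVE; }

static void leave_np(void)
{
	np_flag[0] = RT_PREEMPTIVE;
	/* if the scheduler wanted to preempt us, tell it we are done */
	if (np_flag[1] == RT_EXIT_NP_REQUESTED) {
		np_flag[1] = 0;
		exit_np();
	}
}

int main(void)
{
	np_flag[0] = RT_PREEMPTIVE;
	register_np_flag(np_flag);
	enter_np();
	/* ... critical section the scheduler must not interrupt ... */
	leave_np();
	printf("np section protocol sketch done\n");
	return 0;
}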
++ * returns EINVAL if current task is not a real-time task ++ */ ++asmlinkage long sys_exit_np(void) ++{ ++ int retval = -EINVAL; ++ ++ TS_EXIT_NP_START; ++ ++ if (!is_realtime(current)) ++ goto out; ++ ++ TRACE("sys_exit_np(%s/%d)\n", current->comm, current->pid); ++ /* force rescheduling so that we can be preempted */ ++ set_tsk_need_resched(current); ++ retval = 0; ++ out: ++ ++ TS_EXIT_NP_END; ++ return retval; ++} ++ ++void exit_litmus(struct task_struct *dead_tsk) ++{ ++ kfree(dead_tsk->rt_param.service_level); ++ curr_sched_plugin->tear_down(dead_tsk); ++} ++ ++ ++void list_qsort(struct list_head* list, list_cmp_t less_than) ++{ ++ struct list_head lt; ++ struct list_head geq; ++ struct list_head *pos, *extra, *pivot; ++ int n_lt = 0, n_geq = 0; ++ BUG_ON(!list); ++ ++ if (list->next == list) ++ return; ++ ++ INIT_LIST_HEAD(<); ++ INIT_LIST_HEAD(&geq); ++ ++ pivot = list->next; ++ list_del(pivot); ++ list_for_each_safe(pos, extra, list) { ++ list_del(pos); ++ if (less_than(pos, pivot)) { ++ list_add(pos, <); ++ n_lt++; ++ } else { ++ list_add(pos, &geq); ++ n_geq++; ++ } ++ } ++ if (n_lt < n_geq) { ++ list_qsort(<, less_than); ++ list_qsort(&geq, less_than); ++ } else { ++ list_qsort(&geq, less_than); ++ list_qsort(<, less_than); ++ } ++ list_splice(&geq, list); ++ list_add(pivot, list); ++ list_splice(<, list); ++} ++ ++#ifdef CONFIG_MAGIC_SYSRQ ++/* We offer the possibility to change the real-time mode of the system ++ * with a magic sys request. This helps in debugging in case the system fails ++ * to perform its planned switch back to normal mode. This may happen if we have ++ * total system utilization and the task that is supposed to do the switch is ++ * always preempted (if it is not a real-time task). ++ */ ++int sys_kill(int pid, int sig); ++ ++ ++static void sysrq_handle_toGgle_rt_mode(int key, struct tty_struct *tty) ++{ ++ sys_set_rt_mode(get_rt_mode() == MODE_NON_RT); ++} ++ ++static struct sysrq_key_op sysrq_toGgle_rt_mode_op = { ++ .handler = sysrq_handle_toGgle_rt_mode, ++ .help_msg = "toGgle-rt-mode", ++ .action_msg = "real-time mode changed", ++}; ++ ++static void sysrq_handle_kill_rt_tasks(int key, struct tty_struct *tty) ++{ ++ struct task_struct *t; ++ read_lock(&tasklist_lock); ++ for_each_process(t) { ++ if (is_realtime(t)) { ++ sys_kill(t->pid, SIGKILL); ++ } ++ } ++ read_unlock(&tasklist_lock); ++} ++ ++static struct sysrq_key_op sysrq_kill_rt_tasks_op = { ++ .handler = sysrq_handle_kill_rt_tasks, ++ .help_msg = "Quit-rt-tasks", ++ .action_msg = "sent SIGKILL to all real-time tasks", ++}; ++#endif ++ ++/* ++ * Scheduler initialization so that customized scheduler is ++ * enabled at boot time ++ * by setting boot option "rtsched=plugin_name", e.g. "rtsched=pfair" ++ */ ++ ++/* All we need to know about other plugins is their initialization ++ * functions. These functions initialize internal data structures of a ++ * scheduler and return a pointer to initialized sched_plugin data ++ * structure with pointers to scheduling function implementations. ++ * If called repeatedly these init functions just return an existing ++ * plugin pointer. 
++ */ ++sched_plugin_t *init_global_edf_plugin(void); ++sched_plugin_t *init_global_edf_np_plugin(void); ++sched_plugin_t *init_part_edf_plugin(void); ++sched_plugin_t *init_edf_hsb_plugin(void); ++sched_plugin_t *init_pfair_plugin(void); ++sched_plugin_t *init_gsn_edf_plugin(void); ++sched_plugin_t *init_psn_edf_plugin(void); ++sched_plugin_t *init_adaptive_plugin(void); ++ ++/* keep everything needed to setup plugins in one place */ ++ ++/* we are lazy, so we use a convention for function naming to fill ++ * a table ++ */ ++#define PLUGIN(caps, small) \ ++ {PLUGIN_ ## caps, SCHED_ ## caps, init_ ## small ## _plugin} ++ ++#define init_nosetup_plugin 0 ++ ++static struct { ++ const char *name; ++ const spolicy policy_id; ++ sched_plugin_t *(*init) (void); ++} available_plugins[] = { ++ PLUGIN(LINUX, nosetup), ++ PLUGIN(GLOBAL_EDF_NP, global_edf_np), ++ PLUGIN(GLOBAL_EDF, global_edf), ++ PLUGIN(PART_EDF, part_edf), ++ PLUGIN(EDF_HSB, edf_hsb), ++ PLUGIN(PFAIR, pfair), ++ PLUGIN(GSN_EDF, gsn_edf), ++ PLUGIN(PSN_EDF, psn_edf), ++ PLUGIN(ADAPTIVE, adaptive), ++ /********************************************* ++ * Add your custom plugin here ++ **********************************************/ ++}; ++ ++/* Some plugins may leave important functions unused. We define dummies ++ * so that we don't have to check for null pointers all over the place. ++ */ ++void litmus_dummy_finish_switch(struct task_struct * prev); ++int litmus_dummy_schedule(struct task_struct * prev, struct task_struct** next, ++ runqueue_t* q); ++reschedule_check_t litmus_dummy_scheduler_tick(void); ++long litmus_dummy_prepare_task(struct task_struct *t); ++void litmus_dummy_wake_up_task(struct task_struct *task); ++void litmus_dummy_task_blocks(struct task_struct *task); ++long litmus_dummy_tear_down(struct task_struct *task); ++int litmus_dummy_scheduler_setup(int cmd, void __user *parameter); ++long litmus_dummy_sleep_next_period(void); ++long litmus_dummy_inherit_priority(struct pi_semaphore *sem, ++ struct task_struct *new_owner); ++long litmus_dummy_return_priority(struct pi_semaphore *sem); ++long litmus_dummy_pi_block(struct pi_semaphore *sem, ++ struct task_struct *t); ++ ++#define CHECK(func) {\ ++ if (!curr_sched_plugin->func) \ ++ curr_sched_plugin->func = litmus_dummy_ ## func;} ++ ++static int boot_sched_setup(char *plugin_name) ++{ ++ int i = 0; ++ ++ /* Common initializers, ++ * mode change lock is used to enforce single mode change ++ * operation. ++ */ ++ queue_lock_init(&mode_change_lock); ++ ++ printk("Starting LITMUS^RT kernel\n"); ++ ++ /* Look for a matching plugin. 
++ */ ++ for (i = 0; i < ARRAY_SIZE(available_plugins); i++) { ++ if (!strcmp(plugin_name, available_plugins[i].name)) { ++ printk("Using %s scheduler plugin\n", plugin_name); ++ sched_policy = available_plugins[i].policy_id; ++ if (available_plugins[i].init) ++ curr_sched_plugin = available_plugins[i].init(); ++ goto out; ++ } ++ } ++ ++ ++ /* Otherwise we have default linux scheduler */ ++ printk("Plugin name %s is unknown, using default %s\n", plugin_name, ++ curr_sched_plugin->plugin_name); ++ ++out: ++ /* make sure we don't trip over null pointers later */ ++ CHECK(finish_switch); ++ CHECK(schedule); ++ CHECK(scheduler_tick); ++ CHECK(wake_up_task); ++ CHECK(tear_down); ++ CHECK(task_blocks); ++ CHECK(prepare_task); ++ CHECK(scheduler_setup); ++ CHECK(sleep_next_period); ++ CHECK(inherit_priority); ++ CHECK(return_priority); ++ CHECK(pi_block); ++ ++#ifdef CONFIG_MAGIC_SYSRQ ++ /* offer some debugging help */ ++ if (!register_sysrq_key('g', &sysrq_toGgle_rt_mode_op)) ++ printk("Registered eXit real-time mode magic sysrq.\n"); ++ else ++ printk("Could not register eXit real-time mode magic sysrq.\n"); ++ if (!register_sysrq_key('q', &sysrq_kill_rt_tasks_op)) ++ printk("Registered kill rt tasks magic sysrq.\n"); ++ else ++ printk("Could not register kill rt tasks magic sysrq.\n"); ++#endif ++ printk("Litmus setup complete."); ++ return 1; ++} ++ ++/* Register for boot option */ ++__setup("rtsched=", boot_sched_setup); +diff --git a/kernel/litmus_sem.c b/kernel/litmus_sem.c +new file mode 100644 +index 0000000..12a6ab1 +--- /dev/null ++++ b/kernel/litmus_sem.c +@@ -0,0 +1,765 @@ ++ ++/* ++ * SMP- and interrupt-safe semaphores. Also PI and SRP implementations. ++ * Much of the code here is borrowed from include/asm-i386/semaphore.h. ++ * ++ * NOTE: This implementation is very much a prototype and horribly insecure. It ++ * is intended to be a proof of concept, not a feature-complete solution. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++#include ++/* ************************************************************************** */ ++/* STANDARD FIFO SEMAPHORES */ ++/* ************************************************************************** */ ++ ++#define MAX_SEMAPHORES 16000 ++#define MAX_PI_SEMAPHORES 16000 ++#define MAX_SRP_SEMAPHORES 16000 ++ ++ ++struct semaphore sems[MAX_SEMAPHORES]; /* all sems */ ++typedef int sema_id; /* Userspace ID of a semaphore */ ++ ++static int rt_fifo_wake_up(wait_queue_t *wait, unsigned mode, int sync, ++ void *key) ++{ ++ struct task_struct* t = (struct task_struct*) wait->private; ++ set_rt_flags(t, RT_F_EXIT_SEM); ++ TRACE_TASK(t, "woken up by rt_fifo_wake_up(), set RT_F_EXIT_SEM\n"); ++ default_wake_function(wait, mode, sync, key); ++ /* for reason why we always return 1 see rt_pi_wake_up() below */ ++ return 1; ++} ++ ++static fastcall void rt_fifo_up(struct semaphore * sem) ++{ ++ TRACE_CUR("releases lock %p\n"); ++ preempt_disable(); ++ TS_FIFO_UP_START; ++ if (atomic_inc_return(&sem->count) < 1) ++ /* there is a task queued */ ++ wake_up(&sem->wait); ++ TS_FIFO_UP_END; ++ preempt_enable(); ++} ++ ++/* not optimized like the Linux down() implementation, but then ++ * again we incur the cost of a syscall anyway, so this hardly matters ++ */ ++static fastcall void rt_fifo_down(struct semaphore * sem) ++{ ++ struct task_struct *tsk = current; ++ wait_queue_t wait = { ++ .private = tsk, ++ .func = rt_fifo_wake_up, ++ .task_list = {NULL, NULL} ++ }; ++ ++ preempt_disable(); ++ TS_FIFO_DOWN_START; ++ ++ spin_lock(&sem->wait.lock); ++ if (atomic_dec_return(&sem->count) < 0 || ++ waitqueue_active(&sem->wait)) { ++ /* we need to suspend */ ++ tsk->state = TASK_UNINTERRUPTIBLE; ++ add_wait_queue_exclusive_locked(&sem->wait, &wait); ++ ++ TRACE_CUR("suspends on lock %p\n", sem); ++ ++ /* release lock before sleeping */ ++ spin_unlock(&sem->wait.lock); ++ ++ TS_FIFO_DOWN_END; ++ preempt_enable_no_resched(); ++ ++ /* we depend on the FIFO order ++ * Thus, we don't need to recheck when we wake up, we ++ * are guaranteed to have the lock since there is only one ++ * wake up per release ++ */ ++ schedule(); ++ ++ TRACE_CUR("woke up, now owns lock %p\n", sem); ++ ++ /* try_to_wake_up() set our state to TASK_RUNNING, ++ * all we need to do is to remove our wait queue entry ++ */ ++ spin_lock(&sem->wait.lock); ++ remove_wait_queue_locked(&sem->wait, &wait); ++ spin_unlock(&sem->wait.lock); ++ } else { ++ TRACE_CUR("acquired lock %p, no contention\n", sem); ++ spin_unlock(&sem->wait.lock); ++ TS_FIFO_DOWN_END; ++ preempt_enable(); ++ } ++} ++ ++ ++ ++/* Initialize semaphores at boot time. */ ++static int __init sema_boot_init(void) ++{ ++ sema_id sem_id; ++ ++ printk("Initializing semaphores..."); ++ for (sem_id = 0; sem_id < MAX_SEMAPHORES; sem_id++) ++ sems[sem_id].used = 0; ++ printk(" done!\n"); ++ ++ return 0; ++} ++__initcall(sema_boot_init); ++ ++/* Find a free semaphore and return. */ ++asmlinkage long sys_sema_init (void) ++{ ++ sema_id sem_id; ++ ++ for (sem_id = 0; sem_id < MAX_SEMAPHORES; sem_id++) { ++ if (!cmpxchg(&sems[sem_id].used, 0, 1)) { ++ sema_init(&sems[sem_id], 1); ++ return sem_id; ++ } ++ } ++ return -ENOMEM; ++} ++ ++asmlinkage long sys_down(sema_id sem_id) ++{ ++ if (sem_id < 0 || sem_id >= MAX_SEMAPHORES) ++ return -EINVAL; ++ ++ if (!sems[sem_id].used) ++ return -EINVAL; ++ /* This allows for FIFO sems and gives others a chance... 
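/* A tiny single-threaded simulation of the counter protocol used by
 * rt_fifo_down()/rt_fifo_up() above: the count starts at 1, a down that drives
 * it below zero means the caller must block, and an up that leaves it below
 * one means exactly one queued task is woken. The wait queue itself is elided;
 * this only illustrates why one wake-up per release suffices for FIFO order.
 */
#include <stdio.h>

static int count = 1;   /* mirrors atomic_t sem->count */

static void down(const char *who)
{
	if (--count < 0)
		printf("%s blocks (count=%d)\n", who, count);
	else
		printf("%s acquires without contention (count=%d)\n", who, count);
}

static void up(const char *who)
{
	if (++count < 1)
		printf("%s releases, wakes exactly one waiter (count=%d)\n",
		       who, count);
	else
		printf("%s releases, nobody waiting (count=%d)\n", who, count);
}

int main(void)
{
	down("A");   /* A gets the lock        */
	down("B");   /* B queues behind A      */
	down("C");   /* C queues behind B      */
	up("A");     /* wakes B, the FIFO head */
	up("B");     /* wakes C                */
	up("C");     /* lock free again        */
	return 0;
}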
*/ ++ rt_fifo_down(sems + sem_id); ++ return 0; ++} ++ ++asmlinkage long sys_up(sema_id sem_id) ++{ ++ if (sem_id < 0 || sem_id >= MAX_SEMAPHORES) ++ return -EINVAL; ++ ++ if (!sems[sem_id].used) ++ return -EINVAL; ++ rt_fifo_up(sems + sem_id); ++ return 0; ++} ++ ++asmlinkage long sys_sema_free(sema_id sem_id) ++{ ++ struct list_head *tmp, *next; ++ unsigned long flags; ++ ++ if (sem_id < 0 || sem_id >= MAX_SEMAPHORES) ++ return -EINVAL; ++ ++ if (!sems[sem_id].used) ++ return -EINVAL; ++ ++ spin_lock_irqsave(&sems[sem_id].wait.lock, flags); ++ if (waitqueue_active(&sems[sem_id].wait)) { ++ list_for_each_safe(tmp, next, &sems[sem_id].wait.task_list) { ++ wait_queue_t *curr = list_entry(tmp, wait_queue_t, ++ task_list); ++ list_del(tmp); ++ set_rt_flags((struct task_struct*)curr->private, ++ RT_F_EXIT_SEM); ++ curr->func(curr, ++ TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, ++ 0, NULL); ++ } ++ } ++ ++ spin_unlock_irqrestore(&sems[sem_id].wait.lock, flags); ++ sems[sem_id].used = 0; ++ ++ return 0; ++} ++ ++ ++ ++ ++/* ************************************************************************** */ ++/* PRIORITY INHERITANCE */ ++/* ************************************************************************** */ ++ ++ ++ ++struct pi_semaphore pi_sems[MAX_PI_SEMAPHORES]; /* all PI sems */ ++typedef int pi_sema_id; /* Userspace ID of a pi_semaphore */ ++ ++struct wq_pair { ++ struct task_struct* tsk; ++ struct pi_semaphore* sem; ++}; ++ ++static int rt_pi_wake_up(wait_queue_t *wait, unsigned mode, int sync, ++ void *key) ++{ ++ struct wq_pair* wqp = (struct wq_pair*) wait->private; ++ set_rt_flags(wqp->tsk, RT_F_EXIT_SEM); ++ curr_sched_plugin->inherit_priority(wqp->sem, wqp->tsk); ++ TRACE_TASK(wqp->tsk, ++ "woken up by rt_pi_wake_up() (RT_F_SEM_EXIT, PI)\n"); ++ /* point to task for default_wake_function() */ ++ wait->private = wqp->tsk; ++ default_wake_function(wait, mode, sync, key); ++ ++ /* Always return true since we know that if we encountered a task ++ * that was already running the wake_up raced with the schedule in ++ * rt_pi_down(). In that case the task in rt_pi_down() will be scheduled ++ * immediately and own the lock. We must not wake up another task in ++ * any case. ++ */ ++ return 1; ++} ++ ++ ++/* caller is responsible for locking */ ++int edf_set_hp_task(struct pi_semaphore *sem) ++{ ++ struct list_head *tmp, *next; ++ struct task_struct *queued; ++ int ret = 0; ++ ++ sem->hp.task = NULL; ++ list_for_each_safe(tmp, next, &sem->wait.task_list) { ++ queued = ((struct wq_pair*) ++ list_entry(tmp, wait_queue_t, ++ task_list)->private)->tsk; ++ ++ /* Compare task prios, find high prio task. */ ++ if (edf_higher_prio(queued, sem->hp.task)) { ++ sem->hp.task = queued; ++ ret = 1; ++ } ++ } ++ return ret; ++} ++ ++ ++/* caller is responsible for locking */ ++int edf_set_hp_cpu_task(struct pi_semaphore *sem, int cpu) ++{ ++ struct list_head *tmp, *next; ++ struct task_struct *queued; ++ int ret = 0; ++ ++ sem->hp.cpu_task[cpu] = NULL; ++ list_for_each_safe(tmp, next, &sem->wait.task_list) { ++ queued = ((struct wq_pair*) ++ list_entry(tmp, wait_queue_t, ++ task_list)->private)->tsk; ++ ++ /* Compare task prios, find high prio task. */ ++ if (get_partition(queued) == cpu && ++ edf_higher_prio(queued, sem->hp.cpu_task[cpu])) { ++ sem->hp.cpu_task[cpu] = queued; ++ ret = 1; ++ } ++ } ++ return ret; ++} ++ ++ ++/* Initialize PI semaphores at boot time. 
*/ ++static int __init pi_sema_boot_init(void) ++{ ++ pi_sema_id sem_id; ++ ++ printk("Initializing PI semaphores..."); ++ for (sem_id = 0; sem_id < MAX_PI_SEMAPHORES; sem_id++) ++ pi_sems[sem_id].used = 0; ++ printk(" done!\n"); ++ ++ return 0; ++} ++__initcall(pi_sema_boot_init); ++ ++/* Find a free semaphore and return. */ ++asmlinkage long sys_pi_sema_init (void) ++{ ++ pi_sema_id sem_id; ++ int i = 0; ++ ++ for (sem_id = 0; sem_id < MAX_PI_SEMAPHORES; sem_id++) { ++ if (!cmpxchg(&pi_sems[sem_id].used, 0, 1)) { ++ atomic_set(&pi_sems[sem_id].count, 1); ++ pi_sems[sem_id].sleepers = 0; ++ init_waitqueue_head(&pi_sems[sem_id].wait); ++ pi_sems[sem_id].hp.task = NULL; ++ pi_sems[sem_id].holder = NULL; ++ for (i = 0; i < NR_CPUS; i++) ++ pi_sems[sem_id].hp.cpu_task[i] = NULL; ++ return sem_id; ++ } ++ } ++ return -ENOMEM; ++} ++ ++asmlinkage long sys_pi_down(pi_sema_id sem_id) ++{ ++ struct pi_semaphore * sem; ++ unsigned long flags; ++ struct task_struct *tsk = current; ++ struct wq_pair pair; ++ long ret = -EINVAL; ++ wait_queue_t wait = { ++ .private = &pair, ++ .func = rt_pi_wake_up, ++ .task_list = {NULL, NULL} ++ }; ++ ++ preempt_disable(); ++ TS_PI_DOWN_START; ++ ++ if (sem_id < 0 || sem_id >= MAX_PI_SEMAPHORES) ++ goto out; ++ ++ if (!pi_sems[sem_id].used) ++ goto out; ++ ++ sem = pi_sems + sem_id; ++ pair.tsk = tsk; ++ pair.sem = sem; ++ spin_lock_irqsave(&sem->wait.lock, flags); ++ ++ if (atomic_dec_return(&sem->count) < 0 || ++ waitqueue_active(&sem->wait)) { ++ /* we need to suspend */ ++ tsk->state = TASK_UNINTERRUPTIBLE; ++ add_wait_queue_exclusive_locked(&sem->wait, &wait); ++ ++ TRACE_CUR("suspends on PI lock %p\n", sem); ++ curr_sched_plugin->pi_block(sem, tsk); ++ ++ /* release lock before sleeping */ ++ spin_unlock_irqrestore(&sem->wait.lock, flags); ++ ++ TS_PI_DOWN_END; ++ preempt_enable_no_resched(); ++ ++ ++ /* we depend on the FIFO order ++ * Thus, we don't need to recheck when we wake up, we ++ * are guaranteed to have the lock since there is only one ++ * wake up per release ++ */ ++ schedule(); ++ ++ TRACE_CUR("woke up, now owns PI lock %p\n", sem); ++ ++ /* try_to_wake_up() set our state to TASK_RUNNING, ++ * all we need to do is to remove our wait queue entry ++ */ ++ remove_wait_queue(&sem->wait, &wait); ++ } else { ++ /* no priority inheritance necessary, since there are no queued ++ * tasks. ++ */ ++ TRACE_CUR("acquired PI lock %p, no contention\n", sem); ++ sem->holder = tsk; ++ sem->hp.task = tsk; ++ curr_sched_plugin->inherit_priority(sem, tsk); ++ spin_unlock_irqrestore(&sem->wait.lock, flags); ++ out: ++ TS_PI_DOWN_END; ++ preempt_enable(); ++ } ++ ret = 0; ++ return ret; ++} ++ ++asmlinkage long sys_pi_up(pi_sema_id sem_id) ++{ ++ unsigned long flags; ++ long ret = -EINVAL; ++ struct pi_semaphore * sem; ++ ++ preempt_disable(); ++ TS_PI_UP_START; ++ ++ if (sem_id < 0 || sem_id >= MAX_PI_SEMAPHORES) ++ goto out; ++ ++ if (!pi_sems[sem_id].used) ++ goto out; ++ ++ sem = pi_sems + sem_id; ++ spin_lock_irqsave(&sem->wait.lock, flags); ++ ++ TRACE_CUR("releases PI lock %p\n", sem); ++ curr_sched_plugin->return_priority(sem); ++ sem->holder = NULL; ++ if (atomic_inc_return(&sem->count) < 1) ++ /* there is a task queued */ ++ wake_up_locked(&sem->wait); ++ ++ spin_unlock_irqrestore(&sem->wait.lock, flags); ++ ++ ret = 0; ++ out: ++ TS_PI_UP_END; ++ preempt_enable(); ++ return ret; ++} ++ ++/* Clear wait queue and wakeup waiting tasks, and free semaphore. 
*/ ++asmlinkage long sys_pi_sema_free(pi_sema_id sem_id) ++{ ++ struct list_head *tmp, *next; ++ unsigned long flags; ++ ++ if (sem_id < 0 || sem_id >= MAX_PI_SEMAPHORES) ++ return -EINVAL; ++ ++ if (!pi_sems[sem_id].used) ++ return -EINVAL; ++ ++ spin_lock_irqsave(&pi_sems[sem_id].wait.lock, flags); ++ if (waitqueue_active(&pi_sems[sem_id].wait)) { ++ list_for_each_safe(tmp, next, ++ &pi_sems[sem_id].wait.task_list) { ++ wait_queue_t *curr = list_entry(tmp, wait_queue_t, ++ task_list); ++ list_del(tmp); ++ set_rt_flags((struct task_struct*)curr->private, ++ RT_F_EXIT_SEM); ++ curr->func(curr, ++ TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, ++ 0, NULL); ++ } ++ } ++ ++ spin_unlock_irqrestore(&pi_sems[sem_id].wait.lock, flags); ++ pi_sems[sem_id].used = 0; ++ ++ return 0; ++} ++ ++ ++ ++ ++/* ************************************************************************** */ ++/* STACK RESOURCE POLICY */ ++/* ************************************************************************** */ ++ ++ ++struct srp_priority { ++ struct list_head list; ++ unsigned int period; ++ pid_t pid; ++}; ++ ++#define list2prio(l) list_entry(l, struct srp_priority, list) ++ ++static int srp_higher_prio(struct srp_priority* first, ++ struct srp_priority* second) ++{ ++ if (!first->period) ++ return 0; ++ else ++ return !second->period || ++ first->period < second->period || ( ++ first->period == second->period && ++ first->pid < second->pid); ++} ++ ++struct srp { ++ struct list_head ceiling; ++ wait_queue_head_t ceiling_blocked; ++}; ++ ++#define system_ceiling(srp) list2prio(srp->ceiling.next) ++ ++static int srp_exceeds_ceiling(struct task_struct* first, ++ struct srp* srp) ++{ ++ return list_empty(&srp->ceiling) || ++ get_rt_period(first) < system_ceiling(srp)->period || ++ (get_rt_period(first) == system_ceiling(srp)->period && ++ first->pid < system_ceiling(srp)->pid); ++} ++ ++static void srp_add_prio(struct srp* srp, struct srp_priority* prio) ++{ ++ struct list_head *pos; ++ if (in_list(&prio->list)) { ++ TRACE_CUR("WARNING: SRP violation detected, prio is already in " ++ "ceiling list!\n"); ++ return; ++ } ++ list_for_each(pos, &srp->ceiling) ++ if (unlikely(srp_higher_prio(prio, list2prio(pos)))) { ++ __list_add(&prio->list, pos->prev, pos); ++ return; ++ } ++ ++ list_add_tail(&prio->list, &srp->ceiling); ++} ++ ++/* struct for uniprocessor SRP "semaphore" */ ++struct srp_semaphore { ++ struct srp_priority ceiling; ++ int cpu; /* cpu associated with this "semaphore" and resource */ ++ int claimed; /* is the resource claimed (ceiling should be used)? */ ++ int used; /* is the semaphore being used? */ ++}; ++ ++ ++struct srp_semaphore srp_sems[MAX_SRP_SEMAPHORES]; /* all SRP sems */ ++typedef int srp_sema_id; /* Userspace ID of a srp_semaphore */ ++ ++DEFINE_PER_CPU(struct srp, srp); ++ ++/* Initialize SRP semaphores at boot time. */ ++static int __init srp_sema_boot_init(void) ++{ ++ srp_sema_id sem_id; ++ int i; ++ ++ printk("Initializing SRP semaphores..."); ++ for (sem_id = 0; sem_id < MAX_SRP_SEMAPHORES; sem_id++) { ++ srp_sems[sem_id].used = 0; ++ srp_sems[sem_id].claimed = 0; ++ srp_sems[sem_id].cpu = -1; ++ INIT_LIST_HEAD(&srp_sems[sem_id].ceiling.list); ++ } ++ for (i = 0; i < NR_CPUS; i++) { ++ init_waitqueue_head(&per_cpu(srp, i).ceiling_blocked); ++ INIT_LIST_HEAD(&per_cpu(srp, i).ceiling); ++ } ++ printk(" done!\n"); ++ ++ return 0; ++} ++__initcall(srp_sema_boot_init); ++ ++/* Find a free semaphore and return. 
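/* A user-space restatement of the test implemented by srp_exceeds_ceiling()
 * above: under the Stack Resource Policy a job may only start executing once
 * its priority exceeds the current system ceiling, where priority is the task
 * period (shorter period = higher priority) with PID as tie-break. A zero
 * period stands in for "no ceiling" here, mirroring the empty ceiling list.
 * The sample values are made up; they just exercise the comparison.
 */
#include <stdio.h>

struct prio { unsigned int period; int pid; };  /* 0 period = no ceiling */

static int exceeds_ceiling(const struct prio *task, const struct prio *ceiling)
{
	return ceiling->period == 0 ||
	       task->period < ceiling->period ||
	       (task->period == ceiling->period && task->pid < ceiling->pid);
}

int main(void)
{
	struct prio ceiling   = { .period = 50, .pid = 10 };  /* held resource */
	struct prio short_job = { .period = 20, .pid = 99 };
	struct prio long_job  = { .period = 80, .pid = 3 };

	/* the short-period job may run, the long-period job stays blocked */
	printf("short job may start: %d\n", exceeds_ceiling(&short_job, &ceiling));
	printf("long job may start:  %d\n", exceeds_ceiling(&long_job, &ceiling));
	return 0;
}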
*/ ++asmlinkage long sys_srp_sema_init (void) ++{ ++ srp_sema_id sem_id; ++ ++ if (!is_realtime(current)) ++ return -EPERM; ++ ++ for (sem_id = 0; sem_id < MAX_SRP_SEMAPHORES; sem_id++) { ++ if (!cmpxchg(&srp_sems[sem_id].used, 0, 1)) { ++ srp_sems[sem_id].ceiling.period = 0; ++ srp_sems[sem_id].cpu = get_partition(current); ++ return sem_id; ++ } ++ } ++ return -ENOMEM; ++} ++ ++/* SRP task priority comparison function. Smaller periods have highest ++ * priority, tie-break is PID. ++ */ ++ ++/* Adjust the system-wide priority ceiling if resource is claimed. */ ++asmlinkage long sys_srp_down(srp_sema_id sem_id) ++{ ++ int cpu; ++ int ret = -EINVAL; ++ ++ /* disabling preemptions is sufficient protection since ++ * SRP is strictly per CPU and we don't interfere with any ++ * interrupt handlers ++ */ ++ preempt_disable(); ++ TS_SRP_DOWN_START; ++ ++ ++ cpu = smp_processor_id(); ++ ++ if (sem_id < 0 || sem_id >= MAX_SRP_SEMAPHORES || ++ srp_sems[sem_id].cpu != cpu) ++ goto out; ++ ++ if (!srp_sems[sem_id].used) ++ goto out; ++ ++ /* claim... */ ++ srp_sems[sem_id].claimed = 1; ++ /* ...and update ceiling */ ++ srp_add_prio(&__get_cpu_var(srp), &srp_sems[sem_id].ceiling); ++ ++ ret = 0; ++ out: ++ TS_SRP_DOWN_END; ++ preempt_enable(); ++ return ret; ++} ++ ++/* Adjust the system-wide priority ceiling if resource is freed. */ ++asmlinkage long sys_srp_up(srp_sema_id sem_id) ++{ ++ int cpu; ++ int ret = -EINVAL; ++ ++ preempt_disable(); ++ TS_SRP_UP_START; ++ ++ cpu = smp_processor_id(); ++ ++ if (sem_id < 0 || sem_id >= MAX_SRP_SEMAPHORES || ++ srp_sems[sem_id].cpu != cpu) ++ goto out; ++ ++ if (!srp_sems[sem_id].used) ++ goto out; ++ ++ srp_sems[sem_id].claimed = 0; ++ /* Determine new system priority ceiling for this CPU. */ ++ if (in_list(&srp_sems[sem_id].ceiling.list)) ++ list_del(&srp_sems[sem_id].ceiling.list); ++ else ++ TRACE_CUR("WARNING: SRP violation detected, prio not in ceiling" ++ " list!\n"); ++ ++ /* Wake tasks on this CPU, if they exceed current ceiling. */ ++ wake_up_all(&__get_cpu_var(srp).ceiling_blocked); ++ ret = 0; ++ out: ++ TS_SRP_UP_END; ++ preempt_enable(); ++ return ret; ++} ++ ++/* Indicate that task will use a resource associated with a given ++ * semaphore. Should be done *a priori* before RT task system is ++ * executed, so this does *not* update the system priority ++ * ceiling! (The ceiling would be meaningless anyway, as the SRP ++ * breaks without this a priori knowledge.) 
++ */ ++asmlinkage long sys_reg_task_srp_sem(srp_sema_id sem_id, pid_t t_pid) ++{ ++ struct pid *task_pid; ++ struct task_struct *t; ++ struct srp_priority t_prio; ++ ++ if (sem_id < 0 || sem_id >= MAX_SRP_SEMAPHORES) ++ return -EINVAL; ++ ++ task_pid = find_get_pid(t_pid); ++ if (!task_pid) ++ return -EINVAL; ++ ++ t = get_pid_task(task_pid, PIDTYPE_PID); ++ if (!t) ++ return -EINVAL; ++ ++ if (!is_realtime(t)) ++ return -EPERM; ++ ++ if (!srp_sems[sem_id].used) ++ return -EINVAL; ++ ++ if (srp_sems[sem_id].cpu != get_partition(t)) ++ return -EINVAL; ++ ++ preempt_disable(); ++ t->rt_param.subject_to_srp = 1; ++ t_prio.period = get_rt_period(t); ++ t_prio.pid = t->pid; ++ if (srp_higher_prio(&t_prio, &srp_sems[sem_id].ceiling)) { ++ srp_sems[sem_id].ceiling.period = t_prio.period; ++ srp_sems[sem_id].ceiling.pid = t_prio.pid; ++ } ++ ++ preempt_enable(); ++ ++ return 0; ++} ++ ++static int srp_wake_up(wait_queue_t *wait, unsigned mode, int sync, ++ void *key) ++{ ++ int cpu = smp_processor_id(); ++ struct task_struct *tsk = wait->private; ++ if (cpu != get_partition(tsk)) ++ TRACE_TASK(tsk, "srp_wake_up on wrong cpu, partition is %d\b", ++ get_partition(tsk)); ++ else if (srp_exceeds_ceiling(tsk, &__get_cpu_var(srp))) ++ return default_wake_function(wait, mode, sync, key); ++ return 0; ++} ++ ++ ++/* Wait for current task priority to exceed system-wide priority ceiling. ++ * Can be used to determine when it is safe to run a job after its release. ++ */ ++void srp_ceiling_block(void) ++{ ++ struct task_struct *tsk = current; ++ wait_queue_t wait = { ++ .private = tsk, ++ .func = srp_wake_up, ++ .task_list = {NULL, NULL} ++ }; ++ ++ preempt_disable(); ++ if (!srp_exceeds_ceiling(tsk, &__get_cpu_var(srp))) { ++ tsk->state = TASK_UNINTERRUPTIBLE; ++ add_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait); ++ TRACE_CUR("is priority ceiling blocked.\n"); ++ preempt_enable_no_resched(); ++ schedule(); ++ /* Access to CPU var must occur with preemptions disabled, otherwise ++ * Linux debug code complains loudly, even if it is ok here. ++ */ ++ preempt_disable(); ++ TRACE_CUR("finally exceeds system ceiling.\n"); ++ remove_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait); ++ preempt_enable(); ++ } else { ++ TRACE_CUR("is not priority ceiling blocked\n"); ++ preempt_enable(); ++ } ++} ++ ++/* Free semaphore, adjusting the system-wide priority ceiling if necessary. */ ++asmlinkage long sys_srp_sema_free(srp_sema_id sem_id) ++{ ++ int cpu; ++ int ret = 0; ++ ++ preempt_disable(); ++ cpu = smp_processor_id(); ++ ++ if (sem_id < 0 || sem_id >= MAX_SRP_SEMAPHORES || ++ srp_sems[sem_id].cpu != cpu) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ srp_sems[sem_id].claimed = 0; ++ srp_sems[sem_id].used = 0; ++ ++out: ++ preempt_enable(); ++ return ret; ++} ++ ++ ++ ++/* ************************************************************************** */ ++ ++ ++ +diff --git a/kernel/pfair_common.c b/kernel/pfair_common.c +new file mode 100644 +index 0000000..c50fdab +--- /dev/null ++++ b/kernel/pfair_common.c +@@ -0,0 +1,237 @@ ++/* ++ * Common functions for PFAIR based scheduler. 
++ */ ++ ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++#include ++#include ++/* Comparison of two tasks whether ++ * the lhs has higher priority than the rhs */ ++int is_pfair_hp(struct task_struct *lhs, struct task_struct *rhs) ++{ ++ /* Favor subtasks with earlier deadlines */ ++ if(time_before(get_deadline(lhs), get_deadline(rhs))) ++ return 1; ++ if(get_deadline(lhs) == get_deadline(rhs)) { ++ /* If deadlines are equal, ++ * favor non-zero b-bit (a heavy task) */ ++ if(lhs->rt_param.times.b_bit > rhs->rt_param.times.b_bit) ++ return 1; ++ ++ if(lhs->rt_param.times.b_bit == rhs->rt_param.times.b_bit && ++ lhs->rt_param.times.b_bit == 1) ++ /* If b-bit is 1, favor tasks with later ++ * group deadline */ ++ return time_after(lhs->rt_param.times.group_deadline, ++ rhs->rt_param.times.group_deadline); ++ ++ } ++ return 0; ++} ++ ++void pfair_domain_init(pfair_domain_t *pfair) ++{ ++ BUG_ON(!pfair); ++ INIT_LIST_HEAD(&pfair->ready_queue); ++ INIT_LIST_HEAD(&pfair->release_queue); ++ queue_lock_init(&pfair->pfair_lock); ++ cpus_setall(pfair->domain_cpus); ++ /* Use cpu 0 to keep the system alive ++ * TODO: Remove later or make it configurable ++ * */ ++ cpu_clear(0, pfair->domain_cpus); ++} ++ ++ ++/* add_ready - add a real-time task to the PFAIR ready queue. ++ * It must be runnable. Global domain lock must be held before ++ * calling this function. ++ * ++ * @new: the newly released task ++ */ ++void pfair_add_ready(pfair_domain_t* pfair, struct task_struct *new) ++{ ++ struct list_head *pos; ++ struct task_struct *queued; ++ ++ BUG_ON(!new); ++ /* find a spot where our deadline is earlier than the next */ ++ list_for_each(pos, &pfair->ready_queue) { ++ queued = list_entry(pos, struct task_struct, rt_list); ++ if (unlikely(is_pfair_hp(new, queued))) { ++ /* the task at pos has a later deadline */ ++ /* insert the new task in front of it */ ++ __list_add(&new->rt_list, pos->prev, pos); ++ return; ++ } ++ } ++ /* if we get to this point either the list is empty or new has the ++ * lowest priority. Let's add it to the end. */ ++ list_add_tail(&new->rt_list, &pfair->ready_queue); ++} ++/** ++ * Extraction function. ++ */ ++struct task_struct* __pfair_take_ready(pfair_domain_t* pfair) ++{ ++ struct task_struct *t = NULL; ++ /* either not yet released, preempted, or non-rt */ ++ if (!list_empty(&pfair->ready_queue)) { ++ ++ /* take next rt task */ ++ t = list_entry(pfair->ready_queue.next, struct task_struct, ++ rt_list); ++ ++ /* kick it out of the ready list */ ++ list_del(&t->rt_list); ++ } ++ return t; ++} ++ ++ ++/* add_release - add a real-time task to the PFAIR release queue. ++ * Domain lock must be acquired before the function is called. ++ * ++ * @task: the sleeping task ++ */ ++void pfair_add_release(pfair_domain_t* pfair, struct task_struct *task) ++{ ++ struct list_head *pos; ++ struct task_struct *queued; ++ ++ BUG_ON(!task); ++ /* find a spot where our deadline is earlier than the next */ ++ list_for_each_prev(pos, &pfair->release_queue) { ++ queued = list_entry(pos, struct task_struct, rt_list); ++ if ((unlikely(time_before(queued->rt_param.times.release, ++ task->rt_param.times.release)))) { ++ /* the task at pos has an earlier release */ ++ /* insert the new task in behind it */ ++ __list_add(&task->rt_list, pos, pos->next); ++ return; ++ } ++ } ++ /* if we get to this point either the list is empty or task has the ++ * earliest release. Let's add it to the front. 
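/* A self-contained restatement of the subtask ordering used by is_pfair_hp()
 * above: earlier subtask deadline first, then a set b-bit (heavy task) beats a
 * clear one, and if both b-bits are set the later group deadline wins. The
 * struct and sample values are hypothetical and ignore jiffy wrap-around.
 */
#include <stdio.h>

struct subtask {
	unsigned long deadline;
	int b_bit;
	unsigned long group_deadline;
};

static int pfair_hp(const struct subtask *a, const struct subtask *b)
{
	if (a->deadline != b->deadline)
		return a->deadline < b->deadline;
	if (a->b_bit != b->b_bit)
		return a->b_bit > b->b_bit;
	if (a->b_bit && b->b_bit)
		return a->group_deadline > b->group_deadline;
	return 0;
}

int main(void)
{
	struct subtask x = { .deadline = 10, .b_bit = 1, .group_deadline = 14 };
	struct subtask y = { .deadline = 10, .b_bit = 1, .group_deadline = 12 };

	/* equal deadlines and b-bits, so x wins on the later group deadline */
	printf("x over y: %d, y over x: %d\n", pfair_hp(&x, &y), pfair_hp(&y, &x));
	return 0;
}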
*/ ++ list_add(&task->rt_list, &pfair->release_queue); ++} ++/** ++ * This function is called from tick handler, it acquires the lock ++ * automatically. Only one processor effectively merges the queues. ++ */ ++void pfair_try_release_pending(pfair_domain_t* pfair) ++{ ++ unsigned long flags; ++ struct list_head *pos, *save; ++ struct task_struct *queued; ++ queue_lock_irqsave(&pfair->pfair_lock, flags); ++ ++ list_for_each_safe(pos, save, &pfair->release_queue) { ++ queued = list_entry(pos, struct task_struct, rt_list); ++ if (likely(time_before_eq( ++ queued->rt_param.times.release, jiffies))) { ++ /* this one is ready to go*/ ++ list_del(pos); ++ set_rt_flags(queued, RT_F_RUNNING); ++ ++ sched_trace_job_release(queued); ++ /* now it can be picked up */ ++ barrier(); ++ pfair_add_ready(pfair, queued); ++ } ++ else ++ /* the release queue is ordered */ ++ break; ++ } ++ queue_unlock_irqrestore(&pfair->pfair_lock, flags); ++} ++/* ++ * Subtask preparation. Assuming that last_release ++ * denotes the time when the job was released. ++ */ ++void pfair_prepare_next_subtask(struct task_struct *t) ++{ ++ BUG_ON(!t); ++ /* assign subtask release time, deadline, b-bit, ++ * and group deadline ++ */ ++ t->rt_param.times.release = t->rt_param.times.last_release ++ +release_time(t); ++ t->rt_param.times.deadline = t->rt_param.times.last_release ++ +pfair_deadline(t); ++ t->rt_param.times.b_bit = b_bit(t); ++ t->rt_param.times.group_deadline = t->rt_param.times.last_release ++ +group_deadline(t); ++} ++ ++void pfair_prepare_next_job(struct task_struct *t) ++{ ++ BUG_ON(!t); ++ ++ /* prepare next job release */ ++ /* make passed quantums zero so that we could compute new release times ++ * and deadlines for subtasks correctly ++ */ ++ t->rt_param.times.exec_time = 0; ++ /* assign job-wide release time, ++ * this is the starting point to ++ * compute subtask releases, deadlines and group deadlines ++ */ ++ t->rt_param.times.last_release = t->rt_param.times.last_release ++ +get_rt_period(t); ++ /* Release the first subtask. */ ++ pfair_prepare_next_subtask(t); ++ t->first_time_slice = 0; ++ /* Increase job sequence number */ ++ t->rt_param.times.job_no++; ++} ++ ++void __pfair_prepare_new_release(struct task_struct *t, jiffie_t start) ++{ ++ t->rt_param.times.release = start; ++ t->rt_param.times.last_release = start; ++ t->rt_param.times.exec_time = 0; ++ t->first_time_slice = 0; ++ pfair_prepare_next_subtask(t); ++ set_rt_flags(t, RT_F_RUNNING); ++} ++ ++void pfair_prepare_new_releases(pfair_domain_t *pfair, jiffie_t start) ++{ ++ unsigned long flags; ++ struct list_head tmp_list; ++ struct list_head *pos, *n; ++ struct task_struct *t; ++ ++ INIT_LIST_HEAD(&tmp_list); ++ ++ queue_lock_irqsave(&pfair->pfair_lock, flags); ++ ++ ++ while (!list_empty(&pfair->release_queue)) { ++ pos = pfair->release_queue.next; ++ list_del(pos); ++ list_add(pos, &tmp_list); ++ } ++ while (!list_empty(&pfair->ready_queue)) { ++ pos = pfair->ready_queue.next; ++ list_del(pos); ++ list_add(pos, &tmp_list); ++ } ++ ++ list_for_each_safe(pos, n, &tmp_list) { ++ t = list_entry(pos, struct task_struct, rt_list); ++ list_del(pos); ++ __pfair_prepare_new_release(t, start); ++ pfair_add_release(pfair, t); ++ } ++ queue_unlock_irqrestore(&pfair->pfair_lock, flags); ++} ++ +diff --git a/kernel/rt_domain.c b/kernel/rt_domain.c +new file mode 100644 +index 0000000..4875c53 +--- /dev/null ++++ b/kernel/rt_domain.c +@@ -0,0 +1,185 @@ ++/* ++ * kernel/rt_domain.c ++ * ++ * LITMUS real-time infrastructure. 
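/*
 * Illustrative sketch (not taken from the patch): the textbook Pfair
 * windows that the helpers used by pfair_prepare_next_subtask() above
 * (release_time(), pfair_deadline(), b_bit()) presumably compute.  For a
 * task with execution cost e and period p (weight w = e/p), subtask i
 * (counting from 1) has
 *   release  r(i) = floor((i-1)/w) = floor((i-1)*p / e)
 *   deadline d(i) = ceil(i/w)      = ceil(i*p / e)
 *   b-bit    b(i) = 1 iff i*p is not a multiple of e
 * The group deadline, which matters only for heavy tasks, is omitted.
 */
#include <stdio.h>

static unsigned long sub_release(unsigned long i, unsigned long e,
                                 unsigned long p)
{
        return (i - 1) * p / e;                 /* floor */
}

static unsigned long sub_deadline(unsigned long i, unsigned long e,
                                  unsigned long p)
{
        return (i * p + e - 1) / e;             /* ceil */
}

static int sub_b_bit(unsigned long i, unsigned long e, unsigned long p)
{
        return (i * p) % e != 0;
}

int main(void)
{
        unsigned long e = 3, p = 10, i;         /* weight 3/10 */

        for (i = 1; i <= e; i++)
                printf("subtask %lu: window [%lu, %lu), b=%d\n", i,
                       sub_release(i, e, p), sub_deadline(i, e, p),
                       sub_b_bit(i, e, p));
        return 0;
}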
This file contains the ++ * functions that manipulate RT domains. RT domains are an abstraction ++ * of a ready queue and a release queue. ++ */ ++ ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++#include ++ ++ ++static int dummy_resched(rt_domain_t *rt) ++{ ++ return 0; ++} ++ ++static int dummy_order(struct list_head* a, struct list_head* b) ++{ ++ return 0; ++} ++ ++int release_order(struct list_head* a, struct list_head* b) ++{ ++ return earlier_release( ++ list_entry(a, struct task_struct, rt_list), ++ list_entry(b, struct task_struct, rt_list)); ++} ++ ++ ++void rt_domain_init(rt_domain_t *rt, ++ check_resched_needed_t f, ++ list_cmp_t order) ++{ ++ BUG_ON(!rt); ++ if (!f) ++ f = dummy_resched; ++ if (!order) ++ order = dummy_order; ++ INIT_LIST_HEAD(&rt->ready_queue); ++ INIT_LIST_HEAD(&rt->release_queue); ++ rt->ready_lock = RW_LOCK_UNLOCKED; ++ rt->release_lock = SPIN_LOCK_UNLOCKED; ++ rt->check_resched = f; ++ rt->order = order; ++} ++ ++/* add_ready - add a real-time task to the rt ready queue. It must be runnable. ++ * @new: the newly released task ++ */ ++void __add_ready(rt_domain_t* rt, struct task_struct *new) ++{ ++ TRACE("rt: adding %s/%d (%u, %u) to ready queue\n", ++ new->comm, new->pid, get_exec_cost(new), get_rt_period(new)); ++ ++ if (!list_insert(&new->rt_list, &rt->ready_queue, rt->order)) ++ rt->check_resched(rt); ++} ++ ++struct task_struct* __take_ready(rt_domain_t* rt) ++{ ++ struct task_struct *t = __peek_ready(rt); ++ ++ /* kick it out of the ready list */ ++ if (t) ++ list_del(&t->rt_list); ++ return t; ++} ++ ++struct task_struct* __peek_ready(rt_domain_t* rt) ++{ ++ if (!list_empty(&rt->ready_queue)) ++ return next_ready(rt); ++ else ++ return NULL; ++} ++ ++struct task_struct* __take_ready_rq(rt_domain_t* rt, runqueue_t* rq, int cpu) ++{ ++ struct task_struct *task = __take_ready(rt); ++ ++ if (task) { ++ set_task_cpu(task, cpu); ++ __activate_task(task, rq); ++ } ++ return task; ++} ++ ++/* add_release - add a real-time task to the rt release queue. 
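/*
 * Illustrative sketch (not taken from the patch): __add_ready() and
 * __add_release() rely on a list_insert() helper that is not shown in
 * this hunk.  The comparator convention, as with release_order() above,
 * is "return non-zero if the first element belongs in front of the
 * second".  The kernel helper evidently also reports where the element
 * landed (cf. the !list_insert() test in __add_ready()); that detail is
 * dropped in this minimal user-space version over a singly linked list.
 */
#include <stdio.h>

struct node {
        unsigned long deadline;
        struct node *next;
};

typedef int (*cmp_t)(struct node *a, struct node *b);

static int edf_order(struct node *a, struct node *b)
{
        return a->deadline < b->deadline;       /* earlier deadline first */
}

/* Insert n in front of the first queued element it "beats". */
static void list_insert(struct node **head, struct node *n, cmp_t order)
{
        while (*head && !order(n, *head))
                head = &(*head)->next;
        n->next = *head;
        *head = n;
}

int main(void)
{
        struct node a = { 30 }, b = { 10 }, c = { 20 };
        struct node *head = NULL, *p;

        list_insert(&head, &a, edf_order);
        list_insert(&head, &b, edf_order);
        list_insert(&head, &c, edf_order);
        for (p = head; p; p = p->next)
                printf("%lu ", p->deadline);    /* prints: 10 20 30 */
        printf("\n");
        return 0;
}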
++ * @task: the sleeping task ++ */ ++void __add_release(rt_domain_t* rt, struct task_struct *task) ++{ ++ TRACE("rt: adding %s/%d (%u, %u) rel=%d to release queue\n", ++ task->comm, task->pid, get_exec_cost(task), get_rt_period(task), ++ get_release(task)); ++ ++ list_insert(&task->rt_list, &rt->release_queue, release_order); ++} ++ ++void __release_pending(rt_domain_t* rt) ++{ ++ struct list_head *pos, *save; ++ struct task_struct *queued; ++ list_for_each_safe(pos, save, &rt->release_queue) { ++ queued = list_entry(pos, struct task_struct, rt_list); ++ if (likely(is_released(queued))) { ++ /* this one is ready to go*/ ++ list_del(pos); ++ set_rt_flags(queued, RT_F_RUNNING); ++ ++ sched_trace_job_release(queued); ++ ++ /* now it can be picked up */ ++ barrier(); ++ add_ready(rt, queued); ++ } ++ else ++ /* the release queue is ordered */ ++ break; ++ } ++} ++ ++void try_release_pending(rt_domain_t* rt) ++{ ++ unsigned long flags; ++ ++ if (spin_trylock_irqsave(&rt->release_lock, flags)) { ++ __release_pending(rt); ++ spin_unlock_irqrestore(&rt->release_lock, flags); ++ } ++} ++ ++void rerelease_all(rt_domain_t *rt, ++ release_at_t release) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&rt->release_lock, flags); ++ write_lock(&rt->ready_lock); ++ ++ __rerelease_all(rt, release); ++ ++ write_unlock(&rt->ready_lock); ++ spin_unlock_irqrestore(&rt->release_lock, flags); ++} ++ ++void __rerelease_all(rt_domain_t *rt, ++ release_at_t release) ++{ ++ jiffie_t start = jiffies + 10; ++ struct list_head tmp_list; ++ struct list_head *pos, *n; ++ struct task_struct *t; ++ ++ INIT_LIST_HEAD(&tmp_list); ++ ++ while (!list_empty(&rt->release_queue)) { ++ pos = rt->release_queue.next; ++ list_del(pos); ++ list_add(pos, &tmp_list); ++ } ++ while (!list_empty(&rt->ready_queue)) { ++ pos = rt->ready_queue.next; ++ list_del(pos); ++ list_add(pos, &tmp_list); ++ } ++ ++ list_for_each_safe(pos, n, &tmp_list) { ++ t = list_entry(pos, struct task_struct, rt_list); ++ list_del(pos); ++ release(t, start); ++ __add_release(rt, t); ++ } ++ ++} ++ ++ +diff --git a/kernel/sched.c b/kernel/sched.c +index cca93cc..5ad4276 100644 +--- a/kernel/sched.c ++++ b/kernel/sched.c +@@ -56,6 +56,16 @@ + + #include + ++#include ++#define __SCHED_C__ ++#include ++#include ++#include ++#include ++ ++/* LITMUS: avoid races with multiple task wake-ups */ ++DEFINE_SPINLOCK(litmus_task_set_lock); ++ + /* + * Convert user-nice values [ -20 ... 0 ... 19 ] + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], +@@ -836,7 +846,7 @@ static int effective_prio(struct task_struct *p) + * keep the priority unchanged. Otherwise, update priority + * to the normal priority: + */ +- if (!rt_prio(p->prio)) ++ if (!rt_prio(p->prio) && !is_realtime(p)) + return p->normal_prio; + return p->prio; + } +@@ -844,7 +854,7 @@ static int effective_prio(struct task_struct *p) + /* + * __activate_task - move a task to the runqueue. + */ +-static void __activate_task(struct task_struct *p, struct rq *rq) ++void __activate_task(struct task_struct *p, struct rq *rq) + { + struct prio_array *target = rq->active; + +@@ -999,7 +1009,7 @@ out: + /* + * deactivate_task - remove a task from the runqueue. 
+ */ +-static void deactivate_task(struct task_struct *p, struct rq *rq) ++void deactivate_task(struct task_struct *p, struct rq *rq) + { + dec_nr_running(p, rq); + dequeue_task(p, p->array); +@@ -1408,13 +1418,44 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) + #endif + + rq = task_rq_lock(p, &flags); ++ ++ if (is_realtime(p)) ++ TRACE("try_to_wake_up(%s/%d)\n", p->comm, p->pid); ++ + old_state = p->state; + if (!(old_state & state)) +- goto out; ++ goto out; + + if (p->array) + goto out_running; + ++ ++ spin_lock(&litmus_task_set_lock); ++ if (p->rt_param.litmus_controlled) { ++ /* Already included. This can happen ++ * if the task dropped all locks to call ++ * schedule() but a wake up raced and came in ++ * early. ++ */ ++ ++ spin_unlock(&litmus_task_set_lock); ++ goto out_running; ++ } ++ ++ sched_trace_task_arrival(p); ++ if (is_realtime(p)) { ++ p->rt_param.litmus_controlled = 1; ++ curr_sched_plugin->wake_up_task(p); ++ ++ spin_unlock(&litmus_task_set_lock); ++ goto out_running; ++ } ++ ++ p->rt_param.litmus_controlled = 0; ++ spin_unlock(&litmus_task_set_lock); ++ ++ ++ + cpu = task_cpu(p); + this_cpu = smp_processor_id(); + +@@ -1580,6 +1621,7 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags) + cpu = sched_balance_self(cpu, SD_BALANCE_FORK); + #endif + set_task_cpu(p, cpu); ++ clear_rt_params(p); + + /* + * We mark the process as running here, but have not actually +@@ -1595,6 +1637,10 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags) + p->prio = current->normal_prio; + + INIT_LIST_HEAD(&p->run_list); ++ INIT_LIST_HEAD(&p->rt_list); ++ p->rt_param.basic_params.class = RT_CLASS_BEST_EFFORT; ++ p->rt_param.litmus_controlled = 0; ++ p->rt_param.inh_task = NULL; + p->array = NULL; + #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) + if (unlikely(sched_info_on())) +@@ -1647,6 +1693,12 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) + unsigned long flags; + int this_cpu, cpu; + ++ if (clone_flags & CLONE_REALTIME) { ++ /* just mark the task as stopped */ ++ p->state = TASK_STOPPED; ++ return; ++ } ++ + rq = task_rq_lock(p, &flags); + BUG_ON(p->state != TASK_RUNNING); + this_cpu = smp_processor_id(); +@@ -1730,6 +1782,9 @@ void fastcall sched_exit(struct task_struct *p) + unsigned long flags; + struct rq *rq; + ++ if (is_realtime(p)) ++ return; ++ + /* + * If the child was a (relative-) CPU hog then decrease + * the sleep_avg of the parent as well. +@@ -1801,6 +1856,13 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) + */ + prev_state = prev->state; + finish_arch_switch(prev); ++ /* Requeue previous real-time task before we drop the rq lock, cause ++ * that may lead to a preemption. 
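/*
 * Illustrative sketch (not taken from the patch): the litmus_controlled
 * flag checked under litmus_task_set_lock in try_to_wake_up() above
 * makes "hand the task to the plugin" idempotent when a wake-up races
 * with schedule().  A user-space analogue of that claim/release pattern,
 * with a C11 atomic standing in for the spinlock-protected field.
 */
#include <stdatomic.h>
#include <stdio.h>

struct task {
        const char *name;
        atomic_int litmus_controlled;   /* 0 = not admitted, 1 = admitted */
};

/* Wake-up path: returns 1 only for the caller that actually admitted
 * the task (the real code then calls the plugin's wake_up_task()). */
static int try_admit(struct task *t)
{
        if (atomic_exchange(&t->litmus_controlled, 1))
                return 0;               /* racing wake-up came in early */
        return 1;
}

/* Blocking path in schedule(): give up control again. */
static void release_task(struct task *t)
{
        atomic_store(&t->litmus_controlled, 0);
}

int main(void)
{
        struct task t = { "T1", 0 };

        printf("first wake-up admits:  %d\n", try_admit(&t));  /* 1 */
        printf("racing wake-up admits: %d\n", try_admit(&t));  /* 0 */
        release_task(&t);
        printf("after blocking:        %d\n", try_admit(&t));  /* 1 */
        return 0;
}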
++ */ ++ curr_sched_plugin->finish_switch(prev); ++ sched_trace_task_scheduled(current); ++ /* trace before IRQs are enabled */ ++ TS_CXS_END; + finish_lock_switch(rq, prev); + if (mm) + mmdrop(mm); +@@ -1811,7 +1873,7 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) + */ + kprobe_flush_task(prev); + put_task_struct(prev); +- } ++ } + } + + /** +@@ -2990,7 +3052,7 @@ static inline void idle_balance(int cpu, struct rq *rq) + static inline void wake_priority_sleeper(struct rq *rq) + { + #ifdef CONFIG_SCHED_SMT +- if (!rq->nr_running) ++ if (!rq->nr_running || get_rt_mode() == MODE_RT_RUN) + return; + + spin_lock(&rq->lock); +@@ -3220,14 +3282,30 @@ void scheduler_tick(void) + + update_cpu_clock(p, rq, now); + +- if (p == rq->idle) +- /* Task on the idle queue */ +- wake_priority_sleeper(rq); +- else +- task_running_tick(rq, p); ++ /* check whether the RT scheduler plugin requires a call to ++ * schedule ++ */ ++ TS_PLUGIN_TICK_START; ++ if (rt_scheduler_tick() == FORCE_RESCHED) ++ set_tsk_need_resched(p); ++ TS_PLUGIN_TICK_END; ++ ++ /* real-time accounting is done by the plugin ++ * call linux functions only for background tasks ++ */ ++ if (!is_realtime(p)) { ++ if (p == rq->idle) ++ /* Task on the idle queue */ ++ wake_priority_sleeper(rq); ++ else ++ task_running_tick(rq, p); ++ } ++ send_scheduler_signals(); ++ + #ifdef CONFIG_SMP + update_load(rq); +- if (time_after_eq(jiffies, rq->next_balance)) ++ if (time_after_eq(jiffies, rq->next_balance) && ++ get_rt_mode() == MODE_NON_RT) + raise_softirq(SCHED_SOFTIRQ); + #endif + } +@@ -3420,6 +3498,7 @@ asmlinkage void __sched schedule(void) + long *switch_count; + struct rq *rq; + ++ + /* + * Test if we are atomic. Since do_exit() needs to call into + * schedule() atomically, we ignore that path for now. +@@ -3427,8 +3506,9 @@ asmlinkage void __sched schedule(void) + */ + if (unlikely(in_atomic() && !current->exit_state)) { + printk(KERN_ERR "BUG: scheduling while atomic: " +- "%s/0x%08x/%d\n", +- current->comm, preempt_count(), current->pid); ++ "%s/0x%08x/%d %s\n", ++ current->comm, preempt_count(), current->pid, ++ is_realtime(current) ? "rt" : "non-rt"); + debug_show_held_locks(current); + if (irqs_disabled()) + print_irqtrace_events(current); +@@ -3438,6 +3518,7 @@ asmlinkage void __sched schedule(void) + + need_resched: + preempt_disable(); ++ TS_SCHED_START; + prev = current; + release_kernel_lock(prev); + need_resched_nonpreemptible: +@@ -3470,6 +3551,7 @@ need_resched_nonpreemptible: + spin_lock_irq(&rq->lock); + + switch_count = &prev->nivcsw; ++ /* check for blocking tasks */ + if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { + switch_count = &prev->nvcsw; + if (unlikely((prev->state & TASK_INTERRUPTIBLE) && +@@ -3478,13 +3560,66 @@ need_resched_nonpreemptible: + else { + if (prev->state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible++; ++ /* we need to remove real-time tasks from the runqueue*/ ++ ++ /* protect against races with signal delivery and IO ++ * interrupts on other CPUs ++ * ++ * FIXME: This is probably not sufficient, ++ * as (in theory) after ++ * unlocking the task_set_lock this task could ++ * be scheduled elsewere before we switched away ++ * from it. This has not been observed ++ * yet. To get this locking right is tricky. ++ */ ++ spin_lock(&litmus_task_set_lock); ++ if (prev->rt_param.litmus_controlled) ++ prev->rt_param.litmus_controlled = 0; ++ spin_unlock(&litmus_task_set_lock); ++ ++ if (is_realtime(prev)) { ++ TRACE("schedule: %s/%d blocks. 
state = %d\n", ++ prev->comm, prev->pid, prev->state); ++ curr_sched_plugin->task_blocks(prev); ++ /* Enable this for all tasks to get _a lot_ of ++ * data. Can be helpful for debugging. ++ */ ++ sched_trace_task_departure(prev); ++ } ++ ++ /* only indirect switching is supported in the current ++ * version of LITMUS ++ */ + deactivate_task(prev, rq); + } + } + ++ next = NULL; ++ ++ /* consult the real-time plugin */ ++ TS_PLUGIN_SCHED_START; ++ curr_sched_plugin->schedule(prev, &next, rq); ++ TS_PLUGIN_SCHED_END; ++ /* If the real-time plugin wants to switch to a specific task ++ * it'll be on the rq and have the highest priority. There will ++ * be exaclty one such task, thus the selection of the next task ++ * is unambiguous and the following code can only get ++ * triggered if there are no RT tasks pending (on this CPU). Thus, ++ * we may as well skip it. ++ */ ++ if (next) ++ goto switch_tasks; ++ + cpu = smp_processor_id(); + if (unlikely(!rq->nr_running)) { +- idle_balance(cpu, rq); ++ /* only load-balance if we are not in RT mode ++ * ++ * TODO: Maybe this can be relaxed by modifiying the ++ * load-balancing routines in such a way that they never touch ++ * real-time tasks. ++ */ ++ if (get_rt_mode() == MODE_NON_RT) ++ idle_balance(cpu, rq); + if (!rq->nr_running) { + next = rq->idle; + rq->expired_timestamp = 0; +@@ -3528,7 +3663,7 @@ need_resched_nonpreemptible: + } + } + next->sleep_type = SLEEP_NORMAL; +- if (dependent_sleeper(cpu, rq, next)) ++ if (get_rt_mode() == MODE_NON_RT && dependent_sleeper(cpu, rq, next)) + next = rq->idle; + switch_tasks: + if (next == rq->idle) +@@ -3546,7 +3681,11 @@ switch_tasks: + prev->timestamp = prev->last_ran = now; + + sched_info_switch(prev, next); ++ TS_SCHED_END; + if (likely(prev != next)) { ++ TS_CXS_START; ++ if (is_running(prev)) ++ sched_trace_task_preemption(prev, next); + next->timestamp = now; + rq->nr_switches++; + rq->curr = next; +@@ -3560,9 +3699,12 @@ switch_tasks: + * CPUs since it called schedule(), thus the 'rq' on its stack + * frame will be invalid. + */ +- finish_task_switch(this_rq(), prev); +- } else ++ finish_task_switch(this_rq(), prev); ++ } else { + spin_unlock_irq(&rq->lock); ++ } ++ ++ send_scheduler_signals(); + + prev = current; + if (unlikely(reacquire_kernel_lock(prev) < 0)) +@@ -3691,6 +3833,7 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, + } + } + ++ + /** + * __wake_up - wake up threads blocked on a waitqueue. + * @q: the waitqueue +@@ -3709,6 +3852,7 @@ void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, + } + EXPORT_SYMBOL(__wake_up); + ++ + /* + * Same as __wake_up but called with the spinlock in wait_queue_head_t held. + */ +@@ -3717,6 +3861,7 @@ void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) + __wake_up_common(q, mode, 1, 0, NULL); + } + ++ + /** + * __wake_up_sync - wake up threads blocked on a waitqueue. + * @q: the waitqueue +@@ -4175,7 +4320,7 @@ static inline struct task_struct *find_process_by_pid(pid_t pid) + } + + /* Actually do priority change: must hold rq lock. 
*/ +-static void __setscheduler(struct task_struct *p, int policy, int prio) ++void __setscheduler(struct task_struct *p, int policy, int prio) + { + BUG_ON(p->array); + +@@ -6877,7 +7022,7 @@ void __init sched_init_smp(void) + BUG(); + } + #else +-void __init sched_init_smp(void) ++void __init linux_sched_init_smp(void) + { + } + #endif /* CONFIG_SMP */ +diff --git a/kernel/sched_adaptive.c b/kernel/sched_adaptive.c +new file mode 100644 +index 0000000..319ebbc +--- /dev/null ++++ b/kernel/sched_adaptive.c +@@ -0,0 +1,1454 @@ ++ ++ ++/* ++ * kernel/sched_adaptive.c ++ * ++ * Implementation of Aaron's adaptive global EDF scheduling algorithm. It is ++ * based on the GSN-EDF scheduler. However, it does not support synchronization ++ * primitives. ++ * ++ * It implements a version of FC-GEDF with a bunch of linearity assumptions for ++ * the optimizer and the the weight-transfer function. The code is meant to be ++ * clear, however you really need to read the paper if you want to understand ++ * what is going on here. ++ * ++ * Block et al., "Feedback-Controlled Adaptive Multiprocessor Real-Time ++ * Systems", submitted to RTAS 2008. ++ */ ++ ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++/* Overview of GSN-EDF operations. ++ * ++ * For a detailed explanation of GSN-EDF have a look at the FMLP paper. This ++ * description only covers how the individual operations are implemented in ++ * LITMUS. ++ * ++ * link_task_to_cpu(T, cpu) - Low-level operation to update the linkage ++ * structure (NOT the actually scheduled ++ * task). If there is another linked task To ++ * already it will set To->linked_on = NO_CPU ++ * (thereby removing its association with this ++ * CPU). However, it will not requeue the ++ * previously linked task (if any). It will set ++ * T's state to RT_F_RUNNING and check whether ++ * it is already running somewhere else. If T ++ * is scheduled somewhere else it will link ++ * it to that CPU instead (and pull the linked ++ * task to cpu). T may be NULL. ++ * ++ * unlink(T) - Unlink removes T from all scheduler data ++ * structures. If it is linked to some CPU it ++ * will link NULL to that CPU. If it is ++ * currently queued in the gsnedf queue it will ++ * be removed from the T->rt_list. It is safe to ++ * call unlink(T) if T is not linked. T may not ++ * be NULL. ++ * ++ * requeue(T) - Requeue will insert T into the appropriate ++ * queue. If the system is in real-time mode and ++ * the T is released already, it will go into the ++ * ready queue. If the system is not in ++ * real-time mode is T, then T will go into the ++ * release queue. If T's release time is in the ++ * future, it will go into the release ++ * queue. That means that T's release time/job ++ * no/etc. has to be updated before requeu(T) is ++ * called. It is not safe to call requeue(T) ++ * when T is already queued. T may not be NULL. ++ * ++ * gsnedf_job_arrival(T) - This is the catch all function when T enters ++ * the system after either a suspension or at a ++ * job release. It will queue T (which means it ++ * is not safe to call gsnedf_job_arrival(T) if ++ * T is already queued) and then check whether a ++ * preemption is necessary. If a preemption is ++ * necessary it will update the linkage ++ * accordingly and cause scheduled to be called ++ * (either with an IPI or need_resched). It is ++ * safe to call gsnedf_job_arrival(T) if T's ++ * next job has not been actually released yet ++ * (releast time in the future). 
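/*
 * Illustrative sketch (not taken from the patch): a stripped-down model
 * of the link_task_to_cpu()/unlink() bookkeeping described above.  Only
 * the per-CPU "linked" pointer is kept; the scheduled-on swap, the CPU
 * queue reordering and the requeueing of a displaced task are left to
 * the caller, exactly as the overview comment prescribes.
 */
#include <stdio.h>

#define NO_CPU  (-1)
#define NCPUS   2

struct task {
        const char *name;
        int linked_on;
};

static struct task *linked[NCPUS];

/* Link t (possibly NULL) to cpu and hand any displaced task back to the
 * caller, which is responsible for requeueing it. */
static struct task *link_task_to_cpu(struct task *t, int cpu)
{
        struct task *displaced = linked[cpu];

        if (displaced)
                displaced->linked_on = NO_CPU;
        if (t)
                t->linked_on = cpu;
        linked[cpu] = t;
        return displaced;
}

static void unlink_task(struct task *t)
{
        if (t->linked_on != NO_CPU)
                link_task_to_cpu(NULL, t->linked_on);
}

int main(void)
{
        struct task a = { "A", NO_CPU }, b = { "B", NO_CPU };
        struct task *displaced;

        link_task_to_cpu(&a, 0);
        displaced = link_task_to_cpu(&b, 0);    /* A loses its CPU */
        printf("displaced: %s\n", displaced ? displaced->name : "none");

        unlink_task(&b);
        printf("cpu 0 now runs: %s\n", linked[0] ? linked[0]->name : "idle");
        return 0;
}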
T will be put ++ * on the release queue in that case. ++ * ++ * job_completion(T) - Take care of everything that needs to be done ++ * to prepare T for its next release and place ++ * it in the right queue with ++ * gsnedf_job_arrival(). ++ * ++ * ++ * When we now that T is linked to CPU then link_task_to_cpu(NULL, CPU) is ++ * equivalent to unlink(T). Note that if you unlink a task from a CPU none of ++ * the functions will automatically propagate pending task from the ready queue ++ * to a linked task. This is the job of the calling function ( by means of ++ * __take_ready). ++ */ ++ ++static void unlink(struct task_struct* t); ++static void adaptive_job_arrival(struct task_struct* task); ++ ++/* cpu_entry_t - maintain the linked and scheduled state ++ */ ++typedef struct { ++ int cpu; ++ struct task_struct* linked; /* only RT tasks */ ++ struct task_struct* scheduled; /* only RT tasks */ ++ struct list_head list; ++ atomic_t will_schedule; /* prevent unneeded IPIs */ ++} cpu_entry_t; ++DEFINE_PER_CPU(cpu_entry_t, adaptive_cpu_entries); ++ ++#define set_will_schedule() \ ++ (atomic_set(&__get_cpu_var(adaptive_cpu_entries).will_schedule, 1)) ++#define clear_will_schedule() \ ++ (atomic_set(&__get_cpu_var(adaptive_cpu_entries).will_schedule, 0)) ++#define test_will_schedule(cpu) \ ++ (atomic_read(&per_cpu(adaptive_cpu_entries, cpu).will_schedule)) ++ ++ ++#define NO_CPU 0xffffffff ++ ++/* The gsnedf_lock is used to serialize all scheduling events. ++ * It protects ++ */ ++static queuelock_t adaptive_lock; ++/* the cpus queue themselves according to priority in here */ ++static LIST_HEAD(adaptive_cpu_queue); ++ ++static rt_domain_t adaptive; ++ ++/* feedback control parameters */ ++static fp_t fc_a, fc_b; ++ ++/* optimizer trigger */ ++static jiffie_t last_optimizer_run; ++static jiffie_t optimizer_min_invocation_sep; ++static jiffie_t optimizer_period; ++static fp_t task_error_threshold; ++ ++static fp_t system_capacity; ++/* total actual weight of the task system */ ++static fp_t total_weight; ++ ++/* optimizer time snapshot */ ++jiffie_t opt_time; ++ ++/* Delayed weight increase notification list. ++ * This list gets clobbered on each optimizer run. ++ */ ++static LIST_HEAD(adaptive_inc_list); ++ ++/* comment out to disable optimizer debugging */ ++#define ENABLE_OPTIMIZER_DEBUGGING ++ ++#ifdef ENABLE_OPTIMIZER_DEBUGGING ++#define OPT_DBG TRACE ++#define OPT_DBG_T TRACE_TASK ++#else ++#define OPT_DBG ++#define OPT_DBG_T OPT_D ++#endif ++ ++/******************************************************************************/ ++/* OPTIMIZER MATH */ ++/******************************************************************************/ ++ ++/* All time dependent functions ++ * rely on opt_time. ++ * Update in the optimizer before use! 
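/*
 * Illustrative sketch (not taken from the patch): the two-parameter
 * feedback update that update_estimate() further down in this file
 * applies to a task's weight estimate, restated with doubles instead of
 * fp_t.  The gains correspond to fc_a and fc_b, which
 * init_adaptive_plugin() sets to 0.102 and 0.303.
 */
#include <stdio.h>

struct predictor {
        double estimate;        /* current weight estimate */
        double accumulated;     /* accumulated (integrated) error */
};

static void update_estimate(struct predictor *p, double actual,
                            double a, double b)
{
        double err = actual - p->estimate;

        p->estimate = a * err + b * p->accumulated;
        p->accumulated += err;
}

int main(void)
{
        /* observed per-job weights of a task whose true weight drifts up */
        double actual[] = { 0.20, 0.22, 0.25, 0.25, 0.25 };
        struct predictor p = { 0.20, 0.0 };
        unsigned int i;

        for (i = 0; i < sizeof(actual) / sizeof(actual[0]); i++) {
                update_estimate(&p, actual[i], 0.102, 0.303);
                printf("job %u: estimate %.4f\n", i + 1, p.estimate);
        }
        return 0;
}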
++ */ ++ ++static inline fp_t ideal(fp_t weight, jiffie_t delta_t) ++{ ++ return _mul(weight, FP(delta_t)); ++} ++ ++static noinline long ideal_exec_time(struct task_struct* t) ++{ ++ jiffie_t delta = opt_time - get_last_release(t); ++ return _round(ideal(get_est_weight(t), delta)); ++} ++ ++/* this makes a whole bunch of linearity assumptions */ ++static noinline fp_t weight_transfer(struct task_struct* t, ++ unsigned int from, unsigned int to, ++ fp_t act_weight) ++{ ++ fp_t rel_from, rel_to, ret; ++ rel_from = get_sl(t, from).weight; ++ rel_to = get_sl(t, to).weight; ++ ret.val = (act_weight.val * rel_to.val) / rel_from.val; ++ OPT_DBG("weight_transfer(%ld, %ld, %ld) => %ld to=%u from=%u\n", ++ rel_from.val, rel_to.val, act_weight.val, ret.val, from, to); ++ ++ return ret; ++} ++ ++static noinline fp_t est_weight_at(struct task_struct* t, unsigned int level) ++{ ++ if (t->rt_param.no_service_levels) ++ return weight_transfer(t, get_cur_sl(t), level, ++ get_est_weight(t)); ++ else ++ return get_est_weight(t); ++ ++} ++ ++static noinline void update_estimate(predictor_state_t *state, fp_t actual_weight, ++ fp_t a, fp_t b) ++{ ++ fp_t err, new; ++ ++ OPT_DBG("OLD ESTIMATE Weight" _FP_ " ActWt " _FP_ " A:" _FP_ ", B:" _FP_ ++ "\n", fp2str(state->estimate), fp2str(actual_weight), fp2str(a), ++ fp2str(b)); ++ err = _sub(actual_weight, state->estimate); ++ new = _add(_mul(a, err), ++ _mul(b, state->accumulated)); ++ ++ total_weight = _sub(total_weight, state->estimate); ++ state->estimate = new; ++ total_weight = _add(total_weight, state->estimate); ++ ++ state->accumulated = _add(state->accumulated, err); ++ OPT_DBG("ERROR " _FP_ ", NEW " _FP_ ", ACC" _FP_ "\n", fp2str(err), ++ fp2str(new), fp2str(state->accumulated)); ++ ++} ++ ++static noinline fp_t linear_metric(struct task_struct* t) ++{ ++ fp_t v1, vmax, g1, gmax; ++ fp_t est_w; ++ unsigned int l = t->rt_param.no_service_levels; ++ unsigned int lcur; ++ ++ if (l <= 1) ++ return FP(0); ++ ++ lcur = get_cur_sl(t);; ++ est_w = get_est_weight(t); ++ ++ OPT_DBG_T(t, " linear_metric: lcur=%u l=%u est_w=" _FP_ "\n", ++ lcur, l, est_w); ++ OPT_DBG_T(t, " linear_metric: est_w.val=%ld\n", est_w.val); ++ ++ ++ v1 = t->rt_param.service_level[0].value; ++ vmax = t->rt_param.service_level[l - 1].value; ++ ++ OPT_DBG_T(t, " linear_metric: v1=" _FP_ " vmax=" _FP_ "\n", v1, vmax); ++ OPT_DBG_T(t, " linear_metric: v1=%ld vmax=%ld\n", v1.val, vmax.val); ++ ++ ++ g1 = weight_transfer(t, lcur, 0, est_w); ++ gmax = weight_transfer(t, lcur, l - 1, est_w); ++ ++ OPT_DBG_T(t, " linear_metric: g1=" _FP_ " gmax=" _FP_ "\n", g1, gmax); ++ OPT_DBG_T(t, " linear_metric: g1=%ld gmax=%ld\n", g1, gmax); ++ ++ ++ TRACE_BUG_ON(_eq(_sub(gmax, g1), FP(0))); ++ if (_eq(_sub(gmax, g1), FP(0))) ++ return FP(0); ++ return _div(_sub(vmax, v1), ++ _sub(gmax, g1)); ++} ++ ++static noinline unsigned long reweighted_period(fp_t ow, fp_t nw, ++ unsigned long alloc, ++ jiffie_t deadline, ++ jiffie_t release) ++{ ++ fp_t dl; ++ dl = _mul(FP(deadline - release), ow); ++ dl = _sub(dl, FP(alloc)); ++ if(_eq(nw, FP(0))) ++ return 0; ++ dl = _div(dl, nw); ++ return _round(dl); ++} ++ ++static noinline int is_under_allocated(struct task_struct* t) ++{ ++ return ideal_exec_time(t) >= t->rt_param.times.exec_time; ++} ++ ++static noinline jiffie_t dec_equal_point_delay(struct task_struct* t) ++{ ++ if (_lt(FP(0), get_est_weight(t))) ++ /* when t was released plus time needed to equalize ++ * minus now ++ */ ++ return get_last_release(t) + ++ _round(_div( FP(t->rt_param.times.exec_time), ++ 
get_est_weight(t))) - ++ opt_time; ++ else ++ /* if the weight is zero we just take the ++ * deadline ++ */ ++ return t->rt_param.times.deadline; ++} ++ ++static noinline jiffie_t inc_equal_point_delay(struct task_struct* t) ++{ ++ if (_lt(FP(0), t->rt_param.opt_nw)) ++ /* when t was released plus time needed to equalize ++ * minus now ++ */ ++ return get_last_release(t) + ++ _round(_div( FP(t->rt_param.times.exec_time), ++ t->rt_param.opt_nw)) - ++ opt_time; ++ else ++ /* if the weight is zero we just take the ++ * deadline ++ */ ++ return t->rt_param.times.deadline; ++} ++ ++static noinline jiffie_t decrease_delay(struct task_struct* t) ++{ ++ if (has_active_job(t) && !is_under_allocated(t)) ++ return dec_equal_point_delay(t); ++ return 0; ++} ++ ++ ++ ++/******************************************************************************/ ++/* SORT ORDERS */ ++/******************************************************************************/ ++ ++static int by_linear_metric(struct list_head* a, struct list_head* b) ++{ ++ struct task_struct *ta, *tb; ++ ta = list_entry(a, struct task_struct, rt_param.opt_list); ++ tb = list_entry(b, struct task_struct, rt_param.opt_list); ++ return _gt(ta->rt_param.opt_order, tb->rt_param.opt_order); ++} ++ ++static int by_delta_weight(struct list_head* a, struct list_head* b) ++{ ++ struct task_struct *ta, *tb; ++ ta = list_entry(a, struct task_struct, rt_param.opt_list); ++ tb = list_entry(b, struct task_struct, rt_param.opt_list); ++ return _lt(ta->rt_param.opt_dw, tb->rt_param.opt_dw); ++} ++ ++static int by_enactment_time(struct list_head* a, struct list_head* b) ++{ ++ struct task_struct *ta, *tb; ++ ta = list_entry(a, struct task_struct, rt_param.opt_list); ++ tb = list_entry(b, struct task_struct, rt_param.opt_list); ++ return ta->rt_param.opt_change < tb->rt_param.opt_change; ++} ++ ++/******************************************************************************/ ++/* WEIGHT CHANGE MECHANICS */ ++/******************************************************************************/ ++ ++static void set_service_level(struct task_struct* t, unsigned int level) ++{ ++ service_level_t *new; ++ unsigned int old; ++ BUG_ON(!t); ++ BUG_ON(t->rt_param.no_service_levels <= level); ++ ++ old = t->rt_param.cur_service_level; ++ t->rt_param.cur_service_level = level; ++ new = t->rt_param.service_level + level; ++ t->rt_param.basic_params.period = new->period; ++ t->rt_param.basic_params.exec_cost = _round(_mul(new->weight, ++ FP(new->period))); ++ ++ scheduler_signal(t, SIGUSR1); ++ ++ sched_trace_service_level_change(t, old, level); ++ OPT_DBG_T(t, "service level %u activated\n", level); ++} ++ ++/* call this _before_ updating deadline and release of t */ ++static void update_weight_estimate(struct task_struct* t) ++{ ++ fp_t nw, ow; ++ jiffie_t sl_period, exec_time; ++ ++ ow = get_est_weight(t); ++ nw = t->rt_param.opt_nw; ++ exec_time = t->rt_param.times.exec_time; ++ sl_period = get_sl(t, get_opt_sl(t)).period; ++ ++ OPT_DBG("ow=" _FP_ " nw=" _FP_ ", r-d " _FP_ ++ ", deadline %d, release %d, exec_time=%ld sl_period=%lu\n", ++ fp2str(ow), fp2str(nw), ++ fp2str(FP(get_deadline(t) - get_last_release(t))), ++ get_deadline(t), get_last_release(t), exec_time, sl_period); ++ ++ total_weight = _sub(total_weight, get_est_weight(t)); ++ t->rt_param.predictor_state.estimate = nw; ++ OPT_DBG_T(t, "update_weight_estimate from " _FP_ " to "_FP_"\n", ++ fp2str(ow), fp2str(nw)); ++ total_weight = _add(total_weight, get_est_weight(t)); ++ ++ OPT_DBG_T(t, " update_weight_estimate: " 
_FP_ " => " _FP_ "\n", ++ fp2str(ow), fp2str(get_est_weight(t))); ++} ++ ++ ++static void decrease_weight(struct task_struct* t) ++{ ++ fp_t ow, nw; ++ jiffie_t last, period, delay; ++ ++ ow = get_sl(t, get_cur_sl(t)).weight; ++ nw = get_sl(t, get_opt_sl(t)).weight; ++ last = t->rt_param.times.last_release; ++ period = reweighted_period(ow, nw, t->rt_param.times.exec_time, ++ t->rt_param.times.deadline, last); ++ ++ /* necessary delay has already been computed by optimizer */ ++ delay = t->rt_param.opt_change; ++ ++ update_weight_estimate(t); ++ ++ if (!delay) ++ t->rt_param.times.last_release = opt_time; ++ t->rt_param.times.release = opt_time + delay; ++ t->rt_param.times.deadline = opt_time + delay + period; ++ ++ set_service_level(t, get_opt_sl(t)); ++ ++ /* take out of queue/link structure */ ++ unlink(t); ++ /* present as a new job */ ++ adaptive_job_arrival(t); ++} ++ ++ ++static void increase_weight(struct task_struct* t) ++{ ++ fp_t ow, nw; ++ jiffie_t last, period, delay; ++ ++ ow = get_sl(t, get_cur_sl(t)).weight; ++ nw = get_sl(t, get_opt_sl(t)).weight; ++ last = t->rt_param.times.last_release; ++ period = reweighted_period(ow, nw, t->rt_param.times.exec_time, ++ t->rt_param.times.deadline, last); ++ ++ if (t->rt_param.opt_change == 0) { ++ /* can be enacted now */ ++ if (is_under_allocated(t) || ++ time_before(opt_time + period, get_deadline(t))) ++ /* do it now */ ++ delay = 0; ++ else { ++ if (is_under_allocated(t)) { ++ t->rt_param.opt_change += opt_time; ++ /* The next job release will notice that opt != ++ * sl and initiate a weight change. ++ */ ++ return; ++ } else ++ /* nope, wait for equal point */ ++ delay = inc_equal_point_delay(t); ++ } ++ ++ update_weight_estimate(t); ++ ++ if (!delay) ++ t->rt_param.times.last_release = opt_time; ++ t->rt_param.times.release = opt_time + delay; ++ t->rt_param.times.deadline = opt_time + delay + period; ++ ++ set_service_level(t, get_opt_sl(t)); ++ ++ /* take out of queue/link structure */ ++ unlink(t); ++ /* present as a new job */ ++ adaptive_job_arrival(t); ++ ++ } else { ++ /* must wait until capacity is released */ ++ t->rt_param.opt_change += opt_time; ++ list_insert(&t->rt_param.opt_list, &adaptive_inc_list, ++ by_enactment_time); ++ } ++} ++ ++static void delayed_increase_weight(void) ++{ ++ struct list_head *p, *extra; ++ struct task_struct* t; ++ ++ opt_time = jiffies; ++ list_for_each_safe(p, extra, &adaptive_inc_list) { ++ t = list_entry(p, struct task_struct, rt_param.opt_list); ++ if (time_before_eq(t->rt_param.opt_change, opt_time)) { ++ list_del(p); ++ /* prevent recursion */ ++ t->rt_param.opt_change = 0; ++ /* this takes care of everything */ ++ increase_weight(t); ++ } else ++ /* list is sorted */ ++ break; ++ } ++} ++ ++static void change_weight(struct task_struct* t) ++{ ++ if (get_cur_sl(t) < get_opt_sl(t)) ++ increase_weight(t); ++ else ++ decrease_weight(t); ++ OPT_DBG_T(t, "after change_weight: last_rel:%d rel:%d dl:%d\n", ++ get_last_release(t), ++ get_release(t), ++ get_deadline(t)); ++} ++ ++/******************************************************************************/ ++/* OPTIMIZER */ ++/******************************************************************************/ ++ ++/* only invoke with adaptive_lock behing held */ ++void adaptive_optimize(void) ++{ ++ struct list_head list; ++ struct list_head inc, dec; ++ struct list_head *p, *extra; ++ cpu_entry_t *cpu; ++ struct task_struct* t; ++ fp_t M = FP(0), w0, wl, tmp, estU = FP(0); ++ unsigned int l; ++ jiffie_t enactment_time; ++ ++ if 
(time_before(jiffies, ++ last_optimizer_run + optimizer_min_invocation_sep)) ++ return; ++ ++ OPT_DBG(":::::: running adaptive optimizer\n"); ++ opt_time = jiffies; ++ ++ INIT_LIST_HEAD(&list); ++ ++ /* 1) gather all tasks */ ++ list_for_each(p, &adaptive.ready_queue) ++ list_add(&(rt_list2task(p)->rt_param.opt_list), &list); ++ list_for_each(p, &adaptive.release_queue) ++ list_add(&(rt_list2task(p)->rt_param.opt_list), &list); ++ list_for_each(p, &adaptive_cpu_queue) { ++ cpu = list_entry(p, cpu_entry_t, list); ++ if (cpu->linked) ++ list_add(&cpu->linked->rt_param.opt_list, &list); ++ } ++ ++ /* 2) determine current system capacity */ ++ M = system_capacity; ++ OPT_DBG("opt: system capacity: " _FP_ "\n", fp2str(M)); ++ ++ /* 3) Compute L value for all tasks, ++ * and set tasks to service level 0, ++ * also account for weight. ++ * Also establish current estimated utilization ++ */ ++ list_for_each_safe(p, extra, &list) { ++ t = list_entry(p, struct task_struct, rt_param.opt_list); ++ if (time_before(opt_time, get_last_release(t))) { ++ list_del(p); ++ continue; ++ } ++ t->rt_param.opt_order = linear_metric(t); ++ OPT_DBG_T(t, "est_w = " _FP_ " L = " _FP_ "\n", ++ get_est_weight(t), ++ fp2str(t->rt_param.opt_order)); ++ t->rt_param.opt_level = 0; ++ M = _sub(M, est_weight_at(t, 0)); ++ estU = _add(estU, get_est_weight(t)); ++ } ++ OPT_DBG("opt: estimated utilization: " _FP_ "\n", fp2str(estU)); ++ OPT_DBG("opt: estimated capacity at all sl=0: " _FP_ "\n", fp2str(M)); ++ ++ ++ /* 4) sort list by decreasing linear metric */ ++ list_qsort(&list, by_linear_metric); ++ ++ /* 5) assign each task a service level */ ++ list_for_each(p, &list) { ++ t = list_entry(p, struct task_struct, rt_param.opt_list); ++ l = t->rt_param.no_service_levels; ++ w0 = est_weight_at(t, 0); ++ while (l > 1) { ++ l--; ++ wl = est_weight_at(t, l); ++ tmp = _sub(M, _sub(wl, w0)); ++ if (_leq(FP(0), tmp)) { ++ /* this level fits in */ ++ M = tmp; ++ t->rt_param.opt_level = l; ++ t->rt_param.opt_dw = _sub(wl, ++ get_est_weight(t)); ++ t->rt_param.opt_nw = wl; ++ break; /* proceed to next task */ ++ } ++ } ++ OPT_DBG_T(t, " will run at sl=%u, prior=%u dw=" _FP_ "\n", ++ l, get_cur_sl(t), fp2str(t->rt_param.opt_dw)); ++ ++ } ++ ++ /* 6) filter tasks that reweight */ ++ INIT_LIST_HEAD(&inc); ++ INIT_LIST_HEAD(&dec); ++ list_for_each_safe(p, extra, &list) { ++ t = list_entry(p, struct task_struct, rt_param.opt_list); ++ list_del(p); ++ if (t->rt_param.opt_level < get_cur_sl(t)) { ++ list_add(p, &dec); ++ t->rt_param.opt_change = decrease_delay(t); ++ } else if (t->rt_param.opt_level > get_cur_sl(t)) { ++ list_add(p, &inc); ++ t->rt_param.opt_change = 0; ++ } ++ /* if t doesn't change we can ignore it from now on */ ++ } ++ ++ /* 7) sort dec and inc list */ ++ list_qsort(&dec, by_enactment_time); ++ list_qsort(&inc, by_delta_weight); ++ ++ /* 8) now figure out when we can enact weight increases ++ * It works like this: We know the current system utilization. ++ * Thus, we know the remaining capacity. We also know when ++ * decreases are going to be enacted (=> capacity increases). ++ * Now we only need to find a spot where the weight increase will ++ * not drive the system into overload. ++ */ ++ ++ /* Very ugly jump, but we need to force enactment_time = 0 ++ * during the first iteration. 
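/*
 * Illustrative sketch (not taken from the patch): steps 3) to 5) of
 * adaptive_optimize() in miniature.  Every task is first dropped to
 * service level 0 and the leftover capacity M is computed; then each
 * task, visited in order of decreasing linear metric, is raised to the
 * highest service level whose extra weight still fits into M.  The
 * enactment-delay machinery of steps 6) to 8) is ignored, and plain
 * doubles stand in for fp_t.
 */
#include <stdio.h>

#define NLEVELS 3
#define NTASKS  3

struct task {
        const char *name;
        double weight[NLEVELS]; /* estimated weight at each service level */
        int level;              /* chosen service level */
};

int main(void)
{
        /* already sorted by decreasing linear metric */
        struct task tasks[NTASKS] = {
                { "T1", { 0.5, 0.8, 1.0 }, 0 },
                { "T2", { 0.4, 0.7, 0.9 }, 0 },
                { "T3", { 0.3, 0.5, 0.8 }, 0 },
        };
        double M = 2.0;         /* system capacity: two CPUs */
        unsigned int i;
        int l;

        for (i = 0; i < NTASKS; i++)
                M -= tasks[i].weight[0];        /* everyone starts at level 0 */

        for (i = 0; i < NTASKS; i++) {
                for (l = NLEVELS - 1; l > 0; l--) {
                        double extra = tasks[i].weight[l] - tasks[i].weight[0];

                        if (extra <= M) {
                                M -= extra;
                                tasks[i].level = l;
                                break;
                        }
                }
                printf("%s -> level %d (remaining capacity %.2f)\n",
                       tasks[i].name, tasks[i].level, M);
        }
        return 0;
}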
++ */ ++ M = system_capacity; ++ enactment_time = 0; ++ goto first_iteration; ++ ++ while (!list_empty(&inc)) { ++ enactment_time = list_entry(dec.next, struct task_struct, ++ rt_param.opt_list) ++ ->rt_param.opt_change; ++ first_iteration: ++ /* Start by collapsing the next decrease. ++ * Except for in the first iteration, it will always ++ * pick off at least one task. ++ */ ++ list_for_each_safe(p, extra, &dec) { ++ t = list_entry(p, struct task_struct, ++ rt_param.opt_list); ++ if (t->rt_param.opt_change == enactment_time) { ++ list_del(p); ++ /* opt_dw is negative */ ++ estU = _add(estU, t->rt_param.opt_dw); ++ list_add(p, &list); ++ ++ OPT_DBG_T(t, " weight decrease at %ld => estU=" ++ _FP_ "\n", enactment_time, ++ fp2str(estU)); ++ ++ } else ++ /* stop decrease loop */ ++ break; ++ } ++ ++ /* now start setting enactment times for increases */ ++ while (!list_empty(&inc)) { ++ p = inc.next; ++ t = list_entry(p, struct task_struct, ++ rt_param.opt_list); ++ tmp = _add(estU, t->rt_param.opt_dw); ++ if (_leq(tmp, M)) { ++ /* it fits */ ++ estU = tmp; ++ t->rt_param.opt_change = enactment_time; ++ list_del(p); ++ list_add(p, &list); ++ ++ OPT_DBG_T(t, " weight increase at %ld => estU=" ++ _FP_ "\n", enactment_time, ++ fp2str(estU)); ++ ++ } else ++ /* stop increase loop */ ++ break; ++ } ++ ++ TRACE_BUG_ON(list_empty(&dec) && !list_empty(&inc)); ++ if (list_empty(&dec) && !list_empty(&inc)) ++ /* break out in case of bug */ ++ break; ++ } ++ ++ /* 9) Wow. We made it. Every task has a now a new service level ++ * assigned, together with a correct (earliest) enactment time. ++ * all we have left to do now is to enact changes that did not get ++ * delayed. Also convert change fields to actual timestamp for to be ++ * nice to the scheduler_tick(). ++ */ ++ INIT_LIST_HEAD(&adaptive_inc_list); ++ list_for_each_safe(p, extra, &list) { ++ t = list_entry(p, struct task_struct, rt_param.opt_list); ++ list_del(p); ++ change_weight(t); ++ } ++ ++ last_optimizer_run = jiffies; ++ OPT_DBG(":::::: optimizer run complete\n"); ++} ++ ++/* update_cpu_position - Move the cpu entry to the correct place to maintain ++ * order in the cpu queue. Caller must hold adaptive lock. ++ */ ++static void update_cpu_position(cpu_entry_t *entry) ++{ ++ cpu_entry_t *other; ++ struct list_head *pos; ++ list_del(&entry->list); ++ /* if we do not execute real-time jobs we just move ++ * to the end of the queue ++ */ ++ if (entry->linked) { ++ list_for_each(pos, &adaptive_cpu_queue) { ++ other = list_entry(pos, cpu_entry_t, list); ++ if (edf_higher_prio(entry->linked, other->linked)) { ++ __list_add(&entry->list, pos->prev, pos); ++ return; ++ } ++ } ++ } ++ /* if we get this far we have the lowest priority job */ ++ list_add_tail(&entry->list, &adaptive_cpu_queue); ++} ++ ++/* link_task_to_cpu - Update the link of a CPU. ++ * Handles the case where the to-be-linked task is already ++ * scheduled on a different CPU. ++ */ ++static noinline void link_task_to_cpu(struct task_struct* linked, ++ cpu_entry_t *entry) ++ ++{ ++ cpu_entry_t *sched; ++ struct task_struct* tmp; ++ int on_cpu; ++ ++ BUG_ON(linked && !is_realtime(linked)); ++ ++ /* Currently linked task is set to be unlinked. */ ++ if (entry->linked) ++ entry->linked->rt_param.linked_on = NO_CPU; ++ ++ /* Link new task to CPU. */ ++ if (linked) { ++ set_rt_flags(linked, RT_F_RUNNING); ++ /* handle task is already scheduled somewhere! 
*/ ++ on_cpu = linked->rt_param.scheduled_on; ++ if (on_cpu != NO_CPU) { ++ sched = &per_cpu(adaptive_cpu_entries, on_cpu); ++ /* this should only happen if not linked already */ ++ BUG_ON(sched->linked == linked); ++ ++ /* If we are already scheduled on the CPU to which we ++ * wanted to link, we don't need to do the swap -- ++ * we just link ourselves to the CPU and depend on ++ * the caller to get things right. ++ */ ++ if (entry != sched) { ++ tmp = sched->linked; ++ linked->rt_param.linked_on = sched->cpu; ++ sched->linked = linked; ++ update_cpu_position(sched); ++ linked = tmp; ++ } ++ } ++ if (linked) /* might be NULL due to swap */ ++ linked->rt_param.linked_on = entry->cpu; ++ } ++ entry->linked = linked; ++ update_cpu_position(entry); ++} ++ ++/* unlink - Make sure a task is not linked any longer to an entry ++ * where it was linked before. Must hold adaptive_lock. ++ */ ++static void unlink(struct task_struct* t) ++{ ++ cpu_entry_t *entry; ++ ++ if (unlikely(!t)) { ++ TRACE_BUG_ON(!t); ++ return; ++ } ++ ++ if (t->rt_param.linked_on != NO_CPU) { ++ /* unlink */ ++ entry = &per_cpu(adaptive_cpu_entries, t->rt_param.linked_on); ++ t->rt_param.linked_on = NO_CPU; ++ link_task_to_cpu(NULL, entry); ++ } else if (in_list(&t->rt_list)) { ++ /* This is an interesting situation: t is scheduled, ++ * but was just recently unlinked. It cannot be ++ * linked anywhere else (because then it would have ++ * been relinked to this CPU), thus it must be in some ++ * queue. We must remove it from the list in this ++ * case. ++ */ ++ list_del(&t->rt_list); ++ } ++} ++ ++ ++/* preempt - force a CPU to reschedule ++ */ ++static noinline void preempt(cpu_entry_t *entry) ++{ ++ /* We cannot make the is_np() decision here if it is a remote CPU ++ * because requesting exit_np() requires that we currently use the ++ * address space of the task. Thus, in the remote case we just send ++ * the IPI and let schedule() handle the problem. ++ */ ++ ++ if (smp_processor_id() == entry->cpu) { ++ if (entry->scheduled && is_np(entry->scheduled)) ++ request_exit_np(entry->scheduled); ++ else ++ set_tsk_need_resched(current); ++ } else ++ /* in case that it is a remote CPU we have to defer the ++ * the decision to the remote CPU ++ */ ++ if (!test_will_schedule(entry->cpu)) ++ smp_send_reschedule(entry->cpu); ++} ++ ++/* requeue - Put an unlinked task into gsn-edf domain. ++ * Caller must hold adaptive_lock. ++ */ ++static noinline void requeue(struct task_struct* task) ++{ ++ BUG_ON(!task); ++ /* sanity check rt_list before insertion */ ++ BUG_ON(in_list(&task->rt_list)); ++ ++ if (get_rt_flags(task) == RT_F_SLEEP || ++ get_rt_mode() != MODE_RT_RUN) { ++ /* this task has expired ++ * _schedule has already taken care of updating ++ * the release and ++ * deadline. We just must check if it has been released. 
++ */ ++ if (is_released(task) && get_rt_mode() == MODE_RT_RUN) ++ __add_ready(&adaptive, task); ++ else { ++ /* it has got to wait */ ++ __add_release(&adaptive, task); ++ } ++ ++ } else ++ /* this is a forced preemption ++ * thus the task stays in the ready_queue ++ * we only must make it available to others ++ */ ++ __add_ready(&adaptive, task); ++} ++ ++/* adaptive_job_arrival: task is either resumed or released */ ++static void adaptive_job_arrival(struct task_struct* task) ++{ ++ cpu_entry_t* last; ++ ++ BUG_ON(list_empty(&adaptive_cpu_queue)); ++ BUG_ON(!task); ++ ++ TRACE_TASK(task, "job_arrival: last_rel=%d rel=%d dl=%d now=%d\n", ++ get_last_release(task), get_release(task), ++ get_deadline(task), ++ jiffies); ++ ++ ++ /* first queue arriving job */ ++ requeue(task); ++ ++ /* then check for any necessary preemptions */ ++ last = list_entry(adaptive_cpu_queue.prev, cpu_entry_t, list); ++ if (edf_preemption_needed(&adaptive, last->linked)) { ++ /* preemption necessary */ ++ task = __take_ready(&adaptive); ++ ++ TRACE("job_arrival: task %d linked to %d\n", ++ task->pid, last->cpu); ++ ++ if (last->linked) ++ requeue(last->linked); ++ ++ link_task_to_cpu(task, last); ++ preempt(last); ++ } ++} ++ ++/* check for current job releases */ ++static noinline void adaptive_release_jobs(void) ++{ ++ struct list_head *pos, *save; ++ struct task_struct *queued; ++ ++ list_for_each_safe(pos, save, &adaptive.release_queue) { ++ queued = list_entry(pos, struct task_struct, rt_list); ++ if (likely(is_released(queued))) { ++ TRACE_TASK(queued, "released rel=%d now=%d\n", ++ get_release(queued), jiffies); ++ /* this one is ready to go*/ ++ list_del(pos); ++ set_rt_flags(queued, RT_F_RUNNING); ++ queued->rt_param.times.last_release = ++ queued->rt_param.times.release; ++ ++ /* check for delayed weight increase */ ++ if (get_opt_sl(queued) != get_cur_sl(queued) && ++ time_before_eq(queued->rt_param.opt_change, jiffies)) { ++ opt_time = jiffies; ++ set_service_level(queued, get_opt_sl(queued)); ++ queued->rt_param.times.deadline = ++ get_last_release(queued) + ++ get_rt_period(queued); ++ total_weight = _sub(total_weight, get_est_weight(queued)); ++ queued->rt_param.predictor_state.estimate = ++ queued->rt_param.opt_nw; ++ total_weight = _add(total_weight, get_est_weight(queued)); ++ } ++ ++ sched_trace_job_release(queued); ++ adaptive_job_arrival(queued); ++ } ++ else ++ /* the release queue is ordered */ ++ break; ++ } ++} ++ ++/* adaptive_scheduler_tick - this function is called for every local timer ++ * interrupt. ++ * ++ * checks whether the current task has expired and checks ++ * whether we need to preempt it if it has not expired ++ */ ++static reschedule_check_t adaptive_scheduler_tick(void) ++{ ++ unsigned long flags; ++ struct task_struct* t = current; ++ reschedule_check_t want_resched = NO_RESCHED; ++ ++ /* Account for exec time. ++ * Since we don't preempt forcefully, nothing else needs to be done. 
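/*
 * Illustrative sketch (not taken from the patch): the preemption test
 * performed by adaptive_job_arrival() after queueing a job.  The real
 * code keeps the CPU queue sorted so that its tail holds the lowest
 * priority linked job and asks edf_preemption_needed(); here that tail
 * is simply recomputed by scanning an array of deadlines, with 0
 * standing for an idle CPU.
 */
#include <stdio.h>

#define NCPUS 4

/* deadline of the job linked to each CPU; 0 means the CPU is idle */
static unsigned long linked_dl[NCPUS] = { 40, 25, 60, 35 };

static int find_lowest_prio_cpu(void)
{
        int cpu, worst = 0;

        for (cpu = 0; cpu < NCPUS; cpu++) {
                if (!linked_dl[cpu])
                        return cpu;             /* idle CPU: use it first */
                if (linked_dl[cpu] > linked_dl[worst])
                        worst = cpu;
        }
        return worst;
}

static void job_arrival(unsigned long dl)
{
        int cpu = find_lowest_prio_cpu();

        if (!linked_dl[cpu] || dl < linked_dl[cpu]) {
                printf("deadline %lu preempts cpu %d (had %lu)\n",
                       dl, cpu, linked_dl[cpu]);
                linked_dl[cpu] = dl;            /* link, then cause a resched */
        } else {
                printf("deadline %lu stays queued\n", dl);
        }
}

int main(void)
{
        job_arrival(50);        /* beats the deadline-60 job on cpu 2 */
        job_arrival(70);        /* lower priority than everything linked */
        return 0;
}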
++ */ ++ if (is_realtime(t)) ++ t->rt_param.times.exec_time++; ++ ++ /* only the first CPU needs to release jobs */ ++ if (get_rt_mode() == MODE_RT_RUN) { ++ queue_lock_irqsave(&adaptive_lock, flags); ++ ++ /* (1) run the optimizer if it did not trigger often enough */ ++ if (time_before_eq(last_optimizer_run + optimizer_period, jiffies)) { ++ ++ OPT_DBG("adaptive: optimizing due to period threshold\n"); ++ ++ adaptive_optimize(); ++ } ++ ++ /* (2) enact delayed weight increases */ ++ delayed_increase_weight(); ++ ++ /* (3) try to release pending jobs */ ++ adaptive_release_jobs(); ++ ++ /* we don't need to check linked != scheduled since ++ * set_tsk_need_resched has been set by preempt() if necessary ++ */ ++ ++ queue_unlock_irqrestore(&adaptive_lock, flags); ++ } ++ ++ return want_resched; ++} ++ ++/* caller holds adaptive_lock */ ++static noinline void job_completion(struct task_struct *t) ++{ ++ long delta; ++ fp_t actual_weight, old_estimate; ++ unsigned int lcurr = get_cur_sl(t); ++ fp_t v = t->rt_param.service_level[lcurr].value; ++ ++ int non_zero_weight; ++ fp_t error_percentage; ++ int exceeds_threshold; ++ ++ BUG_ON(!t); ++ ++ TRACE_TASK(t, " completion, last_rel=%d rel=%d dl=%d now=%d " ++ "period=%d\n", ++ get_last_release(t), get_release(t), get_deadline(t), ++ jiffies, get_rt_period(t)); ++ ++ sched_trace_job_completion(t); ++ delta = t->rt_param.times.exec_time - ++ t->rt_param.basic_params.exec_cost; ++ ++ OPT_DBG_T(t, "job %d completes, delta WCET = %d\n", ++ t->rt_param.times.job_no, delta); ++ ++ actual_weight = _frac(t->rt_param.times.exec_time, ++ t->rt_param.basic_params.period); ++ sched_trace_weight_error(t, actual_weight); ++ old_estimate = get_est_weight(t); ++ update_estimate(&t->rt_param.predictor_state, actual_weight, ++ fc_a, fc_b); ++ ++ OPT_DBG_T(t, "Job %d completes. Current value " _FP_ ++ ", Weight estimation: error=" _FP_ " weight=" ++ _FP_ " => " _FP_ "\n",t->rt_param.times.job_no, v, ++ _sub(get_est_weight(t), old_estimate), ++ old_estimate, get_est_weight(t)); ++ ++ /* Now we have determined the task error. ++ * Next we release the next job. ++ * Then we optimize. It's easier for the optimizer to deal ++ * with just-released jobs. ++ */ ++ ++ /* prepare for next period */ ++ edf_prepare_for_next_period(t); ++ ++ TRACE_TASK(t, " prepped, last_rel=%d rel=%d dl=%d now=%d\n", ++ get_last_release(t), get_release(t), get_deadline(t), ++ jiffies); ++ ++ if (is_released(t)) { ++ /* set flags */ ++ /* prevent fake completions */ ++ set_rt_flags(t, RT_F_RUNNING); ++ t->rt_param.times.last_release = ++ t->rt_param.times.release; ++ } ++ ++ ++ non_zero_weight = !_eq(get_est_weight(t),FP(0)); ++ if (non_zero_weight) ++ error_percentage = _div(_abs(_sub(get_est_weight(t), ++ old_estimate)), ++ get_est_weight(t)); ++ else ++ error_percentage = FP(0); ++ exceeds_threshold = _gt(error_percentage, task_error_threshold); ++ ++ ++ if (exceeds_threshold) { ++ OPT_DBG("adaptive: optimizing due to task error threshold\n"); ++ adaptive_optimize(); ++ } else if (_gt(total_weight, system_capacity)) { ++ OPT_DBG("adaptive: optimizing due to system capacity exceeded\n"); ++ adaptive_optimize(); ++ } ++ ++ ++ /* unlink */ ++ unlink(t); ++ /* requeue ++ * But don't requeue a blocking task. */ ++ if (is_running(t)) ++ adaptive_job_arrival(t); ++} ++ ++ ++/* Getting schedule() right is a bit tricky. schedule() may not make any ++ * assumptions on the state of the current task since it may be called for a ++ * number of reasons. 
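/*
 * Illustrative sketch (not taken from the patch): the re-optimization
 * trigger evaluated at the end of job_completion() above, with doubles
 * standing in for fp_t.  The threshold corresponds to the 1/2 that
 * init_adaptive_plugin() assigns to task_error_threshold; exceeding it,
 * or exceeding the system capacity, causes another optimizer run.
 */
#include <math.h>
#include <stdio.h>

static int needs_reoptimization(double old_estimate, double new_estimate,
                                double total_weight, double capacity,
                                double threshold)
{
        double error_pct = 0.0;

        if (new_estimate != 0.0)
                error_pct = fabs(new_estimate - old_estimate) / new_estimate;

        return error_pct > threshold || total_weight > capacity;
}

int main(void)
{
        /* estimate jumped from 0.20 to 0.45: relative error ~0.56 > 0.5 */
        printf("%d\n", needs_reoptimization(0.20, 0.45, 1.5, 2.0, 0.5));
        /* small drift and the system is under capacity: no extra run */
        printf("%d\n", needs_reoptimization(0.20, 0.22, 1.5, 2.0, 0.5));
        return 0;
}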
The reasons include a scheduler_tick() determined that it ++ * was necessary, because sys_exit_np() was called, because some Linux ++ * subsystem determined so, or even (in the worst case) because there is a bug ++ * hidden somewhere. Thus, we must take extreme care to determine what the ++ * current state is. ++ * ++ * The CPU could currently be scheduling a task (or not), be linked (or not). ++ * ++ * The following assertions for the scheduled task could hold: ++ * ++ * - !is_running(scheduled) // the job blocks ++ * - get_rt_flag() == RT_F_SLEEP // the job completed (by syscall) ++ * - linked != scheduled // we need to reschedule (for any reason) ++ * ++ * Any of these can occur together. ++ */ ++static int adaptive_schedule(struct task_struct * prev, ++ struct task_struct ** next, ++ runqueue_t * rq) ++{ ++ cpu_entry_t* entry = &__get_cpu_var(adaptive_cpu_entries); ++ int sleep, preempt, exists, ++ rt, blocks; ++ struct task_struct* linked; ++ ++ /* Will be released in finish_switch. */ ++ queue_lock(&adaptive_lock); ++ clear_will_schedule(); ++ ++ /* sanity checking */ ++ BUG_ON(entry->scheduled && entry->scheduled != prev); ++ BUG_ON(entry->scheduled && !is_realtime(prev)); ++ ++ /* (0) Determine state */ ++ exists = entry->scheduled != NULL; ++ blocks = exists && !is_running(entry->scheduled); ++ sleep = exists && get_rt_flags(entry->scheduled) == RT_F_SLEEP; ++ preempt = entry->scheduled != entry->linked; ++ rt = get_rt_mode() == MODE_RT_RUN; ++ ++ /* If a task blocks we have no choice but to reschedule. ++ */ ++ if (blocks) ++ unlink(entry->scheduled); ++ ++ /* Task wants to sleep -> job is done. ++ */ ++ if (sleep) ++ job_completion(entry->scheduled); ++ ++ /* Stop real-time tasks when we leave real-time mode ++ */ ++ if (!rt && entry->linked) { ++ /* task will be preempted once it is preemptable ++ * (which it may be already) ++ */ ++ linked = entry->linked; ++ unlink(linked); ++ requeue(linked); ++ } ++ ++ /* Link pending task if we became unlinked. ++ */ ++ if (rt && !entry->linked) ++ link_task_to_cpu(__take_ready(&adaptive), entry); ++ ++ /* The final scheduling decision. Do we need to switch for some reason? ++ * If linked different from scheduled select linked as next. ++ */ ++ if (entry->linked != entry->scheduled) { ++ /* Take care of a previously scheduled ++ * job by taking it out of the Linux runqueue. ++ */ ++ if (entry->scheduled) ++ if (prev->array) ++ /* take it out of the run queue */ ++ deactivate_task(prev, rq); ++ ++ /* Schedule a linked job? */ ++ if (entry->linked) { ++ *next = entry->linked; ++ /* mark the task as executing on this cpu */ ++ set_task_cpu(*next, smp_processor_id()); ++ /* stick the task into the runqueue */ ++ __activate_task(*next, rq); ++ } ++ } else ++ /* Only override Linux scheduler if we have real-time task ++ * scheduled that needs to continue. ++ */ ++ if (exists) ++ *next = prev; ++ ++ /* Unlock in case that we don't affect real-time tasks or ++ * if nothing changed and finish_switch won't be called. 
++ */ ++ if (prev == *next || (!is_realtime(prev) && !*next)) ++ queue_unlock(&adaptive_lock); ++ ++ return 0; ++} ++ ++ ++/* _finish_switch - we just finished the switch away from prev ++ */ ++static void adaptive_finish_switch(struct task_struct *prev) ++{ ++ cpu_entry_t* entry = &__get_cpu_var(adaptive_cpu_entries); ++ ++ if (is_realtime(current)) ++ entry->scheduled = current; ++ else ++ entry->scheduled = NULL; ++ ++ prev->rt_param.scheduled_on = NO_CPU; ++ current->rt_param.scheduled_on = smp_processor_id(); ++ ++ /* unlock in case schedule() left it locked */ ++ if (is_realtime(current) || is_realtime(prev)) ++ queue_unlock(&adaptive_lock); ++} ++ ++ ++/* Prepare a task for running in RT mode ++ * Enqueues the task into master queue data structure ++ * returns ++ * -EPERM if task is not TASK_STOPPED ++ */ ++static long adaptive_prepare_task(struct task_struct * t) ++{ ++ unsigned long flags; ++ ++ TRACE("adaptive: prepare task %d\n", t->pid); ++ ++ if (t->state == TASK_STOPPED) { ++ __setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1); ++ ++ t->rt_param.scheduled_on = NO_CPU; ++ t->rt_param.linked_on = NO_CPU; ++ if (t->rt_param.no_service_levels) { ++ t->rt_param.predictor_state.estimate = ++ get_sl(t, 0).weight; ++ } else ++ t->rt_param.predictor_state.estimate = ++ _frac(get_exec_cost(t), get_rt_period(t)); ++ ++ TRACE_TASK(t, "est_weight=" _FP_ "\n", get_est_weight(t)); ++ ++ if (get_rt_mode() == MODE_RT_RUN) ++ /* The action is already on. ++ * Prepare immediate release ++ */ ++ edf_release_now(t); ++ /* The task should be running in the queue, otherwise signal ++ * code will try to wake it up with fatal consequences. ++ */ ++ t->state = TASK_RUNNING; ++ ++ queue_lock_irqsave(&adaptive_lock, flags); ++ total_weight = _add(total_weight, get_est_weight(t)); ++ requeue(t); ++ queue_unlock_irqrestore(&adaptive_lock, flags); ++ return 0; ++ } ++ else ++ return -EPERM; ++} ++ ++static void adaptive_wake_up_task(struct task_struct *task) ++{ ++ unsigned long flags; ++ /* We must determine whether task should go into the release ++ * queue or into the ready queue. It may enter the ready queue ++ * if it has credit left in its time slice and has not yet reached ++ * its deadline. If it is now passed its deadline we assume this the ++ * arrival of a new sporadic job and thus put it in the ready queue ++ * anyway.If it has zero budget and the next release is in the future ++ * it has to go to the release queue. ++ */ ++ ++ TRACE("adaptive: %d unsuspends\n", task->pid); ++ ++ task->state = TASK_RUNNING; ++ ++ if (is_tardy(task)) { ++ /* new sporadic release */ ++ edf_release_now(task); ++ sched_trace_job_release(task); ++ } ++ else if (task->time_slice) ++ /* came back in time before deadline */ ++ set_rt_flags(task, RT_F_RUNNING); ++ ++ queue_lock_irqsave(&adaptive_lock, flags); ++ total_weight = _add(total_weight, get_est_weight(task)); ++ adaptive_job_arrival(task); ++ queue_unlock_irqrestore(&adaptive_lock, flags); ++} ++ ++static void adaptive_task_blocks(struct task_struct *t) ++{ ++ unsigned long flags; ++ ++ /* unlink if necessary */ ++ queue_lock_irqsave(&adaptive_lock, flags); ++ total_weight = _sub(total_weight, get_est_weight(t)); ++ unlink(t); ++ queue_unlock_irqrestore(&adaptive_lock, flags); ++ ++ BUG_ON(!is_realtime(t)); ++ ++ TRACE("task %d suspends\n", t->pid); ++ ++ BUG_ON(t->rt_list.next != LIST_POISON1); ++ BUG_ON(t->rt_list.prev != LIST_POISON2); ++} ++ ++ ++/* When _tear_down is called, the task should not be in any queue any more ++ * as it must have blocked first. 
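/*
 * Illustrative sketch (not taken from the patch): the decision
 * adaptive_wake_up_task() makes when a task resumes.  is_tardy() and
 * edf_release_now() are defined elsewhere; the model below treats
 * "tardy" as "now is past the current deadline" and a re-release as
 * "release = now, deadline = now + period".
 */
#include <stdio.h>

struct job {
        unsigned long release, deadline, period;
        unsigned long budget;           /* remaining time slice */
};

static void wake_up(struct job *j, unsigned long now)
{
        if (now > j->deadline) {
                /* new sporadic job: release it immediately */
                j->release = now;
                j->deadline = now + j->period;
                printf("tardy: new release at %lu, deadline %lu\n",
                       j->release, j->deadline);
        } else if (j->budget) {
                /* back before the deadline with budget left: keep running */
                printf("resume current job (deadline %lu)\n", j->deadline);
        } else {
                /* no budget and deadline not reached: wait in release queue */
                printf("wait for next release at %lu\n", j->release);
        }
}

int main(void)
{
        struct job j = { 0, 100, 100, 3 };

        wake_up(&j, 40);        /* resumes the current job */
        wake_up(&j, 150);       /* past the deadline: re-released */
        return 0;
}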
We don't have any internal state for the task, ++ * it is all in the task_struct. ++ */ ++static long adaptive_tear_down(struct task_struct * t) ++{ ++ BUG_ON(!is_realtime(t)); ++ TRACE_TASK(t, "RIP\n"); ++ BUG_ON(t->array); ++ BUG_ON(t->rt_list.next != LIST_POISON1); ++ BUG_ON(t->rt_list.prev != LIST_POISON2); ++ return 0; ++} ++ ++static int adaptive_mode_change(int new_mode) ++{ ++ unsigned long flags; ++ int cpu; ++ cpu_entry_t *entry; ++ struct task_struct* t; ++ struct list_head* pos; ++ ++ if (new_mode == MODE_RT_RUN) { ++ queue_lock_irqsave(&adaptive_lock, flags); ++ ++ system_capacity = FP(0); ++ for_each_online_cpu(cpu) ++ system_capacity = _add(system_capacity, FP(1)); ++ ++ __rerelease_all(&adaptive, edf_release_at); ++ ++ total_weight = FP(0); ++ list_for_each(pos, &adaptive.release_queue) { ++ t = list_entry(pos, struct task_struct, rt_list); ++ total_weight = _add(total_weight, get_est_weight(t)); ++ } ++ TRACE("adaptive: total weight: " _FP_ ++ " (at mode change)\n", total_weight); ++ ++ ++ /* get old cruft out of the way in case we reenter real-time ++ * mode for a second time ++ */ ++ while (!list_empty(&adaptive_cpu_queue)) ++ list_del(adaptive_cpu_queue.next); ++ /* reinitialize */ ++ for_each_online_cpu(cpu) { ++ entry = &per_cpu(adaptive_cpu_entries, cpu); ++ atomic_set(&entry->will_schedule, 0); ++ entry->linked = NULL; ++ entry->scheduled = NULL; ++ list_add(&entry->list, &adaptive_cpu_queue); ++ } ++ ++ adaptive_optimize(); ++ ++ queue_unlock_irqrestore(&adaptive_lock, flags); ++ ++ } ++ return 0; ++} ++ ++ ++typedef enum { ++ ADAPTIVE_SET_MIN_OPT_SEP = 1 ++} adaptive_cmds_t; ++ ++ ++static int adaptive_setup(int cmd, void __user *up) ++{ ++ unsigned int error = -EINVAL; ++ unsigned int val; ++ ++ if (copy_from_user(&val, up, sizeof(unsigned int))) { ++ error = -EFAULT; ++ goto out; ++ } ++ ++ switch (cmd) { ++ case ADAPTIVE_SET_MIN_OPT_SEP: ++ optimizer_min_invocation_sep = val; ++ TRACE("adaptive: min opt sep set to %d\n", ++ optimizer_min_invocation_sep); ++ return 0; ++ break; ++ } ++ ++out: ++ return error; ++} ++ ++ ++/* Plugin object */ ++static sched_plugin_t s_plugin __cacheline_aligned_in_smp = { ++ .ready_to_use = 0 ++}; ++ ++ ++/* ++ * Plugin initialization code. 
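++ * (Note: ready_to_use doubles as an init-once guard, so a repeated
++ * call to init_adaptive_plugin() would simply return the already
++ * populated singleton, e.g.:
++ *     sched_plugin_t *a = init_adaptive_plugin();
++ *     sched_plugin_t *b = init_adaptive_plugin();    a == b holds.
++ * The per-CPU entries and the adaptive rt_domain are only set up on
++ * the first call.)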
++ */ ++#define INIT_SCHED_PLUGIN (struct sched_plugin){ \ ++ .plugin_name = "ADAPTIVE", \ ++ .ready_to_use = 1, \ ++ .scheduler_tick = adaptive_scheduler_tick, \ ++ .prepare_task = adaptive_prepare_task, \ ++ .sleep_next_period = edf_sleep_next_period, \ ++ .tear_down = adaptive_tear_down, \ ++ .schedule = adaptive_schedule, \ ++ .finish_switch = adaptive_finish_switch, \ ++ .mode_change = adaptive_mode_change, \ ++ .wake_up_task = adaptive_wake_up_task, \ ++ .task_blocks = adaptive_task_blocks, \ ++ .scheduler_setup = adaptive_setup \ ++} ++ ++ ++sched_plugin_t *__init init_adaptive_plugin(void) ++{ ++ int cpu; ++ cpu_entry_t *entry; ++ ++ /* magic values given in the paper */ ++ fc_a = _frac( 102, 1000); ++ fc_b = _frac( 303, 1000); ++ ++ optimizer_period = 1000; ++ optimizer_min_invocation_sep = 200; ++ task_error_threshold = _frac(1, 2); ++ ++ if (!s_plugin.ready_to_use) ++ { ++ /* initialize CPU state */ ++ for (cpu = 0; cpu < NR_CPUS; cpu++) { ++ entry = &per_cpu(adaptive_cpu_entries, cpu); ++ atomic_set(&entry->will_schedule, 0); ++ entry->linked = NULL; ++ entry->scheduled = NULL; ++ entry->cpu = cpu; ++ } ++ ++ queue_lock_init(&adaptive_lock); ++ edf_domain_init(&adaptive, NULL); ++ s_plugin = INIT_SCHED_PLUGIN; ++ } ++ return &s_plugin; ++} ++ ++ +diff --git a/kernel/sched_edf_hsb.c b/kernel/sched_edf_hsb.c +new file mode 100644 +index 0000000..a2f670d +--- /dev/null ++++ b/kernel/sched_edf_hsb.c +@@ -0,0 +1,1724 @@ ++/* ++ * kernel/sched_edf_hsb.c ++ * ++ * Implementation of the EDF-HSB scheduler plugin. ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++ ++/* undefine to remove capacity sharing */ ++#define HSB_CAP_SHARE_ENABLED ++ ++/* fake server PIDs */ ++#define HRT_BASE_PID 50000 ++#define SRT_BASE_PID 60000 ++ ++ ++/******************************************************************************/ ++/* Capacity queue */ ++/******************************************************************************/ ++ ++int cap_check_resched(jiffie_t deadline); ++ ++typedef struct { ++ int budget; ++ jiffie_t deadline; ++ pid_t donor; ++ ++ struct list_head list; ++} capacity_t; ++ ++typedef struct { ++ spinlock_t lock; ++ struct list_head queue; ++} capacity_queue_t; ++ ++#define next_cap(q) list_entry((q)->queue.next, capacity_t, list) ++ ++void capacity_queue_init(capacity_queue_t* queue) ++{ ++ queue->lock = SPIN_LOCK_UNLOCKED; ++ INIT_LIST_HEAD(&queue->queue); ++} ++ ++void __add_capacity(capacity_queue_t* queue, capacity_t *cap) ++{ ++ struct list_head* pos; ++ capacity_t* queued; ++ ++ list_for_each_prev(pos, &queue->queue) { ++ queued = list_entry(pos, capacity_t, list); ++ if ( time_before_eq(queued->deadline, cap->deadline)) { ++ __list_add(&cap->list, pos, pos->next); ++ return; ++ } ++ } ++ list_add(&cap->list, &queue->queue); ++} ++ ++int __capacity_available(capacity_queue_t* queue) ++{ ++ capacity_t *cap; ++ ++ while (!list_empty(&queue->queue)) { ++ cap = list_entry(queue->queue.next, capacity_t, list); ++ ++ ++ if (time_before_eq(cap->deadline, jiffies)) { ++ list_del(queue->queue.next); ++ kfree(cap); ++ cap = NULL; ++ } else ++ break; ++ } ++ ++ return !list_empty(&queue->queue); ++} ++ ++void __return_capacity(capacity_queue_t* queue, capacity_t *cap) ++{ ++ if (!cap->budget || time_before_eq(cap->deadline, jiffies)) ++ kfree(cap); ++ else ++ __add_capacity(queue, cap); ++} ++ ++ ++void return_capacity(capacity_queue_t* queue, capacity_t *cap) ++ ++{ ++ unsigned long flags; ++ ++ if (!cap->budget || 
time_before_eq(cap->deadline, jiffies)) ++ kfree(cap); ++ else { ++ spin_lock_irqsave(&queue->lock, flags); ++ __add_capacity(queue, cap); ++ spin_unlock_irqrestore(&queue->lock, flags); ++ } ++} ++ ++ ++#define MIN_TIME_DELTA 1 ++#define MIN_BUDGET 1 ++ ++#ifdef HSB_CAP_SHARE_ENABLED ++void release_capacity(capacity_queue_t* queue, unsigned int budget, ++ jiffie_t deadline, struct task_struct* t) ++{ ++ capacity_t* cap; ++ unsigned long flags; ++ ++ if (deadline >= jiffies + MIN_TIME_DELTA && budget >= MIN_BUDGET) { ++ cap = kmalloc(sizeof(capacity_t), GFP_ATOMIC); ++ if (cap) { ++ cap->budget = budget; ++ cap->deadline = deadline; ++ if (t) ++ cap->donor = t->pid; ++ else ++ cap->donor = 0; ++ spin_lock_irqsave(&queue->lock, flags); ++ __add_capacity(queue, cap); ++ cap_check_resched(next_cap(queue)->deadline); ++ spin_unlock_irqrestore(&queue->lock, flags); ++ if (t) ++ sched_trace_capacity_release(t); ++ } ++ } ++} ++ ++void __release_capacity(capacity_queue_t* queue, unsigned int budget, ++ jiffie_t deadline, struct task_struct* t) ++{ ++ capacity_t* cap; ++ ++ if (deadline >= jiffies + MIN_TIME_DELTA && budget >= MIN_BUDGET) { ++ cap = kmalloc(sizeof(capacity_t), GFP_ATOMIC); ++ if (cap) { ++ cap->budget = budget; ++ cap->deadline = deadline; ++ if (t) ++ cap->donor = t->pid; ++ else ++ cap->donor = 0; ++ /* no locking, no resched check -- called from schedule */ ++ __add_capacity(queue, cap); ++ if (t) ++ sched_trace_capacity_release(t); ++ } ++ } ++} ++ ++ ++capacity_t* __take_capacity(capacity_queue_t* queue, jiffie_t deadline, int deadline_matters) ++{ ++ capacity_t* cap = NULL; ++ ++ while (!list_empty(&queue->queue)) { ++ cap = list_entry(queue->queue.next, capacity_t, list); ++ ++ if (deadline_matters && time_before(deadline, cap->deadline)) { ++ cap = NULL; ++ break; ++ } ++ ++ list_del(queue->queue.next); ++ if (cap->deadline > jiffies) { ++ if (cap->deadline - jiffies < cap->budget) ++ cap->budget = cap->deadline - jiffies; ++ break; ++ } ++ kfree(cap); ++ cap = NULL; ++ } ++ ++ return cap; ++} ++#else ++ ++/* no capacity sharing */ ++void release_capacity(capacity_queue_t* queue, unsigned int budget, ++ jiffie_t deadline, struct task_struct* t) ++{ ++} ++ ++capacity_t* __take_capacity(capacity_queue_t* queue, jiffie_t deadline, int deadline_matters) ++{ ++ return NULL; ++} ++#endif ++ ++ ++/******************************************************************************/ ++/* server abstractions */ ++/******************************************************************************/ ++ ++ ++/* hrt_server_t - Abstraction of a hard real-time server. ++ * ++ * One HRT server per CPU. If it is unused period and wcet may be zero. ++ * HRT servers are strictly periodic and retain their budget. ++ */ ++typedef struct { ++ rt_domain_t domain; ++ ++ unsigned int period; ++ unsigned int wcet; ++ ++ jiffie_t deadline; ++ int budget; ++} hrt_server_t; ++ ++/* be_server_t - Abstraction of best-effort server. ++ * ++ * This is pretty much only an accounting abstraction. ++ */ ++typedef struct { ++ unsigned int period; ++ unsigned int wcet; ++ ++ jiffie_t deadline; ++ jiffie_t release; ++ int budget; ++ ++ struct list_head list; ++ pid_t pid; ++} be_server_t; ++ ++/* cast to int to allow for negative slack, i.e. 
tardiness */ ++#define server_slack(srv) \ ++ ( ((int) (srv)->deadline - (int) jiffies) - (int) (srv)->budget ) ++ ++typedef struct { ++ int cpu; ++ ++ hrt_server_t hrt; ++ be_server_t* be; ++ capacity_t* cap; ++ ++ task_class_t exec_class; ++ jiffie_t cur_deadline; ++ atomic_t will_schedule; ++ ++ struct list_head list; ++ spinlock_t lock; ++} cpu_state_t; ++ ++ ++DEFINE_PER_CPU(cpu_state_t, hsb_cpu_state); ++ ++#define hrt_dom(cpu) (&per_cpu(hsb_cpu_state, cpu).hrt.domain) ++ ++#define set_will_schedule() \ ++ (atomic_set(&__get_cpu_var(hsb_cpu_state).will_schedule, 1)) ++#define clear_will_schedule() \ ++ (atomic_set(&__get_cpu_var(hsb_cpu_state).will_schedule, 0)) ++#define test_will_schedule(cpu) \ ++ (atomic_read(&per_cpu(hsb_cpu_state, cpu).will_schedule)) ++ ++ ++static void prepare_hrt_release(hrt_server_t *srv, jiffie_t start) ++{ ++ if (srv->period && srv->wcet) { ++ srv->deadline = start; ++ srv->budget = 0; ++ } ++} ++ ++static void check_for_hrt_release(hrt_server_t *srv) { ++ if (srv->wcet && srv->period && ++ time_before_eq(srv->deadline, jiffies)) { ++ srv->deadline += srv->period; ++ srv->budget = srv->wcet; ++ sched_trace_server_release(HRT_BASE_PID + smp_processor_id(), ++ srv->budget, srv->period, RT_CLASS_HARD); ++ } ++} ++ ++/* A HRT client is eligible if either its deadline is before the ++ * the server deadline or if the server has zero slack. The server ++ * must have budget left. ++ */ ++static inline int hrt_client_eligible(hrt_server_t *srv) ++{ ++ if (!list_empty(&srv->domain.ready_queue)) ++ return srv->budget && ( ++ time_before(get_deadline(next_ready(&srv->domain)), ++ srv->deadline) ++ || server_slack(srv) <= 0); ++ else ++ return 0; ++} ++ ++static void hsb_cpu_state_init(cpu_state_t* cpu_state, ++ check_resched_needed_t check, ++ int cpu) ++{ ++ edf_domain_init(&cpu_state->hrt.domain, check); ++ cpu_state->hrt.budget = 0; ++ cpu_state->hrt.deadline = 0; ++ cpu_state->hrt.period = 0; ++ cpu_state->hrt.wcet = 0; ++ ++ cpu_state->be = NULL; ++ cpu_state->cap = NULL; ++ ++ cpu_state->cur_deadline = 0; ++ cpu_state->cpu = cpu; ++ cpu_state->lock = SPIN_LOCK_UNLOCKED; ++ cpu_state->exec_class = RT_CLASS_BEST_EFFORT; ++ ++ atomic_set(&cpu_state->will_schedule, 0); ++ INIT_LIST_HEAD(&cpu_state->list); ++} ++ ++/******************************************************************************/ ++/* BE queue functions - mostly like edf_common.c */ ++/******************************************************************************/ ++ ++#define be_earlier_deadline(a, b) (time_before(\ ++ (a)->deadline, (b)->deadline)) ++#define be_earlier_release(a, b) (time_before(\ ++ (a)->release, (b)->release)) ++ ++ ++static void be_add_ready(rt_domain_t* edf, be_server_t *new) ++{ ++ unsigned long flags; ++ struct list_head *pos; ++ be_server_t *queued; ++ unsigned int passed = 0; ++ ++ BUG_ON(!new); ++ /* first we need the write lock for rt_ready_queue */ ++ write_lock_irqsave(&edf->ready_lock, flags); ++ /* find a spot where our deadline is earlier than the next */ ++ list_for_each(pos, &edf->ready_queue) { ++ queued = list_entry(pos, be_server_t, list); ++ if (unlikely(be_earlier_deadline(new, queued))) { ++ __list_add(&new->list, pos->prev, pos); ++ goto out; ++ } ++ passed++; ++ } ++ /* if we get to this point either the list is empty or new has the ++ * lowest priority. Let's add it to the end. 
*/ ++ list_add_tail(&new->list, &edf->ready_queue); ++ out: ++ if (!passed) ++ edf->check_resched(edf); ++ write_unlock_irqrestore(&edf->ready_lock, flags); ++} ++ ++static be_server_t* be_take_ready(rt_domain_t* edf) ++{ ++ be_server_t *t = NULL; ++ ++ if (!list_empty(&edf->ready_queue)) { ++ t = list_entry(edf->ready_queue.next, be_server_t, list); ++ /* kick it out of the ready list */ ++ list_del(&t->list); ++ } ++ return t; ++} ++ ++/*static be_server_t* get_be_server(rt_domain_t* edf) ++{ ++ be_server_t *t = NULL; ++ ++ spin_lock(&edf->release_lock); ++ write_lock(&edf->ready_lock); ++ t = be_take_ready(edf); ++ ++ if (!t && !list_empty(&edf->release_queue)) { ++ t = list_entry(edf->release_queue.next, be_server_t, list); ++ ++ list_del(&t->list); ++ } ++ ++ write_unlock(&edf->ready_lock); ++ spin_unlock(&edf->release_lock); ++ return t; ++}*/ ++ ++static void be_add_release(rt_domain_t* edf, be_server_t *srv) ++{ ++ unsigned long flags; ++ struct list_head *pos; ++ be_server_t *queued; ++ ++ spin_lock_irqsave(&edf->release_lock, flags); ++ list_for_each_prev(pos, &edf->release_queue) { ++ queued = list_entry(pos, be_server_t, list); ++ if ((unlikely(be_earlier_release(queued, srv)))) { ++ /* the task at pos has an earlier release */ ++ /* insert the new task in behind it */ ++ __list_add(&srv->list, pos, pos->next); ++ goto out; ++ } ++ } ++ ++ list_add(&srv->list, &edf->release_queue); ++ out: ++ spin_unlock_irqrestore(&edf->release_lock, flags); ++} ++ ++static void be_try_release_pending(rt_domain_t* edf) ++{ ++ unsigned long flags; ++ struct list_head *pos, *save; ++ be_server_t *queued; ++ ++ if (spin_trylock_irqsave(&edf->release_lock, flags)) { ++ list_for_each_safe(pos, save, &edf->release_queue) { ++ queued = list_entry(pos, be_server_t, list); ++ if (likely(time_before_eq( ++ queued->release, ++ jiffies))) { ++ list_del(pos); ++ be_add_ready(edf, queued); ++ sched_trace_server_release( ++ queued->pid, queued->budget, ++ queued->period, RT_CLASS_BEST_EFFORT); ++ } else ++ /* the release queue is ordered */ ++ break; ++ } ++ spin_unlock_irqrestore(&edf->release_lock, flags); ++ } ++} ++ ++static void be_prepare_new_release(be_server_t *t, jiffie_t start) { ++ t->release = start; ++ t->deadline = t->release + t->period; ++ t->budget = t->wcet; ++} ++ ++static void be_prepare_new_releases(rt_domain_t *edf, jiffie_t start) ++{ ++ unsigned long flags; ++ struct list_head tmp_list; ++ struct list_head *pos, *n; ++ be_server_t *t; ++ ++ INIT_LIST_HEAD(&tmp_list); ++ ++ spin_lock_irqsave(&edf->release_lock, flags); ++ write_lock(&edf->ready_lock); ++ ++ ++ while (!list_empty(&edf->release_queue)) { ++ pos = edf->release_queue.next; ++ list_del(pos); ++ list_add(pos, &tmp_list); ++ } ++ ++ while (!list_empty(&edf->ready_queue)) { ++ pos = edf->ready_queue.next; ++ list_del(pos); ++ list_add(pos, &tmp_list); ++ ++ } ++ ++ write_unlock(&edf->ready_lock); ++ spin_unlock_irqrestore(&edf->release_lock, flags); ++ ++ list_for_each_safe(pos, n, &tmp_list) { ++ t = list_entry(pos, be_server_t, list); ++ list_del(pos); ++ be_prepare_new_release(t, start); ++ be_add_release(edf, t); ++ } ++ ++} ++ ++static void be_prepare_for_next_period(be_server_t *t) ++{ ++ BUG_ON(!t); ++ /* prepare next release */ ++ t->release = t->deadline; ++ t->deadline += t->period; ++ t->budget = t->wcet; ++} ++ ++#define be_next_ready(edf) \ ++ list_entry((edf)->ready_queue.next, be_server_t, list) ++ ++ ++/* need_to_preempt - check whether the task t needs to be preempted by a ++ * best-effort server. 
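++ * (Worked example: a CPU whose exec_class is RT_CLASS_SOFT with
++ * cur_deadline = 150 is preempted by a ready BE server with deadline
++ * 120, but not by one with deadline 180; a CPU in best-effort
++ * background mode is preempted by any ready BE server.)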
++ */ ++static inline int be_preemption_needed(rt_domain_t* edf, cpu_state_t* state) ++{ ++ /* we need the read lock for rt_ready_queue */ ++ if (!list_empty(&edf->ready_queue)) ++ { ++ ++ if (state->exec_class == RT_CLASS_SOFT) { ++ if (state->cap) ++ return time_before( ++ be_next_ready(edf)->deadline, ++ state->cap->deadline); ++ else ++ return time_before( ++ be_next_ready(edf)->deadline, ++ state->cur_deadline); ++ } else ++ return 1; ++ } ++ return 0; ++} ++ ++static void be_enqueue(rt_domain_t* edf, be_server_t* srv) ++{ ++ int new_release = 0; ++ if (!srv->budget) { ++ be_prepare_for_next_period(srv); ++ new_release = 1; ++ } ++ ++ if (time_before_eq(srv->release, jiffies) && ++ get_rt_mode() == MODE_RT_RUN) { ++ be_add_ready(edf, srv); ++ if (new_release) ++ sched_trace_server_release( ++ srv->pid, srv->budget, ++ srv->period, RT_CLASS_BEST_EFFORT); ++ } else ++ be_add_release(edf, srv); ++} ++ ++static void be_preempt(rt_domain_t *be, cpu_state_t *state) ++{ ++ be_server_t *srv; ++ ++ spin_lock(&state->lock); ++ srv = state->be; ++ state->be = NULL; ++ spin_unlock(&state->lock); ++ ++ /* add outside of lock to avoid deadlock */ ++ if (srv) ++ be_enqueue(be, srv); ++} ++ ++ ++/******************************************************************************/ ++/* Actual HSB implementation */ ++/******************************************************************************/ ++ ++/* always acquire the cpu lock as the last lock to avoid deadlocks */ ++static spinlock_t hsb_cpu_lock = SPIN_LOCK_UNLOCKED; ++/* the cpus queue themselves according to priority in here */ ++static LIST_HEAD(hsb_cpu_queue); ++ ++ ++/* the global soft real-time domain */ ++static rt_domain_t srt; ++/* the global best-effort server domain ++ * belongs conceptually to the srt domain, but has ++ * be_server_t* queued instead of tast_t* ++ */ ++static rt_domain_t be; ++ ++static rt_domain_t hsb_fifo; ++ ++static capacity_queue_t cap_queue; ++ ++ ++ ++ ++/* adjust_cpu_queue - Move the cpu entry to the correct place to maintain ++ * order in the cpu queue. ++ * ++ */ ++static void adjust_cpu_queue(task_class_t class, jiffie_t deadline, ++ be_server_t *be) ++{ ++ struct list_head *pos; ++ cpu_state_t *other; ++ cpu_state_t *entry; ++ ++ spin_lock(&hsb_cpu_lock); ++ ++ entry = &__get_cpu_var(hsb_cpu_state); ++ ++ spin_lock(&entry->lock); ++ entry->exec_class = class; ++ entry->cur_deadline = deadline; ++ entry->be = be; ++ ++ spin_unlock(&entry->lock); ++ ++ ++ ++ if (be) ++ sched_trace_server_scheduled( ++ be->pid, RT_CLASS_BEST_EFFORT, be->budget, ++ be->deadline); ++ else if (class == RT_CLASS_HARD) ++ sched_trace_server_scheduled( ++ HRT_BASE_PID + smp_processor_id(), RT_CLASS_HARD, ++ entry->hrt.budget, entry->hrt.deadline); ++ ++ list_del(&entry->list); ++ /* If we do not execute real-time jobs we just move ++ * to the end of the queue . ++ * If we execute hard real-time jobs we move the start ++ * of the queue. 
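++	 * (Resulting order, front to back: CPUs serving HRT work, then
++	 * CPUs serving SRT/BE-server work sorted by increasing
++	 * cur_deadline, then CPUs running background jobs. The global
++	 * check_resched helpers below (srt, be, cap, fifo) always inspect
++	 * the tail of hsb_cpu_queue, i.e. the cheapest CPU to preempt.)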
++ */ ++ ++ switch (entry->exec_class) { ++ case RT_CLASS_HARD: ++ list_add(&entry->list, &hsb_cpu_queue); ++ break; ++ ++ case RT_CLASS_SOFT: ++ list_for_each(pos, &hsb_cpu_queue) { ++ other = list_entry(pos, cpu_state_t, list); ++ if (other->exec_class > RT_CLASS_SOFT || ++ time_before_eq(entry->cur_deadline, ++ other->cur_deadline)) ++ { ++ __list_add(&entry->list, pos->prev, pos); ++ goto out; ++ } ++ } ++ /* possible fall through if lowest SRT priority */ ++ ++ case RT_CLASS_BEST_EFFORT: ++ list_add_tail(&entry->list, &hsb_cpu_queue); ++ break; ++ ++ default: ++ /* something wrong in the variable */ ++ BUG(); ++ } ++ out: ++ spin_unlock(&hsb_cpu_lock); ++} ++ ++ ++/* hrt_check_resched - check whether the HRT server on given CPU needs to ++ * preempt the running task. ++ */ ++static int hrt_check_resched(rt_domain_t *edf) ++{ ++ hrt_server_t *srv = container_of(edf, hrt_server_t, domain); ++ cpu_state_t *state = container_of(srv, cpu_state_t, hrt); ++ int ret = 0; ++ ++ spin_lock(&state->lock); ++ ++ if (hrt_client_eligible(srv)) { ++ if (state->exec_class > RT_CLASS_HARD || ++ time_before( ++ get_deadline(next_ready(edf)), ++ state->cur_deadline) ++ ) { ++ if (state->cpu == smp_processor_id()) ++ set_tsk_need_resched(current); ++ else ++ smp_send_reschedule(state->cpu); ++ } ++ } ++ ++ spin_unlock(&state->lock); ++ return ret; ++} ++ ++ ++/* srt_check_resched - Check whether another CPU needs to switch to a SRT task. ++ * ++ * The function only checks and kicks the last CPU. It will reschedule and ++ * kick the next if necessary, and so on. The caller is responsible for making ++ * sure that it is not the last entry or that a reschedule is not necessary. ++ * ++ * Caller must hold edf->ready_lock! ++ */ ++static int srt_check_resched(rt_domain_t *edf) ++{ ++ cpu_state_t *last; ++ int ret = 0; ++ ++ spin_lock(&hsb_cpu_lock); ++ ++ if (!list_empty(&srt.ready_queue)) { ++ last = list_entry(hsb_cpu_queue.prev, cpu_state_t, list); ++ /* guard against concurrent updates */ ++ spin_lock(&last->lock); ++ if (last->exec_class == RT_CLASS_BEST_EFFORT || ( ++ last->exec_class == RT_CLASS_SOFT && ++ time_before(get_deadline(next_ready(&srt)), ++ last->cur_deadline))) ++ { ++ if (smp_processor_id() == last->cpu) ++ set_tsk_need_resched(current); ++ else ++ if (!test_will_schedule(last->cpu)) ++ smp_send_reschedule(last->cpu); ++ ret = 1; ++ } ++ spin_unlock(&last->lock); ++ } ++ ++ spin_unlock(&hsb_cpu_lock); ++ return ret; ++} ++ ++ ++/* be_check_resched - Check whether another CPU needs to switch to a BE server.. ++ * ++ * Caller must hold edf->ready_lock! 
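++ * (Same pattern as srt_check_resched() above: the tail CPU is kicked
++ * if it executes background work, or SRT/server work with a later
++ * deadline than the earliest-deadline ready BE server.)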
++ */ ++static int be_check_resched(rt_domain_t *edf) ++{ ++ cpu_state_t *last; ++ int soft, bg; ++ int ret = 0; ++ ++ spin_lock(&hsb_cpu_lock); ++ ++ if (!list_empty(&be.ready_queue)) { ++ last = list_entry(hsb_cpu_queue.prev, cpu_state_t, list); ++ /* guard against concurrent updates */ ++ spin_lock(&last->lock); ++ ++ bg = last->exec_class == RT_CLASS_BEST_EFFORT; ++ soft = last->exec_class == RT_CLASS_SOFT; ++ ++ if (bg || (soft && time_before(be_next_ready(&be)->deadline, ++ last->cur_deadline))) ++ { ++ if (smp_processor_id() == last->cpu) ++ set_tsk_need_resched(current); ++ else ++ if (!test_will_schedule(last->cpu)) ++ smp_send_reschedule(last->cpu); ++ ret = 1; ++ } ++ ++ spin_unlock(&last->lock); ++ } ++ ++ spin_unlock(&hsb_cpu_lock); ++ return ret; ++} ++ ++ ++int cap_check_resched(jiffie_t deadline) ++{ ++ unsigned long flags; ++ cpu_state_t *last; ++ int soft, bg; ++ int ret = 0; ++ ++ ++ ++ if (get_rt_mode() == MODE_RT_RUN) { ++ spin_lock_irqsave(&hsb_cpu_lock, flags); ++ ++ last = list_entry(hsb_cpu_queue.prev, cpu_state_t, list); ++ /* guard against concurrent updates */ ++ spin_lock(&last->lock); ++ ++ bg = last->exec_class == RT_CLASS_BEST_EFFORT; ++ soft = last->exec_class == RT_CLASS_SOFT; ++ ++ if (bg || (soft && time_before(deadline, ++ last->cur_deadline))) ++ { ++ if (smp_processor_id() == last->cpu) ++ set_tsk_need_resched(current); ++ else ++ if (!test_will_schedule(last->cpu)) ++ smp_send_reschedule(last->cpu); ++ ret = 1; ++ } ++ ++ spin_unlock(&last->lock); ++ ++ spin_unlock_irqrestore(&hsb_cpu_lock, flags); ++ } ++ return ret; ++} ++ ++int fifo_check_resched(void) ++{ ++ unsigned long flags; ++ cpu_state_t *last; ++ int ret = 0; ++ ++ if (get_rt_mode() == MODE_RT_RUN) { ++ spin_lock_irqsave(&hsb_cpu_lock, flags); ++ ++ ++ last = list_entry(hsb_cpu_queue.prev, cpu_state_t, list); ++ /* guard against concurrent updates */ ++ ++ spin_lock(&last->lock); ++ ++ if (last->exec_class == RT_CLASS_BEST_EFFORT) ++ { ++ if (smp_processor_id() == last->cpu) ++ set_tsk_need_resched(current); ++ else ++ if (!test_will_schedule(last->cpu)) ++ smp_send_reschedule(last->cpu); ++ ret = 1; ++ } ++ ++ spin_unlock(&last->lock); ++ ++ spin_unlock_irqrestore(&hsb_cpu_lock, flags); ++ } ++ return ret; ++} ++ ++ ++ ++static inline int hsb_preemption_needed(rt_domain_t* edf, cpu_state_t* state) ++{ ++ /* we need the read lock for rt_ready_queue */ ++ if (!list_empty(&edf->ready_queue)) ++ { ++ if (state->exec_class == RT_CLASS_SOFT) { ++ if (state->cap) ++ return time_before(get_deadline(next_ready(edf)) ++ , state->cap->deadline); ++ else ++ return time_before(get_deadline(next_ready(edf)) ++ , state->cur_deadline); ++ } else ++ return 1; ++ } ++ return 0; ++} ++ ++static inline int cap_preemption_needed(capacity_queue_t* q, cpu_state_t* state) ++{ ++ /* we need the read lock for rt_ready_queue */ ++ if (!list_empty(&q->queue)) ++ { ++ if (state->exec_class == RT_CLASS_SOFT) { ++ if (state->cap) ++ return time_before(next_cap(q)->deadline ++ , state->cap->deadline); ++ else ++ return time_before(next_cap(q)->deadline ++ , state->cur_deadline); ++ } else ++ return 1; ++ } ++ return 0; ++} ++ ++/* hsb_scheduler_tick - this function is called for every local timer ++ * interrupt. 
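++ * (Per tick this amounts to: charge whichever budget currently backs
++ * the CPU -- BE server, spare capacity, HRT server -- as well as the
++ * task's own slice, then process pending releases, then run the
++ * per-class preemption checks; any of these can force a reschedule.)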
++ * ++ * checks whether the current task has expired and checks ++ * whether we need to preempt it if it has not expired ++ */ ++static reschedule_check_t hsb_scheduler_tick(void) ++{ ++ unsigned long flags; ++ struct task_struct *t = current; ++ int resched = 0; ++ ++ cpu_state_t *state = &__get_cpu_var(hsb_cpu_state); ++ ++ /* expire tasks even if not in real-time mode ++ * this makes sure that at the end of real-time mode ++ * no tasks "run away forever". ++ */ ++ ++ /* charge BE server only if we are not running on a spare capacity */ ++ if (state->be && !state->cap && --state->be->budget <= 0) { ++ sched_trace_server_completion(state->be->pid, 0, ++ state->be->deadline, ++ RT_CLASS_BEST_EFFORT); ++ be_preempt(&be, state); ++ resched = 1; ++ } ++ ++ if (state->cap) ++ if (--state->cap->budget <= 0 || ++ time_before_eq(state->cap->deadline, jiffies)) { ++ kfree(state->cap); ++ state->cap = NULL; ++ resched = 1; ++ } ++ ++ if (is_realtime(t)) { ++ if (is_hrt(t) && (--state->hrt.budget <= 0)) { ++ sched_trace_server_completion( ++ HRT_BASE_PID + smp_processor_id(), 0, ++ state->hrt.deadline, RT_CLASS_HARD); ++ resched = 1; ++ } ++ ++ /* account for received service... */ ++ t->rt_param.times.exec_time++; ++ ++ /* ...and charge current budget */ ++ if (!state->cap) { ++ --t->time_slice; ++ /* a task always should be able to finish its job */ ++ BUG_ON(!is_be(t) && !t->time_slice && !job_completed(t)); ++ } ++ ++ if (job_completed(t) || (is_be(t) && !t->time_slice)) { ++ sched_trace_job_completion(t); ++ set_rt_flags(t, RT_F_SLEEP); ++ resched = 1; ++ } ++ } ++ ++ ++ if (get_rt_mode() == MODE_RT_RUN) ++ { ++ try_release_pending(&state->hrt.domain); ++ check_for_hrt_release(&state->hrt); ++ try_release_pending(&srt); ++ be_try_release_pending(&be); ++ ++ if (!resched) ++ switch (state->exec_class) { ++ case RT_CLASS_HARD: ++ read_lock_irqsave(&state->hrt.domain.ready_lock, ++ flags); ++ resched = edf_preemption_needed( ++ &state->hrt.domain, ++ t); ++ read_unlock_irqrestore( ++ &state->hrt.domain.ready_lock, flags); ++ break; ++ ++ case RT_CLASS_SOFT: ++ case RT_CLASS_BEST_EFFORT: ++ local_irq_save(flags); ++ ++ /* check for HRT jobs */ ++ read_lock(&state->hrt.domain.ready_lock); ++ resched = hrt_client_eligible(&state->hrt); ++ read_unlock(&state->hrt.domain.ready_lock); ++ ++ /* check for spare capacities */ ++ if (!resched) { ++ spin_lock(&cap_queue.lock); ++ resched = ++ cap_preemption_needed(&cap_queue, ++ state); ++ spin_unlock(&cap_queue.lock); ++ } ++ ++ /* check for SRT jobs */ ++ if (!resched) { ++ read_lock(&srt.ready_lock); ++ resched = hsb_preemption_needed( ++ &srt, state); ++ read_unlock(&srt.ready_lock); ++ } ++ ++ /* check for BE jobs */ ++ if (!resched) { ++ read_lock(&be.ready_lock); ++ resched = be_preemption_needed( ++ &be, state); ++ read_unlock(&be.ready_lock); ++ } ++ ++ /* check for background jobs */ ++ if (!resched && !is_realtime(current)) ++ resched = jobs_pending(&hsb_fifo); ++ local_irq_restore(flags); ++ break; ++ ++ default: ++ /* something wrong in the variable */ ++ BUG(); ++ } ++ } ++ ++ if (resched) { ++ set_will_schedule(); ++ return FORCE_RESCHED; ++ } else ++ return NO_RESCHED; ++} ++ ++static int schedule_hrt(struct task_struct * prev, ++ struct task_struct ** next, runqueue_t * rq) ++{ ++ unsigned long flags; ++ int deactivate = 1; ++ cpu_state_t *state; ++ ++ ++ state = &__get_cpu_var(hsb_cpu_state); ++ ++ write_lock_irqsave(&state->hrt.domain.ready_lock, flags); ++ ++ ++ if (state->cap) { ++ /* hrt_schedule does not have the cap_queue lock */ ++ 
return_capacity(&cap_queue, state->cap); ++ state->cap = NULL; ++ } ++ ++ if (is_hrt(prev) && is_released(prev) && is_running(prev) ++ && !edf_preemption_needed(&state->hrt.domain, prev)) { ++ /* This really should only happen if the task has ++ * 100% utilization or when we got a bogus/delayed ++ * resched IPI. ++ */ ++ TRACE("HRT: prev will be next, already released\n"); ++ *next = prev; ++ deactivate = 0; ++ } else { ++ /* either not yet released, preempted, or non-rt */ ++ *next = __take_ready(&state->hrt.domain); ++ /* the logic in hsb_schedule makes sure *next must exist ++ * if we get here */ ++ BUG_ON(!*next); ++ /* stick the task into the runqueue */ ++ __activate_task(*next, rq); ++ set_task_cpu(*next, smp_processor_id()); ++ } ++ ++ set_rt_flags(*next, RT_F_RUNNING); ++ adjust_cpu_queue(RT_CLASS_HARD, get_deadline(*next), NULL); ++ clear_will_schedule(); ++ ++ write_unlock_irqrestore(&state->hrt.domain.ready_lock, flags); ++ return deactivate; ++} ++ ++ ++static struct task_struct* find_min_slack_task(struct task_struct *prev, ++ rt_domain_t* edf) ++{ ++ struct list_head *pos; ++ struct task_struct* tsk = NULL; ++ struct task_struct* cur; ++ ++ if (is_realtime(prev) && is_running(prev) && ++ get_rt_flags(prev) != RT_F_SLEEP) ++ tsk = prev; ++ list_for_each(pos, &edf->ready_queue) { ++ cur = list_entry(pos, struct task_struct, rt_list); ++ if (!tsk || task_slack(tsk) > task_slack(cur)) ++ tsk = cur; ++ } ++ return tsk; ++} ++ ++static struct task_struct* null_heuristic(struct task_struct *prev, ++ rt_domain_t* edf, ++ rt_domain_t* fifo) ++{ ++ if (jobs_pending(fifo)) ++ return NULL; ++ else if (!list_empty(&edf->ready_queue)) ++ return list_entry(edf->ready_queue.next, ++ struct task_struct, rt_list); ++ else ++ return NULL; ++} ++ ++/* caller holds all locks ++ */ ++ ++static int schedule_capacity(struct task_struct *prev, ++ struct task_struct **next, runqueue_t *rq) ++{ ++ cpu_state_t *state = &__get_cpu_var(hsb_cpu_state); ++ capacity_t* old; ++ ++ if (state->cap) { ++ old = state->cap; ++ state->cap = __take_capacity(&cap_queue, old->deadline, 1); ++ if (!state->cap) ++ state->cap = old; ++ else ++ __return_capacity(&cap_queue, old); ++ } else ++ state->cap = __take_capacity(&cap_queue, 0, 0); ++ ++ ++ /* pick a task likely to be tardy */ ++ *next = find_min_slack_task(prev, &srt); ++ ++ /* only give away spare capacities if there is no task that ++ * is going to be tardy ++ */ ++ if (*next && task_slack(*next) >= 0) ++ *next = null_heuristic(prev, &srt, &hsb_fifo); ++ if (*next && *next != prev) ++ list_del(&(*next)->rt_list); ++ ++ ++ /* if there is none pick a BE job */ ++ if (!*next) { ++ if (is_realtime(prev) && is_be(prev) && is_running(prev) && ++ get_rt_flags(prev) != RT_F_SLEEP) ++ *next = prev; ++ else ++ *next = take_ready(&hsb_fifo); ++ } ++ ++ if (state->be) ++ be_preempt(&be, state); ++ BUG_ON(!state->cap); ++ if (*next && state->cap->donor) { ++ sched_trace_capacity_allocation( ++ *next, state->cap->budget, state->cap->deadline, ++ state->cap->donor); ++ } ++ ++ return *next != prev; ++} ++ ++ ++ ++#define BG 0 ++#define SRT 1 ++#define BE 2 ++#define CAP 3 ++ ++static inline int what_first(rt_domain_t *be, rt_domain_t *srt, capacity_queue_t* q) ++{ ++ jiffie_t sdl = 0, bdl= 0, cdl = 0, cur; ++ int _srt = !list_empty(&srt->ready_queue); ++ int _be = !list_empty(&be->ready_queue); ++ int _cap = __capacity_available(q); ++ ++ ++ int ret = BG; /* nothing ready => background mode*/ ++ cur = 0; ++ ++ if (_srt) ++ sdl = get_deadline(next_ready(srt)); ++ if (_be) ++ 
bdl = be_next_ready(be)->deadline; ++ if (_cap) ++ cdl = next_cap(q)->deadline; ++ ++ ++ ++ if (_cap) { ++ ret = CAP; ++ cur = cdl; ++ } ++ if (_srt && (time_before(sdl, cur) || !ret)) { ++ ret = SRT; ++ cur = sdl; ++ } ++ if (_be && (time_before(bdl, cur) || !ret)) { ++ ret = BE; ++ cur = bdl; ++ } ++ return ret; ++} ++ ++ ++ ++static int schedule_srt_be_cap(struct task_struct *prev, ++ struct task_struct **next, runqueue_t *rq) ++{ ++ task_class_t class = RT_CLASS_BEST_EFFORT; ++ jiffie_t deadline = 0; ++ unsigned long flags; ++ int deactivate = 1; ++ be_server_t* bes; ++ cpu_state_t* state; ++ int type = BG; ++ ++reschedule: ++ write_lock_irqsave(&srt.ready_lock, flags); ++ write_lock(&be.ready_lock); ++ spin_lock(&cap_queue.lock); ++ ++ ++ state = &__get_cpu_var(hsb_cpu_state); ++ bes = NULL; ++ ++ clear_will_schedule(); ++ ++ if (is_realtime(prev) && (is_released(prev) || is_be(prev)) && ++ is_running(prev) && !hsb_preemption_needed(&srt, state) && ++ !be_preemption_needed(&be, state) ++ ) { ++ /* Our current task's next job has already been ++ * released and has higher priority than the highest ++ * prioriy waiting task; in other words: it is tardy. ++ * We just keep it. ++ */ ++ TRACE("prev will be next, already released\n"); ++ *next = prev; ++ class = prev->rt_param.basic_params.class; ++ deadline = get_deadline(*next); ++ deactivate = 0; ++ } else { ++ /* either not yet released, preempted, or non-rt */ ++ type = what_first(&be, &srt, &cap_queue); ++ switch (type) { ++ case CAP: ++ /* capacity */ ++ deactivate = schedule_capacity(prev, next, rq); ++ deadline = state->cap->deadline; ++ if (*next) ++ class = RT_CLASS_SOFT; ++ else ++ class = RT_CLASS_BEST_EFFORT; ++ break; ++ case BE: ++ /* be */ ++ *next = NULL; ++ bes = be_take_ready(&be); ++ if (bes) { ++ class = RT_CLASS_SOFT; ++ deadline = bes->deadline; ++ *next = take_ready(&hsb_fifo); ++ if (!*next) { ++ /* deactivate */ ++ __release_capacity(&cap_queue, ++ bes->budget, ++ bes->deadline, NULL); ++ bes->budget = 0; ++ barrier(); ++ spin_unlock(&cap_queue.lock); ++ write_unlock(&be.ready_lock); ++ write_unlock_irqrestore(&srt.ready_lock, ++ flags); ++ be_enqueue(&be, bes); ++ goto reschedule; ++ } ++ } ++ break; ++ case SRT: ++ /* srt */ ++ *next = __take_ready(&srt); ++ if (*next) { ++ class = RT_CLASS_SOFT; ++ deadline = get_deadline(*next); ++ } ++ break; ++ case BG: ++ /* background server mode */ ++ class = RT_CLASS_BEST_EFFORT; ++ deadline = 0; ++ *next = take_ready(&hsb_fifo); ++ break; ++ } ++ ++ ++ /* give back capacities */ ++ if (type != CAP && state->cap) { ++ __return_capacity(&cap_queue, state->cap); ++ state->cap = NULL; ++ } ++ if (*next && deactivate) { ++ /* mark the task as executing on this cpu */ ++ set_task_cpu(*next, smp_processor_id()); ++ /* stick the task into the runqueue */ ++ __activate_task(*next, rq); ++ } ++ } ++ ++ adjust_cpu_queue(class, deadline, bes); ++ ++ switch (type) { ++ case BG: ++ break; ++ case BE: ++ be.check_resched(&be); ++ break; ++ case SRT: ++ srt.check_resched(&srt); ++ break; ++ case CAP: ++ if (!list_empty(&cap_queue.queue)) ++ cap_check_resched(list_entry(cap_queue.queue.next, ++ capacity_t, list)->deadline); ++ break; ++ } ++ ++ ++ if(*next) ++ set_rt_flags(*next, RT_F_RUNNING); ++ ++ spin_unlock(&cap_queue.lock); ++ write_unlock(&be.ready_lock); ++ write_unlock_irqrestore(&srt.ready_lock, flags); ++ return deactivate; ++} ++ ++ ++static int hsb_schedule(struct task_struct * prev, struct task_struct ** next, ++ runqueue_t * rq) ++{ ++ int need_deactivate = 1; ++ 
cpu_state_t *state = NULL; ++ ++ preempt_disable(); ++ ++ state = &__get_cpu_var(hsb_cpu_state); ++ ++ be_preempt(&be, state); ++ ++ ++ if (is_realtime(prev) && !is_be(prev) && ++ get_rt_flags(prev) == RT_F_SLEEP) ++ { ++ TRACE("preparing %d for next period\n", prev->pid); ++ release_capacity(&cap_queue, prev->time_slice, ++ prev->rt_param.times.deadline, prev); ++ edf_prepare_for_next_period(prev); ++ } ++ ++ if (get_rt_mode() == MODE_RT_RUN) { ++ /* we need to schedule hrt if a hrt job is pending or when ++ * we have a non expired hrt job on the cpu ++ */ ++ ++ if (hrt_client_eligible(&state->hrt) || ++ unlikely((is_hrt(prev) && is_running(prev) && ++ get_rt_flags(prev) != RT_F_SLEEP))) { ++ if (state->cap) { ++ return_capacity(&cap_queue, state->cap); ++ state->cap = NULL; ++ } ++ need_deactivate = schedule_hrt(prev, next, rq); ++ } else ++ need_deactivate = schedule_srt_be_cap(prev, next, rq); ++ ++ } ++ ++ if (is_realtime(prev) && need_deactivate && prev->array) { ++ /* take it out of the run queue */ ++ deactivate_task(prev, rq); ++ } ++ ++ preempt_enable(); ++ ++ return 0; ++} ++ ++/* put task into correct queue */ ++static inline void hsb_add_release(struct task_struct *t) ++{ ++ if (is_hrt(t)) ++ add_release(hrt_dom(get_partition(t)), t); ++ else if (is_srt(t)) ++ add_release(&srt, t); ++ else if (is_be(t)) { ++ t->time_slice = 0; ++ add_ready(&hsb_fifo, t); ++ fifo_check_resched(); ++ } else ++ BUG(); ++ ++} ++ ++/* put task into correct queue */ ++static inline void hsb_add_ready(struct task_struct *t) ++{ ++ if (is_hrt(t)) ++ add_ready(hrt_dom(get_partition(t)), t); ++ else if (is_srt(t)) ++ add_ready(&srt, t); ++ else if (is_be(t)) { ++ add_ready(&hsb_fifo, t); ++ fifo_check_resched(); ++ } ++ else ++ BUG(); ++} ++ ++ ++/* _finish_switch - we just finished the switch away from prev ++ * it is now safe to requeue the task ++ */ ++static void hsb_finish_switch(struct task_struct *prev) ++{ ++ if (!is_realtime(prev) || !is_running(prev)) ++ return; ++ ++ TRACE("finish switch for %d\n", prev->pid); ++ ++ if (is_be(prev)) { ++ add_ready(&hsb_fifo, prev); ++ return; ++ } ++ ++ if (get_rt_flags(prev) == RT_F_SLEEP || ++ get_rt_mode() != MODE_RT_RUN) { ++ /* this task has expired ++ * _schedule has already taken care of updating ++ * the release and ++ * deadline. We just must check if has been released. ++ */ ++ if (is_released(prev) && get_rt_mode() == MODE_RT_RUN) { ++ sched_trace_job_release(prev); ++ hsb_add_ready(prev); ++ TRACE("%d goes straight to ready queue\n", prev->pid); ++ } ++ else ++ /* it has got to wait */ ++ hsb_add_release(prev); ++ } ++ else { ++ /* this is a forced preemption ++ * thus the task stays in the ready_queue ++ * we only must make it available to other cpus ++ */ ++ hsb_add_ready(prev); ++ } ++} ++ ++ ++/* Prepare a task for running in RT mode ++ * Enqueues the task into master queue data structure ++ * returns ++ * -EPERM if task is not TASK_STOPPED ++ */ ++static long hsb_prepare_task(struct task_struct * t) ++{ ++ TRACE("edf-hsb: prepare task %d\n", t->pid); ++ ++ if (t->state == TASK_STOPPED) { ++ __setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1); ++ ++ if (get_rt_mode() == MODE_RT_RUN && !is_be(t)) ++ /* The action is already on. ++ * Prepare immediate release ++ */ ++ edf_release_now(t); ++ /* The task should be running in the queue, otherwise signal ++ * code will try to wake it up with fatal consequences. 
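++	 * (Best-effort tasks are a special case: their deadline is
++	 * cleared below and hsb_add_release() feeds them straight to the
++	 * background FIFO, so they never pass through an EDF release
++	 * queue.)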
++ */ ++ t->state = TASK_RUNNING; ++ if (is_be(t)) ++ t->rt_param.times.deadline = 0; ++ hsb_add_release(t); ++ return 0; ++ } ++ else ++ return -EPERM; ++} ++ ++static void hsb_wake_up_task(struct task_struct *task) ++{ ++ /* We must determine whether task should go into the release ++ * queue or into the ready queue. It may enter the ready queue ++ * if it has credit left in its time slice and has not yet reached ++ * its deadline. If it is now passed its deadline we assume this the ++ * arrival of a new sporadic job and thus put it in the ready queue ++ * anyway.If it has zero budget and the next release is in the future ++ * it has to go to the release queue. ++ */ ++ TRACE("edf-hsb: wake up %d with budget=%d\n", ++ task->pid, task->time_slice); ++ task->state = TASK_RUNNING; ++ ++ if (is_be(task)) { ++ task->rt_param.times.last_release = jiffies; ++ hsb_add_release(task); ++ } ++ else if (is_tardy(task)) { ++ /* new sporadic release */ ++ edf_release_now(task); ++ sched_trace_job_release(task); ++ hsb_add_ready(task); ++ } ++ else if (task->time_slice) { ++ /* came back in time before deadline ++ */ ++ set_rt_flags(task, RT_F_RUNNING); ++ hsb_add_ready(task); ++ } ++ else { ++ hsb_add_release(task); ++ } ++ ++} ++ ++static void hsb_task_blocks(struct task_struct *t) ++{ ++ /* not really anything to do since it can only block if ++ * it is running, and when it is not running it is not in any ++ * queue anyway. ++ */ ++ TRACE("task %d blocks with budget=%d\n", t->pid, t->time_slice); ++ if (is_be(t)) ++ sched_trace_job_completion(t); ++} ++ ++ ++static int hsb_mode_change(int new_mode) ++{ ++ int cpu; ++ cpu_state_t *entry; ++ jiffie_t start; ++ ++ TRACE("[%d] edf-hsb: mode changed to %d\n", smp_processor_id(), ++ new_mode); ++ if (new_mode == MODE_RT_RUN) { ++ start = jiffies + 20; ++ rerelease_all(&srt, edf_release_at); ++ be_prepare_new_releases(&be, start); ++ ++ /* initialize per CPU state ++ * we can't do this at boot time because we don't know ++ * which CPUs will be online and we can't put non-existing ++ * cpus into the queue ++ */ ++ spin_lock(&hsb_cpu_lock); ++ /* get old cruft out of the way in case we reenter real-time ++ * mode for a second time ++ */ ++ while (!list_empty(&hsb_cpu_queue)) ++ list_del(hsb_cpu_queue.next); ++ /* reinitialize */ ++ for_each_online_cpu(cpu) { ++ entry = &per_cpu(hsb_cpu_state, cpu); ++ atomic_set(&entry->will_schedule, 0); ++ entry->exec_class = RT_CLASS_BEST_EFFORT; ++ entry->cur_deadline = 0; ++ list_add(&entry->list, &hsb_cpu_queue); ++ ++ rerelease_all(&entry->hrt.domain, edf_release_at); ++ prepare_hrt_release(&entry->hrt, start); ++ } ++ spin_unlock(&hsb_cpu_lock); ++ ++ } ++ TRACE("[%d] edf-hsb: mode change done\n", smp_processor_id()); ++ return 0; ++} ++ ++ ++typedef enum { ++ EDF_HSB_SET_HRT, ++ EDF_HSB_GET_HRT, ++ EDF_HSB_CREATE_BE ++} edf_hsb_setup_cmds_t; ++ ++typedef struct { ++ int cpu; ++ unsigned int wcet; ++ unsigned int period; ++} setup_hrt_param_t; ++ ++typedef struct { ++ unsigned int wcet; ++ unsigned int period; ++} create_be_param_t; ++ ++typedef struct { ++ union { ++ setup_hrt_param_t setup_hrt; ++ create_be_param_t create_be; ++ }; ++} param_t; ++ ++static pid_t next_be_server_pid = SRT_BASE_PID; ++ ++static int hsb_scheduler_setup(int cmd, void __user* up) ++{ ++ unsigned long flags; ++ int error = -EINVAL; ++ cpu_state_t* state; ++ be_server_t* srv; ++ param_t param; ++ ++ switch (cmd) { ++ case EDF_HSB_SET_HRT: ++ if (copy_from_user(¶m, up, sizeof(setup_hrt_param_t))) { ++ error = -EFAULT; ++ goto out; ++ } ++ 
if (!cpu_online(param.setup_hrt.cpu)) { ++ printk(KERN_WARNING "scheduler setup: " ++ "CPU %d is not online!\n", param.setup_hrt.cpu); ++ error = -EINVAL; ++ goto out; ++ } ++ if (param.setup_hrt.period < param.setup_hrt.wcet) { ++ printk(KERN_WARNING "period < wcet!\n"); ++ error = -EINVAL; ++ goto out; ++ } ++ ++ state = &per_cpu(hsb_cpu_state, param.setup_hrt.cpu); ++ spin_lock_irqsave(&state->lock, flags); ++ ++ state->hrt.wcet = param.setup_hrt.wcet; ++ state->hrt.period = param.setup_hrt.period; ++ ++ spin_unlock_irqrestore(&state->lock, flags); ++ ++ printk(KERN_WARNING "edf-hsb: set HRT #%d to (%d, %d)\n", ++ param.setup_hrt.cpu, param.setup_hrt.wcet, ++ param.setup_hrt.period); ++ ++ error = 0; ++ ++ break; ++ ++ case EDF_HSB_GET_HRT: ++ if (copy_from_user(¶m, up, sizeof(setup_hrt_param_t))) { ++ error = -EFAULT; ++ goto out; ++ } ++ if (!cpu_online(param.setup_hrt.cpu)) { ++ error = -EINVAL; ++ goto out; ++ } ++ state = &per_cpu(hsb_cpu_state, param.setup_hrt.cpu); ++ spin_lock_irqsave(&state->lock, flags); ++ ++ param.setup_hrt.wcet = state->hrt.wcet; ++ param.setup_hrt.period = state->hrt.period; ++ ++ spin_unlock_irqrestore(&state->lock, flags); ++ ++ if (copy_to_user(up, ¶m, sizeof(setup_hrt_param_t))) { ++ error = -EFAULT; ++ goto out; ++ } ++ error = 0; ++ break; ++ ++ case EDF_HSB_CREATE_BE: ++ if (copy_from_user(¶m, up, sizeof(create_be_param_t))) { ++ error = -EFAULT; ++ goto out; ++ } ++ if (param.create_be.period < param.create_be.wcet || ++ !param.create_be.period || !param.create_be.wcet) { ++ error = -EINVAL; ++ goto out; ++ } ++ srv = (be_server_t*) kmalloc(sizeof(be_server_t), GFP_KERNEL); ++ if (!srv) { ++ error = -ENOMEM; ++ goto out; ++ } ++ srv->wcet = param.create_be.wcet; ++ srv->period = param.create_be.period; ++ srv->pid = next_be_server_pid++; ++ INIT_LIST_HEAD(&srv->list); ++ be_prepare_new_release(srv, jiffies); ++ be_enqueue(&be, srv); ++ ++ printk(KERN_WARNING "edf-hsb: created a BE with (%d, %d)\n", ++ param.create_be.wcet, param.create_be.period); ++ ++ error = 0; ++ break; ++ ++ default: ++ printk(KERN_WARNING "edf-hsb: unknown command %d\n", cmd); ++ } ++ ++out: ++ return error; ++} ++ ++/* Plugin object */ ++static sched_plugin_t s_plugin __cacheline_aligned_in_smp = { ++ .ready_to_use = 0 ++}; ++ ++ ++/* ++ * Plugin initialization code. 
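++ * (Hypothetical user-space sketch -- assuming the scheduler_setup()
++ * wrapper exported by liblitmus for this release, with wcet/period
++ * given in ticks:
++ *     setup_hrt_param_t hrt = { .cpu = 0, .wcet = 2, .period = 10 };
++ *     create_be_param_t be  = { .wcet = 5, .period = 20 };
++ *     scheduler_setup(EDF_HSB_SET_HRT,   &hrt);
++ *     scheduler_setup(EDF_HSB_CREATE_BE, &be);
++ * hsb_scheduler_setup() above rejects period < wcet and, for BE
++ * servers, zero parameters.)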
++ */ ++#define INIT_SCHED_PLUGIN (struct sched_plugin){\ ++ .plugin_name = "EDF-HSB",\ ++ .ready_to_use = 1,\ ++ .scheduler_tick = hsb_scheduler_tick,\ ++ .prepare_task = hsb_prepare_task,\ ++ .sleep_next_period = edf_sleep_next_period,\ ++ .schedule = hsb_schedule,\ ++ .finish_switch = hsb_finish_switch,\ ++ .mode_change = hsb_mode_change,\ ++ .wake_up_task = hsb_wake_up_task,\ ++ .task_blocks = hsb_task_blocks, \ ++ .scheduler_setup = hsb_scheduler_setup \ ++} ++ ++ ++sched_plugin_t *__init init_edf_hsb_plugin(void) ++{ ++ int i; ++ ++ if (!s_plugin.ready_to_use) ++ { ++ capacity_queue_init(&cap_queue); ++ edf_domain_init(&srt, srt_check_resched); ++ edf_domain_init(&be, be_check_resched); ++ fifo_domain_init(&hsb_fifo, NULL); ++ for (i = 0; i < NR_CPUS; i++) ++ { ++ hsb_cpu_state_init(&per_cpu(hsb_cpu_state, i), ++ hrt_check_resched, i); ++ printk("HRT server %d initialized.\n", i); ++ } ++ s_plugin = INIT_SCHED_PLUGIN; ++ } ++ return &s_plugin; ++} +diff --git a/kernel/sched_global_edf.c b/kernel/sched_global_edf.c +new file mode 100644 +index 0000000..4b36bc5 +--- /dev/null ++++ b/kernel/sched_global_edf.c +@@ -0,0 +1,550 @@ ++/* ++ * kernel/sched-global-edf.c ++ * ++ * Re-Implementation of the Global EDF scheduler. ++ * ++ * This version works without using the struct queue. It uses the ++ * builtin kernel lists. ++ */ ++ ++#include ++#include ++#include ++ ++#include ++#include ++ ++#include ++#include ++ ++ ++/* cpu_entry_t - maintain state of the priority of cpu's current task ++ * this is needed to check for priority inversions. ++ */ ++typedef struct { ++ int cpu; ++ int executes_realtime; ++ jiffie_t cur_deadline; ++ struct list_head list; ++ atomic_t will_schedule; ++} cpu_entry_t; ++DEFINE_PER_CPU(cpu_entry_t, gedf_cpu_entries); ++ ++#define set_will_schedule() \ ++ (atomic_set(&__get_cpu_var(gedf_cpu_entries).will_schedule, 1)) ++#define clear_will_schedule() \ ++ (atomic_set(&__get_cpu_var(gedf_cpu_entries).will_schedule, 0)) ++#define test_will_schedule(cpu) \ ++ (atomic_read(&per_cpu(gedf_cpu_entries, cpu).will_schedule)) ++ ++ ++/* always acquire the cpu lock as the last lock to avoid deadlocks */ ++static spinlock_t gedf_cpu_lock = SPIN_LOCK_UNLOCKED; ++/* the cpus queue themselves according to priority in here */ ++static LIST_HEAD(gedf_cpu_queue); ++ ++ ++static rt_domain_t gedf; ++ ++#define DUMP(args...) TRACE(args) ++ ++/* adjust_cpu_queue - Move the cpu entry to the correct place to maintain ++ * order in the cpu queue. Caller must hold ready write lock. ++ * ++ */ ++static void adjust_cpu_queue(int exec_rt, jiffie_t deadline) ++{ ++ struct list_head *pos; ++ cpu_entry_t *other; ++ cpu_entry_t *entry; ++ ++ spin_lock(&gedf_cpu_lock); ++ ++ entry = &__get_cpu_var(gedf_cpu_entries); ++ entry->executes_realtime = exec_rt; ++ entry->cur_deadline = deadline; ++ ++ list_del(&entry->list); ++ /* if we do not execute real-time jobs we just move ++ * to the end of the queue ++ */ ++ if (entry->executes_realtime) ++ list_for_each(pos, &gedf_cpu_queue) { ++ other = list_entry(pos, cpu_entry_t, list); ++ if (!other->executes_realtime || ++ time_before_eq(entry->cur_deadline, ++ other->cur_deadline)) ++ { ++ __list_add(&entry->list, pos->prev, pos); ++ goto out; ++ } ++ } ++ /* if we get this far we have the lowest priority task */ ++ list_add_tail(&entry->list, &gedf_cpu_queue); ++ ++ out: ++ spin_unlock(&gedf_cpu_lock); ++} ++ ++ ++/* check_reschedule_needed - Check whether another CPU needs to reschedule. ++ * ++ * The function only checks and kicks the last CPU. 
It will reschedule and ++ * kick the next if necessary, and so on. The caller is responsible for making ++ * sure that it is not the last entry or that a reschedule is not necessary. ++ * ++ */ ++static int gedf_check_resched(rt_domain_t *edf) ++{ ++ cpu_entry_t *last; ++ int ret = 0; ++ ++ spin_lock(&gedf_cpu_lock); ++ ++ if (!list_empty(&edf->ready_queue)) { ++ last = list_entry(gedf_cpu_queue.prev, cpu_entry_t, list); ++ if (!last->executes_realtime || ++ time_before(next_ready(edf)->rt_param.times.deadline, ++ last->cur_deadline)) ++ { ++ if (smp_processor_id() == last->cpu) ++ set_tsk_need_resched(current); ++ else ++ if (!test_will_schedule(last->cpu)) ++ smp_send_reschedule(last->cpu); ++ ret = 1; ++ } ++ } ++ ++ spin_unlock(&gedf_cpu_lock); ++ return ret; ++} ++ ++ ++ ++/* gedf_scheduler_tick - this function is called for every local timer ++ * interrupt. ++ * ++ * checks whether the current task has expired and checks ++ * whether we need to preempt it if it has not expired ++ */ ++static reschedule_check_t gedf_scheduler_tick(void) ++{ ++ unsigned long flags; ++ struct task_struct *t = current; ++ reschedule_check_t want_resched = NO_RESCHED; ++ ++ /* expire tasks even if not in real-time mode ++ * this makes sure that at the end of real-time mode ++ * no tasks "run away forever". ++ */ ++ BUG_ON(is_realtime(t) && t->time_slice > 100000); ++ if (is_realtime(t) && (!--t->time_slice)) { ++ /* this task has exhausted its budget in this period */ ++ set_rt_flags(t, RT_F_SLEEP); ++ want_resched = FORCE_RESCHED; ++ set_will_schedule(); ++ sched_trace_job_completion(t); ++ } ++ if (get_rt_mode() == MODE_RT_RUN) ++ { ++ /* check whether anything is waiting to be released ++ * this could probably be moved to the global timer ++ * interrupt handler since the state will only change ++ * once per jiffie ++ */ ++ try_release_pending(&gedf); ++ if (want_resched != FORCE_RESCHED) ++ { ++ read_lock_irqsave(&gedf.ready_lock, flags); ++ if (edf_preemption_needed(&gedf, t)) ++ { ++ want_resched = FORCE_RESCHED; ++ set_will_schedule(); ++ } ++ read_unlock_irqrestore(&gedf.ready_lock, flags); ++ } ++ } ++ return want_resched; ++} ++ ++/* This is main Global EDF schedule function ++ * ++ * Assumes the caller holds the lock for rq and that irqs are disabled ++ * This is function only works for indirect switching ++ */ ++static int gedf_schedule(struct task_struct * prev, ++ struct task_struct ** next, ++ runqueue_t * rq) ++{ ++ int need_deactivate = 1; ++ int rt; ++ jiffie_t deadline; ++ unsigned long flags; ++ ++ ++ if (is_realtime(prev) && get_rt_flags(prev) == RT_F_SLEEP) ++ { ++ DUMP("preparing %d for next period\n", prev->pid); ++ edf_prepare_for_next_period(prev); ++ } ++ ++ if (get_rt_mode() == MODE_RT_RUN) { ++ write_lock_irqsave(&gedf.ready_lock, flags); ++ ++ clear_will_schedule(); ++ ++ if (is_realtime(prev) && is_released(prev) && is_running(prev) ++ && !edf_preemption_needed(&gedf, prev)) { ++ /* Our current task's next job has already been ++ * released and has higher priority than the highest ++ * prioriy waiting task; in other words: it is tardy. ++ * We just keep it. 
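++		 * (In other words, prev's new job already has the earliest
++		 * deadline among all released jobs, so taking the head of
++		 * the ready queue would merely migrate work without
++		 * improving the EDF schedule.)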
++ */ ++ DUMP("prev will be next, already released\n"); ++ *next = prev; ++ rt = 1; ++ deadline = prev->rt_param.times.deadline; ++ need_deactivate = 0; ++ } else { ++ /* either not yet released, preempted, or non-rt */ ++ *next = __take_ready(&gedf); ++ if (*next) { ++ /* mark the task as executing on this cpu */ ++ set_task_cpu(*next, smp_processor_id()); ++ ++ /* stick the task into the runqueue */ ++ __activate_task(*next, rq); ++ rt = 1; ++ deadline = (*next)->rt_param.times.deadline; ++ } ++ else ++ rt = deadline = 0; ++ } ++ ++ adjust_cpu_queue(rt, deadline); ++ ++ if (rt) { ++ set_rt_flags(*next, RT_F_RUNNING); ++ gedf.check_resched(&gedf); ++ } ++ write_unlock_irqrestore(&gedf.ready_lock, flags); ++ } ++ ++ if (is_realtime(prev) && need_deactivate && prev->array) { ++ /* take it out of the run queue */ ++ deactivate_task(prev, rq); ++ } ++ ++ /* don't put back into release yet. ++ * We first need to actually switch ++ * stacks before we can execute it ++ * on a different CPU */ ++ ++ /* in the current implementation nobody cares about the return value */ ++ return 0; ++} ++ ++ ++/* _finish_switch - we just finished the switch away from prev ++ * it is now safe to requeue the task ++ */ ++static void gedf_finish_switch(struct task_struct *prev) ++{ ++ if (!is_realtime(prev) || !is_running(prev)) ++ return; ++ ++ /*printk(KERN_INFO "gedf finish switch for %d\n", prev->pid);*/ ++ if (get_rt_flags(prev) == RT_F_SLEEP || ++ get_rt_mode() != MODE_RT_RUN) { ++ /* this task has expired ++ * _schedule has already taken care of updating ++ * the release and ++ * deadline. We just must check if has been released. ++ */ ++ if (time_before_eq(prev->rt_param.times.release, jiffies) ++ && get_rt_mode() == MODE_RT_RUN) { ++ /* already released */ ++ add_ready(&gedf, prev); ++ DUMP("%d goes straight to ready queue\n", prev->pid); ++ } ++ else ++ /* it has got to wait */ ++ add_release(&gedf, prev); ++ } ++ else { ++ /* this is a forced preemption ++ * thus the task stays in the ready_queue ++ * we only must make it available to others ++ */ ++ add_ready(&gedf, prev); ++ } ++} ++ ++ ++/* Prepare a task for running in RT mode ++ * Enqueues the task into master queue data structure ++ * returns ++ * -EPERM if task is not TASK_STOPPED ++ */ ++static long gedf_prepare_task(struct task_struct * t) ++{ ++ TRACE("global edf: prepare task %d\n", t->pid); ++ ++ if (t->state == TASK_STOPPED) { ++ __setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1); ++ ++ if (get_rt_mode() == MODE_RT_RUN) ++ /* The action is already on. ++ * Prepare immediate release ++ */ ++ edf_release_now(t); ++ /* The task should be running in the queue, otherwise signal ++ * code will try to wake it up with fatal consequences. ++ */ ++ t->state = TASK_RUNNING; ++ add_release(&gedf, t); ++ return 0; ++ } ++ else ++ return -EPERM; ++} ++ ++static void gedf_wake_up_task(struct task_struct *task) ++{ ++ /* We must determine whether task should go into the release ++ * queue or into the ready queue. It may enter the ready queue ++ * if it has credit left in its time slice and has not yet reached ++ * its deadline. If it is now passed its deadline we assume this the ++ * arrival of a new sporadic job and thus put it in the ready queue ++ * anyway.If it has zero budget and the next release is in the future ++ * it has to go to the release queue. 
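++ * (Decision sketch: past its deadline -> treat as a new sporadic
++ * release and enqueue as ready; budget left and deadline still in
++ * the future -> ready queue; budget exhausted -> release queue until
++ * the next period.)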
++ */ ++ TRACE("global edf: wake up %d with budget=%d\n", ++ task->pid, task->time_slice); ++ task->state = TASK_RUNNING; ++ if (is_tardy(task)) { ++ /* new sporadic release */ ++ edf_release_now(task); ++ sched_trace_job_release(task); ++ add_ready(&gedf, task); ++ } ++ else if (task->time_slice) { ++ /* came back in time before deadline ++ */ ++ set_rt_flags(task, RT_F_RUNNING); ++ add_ready(&gedf, task); ++ } ++ else { ++ add_release(&gedf, task); ++ } ++ ++} ++ ++static void gedf_task_blocks(struct task_struct *t) ++{ ++ BUG_ON(!is_realtime(t)); ++ /* not really anything to do since it can only block if ++ * it is running, and when it is not running it is not in any ++ * queue anyway. ++ * ++ */ ++ TRACE("task %d blocks with budget=%d\n", t->pid, t->time_slice); ++ BUG_ON(t->rt_list.next != LIST_POISON1); ++ BUG_ON(t->rt_list.prev != LIST_POISON2); ++} ++ ++ ++/* When _tear_down is called, the task should not be in any queue any more ++ * as it must have blocked first. We don't have any internal state for the task, ++ * it is all in the task_struct. ++ */ ++static long gedf_tear_down(struct task_struct * t) ++{ ++ BUG_ON(!is_realtime(t)); ++ TRACE("global edf: tear down called for %d \n", t->pid); ++ BUG_ON(t->array); ++ BUG_ON(t->rt_list.next != LIST_POISON1); ++ BUG_ON(t->rt_list.prev != LIST_POISON2); ++ return 0; ++} ++ ++ ++static int gedf_mode_change(int new_mode) ++{ ++ int cpu; ++ cpu_entry_t *entry; ++ ++/* printk(KERN_INFO "[%d] global edf: mode changed to %d\n", smp_processor_id(), ++ new_mode);*/ ++ if (new_mode == MODE_RT_RUN) { ++ rerelease_all(&gedf, edf_release_at); ++ ++ /* initialize per CPU state ++ * we can't do this at boot time because we don't know ++ * which CPUs will be online and we can't put non-existing ++ * cpus into the queue ++ */ ++ spin_lock(&gedf_cpu_lock); ++ /* get old cruft out of the way in case we reenter real-time ++ * mode for a second time ++ */ ++ while (!list_empty(&gedf_cpu_queue)) ++ list_del(gedf_cpu_queue.next); ++ /* reinitialize */ ++ for_each_online_cpu(cpu) { ++ entry = &per_cpu(gedf_cpu_entries, cpu); ++ atomic_set(&entry->will_schedule, 0); ++ entry->executes_realtime = 0; ++ entry->cur_deadline = 0; ++ entry->cpu = cpu; ++ list_add(&entry->list, &gedf_cpu_queue); ++ } ++ spin_unlock(&gedf_cpu_lock); ++ } ++ /*printk(KERN_INFO "[%d] global edf: mode change done\n", smp_processor_id()); */ ++ return 0; ++} ++ ++ ++/* Plugin object */ ++static sched_plugin_t s_plugin __cacheline_aligned_in_smp = { ++ .ready_to_use = 0 ++}; ++ ++ ++/* ++ * Plugin initialization code. 
++ */ ++#define INIT_SCHED_PLUGIN (struct sched_plugin){\ ++ .plugin_name = "Global EDF",\ ++ .ready_to_use = 1,\ ++ .scheduler_tick = gedf_scheduler_tick,\ ++ .prepare_task = gedf_prepare_task,\ ++ .sleep_next_period = edf_sleep_next_period,\ ++ .tear_down = gedf_tear_down,\ ++ .schedule = gedf_schedule,\ ++ .finish_switch = gedf_finish_switch,\ ++ .mode_change = gedf_mode_change,\ ++ .wake_up_task = gedf_wake_up_task,\ ++ .task_blocks = gedf_task_blocks \ ++ } ++ ++ ++sched_plugin_t *__init init_global_edf_plugin(void) ++{ ++ if (!s_plugin.ready_to_use) ++ { ++ edf_domain_init(&gedf, gedf_check_resched); ++ s_plugin = INIT_SCHED_PLUGIN; ++ } ++ return &s_plugin; ++} ++ ++ ++ ++/*****************************************************************************/ ++/*****************************************************************************/ ++/*****************************************************************************/ ++/* NON-PREEMPTIVE GLOBAL EDF */ ++ ++ ++/* gedf_np_scheduler_tick - this function is called for every local timer ++ * interrupt. ++ * ++ * checks whether the current task has expired and checks ++ * whether we need to preempt it if it has not expired ++ */ ++static reschedule_check_t gedf_np_scheduler_tick(void) ++{ ++ if (get_rt_mode() == MODE_RT_RUN) ++ { ++ /* check whether anything is waiting to be released ++ * this could probably be moved to the global timer ++ * interrupt handler since the state will only change ++ * once per jiffie ++ */ ++ try_release_pending(&gedf); ++ } ++ ++ /* expire tasks even if not in real-time mode ++ * this makes sure that at the end of real-time mode ++ * no tasks "run away forever". ++ */ ++ BUG_ON(current->time_slice > 1000); ++ if (is_realtime(current) && (!--current->time_slice)) { ++ /* this task has exhausted its budget in this period */ ++ set_rt_flags(current, RT_F_SLEEP); ++ return FORCE_RESCHED; ++ } ++ else ++ return NO_RESCHED; ++} ++ ++/* gedf_np_check_resched - Check whether another CPU needs to reschedule. ++ * ++ * The function only checks and kicks the last CPU. It will reschedule and ++ * kick the next if necessary, and so on. The caller is responsible for making ++ * sure that it is not the last entry or that a reschedule is not necessary. ++ * ++ */ ++static int gedf_np_check_resched(rt_domain_t *edf) ++{ ++ cpu_entry_t *last; ++ int ret = 0; ++ ++ spin_lock(&gedf_cpu_lock); ++ ++ if (!list_empty(&edf->ready_queue)) { ++ last = list_entry(gedf_cpu_queue.prev, cpu_entry_t, list); ++ /* preemption happens only for non-realtime tasks */ ++ if (!last->executes_realtime) ++ { ++ if (smp_processor_id() == last->cpu) ++ set_tsk_need_resched(current); ++ else ++ smp_send_reschedule(last->cpu); ++ ret = 1; ++ goto out; ++ } ++ } ++ ++ out: ++ spin_unlock(&gedf_cpu_lock); ++ return ret; ++} ++ ++ ++/* non-preemptive global EDF ++ * ++ * Non-preemptive EDF is almost the same as normal EDF. We only have to ++ * adjust the scheduler tick and the resched function. 
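/*
 * A sketch of the non-preemptive resched check described above: with
 * NP global EDF only the CPU at the tail of the priority-ordered CPU
 * list (the one running the lowest-priority work) is considered, and
 * it is kicked only if it currently executes a non-real-time task. The
 * array below stands in for the kernel's linked CPU queue.
 */
#include <stdbool.h>
#include <stdio.h>

struct cpu_state {
	int cpu;
	bool executes_realtime;
};

/* returns the CPU to kick, or -1 if no reschedule is needed */
static int np_pick_cpu_to_kick(const struct cpu_state *queue, int ncpus,
			       bool ready_queue_nonempty)
{
	if (!ready_queue_nonempty || ncpus == 0)
		return -1;
	const struct cpu_state *last = &queue[ncpus - 1];
	return last->executes_realtime ? -1 : last->cpu;
}

int main(void)
{
	struct cpu_state q[] = { {0, true}, {1, true}, {2, false} };
	printf("kick CPU %d\n", np_pick_cpu_to_kick(q, 3, true)); /* CPU 2 */
	return 0;
}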
++ */ ++#define INIT_SCHED_PLUGIN_NP (struct sched_plugin){\ ++ .plugin_name = "Non-Preemptive Global EDF",\ ++ .ready_to_use = 1,\ ++ .scheduler_tick = gedf_np_scheduler_tick,\ ++ .prepare_task = gedf_prepare_task,\ ++ .sleep_next_period = edf_sleep_next_period,\ ++ .tear_down = gedf_tear_down,\ ++ .schedule = gedf_schedule,\ ++ .finish_switch = gedf_finish_switch,\ ++ .mode_change = gedf_mode_change,\ ++ .wake_up_task = gedf_wake_up_task,\ ++ .task_blocks = gedf_task_blocks \ ++ } ++ ++ ++/* as we only set the plugin at boot time, ++ * we use the same structure as preemptive EDF. This simplifies a lot ++ * of the funtions. ++ */ ++sched_plugin_t* __init init_global_edf_np_plugin(void) ++{ ++ if (!s_plugin.ready_to_use) ++ { ++ edf_domain_init(&gedf, gedf_np_check_resched); ++ s_plugin = INIT_SCHED_PLUGIN_NP; ++ } ++ return &s_plugin; ++} +diff --git a/kernel/sched_gsn_edf.c b/kernel/sched_gsn_edf.c +new file mode 100644 +index 0000000..27d1b37 +--- /dev/null ++++ b/kernel/sched_gsn_edf.c +@@ -0,0 +1,814 @@ ++/* ++ * kernel/sched_gsn_edf.c ++ * ++ * Implementation of the GSN-EDF scheduling algorithm. ++ * ++ * This version uses the simple approach and serializes all scheduling ++ * decisions by the use of a queue lock. This is probably not the ++ * best way to do it, but it should suffice for now. It should not ++ * affect the benchmarks since all synchronization primitives will ++ * take the same performance hit, if any. ++ */ ++ ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++ ++/* Overview of GSN-EDF operations. ++ * ++ * For a detailed explanation of GSN-EDF have a look at the FMLP paper. This ++ * description only covers how the individual operations are implemented in ++ * LITMUS. ++ * ++ * link_task_to_cpu(T, cpu) - Low-level operation to update the linkage ++ * structure (NOT the actually scheduled ++ * task). If there is another linked task To ++ * already it will set To->linked_on = NO_CPU ++ * (thereby removing its association with this ++ * CPU). However, it will not requeue the ++ * previously linked task (if any). It will set ++ * T's state to RT_F_RUNNING and check whether ++ * it is already running somewhere else. If T ++ * is scheduled somewhere else it will link ++ * it to that CPU instead (and pull the linked ++ * task to cpu). T may be NULL. ++ * ++ * unlink(T) - Unlink removes T from all scheduler data ++ * structures. If it is linked to some CPU it ++ * will link NULL to that CPU. If it is ++ * currently queued in the gsnedf queue it will ++ * be removed from the T->rt_list. It is safe to ++ * call unlink(T) if T is not linked. T may not ++ * be NULL. ++ * ++ * requeue(T) - Requeue will insert T into the appropriate ++ * queue. If the system is in real-time mode and ++ * the T is released already, it will go into the ++ * ready queue. If the system is not in ++ * real-time mode is T, then T will go into the ++ * release queue. If T's release time is in the ++ * future, it will go into the release ++ * queue. That means that T's release time/job ++ * no/etc. has to be updated before requeu(T) is ++ * called. It is not safe to call requeue(T) ++ * when T is already queued. T may not be NULL. ++ * ++ * gsnedf_job_arrival(T) - This is the catch all function when T enters ++ * the system after either a suspension or at a ++ * job release. It will queue T (which means it ++ * is not safe to call gsnedf_job_arrival(T) if ++ * T is already queued) and then check whether a ++ * preemption is necessary. 
If a preemption is ++ * necessary it will update the linkage ++ * accordingly and cause scheduled to be called ++ * (either with an IPI or need_resched). It is ++ * safe to call gsnedf_job_arrival(T) if T's ++ * next job has not been actually released yet ++ * (releast time in the future). T will be put ++ * on the release queue in that case. ++ * ++ * job_completion(T) - Take care of everything that needs to be done ++ * to prepare T for its next release and place ++ * it in the right queue with ++ * gsnedf_job_arrival(). ++ * ++ * ++ * When we now that T is linked to CPU then link_task_to_cpu(NULL, CPU) is ++ * equivalent to unlink(T). Note that if you unlink a task from a CPU none of ++ * the functions will automatically propagate pending task from the ready queue ++ * to a linked task. This is the job of the calling function ( by means of ++ * __take_ready). ++ */ ++ ++ ++/* cpu_entry_t - maintain the linked and scheduled state ++ */ ++typedef struct { ++ int cpu; ++ struct task_struct* linked; /* only RT tasks */ ++ struct task_struct* scheduled; /* only RT tasks */ ++ struct list_head list; ++ atomic_t will_schedule; /* prevent unneeded IPIs */ ++} cpu_entry_t; ++DEFINE_PER_CPU(cpu_entry_t, gsnedf_cpu_entries); ++ ++#define set_will_schedule() \ ++ (atomic_set(&__get_cpu_var(gsnedf_cpu_entries).will_schedule, 1)) ++#define clear_will_schedule() \ ++ (atomic_set(&__get_cpu_var(gsnedf_cpu_entries).will_schedule, 0)) ++#define test_will_schedule(cpu) \ ++ (atomic_read(&per_cpu(gsnedf_cpu_entries, cpu).will_schedule)) ++ ++ ++#define NO_CPU 0xffffffff ++ ++/* The gsnedf_lock is used to serialize all scheduling events. ++ * It protects ++ */ ++static queuelock_t gsnedf_lock; ++/* the cpus queue themselves according to priority in here */ ++static LIST_HEAD(gsnedf_cpu_queue); ++ ++static rt_domain_t gsnedf; ++ ++ ++/* update_cpu_position - Move the cpu entry to the correct place to maintain ++ * order in the cpu queue. Caller must hold gsnedf lock. ++ */ ++static void update_cpu_position(cpu_entry_t *entry) ++{ ++ cpu_entry_t *other; ++ struct list_head *pos; ++ list_del(&entry->list); ++ /* if we do not execute real-time jobs we just move ++ * to the end of the queue ++ */ ++ if (entry->linked) { ++ list_for_each(pos, &gsnedf_cpu_queue) { ++ other = list_entry(pos, cpu_entry_t, list); ++ if (edf_higher_prio(entry->linked, other->linked)) { ++ __list_add(&entry->list, pos->prev, pos); ++ return; ++ } ++ } ++ } ++ /* if we get this far we have the lowest priority job */ ++ list_add_tail(&entry->list, &gsnedf_cpu_queue); ++} ++ ++/* link_task_to_cpu - Update the link of a CPU. ++ * Handles the case where the to-be-linked task is already ++ * scheduled on a different CPU. ++ */ ++static noinline void link_task_to_cpu(struct task_struct* linked, ++ cpu_entry_t *entry) ++ ++{ ++ cpu_entry_t *sched; ++ struct task_struct* tmp; ++ int on_cpu; ++ ++ BUG_ON(linked && !is_realtime(linked)); ++ ++ /* Currently linked task is set to be unlinked. */ ++ if (entry->linked) { ++ entry->linked->rt_param.linked_on = NO_CPU; ++ } ++ ++ /* Link new task to CPU. */ ++ if (linked) { ++ set_rt_flags(linked, RT_F_RUNNING); ++ /* handle task is already scheduled somewhere! 
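/*
 * A sketch of the CPU-queue ordering that update_cpu_position() above
 * maintains: CPUs are kept sorted so that the tail always holds the
 * CPU whose linked job has the lowest priority (latest deadline, or no
 * linked job at all), i.e. exactly the CPU a newly arrived job may
 * preempt. Plain arrays replace the kernel's intrusive lists here.
 */
#include <limits.h>
#include <stdio.h>

struct fake_cpu {
	int cpu;
	long linked_deadline;	/* LONG_MAX means "no linked RT job" */
};

/* EDF: a smaller deadline means higher priority */
static int higher_prio(long d1, long d2) { return d1 < d2; }

static void sort_cpu_queue(struct fake_cpu *q, int n)
{
	/* insertion sort: fine for the handful of CPUs involved */
	for (int i = 1; i < n; i++) {
		struct fake_cpu key = q[i];
		int j = i - 1;
		while (j >= 0 && higher_prio(key.linked_deadline,
					     q[j].linked_deadline)) {
			q[j + 1] = q[j];
			j--;
		}
		q[j + 1] = key;
	}
}

int main(void)
{
	struct fake_cpu q[] = { {0, 300}, {1, LONG_MAX}, {2, 100} };
	sort_cpu_queue(q, 3);
	printf("tail (preemption candidate): CPU %d\n", q[2].cpu); /* CPU 1 */
	return 0;
}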
*/ ++ on_cpu = linked->rt_param.scheduled_on; ++ if (on_cpu != NO_CPU) { ++ sched = &per_cpu(gsnedf_cpu_entries, on_cpu); ++ /* this should only happen if not linked already */ ++ BUG_ON(sched->linked == linked); ++ ++ /* If we are already scheduled on the CPU to which we ++ * wanted to link, we don't need to do the swap -- ++ * we just link ourselves to the CPU and depend on ++ * the caller to get things right. ++ */ ++ if (entry != sched) { ++ tmp = sched->linked; ++ linked->rt_param.linked_on = sched->cpu; ++ sched->linked = linked; ++ update_cpu_position(sched); ++ linked = tmp; ++ } ++ } ++ if (linked) /* might be NULL due to swap */ ++ linked->rt_param.linked_on = entry->cpu; ++ } ++ entry->linked = linked; ++ update_cpu_position(entry); ++} ++ ++/* unlink - Make sure a task is not linked any longer to an entry ++ * where it was linked before. Must hold gsnedf_lock. ++ */ ++static noinline void unlink(struct task_struct* t) ++{ ++ cpu_entry_t *entry; ++ ++ if (unlikely(!t)) { ++ TRACE_BUG_ON(!t); ++ return; ++ } ++ ++ if (t->rt_param.linked_on != NO_CPU) { ++ /* unlink */ ++ entry = &per_cpu(gsnedf_cpu_entries, t->rt_param.linked_on); ++ t->rt_param.linked_on = NO_CPU; ++ link_task_to_cpu(NULL, entry); ++ } else if (in_list(&t->rt_list)) { ++ /* This is an interesting situation: t is scheduled, ++ * but was just recently unlinked. It cannot be ++ * linked anywhere else (because then it would have ++ * been relinked to this CPU), thus it must be in some ++ * queue. We must remove it from the list in this ++ * case. ++ */ ++ list_del(&t->rt_list); ++ } ++} ++ ++ ++/* preempt - force a CPU to reschedule ++ */ ++static noinline void preempt(cpu_entry_t *entry) ++{ ++ /* We cannot make the is_np() decision here if it is a remote CPU ++ * because requesting exit_np() requires that we currently use the ++ * address space of the task. Thus, in the remote case we just send ++ * the IPI and let schedule() handle the problem. ++ */ ++ ++ if (smp_processor_id() == entry->cpu) { ++ if (entry->scheduled && is_np(entry->scheduled)) ++ request_exit_np(entry->scheduled); ++ else ++ set_tsk_need_resched(current); ++ } else ++ /* in case that it is a remote CPU we have to defer the ++ * the decision to the remote CPU ++ * FIXME: We could save a few IPI's here if we leave the flag ++ * set when we are waiting for a np_exit(). ++ */ ++ if (!test_will_schedule(entry->cpu)) ++ smp_send_reschedule(entry->cpu); ++} ++ ++/* requeue - Put an unlinked task into gsn-edf domain. ++ * Caller must hold gsnedf_lock. ++ */ ++static noinline void requeue(struct task_struct* task) ++{ ++ BUG_ON(!task); ++ /* sanity check rt_list before insertion */ ++ BUG_ON(in_list(&task->rt_list)); ++ ++ if (get_rt_flags(task) == RT_F_SLEEP || ++ get_rt_mode() != MODE_RT_RUN) { ++ /* this task has expired ++ * _schedule has already taken care of updating ++ * the release and ++ * deadline. We just must check if it has been released. 
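/*
 * A sketch of the linkage swap handled by link_task_to_cpu() above: if
 * the task to be linked here is still scheduled on another CPU, it is
 * linked to that CPU instead and whatever was linked there is pulled
 * over to us. Plain structs model the per-CPU entries; the names are
 * illustrative, not the kernel's.
 */
#include <stdio.h>

#define NO_CPU (-1)

struct task  { int id; int scheduled_on; int linked_on; };
struct entry { int cpu; struct task *linked; };

static void link_task(struct task *t, struct entry *here, struct entry *cpus)
{
	if (t && t->scheduled_on != NO_CPU && t->scheduled_on != here->cpu) {
		struct entry *there = &cpus[t->scheduled_on];
		struct task *displaced = there->linked;
		t->linked_on = there->cpu;
		there->linked = t;	/* link T where it already runs */
		t = displaced;		/* and pull its old link to us  */
	}
	if (t)
		t->linked_on = here->cpu;
	here->linked = t;
}

int main(void)
{
	struct entry cpus[2] = { {0, NULL}, {1, NULL} };
	struct task a = {1, 1, NO_CPU};		/* already runs on CPU 1 */
	link_task(&a, &cpus[0], cpus);		/* try to link on CPU 0  */
	printf("task 1 linked on CPU %d\n", a.linked_on);	/* CPU 1 */
	return 0;
}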
++ */ ++ if (is_released(task) && get_rt_mode() == MODE_RT_RUN) ++ __add_ready(&gsnedf, task); ++ else { ++ /* it has got to wait */ ++ __add_release(&gsnedf, task); ++ } ++ ++ } else ++ /* this is a forced preemption ++ * thus the task stays in the ready_queue ++ * we only must make it available to others ++ */ ++ __add_ready(&gsnedf, task); ++} ++ ++/* gsnedf_job_arrival: task is either resumed or released */ ++static noinline void gsnedf_job_arrival(struct task_struct* task) ++{ ++ cpu_entry_t* last; ++ ++ BUG_ON(list_empty(&gsnedf_cpu_queue)); ++ BUG_ON(!task); ++ ++ /* first queue arriving job */ ++ requeue(task); ++ ++ /* then check for any necessary preemptions */ ++ last = list_entry(gsnedf_cpu_queue.prev, cpu_entry_t, list); ++ if (edf_preemption_needed(&gsnedf, last->linked)) { ++ /* preemption necessary */ ++ task = __take_ready(&gsnedf); ++ TRACE("job_arrival: task %d linked to %d\n", ++ task->pid, last->cpu); ++ if (last->linked) ++ requeue(last->linked); ++ ++ link_task_to_cpu(task, last); ++ preempt(last); ++ } ++} ++ ++/* check for current job releases */ ++static noinline void gsnedf_release_jobs(void) ++{ ++ struct list_head *pos, *save; ++ struct task_struct *queued; ++ ++ list_for_each_safe(pos, save, &gsnedf.release_queue) { ++ queued = list_entry(pos, struct task_struct, rt_list); ++ if (likely(is_released(queued))) { ++ /* this one is ready to go*/ ++ list_del(pos); ++ set_rt_flags(queued, RT_F_RUNNING); ++ ++ sched_trace_job_release(queued); ++ gsnedf_job_arrival(queued); ++ } ++ else ++ /* the release queue is ordered */ ++ break; ++ } ++} ++ ++/* gsnedf_scheduler_tick - this function is called for every local timer ++ * interrupt. ++ * ++ * checks whether the current task has expired and checks ++ * whether we need to preempt it if it has not expired ++ */ ++static reschedule_check_t gsnedf_scheduler_tick(void) ++{ ++ unsigned long flags; ++ struct task_struct* t = current; ++ reschedule_check_t want_resched = NO_RESCHED; ++ ++ /* expire tasks even if not in real-time mode ++ * this makes sure that at the end of real-time mode ++ * no task "runs away forever". ++ */ ++ if (is_realtime(t)) ++ TRACE_CUR("before dec: time_slice == %u\n", t->time_slice); ++ ++ if (is_realtime(t) && t->time_slice && !--t->time_slice) { ++ if (!is_np(t)) { /* np tasks will be preempted when they become ++ preemptable again */ ++ want_resched = FORCE_RESCHED; ++ set_will_schedule(); ++ TRACE("gsnedf_scheduler_tick: " ++ "%d is preemptable " ++ " => FORCE_RESCHED\n", t->pid); ++ } else { ++ TRACE("gsnedf_scheduler_tick: " ++ "%d is non-preemptable, " ++ "preemption delayed.\n", t->pid); ++ request_exit_np(t); ++ } ++ } ++ ++ /* only the first CPU needs to release jobs */ ++ if (get_rt_mode() == MODE_RT_RUN && smp_processor_id() == 0) { ++ queue_lock_irqsave(&gsnedf_lock, flags); ++ ++ /* (1) try to release pending jobs */ ++ gsnedf_release_jobs(); ++ ++ /* we don't need to check linked != scheduled since ++ * set_tsk_need_resched has been set by preempt() if necessary ++ */ ++ ++ queue_unlock_irqrestore(&gsnedf_lock, flags); ++ } ++ ++ return want_resched; ++} ++ ++/* caller holds gsnedf_lock */ ++static noinline void job_completion(struct task_struct *t) ++{ ++ BUG_ON(!t); ++ ++ sched_trace_job_completion(t); ++ ++ TRACE_TASK(t, "job_completion().\n"); ++ ++ /* set flags */ ++ set_rt_flags(t, RT_F_SLEEP); ++ /* prepare for next period */ ++ edf_prepare_for_next_period(t); ++ /* unlink */ ++ unlink(t); ++ /* requeue ++ * But don't requeue a blocking task. 
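/*
 * A sketch of the ordered release-queue scan in gsnedf_release_jobs()
 * above: because the queue is sorted by release time, the scan can
 * stop at the first job whose release still lies in the future. An
 * array stands in for the kernel's list_for_each_safe() walk.
 */
#include <stdio.h>

struct rel { int pid; long release_time; };

/* returns how many jobs at the front of the queue are released by 'now' */
static int pop_released(const struct rel *queue, int n, long now)
{
	int released = 0;
	for (int i = 0; i < n; i++) {
		if (queue[i].release_time > now)
			break;	/* queue is ordered: nothing later is due */
		released++;	/* would be moved to the ready queue here */
	}
	return released;
}

int main(void)
{
	struct rel q[] = { {1, 10}, {2, 20}, {3, 50} };
	printf("%d jobs released at t=25\n", pop_released(q, 3, 25)); /* 2 */
	return 0;
}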
*/ ++ if (is_running(t)) ++ gsnedf_job_arrival(t); ++} ++ ++ ++/* Getting schedule() right is a bit tricky. schedule() may not make any ++ * assumptions on the state of the current task since it may be called for a ++ * number of reasons. The reasons include a scheduler_tick() determined that it ++ * was necessary, because sys_exit_np() was called, because some Linux ++ * subsystem determined so, or even (in the worst case) because there is a bug ++ * hidden somewhere. Thus, we must take extreme care to determine what the ++ * current state is. ++ * ++ * The CPU could currently be scheduling a task (or not), be linked (or not). ++ * ++ * The following assertions for the scheduled task could hold: ++ * ++ * - !is_running(scheduled) // the job blocks ++ * - scheduled->timeslice == 0 // the job completed (forcefully) ++ * - get_rt_flag() == RT_F_SLEEP // the job completed (by syscall) ++ * - linked != scheduled // we need to reschedule (for any reason) ++ * - is_np(scheduled) // rescheduling must be delayed, ++ * sys_exit_np must be requested ++ * ++ * Any of these can occur together. ++ */ ++static int gsnedf_schedule(struct task_struct * prev, ++ struct task_struct ** next, ++ runqueue_t * rq) ++{ ++ cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries); ++ int out_of_time, sleep, preempt, np, exists, ++ rt, blocks; ++ struct task_struct* linked; ++ ++ /* Will be released in finish_switch. */ ++ queue_lock(&gsnedf_lock); ++ clear_will_schedule(); ++ ++ /* sanity checking */ ++ BUG_ON(entry->scheduled && entry->scheduled != prev); ++ BUG_ON(entry->scheduled && !is_realtime(prev)); ++ ++ /* (0) Determine state */ ++ exists = entry->scheduled != NULL; ++ blocks = exists && !is_running(entry->scheduled); ++ out_of_time = exists && !entry->scheduled->time_slice; ++ np = exists && is_np(entry->scheduled); ++ sleep = exists && get_rt_flags(entry->scheduled) == RT_F_SLEEP; ++ preempt = entry->scheduled != entry->linked; ++ rt = get_rt_mode() == MODE_RT_RUN; ++ ++ /* If a task blocks we have no choice but to reschedule. ++ */ ++ if (blocks) ++ unlink(entry->scheduled); ++ ++ /* Request a sys_exit_np() call if we would like to preempt but cannot. ++ * We need to make sure to update the link structure anyway in case ++ * that we are still linked. Multiple calls to request_exit_np() don't ++ * hurt. ++ */ ++ if (np && (out_of_time || preempt || sleep)) { ++ unlink(entry->scheduled); ++ request_exit_np(entry->scheduled); ++ } ++ ++ /* Any task that is preemptable and either exhausts its execution ++ * budget or wants to sleep completes. We may have to reschedule after ++ * this. ++ */ ++ if (!np && (out_of_time || sleep)) ++ job_completion(entry->scheduled); ++ ++ /* Stop real-time tasks when we leave real-time mode ++ */ ++ if (!rt && entry->linked) { ++ /* task will be preempted once it is preemptable ++ * (which it may be already) ++ */ ++ linked = entry->linked; ++ unlink(linked); ++ requeue(linked); ++ } ++ ++ /* Link pending task if we became unlinked. ++ */ ++ if (rt && !entry->linked) ++ link_task_to_cpu(__take_ready(&gsnedf), entry); ++ ++ /* The final scheduling decision. Do we need to switch for some reason? ++ * If linked different from scheduled select linked as next. ++ */ ++ if ((!np || blocks) && ++ entry->linked != entry->scheduled) { ++ /* Take care of a previously scheduled ++ * job by taking it out of the Linux runqueue. ++ */ ++ if (entry->scheduled) { ++ if (prev->array) ++ /* take it out of the run queue */ ++ deactivate_task(prev, rq); ++ } ++ ++ /* Schedule a linked job? 
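/*
 * A sketch of the state analysis at the top of gsnedf_schedule(): the
 * scheduler first derives a handful of booleans about the currently
 * scheduled job and then combines them into actions. The bitmask below
 * only illustrates that combination; it is not the kernel's actual
 * control flow, and the ACT_* names are invented for this sketch.
 */
#include <stdbool.h>
#include <stdio.h>

#define ACT_UNLINK        0x1	/* drop the job from the link structure */
#define ACT_REQUEST_EXIT  0x2	/* ask a non-preemptable section to end */
#define ACT_COMPLETE_JOB  0x4	/* budget exhausted or job asked to sleep */

static unsigned schedule_actions(bool exists, bool blocks, bool out_of_time,
				 bool np, bool sleep, bool preempt)
{
	unsigned act = 0;

	if (!exists)
		return 0;
	if (blocks)
		act |= ACT_UNLINK;
	if (np && (out_of_time || preempt || sleep))
		act |= ACT_UNLINK | ACT_REQUEST_EXIT;
	if (!np && (out_of_time || sleep))
		act |= ACT_COMPLETE_JOB;
	return act;
}

int main(void)
{
	/* preemptable job that ran out of budget: it completes (0x4) */
	printf("0x%x\n", schedule_actions(true, false, true,
					  false, false, true));
	return 0;
}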
*/ ++ if (entry->linked) { ++ *next = entry->linked; ++ /* mark the task as executing on this cpu */ ++ set_task_cpu(*next, smp_processor_id()); ++ /* stick the task into the runqueue */ ++ __activate_task(*next, rq); ++ } ++ } else ++ /* Only override Linux scheduler if we have real-time task ++ * scheduled that needs to continue. ++ */ ++ if (exists) ++ *next = prev; ++ ++ /* Unlock in case that we don't affect real-time tasks or ++ * if nothing changed and finish_switch won't be called. ++ */ ++ if (prev == *next || (!is_realtime(prev) && !*next)) ++ queue_unlock(&gsnedf_lock); ++ ++ return 0; ++} ++ ++ ++/* _finish_switch - we just finished the switch away from prev ++ */ ++static void gsnedf_finish_switch(struct task_struct *prev) ++{ ++ cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries); ++ ++ if (is_realtime(current)) ++ entry->scheduled = current; ++ else ++ entry->scheduled = NULL; ++ ++ prev->rt_param.scheduled_on = NO_CPU; ++ current->rt_param.scheduled_on = smp_processor_id(); ++ ++ /* unlock in case schedule() left it locked */ ++ if (is_realtime(current) || is_realtime(prev)) ++ queue_unlock(&gsnedf_lock); ++} ++ ++ ++/* Prepare a task for running in RT mode ++ * Enqueues the task into master queue data structure ++ * returns ++ * -EPERM if task is not TASK_STOPPED ++ */ ++static long gsnedf_prepare_task(struct task_struct * t) ++{ ++ unsigned long flags; ++ TRACE("gsn edf: prepare task %d\n", t->pid); ++ ++ if (t->state == TASK_STOPPED) { ++ __setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1); ++ ++ t->rt_param.scheduled_on = NO_CPU; ++ t->rt_param.linked_on = NO_CPU; ++ if (get_rt_mode() == MODE_RT_RUN) ++ /* The action is already on. ++ * Prepare immediate release ++ */ ++ edf_release_now(t); ++ /* The task should be running in the queue, otherwise signal ++ * code will try to wake it up with fatal consequences. ++ */ ++ t->state = TASK_RUNNING; ++ ++ queue_lock_irqsave(&gsnedf_lock, flags); ++ requeue(t); ++ queue_unlock_irqrestore(&gsnedf_lock, flags); ++ return 0; ++ } ++ else ++ return -EPERM; ++} ++ ++static void gsnedf_wake_up_task(struct task_struct *task) ++{ ++ unsigned long flags; ++ /* We must determine whether task should go into the release ++ * queue or into the ready queue. It may enter the ready queue ++ * if it has credit left in its time slice and has not yet reached ++ * its deadline. If it is now passed its deadline we assume this the ++ * arrival of a new sporadic job and thus put it in the ready queue ++ * anyway.If it has zero budget and the next release is in the future ++ * it has to go to the release queue. ++ */ ++ TRACE("gsnedf: %d unsuspends with budget=%d\n", ++ task->pid, task->time_slice); ++ task->state = TASK_RUNNING; ++ ++ /* We need to take suspensions because of semaphores into ++ * account! If a job resumes after being suspended due to acquiring ++ * a semaphore, it should never be treated as a new job release. 
++ */ ++ if (get_rt_flags(task) == RT_F_EXIT_SEM) { ++ set_rt_flags(task, RT_F_RUNNING); ++ } else { ++ if (is_tardy(task)) { ++ /* new sporadic release */ ++ edf_release_now(task); ++ sched_trace_job_release(task); ++ } ++ else if (task->time_slice) ++ /* came back in time before deadline ++ */ ++ set_rt_flags(task, RT_F_RUNNING); ++ } ++ ++ queue_lock_irqsave(&gsnedf_lock, flags); ++ gsnedf_job_arrival(task); ++ queue_unlock_irqrestore(&gsnedf_lock, flags); ++} ++ ++static void gsnedf_task_blocks(struct task_struct *t) ++{ ++ unsigned long flags; ++ ++ /* unlink if necessary */ ++ queue_lock_irqsave(&gsnedf_lock, flags); ++ unlink(t); ++ queue_unlock_irqrestore(&gsnedf_lock, flags); ++ ++ BUG_ON(!is_realtime(t)); ++ TRACE("task %d suspends with budget=%d\n", t->pid, t->time_slice); ++ BUG_ON(t->rt_list.next != LIST_POISON1); ++ BUG_ON(t->rt_list.prev != LIST_POISON2); ++} ++ ++ ++/* When _tear_down is called, the task should not be in any queue any more ++ * as it must have blocked first. We don't have any internal state for the task, ++ * it is all in the task_struct. ++ */ ++static long gsnedf_tear_down(struct task_struct * t) ++{ ++ BUG_ON(!is_realtime(t)); ++ TRACE_TASK(t, "RIP\n"); ++ BUG_ON(t->array); ++ BUG_ON(t->rt_list.next != LIST_POISON1); ++ BUG_ON(t->rt_list.prev != LIST_POISON2); ++ return 0; ++} ++ ++static long gsnedf_pi_block(struct pi_semaphore *sem, ++ struct task_struct *new_waiter) ++{ ++ /* This callback has to handle the situation where a new waiter is ++ * added to the wait queue of the semaphore. ++ * ++ * We must check if has a higher priority than the currently ++ * highest-priority task, and then potentially reschedule. ++ */ ++ ++ BUG_ON(!new_waiter); ++ ++ if (edf_higher_prio(new_waiter, sem->hp.task)) { ++ TRACE_TASK(new_waiter, " boosts priority\n"); ++ /* called with IRQs disabled */ ++ queue_lock(&gsnedf_lock); ++ /* store new highest-priority task */ ++ sem->hp.task = new_waiter; ++ if (sem->holder) { ++ /* let holder inherit */ ++ sem->holder->rt_param.inh_task = new_waiter; ++ unlink(sem->holder); ++ gsnedf_job_arrival(sem->holder); ++ } ++ queue_unlock(&gsnedf_lock); ++ } ++ ++ return 0; ++} ++ ++static long gsnedf_inherit_priority(struct pi_semaphore *sem, ++ struct task_struct *new_owner) ++{ ++ /* We don't need to acquire the gsnedf_lock since at the time of this ++ * call new_owner isn't actually scheduled yet (it's still sleeping) ++ * and since the calling function already holds sem->wait.lock, which ++ * prevents concurrent sem->hp.task changes. ++ */ ++ ++ if (sem->hp.task && sem->hp.task != new_owner) { ++ new_owner->rt_param.inh_task = sem->hp.task; ++ TRACE_TASK(new_owner, "inherited priority from %s/%d\n", ++ sem->hp.task->comm, sem->hp.task->pid); ++ } else ++ TRACE_TASK(new_owner, ++ "cannot inherit priority, " ++ "no higher priority job waits.\n"); ++ return 0; ++} ++ ++/* This function is called on a semaphore release, and assumes that ++ * the current task is also the semaphore holder. ++ */ ++static long gsnedf_return_priority(struct pi_semaphore *sem) ++{ ++ struct task_struct* t = current; ++ int ret = 0; ++ ++ /* Find new highest-priority semaphore task ++ * if holder task is the current hp.task. ++ * ++ * Calling function holds sem->wait.lock. ++ */ ++ if (t == sem->hp.task) ++ edf_set_hp_task(sem); ++ ++ TRACE_CUR("gsnedf_return_priority for lock %p\n", sem); ++ ++ if (t->rt_param.inh_task) { ++ /* interrupts already disabled by PI code */ ++ queue_lock(&gsnedf_lock); ++ ++ /* Reset inh_task to NULL. 
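/*
 * A sketch of the priority-inheritance update in gsnedf_pi_block()
 * above: when a waiter with an earlier deadline arrives at a
 * semaphore, it becomes the recorded highest-priority waiter and the
 * current holder inherits its priority. Deadlines model EDF priority;
 * struct job and struct sem are simplified stand-ins.
 */
#include <stdio.h>

struct job { int pid; long deadline; long inherited_deadline; /* 0 = none */ };
struct sem { struct job *holder; struct job *hp_waiter; };

static void pi_block(struct sem *s, struct job *new_waiter)
{
	if (!s->hp_waiter || new_waiter->deadline < s->hp_waiter->deadline) {
		s->hp_waiter = new_waiter;
		if (s->holder)	/* holder now runs with the waiter's priority */
			s->holder->inherited_deadline = new_waiter->deadline;
	}
}

int main(void)
{
	struct job holder = {1, 500, 0}, waiter = {2, 100, 0};
	struct sem s = { &holder, NULL };
	pi_block(&s, &waiter);
	printf("holder now runs with deadline %ld\n",
	       holder.inherited_deadline);	/* 100 */
	return 0;
}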
*/ ++ t->rt_param.inh_task = NULL; ++ ++ /* Check if rescheduling is necessary */ ++ unlink(t); ++ gsnedf_job_arrival(t); ++ queue_unlock(&gsnedf_lock); ++ } ++ ++ return ret; ++} ++ ++static int gsnedf_mode_change(int new_mode) ++{ ++ unsigned long flags; ++ int cpu; ++ cpu_entry_t *entry; ++ ++ if (new_mode == MODE_RT_RUN) { ++ queue_lock_irqsave(&gsnedf_lock, flags); ++ ++ __rerelease_all(&gsnedf, edf_release_at); ++ ++ /* get old cruft out of the way in case we reenter real-time ++ * mode for a second time ++ */ ++ while (!list_empty(&gsnedf_cpu_queue)) ++ list_del(gsnedf_cpu_queue.next); ++ /* reinitialize */ ++ for_each_online_cpu(cpu) { ++ entry = &per_cpu(gsnedf_cpu_entries, cpu); ++ atomic_set(&entry->will_schedule, 0); ++ entry->linked = NULL; ++ entry->scheduled = NULL; ++ list_add(&entry->list, &gsnedf_cpu_queue); ++ } ++ ++ queue_unlock_irqrestore(&gsnedf_lock, flags); ++ ++ } ++ return 0; ++} ++ ++ ++/* Plugin object */ ++static sched_plugin_t s_plugin __cacheline_aligned_in_smp = { ++ .ready_to_use = 0 ++}; ++ ++ ++/* ++ * Plugin initialization code. ++ */ ++#define INIT_SCHED_PLUGIN (struct sched_plugin){ \ ++ .plugin_name = "GSN-EDF", \ ++ .ready_to_use = 1, \ ++ .scheduler_tick = gsnedf_scheduler_tick, \ ++ .prepare_task = gsnedf_prepare_task, \ ++ .sleep_next_period = edf_sleep_next_period, \ ++ .tear_down = gsnedf_tear_down, \ ++ .schedule = gsnedf_schedule, \ ++ .finish_switch = gsnedf_finish_switch, \ ++ .mode_change = gsnedf_mode_change, \ ++ .wake_up_task = gsnedf_wake_up_task, \ ++ .task_blocks = gsnedf_task_blocks, \ ++ .inherit_priority = gsnedf_inherit_priority, \ ++ .return_priority = gsnedf_return_priority, \ ++ .pi_block = gsnedf_pi_block \ ++} ++ ++ ++sched_plugin_t *__init init_gsn_edf_plugin(void) ++{ ++ int cpu; ++ cpu_entry_t *entry; ++ ++ if (!s_plugin.ready_to_use) ++ { ++ /* initialize CPU state */ ++ for (cpu = 0; cpu < NR_CPUS; cpu++) { ++ entry = &per_cpu(gsnedf_cpu_entries, cpu); ++ atomic_set(&entry->will_schedule, 0); ++ entry->linked = NULL; ++ entry->scheduled = NULL; ++ entry->cpu = cpu; ++ } ++ ++ queue_lock_init(&gsnedf_lock); ++ edf_domain_init(&gsnedf, NULL); ++ s_plugin = INIT_SCHED_PLUGIN; ++ } ++ return &s_plugin; ++} ++ ++ +diff --git a/kernel/sched_part_edf.c b/kernel/sched_part_edf.c +new file mode 100644 +index 0000000..a792ac5 +--- /dev/null ++++ b/kernel/sched_part_edf.c +@@ -0,0 +1,340 @@ ++/* ++ * kernel/sched_part_edf.c ++ * ++ * Implementation of the partitioned EDF scheduler plugin. ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++ ++typedef struct { ++ rt_domain_t domain; ++ int cpu; ++ struct task_struct* scheduled; /* only RT tasks */ ++ spinlock_t lock; ++} part_edf_domain_t; ++ ++ ++#define local_edf (&__get_cpu_var(part_edf_domains).domain) ++#define local_pedf (&__get_cpu_var(part_edf_domains)) ++#define remote_edf(cpu) (&per_cpu(part_edf_domains, cpu).domain) ++#define remote_pedf(cpu) (&per_cpu(part_edf_domains, cpu)) ++#define task_edf(task) remote_edf(get_partition(task)) ++ ++static void part_edf_domain_init(part_edf_domain_t* pedf, ++ check_resched_needed_t check, ++ int cpu) ++{ ++ edf_domain_init(&pedf->domain, check); ++ pedf->cpu = cpu; ++ pedf->lock = SPIN_LOCK_UNLOCKED; ++ pedf->scheduled = NULL; ++} ++ ++DEFINE_PER_CPU(part_edf_domain_t, part_edf_domains); ++ ++/* This check is trivial in partioned systems as we only have to consider ++ * the CPU of the partition. 
++ * ++ */ ++static int part_edf_check_resched(rt_domain_t *edf) ++{ ++ part_edf_domain_t *pedf = container_of(edf, part_edf_domain_t, domain); ++ int ret = 0; ++ ++ spin_lock(&pedf->lock); ++ ++ /* because this is a callback from rt_domain_t we already hold ++ * the necessary lock for the ready queue ++ */ ++ if (edf_preemption_needed(edf, pedf->scheduled)) { ++ if (pedf->cpu == smp_processor_id()) ++ set_tsk_need_resched(current); ++ else ++ smp_send_reschedule(pedf->cpu); ++ ret = 1; ++ } ++ spin_unlock(&pedf->lock); ++ return ret; ++} ++ ++ ++static reschedule_check_t part_edf_scheduler_tick(void) ++{ ++ unsigned long flags; ++ struct task_struct *t = current; ++ reschedule_check_t want_resched = NO_RESCHED; ++ rt_domain_t *edf = local_edf; ++ part_edf_domain_t *pedf = local_pedf; ++ ++ /* Check for inconsistency. We don't need the lock for this since ++ * ->scheduled is only changed in schedule, which obviously is not ++ * executing in parallel on this CPU ++ */ ++ BUG_ON(is_realtime(t) && t != pedf->scheduled); ++ ++ /* expire tasks even if not in real-time mode ++ * this makes sure that at the end of real-time mode ++ * no tasks "run away forever". ++ */ ++ if (is_realtime(t) && (!--t->time_slice)) { ++ /* this task has exhausted its budget in this period */ ++ set_rt_flags(t, RT_F_SLEEP); ++ want_resched = FORCE_RESCHED; ++ } ++ if (get_rt_mode() == MODE_RT_RUN) ++ { ++ /* check whether anything is waiting to be released ++ * this could probably be moved to the global timer ++ * interrupt handler since the state will only change ++ * once per jiffie ++ */ ++ try_release_pending(edf); ++ if (want_resched != FORCE_RESCHED) ++ { ++ read_lock_irqsave(&edf->ready_lock, flags); ++ if (edf_preemption_needed(edf, t)) ++ want_resched = FORCE_RESCHED; ++ read_unlock_irqrestore(&edf->ready_lock, flags); ++ } ++ } ++ return want_resched; ++} ++ ++static int part_edf_schedule(struct task_struct * prev, ++ struct task_struct ** next, ++ runqueue_t * rq) ++{ ++ int need_deactivate = 1; ++ part_edf_domain_t* pedf = local_pedf; ++ rt_domain_t* edf = &pedf->domain; ++ ++ ++ if (is_realtime(prev) && get_rt_flags(prev) == RT_F_SLEEP) ++ edf_prepare_for_next_period(prev); ++ ++ if (get_rt_mode() == MODE_RT_RUN) { ++ write_lock(&edf->ready_lock); ++ if (is_realtime(prev) && is_released(prev) && is_running(prev) ++ && !edf_preemption_needed(edf, prev)) { ++ /* this really should only happen if the task has ++ * 100% utilization... ++ */ ++ TRACE("prev will be next, already released\n"); ++ *next = prev; ++ need_deactivate = 0; ++ } else { ++ /* either not yet released, preempted, or non-rt */ ++ *next = __take_ready(edf); ++ if (*next) { ++ /* stick the task into the runqueue */ ++ __activate_task(*next, rq); ++ set_task_cpu(*next, smp_processor_id()); ++ } ++ } ++ spin_lock(&pedf->lock); ++ pedf->scheduled = *next; ++ spin_unlock(&pedf->lock); ++ if (*next) ++ set_rt_flags(*next, RT_F_RUNNING); ++ ++ write_unlock(&edf->ready_lock); ++ } ++ ++ if (is_realtime(prev) && need_deactivate && prev->array) { ++ /* take it out of the run queue */ ++ deactivate_task(prev, rq); ++ } ++ ++ return 0; ++} ++ ++ ++static void part_edf_finish_switch(struct task_struct *prev) ++{ ++ rt_domain_t* edf = local_edf; ++ ++ if (!is_realtime(prev) || !is_running(prev)) ++ return; ++ ++ if (get_rt_flags(prev) == RT_F_SLEEP || ++ get_rt_mode() != MODE_RT_RUN) { ++ /* this task has expired ++ * _schedule has already taken care of updating ++ * the release and ++ * deadline. We just must check if has been released. 
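/*
 * A rough sketch of the per-partition preemption test used by
 * part_edf_check_resched() above: only this partition's ready queue
 * and its single scheduled task matter. The helper below only
 * approximates what edf_preemption_needed() is used for here; the real
 * helper is defined elsewhere in the patch.
 */
#include <stdbool.h>
#include <stdio.h>

/* a deadline of 0 means "no such job" in this sketch */
static bool partition_needs_resched(long head_deadline, long sched_deadline)
{
	if (head_deadline == 0)
		return false;		/* nothing waiting to run        */
	if (sched_deadline == 0)
		return true;		/* CPU runs no real-time job     */
	return head_deadline < sched_deadline;	/* plain EDF comparison  */
}

int main(void)
{
	printf("%d\n", partition_needs_resched(100, 300)); /* 1: preempt */
	printf("%d\n", partition_needs_resched(0, 300));   /* 0          */
	return 0;
}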
++ */ ++ if (is_released(prev) && get_rt_mode() == MODE_RT_RUN) { ++ /* already released */ ++ add_ready(edf, prev); ++ TRACE("%d goes straight to ready queue\n", prev->pid); ++ } else ++ /* it has got to wait */ ++ add_release(edf, prev); ++ } else { ++ /* this is a forced preemption ++ * thus the task stays in the ready_queue ++ * we only must make it available to others ++ */ ++ add_ready(edf, prev); ++ } ++} ++ ++ ++/* Prepare a task for running in RT mode ++ * Enqueues the task into master queue data structure ++ * returns ++ * -EPERM if task is not TASK_STOPPED ++ */ ++static long part_edf_prepare_task(struct task_struct * t) ++{ ++ rt_domain_t* edf = task_edf(t); ++ ++ ++ TRACE("[%d] part edf: prepare task %d on CPU %d\n", ++ smp_processor_id(), t->pid, get_partition(t)); ++ if (t->state == TASK_STOPPED) { ++ __setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1); ++ ++ if (get_rt_mode() == MODE_RT_RUN) ++ /* The action is already on. ++ * Prepare immediate release. ++ */ ++ edf_release_now(t); ++ /* The task should be running in the queue, otherwise signal ++ * code will try to wake it up with fatal consequences. ++ */ ++ t->state = TASK_RUNNING; ++ add_release(edf, t); ++ return 0; ++ } else ++ return -EPERM; ++} ++ ++static void part_edf_wake_up_task(struct task_struct *task) ++{ ++ rt_domain_t* edf; ++ ++ edf = task_edf(task); ++ ++ /* We must determine whether task should go into the release ++ * queue or into the ready queue. It may enter the ready queue ++ * if it has credit left in its time slice and has not yet reached ++ * its deadline. If it is now passed its deadline we assume this the ++ * arrival of a new sporadic job and thus put it in the ready queue ++ * anyway.If it has zero budget and the next release is in the future ++ * it has to go to the release queue. ++ */ ++ TRACE("part edf: wake up %d with budget=%d for cpu %d\n", ++ task->pid, task->time_slice, get_partition(task)); ++ task->state = TASK_RUNNING; ++ if (is_tardy(task)) { ++ /* new sporadic release */ ++ edf_release_now(task); ++ add_ready(edf, task); ++ ++ } else if (task->time_slice) { ++ /* Came back in time before deadline. This may cause ++ * deadline overruns, but since we don't handle suspensions ++ * in the analytical model, we don't care since we can't ++ * guarantee anything at all if tasks block. ++ */ ++ set_rt_flags(task, RT_F_RUNNING); ++ add_ready(edf, task); ++ ++ } else { ++ add_release(edf, task); ++ } ++ ++} ++ ++static void part_edf_task_blocks(struct task_struct *t) ++{ ++ BUG_ON(!is_realtime(t)); ++ /* not really anything to do since it can only block if ++ * it is running, and when it is not running it is not in any ++ * queue anyway. ++ * ++ */ ++ TRACE("task %d blocks with budget=%d\n", t->pid, t->time_slice); ++ BUG_ON(in_list(&t->rt_list)); ++} ++ ++ ++/* When _tear_down is called, the task should not be in any queue any more ++ * as it must have blocked first. We don't have any internal state for the task, ++ * it is all in the task_struct. 
++ */ ++static long part_edf_tear_down(struct task_struct * t) ++{ ++ BUG_ON(!is_realtime(t)); ++ TRACE("part edf: tear down called for %d \n", t->pid); ++ BUG_ON(t->array); ++ BUG_ON(in_list(&t->rt_list)); ++ return 0; ++} ++ ++ ++static int part_edf_mode_change(int new_mode) ++{ ++ int cpu; ++ ++ if (new_mode == MODE_RT_RUN) ++ for_each_online_cpu(cpu) ++ rerelease_all(remote_edf(cpu), edf_release_at); ++ TRACE("[%d] part edf: mode changed to %d\n", ++ smp_processor_id(), new_mode); ++ return 0; ++} ++ ++ ++/* Plugin object */ ++static sched_plugin_t s_plugin __cacheline_aligned_in_smp = { ++ .ready_to_use = 0 ++}; ++ ++ ++/* ++ * Plugin initialization code. ++ */ ++#define INIT_SCHED_PLUGIN (struct sched_plugin) {\ ++ .plugin_name = "Partitioned EDF",\ ++ .ready_to_use = 1,\ ++ .scheduler_tick = part_edf_scheduler_tick,\ ++ .prepare_task = part_edf_prepare_task,\ ++ .sleep_next_period = edf_sleep_next_period,\ ++ .tear_down = part_edf_tear_down,\ ++ .schedule = part_edf_schedule,\ ++ .finish_switch = part_edf_finish_switch,\ ++ .mode_change = part_edf_mode_change,\ ++ .wake_up_task = part_edf_wake_up_task,\ ++ .task_blocks = part_edf_task_blocks \ ++} ++ ++ ++sched_plugin_t *__init init_part_edf_plugin(void) ++{ ++ int i; ++ ++ if (!s_plugin.ready_to_use) ++ { ++ for (i = 0; i < NR_CPUS; i++) ++ { ++ part_edf_domain_init(remote_pedf(i), ++ part_edf_check_resched, i); ++ printk("CPU partition %d initialized.", i); ++ } ++ s_plugin = INIT_SCHED_PLUGIN; ++ } ++ return &s_plugin; ++} ++ ++ ++ +diff --git a/kernel/sched_pfair.c b/kernel/sched_pfair.c +new file mode 100644 +index 0000000..1a6a790 +--- /dev/null ++++ b/kernel/sched_pfair.c +@@ -0,0 +1,503 @@ ++/* ++ * ++ * Implementation of synchronized PFAIR PD2 scheduler ++ * ++ */ ++ ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++ ++struct cpu_state { ++ struct task_struct * t; ++ volatile jiffie_t jiffie_marker; ++}; ++/* PFAIR scheduling domain, release and ready queues */ ++static pfair_domain_t pfair __cacheline_aligned_in_smp; ++ ++/* An indicator that quantum boundary was crossed ++ * and a decision has to be made ++ */ ++static int sync_go[NR_CPUS]; ++ ++ ++/* A collection of CPU states protected by pfair lock */ ++DEFINE_PER_CPU(struct cpu_state, states); ++ ++/* ++ * This function gets called by the timer code, with HZ frequency ++ * with interrupts disabled. ++ * ++ * The function merges the release queue with the ready queue ++ * and indicates that quantum boundary was crossed. ++ * ++ * It also suggests to schedule off currently running ++ * real-time task if the mode is non-real-time. 
++ */ ++static reschedule_check_t pfair_scheduler_tick(void) ++{ ++ int want_resched = NO_RESCHED; ++ sync_go[smp_processor_id()] = 0; ++ if (!cpu_isset(smp_processor_id(), pfair.domain_cpus)) ++ goto out; ++ /* Now determine if we want current task to be preempted */ ++ if (get_rt_mode() == MODE_RT_RUN) { ++ pfair_try_release_pending(&pfair); ++ want_resched = FORCE_RESCHED; ++ /* indicate that the interrupt fired */ ++ sync_go[smp_processor_id()] = 1; ++ barrier(); ++ } else if (is_realtime(current) && is_running(current)) { ++ /* In non real-time mode we want to ++ * schedule off real-time tasks */ ++ want_resched = FORCE_RESCHED; ++ } else if (is_realtime(current) && !is_running(current)) { ++ TRACE("[%d] %d Timer interrupt on not runninng %d\n", ++ smp_processor_id(), ++ jiffies-rt_start_time, current->pid); ++ } ++out: ++ return want_resched; ++} ++ ++/** ++ * This function is called by the processor ++ * that performs rescheduling. It saves the timing ++ * parameters of currently running jobs that were not rescheduled yet ++ * and releases next subtask for these jobs placing them into ++ * release and ready queues. ++ */ ++static void pretend_release(cpumask_t p) ++{ ++ int i = 0; ++ struct task_struct * t = NULL; ++ /* for all the tasks increment the number of used quanta ++ * and release next subtask or job depending on the number ++ * of used quanta ++ */ ++ for_each_cpu_mask(i, p) { ++ t = per_cpu(states, i).t; ++ if (t != NULL) { ++ backup_times(t); ++ inc_passed_quanta(t); ++ if ( get_passed_quanta(t) == get_exec_cost(t)) { ++ pfair_prepare_next_job(t); ++ } else { ++ pfair_prepare_next_subtask(t); ++ } ++ /* ++ TRACE("[%d] %d pretending release %d with (%d, %d)\n", ++ smp_processor_id(), ++ jiffies-rt_start_time,t->pid, ++ get_release(t)-rt_start_time, ++ get_deadline(t)-rt_start_time);*/ ++ /* detect if the job or subtask has to be released now*/ ++ if (time_before_eq(get_release(t), jiffies)) ++ pfair_add_ready(&pfair, t); ++ else ++ pfair_add_release(&pfair, t); ++ } ++ } ++} ++/* ++ * Rollback the the pretended release of tasks. ++ * Timing parameters are restored and tasks are removed ++ * from the queues as it was before calling the schedule() function. ++ * ++ */ ++static void rollback_release(cpumask_t p) ++{ ++ int i = -1; ++ struct task_struct * t = NULL; ++ /* ++ * Rollback the pretended changes ++ */ ++ for_each_cpu_mask(i, p) { ++ t = per_cpu(states, i).t; ++ if (t != NULL) { ++ restore_times(t); ++ if(t->rt_list.prev != LIST_POISON1 || ++ t->rt_list.next != LIST_POISON2) { ++ /* Delete the task from a queue */ ++ list_del(&t->rt_list); ++ } ++ } ++ } ++} ++ ++/* ++ * The procedure creates a list of cpu's whose tasks have not been ++ * rescheduled yet. These are CPU's with jiffie marker different from ++ * the value of jiffies. ++ */ ++static void find_participants(cpumask_t * target) ++{ ++ cpumask_t res;int i; ++ cpus_clear(res); ++ for_each_online_cpu(i) { ++ if(per_cpu(states, i).jiffie_marker != jiffies) ++ cpu_set(i, res); ++ } ++ /* Examine only cpus in the domain */ ++ cpus_and(res, pfair.domain_cpus, res); ++ (*target) = res; ++} ++ ++/* ++ * This is main PFAIR schedule function, ++ * each processor pretends that some currently running tasks are ++ * released in the next quantum and determines whether it should ++ * keep the task that is currently running (this is usually the case ++ * for heavy tasks). 
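/*
 * A sketch of the pretend-release / rollback pair used by the PFAIR
 * code above: before a global decision, each participating task's
 * timing parameters are backed up and speculatively advanced; if the
 * decision keeps the task running, the backup is restored. The window
 * arithmetic below (fixed period of 10, deadline + 1 per subtask) is a
 * deliberate simplification of PD2, and struct timing is a stand-in
 * for the kernel's rt_param timing fields.
 */
#include <stdio.h>

struct timing { long release; long deadline; unsigned passed_quanta; };

static void pretend_release(struct timing *t, struct timing *backup,
			    unsigned exec_cost)
{
	*backup = *t;			/* backup_times()               */
	t->passed_quanta++;		/* inc_passed_quanta()          */
	if (t->passed_quanta == exec_cost) {
		t->release += 10;	/* next job (period = 10 here)  */
		t->deadline += 10;
		t->passed_quanta = 0;
	} else {
		t->deadline += 1;	/* next subtask window          */
	}
}

static void rollback_release(struct timing *t, const struct timing *backup)
{
	*t = *backup;			/* restore_times()              */
}

int main(void)
{
	struct timing t = { 0, 2, 0 }, saved;
	pretend_release(&t, &saved, 3);
	rollback_release(&t, &saved);
	printf("deadline back to %ld\n", t.deadline);	/* 2 */
	return 0;
}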
++*/ ++static int pfair_schedule(struct task_struct *prev, ++ struct task_struct **next, ++ runqueue_t * rq) ++{ ++ int cpu =-1; ++ int k =-1; ++ int need_deactivate = 1; ++ int keep =0; ++ unsigned long flags; ++ cpumask_t participants; ++ /* A temporary array */ ++ struct task_struct * rs_old_ptr[NR_CPUS]; ++ ++ *next = NULL; ++ cpu = smp_processor_id(); ++ /* CPU's not in the domain just bypass */ ++ if (!cpu_isset(cpu, pfair.domain_cpus)) { ++ goto out; ++ } ++ queue_lock_irqsave(&pfair.pfair_lock, flags); ++ ++ /* If we happen to run in non-realtime mode ++ * then we have to schedule off currently running tasks ++ * */ ++ if (get_rt_mode() != MODE_RT_RUN) { ++ if (is_realtime(prev)) { ++ per_cpu(states, cpu).t = NULL; ++ TRACE("[%d] %d Suspending %d\n", ++ cpu, jiffies - rt_start_time, ++ prev->pid); ++ /* Move the task to the ++ * release queue for future runs ++ * FIXME: Do something smarter. ++ * For example create a set where ++ * prepared or inactive tasks are placed ++ * and then released. ++ * */ ++ set_release(prev, get_release(prev) + 1000); ++ pfair_add_release(&pfair, prev); ++ } ++ goto out_deactivate; ++ } ++ /* If the current task stops or dies */ ++ if (is_realtime(prev) && !is_running(prev)) { ++ /* remove it from the running set */ ++ per_cpu(states, cpu).t = NULL; ++ } ++ /* Make pfair decisions at quantum boundaries only, ++ * but schedule off stopped or dead tasks */ ++ ++ if ((sync_go[cpu]--) != 1) ++ goto out_deactivate; ++ ++ /*TRACE("[%d] %d Scheduler activation", cpu, jiffies-rt_start_time); ++ cpus_and(res, pfair.domain_cpus, cpu_online_map); ++ for_each_cpu_mask(k, res) { ++ TRACE("%d" ,(per_cpu(states, k).jiffie_marker!=jiffies)); ++ } ++ TRACE("\n");*/ ++ ++ /* Find processors that have not rescheduled yet */ ++ find_participants(&participants); ++ /* For each task on remote cpu's pretend release */ ++ pretend_release(participants); ++ /* Clear temporary array */ ++ for_each_possible_cpu(k) { rs_old_ptr[k] = NULL; } ++ /* Select a new subset of eligible tasks */ ++ for_each_cpu_mask(k, participants) { ++ rs_old_ptr[k] = __pfair_take_ready (&pfair); ++ /* Check if our current task must be scheduled in the next quantum */ ++ if (rs_old_ptr[k] == per_cpu(states, cpu).t) { ++ /* this is our current task, keep it */ ++ *next = per_cpu(states, cpu).t; ++ need_deactivate = 0; ++ keep = 1; ++ break; ++ } ++ } ++ /* Put all the extracted tasks back into the ready queue */ ++ for_each_cpu_mask(k, participants) { ++ if (rs_old_ptr[k] != NULL){ ++ pfair_add_ready(&pfair, rs_old_ptr[k]); ++ rs_old_ptr[k] = NULL; ++ } ++ } ++ /* Rollback the pretended release, ++ * task parameters are restored and running tasks are removed ++ * from queues */ ++ rollback_release(participants); ++ /* ++ * If the current task is not scheduled in the next quantum ++ * then select a new pfair task ++ */ ++ if(!keep) { ++ *next = per_cpu(states, cpu).t = __pfair_take_ready(&pfair); ++ if (*next != NULL) { ++ /*TRACE("[%d] %d Scheduling %d with (%d, %d)\n", ++ cpu, jiffies-rt_start_time, ++ get_release(*next), ++ get_deadline(*next)); ++ */ ++ set_task_cpu(*next, cpu); ++ __activate_task(*next, rq); ++ } ++ } else { ++ if (is_realtime(prev)) { ++ /*TRACE("[%d] %d prev==next %d\n", ++ cpu,jiffies-rt_start_time, ++ (prev)->pid);*/ ++ ++ /* The task will not be switched off but we ++ * need to track the execution time ++ */ ++ inc_passed_quanta(prev); ++ } ++ } ++ ++ /*Show that our task does not participate in subsequent selections*/ ++ __get_cpu_var(states).jiffie_marker = jiffies; ++ 
++out_deactivate: ++ if ( is_realtime(prev) && need_deactivate && prev->array) { ++ /* take prev out of the linux run queue */ ++ deactivate_task(prev, rq); ++ } ++ queue_unlock_irqrestore(&pfair.pfair_lock, flags); ++out: ++ return 0; ++} ++ ++static void pfair_finish_task_switch(struct task_struct *t) ++{ ++ if (!is_realtime(t) || !is_running(t)) ++ return; ++ ++ queue_lock(&pfair.pfair_lock); ++ /* Release in real-time mode only, ++ * if the mode is non real-time, then ++ * the task is already in the release queue ++ * with the time far in the future ++ */ ++ if (get_rt_mode() == MODE_RT_RUN) { ++ inc_passed_quanta(t); ++ if ( get_passed_quanta(t) == get_exec_cost(t)) { ++ sched_trace_job_completion(t); ++ pfair_prepare_next_job(t); ++ } else { ++ pfair_prepare_next_subtask(t); ++ } ++ /*TRACE("[%d] %d releasing %d with (%d, %d)\n", ++ smp_processor_id(), ++ jiffies-rt_start_time, ++ t->pid, ++ get_release(t)-rt_start_time, ++ get_deadline(t)-rt_start_time);*/ ++ if (time_before_eq(get_release(t), jiffies)) ++ pfair_add_ready(&pfair, t); ++ else ++ pfair_add_release(&pfair, t); ++ } ++ queue_unlock(&pfair.pfair_lock); ++} ++ ++/* Prepare a task for running in RT mode ++ * Enqueues the task into master queue data structure ++ * returns ++ * -EPERM if task is not TASK_STOPPED ++ */ ++static long pfair_prepare_task(struct task_struct * t) ++{ ++ unsigned long flags; ++ TRACE("pfair: prepare task %d\n", t->pid); ++ if (t->state == TASK_STOPPED) { ++ __setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1); ++ ++ if (get_rt_mode() == MODE_RT_RUN) ++ /* The action is already on. ++ * Prepare immediate release ++ */ ++ __pfair_prepare_new_release(t, jiffies); ++ /* The task should be running in the queue, otherwise signal ++ * code will try to wake it up with fatal consequences. ++ */ ++ t->state = TASK_RUNNING; ++ queue_lock_irqsave(&pfair.pfair_lock, flags); ++ pfair_add_release(&pfair, t); ++ queue_unlock_irqrestore(&pfair.pfair_lock, flags); ++ return 0; ++ } else ++ return -EPERM; ++} ++ ++ ++ ++static void pfair_wake_up_task(struct task_struct *task) ++{ ++ ++ unsigned long flags; ++ ++ /* We must determine whether task should go into the release ++ * queue or into the ready queue. ++ * The task enters the ready queue if the previous deadline was missed, ++ * so we treat the invoked job as a new sporadic release. ++ * ++ * The job can also enter the ready queue if it was invoked before its ++ * global deadline, but its budjet must be clipped down to one quantum ++ */ ++ task->state = TASK_RUNNING; ++ if (time_after_eq(jiffies, task->rt_param.times.last_release ++ + get_rt_period(task))) { ++ /* new sporadic release */ ++ TRACE("[%d] Sporadic release of %d at %d\n", ++ smp_processor_id(), ++ jiffies-rt_start_time, ++ task->pid); ++ __pfair_prepare_new_release(task, jiffies); ++ queue_lock_irqsave(&pfair.pfair_lock, flags); ++ sched_trace_job_release(task); ++ pfair_add_ready(&pfair, task); ++ queue_unlock_irqrestore(&pfair.pfair_lock, flags); ++ } else if (task->time_slice) { ++ /* came back in time before deadline ++ * clip the budget to be the last subtask of a job or ++ * the new job. 
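/*
 * A sketch of the per-quantum bookkeeping in pfair_finish_task_switch()
 * above: after a quantum of service the task either releases its next
 * job (all quanta of the current job consumed) or only its next
 * subtask, and then lands in the ready or release queue depending on
 * whether that release time has passed. Simplified types; not the
 * kernel code.
 */
#include <stdio.h>

enum next { NEXT_JOB, NEXT_SUBTASK };

static enum next after_quantum(unsigned passed_quanta, unsigned exec_cost)
{
	/* one more quantum of the current job has just been consumed */
	return (passed_quanta + 1 == exec_cost) ? NEXT_JOB : NEXT_SUBTASK;
}

static const char *target_queue(long release_time, long now)
{
	return release_time <= now ? "ready" : "release";
}

int main(void)
{
	printf("%s\n", after_quantum(2, 3) == NEXT_JOB ? "job" : "subtask");
	printf("%s queue\n", target_queue(42, 40));	/* release */
	return 0;
}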
++ */ ++ task->rt_param.times.exec_time = get_exec_cost(task) - 1; ++ if (task->rt_param.times.exec_time == 0) { ++ pfair_prepare_next_job(task); ++ } else { ++ pfair_prepare_next_subtask(task); ++ } ++ TRACE("[%d] %d Resume of %d with %d, %d, %d\n", ++ smp_processor_id(), jiffies-rt_start_time, ++ task->pid, get_release(task)-rt_start_time, ++ get_deadline(task)-rt_start_time, ++ get_passed_quanta(task)); ++ ++ set_rt_flags(task, RT_F_RUNNING); ++ queue_lock_irqsave(&pfair.pfair_lock, flags); ++ sched_trace_job_release(task); ++ if (time_after_eq(jiffies, get_release(task))) { ++ pfair_add_ready(&pfair, task); ++ } else { ++ pfair_add_release(&pfair, task); ++ } ++ queue_unlock_irqrestore(&pfair.pfair_lock, flags); ++ ++ } else { ++ TRACE("[%d] %d Strange release of %d with %d, %d, %d\n", ++ smp_processor_id(), jiffies-rt_start_time, ++ task->pid, ++ get_release(task), get_deadline(task), ++ get_passed_quanta(task)); ++ ++ queue_lock_irqsave(&pfair.pfair_lock, flags); ++ pfair_add_release(&pfair, task); ++ queue_unlock_irqrestore(&pfair.pfair_lock, flags); ++ } ++} ++ ++ ++static void pfair_task_blocks(struct task_struct *t) ++{ ++ unsigned long flags; ++ int i; ++ cpumask_t res; ++ BUG_ON(!is_realtime(t)); ++ /* If the task blocks, then it must be removed from the running set */ ++ queue_lock_irqsave(&pfair.pfair_lock, flags); ++ cpus_and(res,pfair.domain_cpus, cpu_online_map); ++ for_each_cpu_mask(i, res) { ++ if (per_cpu(states, i).t == t) ++ per_cpu(states, i).t = NULL; ++ } ++ /* If the task is running and in some ++ * list it might have been released by another ++ * processor ++ */ ++ if((t->rt_list.next != LIST_POISON1 || ++ t->rt_list.prev != LIST_POISON2)) { ++ TRACE("[%d] %d task %d is deleted from the list\n", ++ smp_processor_id(), ++ jiffies-rt_start_time, t->pid); ++ list_del(&t->rt_list); ++ } ++ queue_unlock_irqrestore(&pfair.pfair_lock, flags); ++ TRACE("[%d] %d task %d blocks with budget=%d state=%d\n", ++ smp_processor_id(), jiffies-rt_start_time, ++ t->pid, t->time_slice, t->state); ++} ++ ++static long pfair_tear_down(struct task_struct * t) ++{ ++ BUG_ON(!is_realtime(t)); ++ TRACE("pfair: tear down called for %d \n", t->pid); ++ BUG_ON(t->array); ++ BUG_ON(t->rt_list.next != LIST_POISON1); ++ BUG_ON(t->rt_list.prev != LIST_POISON2); ++ return 0; ++} ++ ++static int pfair_mode_change(int new_mode) ++{ ++ printk(KERN_INFO "[%d] pfair mode change %d\n", ++ smp_processor_id(), new_mode); ++ if (new_mode == MODE_RT_RUN) { ++ pfair_prepare_new_releases(&pfair, jiffies + 10); ++ } ++ printk(KERN_INFO "[%d] pfair: mode change done\n", smp_processor_id()); ++ return 0; ++} ++ ++/* Plugin object */ ++static sched_plugin_t s_plugin __cacheline_aligned_in_smp = { ++ .ready_to_use = 0 ++}; ++/* ++* PFAIR plugin initialization macro. 
++*/ ++#define INIT_PFAIR_PLUGIN (struct sched_plugin){\ ++ .plugin_name = "PFAIR",\ ++ .ready_to_use = 1,\ ++ .scheduler_tick = pfair_scheduler_tick,\ ++ .prepare_task = pfair_prepare_task,\ ++ .tear_down = pfair_tear_down,\ ++ .schedule = pfair_schedule,\ ++ .finish_switch = pfair_finish_task_switch,\ ++ .mode_change = pfair_mode_change,\ ++ .wake_up_task = pfair_wake_up_task,\ ++ .task_blocks = pfair_task_blocks \ ++ } ++ ++sched_plugin_t* __init init_pfair_plugin(void) ++{ ++ int i=0; ++ if (!s_plugin.ready_to_use) { ++ pfair_domain_init(&pfair); ++ for (i=0; i ++#include ++ ++ ++/************************************************************* ++ * Dummy plugin functions * ++ *************************************************************/ ++ ++void litmus_dummy_finish_switch(struct task_struct * prev) ++{ ++} ++ ++int litmus_dummy_schedule(struct task_struct * prev, ++ struct task_struct** next, ++ runqueue_t* q) ++{ ++ return 0; ++} ++ ++reschedule_check_t litmus_dummy_scheduler_tick(void) ++{ ++ return NO_RESCHED; ++} ++ ++ ++long litmus_dummy_prepare_task(struct task_struct *t) ++{ ++ return 0; ++} ++ ++void litmus_dummy_wake_up_task(struct task_struct *task) ++{ ++ printk(KERN_WARNING "task %d: unhandled real-time wake up!\n", ++ task->pid); ++} ++ ++void litmus_dummy_task_blocks(struct task_struct *task) ++{ ++} ++ ++long litmus_dummy_tear_down(struct task_struct *task) ++{ ++ return 0; ++} ++ ++int litmus_dummy_scheduler_setup(int cmd, void __user *parameter) ++{ ++ return -EPERM; ++} ++ ++long litmus_dummy_sleep_next_period(void) ++{ ++ return -EPERM; ++} ++ ++long litmus_dummy_inherit_priority(struct pi_semaphore *sem, ++ struct task_struct *new_owner) ++{ ++ return -EPERM; ++} ++ ++long litmus_dummy_return_priority(struct pi_semaphore *sem) ++{ ++ return -EPERM; ++} ++ ++long litmus_dummy_pi_block(struct pi_semaphore *sem, ++ struct task_struct *new_waiter) ++{ ++ return -EPERM; ++} ++ ++ ++/* The default scheduler plugin. It doesn't do anything and lets Linux do its ++ * job. ++ */ ++ ++sched_plugin_t linux_sched_plugin = { ++ .plugin_name = "Linux", ++ .ready_to_use = 1, ++ .scheduler_tick = litmus_dummy_scheduler_tick, ++ .prepare_task = litmus_dummy_prepare_task, ++ .tear_down = litmus_dummy_tear_down, ++ .wake_up_task = litmus_dummy_wake_up_task, ++ .task_blocks = litmus_dummy_task_blocks, ++ .sleep_next_period = litmus_dummy_sleep_next_period, ++ .schedule = litmus_dummy_schedule, ++ .finish_switch = litmus_dummy_finish_switch, ++ .scheduler_setup = litmus_dummy_scheduler_setup, ++ .inherit_priority = litmus_dummy_inherit_priority, ++ .return_priority = litmus_dummy_return_priority, ++ .pi_block = litmus_dummy_pi_block ++}; ++ ++/* ++ * The reference to current plugin that is used to schedule tasks within ++ * the system. It stores references to actual function implementations ++ * Should be initialized by calling "init_***_plugin()" ++ */ ++sched_plugin_t *curr_sched_plugin = &linux_sched_plugin; ++ +diff --git a/kernel/sched_psn_edf.c b/kernel/sched_psn_edf.c +new file mode 100644 +index 0000000..9e4f4ab +--- /dev/null ++++ b/kernel/sched_psn_edf.c +@@ -0,0 +1,523 @@ ++ ++/* ++ * kernel/sched_psn_edf.c ++ * ++ * Implementation of the PSN-EDF scheduler plugin. ++ * Based on kern/sched_part_edf.c and kern/sched_gsn_edf.c. ++ * ++ * Suspensions and non-preemptable sections are supported. ++ * Priority inheritance is not supported. 
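/*
 * A sketch of the plugin pattern used throughout these files: a struct
 * of function pointers starts out pointing at no-op "Linux" defaults,
 * and an init function swaps in a concrete scheduler's callbacks. The
 * names below (struct fake_plugin, init_edf_plugin) are illustrative
 * stand-ins for sched_plugin_t and the init_*_plugin() functions.
 */
#include <stdio.h>

struct fake_plugin {
	const char *name;
	int  (*schedule)(void);
	void (*finish_switch)(void);
};

static int  dummy_schedule(void)      { return 0; }
static void dummy_finish_switch(void) { }

/* analogous to curr_sched_plugin pointing at linux_sched_plugin */
static struct fake_plugin current_plugin = {
	"Linux", dummy_schedule, dummy_finish_switch
};

static int  edf_schedule(void)      { return 1; }
static void edf_finish_switch(void) { }

static void init_edf_plugin(void)
{
	/* analogous to s_plugin = INIT_SCHED_PLUGIN in the patch */
	current_plugin = (struct fake_plugin){
		"EDF", edf_schedule, edf_finish_switch
	};
}

int main(void)
{
	printf("%s -> %d\n", current_plugin.name, current_plugin.schedule());
	init_edf_plugin();
	printf("%s -> %d\n", current_plugin.name, current_plugin.schedule());
	return 0;
}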
++ */ ++ ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++ ++typedef struct { ++ rt_domain_t domain; ++ int cpu; ++ struct task_struct* scheduled; /* only RT tasks */ ++ spinlock_t lock; /* protects the domain and ++ * serializes scheduling decisions ++ */ ++} psnedf_domain_t; ++ ++DEFINE_PER_CPU(psnedf_domain_t, psnedf_domains); ++ ++#define local_edf (&__get_cpu_var(psnedf_domains).domain) ++#define local_pedf (&__get_cpu_var(psnedf_domains)) ++#define remote_edf(cpu) (&per_cpu(psnedf_domains, cpu).domain) ++#define remote_pedf(cpu) (&per_cpu(psnedf_domains, cpu)) ++#define task_edf(task) remote_edf(get_partition(task)) ++#define task_pedf(task) remote_pedf(get_partition(task)) ++ ++ ++static void psnedf_domain_init(psnedf_domain_t* pedf, ++ check_resched_needed_t check, ++ int cpu) ++{ ++ edf_domain_init(&pedf->domain, check); ++ pedf->cpu = cpu; ++ pedf->lock = SPIN_LOCK_UNLOCKED; ++ pedf->scheduled = NULL; ++} ++ ++static void requeue(struct task_struct* t, rt_domain_t *edf) ++{ ++ /* only requeue if t is actually running */ ++ BUG_ON(!is_running(t)); ++ ++ if (t->state != TASK_RUNNING) ++ TRACE_TASK(t, "requeue: !TASK_RUNNING"); ++ ++ set_rt_flags(t, RT_F_RUNNING); ++ if (!is_released(t) || ++ get_rt_mode() != MODE_RT_RUN) ++ __add_release(edf, t); /* it has got to wait */ ++ else ++ __add_ready(edf, t); ++} ++ ++/* we assume the lock is being held */ ++static void preempt(psnedf_domain_t *pedf) ++{ ++ if (smp_processor_id() == pedf->cpu) { ++ if (pedf->scheduled && is_np(pedf->scheduled)) ++ request_exit_np(pedf->scheduled); ++ else ++ set_tsk_need_resched(current); ++ } else ++ /* in case that it is a remote CPU we have to defer the ++ * the decision to the remote CPU ++ */ ++ smp_send_reschedule(pedf->cpu); ++} ++ ++/* This check is trivial in partioned systems as we only have to consider ++ * the CPU of the partition. ++ */ ++static int psnedf_check_resched(rt_domain_t *edf) ++{ ++ psnedf_domain_t *pedf = container_of(edf, psnedf_domain_t, domain); ++ int ret = 0; ++ ++ /* because this is a callback from rt_domain_t we already hold ++ * the necessary lock for the ready queue ++ */ ++ if (edf_preemption_needed(edf, pedf->scheduled)) { ++ preempt(pedf); ++ ret = 1; ++ } ++ return ret; ++} ++ ++ ++static reschedule_check_t psnedf_scheduler_tick(void) ++{ ++ unsigned long flags; ++ struct task_struct *t = current; ++ reschedule_check_t want_resched = NO_RESCHED; ++ rt_domain_t *edf = local_edf; ++ psnedf_domain_t *pedf = local_pedf; ++ ++ /* Check for inconsistency. We don't need the lock for this since ++ * ->scheduled is only changed in schedule, which obviously is not ++ * executing in parallel on this CPU ++ */ ++ BUG_ON(is_realtime(t) && t != pedf->scheduled); ++ ++ if (is_realtime(t)) ++ TRACE("%s/%d was hit by scheduler tick\n", t->comm, t->pid); ++ ++ /* expire tasks even if not in real-time mode ++ * this makes sure that at the end of real-time mode ++ * no tasks "run away forever". 
++ */ ++ if (is_realtime(t) && t->time_slice && !--t->time_slice) { ++ if (!is_np(t)) { ++ want_resched = FORCE_RESCHED; ++ } else { ++ TRACE("psnedf_scheduler_tick: " ++ "%d is non-preemptable, " ++ "preemption delayed.\n", t->pid); ++ request_exit_np(t); ++ } ++ } ++ ++ if (get_rt_mode() == MODE_RT_RUN) ++ { ++ /* check whether anything is waiting to be released ++ * this could probably be moved to the global timer ++ * interrupt handler since the state will only change ++ * once per jiffie ++ */ ++ spin_lock_irqsave(&pedf->lock, flags); ++ __release_pending(edf); ++ if (want_resched != FORCE_RESCHED && ++ edf_preemption_needed(edf, t)) ++ want_resched = FORCE_RESCHED; ++ ++ spin_unlock_irqrestore(&pedf->lock, flags); ++ ++ } ++ return want_resched; ++} ++ ++static void job_completion(struct task_struct* t) ++{ ++ TRACE_TASK(t, "job_completion().\n"); ++ set_rt_flags(t, RT_F_SLEEP); ++ edf_prepare_for_next_period(t); ++} ++ ++static int psnedf_schedule(struct task_struct * prev, ++ struct task_struct ** next, ++ runqueue_t * rq) ++{ ++ psnedf_domain_t* pedf = local_pedf; ++ rt_domain_t* edf = &pedf->domain; ++ ++ int out_of_time, sleep, preempt, ++ np, exists, rt, blocks, resched; ++ ++ spin_lock(&pedf->lock); ++ ++ /* sanity checking */ ++ BUG_ON(pedf->scheduled && pedf->scheduled != prev); ++ BUG_ON(pedf->scheduled && !is_realtime(prev)); ++ ++ /* (0) Determine state */ ++ exists = pedf->scheduled != NULL; ++ blocks = exists && !is_running(pedf->scheduled); ++ out_of_time = exists && !pedf->scheduled->time_slice; ++ np = exists && is_np(pedf->scheduled); ++ sleep = exists && get_rt_flags(pedf->scheduled) == RT_F_SLEEP; ++ preempt = edf_preemption_needed(edf, prev); ++ rt = get_rt_mode() == MODE_RT_RUN; ++ ++ ++ /* If we need to preempt do so. ++ * The following checks set resched to 1 in case of special ++ * circumstances. ++ */ ++ resched = preempt; ++ ++ /* If a task blocks we have no choice but to reschedule. ++ */ ++ if (blocks) ++ resched = 1; ++ ++ /* Request a sys_exit_np() call if we would like to preempt but cannot. ++ * Multiple calls to request_exit_np() don't hurt. ++ */ ++ if (np && (out_of_time || preempt || sleep)) ++ request_exit_np(pedf->scheduled); ++ ++ /* Any task that is preemptable and either exhausts its execution ++ * budget or wants to sleep completes. We may have to reschedule after ++ * this. ++ */ ++ if (!np && (out_of_time || sleep)) { ++ job_completion(pedf->scheduled); ++ resched = 1; ++ } ++ ++ /* Stop real-time tasks when we leave real-time mode ++ */ ++ if (!rt && exists) ++ resched = 1; ++ ++ /* The final scheduling decision. Do we need to switch for some reason? ++ * Switch if we are in RT mode and have no task or if we need to ++ * resched. ++ */ ++ *next = NULL; ++ if ((!np || blocks) && (resched || (!exists && rt))) { ++ /* Take care of a previously scheduled ++ * job by taking it out of the Linux runqueue. ++ */ ++ if (pedf->scheduled) { ++ /* as opposed to global schedulers that switch without ++ * a lock being held we can requeue already here since ++ * no other CPU will schedule from this domain. 
++ */ ++ if (!blocks) ++ requeue(pedf->scheduled, edf); ++ if (prev->array) ++ /* take it out of the run queue */ ++ deactivate_task(prev, rq); ++ } ++ ++ /* only pick tasks if we are actually in RT mode */ ++ if (rt) ++ *next = __take_ready(edf); ++ if (*next) { ++ /* stick the task into the runqueue */ ++ __activate_task(*next, rq); ++ set_task_cpu(*next, smp_processor_id()); ++ } ++ ++ } else ++ /* Only override Linux scheduler if we have a real-time task ++ * scheduled that needs to continue. ++ */ ++ if (exists) ++ *next = prev; ++ ++ if (*next) ++ set_rt_flags(*next, RT_F_RUNNING); ++ ++ pedf->scheduled = *next; ++ spin_unlock(&pedf->lock); ++ return 0; ++} ++ ++ ++/* Prepare a task for running in RT mode ++ * Enqueues the task into master queue data structure ++ * returns ++ * -EPERM if task is not TASK_STOPPED ++ */ ++static long psnedf_prepare_task(struct task_struct * t) ++{ ++ rt_domain_t* edf = task_edf(t); ++ psnedf_domain_t* pedf = task_pedf(t); ++ unsigned long flags; ++ ++ TRACE("[%d] psn edf: prepare task %d on CPU %d\n", ++ smp_processor_id(), t->pid, get_partition(t)); ++ if (t->state == TASK_STOPPED) { ++ __setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1); ++ ++ if (get_rt_mode() == MODE_RT_RUN) ++ /* The action is already on. ++ * Prepare immediate release. ++ */ ++ edf_release_now(t); ++ /* The task should be running in the queue, otherwise signal ++ * code will try to wake it up with fatal consequences. ++ */ ++ t->state = TASK_RUNNING; ++ spin_lock_irqsave(&pedf->lock, flags); ++ __add_release(edf, t); ++ spin_unlock_irqrestore(&pedf->lock, flags); ++ return 0; ++ } else ++ return -EPERM; ++} ++ ++static void psnedf_wake_up_task(struct task_struct *task) ++{ ++ unsigned long flags; ++ psnedf_domain_t* pedf = task_pedf(task); ++ rt_domain_t* edf = task_edf(task); ++ ++ TRACE("psnedf: %d unsuspends with budget=%d\n", ++ task->pid, task->time_slice); ++ ++ /* After fixing the litmus_controlled bug, ++ * this should hold again. ++ */ ++ BUG_ON(in_list(&task->rt_list)); ++ ++ task->state = TASK_RUNNING; ++ ++ /* We need to take suspensions because of semaphores into ++ * account! If a job resumes after being suspended due to acquiring ++ * a semaphore, it should never be treated as a new job release. ++ */ ++ if (is_tardy(task) && get_rt_flags(task) != RT_F_EXIT_SEM) { ++ /* new sporadic release */ ++ edf_release_now(task); ++ sched_trace_job_release(task); ++ } ++ ++ spin_lock_irqsave(&pedf->lock, flags); ++ requeue(task, edf); ++ spin_unlock_irqrestore(&pedf->lock, flags); ++} ++ ++static void psnedf_task_blocks(struct task_struct *t) ++{ ++ BUG_ON(!is_realtime(t)); ++ /* not really anything to do since it can only block if ++ * it is running, and when it is not running it is not in any ++ * queue anyway. ++ */ ++ TRACE("task %d blocks with budget=%d\n", t->pid, t->time_slice); ++ BUG_ON(in_list(&t->rt_list)); ++} ++ ++ ++/* When _tear_down is called, the task should not be in any queue any more ++ * as it must have blocked first. We don't have any internal state for the task, ++ * it is all in the task_struct. 
++ */ ++static long psnedf_tear_down(struct task_struct * t) ++{ ++ BUG_ON(!is_realtime(t)); ++ TRACE_TASK(t, "tear down called"); ++ BUG_ON(t->array); ++ BUG_ON(in_list(&t->rt_list)); ++ return 0; ++} ++ ++static long psnedf_pi_block(struct pi_semaphore *sem, ++ struct task_struct *new_waiter) ++{ ++ psnedf_domain_t* pedf; ++ rt_domain_t* edf; ++ struct task_struct* t; ++ int cpu = get_partition(new_waiter); ++ ++ BUG_ON(!new_waiter); ++ ++ if (edf_higher_prio(new_waiter, sem->hp.cpu_task[cpu])) { ++ TRACE_TASK(new_waiter, " boosts priority\n"); ++ pedf = task_pedf(new_waiter); ++ edf = task_edf(new_waiter); ++ ++ /* interrupts already disabled */ ++ spin_lock(&pedf->lock); ++ ++ /* store new highest-priority task */ ++ sem->hp.cpu_task[cpu] = new_waiter; ++ if (sem->holder && ++ get_partition(sem->holder) == get_partition(new_waiter)) { ++ /* let holder inherit */ ++ sem->holder->rt_param.inh_task = new_waiter; ++ t = sem->holder; ++ if (in_list(&t->rt_list)) { ++ /* queued in domain*/ ++ list_del(&t->rt_list); ++ /* readd to make priority change take place */ ++ if (is_released(t)) ++ __add_ready(edf, t); ++ else ++ __add_release(edf, t); ++ } ++ } ++ ++ /* check if we need to reschedule */ ++ if (edf_preemption_needed(edf, current)) ++ preempt(pedf); ++ ++ spin_unlock(&pedf->lock); ++ } ++ ++ return 0; ++} ++ ++static long psnedf_inherit_priority(struct pi_semaphore *sem, ++ struct task_struct *new_owner) ++{ ++ int cpu = get_partition(new_owner); ++ ++ new_owner->rt_param.inh_task = sem->hp.cpu_task[cpu]; ++ if (sem->hp.cpu_task[cpu] && new_owner != sem->hp.cpu_task[cpu]) { ++ TRACE_TASK(new_owner, ++ "inherited priority from %s/%d\n", ++ sem->hp.cpu_task[cpu]->comm, ++ sem->hp.cpu_task[cpu]->pid); ++ } else ++ TRACE_TASK(new_owner, ++ "cannot inherit priority: " ++ "no higher priority job waits on this CPU!\n"); ++ /* make new owner non-preemptable as required by FMLP under ++ * PSN-EDF. ++ */ ++ make_np(new_owner); ++ return 0; ++} ++ ++ ++/* This function is called on a semaphore release, and assumes that ++ * the current task is also the semaphore holder. ++ */ ++static long psnedf_return_priority(struct pi_semaphore *sem) ++{ ++ struct task_struct* t = current; ++ psnedf_domain_t* pedf = task_pedf(t); ++ rt_domain_t* edf = task_edf(t); ++ int ret = 0; ++ int cpu = get_partition(current); ++ ++ ++ /* Find new highest-priority semaphore task ++ * if holder task is the current hp.cpu_task[cpu]. ++ * ++ * Calling function holds sem->wait.lock. ++ */ ++ if (t == sem->hp.cpu_task[cpu]) ++ edf_set_hp_cpu_task(sem, cpu); ++ ++ take_np(t); ++ if (current->rt_param.inh_task) { ++ TRACE_CUR("return priority of %s/%d\n", ++ current->rt_param.inh_task->comm, ++ current->rt_param.inh_task->pid); ++ spin_lock(&pedf->lock); ++ ++ /* Reset inh_task to NULL. 
*/ ++ current->rt_param.inh_task = NULL; ++ ++ /* check if we need to reschedule */ ++ if (edf_preemption_needed(edf, current)) ++ preempt(pedf); ++ ++ spin_unlock(&pedf->lock); ++ } else ++ TRACE_CUR(" no priority to return %p\n", sem); ++ ++ return ret; ++} ++ ++ ++static int psnedf_mode_change(int new_mode) ++{ ++ int cpu; ++ ++ if (new_mode == MODE_RT_RUN) ++ for_each_online_cpu(cpu) { ++ spin_lock(&remote_pedf(cpu)->lock); ++ __rerelease_all(remote_edf(cpu), edf_release_at); ++ spin_unlock(&remote_pedf(cpu)->lock); ++ } ++ ++ TRACE("[%d] psn edf: mode changed to %d\n", ++ smp_processor_id(), new_mode); ++ return 0; ++} ++ ++ ++/* Plugin object */ ++static sched_plugin_t s_plugin __cacheline_aligned_in_smp = { ++ .ready_to_use = 0 ++}; ++ ++ ++/* ++ * Plugin initialization code. ++ */ ++#define INIT_SCHED_PLUGIN (struct sched_plugin) {\ ++ .plugin_name = "PSN-EDF",\ ++ .ready_to_use = 1,\ ++ .scheduler_tick = psnedf_scheduler_tick,\ ++ .prepare_task = psnedf_prepare_task,\ ++ .sleep_next_period = edf_sleep_next_period,\ ++ .tear_down = psnedf_tear_down,\ ++ .schedule = psnedf_schedule,\ ++ .mode_change = psnedf_mode_change,\ ++ .wake_up_task = psnedf_wake_up_task,\ ++ .task_blocks = psnedf_task_blocks, \ ++ .pi_block = psnedf_pi_block, \ ++ .inherit_priority = psnedf_inherit_priority, \ ++ .return_priority = psnedf_return_priority \ ++} ++ ++ ++sched_plugin_t *__init init_psn_edf_plugin(void) ++{ ++ int i; ++ ++ if (!s_plugin.ready_to_use) ++ { ++ for (i = 0; i < NR_CPUS; i++) ++ { ++ psnedf_domain_init(remote_pedf(i), ++ psnedf_check_resched, i); ++ printk("PSN-EDF: CPU partition %d initialized.\n", i); ++ } ++ s_plugin = INIT_SCHED_PLUGIN; ++ } ++ return &s_plugin; ++} ++ ++ ++ +diff --git a/kernel/sched_trace.c b/kernel/sched_trace.c +new file mode 100644 +index 0000000..4cfe0c4 +--- /dev/null ++++ b/kernel/sched_trace.c +@@ -0,0 +1,755 @@ ++/* sched_trace.c -- record scheduling events to a byte stream. ++ * ++ * TODO: Move ring buffer to a lockfree implementation. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++ ++ ++typedef struct { ++ /* guard read and write pointers */ ++ spinlock_t lock; ++ /* guard against concurrent freeing of buffer */ ++ rwlock_t del_lock; ++ ++ /* memory allocated for ring buffer */ ++ unsigned long order; ++ char* buf; ++ char* end; ++ ++ /* Read/write pointer. May not cross. ++ * They point to the position of next write and ++ * last read. 
++ */ ++ char* writep; ++ char* readp; ++ ++} ring_buffer_t; ++ ++#define EMPTY_RING_BUFFER { \ ++ .lock = SPIN_LOCK_UNLOCKED, \ ++ .del_lock = RW_LOCK_UNLOCKED, \ ++ .buf = NULL, \ ++ .end = NULL, \ ++ .writep = NULL, \ ++ .readp = NULL \ ++} ++ ++void rb_init(ring_buffer_t* buf) ++{ ++ *buf = (ring_buffer_t) EMPTY_RING_BUFFER; ++} ++ ++int rb_alloc_buf(ring_buffer_t* buf, unsigned long order) ++{ ++ unsigned long flags; ++ int error = 0; ++ char *mem; ++ ++ /* do memory allocation while not atomic */ ++ mem = (char *) __get_free_pages(GFP_KERNEL, order); ++ if (!mem) ++ return -ENOMEM; ++ write_lock_irqsave(&buf->del_lock, flags); ++ BUG_ON(buf->buf); ++ buf->buf = mem; ++ buf->end = buf->buf + PAGE_SIZE * (1 << order) - 1; ++ memset(buf->buf, 0xff, buf->end - buf->buf); ++ buf->order = order; ++ buf->writep = buf->buf + 1; ++ buf->readp = buf->buf; ++ write_unlock_irqrestore(&buf->del_lock, flags); ++ return error; ++} ++ ++int rb_free_buf(ring_buffer_t* buf) ++{ ++ unsigned long flags; ++ int error = 0; ++ write_lock_irqsave(&buf->del_lock, flags); ++ BUG_ON(!buf->buf); ++ free_pages((unsigned long) buf->buf, buf->order); ++ buf->buf = NULL; ++ buf->end = NULL; ++ buf->writep = NULL; ++ buf->readp = NULL; ++ write_unlock_irqrestore(&buf->del_lock, flags); ++ return error; ++} ++ ++/* Assumption: concurrent writes are serialized externally ++ * ++ * Will only succeed if there is enough space for all len bytes. ++ */ ++int rb_put(ring_buffer_t* buf, char* mem, size_t len) ++{ ++ unsigned long flags; ++ char* r , *w; ++ int error = 0; ++ read_lock_irqsave(&buf->del_lock, flags); ++ if (!buf->buf) { ++ error = -ENODEV; ++ goto out; ++ } ++ spin_lock(&buf->lock); ++ r = buf->readp; ++ w = buf->writep; ++ spin_unlock(&buf->lock); ++ if (r < w && buf->end - w >= len - 1) { ++ /* easy case: there is enough space in the buffer ++ * to write it in one continous chunk*/ ++ memcpy(w, mem, len); ++ w += len; ++ if (w > buf->end) ++ /* special case: fit exactly into buffer ++ * w is now buf->end + 1 ++ */ ++ w = buf->buf; ++ } else if (w < r && r - w >= len) { /* >= len because may not cross */ ++ /* we are constrained by the read pointer but we there ++ * is enough space ++ */ ++ memcpy(w, mem, len); ++ w += len; ++ } else if (r <= w && buf->end - w < len - 1) { ++ /* the wrap around case: there may or may not be space */ ++ if ((buf->end - w) + (r - buf->buf) >= len - 1) { ++ /* copy chunk that fits at the end */ ++ memcpy(w, mem, buf->end - w + 1); ++ mem += buf->end - w + 1; ++ len -= (buf->end - w + 1); ++ w = buf->buf; ++ /* copy the rest */ ++ memcpy(w, mem, len); ++ w += len; ++ } ++ else ++ error = -ENOMEM; ++ } else { ++ error = -ENOMEM; ++ } ++ if (!error) { ++ spin_lock(&buf->lock); ++ buf->writep = w; ++ spin_unlock(&buf->lock); ++ } ++ out: ++ read_unlock_irqrestore(&buf->del_lock, flags); ++ return error; ++} ++ ++/* Assumption: concurrent reads are serialized externally */ ++int rb_get(ring_buffer_t* buf, char* mem, size_t len) ++{ ++ unsigned long flags; ++ char* r , *w; ++ int error = 0; ++ read_lock_irqsave(&buf->del_lock, flags); ++ if (!buf->buf) { ++ error = -ENODEV; ++ goto out; ++ } ++ spin_lock(&buf->lock); ++ r = buf->readp; ++ w = buf->writep; ++ spin_unlock(&buf->lock); ++ ++ if (w <= r && buf->end - r >= len) { ++ /* easy case: there is enough data in the buffer ++ * to get it in one chunk*/ ++ memcpy(mem, r + 1, len); ++ r += len; ++ error = len; ++ ++ } else if (r + 1 < w && w - r - 1 >= len) { ++ /* we are constrained by the write pointer but ++ * there is enough data 
++ */ ++ memcpy(mem, r + 1, len); ++ r += len; ++ error = len; ++ ++ } else if (r + 1 < w && w - r - 1 < len) { ++ /* we are constrained by the write pointer and there ++ * there is not enough data ++ */ ++ memcpy(mem, r + 1, w - r - 1); ++ error = w - r - 1; ++ r += w - r - 1; ++ ++ } else if (w <= r && buf->end - r < len) { ++ /* the wrap around case: there may or may not be enough data ++ * first let's get what is available ++ */ ++ memcpy(mem, r + 1, buf->end - r); ++ error += (buf->end - r); ++ mem += (buf->end - r); ++ len -= (buf->end - r); ++ r += (buf->end - r); ++ ++ if (w > buf->buf) { ++ /* there is more to get */ ++ r = buf->buf - 1; ++ if (w - r >= len) { ++ /* plenty */ ++ memcpy(mem, r + 1, len); ++ error += len; ++ r += len; ++ } else { ++ memcpy(mem, r + 1, w - r - 1); ++ error += w - r - 1; ++ r += w - r - 1; ++ } ++ } ++ } /* nothing available */ ++ ++ if (error > 0) { ++ spin_lock(&buf->lock); ++ buf->readp = r; ++ spin_unlock(&buf->lock); ++ } ++ out: ++ read_unlock_irqrestore(&buf->del_lock, flags); ++ return error; ++} ++ ++ ++ ++/******************************************************************************/ ++/* DEVICE FILE DRIVER */ ++/******************************************************************************/ ++ ++ ++ ++/* Allocate a buffer of about 1 MB per CPU. ++ * ++ */ ++#define BUFFER_ORDER 8 ++ ++typedef struct { ++ ring_buffer_t buf; ++ atomic_t reader_cnt; ++ struct semaphore reader_mutex; ++} trace_buffer_t; ++ ++ ++/* This does not initialize the semaphore!! */ ++ ++#define EMPTY_TRACE_BUFFER \ ++ { .buf = EMPTY_RING_BUFFER, .reader_cnt = ATOMIC_INIT(0)} ++ ++static DEFINE_PER_CPU(trace_buffer_t, trace_buffer); ++ ++#ifdef CONFIG_SCHED_DEBUG_TRACE ++static spinlock_t log_buffer_lock = SPIN_LOCK_UNLOCKED; ++#endif ++static trace_buffer_t log_buffer = EMPTY_TRACE_BUFFER; ++ ++static void init_buffers(void) ++{ ++ int i; ++ ++ for (i = 0; i < NR_CPUS; i++) { ++ rb_init(&per_cpu(trace_buffer, i).buf); ++ init_MUTEX(&per_cpu(trace_buffer, i).reader_mutex); ++ atomic_set(&per_cpu(trace_buffer, i).reader_cnt, 0); ++ } ++ /* only initialize the mutex, the rest was initialized as part ++ * of the static initialization macro ++ */ ++ init_MUTEX(&log_buffer.reader_mutex); ++} ++ ++static int trace_release(struct inode *in, struct file *filp) ++{ ++ int error = -EINVAL; ++ trace_buffer_t* buf = filp->private_data; ++ ++ BUG_ON(!filp->private_data); ++ ++ if (down_interruptible(&buf->reader_mutex)) { ++ error = -ERESTARTSYS; ++ goto out; ++ } ++ ++ /* last release must deallocate buffers */ ++ if (atomic_dec_return(&buf->reader_cnt) == 0) { ++ error = rb_free_buf(&buf->buf); ++ } ++ ++ up(&buf->reader_mutex); ++ out: ++ return error; ++} ++ ++static ssize_t trace_read(struct file *filp, char __user *to, size_t len, ++ loff_t *f_pos) ++{ ++ /* we ignore f_pos, this is strictly sequential */ ++ ++ ssize_t error = -EINVAL; ++ char* mem; ++ trace_buffer_t *buf = filp->private_data; ++ ++ if (down_interruptible(&buf->reader_mutex)) { ++ error = -ERESTARTSYS; ++ goto out; ++ } ++ ++ if (len > 64 * 1024) ++ len = 64 * 1024; ++ mem = kmalloc(len, GFP_KERNEL); ++ if (!mem) { ++ error = -ENOMEM; ++ goto out_unlock; ++ } ++ ++ error = rb_get(&buf->buf, mem, len); ++ while (!error) { ++ set_current_state(TASK_INTERRUPTIBLE); ++ schedule_timeout(110); ++ if (signal_pending(current)) ++ error = -ERESTARTSYS; ++ else ++ error = rb_get(&buf->buf, mem, len); ++ } ++ ++ if (error > 0 && copy_to_user(to, mem, error)) ++ error = -EFAULT; ++ ++ kfree(mem); ++ out_unlock: ++ 
up(&buf->reader_mutex); ++ out: ++ return error; ++} ++ ++ ++/* trace_open - Open one of the per-CPU sched_trace buffers. ++ */ ++static int trace_open(struct inode *in, struct file *filp) ++{ ++ int error = -EINVAL; ++ int cpu = MINOR(in->i_rdev); ++ trace_buffer_t* buf; ++ ++ if (!cpu_online(cpu)) { ++ printk(KERN_WARNING "sched trace: " ++ "CPU #%d is not online. (open failed)\n", cpu); ++ error = -ENODEV; ++ goto out; ++ } ++ ++ buf = &per_cpu(trace_buffer, cpu); ++ ++ if (down_interruptible(&buf->reader_mutex)) { ++ error = -ERESTARTSYS; ++ goto out; ++ } ++ ++ /* first open must allocate buffers */ ++ if (atomic_inc_return(&buf->reader_cnt) == 1) { ++ if ((error = rb_alloc_buf(&buf->buf, BUFFER_ORDER))) ++ { ++ atomic_dec(&buf->reader_cnt); ++ goto out_unlock; ++ } ++ } ++ ++ error = 0; ++ filp->private_data = buf; ++ ++ out_unlock: ++ up(&buf->reader_mutex); ++ out: ++ return error; ++} ++ ++/* log_open - open the global log message ring buffer. ++ */ ++static int log_open(struct inode *in, struct file *filp) ++{ ++ int error = -EINVAL; ++ trace_buffer_t* buf; ++ ++ buf = &log_buffer; ++ ++ if (down_interruptible(&buf->reader_mutex)) { ++ error = -ERESTARTSYS; ++ goto out; ++ } ++ ++ /* first open must allocate buffers */ ++ if (atomic_inc_return(&buf->reader_cnt) == 1) { ++ if ((error = rb_alloc_buf(&buf->buf, BUFFER_ORDER))) ++ { ++ atomic_dec(&buf->reader_cnt); ++ goto out_unlock; ++ } ++ } ++ ++ error = 0; ++ filp->private_data = buf; ++ ++ out_unlock: ++ up(&buf->reader_mutex); ++ out: ++ return error; ++} ++ ++/******************************************************************************/ ++/* Device Registration */ ++/******************************************************************************/ ++ ++/* the major numbes are from the unassigned/local use block ++ * ++ * This should be converted to dynamic allocation at some point... ++ */ ++#define TRACE_MAJOR 250 ++#define LOG_MAJOR 251 ++ ++/* trace_fops - The file operations for accessing the per-CPU scheduling event ++ * trace buffers. ++ */ ++struct file_operations trace_fops = { ++ .owner = THIS_MODULE, ++ .open = trace_open, ++ .release = trace_release, ++ .read = trace_read, ++}; ++ ++/* log_fops - The file operations for accessing the global LITMUS log message ++ * buffer. ++ * ++ * Except for opening the device file it uses the same operations as trace_fops. 
++ */ ++struct file_operations log_fops = { ++ .owner = THIS_MODULE, ++ .open = log_open, ++ .release = trace_release, ++ .read = trace_read, ++}; ++ ++static int __init register_buffer_dev(const char* name, ++ struct file_operations* fops, ++ int major, int count) ++{ ++ dev_t trace_dev; ++ struct cdev *cdev; ++ int error = 0; ++ ++ trace_dev = MKDEV(major, 0); ++ error = register_chrdev_region(trace_dev, count, name); ++ if (error) ++ { ++ printk(KERN_WARNING "sched trace: " ++ "Could not register major/minor number %d\n", major); ++ return error; ++ } ++ cdev = cdev_alloc(); ++ if (!cdev) { ++ printk(KERN_WARNING "sched trace: " ++ "Could not get a cdev for %s.\n", name); ++ return -ENOMEM; ++ } ++ cdev->owner = THIS_MODULE; ++ cdev->ops = fops; ++ error = cdev_add(cdev, trace_dev, count); ++ if (error) { ++ printk(KERN_WARNING "sched trace: " ++ "add_cdev failed for %s.\n", name); ++ return -ENOMEM; ++ } ++ return error; ++ ++} ++ ++static int __init init_sched_trace(void) ++{ ++ int error1 = 0, error2 = 0; ++ ++ printk("Initializing scheduler trace device\n"); ++ init_buffers(); ++ ++ error1 = register_buffer_dev("schedtrace", &trace_fops, ++ TRACE_MAJOR, NR_CPUS); ++ ++ error2 = register_buffer_dev("litmus_log", &log_fops, ++ LOG_MAJOR, 1); ++ if (error1 || error2) ++ return min(error1, error2); ++ else ++ return 0; ++} ++ ++module_init(init_sched_trace); ++ ++/******************************************************************************/ ++/* KERNEL API */ ++/******************************************************************************/ ++ ++/* The per-CPU LITMUS log buffer. Don't put it on the stack, it is too big for ++ * that and the kernel gets very picky with nested interrupts and small stacks. ++ */ ++ ++#ifdef CONFIG_SCHED_DEBUG_TRACE ++ ++#define MSG_SIZE 255 ++static DEFINE_PER_CPU(char[MSG_SIZE], fmt_buffer); ++ ++/* sched_trace_log_message - This is the only function that accesses the the ++ * log buffer inside the kernel for writing. ++ * Concurrent access to it is serialized via the ++ * log_buffer_lock. ++ * ++ * The maximum length of a formatted message is 255. ++ */ ++void sched_trace_log_message(const char* fmt, ...) ++{ ++ unsigned long flags; ++ va_list args; ++ size_t len; ++ char* buf; ++ ++ va_start(args, fmt); ++ local_irq_save(flags); ++ ++ /* format message */ ++ buf = __get_cpu_var(fmt_buffer); ++ len = vscnprintf(buf, MSG_SIZE, fmt, args); ++ ++ spin_lock(&log_buffer_lock); ++ /* Don't copy the trailing null byte, we don't want null bytes ++ * in a text file. 
++ */ ++ rb_put(&log_buffer.buf, buf, len); ++ spin_unlock(&log_buffer_lock); ++ ++ local_irq_restore(flags); ++ va_end(args); ++} ++ ++#endif ++ ++#ifdef CONFIG_SCHED_TASK_TRACE ++ ++static inline void __put_trace(char* mem, size_t size) ++{ ++ trace_buffer_t* buf = &__get_cpu_var(trace_buffer); ++ rb_put(&buf->buf, mem, size); ++} ++ ++#define put_trace(obj) \ ++ if (get_rt_mode() == MODE_RT_RUN) \ ++ __put_trace((char *) &obj, sizeof(obj)) ++ ++#define header(rec, type) \ ++{ \ ++ rec.header.trace = type; \ ++ rec.header.timestamp = sched_clock(); \ ++ rec.header.size = sizeof(rec); \ ++} ++ ++#define tinfo(info, t) \ ++{ \ ++ info.is_rt = is_realtime(t); \ ++ info.is_server = 0; \ ++ info.class = get_class(t); \ ++ info.budget = (t)->time_slice; \ ++ info.pid = (t)->pid; \ ++ info.deadline = (t)->rt_param.times.deadline; \ ++} ++ ++#define rtinfo(info, t) \ ++{ \ ++ info.wcet = get_exec_cost(t); \ ++ info.period = get_rt_period(t); \ ++} ++ ++void sched_trace_scheduler_invocation(void) ++{ ++ invocation_record_t rec; ++ header(rec, ST_INVOCATION); ++ rec.flags = current->flags; ++ put_trace(rec); ++} ++ ++void sched_trace_task_arrival(struct task_struct *t) ++{ ++ arrival_record_t rec; ++ header(rec, ST_ARRIVAL); ++ tinfo(rec.task, t); ++ put_trace(rec); ++} ++ ++ ++void sched_trace_task_departure(struct task_struct *t) ++{ ++ departure_record_t rec; ++ header(rec, ST_DEPARTURE); ++ tinfo(rec.task, t); ++ put_trace(rec); ++} ++ ++void sched_trace_task_preemption(struct task_struct *t, struct task_struct* by) ++{ ++ preemption_record_t rec; ++ header(rec, ST_PREEMPTION); ++ tinfo(rec.task, t); ++ tinfo(rec.by, by); ++ put_trace(rec); ++} ++ ++ ++void sched_trace_task_scheduled(struct task_struct *t) ++{ ++ scheduled_record_t rec; ++ header(rec, ST_SCHEDULED); ++ tinfo(rec.task, t); ++ put_trace(rec); ++} ++ ++ ++void sched_trace_job_release(struct task_struct *t) ++{ ++ release_record_t rec; ++ header(rec, ST_JOB_RELEASE); ++ tinfo(rec.task, t); ++ rtinfo(rec, t); ++ put_trace(rec); ++} ++ ++void sched_trace_job_completion(struct task_struct *t) ++{ ++ completion_record_t rec; ++ header(rec, ST_JOB_COMPLETION); ++ tinfo(rec.task, t); ++ rtinfo(rec, t); ++ rec.tardiness = jiffies - t->rt_param.times.deadline; ++ rec.job_no = t->rt_param.times.job_no; ++ TRACE_TASK(t, "AAATardiness : %d\n", rec.tardiness); ++ put_trace(rec); ++} ++ ++ ++void sched_trace_server_scheduled(int id, task_class_t class, ++ unsigned int budget, jiffie_t deadline) ++{ ++ scheduled_record_t rec; ++ header(rec, ST_SCHEDULED); ++ rec.task.pid = id; ++ rec.task.is_rt = 1; ++ rec.task.is_server = 1; ++ rec.task.class = class; ++ rec.task.budget = budget; ++ rec.task.deadline = deadline; ++ put_trace(rec); ++} ++ ++void sched_trace_server_release(int id, unsigned int wcet, ++ unsigned int period, task_class_t class) ++{ ++ release_record_t rec; ++ header(rec, ST_JOB_RELEASE); ++ rec.task.pid = id; ++ rec.task.is_rt = 1; ++ rec.task.is_server = 1; ++ rec.task.class = class; ++ rec.task.budget = wcet; ++ rec.period = period; ++ rec.wcet = wcet; ++ put_trace(rec); ++} ++ ++void sched_trace_server_completion(int id, unsigned int budget, ++ jiffie_t deadline, task_class_t class) ++{ ++ completion_record_t rec; ++ header(rec, ST_JOB_COMPLETION); ++ rec.task.pid = id; ++ rec.task.is_rt = 1; ++ rec.task.is_server = 1; ++ rec.task.class = class; ++ rec.task.budget = budget; ++ rec.task.deadline = deadline; ++ rec.period = 0; ++ rec.tardiness = jiffies - deadline; ++ put_trace(rec); ++ ++} ++ ++void 
sched_trace_capacity_release(struct task_struct *t) ++{ ++ cap_release_record_t rec; ++ header(rec, ST_CAPACITY_RELEASE); ++ tinfo(rec.task, t); ++ put_trace(rec); ++} ++ ++void sched_trace_capacity_allocation(struct task_struct *t, u16 budget, u32 deadline, ++ pid_t donor) ++{ ++ cap_allocation_record_t rec; ++ header(rec, ST_CAPACITY_ALLOCATION); ++ tinfo(rec.task, t); ++ rec.donor = donor; ++ rec.budget = budget; ++ rec.deadline = deadline; ++ put_trace(rec); ++} ++ ++void sched_trace_capacity_alloc_srv(pid_t srv, u32 srv_dl, task_class_t cls, ++ u16 srv_budget, ++ u16 budget, u32 deadline, pid_t donor) ++{ ++ cap_allocation_record_t rec; ++ header(rec, ST_CAPACITY_ALLOCATION); ++ rec.task.pid = srv; ++ rec.task.is_rt = 1; ++ rec.task.is_server = 1; ++ rec.task.class = cls; ++ rec.task.budget = srv_budget; ++ rec.task.deadline = srv_dl; ++ rec.donor = donor; ++ rec.budget = budget; ++ rec.deadline = deadline; ++ put_trace(rec); ++} ++ ++void sched_trace_service_level_change(struct task_struct *t, ++ unsigned int from, ++ unsigned int to) ++{ ++ service_level_change_record_t rec; ++ header(rec, ST_SERVICE_LEVEL_CHANGE); ++ tinfo(rec.task, t); ++ rec.to = to; ++ rec.from = from; ++ rec.new_level = ++ t->rt_param.service_level[to]; ++ rec.old_level = ++ t->rt_param.service_level[from]; ++ put_trace(rec); ++} ++ ++void sched_trace_weight_error(struct task_struct* t, fp_t actual) ++{ ++ weight_error_record_t rec; ++ header(rec, ST_WEIGHT_ERROR); ++ rec.task = t->pid; ++ rec.actual = actual; ++ rec.estimate = get_est_weight(t); ++ put_trace(rec); ++} ++ ++ ++#endif +diff --git a/kernel/timer.c b/kernel/timer.c +index c2a8ccf..77a1b6b 100644 +--- a/kernel/timer.c ++++ b/kernel/timer.c +@@ -737,6 +737,27 @@ static inline s64 __get_nsec_offset(void) + return ns_offset; + } + ++/* Non-static, non-inline, public version of function above. ++ * It's up to the programmer to decide how to use it, no guarantees ++ * about anything are made here. 
++ */ ++s64 get_nsec_offset(void) ++{ ++ cycle_t cycle_now, cycle_delta; ++ s64 ns_offset; ++ ++ /* read clocksource: */ ++ cycle_now = clocksource_read(clock); ++ ++ /* calculate the delta since the last update_wall_time: */ ++ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask; ++ ++ /* convert to nanoseconds: */ ++ ns_offset = cyc2ns(clock, cycle_delta); ++ ++ return ns_offset; ++} ++ + /** + * __get_realtime_clock_ts - Returns the time of day in a timespec + * @ts: pointer to the timespec to be set +@@ -789,6 +810,7 @@ void do_gettimeofday(struct timeval *tv) + } + + EXPORT_SYMBOL(do_gettimeofday); ++ + /** + * do_settimeofday - Sets the time of day + * @tv: pointer to the timespec variable containing the new time +diff --git a/kernel/trace.c b/kernel/trace.c +new file mode 100644 +index 0000000..6119574 +--- /dev/null ++++ b/kernel/trace.c +@@ -0,0 +1,302 @@ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++/******************************************************************************/ ++/* Allocation */ ++/******************************************************************************/ ++ ++struct ft_buffer* trace_ts_buf = NULL; ++ ++static unsigned int ts_seq_no = 0; ++ ++feather_callback void save_timestamp(unsigned long event) ++{ ++ unsigned int seq_no = fetch_and_inc((int *) &ts_seq_no); ++ struct timestamp *ts; ++ if (ft_buffer_start_write(trace_ts_buf, (void**) &ts)) { ++ ts->event = event; ++ ts->timestamp = ft_read_tsc(); ++ ts->seq_no = seq_no; ++ ts->cpu = raw_smp_processor_id(); ++ ft_buffer_finish_write(trace_ts_buf, ts); ++ } ++} ++ ++static struct ft_buffer* alloc_ft_buffer(unsigned int count, size_t size) ++{ ++ struct ft_buffer* buf; ++ size_t total = (size + 1) * count; ++ char* mem; ++ int order = 0, pages = 1; ++ ++ buf = kmalloc(sizeof(struct ft_buffer), GFP_KERNEL); ++ if (!buf) ++ return NULL; ++ ++ total = (total / PAGE_SIZE) + (total % PAGE_SIZE != 0); ++ while (pages < total) { ++ order++; ++ pages *= 2; ++ } ++ ++ mem = (char*) __get_free_pages(GFP_KERNEL, order); ++ if (!mem) { ++ kfree(buf); ++ return NULL; ++ } ++ ++ if (!init_ft_buffer(buf, count, size, ++ mem + (count * size), /* markers at the end */ ++ mem)) { /* buffer objects */ ++ free_pages((unsigned long) mem, order); ++ kfree(buf); ++ return NULL; ++ } ++ return buf; ++} ++ ++static void free_ft_buffer(struct ft_buffer* buf) ++{ ++ int order = 0, pages = 1; ++ size_t total; ++ ++ if (buf) { ++ total = (buf->slot_size + 1) * buf->slot_count; ++ total = (total / PAGE_SIZE) + (total % PAGE_SIZE != 0); ++ while (pages < total) { ++ order++; ++ pages *= 2; ++ } ++ free_pages((unsigned long) buf->buffer_mem, order); ++ kfree(buf); ++ } ++} ++ ++ ++/******************************************************************************/ ++/* DEVICE FILE DRIVER */ ++/******************************************************************************/ ++ ++#define NO_TIMESTAMPS 262144 ++ ++static DECLARE_MUTEX(feather_lock); ++static int use_count = 0; ++ ++static int trace_release(struct inode *in, struct file *filp) ++{ ++ int err = -EINVAL; ++ ++ if (down_interruptible(&feather_lock)) { ++ err = -ERESTARTSYS; ++ goto out; ++ } ++ ++ printk(KERN_ALERT "%s/%d disconnects from feather trace device. 
" ++ "use_count=%d\n", ++ current->comm, current->pid, use_count); ++ ++ if (use_count == 1) { ++ /* disable events */ ++ ft_disable_all_events(); ++ ++ /* wait for any pending events to complete */ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ schedule_timeout(HZ); ++ ++ printk(KERN_ALERT "Failed trace writes: %u\n", ++ trace_ts_buf->failed_writes); ++ ++ free_ft_buffer(trace_ts_buf); ++ trace_ts_buf = NULL; ++ } ++ ++ use_count--; ++ up(&feather_lock); ++out: ++ return err; ++} ++ ++ ++static ssize_t trace_read(struct file *filp, char __user *to, size_t len, ++ loff_t *f_pos) ++{ ++ /* we ignore f_pos, this is strictly sequential */ ++ ssize_t error = 0; ++ struct timestamp ts; ++ ++ if (down_interruptible(&feather_lock)) { ++ error = -ERESTARTSYS; ++ goto out; ++ } ++ ++ ++ while (len >= sizeof(struct timestamp)) { ++ if (ft_buffer_read(trace_ts_buf, &ts)) { ++ if (copy_to_user(to, &ts, sizeof(struct timestamp))) { ++ error = -EFAULT; ++ break; ++ } else { ++ len -= sizeof(struct timestamp); ++ to += sizeof(struct timestamp); ++ error += sizeof(struct timestamp); ++ } ++ } else { ++ set_current_state(TASK_INTERRUPTIBLE); ++ schedule_timeout(50); ++ if (signal_pending(current)) { ++ error = -ERESTARTSYS; ++ break; ++ } ++ } ++ } ++ up(&feather_lock); ++out: ++ return error; ++} ++ ++#define ENABLE_CMD 0 ++#define DISABLE_CMD 1 ++ ++static ssize_t trace_write(struct file *filp, const char __user *from, ++ size_t len, loff_t *f_pos) ++{ ++ ssize_t error = -EINVAL; ++ unsigned long cmd; ++ unsigned long id; ++ ++ if (len % sizeof(long) || len < 2 * sizeof(long)) ++ goto out; ++ ++ if (copy_from_user(&cmd, from, sizeof(long))) { ++ error = -EFAULT; ++ goto out; ++ } ++ len -= sizeof(long); ++ from += sizeof(long); ++ ++ if (cmd != ENABLE_CMD && cmd != DISABLE_CMD) ++ goto out; ++ ++ if (down_interruptible(&feather_lock)) { ++ error = -ERESTARTSYS; ++ goto out; ++ } ++ ++ error = sizeof(long); ++ while (len) { ++ if (copy_from_user(&id, from, sizeof(long))) { ++ error = -EFAULT; ++ goto out; ++ } ++ len -= sizeof(long); ++ from += sizeof(long); ++ if (cmd) { ++ printk(KERN_INFO ++ "Disabling feather-trace event %lu.\n", id); ++ ft_disable_event(id); ++ } else { ++ printk(KERN_INFO ++ "Enabling feather-trace event %lu.\n", id); ++ ft_enable_event(id); ++ } ++ error += sizeof(long); ++ } ++ ++ up(&feather_lock); ++ out: ++ return error; ++} ++ ++static int trace_open(struct inode *in, struct file *filp) ++{ ++ int err = 0; ++ unsigned int count = NO_TIMESTAMPS; ++ ++ if (down_interruptible(&feather_lock)) { ++ err = -ERESTARTSYS; ++ goto out; ++ } ++ ++ while (count && !trace_ts_buf) { ++ printk("trace: trying to allocate %u time stamps.\n", count); ++ trace_ts_buf = alloc_ft_buffer(count, sizeof(struct timestamp)); ++ count /= 2; ++ } ++ if (!trace_ts_buf) ++ err = -ENOMEM; ++ else ++ use_count++; ++ ++ up(&feather_lock); ++out: ++ return err; ++} ++ ++/******************************************************************************/ ++/* Device Registration */ ++/******************************************************************************/ ++ ++#define FT_TRACE_MAJOR 252 ++ ++struct file_operations ft_trace_fops = { ++ .owner = THIS_MODULE, ++ .open = trace_open, ++ .release = trace_release, ++ .write = trace_write, ++ .read = trace_read, ++}; ++ ++ ++static int __init register_buffer_dev(const char* name, ++ struct file_operations* fops, ++ int major, int count) ++{ ++ dev_t trace_dev; ++ struct cdev *cdev; ++ int error = 0; ++ ++ trace_dev = MKDEV(major, 0); ++ error = 
register_chrdev_region(trace_dev, count, name); ++ if (error) ++ { ++ printk(KERN_WARNING "trace: " ++ "Could not register major/minor number %d\n", major); ++ return error; ++ } ++ cdev = cdev_alloc(); ++ if (!cdev) { ++ printk(KERN_WARNING "trace: " ++ "Could not get a cdev for %s.\n", name); ++ return -ENOMEM; ++ } ++ cdev->owner = THIS_MODULE; ++ cdev->ops = fops; ++ error = cdev_add(cdev, trace_dev, count); ++ if (error) { ++ printk(KERN_WARNING "trace: " ++ "add_cdev failed for %s.\n", name); ++ return -ENOMEM; ++ } ++ return error; ++ ++} ++ ++static int __init init_sched_trace(void) ++{ ++ int error = 0; ++ ++ printk("Initializing Feather-Trace device\n"); ++ /* dummy entry to make linker happy */ ++ ft_event0(666, save_timestamp); ++ ++ error = register_buffer_dev("ft_trace", &ft_trace_fops, ++ FT_TRACE_MAJOR, 1); ++ return error; ++} ++ ++module_init(init_sched_trace); +diff --git a/lib/semaphore-sleepers.c b/lib/semaphore-sleepers.c +index 1281805..3f4d543 100644 +--- a/lib/semaphore-sleepers.c ++++ b/lib/semaphore-sleepers.c +@@ -108,7 +108,7 @@ fastcall int __sched __down_interruptible(struct semaphore * sem) + /* + * With signals pending, this turns into + * the trylock failure case - we won't be +- * sleeping, and we* can't get the lock as ++ * sleeping, and we can't get the lock as + * it has contention. Just correct the count + * and exit. + */ diff --git a/index.html b/index.html index bbdcf1a..623a80f 100644 --- a/index.html +++ b/index.html @@ -30,13 +30,26 @@ kernel with focus on multiprocessor real-time scheduling and synchronization. The Linux kernel is modified to support the sporadic task model and modular scheduler plugins. Both partitioned and global scheduling - is supported. In the current version (2007.1), scheduler plugins that - implement various EDF variants and PFAIR scheduling are included. + is supported. In the current version (2007.2), plugins for the following + scheduling policies are included: +
    +
  • Partitioned EDF (P-EDF)
  • Partitioned EDF with synchronization support (PSN-EDF)
  • Global EDF (G-EDF)
  • Global EDF with synchronization support (GSN-EDF)
  • Global non-preemptive EDF (G-NP-EDF)
  • Global Feedback-Controlled EDF (FC-EDF)
  • EDF for heterogeneous task systems (EDF-HSB)
  • PFAIR (both staggered and aligned quanta are supported)
+ + The latest public release of LITMUSRT occurred on 10/29/2007.

-

+

Support

@@ -142,7 +155,8 @@ General Public License (GPL).

- The current release (2007.1) consists of + The latest version of LITMUSRT is 2007.2 and was released on 10/29/2007. + It consists of our Linux kernel modifications in the form of a patch against Linux 2.6.20, liblitmus, the user-space API for real-time tasks, @@ -152,32 +166,47 @@

- Please note that the current implementation is a prototype with - certain limitations. Most notably, it is not safe in a multiuser context, - i.e., real-time system calls do not check for superuser + Please note that the current implementation is a prototype with + certain limitations. Most notably, it is not secure in a multiuser context, + i.e., real-time system calls do not require superuser privileges. Further, some resources (e.g. semaphores) that should be dynamically allocated are allocated statically in the current version.

+ +

+ Old releases: +

+

Installation

- The current release of LITMUSRT, version 2007.1, consists of an + The current release of LITMUSRT, version 2007.2, consists of an extension of the Linux kernel that adds support for the sporadic task model, a scheduler plugin infrastructure, and some scheduler plugins, as well as two user-space libraries that provide the LITMUSRT @@ -202,11 +231,11 @@ cd $DIR # get Linux 2.6.20 wget http://www.kernel.org/pub/linux/kernel/v2.6/linux-2.6.20.tar.bz2 tar xjf linux-2.6.20.tar.bz2 -wget http://www.cs.unc.edu/~anderson/litmus-rt/download/litmus-rt-2007.1.patch +wget http://www.cs.unc.edu/~anderson/litmus-rt/download/litmus-rt-2007.2.patch mv linux-2.6.20 litmus-rt # apply the LITMUS RT patch cd litmus-rt -patch -p1 < ../litmus-rt-2007.1.patch +patch -p1 < ../litmus-rt-2007.2.patch # create a working kernel configuration with HZ=1000 make gconfig # compile the kernel @@ -223,7 +252,7 @@ make modules class="src">rtsched kernel parameter.

-rtsched={linux, pfair, part_edf, global_edf, global_edf_np, edf_hsb, gsn_edf, psn_edf}
+rtsched={linux, pfair, part_edf, global_edf, global_edf_np, edf_hsb, gsn_edf, psn_edf, adaptive}
 

For example, on our test machine, we use the
@@ -246,8 +275,8 @@
initrd /boot/kernel-2.6.20-LITMUSRT.img
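
The initrd line above is the only part of the sample boot entry that survives here; for orientation, a full GRUB (menu.lst) stanza might look like the following sketch. Only the initrd file name and the rtsched= parameter come from this page; the menu title, kernel image name, and root device are illustrative assumptions.

# Hypothetical GRUB entry -- image name and root device are placeholders.
title  LITMUS^RT 2007.2 (PSN-EDF)
root   (hd0,0)
kernel /boot/vmlinuz-2.6.20-LITMUSRT ro root=/dev/sda1 rtsched=psn_edf
initrd /boot/kernel-2.6.20-LITMUSRT.img

After rebooting, a quick sanity check (assuming the standard dmesg utility) is to look for the selected plugin's initialization messages; for instance, the PSN-EDF plugin in the patch above prints one line per CPU partition:

dmesg | grep "PSN-EDF"
# expected output, one line per CPU, e.g.:
#   PSN-EDF: CPU partition 0 initialized.
#   PSN-EDF: CPU partition 1 initialized.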

 cd $DIR
-wget http://www.cs.unc.edu/~anderson/litmus-rt/download/liblitmus-2007.1.tgz
-tar xzf liblitmus-2007.1.tgz
+wget http://www.cs.unc.edu/~anderson/litmus-rt/download/liblitmus-2007.2.tgz
+tar xzf liblitmus-2007.2.tgz
 cd liblitmus 
 make
 
@@ -260,8 +289,8 @@ make

 cd $DIR
-wget http://www.cs.unc.edu/~anderson/litmus-rt/download/libso-2007.1.tgz
-tar xzf libso-2007.1.tgz
+wget http://www.cs.unc.edu/~anderson/litmus-rt/download/libso-2007.2.tgz
+tar xzf libso-2007.2.tgz
 cd libso
 make
 make tests
@@ -277,13 +306,27 @@ make tests
 
     

Documentation

+

+ Most of the documentation has yet to be written. To get an overview of + the architecture of the kernel extension, we recommend reading the paper + “LITMUSRT: + A Status Report”.
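
Until more documentation is available, the tracing facilities added by the patch can be inspected directly. The character-device major numbers below are taken from kernel/sched_trace.c and kernel/trace.c in the patch above (250 = per-CPU schedtrace buffers, 251 = the litmus_log TRACE() buffer, 252 = Feather-Trace timestamps); the /dev node names are arbitrary choices, since the patch does not create any nodes itself.

# Create device nodes for the trace drivers registered by the patch.
# Minor number = CPU id for schedtrace; litmus_log and ft_trace use minor 0.
mknod /dev/schedtrace0 c 250 0
mknod /dev/litmus_log  c 251 0
mknod /dev/ft_trace    c 252 0
# TRACE() messages are plain text, so the debug log can simply be read with cat;
# reads sleep and retry until data becomes available.
cat /dev/litmus_log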
+
+ Please contact bbb[AT]cs.unc.edu if you have any + questions. +

+ +
