From 1acaf95fd9ff52512bfd377a87f0c28050e01bc5 Mon Sep 17 00:00:00 2001 From: Bjoern Brandenburg <bbb@Serenity.local> Date: Fri, 22 Aug 2008 22:43:23 -0400 Subject: publish PCP implementation --- download/RTCSA08/SHA256SUMS | 2 + download/RTCSA08/liblitmus-RTCSA08.tgz | Bin 0 -> 10277 bytes download/RTCSA08/litmus-rt-RTCSA08.patch | 7768 ++++++++++++++++++++++++++++++ index.html | 17 + 4 files changed, 7787 insertions(+) create mode 100644 download/RTCSA08/SHA256SUMS create mode 100644 download/RTCSA08/liblitmus-RTCSA08.tgz create mode 100644 download/RTCSA08/litmus-rt-RTCSA08.patch diff --git a/download/RTCSA08/SHA256SUMS b/download/RTCSA08/SHA256SUMS new file mode 100644 index 0000000..4bc8472 --- /dev/null +++ b/download/RTCSA08/SHA256SUMS @@ -0,0 +1,2 @@ +f9176d0d1dfd7e1c4ab3ba5f4dc62efa3dd1ab8c50e2e63628fe2d2376cb344b liblitmus-RTCSA08.tgz +24c6b22ba13b096b3dc4356ed98f484548c68c77a59296952d72458154dd6bac litmus-rt-RTCSA08.patch diff --git a/download/RTCSA08/liblitmus-RTCSA08.tgz b/download/RTCSA08/liblitmus-RTCSA08.tgz new file mode 100644 index 0000000..9947121 Binary files /dev/null and b/download/RTCSA08/liblitmus-RTCSA08.tgz differ diff --git a/download/RTCSA08/litmus-rt-RTCSA08.patch b/download/RTCSA08/litmus-rt-RTCSA08.patch new file mode 100644 index 0000000..e4863a6 --- /dev/null +++ b/download/RTCSA08/litmus-rt-RTCSA08.patch @@ -0,0 +1,7768 @@ + Makefile | 2 +- + arch/i386/Kconfig | 28 ++ + arch/i386/kernel/apic.c | 92 +++++ + arch/i386/kernel/i386_ksyms.c | 1 + + arch/i386/kernel/signal.c | 3 +- + arch/i386/kernel/smp.c | 1 + + arch/i386/kernel/syscall_table.S | 22 + + fs/exec.c | 5 +- + fs/inode.c | 2 + + include/asm-i386/unistd.h | 25 ++- + include/linux/completion.h | 2 + + include/linux/fs.h | 5 + + include/linux/sched.h | 14 + + include/linux/uaccess.h | 16 + + include/litmus/edf_common.h | 27 ++ + include/litmus/fdso.h | 78 ++++ + include/litmus/feather_buffer.h | 108 +++++ + include/litmus/feather_trace.h | 93 +++++ + include/litmus/jobs.h | 9 + + include/litmus/litmus.h | 200 +++++++++ + include/litmus/rm_common.h | 44 ++ + include/litmus/rt_domain.h | 94 +++++ + include/litmus/rt_param.h | 177 ++++++++ + include/litmus/sched_plugin.h | 120 ++++++ + include/litmus/sched_trace.h | 31 ++ + include/litmus/trace.h | 106 +++++ + kernel/exit.c | 4 + + kernel/fork.c | 5 + + kernel/sched.c | 177 ++++++++- + lib/semaphore-sleepers.c | 2 +- + litmus/Makefile | 9 + + litmus/edf_common.c | 95 +++++ + litmus/fdso.c | 289 +++++++++++++ + litmus/ft_event.c | 104 +++++ + litmus/jobs.c | 43 ++ + litmus/litmus.c | 830 ++++++++++++++++++++++++++++++++++++++ + litmus/litmus_sem.c | 551 +++++++++++++++++++++++++ + litmus/pcp.c | 764 +++++++++++++++++++++++++++++++++++ + litmus/rm_common.c | 76 ++++ + litmus/rt_domain.c | 130 ++++++ + litmus/sched_gsn_edf.c | 733 +++++++++++++++++++++++++++++++++ + litmus/sched_plugin.c | 169 ++++++++ + litmus/sched_psn_edf.c | 458 +++++++++++++++++++++ + litmus/sched_rm.c | 397 ++++++++++++++++++ + litmus/sched_trace.c | 541 +++++++++++++++++++++++++ + litmus/sync.c | 84 ++++ + litmus/trace.c | 302 ++++++++++++++ + 47 files changed, 7052 insertions(+), 16 deletions(-) + +diff --git a/Makefile b/Makefile +index 7e2750f..79cf62b 100644 +--- a/Makefile ++++ b/Makefile +@@ -553,7 +553,7 @@ export mod_strip_cmd + + + ifeq ($(KBUILD_EXTMOD),) +-core-y += kernel/ mm/ fs/ ipc/ security/ crypto/ block/ ++core-y += kernel/ mm/ fs/ ipc/ security/ crypto/ block/ litmus/ + + vmlinux-dirs := $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \ + $(core-y) $(core-m) 
$(drivers-y) $(drivers-m) \ +diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig +index 0dfee81..da6f1e9 100644 +--- a/arch/i386/Kconfig ++++ b/arch/i386/Kconfig +@@ -1210,6 +1210,7 @@ config KPROBES + a probepoint and specifies the callback. Kprobes is useful + for kernel debugging, non-intrusive instrumentation and testing. + If in doubt, say "N". ++ + endmenu + + source "arch/i386/Kconfig.debug" +@@ -1259,3 +1260,30 @@ config X86_TRAMPOLINE + config KTIME_SCALAR + bool + default y ++ ++ ++menu "LITMUS^RT" ++ ++ ++config SCHED_TASK_TRACE ++ bool "Trace real-time tasks" ++ default y ++ help ++ Include support for the sched_trace_XXX() tracing functions. This ++ allows the collection of real-time task events such as job ++ completions, job releases, early completions, etc. This results in a ++ small overhead in the scheduling code. Disable if the overhead is not ++ acceptable (e.g., benchmarking). ++ ++config SCHED_DEBUG_TRACE ++ bool "TRACE() debugging" ++ default y ++ help ++ Include support for sched_trace_log_messageg(), which is used to ++ implement TRACE(). If disabled, no TRACE() messages will be included ++ in the kernel, and no overheads due to debugging statements will be ++ incurred by the scheduler. Disable if the overhead is not acceptable ++ (e.g. benchmarking). ++ ++ ++endmenu +diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c +index 776d9be..36b0159 100644 +--- a/arch/i386/kernel/apic.c ++++ b/arch/i386/kernel/apic.c +@@ -26,6 +26,7 @@ + #include <linux/sysdev.h> + #include <linux/cpu.h> + #include <linux/module.h> ++#include <litmus/litmus.h> + + #include <asm/atomic.h> + #include <asm/smp.h> +@@ -43,6 +44,8 @@ + + #include "io_ports.h" + ++#include <litmus/trace.h> ++ + /* + * cpu_mask that denotes the CPUs that needs timer interrupt coming in as + * IPIs in place of local APIC timers +@@ -54,6 +57,15 @@ static cpumask_t timer_bcast_ipi; + */ + static int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */ + ++/* ++ * Definitions and variables related to quantum synchronization. ++ */ ++#define WAIT_TO_SYNC 30000 /* time after boot until sync */ ++static int stagger = 0; /* are we using staggered quanta? */ ++static atomic_t qsync_time = ATOMIC_INIT(INITIAL_JIFFIES); ++static atomic_t quantum_sync_barrier = ATOMIC_INIT(0); ++static atomic_t sync_done = ATOMIC_INIT(0); ++ + static inline void lapic_disable(void) + { + enable_local_apic = -1; +@@ -786,6 +798,23 @@ static int __init apic_set_verbosity(char *str) + + __setup("apic=", apic_set_verbosity); + ++/* ++ * Determine whether to use aligned or staggerd quanta. ++ */ ++ ++static int __init apic_synch_type(char *str) ++{ ++ if (strcmp("aligned", str) == 0) ++ stagger = 0; ++ else if (strcmp("staggered", str) == 0) ++ stagger = 1; ++ else ++ stagger = 0; /* aligned quanta by default */ ++ return 1; ++} ++ ++__setup("quanta=", apic_synch_type); ++ + static int __init detect_init_APIC (void) + { + u32 h, l, features; +@@ -1198,6 +1227,47 @@ EXPORT_SYMBOL(switch_ipi_to_APIC_timer); + #undef APIC_DIVISOR + + /* ++ * This function is called to align all quanta, and to stagger quanta if ++ * necessary. It relies on a barrier to synchronize all processors, so ++ * that they all reset their APIC timers at the same time. If quanta ++ * should be staggered, the appropriate stagger delay is then added at ++ * each processor. 
++ */ ++ ++void synchronize_quanta(void) ++{ ++ int cpu = smp_processor_id(); ++ int total_cpus = num_online_cpus(); ++ int stagger_interval = jiffies_to_usecs(1) / total_cpus; ++ ++ /* ++ * Disable APIC timer, wait for all other processors to reach barrier, ++ * and re-enable all timers concurrently. ++ */ ++ disable_APIC_timer(); ++ atomic_inc(&quantum_sync_barrier); ++ while (atomic_read(&quantum_sync_barrier) < total_cpus) { ++ /* Delay, otherwise atomic_inc's cannot occur. */ ++ udelay(1); ++ } ++ ++ /* Add necessary stagger for this CPU, if required. */ ++ if (stagger) { ++ int stagger_us = cpu * stagger_interval; ++ udelay(stagger_us); ++ } ++ ++ /* Re-enable all timers. */ ++ __setup_APIC_LVTT(calibration_result); ++ enable_APIC_timer(); ++ ++ /* The first CPU signals that quantum sync is complete. */ ++ if (cpu == 0) ++ atomic_inc(&sync_done); ++} ++ ++ ++/* + * Local timer interrupt handler. It does both profiling and + * process statistics/rescheduling. + * +@@ -1209,11 +1279,32 @@ EXPORT_SYMBOL(switch_ipi_to_APIC_timer); + + inline void smp_local_timer_interrupt(void) + { ++/* s64 offset; */ ++ ++ TS_TICK_START; ++ + profile_tick(CPU_PROFILING); + #ifdef CONFIG_SMP + update_process_times(user_mode_vm(get_irq_regs())); + #endif + ++ /* Print out timing data - can be commented out if necessary. */ ++/* offset = get_nsec_offset(); */ ++/* TRACE("%d\n", offset); */ ++ ++ /* ++ * Synchronize quanta if we have reached qsync_time plus wait ++ * interval. The synchronization code itself is placed in its own ++ * (non-inline) function, to avoid issues with creating an inline ++ * function that is too large. ++ */ ++ if (unlikely(!atomic_read(&sync_done) && ++ time_after(jiffies, ++ (unsigned long)(atomic_read(&qsync_time) + ++ msecs_to_jiffies(WAIT_TO_SYNC))))) { ++ synchronize_quanta(); ++ } ++ + /* + * We take the 'long' return path, and there every subsystem + * grabs the apropriate locks (kernel lock/ irq lock). +@@ -1224,6 +1315,7 @@ inline void smp_local_timer_interrupt(void) + * Currently this isn't too much of an issue (performance wise), + * we can take more than 100K local irqs per second on a 100 MHz P5. + */ ++ TS_TICK_END; + } + + /* +diff --git a/arch/i386/kernel/i386_ksyms.c b/arch/i386/kernel/i386_ksyms.c +index e3d4b73..9670f77 100644 +--- a/arch/i386/kernel/i386_ksyms.c ++++ b/arch/i386/kernel/i386_ksyms.c +@@ -6,6 +6,7 @@ EXPORT_SYMBOL(__down_failed); + EXPORT_SYMBOL(__down_failed_interruptible); + EXPORT_SYMBOL(__down_failed_trylock); + EXPORT_SYMBOL(__up_wakeup); ++ + /* Networking helper routines. 
*/ + EXPORT_SYMBOL(csum_partial_copy_generic); + +diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c +index 65d7620..e95d732 100644 +--- a/arch/i386/kernel/signal.c ++++ b/arch/i386/kernel/signal.c +@@ -651,7 +651,6 @@ void do_notify_resume(struct pt_regs *regs, void *_unused, + + /* deal with pending signal delivery */ + if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK)) +- do_signal(regs); +- ++ do_signal(regs); + clear_thread_flag(TIF_IRET); + } +diff --git a/arch/i386/kernel/smp.c b/arch/i386/kernel/smp.c +index 5285aff..91921a3 100644 +--- a/arch/i386/kernel/smp.c ++++ b/arch/i386/kernel/smp.c +@@ -605,6 +605,7 @@ void smp_send_stop(void) + */ + fastcall void smp_reschedule_interrupt(struct pt_regs *regs) + { ++ set_tsk_need_resched(current); + ack_APIC_irq(); + } + +diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S +index 2697e92..48e5e8e 100644 +--- a/arch/i386/kernel/syscall_table.S ++++ b/arch/i386/kernel/syscall_table.S +@@ -319,3 +319,25 @@ ENTRY(sys_call_table) + .long sys_move_pages + .long sys_getcpu + .long sys_epoll_pwait ++ /* LITMUS syscalls */ ++ .long sys_set_rt_task_param /* 320 */ ++ .long sys_get_rt_task_param ++ .long sys_task_mode_transition ++ .long sys_sleep_next_period ++ .long sys_register_np_flag ++ .long sys_exit_np /* 325 */ ++ .long sys_od_open ++ .long sys_od_close ++ .long sys_pi_down ++ .long sys_pi_up ++ .long sys_srp_down /* 330 */ ++ .long sys_srp_up ++ .long sys_reg_task_srp_sem ++ .long sys_query_job_no ++ .long sys_wait_for_job_release ++ .long sys_wait_for_ts_release /* 335 */ ++ .long sys_release_ts ++ .long sys_pcp_down ++ .long sys_pcp_up ++ .long sys_dpcp_invoke ++ .long sys_dpcp_agent /* 340 */ +diff --git a/fs/exec.c b/fs/exec.c +index 11fe93f..353d6e3 100644 +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -54,6 +54,8 @@ + #include <asm/uaccess.h> + #include <asm/mmu_context.h> + ++#include <litmus/litmus.h> ++ + #ifdef CONFIG_KMOD + #include <linux/kmod.h> + #endif +@@ -1140,7 +1142,8 @@ int do_execve(char * filename, + if (IS_ERR(file)) + goto out_kfree; + +- sched_exec(); ++ sched_exec(); ++ litmus_exec(); + + bprm->p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *); + +diff --git a/fs/inode.c b/fs/inode.c +index bf21dc6..fcf8ce3 100644 +--- a/fs/inode.c ++++ b/fs/inode.c +@@ -205,6 +205,8 @@ void inode_init_once(struct inode *inode) + INIT_LIST_HEAD(&inode->inotify_watches); + mutex_init(&inode->inotify_mutex); + #endif ++ INIT_LIST_HEAD(&inode->i_obj_list); ++ mutex_init(&inode->i_obj_mutex); + } + + EXPORT_SYMBOL(inode_init_once); +diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h +index 833fa17..d0ba5c3 100644 +--- a/include/asm-i386/unistd.h ++++ b/include/asm-i386/unistd.h +@@ -325,10 +325,33 @@ + #define __NR_move_pages 317 + #define __NR_getcpu 318 + #define __NR_epoll_pwait 319 ++/* LITMUS */ ++#define __NR_set_rt_task_param 320 ++#define __NR_get_rt_task_param 321 ++#define __NR_task_mode 322 ++#define __NR_sleep_next_period 323 ++#define __NR_register_np_flag 324 ++#define __NR_exit_np 325 ++#define __NR_od_open 326 ++#define __NR_od_close 327 ++#define __NR_pi_down 328 ++#define __NR_pi_up 329 ++#define __NR_srp_down 330 ++#define __NR_srp_up 331 ++#define __NR_reg_task_srp_sem 332 ++#define __NR_query_job_no 333 ++#define __NR_wait_for_job_release 334 ++#define __NR_wait_for_ts_release 335 ++#define __NR_release_ts 336 ++#define __NR_pcp_down 337 ++#define __NR_pcp_up 338 ++#define __NR_dpcp_invoke 339 ++#define __NR_dpcp_agent 340 ++ + + #ifdef __KERNEL__ + 
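The syscall numbers defined above mirror the entries added to arch/i386/kernel/syscall_table.S earlier in the patch. As a rough editorial sketch (outside the patch itself), user space can reach one of these calls directly through syscall(2); the liblitmus archive shipped in this commit provides the real wrappers, so the helper name below is only an assumption for illustration:

#include <unistd.h>
#include <sys/syscall.h>

/* illustrative wrapper (assumed name); sys_sleep_next_period takes no
 * arguments and puts the calling real-time task to sleep until its next
 * job release (see sleep_next_period_t in litmus/sched_plugin.h). */
static long sleep_next_period(void)
{
	return syscall(__NR_sleep_next_period);
}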
+-#define NR_syscalls 320 ++#define NR_syscalls 343 + + #define __ARCH_WANT_IPC_PARSE_VERSION + #define __ARCH_WANT_OLD_READDIR +diff --git a/include/linux/completion.h b/include/linux/completion.h +index 268c5a4..dc633ed 100644 +--- a/include/linux/completion.h ++++ b/include/linux/completion.h +@@ -51,6 +51,8 @@ extern unsigned long FASTCALL(wait_for_completion_interruptible_timeout( + + extern void FASTCALL(complete(struct completion *)); + extern void FASTCALL(complete_all(struct completion *)); ++extern void FASTCALL(complete_n(struct completion *, int n)); ++ + + #define INIT_COMPLETION(x) ((x).done = 0) + +diff --git a/include/linux/fs.h b/include/linux/fs.h +index 1410e53..4e1117c 100644 +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -524,6 +524,8 @@ static inline int mapping_writably_mapped(struct address_space *mapping) + #define i_size_ordered_init(inode) do { } while (0) + #endif + ++struct inode_obj_id_table; ++ + struct inode { + struct hlist_node i_hash; + struct list_head i_list; +@@ -589,6 +591,9 @@ struct inode { + void *i_security; + #endif + void *i_private; /* fs or device private pointer */ ++ ++ struct list_head i_obj_list; ++ struct mutex i_obj_mutex; + }; + + /* +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 4463735..c7929d6 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -3,6 +3,8 @@ + + #include <linux/auxvec.h> /* For AT_VECTOR_SIZE */ + ++#include <litmus/rt_param.h> ++ + /* + * cloning flags: + */ +@@ -796,6 +798,8 @@ enum sleep_type { + SLEEP_INTERRUPTED, + }; + ++struct od_table_entry; ++ + struct prio_array; + + struct task_struct { +@@ -1051,6 +1055,16 @@ struct task_struct { + #ifdef CONFIG_FAULT_INJECTION + int make_it_fail; + #endif ++ /* litmus parameters and state */ ++ struct rt_param rt_param; ++ ++ /* allow scheduler plugins to queue in release lists, etc. ++ * Cleanup: Move this into the rt_param struct. ++ */ ++ struct list_head rt_list; ++ ++ /* references to PI semaphores, etc. */ ++ struct od_table_entry* od_table; + }; + + static inline pid_t process_group(struct task_struct *tsk) +diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h +index 975c963..6ae0ff9 100644 +--- a/include/linux/uaccess.h ++++ b/include/linux/uaccess.h +@@ -84,4 +84,20 @@ static inline unsigned long __copy_from_user_nocache(void *to, + ret; \ + }) + ++/* This is a naive attempt at a write version of the above native Linux macro. ++ */ ++#define poke_kernel_address(val, addr) \ ++ ({ \ ++ long ret; \ ++ mm_segment_t old_fs = get_fs(); \ ++ \ ++ set_fs(KERNEL_DS); \ ++ pagefault_disable(); \ ++ ret = __put_user(val, (__force typeof(val) __user *)(addr)); \ ++ pagefault_enable(); \ ++ set_fs(old_fs); \ ++ ret; \ ++ }) ++ ++ + #endif /* __LINUX_UACCESS_H__ */ +diff --git a/include/litmus/edf_common.h b/include/litmus/edf_common.h +new file mode 100644 +index 0000000..df711f5 +--- /dev/null ++++ b/include/litmus/edf_common.h +@@ -0,0 +1,27 @@ ++/* EDF common data structures and utility functions shared by all EDF ++ * based scheduler plugins ++ */ ++ ++/* CLEANUP: Add comments and make it less messy. 
++ * ++ */ ++ ++#ifndef __UNC_EDF_COMMON_H__ ++#define __UNC_EDF_COMMON_H__ ++ ++#include <litmus/rt_domain.h> ++ ++ ++void edf_domain_init(rt_domain_t* rt, check_resched_needed_t resched); ++ ++int edf_higher_prio(struct task_struct* first, ++ struct task_struct* second); ++ ++int edf_ready_order(struct list_head* a, struct list_head* b); ++ ++int edf_preemption_needed(rt_domain_t* rt, struct task_struct *t); ++ ++#define job_completed(t) (!is_be(t) && \ ++ (t)->rt_param.times.exec_time == (t)->rt_param.basic_params.exec_cost) ++ ++#endif +diff --git a/include/litmus/fdso.h b/include/litmus/fdso.h +new file mode 100644 +index 0000000..5544c1b +--- /dev/null ++++ b/include/litmus/fdso.h +@@ -0,0 +1,78 @@ ++/* fdso.h - file descriptor attached shared objects ++ * ++ * (c) 2007 B. Brandenburg, LITMUS^RT project ++ */ ++ ++#ifndef _LINUX_FDSO_H_ ++#define _LINUX_FDSO_H_ ++ ++#include <linux/list.h> ++#include <asm/atomic.h> ++ ++#include <linux/fs.h> ++ ++#define MAX_OBJECT_DESCRIPTORS 32 ++ ++typedef enum { ++ MIN_OBJ_TYPE = 0, ++ ++ PI_SEM = 0, ++ SRP_SEM = 1, ++ PCP_SEM = 2, ++ MPCP_SEM = 3, ++ ++ MAX_OBJ_TYPE = 3 ++} obj_type_t; ++ ++struct inode_obj_id { ++ struct list_head list; ++ atomic_t count; ++ struct inode* inode; ++ ++ obj_type_t type; ++ void* obj; ++ unsigned int id; ++}; ++ ++ ++struct od_table_entry { ++ unsigned int used; ++ ++ struct inode_obj_id* obj; ++ void* extra; ++}; ++ ++struct fdso_ops { ++ void* (*create) (void); ++ void (*destroy)(void*); ++ int (*open) (struct od_table_entry*, void* __user); ++ int (*close) (struct od_table_entry*); ++}; ++ ++/* translate a userspace supplied od into the raw table entry ++ * returns NULL if od is invalid ++ */ ++struct od_table_entry* __od_lookup(int od); ++ ++/* translate a userspace supplied od into the associated object ++ * returns NULL if od is invalid ++ */ ++static inline void* od_lookup(int od, obj_type_t type) ++{ ++ struct od_table_entry* e = __od_lookup(od); ++ return e && e->obj->type == type ? e->obj->obj : NULL; ++} ++ ++static inline void* od_lookup2(int od, obj_type_t type, obj_type_t type2) ++{ ++ struct od_table_entry* e = __od_lookup(od); ++ return e && (e->obj->type == type || e->obj->type == type2) ? 
++ e->obj->obj : NULL; ++} ++ ++#define lookup_pi_sem(od) ((struct pi_semaphore*) od_lookup(od, PI_SEM)) ++#define lookup_srp_sem(od) ((struct srp_semaphore*) od_lookup(od, SRP_SEM)) ++#define lookup_pcp_sem(od) ((struct pcp_semaphore*) \ ++ od_lookup2(od, PCP_SEM, MPCP_SEM)) ++ ++#endif +diff --git a/include/litmus/feather_buffer.h b/include/litmus/feather_buffer.h +new file mode 100644 +index 0000000..c788227 +--- /dev/null ++++ b/include/litmus/feather_buffer.h +@@ -0,0 +1,108 @@ ++#ifndef _FEATHER_BUFFER_H_ ++#define _FEATHER_BUFFER_H_ ++ ++/* requires UINT_MAX and memcpy */ ++ ++static inline int fetch_and_inc(int *val) ++{ ++ int ret = 1; ++ __asm__ __volatile__("lock; xaddl %0, %1" : "+r" (ret), "+m" (*val) : : "memory" ); ++ return ret; ++} ++ ++static inline int fetch_and_dec(int *val) ++{ ++ int ret = -1; ++ __asm__ __volatile__("lock; xaddl %0, %1" : "+r" (ret), "+m" (*val) : : "memory" ); ++ return ret; ++} ++ ++#define SLOT_FREE 0 ++#define SLOT_BUSY 1 ++#define SLOT_READY 2 ++ ++struct ft_buffer { ++ unsigned int slot_count; ++ unsigned int slot_size; ++ ++ int free_count; ++ unsigned int write_idx; ++ unsigned int read_idx; ++ ++ char* slots; ++ void* buffer_mem; ++ unsigned int failed_writes; ++}; ++ ++static inline int init_ft_buffer(struct ft_buffer* buf, ++ unsigned int slot_count, ++ unsigned int slot_size, ++ char* slots, ++ void* buffer_mem) ++{ ++ int i = 0; ++ if (!slot_count || UINT_MAX % slot_count != slot_count - 1) { ++ /* The slot count must divide UNIT_MAX + 1 so that when it ++ * wraps around the index correctly points to 0. ++ */ ++ return 0; ++ } else { ++ buf->slot_count = slot_count; ++ buf->slot_size = slot_size; ++ buf->slots = slots; ++ buf->buffer_mem = buffer_mem; ++ buf->free_count = slot_count; ++ buf->write_idx = 0; ++ buf->read_idx = 0; ++ buf->failed_writes = 0; ++ for (i = 0; i < slot_count; i++) ++ buf->slots[i] = SLOT_FREE; ++ return 1; ++ } ++} ++ ++static inline int ft_buffer_start_write(struct ft_buffer* buf, void **ptr) ++{ ++ int free = fetch_and_dec(&buf->free_count); ++ unsigned int idx; ++ if (free <= 0) { ++ fetch_and_inc(&buf->free_count); ++ *ptr = 0; ++ fetch_and_inc(&buf->failed_writes); ++ return 0; ++ } else { ++ idx = fetch_and_inc((int*) &buf->write_idx) % buf->slot_count; ++ buf->slots[idx] = SLOT_BUSY; ++ *ptr = ((char*) buf->buffer_mem) + idx * buf->slot_size; ++ return 1; ++ } ++} ++ ++static inline void ft_buffer_finish_write(struct ft_buffer* buf, void *ptr) ++{ ++ unsigned int idx = ((char*) ptr - (char*) buf->buffer_mem) / buf->slot_size; ++ buf->slots[idx] = SLOT_READY; ++} ++ ++ ++/* exclusive reader access is assumed */ ++static inline int ft_buffer_read(struct ft_buffer* buf, void* dest) ++{ ++ unsigned int idx; ++ if (buf->free_count == buf->slot_count) ++ /* nothing available */ ++ return 0; ++ idx = buf->read_idx % buf->slot_count; ++ if (buf->slots[idx] == SLOT_READY) { ++ memcpy(dest, ((char*) buf->buffer_mem) + idx * buf->slot_size, ++ buf->slot_size); ++ buf->slots[idx] = SLOT_FREE; ++ buf->read_idx++; ++ fetch_and_inc(&buf->free_count); ++ return 1; ++ } else ++ return 0; ++} ++ ++ ++#endif +diff --git a/include/litmus/feather_trace.h b/include/litmus/feather_trace.h +new file mode 100644 +index 0000000..5c37ea7 +--- /dev/null ++++ b/include/litmus/feather_trace.h +@@ -0,0 +1,93 @@ ++#ifndef _FEATHER_TRACE_H_ ++#define _FEATHER_TRACE_H_ ++ ++#define feather_callback __attribute__((regparm(0))) ++ ++/* make the compiler reload any register that is not saved in ++ * a cdecl function call ++ */ ++#define 
CLOBBER_LIST "memory", "cc", "eax", "ecx", "edx" ++ ++#define ft_event(id, callback) \ ++ __asm__ __volatile__( \ ++ "1: jmp 2f \n\t" \ ++ " call " #callback " \n\t" \ ++ ".section __event_table, \"aw\" \n\t" \ ++ ".long " #id ", 0, 1b, 2f \n\t" \ ++ ".previous \n\t" \ ++ "2: \n\t" \ ++ : : : CLOBBER_LIST) ++ ++#define ft_event0(id, callback) \ ++ __asm__ __volatile__( \ ++ "1: jmp 2f \n\t" \ ++ " subl $4, %%esp \n\t" \ ++ " movl $" #id ", (%%esp) \n\t" \ ++ " call " #callback " \n\t" \ ++ " addl $4, %%esp \n\t" \ ++ ".section __event_table, \"aw\" \n\t" \ ++ ".long " #id ", 0, 1b, 2f \n\t" \ ++ ".previous \n\t" \ ++ "2: \n\t" \ ++ : : : CLOBBER_LIST) ++ ++#define ft_event1(id, callback, param) \ ++ __asm__ __volatile__( \ ++ "1: jmp 2f \n\t" \ ++ " subl $8, %%esp \n\t" \ ++ " movl %0, 4(%%esp) \n\t" \ ++ " movl $" #id ", (%%esp) \n\t" \ ++ " call " #callback " \n\t" \ ++ " addl $8, %%esp \n\t" \ ++ ".section __event_table, \"aw\" \n\t" \ ++ ".long " #id ", 0, 1b, 2f \n\t" \ ++ ".previous \n\t" \ ++ "2: \n\t" \ ++ : : "r" (param) : CLOBBER_LIST) ++ ++#define ft_event2(id, callback, param, param2) \ ++ __asm__ __volatile__( \ ++ "1: jmp 2f \n\t" \ ++ " subl $12, %%esp \n\t" \ ++ " movl %1, 8(%%esp) \n\t" \ ++ " movl %0, 4(%%esp) \n\t" \ ++ " movl $" #id ", (%%esp) \n\t" \ ++ " call " #callback " \n\t" \ ++ " addl $12, %%esp \n\t" \ ++ ".section __event_table, \"aw\" \n\t" \ ++ ".long " #id ", 0, 1b, 2f \n\t" \ ++ ".previous \n\t" \ ++ "2: \n\t" \ ++ : : "r" (param), "r" (param2) : CLOBBER_LIST) ++ ++ ++#define ft_event3(id, callback, p, p2, p3) \ ++ __asm__ __volatile__( \ ++ "1: jmp 2f \n\t" \ ++ " subl $16, %%esp \n\t" \ ++ " movl %1, 12(%%esp) \n\t" \ ++ " movl %1, 8(%%esp) \n\t" \ ++ " movl %0, 4(%%esp) \n\t" \ ++ " movl $" #id ", (%%esp) \n\t" \ ++ " call " #callback " \n\t" \ ++ " addl $16, %%esp \n\t" \ ++ ".section __event_table, \"aw\" \n\t" \ ++ ".long " #id ", 0, 1b, 2f \n\t" \ ++ ".previous \n\t" \ ++ "2: \n\t" \ ++ : : "r" (p), "r" (p2), "r" (p3) : CLOBBER_LIST) ++ ++ ++static inline unsigned long long ft_read_tsc(void) ++{ ++ unsigned long long ret; ++ __asm__ __volatile__("rdtsc" : "=A" (ret)); ++ return ret; ++} ++ ++int ft_enable_event(unsigned long id); ++int ft_disable_event(unsigned long id); ++int ft_is_event_enabled(unsigned long id); ++int ft_disable_all_events(void); ++ ++#endif +diff --git a/include/litmus/jobs.h b/include/litmus/jobs.h +new file mode 100644 +index 0000000..9bd361e +--- /dev/null ++++ b/include/litmus/jobs.h +@@ -0,0 +1,9 @@ ++#ifndef __LITMUS_JOBS_H__ ++#define __LITMUS_JOBS_H__ ++ ++void prepare_for_next_period(struct task_struct *t); ++void release_at(struct task_struct *t, lt_t start); ++long complete_job(void); ++ ++#endif ++ +diff --git a/include/litmus/litmus.h b/include/litmus/litmus.h +new file mode 100644 +index 0000000..5853ed5 +--- /dev/null ++++ b/include/litmus/litmus.h +@@ -0,0 +1,200 @@ ++/* ++ * Constant definitions related to ++ * scheduling policy. 
++ */ ++ ++#ifndef _LINUX_LITMUS_H_ ++#define _LINUX_LITMUS_H_ ++ ++#include <linux/jiffies.h> ++#include <litmus/sched_trace.h> ++ ++typedef enum { ++ SCHED_LINUX = 0, ++ SCHED_GSN_EDF = 10, ++ SCHED_PSN_EDF = 11, ++ /* Add your scheduling policy here */ ++ ++ SCHED_DEFAULT = 0, ++ SCHED_INVALID = -1, ++} spolicy; ++ ++ ++typedef enum { ++ LITMUS_RESERVED_RANGE = 1024, ++ ++} sched_setup_cmd_t; ++ ++/* per-task modes */ ++enum rt_task_mode_t { ++ BACKGROUND_TASK = 0, ++ LITMUS_RT_TASK = 1 ++}; ++ ++/* Plugin boot options, for convenience */ ++#define PLUGIN_LINUX "linux" ++#define PLUGIN_GSN_EDF "gsn_edf" ++#define PLUGIN_PSN_EDF "psn_edf" ++ ++extern spolicy sched_policy; ++ ++/* RT mode start time */ ++extern volatile unsigned long rt_start_time; ++ ++#define TRACE(fmt, args...) \ ++ sched_trace_log_message("%d: " fmt, raw_smp_processor_id(), ## args) ++ ++#define TRACE_TASK(t, fmt, args...) \ ++ TRACE("(%s/%d) " fmt, (t)->comm, (t)->pid, ##args) ++ ++#define TRACE_CUR(fmt, args...) \ ++ TRACE_TASK(current, fmt, ## args) ++ ++#define TRACE_BUG_ON(cond) \ ++ do { if (cond) TRACE("BUG_ON(%s) at %s:%d " \ ++ "called from %p current=%s/%d state=%d " \ ++ "flags=%x partition=%d cpu=%d rtflags=%d"\ ++ " job=%u knp=%d timeslice=%u\n", \ ++ #cond, __FILE__, __LINE__, __builtin_return_address(0), current->comm, \ ++ current->pid, current->state, current->flags, \ ++ get_partition(current), smp_processor_id(), get_rt_flags(current), \ ++ current->rt_param.job_params.job_no, current->rt_param.kernel_np, \ ++ current->time_slice\ ++ ); } while(0); ++ ++ ++/* in_list - is a given list_head queued on some list? ++ */ ++static inline int in_list(struct list_head* list) ++{ ++ return !( /* case 1: deleted */ ++ (list->next == LIST_POISON1 && ++ list->prev == LIST_POISON2) ++ || ++ /* case 2: initialized */ ++ (list->next == list && ++ list->prev == list) ++ ); ++} ++ ++typedef int (*prio_cmp_t)(struct task_struct* first, ++ struct task_struct* second); ++ ++typedef int (*list_cmp_t)(struct list_head*, struct list_head*); ++ ++static inline unsigned int list_insert(struct list_head* new, ++ struct list_head* head, ++ list_cmp_t order_before) ++{ ++ struct list_head *pos; ++ unsigned int passed = 0; ++ ++ BUG_ON(!new); ++ ++ /* find a spot where the new entry is less than the next */ ++ list_for_each(pos, head) { ++ if (unlikely(order_before(new, pos))) { ++ /* pos is not less than new, thus insert here */ ++ __list_add(new, pos->prev, pos); ++ goto out; ++ } ++ passed++; ++ } ++ /* if we get to this point either the list is empty or every entry ++ * queued element is less than new. ++ * Let's add new to the end. 
*/ ++ list_add_tail(new, head); ++ out: ++ return passed; ++} ++ ++void list_qsort(struct list_head* list, list_cmp_t less_than); ++ ++ ++#define RT_PREEMPTIVE 0x2050 /* = NP */ ++#define RT_NON_PREEMPTIVE 0x4e50 /* = P */ ++#define RT_EXIT_NP_REQUESTED 0x5251 /* = RQ */ ++ ++/* returns 1 if task t has registered np flag and set it to RT_NON_PREEMPTIVE ++ */ ++int is_np(struct task_struct *t); ++ ++/* request that the task should call sys_exit_np() ++ */ ++void request_exit_np(struct task_struct *t); ++ ++/* kill naughty tasks ++ */ ++void scheduler_signal(struct task_struct *t, unsigned int signal); ++void send_scheduler_signals(void); ++void np_mem_kill(struct task_struct *t); ++ ++void litmus_fork(struct task_struct *tsk); ++void litmus_exec(void); ++/* clean up real-time state of a task */ ++void exit_litmus(struct task_struct *dead_tsk); ++ ++long transition_to_rt(struct task_struct* tsk); ++long transition_to_be(struct task_struct* tsk); ++ ++#define is_realtime(t) ((t)->rt_param.is_realtime) ++#define rt_transition_pending(t) \ ++ ((t)->rt_param.transition_pending) ++ ++/* Realtime utility macros */ ++#define get_rt_flags(t) ((t)->rt_param.flags) ++#define set_rt_flags(t,f) (t)->rt_param.flags=(f) ++#define get_exec_cost(t) ((t)->rt_param.task_params.exec_cost) ++#define get_exec_time(t) ((t)->rt_param.job_params.exec_time) ++#define get_rt_period(t) ((t)->rt_param.task_params.period) ++#define get_partition(t) (t)->rt_param.task_params.cpu ++#define get_deadline(t) ((t)->rt_param.job_params.deadline) ++#define get_class(t) ((t)->rt_param.task_params.cls) ++ ++inline static int budget_exhausted(struct task_struct* t) ++{ ++ return get_exec_time(t) >= get_exec_cost(t); ++} ++ ++ ++#define is_hrt(t) \ ++ ((t)->rt_param.task_params.class == RT_CLASS_HARD) ++#define is_srt(t) \ ++ ((t)->rt_param.task_params.class == RT_CLASS_SOFT) ++#define is_be(t) \ ++ ((t)->rt_param.task_params.class == RT_CLASS_BEST_EFFORT) ++ ++#define get_release(t) ((t)->rt_param.job_params.release) ++ ++/* Honor the flag in the preempt_count variable that is set ++ * when scheduling is in progress. ++ */ ++#define is_running(t) \ ++ ((t)->state == TASK_RUNNING || \ ++ (t)->thread_info->preempt_count & PREEMPT_ACTIVE) ++ ++#define is_blocked(t) \ ++ (!is_running(t)) ++#define is_released(t, now) \ ++ (lt_before_eq(get_release(t), now)) ++#define is_tardy(t, now) \ ++ (lt_before_eq((t)->rt_param.job_params.deadline, now)) ++ ++/* real-time comparison macros */ ++#define earlier_deadline(a, b) (lt_before(\ ++ (a)->rt_param.job_params.deadline,\ ++ (b)->rt_param.job_params.deadline)) ++#define earlier_release(a, b) (lt_before(\ ++ (a)->rt_param.job_params.release,\ ++ (b)->rt_param.job_params.release)) ++ ++#define shorter_period(a, b) (lt_before(\ ++ (a)->rt_param.task_params.period, \ ++ (b)->rt_param.task_params.period)) ++ ++#define make_np(t) do {t->rt_param.kernel_np++;} while(0); ++#define take_np(t) do {t->rt_param.kernel_np--;} while(0); ++ ++void srp_ceiling_block(void); ++ ++#endif +diff --git a/include/litmus/rm_common.h b/include/litmus/rm_common.h +new file mode 100644 +index 0000000..11e8365 +--- /dev/null ++++ b/include/litmus/rm_common.h +@@ -0,0 +1,44 @@ ++/* rate monotonic helper functions. ++ */ ++ ++ ++#ifndef __UNC_RM_COMMON_H__ ++#define __UNC_RM_COMMON_H__ ++ ++#include <litmus/rt_domain.h> ++ ++static inline int _rm_higher_prio(struct pcp_priority *p1, ++ struct pcp_priority *p2) ++{ ++ /* does the second task exist and is it a real-time task? 
If ++ * not, the first task (which is a RT task) has higher ++ * priority. ++ */ ++ ++ if (unlikely(!p2)) ++ return 1; ++ ++ if (p1->in_global_cs == p2->in_global_cs) { ++ /* tie break by RM priority */ ++ if (p1->prio == p2->prio) ++ /* tie break equal periods by PID */ ++ return p1->pid < p2->pid; ++ else ++ /* shorter period or lower index has higher priority */ ++ return p1->prio < p2->prio; ++ } else ++ /* gcs always have higher priority */ ++ return p1->in_global_cs > p2->in_global_cs; ++} ++ ++ ++void rm_domain_init(rt_domain_t* rt, check_resched_needed_t resched); ++ ++int rm_higher_prio(struct task_struct* first, ++ struct task_struct* second); ++ ++int rm_ready_order(struct list_head* a, struct list_head* b); ++ ++int rm_preemption_needed(rt_domain_t* rt, struct task_struct *t); ++ ++#endif +diff --git a/include/litmus/rt_domain.h b/include/litmus/rt_domain.h +new file mode 100644 +index 0000000..79b6034 +--- /dev/null ++++ b/include/litmus/rt_domain.h +@@ -0,0 +1,94 @@ ++/* CLEANUP: Add comments and make it less messy. ++ * ++ */ ++ ++#ifndef __UNC_RT_DOMAIN_H__ ++#define __UNC_RT_DOMAIN_H__ ++ ++struct _rt_domain; ++ ++typedef int (*check_resched_needed_t)(struct _rt_domain *rt); ++typedef void (*release_at_t)(struct task_struct *t, lt_t start); ++ ++typedef struct _rt_domain { ++ /* runnable rt tasks are in here */ ++ rwlock_t ready_lock; ++ struct list_head ready_queue; ++ ++ /* real-time tasks waiting for release are in here */ ++ spinlock_t release_lock; ++ struct list_head release_queue; ++ ++ /* how do we check if we need to kick another CPU? */ ++ check_resched_needed_t check_resched; ++ ++ /* how are tasks ordered in the ready queue? */ ++ list_cmp_t order; ++} rt_domain_t; ++ ++#define next_ready(rt) \ ++ (list_entry((rt)->ready_queue.next, struct task_struct, rt_list)) ++ ++#define ready_jobs_pending(rt) \ ++ (!list_empty(&(rt)->ready_queue)) ++ ++void rt_domain_init(rt_domain_t *rt, check_resched_needed_t f, ++ list_cmp_t order); ++ ++void __add_ready(rt_domain_t* rt, struct task_struct *new); ++void __add_release(rt_domain_t* rt, struct task_struct *task); ++ ++struct task_struct* __take_ready(rt_domain_t* rt); ++struct task_struct* __peek_ready(rt_domain_t* rt); ++ ++void try_release_pending(rt_domain_t* rt); ++void __release_pending(rt_domain_t* rt); ++ ++static inline void add_ready(rt_domain_t* rt, struct task_struct *new) ++{ ++ unsigned long flags; ++ /* first we need the write lock for rt_ready_queue */ ++ write_lock_irqsave(&rt->ready_lock, flags); ++ __add_ready(rt, new); ++ write_unlock_irqrestore(&rt->ready_lock, flags); ++} ++ ++static inline struct task_struct* take_ready(rt_domain_t* rt) ++{ ++ unsigned long flags; ++ struct task_struct* ret; ++ /* first we need the write lock for rt_ready_queue */ ++ write_lock_irqsave(&rt->ready_lock, flags); ++ ret = __take_ready(rt); ++ write_unlock_irqrestore(&rt->ready_lock, flags); ++ return ret; ++} ++ ++ ++static inline void add_release(rt_domain_t* rt, struct task_struct *task) ++{ ++ unsigned long flags; ++ /* first we need the write lock for rt_ready_queue */ ++ spin_lock_irqsave(&rt->release_lock, flags); ++ __add_release(rt, task); ++ spin_unlock_irqrestore(&rt->release_lock, flags); ++} ++ ++static inline int __jobs_pending(rt_domain_t* rt) ++{ ++ return !list_empty(&rt->ready_queue); ++} ++ ++static inline int jobs_pending(rt_domain_t* rt) ++{ ++ unsigned long flags; ++ int ret; ++ /* first we need the write lock for rt_ready_queue */ ++ read_lock_irqsave(&rt->ready_lock, flags); ++ ret = 
__jobs_pending(rt); ++ read_unlock_irqrestore(&rt->ready_lock, flags); ++ return ret; ++} ++ ++ ++#endif +diff --git a/include/litmus/rt_param.h b/include/litmus/rt_param.h +new file mode 100644 +index 0000000..37a4495 +--- /dev/null ++++ b/include/litmus/rt_param.h +@@ -0,0 +1,177 @@ ++/* ++ * Definition of the scheduler plugin interface. ++ * ++ */ ++#ifndef _LINUX_RT_PARAM_H_ ++#define _LINUX_RT_PARAM_H_ ++ ++/* Litmus time type. */ ++typedef unsigned long long lt_t; ++ ++static inline int lt_after(lt_t a, lt_t b) ++{ ++ return ((long long) b) - ((long long) a) < 0; ++} ++#define lt_before(a, b) lt_after(b, a) ++ ++static inline int lt_after_eq(lt_t a, lt_t b) ++{ ++ return ((long long) a) - ((long long) b) >= 0; ++} ++#define lt_before_eq(a, b) lt_after_eq(b, a) ++ ++/* different types of clients */ ++typedef enum { ++ RT_CLASS_HARD, ++ RT_CLASS_SOFT, ++ RT_CLASS_BEST_EFFORT ++} task_class_t; ++ ++struct rt_task { ++ lt_t exec_cost; ++ lt_t period; ++ lt_t phase; ++ lt_t prio; ++ unsigned int cpu; ++ task_class_t cls; ++}; ++ ++#define DPCP_WAIT 0x1 ++#define DPCP_COMPLETE 0x2 ++ ++/* don't export internal data structures to user space (liblitmus) */ ++#ifdef __KERNEL__ ++ ++#include <linux/list.h> ++ ++struct rt_job { ++ /* Time instant the the job was or will be released. */ ++ lt_t release; ++ /* What is the current deadline? */ ++ lt_t deadline; ++ /* How much service has this job received so far? ++ */ ++ lt_t exec_time; ++ ++ /* Which job is this. This is used to let user space ++ * specify which job to wait for, which is important if jobs ++ * overrun. If we just call sys_sleep_next_period() then we ++ * will unintentionally miss jobs after an overrun. ++ * ++ * Increase this sequence number when a job is released. ++ */ ++ unsigned int job_no; ++ ++ /* when did this job start executing? */ ++ lt_t exec_start; ++}; ++ ++ ++/* make priority inheritance cleaner for PCP */ ++struct pcp_priority { ++ lt_t prio; ++ int in_global_cs; ++ int pid; ++}; ++ ++struct pcp_semaphore; ++ ++/* RT task parameters for scheduling extensions ++ * These parameters are inherited during clone and therefore must ++ * be explicitly set up before the task set is launched. ++ */ ++struct rt_param { ++ /* is the task sleeping? */ ++ unsigned int flags:8; ++ ++ /* Real-time marker: 1 iff it is a LITMUS real-time task. ++ */ ++ unsigned int is_realtime:1; ++ ++ /* is a BE->RT or RT->BE transition pending? */ ++ unsigned int transition_pending:1; ++ ++ /* is this task under control of litmus? ++ * ++ * this is necessary because otherwise signal delivery code ++ * may try to wake up a task that is already queued in plugin ++ * data structures. ++ * ++ * bbb: I believe this flag is fundamentally flawed and should be ++ * taken out in the redesign. ++ */ ++ unsigned int litmus_controlled:1; ++ ++ /* do we need to check for srp blocking? */ ++ unsigned int srp_non_recurse:1; ++ ++ /* if a BE->RT transition failed, then this field contains the error */ ++ unsigned long transition_error; ++ ++ /* user controlled parameters */ ++ struct rt_task task_params; ++ ++ /* timing parameters */ ++ struct rt_job job_params; ++ ++ ++ /* task representing the current "inherited" task ++ * priority, assigned by inherit_priority and ++ * return priority in the scheduler plugins. ++ * could point to self if PI does not result in ++ * an increased task priority. ++ */ ++ struct task_struct* inh_task; ++ ++ /* Don't just dereference this pointer in kernel space! ++ * It might very well point to junk or nothing at all. 
++ * NULL indicates that the task has not requested any non-preemptable ++ * section support. ++ * Not inherited upon fork. ++ */ ++ short* np_flag; ++ ++ /* For the FMLP under PSN-EDF, it is required to make the task ++ * non-preemptive from kernel space. In order not to interfere with ++ * user space, this counter indicates the kernel space np setting. ++ * kernel_np > 0 => task is non-preemptive ++ */ ++ unsigned int kernel_np; ++ ++ /* This field can be used by plugins to store where the task ++ * is currently scheduled. It is the responsibility of the ++ * plugin to avoid race conditions. ++ * ++ * Used by GSN-EDF. ++ */ ++ int scheduled_on; ++ ++ /* This field can be used by plugins to store where the task ++ * is currently linked. It is the responsibility of the plugin ++ * to avoid race conditions. ++ * ++ * Used by GSN-EDF. ++ */ ++ int linked_on; ++ ++ /* Used by RM ++ */ ++ struct pcp_priority pcp_prio; ++ struct pcp_priority* cur_prio; ++ struct list_head owned_semaphores; ++ struct pcp_semaphore* blocked_on; ++ ++ /* Fields saved before BE->RT transition. ++ */ ++ int old_policy; ++ int old_prio; ++}; ++ ++/* Possible RT flags */ ++#define RT_F_RUNNING 0x00000000 ++#define RT_F_SLEEP 0x00000001 ++#define RT_F_EXIT_SEM 0x00000008 ++ ++#endif ++ ++#endif +diff --git a/include/litmus/sched_plugin.h b/include/litmus/sched_plugin.h +new file mode 100644 +index 0000000..337668f +--- /dev/null ++++ b/include/litmus/sched_plugin.h +@@ -0,0 +1,120 @@ ++/* ++ * Definition of the scheduler plugin interface. ++ * ++ */ ++#ifndef _LINUX_SCHED_PLUGIN_H_ ++#define _LINUX_SCHED_PLUGIN_H_ ++ ++#include <linux/sched.h> ++#include <litmus/litmus.h> ++ ++/* struct for semaphore with priority inheritance */ ++struct pi_semaphore { ++ atomic_t count; ++ int sleepers; ++ wait_queue_head_t wait; ++ union { ++ /* highest-prio holder/waiter */ ++ struct task_struct *task; ++ struct task_struct* cpu_task[NR_CPUS]; ++ } hp; ++ /* current lock holder */ ++ struct task_struct *holder; ++}; ++ ++int set_hp_task(struct pi_semaphore *sem, prio_cmp_t cmp); ++int set_hp_cpu_task(struct pi_semaphore *sem, int cpu, prio_cmp_t cmp); ++ ++/********************* scheduler invocation ******************/ ++ ++/* Plugin-specific realtime tick handler */ ++typedef void (*scheduler_tick_t) (void); ++/* Novell make sched decision function */ ++typedef int (*schedule_t) (struct task_struct * prev, ++ struct task_struct ** next); ++/* Clean up after the task switch has occured. ++ * This function is called after every (even non-rt) task switch. ++ */ ++typedef void (*finish_switch_t)(struct task_struct *prev); ++ ++ ++/********************* task state changes ********************/ ++ ++/* called to setup a new real-time task */ ++typedef long (*prepare_task_t) (struct task_struct *task); ++/* called to re-introduce a task after blocking */ ++typedef void (*wake_up_task_t) (struct task_struct *task); ++/* called to notify the plugin of a blocking real-time task ++ * it will only be called for real-time tasks and before schedule is called */ ++typedef void (*task_blocks_t) (struct task_struct *task); ++/* called when a real-time task exits. 
Free any allocated resources */ ++typedef long (*tear_down_t) (struct task_struct *); ++ ++/* Called when the new_owner is released from the wait queue ++ * it should now inherit the priority from sem, _before_ it gets readded ++ * to any queue ++ */ ++typedef long (*inherit_priority_t) (struct pi_semaphore *sem, ++ struct task_struct *new_owner); ++ ++/* Called when the current task releases a semahpore where it might have ++ * inherited a piority from ++ */ ++typedef long (*return_priority_t) (struct pi_semaphore *sem); ++ ++/* Called when a task tries to acquire a semaphore and fails. Check if its ++ * priority is higher than that of the current holder. ++ */ ++typedef long (*pi_block_t) (struct pi_semaphore *sem, struct task_struct *t); ++ ++ ++/********************* sys call backends ********************/ ++/* This function causes the caller to sleep until the next release */ ++typedef long (*sleep_next_period_t) (void); ++ ++struct sched_plugin { ++ struct list_head list; ++ /* basic info */ ++ char *plugin_name; ++ unsigned int srp_active:1; ++ unsigned int pcp_active:1; ++ ++ /* scheduler invocation */ ++ scheduler_tick_t scheduler_tick; ++ schedule_t schedule; ++ finish_switch_t finish_switch; ++ ++ /* syscall backend */ ++ sleep_next_period_t sleep_next_period; ++ ++ /* task state changes */ ++ prepare_task_t prepare_task; ++ wake_up_task_t wake_up_task; ++ task_blocks_t task_blocks; ++ tear_down_t tear_down; ++ ++ /* priority inheritance */ ++ inherit_priority_t inherit_priority; ++ return_priority_t return_priority; ++ pi_block_t pi_block; ++} __attribute__ ((__aligned__(SMP_CACHE_BYTES))); ++ ++ ++extern struct sched_plugin *curr_sched_plugin; ++ ++int register_sched_plugin(struct sched_plugin* plugin); ++struct sched_plugin* find_sched_plugin(const char* name); ++int print_sched_plugins(char* buf, int max); ++ ++static inline int pcp_active(void) ++{ ++ return curr_sched_plugin->pcp_active; ++} ++ ++static inline int srp_active(void) ++{ ++ return curr_sched_plugin->srp_active; ++} ++ ++ ++#endif +diff --git a/include/litmus/sched_trace.h b/include/litmus/sched_trace.h +new file mode 100644 +index 0000000..f9938c2 +--- /dev/null ++++ b/include/litmus/sched_trace.h +@@ -0,0 +1,31 @@ ++/* sched_trace.h -- record scheduler events to a byte stream for offline analysis. ++ */ ++#ifndef _LINUX_SCHED_TRACE_H_ ++#define _LINUX_SCHED_TRACE_H_ ++ ++#include <linux/sched.h> ++ ++/* dummies, need to be re-implemented */ ++ ++/* used in sched.c */ ++#define sched_trace_task_arrival(t) ++#define sched_trace_task_departure(t) ++#define sched_trace_task_preemption(t, by) ++#define sched_trace_task_scheduled(t) ++ ++/* used in scheduler plugins */ ++#define sched_trace_job_release(t) ++#define sched_trace_job_completion(t) ++ ++ ++#ifdef CONFIG_SCHED_DEBUG_TRACE ++void sched_trace_log_message(const char* fmt, ...); ++ ++#else ++ ++#define sched_trace_log_message(fmt, ...) 
++ ++#endif ++ ++ ++#endif +diff --git a/include/litmus/trace.h b/include/litmus/trace.h +new file mode 100644 +index 0000000..5c2c2c0 +--- /dev/null ++++ b/include/litmus/trace.h +@@ -0,0 +1,106 @@ ++ ++#ifndef _SYS_TRACE_H_ ++#define _SYS_TRACE_H_ ++ ++#include <litmus/feather_trace.h> ++#include <litmus/feather_buffer.h> ++ ++ ++/*********************** TIMESTAMPS ************************/ ++ ++struct timestamp { ++ unsigned long event; ++ unsigned long long timestamp; ++ unsigned int seq_no; ++ int cpu; ++}; ++ ++ ++/* buffer holding time stamps - will be provided by driver */ ++extern struct ft_buffer* trace_ts_buf; ++ ++/* save_timestamp: stores current time as struct timestamp ++ * in trace_ts_buf ++ */ ++asmlinkage void save_timestamp(unsigned long event); ++ ++#define TIMESTAMP(id) ft_event0(id, save_timestamp) ++ ++/* Convention for timestamps ++ * ========================= ++ * ++ * In order to process the trace files with a common tool, we use the following ++ * convention to measure execution times: The end time id of a code segment is ++ * always the next number after the start time event id. ++ */ ++ ++#define TS_SCHED_START TIMESTAMP(100) ++#define TS_SCHED_END TIMESTAMP(101) ++#define TS_CXS_START TIMESTAMP(102) ++#define TS_CXS_END TIMESTAMP(103) ++ ++#define TS_TICK_START TIMESTAMP(110) ++#define TS_TICK_END TIMESTAMP(111) ++ ++#define TS_PLUGIN_SCHED_START TIMESTAMP(120) ++#define TS_PLUGIN_SCHED_END TIMESTAMP(121) ++ ++#define TS_PLUGIN_TICK_START TIMESTAMP(130) ++#define TS_PLUGIN_TICK_END TIMESTAMP(131) ++ ++#define TS_ENTER_NP_START TIMESTAMP(140) ++#define TS_ENTER_NP_END TIMESTAMP(141) ++ ++#define TS_EXIT_NP_START TIMESTAMP(150) ++#define TS_EXIT_NP_END TIMESTAMP(151) ++ ++#define TS_SRP_UP_START TIMESTAMP(160) ++#define TS_SRP_UP_END TIMESTAMP(161) ++#define TS_SRP_DOWN_START TIMESTAMP(162) ++#define TS_SRP_DOWN_END TIMESTAMP(163) ++ ++#define TS_PI_UP_START TIMESTAMP(170) ++#define TS_PI_UP_END TIMESTAMP(171) ++#define TS_PI_DOWN_START TIMESTAMP(172) ++#define TS_PI_DOWN_END TIMESTAMP(173) ++ ++#define TS_FIFO_UP_START TIMESTAMP(180) ++#define TS_FIFO_UP_END TIMESTAMP(181) ++#define TS_FIFO_DOWN_START TIMESTAMP(182) ++#define TS_FIFO_DOWN_END TIMESTAMP(183) ++ ++#define PCP1 200 ++#define PCP2 204 ++ ++#define DPCP 210 ++#define MPCP 220 ++#define FMLP 230 ++#define SRPT 240 ++ ++#define TS_PCP_UP_START TIMESTAMP(PCP1) ++#define TS_PCP_UP_END TIMESTAMP(PCP1 + 1) ++#define TS_PCP1_DOWN_START TIMESTAMP(PCP1 + 2) ++#define TS_PCP1_DOWN_END TIMESTAMP(PCP1 + 3) ++#define TS_PCP2_DOWN_START TIMESTAMP(PCP2 + 2) ++#define TS_PCP2_DOWN_END TIMESTAMP(PCP2 + 3) ++ ++ ++#define TS_DPCP_INVOKE_START TIMESTAMP(DPCP) ++#define TS_DPCP_INVOKE_END TIMESTAMP(DPCP + 1) ++#define TS_DPCP_AGENT1_START TIMESTAMP(DPCP + 2) ++#define TS_DPCP_AGENT1_END TIMESTAMP(DPCP + 3) ++#define TS_DPCP_AGENT2_START TIMESTAMP(DPCP + 4) ++#define TS_DPCP_AGENT2_END TIMESTAMP(DPCP + 5) ++ ++ ++#define TS_MPCP_UP_START TIMESTAMP(MPCP) ++#define TS_MPCP_UP_END TIMESTAMP(MPCP + 1) ++#define TS_MPCP_DOWN_START TIMESTAMP(MPCP + 2) ++#define TS_MPCP_DOWN_END TIMESTAMP(MPCP + 3) ++ ++ ++#define TS_SRPT_START TIMESTAMP(SRPT) ++#define TS_SRPT_END TIMESTAMP(SRPT + 1) ++ ++ ++#endif /* !_SYS_TRACE_H_ */ +diff --git a/kernel/exit.c b/kernel/exit.c +index fec12eb..8a0eb79 100644 +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -50,6 +50,8 @@ + + extern void sem_exit (void); + ++extern void exit_od_table(struct task_struct* t); ++ + static void exit_mm(struct task_struct * tsk); + + static void 
__unhash_process(struct task_struct *p) +@@ -916,6 +918,8 @@ fastcall NORET_TYPE void do_exit(long code) + if (unlikely(tsk->audit_context)) + audit_free(tsk); + ++ exit_od_table(tsk); ++ + taskstats_exit(tsk, group_dead); + + exit_mm(tsk); +diff --git a/kernel/fork.c b/kernel/fork.c +index d57118d..6fa6e03 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -57,6 +57,9 @@ + #include <asm/cacheflush.h> + #include <asm/tlbflush.h> + ++#include <litmus/litmus.h> ++#include <litmus/sched_plugin.h> ++ + /* + * Protected counters by write_lock_irq(&tasklist_lock) + */ +@@ -118,6 +121,8 @@ void __put_task_struct(struct task_struct *tsk) + WARN_ON(atomic_read(&tsk->usage)); + WARN_ON(tsk == current); + ++ exit_litmus(tsk); ++ + security_task_free(tsk); + free_uid(tsk->user); + put_group_info(tsk->group_info); +diff --git a/kernel/sched.c b/kernel/sched.c +index cca93cc..fb35f31 100644 +--- a/kernel/sched.c ++++ b/kernel/sched.c +@@ -56,6 +56,12 @@ + + #include <asm/unistd.h> + ++#include <litmus/litmus.h> ++#include <litmus/sched_plugin.h> ++#include <litmus/sched_trace.h> ++#include <litmus/rt_param.h> ++#include <litmus/trace.h> ++ + /* + * Convert user-nice values [ -20 ... 0 ... 19 ] + * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], +@@ -836,7 +842,7 @@ static int effective_prio(struct task_struct *p) + * keep the priority unchanged. Otherwise, update priority + * to the normal priority: + */ +- if (!rt_prio(p->prio)) ++ if (!rt_prio(p->prio) && !is_realtime(p)) + return p->normal_prio; + return p->prio; + } +@@ -844,7 +850,7 @@ static int effective_prio(struct task_struct *p) + /* + * __activate_task - move a task to the runqueue. + */ +-static void __activate_task(struct task_struct *p, struct rq *rq) ++void __activate_task(struct task_struct *p, struct rq *rq) + { + struct prio_array *target = rq->active; + +@@ -999,7 +1005,7 @@ out: + /* + * deactivate_task - remove a task from the runqueue. + */ +-static void deactivate_task(struct task_struct *p, struct rq *rq) ++void deactivate_task(struct task_struct *p, struct rq *rq) + { + dec_nr_running(p, rq); + dequeue_task(p, p->array); +@@ -1408,6 +1414,10 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) + #endif + + rq = task_rq_lock(p, &flags); ++ ++ if (is_realtime(p)) ++ TRACE("try_to_wake_up(%s/%d)\n", p->comm, p->pid); ++ + old_state = p->state; + if (!(old_state & state)) + goto out; +@@ -1415,6 +1425,12 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) + if (p->array) + goto out_running; + ++ sched_trace_task_arrival(p); ++ if (is_realtime(p)) { ++ curr_sched_plugin->wake_up_task(p); ++ goto out_running; ++ } ++ + cpu = task_cpu(p); + this_cpu = smp_processor_id(); + +@@ -1576,6 +1592,8 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags) + { + int cpu = get_cpu(); + ++ litmus_fork(p); ++ + #ifdef CONFIG_SMP + cpu = sched_balance_self(cpu, SD_BALANCE_FORK); + #endif +@@ -1730,6 +1748,9 @@ void fastcall sched_exit(struct task_struct *p) + unsigned long flags; + struct rq *rq; + ++ if (is_realtime(p)) ++ return; ++ + /* + * If the child was a (relative-) CPU hog then decrease + * the sleep_avg of the parent as well. 
+@@ -1765,6 +1786,31 @@ static inline void prepare_task_switch(struct rq *rq, struct task_struct *next) + prepare_arch_switch(next); + } + ++static void litmus_transition(struct task_struct *tsk, struct rq *rq) ++{ ++ int wakeup = 0; ++ WARN_ON(tsk->state != TASK_STOPPED); ++ ++ tsk->rt_param.transition_pending = 0; ++ if (is_realtime(tsk)) { ++ /* RT -> BE transition */ ++ tsk->rt_param.transition_error = transition_to_be(tsk); ++ wakeup = tsk->rt_param.transition_error == 0; ++ } else { ++ /* BE -> RT transition */ ++ tsk->rt_param.transition_error = transition_to_rt(tsk); ++ /* If it was rejected as a real-time task, then ++ * keep it running as a best-effort task. ++ */ ++ wakeup = tsk->rt_param.transition_error != 0; ++ } ++ if (wakeup) { ++ /* we still hold the runqueue lock */ ++ tsk->state = TASK_RUNNING; ++ __activate_task(tsk, rq); ++ } ++} ++ + /** + * finish_task_switch - clean up after a task-switch + * @rq: runqueue associated with task-switch +@@ -1801,6 +1847,15 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) + */ + prev_state = prev->state; + finish_arch_switch(prev); ++ /* Requeue previous real-time task before we drop the rq lock, cause ++ * that may lead to a preemption. ++ */ ++ curr_sched_plugin->finish_switch(prev); ++ sched_trace_task_scheduled(current); ++ if (rt_transition_pending(prev)) ++ litmus_transition(prev, rq); ++ /* trace before IRQs are enabled */ ++ TS_CXS_END; + finish_lock_switch(rq, prev); + if (mm) + mmdrop(mm); +@@ -2095,6 +2150,10 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, + struct sched_domain *sd, enum idle_type idle, + int *all_pinned) + { ++ /* Don't migrate LITMUS^RT tasks. */ ++ if (is_realtime(p)) ++ return 0; ++ + /* + * We do not migrate tasks that are: + * 1) running (obviously), or +@@ -3220,11 +3279,30 @@ void scheduler_tick(void) + + update_cpu_clock(p, rq, now); + ++ /* real-time accounting is done by the plugin ++ * call linux functions only for background tasks ++ */ + if (p == rq->idle) +- /* Task on the idle queue */ +- wake_priority_sleeper(rq); +- else ++ /* Task on the idle queue */ ++ wake_priority_sleeper(rq); ++ else if (is_realtime(p)) { ++ /* time accounting for LITMUS^RT tasks */ ++ p->rt_param.job_params.exec_time += ++ now - p->rt_param.job_params.exec_start; ++ p->rt_param.job_params.exec_start = now; ++ } else ++ /* normal Linux tasks */ + task_running_tick(rq, p); ++ ++ /* check whether the RT scheduler plugin requires a call to ++ * schedule ++ */ ++ TS_PLUGIN_TICK_START; ++ curr_sched_plugin->scheduler_tick(); ++ TS_PLUGIN_TICK_END; ++ ++ send_scheduler_signals(); ++ + #ifdef CONFIG_SMP + update_load(rq); + if (time_after_eq(jiffies, rq->next_balance)) +@@ -3406,6 +3484,7 @@ static inline int interactive_sleep(enum sleep_type sleep_type) + sleep_type == SLEEP_INTERRUPTED); + } + ++ + /* + * schedule() is the main scheduler function. + */ +@@ -3420,6 +3499,7 @@ asmlinkage void __sched schedule(void) + long *switch_count; + struct rq *rq; + ++ + /* + * Test if we are atomic. Since do_exit() needs to call into + * schedule() atomically, we ignore that path for now. +@@ -3427,8 +3507,9 @@ asmlinkage void __sched schedule(void) + */ + if (unlikely(in_atomic() && !current->exit_state)) { + printk(KERN_ERR "BUG: scheduling while atomic: " +- "%s/0x%08x/%d\n", +- current->comm, preempt_count(), current->pid); ++ "%s/0x%08x/%d %s\n", ++ current->comm, preempt_count(), current->pid, ++ is_realtime(current) ? 
"rt" : "non-rt"); + debug_show_held_locks(current); + if (irqs_disabled()) + print_irqtrace_events(current); +@@ -3438,6 +3519,7 @@ asmlinkage void __sched schedule(void) + + need_resched: + preempt_disable(); ++ TS_SCHED_START; + prev = current; + release_kernel_lock(prev); + need_resched_nonpreemptible: +@@ -3470,6 +3552,7 @@ need_resched_nonpreemptible: + spin_lock_irq(&rq->lock); + + switch_count = &prev->nivcsw; ++ /* check for blocking tasks */ + if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { + switch_count = &prev->nvcsw; + if (unlikely((prev->state & TASK_INTERRUPTIBLE) && +@@ -3478,11 +3561,60 @@ need_resched_nonpreemptible: + else { + if (prev->state == TASK_UNINTERRUPTIBLE) + rq->nr_uninterruptible++; ++ ++ if (is_realtime(prev)) { ++ TRACE_TASK(prev, "blocks, state = %d\n", ++ prev->state); ++ curr_sched_plugin->task_blocks(prev); ++ /* Enable this for all tasks to get _a lot_ of ++ * data. Can be helpful for debugging. ++ */ ++ sched_trace_task_departure(prev); ++ } ++ /* only indirect switching is supported in the current ++ * version of LITMUS ++ */ + deactivate_task(prev, rq); + } + } + ++ next = NULL; ++ ++ if (is_realtime(prev)) { ++ /* If we are invoked after scheduler_tick(), then ++ * prev is charged a tiny amount of overhead time. ++ * Since analysis has (or should have) accounted for ++ * overheads, this is ok. ++ */ ++ prev->rt_param.job_params.exec_time += ++ now - prev->rt_param.job_params.exec_start; ++ prev->rt_param.job_params.exec_start = now; ++ } ++ ++ /* consult the real-time plugin */ ++ TS_PLUGIN_SCHED_START; ++ curr_sched_plugin->schedule(prev, &next); ++ TS_PLUGIN_SCHED_END; ++ + cpu = smp_processor_id(); ++ ++ if (prev != next && is_realtime(prev) && is_running(prev)) ++ deactivate_task(prev, rq); ++ if (next && prev != next) { ++ __activate_task(next, rq); ++ set_task_cpu(next, cpu); ++ } ++ ++ /* If the real-time plugin wants to switch to a specific task ++ * it'll be on the rq and have the highest priority. There will ++ * be exaclty one such task, thus the selection of the next task ++ * is unambiguous and the following code can only get ++ * triggered if there are no RT tasks pending (on this CPU). Thus, ++ * we may as well skip it. ++ */ ++ if (next) ++ goto switch_tasks; ++ + if (unlikely(!rq->nr_running)) { + idle_balance(cpu, rq); + if (!rq->nr_running) { +@@ -3546,12 +3678,17 @@ switch_tasks: + prev->timestamp = prev->last_ran = now; + + sched_info_switch(prev, next); ++ TS_SCHED_END; + if (likely(prev != next)) { ++ TS_CXS_START; ++ if (is_running(prev)) ++ sched_trace_task_preemption(prev, next); + next->timestamp = now; + rq->nr_switches++; + rq->curr = next; + ++*switch_count; + ++ next->rt_param.job_params.exec_start = now; + prepare_task_switch(rq, next); + prev = context_switch(rq, prev, next); + barrier(); +@@ -3561,8 +3698,11 @@ switch_tasks: + * frame will be invalid. + */ + finish_task_switch(this_rq(), prev); +- } else ++ } else { + spin_unlock_irq(&rq->lock); ++ } ++ ++ send_scheduler_signals(); + + prev = current; + if (unlikely(reacquire_kernel_lock(prev) < 0)) +@@ -3570,6 +3710,8 @@ switch_tasks: + preempt_enable_no_resched(); + if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) + goto need_resched; ++ if (srp_active()) ++ srp_ceiling_block(); + } + EXPORT_SYMBOL(schedule); + +@@ -3691,6 +3833,7 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, + } + } + ++ + /** + * __wake_up - wake up threads blocked on a waitqueue. 
+ * @q: the waitqueue +@@ -3709,6 +3852,7 @@ void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode, + } + EXPORT_SYMBOL(__wake_up); + ++ + /* + * Same as __wake_up but called with the spinlock in wait_queue_head_t held. + */ +@@ -3717,6 +3861,7 @@ void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode) + __wake_up_common(q, mode, 1, 0, NULL); + } + ++ + /** + * __wake_up_sync - wake up threads blocked on a waitqueue. + * @q: the waitqueue +@@ -3772,6 +3917,18 @@ void fastcall complete_all(struct completion *x) + } + EXPORT_SYMBOL(complete_all); + ++void fastcall complete_n(struct completion *x, int n) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&x->wait.lock, flags); ++ x->done += n; ++ __wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, ++ n, 0, NULL); ++ spin_unlock_irqrestore(&x->wait.lock, flags); ++} ++EXPORT_SYMBOL(complete_n); ++ + void fastcall __sched wait_for_completion(struct completion *x) + { + might_sleep(); +@@ -4175,7 +4332,7 @@ static inline struct task_struct *find_process_by_pid(pid_t pid) + } + + /* Actually do priority change: must hold rq lock. */ +-static void __setscheduler(struct task_struct *p, int policy, int prio) ++void __setscheduler(struct task_struct *p, int policy, int prio) + { + BUG_ON(p->array); + +diff --git a/lib/semaphore-sleepers.c b/lib/semaphore-sleepers.c +index 1281805..3f4d543 100644 +--- a/lib/semaphore-sleepers.c ++++ b/lib/semaphore-sleepers.c +@@ -108,7 +108,7 @@ fastcall int __sched __down_interruptible(struct semaphore * sem) + /* + * With signals pending, this turns into + * the trylock failure case - we won't be +- * sleeping, and we* can't get the lock as ++ * sleeping, and we can't get the lock as + * it has contention. Just correct the count + * and exit. + */ +diff --git a/litmus/Makefile b/litmus/Makefile +new file mode 100644 +index 0000000..db2518d +--- /dev/null ++++ b/litmus/Makefile +@@ -0,0 +1,9 @@ ++# ++# Makefile for LITMUS^RT ++# ++ ++obj-y = sched_plugin.o litmus.o sched_trace.o \ ++ edf_common.o rm_common.o\ ++ sched_gsn_edf.o sched_psn_edf.o litmus_sem.o \ ++ trace.o ft_event.o rt_domain.o fdso.o \ ++ sched_rm.o sync.o jobs.o pcp.o +diff --git a/litmus/edf_common.c b/litmus/edf_common.c +new file mode 100644 +index 0000000..2a52835 +--- /dev/null ++++ b/litmus/edf_common.c +@@ -0,0 +1,95 @@ ++/* ++ * kernel/edf_common.c ++ * ++ * Common functions for EDF based scheduler. ++ */ ++ ++#include <linux/percpu.h> ++#include <linux/sched.h> ++#include <linux/list.h> ++ ++#include <litmus/litmus.h> ++#include <litmus/sched_plugin.h> ++#include <litmus/sched_trace.h> ++ ++ ++#include <litmus/edf_common.h> ++ ++/* edf_higher_prio - returns true if first has a higher EDF priority ++ * than second. Deadline ties are broken by PID. ++ * ++ * first first must not be NULL and a real-time task. ++ * second may be NULL or a non-rt task. ++ */ ++int edf_higher_prio(struct task_struct* first, ++ struct task_struct* second) ++{ ++ struct task_struct *first_task = first; ++ struct task_struct *second_task = second; ++ ++ /* Check for inherited priorities. Change task ++ * used for comparison in such a case. ++ */ ++ if (first && first->rt_param.inh_task) ++ first_task = first->rt_param.inh_task; ++ if (second && second->rt_param.inh_task) ++ second_task = second->rt_param.inh_task; ++ ++ return ++ /* does the second task exist and is it a real-time task? If ++ * not, the first task (which is a RT task) has higher ++ * priority. 
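++ * (Taken together, the expression below orders tasks by: existence
++ * and real-time status, then earlier deadline, then smaller PID,
++ * then the presence of an inherited priority as the final tie-break.)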
++ */ ++ !second_task || !is_realtime(second_task) || ++ ++ /* is the deadline of the first task earlier? ++ * Then it has higher priority. ++ */ ++ earlier_deadline(first_task, second_task) || ++ ++ /* Do we have a deadline tie? ++ * Then break by PID. ++ */ ++ (get_deadline(first_task) == get_deadline(second_task) && ++ (first_task->pid < second_task->pid || ++ ++ /* If the PIDs are the same then the task with the inherited ++ * priority wins. ++ */ ++ (first_task->pid == second_task->pid && ++ !second->rt_param.inh_task))); ++} ++ ++int edf_ready_order(struct list_head* a, struct list_head* b) ++{ ++ return edf_higher_prio( ++ list_entry(a, struct task_struct, rt_list), ++ list_entry(b, struct task_struct, rt_list)); ++} ++ ++void edf_domain_init(rt_domain_t* rt, check_resched_needed_t resched) ++{ ++ rt_domain_init(rt, resched, edf_ready_order); ++} ++ ++/* need_to_preempt - check whether the task t needs to be preempted ++ * call only with irqs disabled and with ready_lock acquired ++ * THIS DOES NOT TAKE NON-PREEMPTIVE SECTIONS INTO ACCOUNT! ++ */ ++int edf_preemption_needed(rt_domain_t* rt, struct task_struct *t) ++{ ++ /* we need the read lock for edf_ready_queue */ ++ /* no need to preempt if there is nothing pending */ ++ if (!ready_jobs_pending(rt)) ++ return 0; ++ /* we need to reschedule if t doesn't exist */ ++ if (!t) ++ return 1; ++ ++ /* NOTE: We cannot check for non-preemptibility since we ++ * don't know what address space we're currently in. ++ */ ++ ++ /* make sure to get non-rt stuff out of the way */ ++ return !is_realtime(t) || edf_higher_prio(next_ready(rt), t); ++} +diff --git a/litmus/fdso.c b/litmus/fdso.c +new file mode 100644 +index 0000000..ded9918 +--- /dev/null ++++ b/litmus/fdso.c +@@ -0,0 +1,289 @@ ++/* fdso.c - file descriptor attached shared objects ++ * ++ * (c) 2007 B. Brandenburg, LITMUS^RT project ++ * ++ * Notes: ++ * - objects descriptor (OD) tables are not cloned during a fork. ++ * - objects are created on-demand, and freed after the last reference ++ * is dropped. ++ * - for now, object types are hard coded. ++ * - As long as we have live objects, we keep a reference to the inode. 
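++ * - Typical usage from user space (sketch): obtain a descriptor with
++ *   sys_od_open(fd, type, id, config), pass the returned OD to the
++ *   protocol calls that expect one (e.g., sys_pi_down()/sys_pi_up(),
++ *   sys_srp_down()/sys_srp_up()), and release it with sys_od_close(od).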
++ */ ++ ++#include <linux/errno.h> ++#include <linux/sched.h> ++#include <linux/mutex.h> ++#include <linux/file.h> ++#include <asm/uaccess.h> ++ ++#include <litmus/fdso.h> ++ ++extern struct fdso_ops pi_sem_ops; ++extern struct fdso_ops srp_sem_ops; ++extern struct fdso_ops pcp_sem_ops; ++extern struct fdso_ops mpcp_sem_ops; ++ ++static const struct fdso_ops* fdso_ops[] = { ++ &pi_sem_ops, ++ &srp_sem_ops, ++ &pcp_sem_ops, ++ &mpcp_sem_ops, ++}; ++ ++static void* fdso_create(obj_type_t type) ++{ ++ return fdso_ops[type]->create(); ++} ++ ++static void fdso_destroy(obj_type_t type, void* obj) ++{ ++ fdso_ops[type]->destroy(obj); ++} ++ ++static int fdso_open(struct od_table_entry* entry, void* __user config) ++{ ++ if (fdso_ops[entry->obj->type]->open) ++ return fdso_ops[entry->obj->type]->open(entry, config); ++ else ++ return 0; ++} ++ ++static int fdso_close(struct od_table_entry* entry) ++{ ++ if (fdso_ops[entry->obj->type]->close) ++ return fdso_ops[entry->obj->type]->close(entry); ++ else ++ return 0; ++} ++ ++/* inode must be locked already */ ++static struct inode_obj_id* alloc_inode_obj(struct inode* inode, ++ obj_type_t type, ++ unsigned int id) ++{ ++ struct inode_obj_id* obj; ++ void* raw_obj; ++ ++ raw_obj = fdso_create(type); ++ if (!raw_obj) ++ return NULL; ++ ++ obj = kmalloc(sizeof(struct inode_obj_id), GFP_KERNEL); ++ if (!obj) ++ return NULL; ++ INIT_LIST_HEAD(&obj->list); ++ atomic_set(&obj->count, 1); ++ obj->type = type; ++ obj->id = id; ++ obj->obj = raw_obj; ++ obj->inode = inode; ++ ++ list_add(&obj->list, &inode->i_obj_list); ++ atomic_inc(&inode->i_count); ++/* ++ printk(KERN_DEBUG "alloc_inode_obj(%p, %d, %d): object created\n", ++ inode, type, id); ++*/ ++ return obj; ++} ++ ++/* inode must be locked already */ ++static struct inode_obj_id* get_inode_obj(struct inode* inode, ++ obj_type_t type, ++ unsigned int id) ++{ ++ struct list_head* pos; ++ struct inode_obj_id* obj = NULL; ++ ++ list_for_each(pos, &inode->i_obj_list) { ++ obj = list_entry(pos, struct inode_obj_id, list); ++ if (obj->id == id && obj->type == type) { ++ atomic_inc(&obj->count); ++ return obj; ++ } ++ } ++/* ++ printk(KERN_DEBUG "get_inode_obj(%p, %d, %d): couldn't find object\n", ++ inode, type, id); ++*/ ++ return NULL; ++} ++ ++ ++static void put_inode_obj(struct inode_obj_id* obj) ++{ ++ struct inode* inode; ++ int let_go = 0; ++ ++ inode = obj->inode; ++ if (atomic_dec_and_test(&obj->count)) { ++ ++ mutex_lock(&inode->i_obj_mutex); ++ /* no new references can be obtained */ ++ if (!atomic_read(&obj->count)) { ++ list_del(&obj->list); ++ fdso_destroy(obj->type, obj->obj); ++ kfree(obj); ++ let_go = 1; ++ } ++ mutex_unlock(&inode->i_obj_mutex); ++ if (let_go) ++ iput(inode); ++ } ++} ++ ++static struct od_table_entry* get_od_entry(struct task_struct* t) ++{ ++ struct od_table_entry* table; ++ int i; ++ ++ ++ table = t->od_table; ++ if (!table) { ++ table = (struct od_table_entry*) ++ kzalloc(sizeof(struct od_table_entry) * ++ MAX_OBJECT_DESCRIPTORS, GFP_KERNEL); ++ t->od_table = table; ++ } ++ ++ for (i = 0; table && i < MAX_OBJECT_DESCRIPTORS; i++) ++ if (!table[i].used) { ++ table[i].used = 1; ++ return table + i; ++ } ++ return NULL; ++} ++ ++static int put_od_entry(struct od_table_entry* od) ++{ ++ put_inode_obj(od->obj); ++ od->used = 0; ++ return 0; ++} ++ ++void exit_od_table(struct task_struct* t) ++{ ++ int i; ++ ++ if (t->od_table) { ++ for (i = 0; i < MAX_OBJECT_DESCRIPTORS; i++) ++ if (t->od_table[i].used) ++ put_od_entry(t->od_table + i); ++ kfree(t->od_table); ++ t->od_table 
= NULL; ++ } ++} ++ ++static int do_sys_od_open(struct file* file, obj_type_t type, int id, ++ void* __user config) ++{ ++ int idx = 0, err; ++ struct inode* inode; ++ struct inode_obj_id* obj = NULL; ++ struct od_table_entry* entry; ++ ++ inode = file->f_dentry->d_inode; ++ ++ entry = get_od_entry(current); ++ if (!entry) ++ return -ENOMEM; ++ ++ mutex_lock(&inode->i_obj_mutex); ++ obj = get_inode_obj(inode, type, id); ++ if (!obj) ++ obj = alloc_inode_obj(inode, type, id); ++ if (!obj) { ++ idx = -ENOMEM; ++ entry->used = 0; ++ } else { ++ entry->obj = obj; ++ entry->extra = NULL; ++ idx = entry - current->od_table; ++ } ++ ++ mutex_unlock(&inode->i_obj_mutex); ++ ++ /* FIXME: What if the allocation failed? */ ++ err = fdso_open(entry, config); ++ if (err < 0) { ++ /* The class rejected the open call. ++ * We need to clean up and tell user space. ++ */ ++ put_od_entry(entry); ++ idx = err; ++ } ++ ++ return idx; ++} ++ ++ ++struct od_table_entry* __od_lookup(int od) ++{ ++ struct task_struct *t = current; ++ ++ if (!t->od_table) ++ return NULL; ++ if (od < 0 || od >= MAX_OBJECT_DESCRIPTORS) ++ return NULL; ++ if (!t->od_table[od].used) ++ return NULL; ++ return t->od_table + od; ++} ++ ++ ++asmlinkage int sys_od_open(int fd, int type, int obj_id, void* __user config) ++{ ++ int ret = 0; ++ struct file* file; ++ ++ /* ++ 1) get file from fd, get inode from file ++ 2) lock inode ++ 3) try to lookup object ++ 4) if not present create and enqueue object, inc inode refcnt ++ 5) increment refcnt of object ++ 6) alloc od_table_entry, setup ptrs ++ 7) unlock inode ++ 8) return offset in od_table as OD ++ */ ++ ++ if (type < MIN_OBJ_TYPE || type > MAX_OBJ_TYPE) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ file = fget(fd); ++ if (!file) { ++ ret = -EBADF; ++ goto out; ++ } ++ ++ ret = do_sys_od_open(file, type, obj_id, config); ++ ++ fput(file); ++ ++out: ++ return ret; ++} ++ ++ ++asmlinkage int sys_od_close(int od) ++{ ++ int ret = -EINVAL; ++ struct task_struct *t = current; ++ ++ if (od < 0 || od >= MAX_OBJECT_DESCRIPTORS) ++ return ret; ++ ++ if (!t->od_table || !t->od_table[od].used) ++ return ret; ++ ++ ++ /* give the class a chance to reject the close ++ */ ++ ret = fdso_close(t->od_table + od); ++ if (ret == 0) ++ ret = put_od_entry(t->od_table + od); ++ ++ return ret; ++} +diff --git a/litmus/ft_event.c b/litmus/ft_event.c +new file mode 100644 +index 0000000..db9f4ea +--- /dev/null ++++ b/litmus/ft_event.c +@@ -0,0 +1,104 @@ ++#include <linux/types.h> ++ ++#include <litmus/feather_trace.h> ++ ++/* the feather trace management functions assume ++ * exclusive access to the event table ++ */ ++ ++ ++#define BYTE_JUMP 0xeb ++#define BYTE_JUMP_LEN 0x02 ++ ++/* for each event, there is an entry in the event table */ ++struct trace_event { ++ long id; ++ long count; ++ long start_addr; ++ long end_addr; ++}; ++ ++extern struct trace_event __start___event_table[]; ++extern struct trace_event __stop___event_table[]; ++ ++int ft_enable_event(unsigned long id) ++{ ++ struct trace_event* te = __start___event_table; ++ int count = 0; ++ char* delta; ++ unsigned char* instr; ++ ++ while (te < __stop___event_table) { ++ if (te->id == id && ++te->count == 1) { ++ instr = (unsigned char*) te->start_addr; ++ /* make sure we don't clobber something wrong */ ++ if (*instr == BYTE_JUMP) { ++ delta = (((unsigned char*) te->start_addr) + 1); ++ *delta = 0; ++ } ++ } ++ if (te->id == id) ++ count++; ++ te++; ++ } ++ return count; ++} ++ ++int ft_disable_event(unsigned long id) ++{ ++ struct trace_event* te = 
__start___event_table; ++ int count = 0; ++ char* delta; ++ unsigned char* instr; ++ ++ while (te < __stop___event_table) { ++ if (te->id == id && --te->count == 0) { ++ instr = (unsigned char*) te->start_addr; ++ if (*instr == BYTE_JUMP) { ++ delta = (((unsigned char*) te->start_addr) + 1); ++ *delta = te->end_addr - te->start_addr - ++ BYTE_JUMP_LEN; ++ } ++ } ++ if (te->id == id) ++ count++; ++ te++; ++ } ++ return count; ++} ++ ++int ft_disable_all_events(void) ++{ ++ struct trace_event* te = __start___event_table; ++ int count = 0; ++ char* delta; ++ unsigned char* instr; ++ ++ while (te < __stop___event_table) { ++ if (te->count) { ++ instr = (unsigned char*) te->start_addr; ++ if (*instr == BYTE_JUMP) { ++ delta = (((unsigned char*) te->start_addr) ++ + 1); ++ *delta = te->end_addr - te->start_addr - ++ BYTE_JUMP_LEN; ++ te->count = 0; ++ count++; ++ } ++ } ++ te++; ++ } ++ return count; ++} ++ ++int ft_is_event_enabled(unsigned long id) ++{ ++ struct trace_event* te = __start___event_table; ++ ++ while (te < __stop___event_table) { ++ if (te->id == id) ++ return te->count; ++ te++; ++ } ++ return 0; ++} +diff --git a/litmus/jobs.c b/litmus/jobs.c +new file mode 100644 +index 0000000..e294bc5 +--- /dev/null ++++ b/litmus/jobs.c +@@ -0,0 +1,43 @@ ++/* litmus/jobs.c - common job control code ++ */ ++ ++#include <linux/sched.h> ++ ++#include <litmus/litmus.h> ++#include <litmus/jobs.h> ++ ++void prepare_for_next_period(struct task_struct *t) ++{ ++ BUG_ON(!t); ++ /* prepare next release */ ++ t->rt_param.job_params.release = t->rt_param.job_params.deadline; ++ t->rt_param.job_params.deadline += get_rt_period(t); ++ t->rt_param.job_params.exec_time = 0; ++ /* update job sequence number */ ++ t->rt_param.job_params.job_no++; ++ ++ /* don't confuse Linux */ ++ t->time_slice = 1; ++} ++ ++void release_at(struct task_struct *t, lt_t start) ++{ ++ t->rt_param.job_params.deadline = start; ++ prepare_for_next_period(t); ++ set_rt_flags(t, RT_F_RUNNING); ++} ++ ++ ++/* ++ * Deactivate current task until the beginning of the next period. ++ */ ++long complete_job(void) ++{ ++ /* Mark that we do not excute anymore */ ++ set_rt_flags(current, RT_F_SLEEP); ++ /* call schedule, this will return when a new job arrives ++ * it also takes care of preparing for the next release ++ */ ++ schedule(); ++ return 0; ++} +diff --git a/litmus/litmus.c b/litmus/litmus.c +new file mode 100644 +index 0000000..77aad7d +--- /dev/null ++++ b/litmus/litmus.c +@@ -0,0 +1,830 @@ ++/* litmus.c -- Implementation of the LITMUS syscalls, the LITMUS intialization code, ++ * and the procfs interface.. ++ */ ++#include <asm/uaccess.h> ++#include <linux/uaccess.h> ++#include <linux/sysrq.h> ++ ++#include <linux/module.h> ++#include <linux/proc_fs.h> ++ ++ ++#include <litmus/litmus.h> ++#include <linux/sched.h> ++#include <litmus/sched_plugin.h> ++ ++#include <litmus/trace.h> ++ ++/* Number of RT tasks that exist in the system */ ++atomic_t rt_task_count = ATOMIC_INIT(0); ++static DEFINE_SPINLOCK(task_transition_lock); ++ ++/* To send signals from the scheduler ++ * Must drop locks first. ++ */ ++static LIST_HEAD(sched_sig_list); ++static DEFINE_SPINLOCK(sched_sig_list_lock); ++ ++/* ++ * sys_set_task_rt_param ++ * @pid: Pid of the task which scheduling parameters must be changed ++ * @param: New real-time extension parameters such as the execution cost and ++ * period ++ * Syscall for manipulating with task rt extension params ++ * Returns EFAULT if param is NULL. ++ * ESRCH if pid is not corrsponding ++ * to a valid task. 
++ * EINVAL if either period or execution cost is <=0 ++ * EPERM if pid is a real-time task ++ * 0 if success ++ * ++ * Only non-real-time tasks may be configured with this system call ++ * to avoid races with the scheduler. In practice, this means that a ++ * task's parameters must be set _before_ calling sys_prepare_rt_task() ++ */ ++asmlinkage long sys_set_rt_task_param(pid_t pid, struct rt_task __user * param) ++{ ++ struct rt_task tp; ++ struct task_struct *target; ++ int retval = -EINVAL; ++ ++ printk("Setting up rt task parameters for process %d.\n", pid); ++ ++ if (pid < 0 || param == 0) { ++ goto out; ++ } ++ if (copy_from_user(&tp, param, sizeof(tp))) { ++ retval = -EFAULT; ++ goto out; ++ } ++ ++ /* Task search and manipulation must be protected */ ++ read_lock_irq(&tasklist_lock); ++ if (!(target = find_task_by_pid(pid))) { ++ retval = -ESRCH; ++ goto out_unlock; ++ } ++ ++ if (is_realtime(target)) { ++ /* The task is already a real-time task. ++ * We cannot not allow parameter changes at this point. ++ */ ++ retval = -EBUSY; ++ goto out_unlock; ++ } ++ ++ if (tp.exec_cost <= 0) ++ goto out_unlock; ++ if (tp.period <= 0) ++ goto out_unlock; ++ if (!cpu_online(tp.cpu)) ++ goto out_unlock; ++ if (tp.period < tp.exec_cost) ++ { ++ printk(KERN_INFO "litmus: real-time task %d rejected " ++ "because wcet > period\n", pid); ++ goto out_unlock; ++ } ++ ++ target->rt_param.task_params = tp; ++ ++ retval = 0; ++ out_unlock: ++ read_unlock_irq(&tasklist_lock); ++ out: ++ return retval; ++} ++ ++/* Getter of task's RT params ++ * returns EINVAL if param or pid is NULL ++ * returns ESRCH if pid does not correspond to a valid task ++ * returns EFAULT if copying of parameters has failed. ++ */ ++asmlinkage long sys_get_rt_task_param(pid_t pid, struct rt_task __user * param) ++{ ++ int retval = -EINVAL; ++ struct task_struct *source; ++ struct rt_task lp; ++ if (param == 0 || pid < 0) ++ goto out; ++ read_lock(&tasklist_lock); ++ if (!(source = find_task_by_pid(pid))) { ++ retval = -ESRCH; ++ goto out_unlock; ++ } ++ lp = source->rt_param.task_params; ++ read_unlock(&tasklist_lock); ++ /* Do copying outside the lock */ ++ retval = ++ copy_to_user(param, &lp, sizeof(lp)) ? -EFAULT : 0; ++ return retval; ++ out_unlock: ++ read_unlock(&tasklist_lock); ++ out: ++ return retval; ++ ++} ++ ++/* sys_task_mode_transition ++ * @target_mode: The desired execution mode after the system call completes. ++ * Either BACKGROUND_TASK or LITMUS_RT_TASK. ++ * Allow a normal task to become a real-time task, vice versa. ++ * Returns EINVAL if illegal transition requested. ++ * 0 if task mode was changed succesfully ++ * other if plugin failed. ++ */ ++asmlinkage long sys_task_mode_transition(int target_mode) ++{ ++ int retval = -EINVAL; ++ struct task_struct *t = current; ++ ++ if (( is_realtime(t) && target_mode == BACKGROUND_TASK) || ++ (!is_realtime(t) && target_mode == LITMUS_RT_TASK)) { ++ TRACE_TASK(t, "attempts mode transition to %s\n", ++ is_realtime(t) ? "best-effort" : "real-time"); ++ preempt_disable(); ++ t->rt_param.transition_pending = 1; ++ t->state = TASK_STOPPED; ++ preempt_enable_no_resched(); ++ ++ schedule(); ++ ++ retval = t->rt_param.transition_error; ++ } ++ return retval; ++} ++ ++/* implemented in kernel/litmus_sem.c */ ++void srp_ceiling_block(void); ++ ++/* ++ * This is the crucial function for periodic task implementation, ++ * It checks if a task is periodic, checks if such kind of sleep ++ * is permitted and calls plugin-specific sleep, which puts the ++ * task into a wait array. 
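++ * A periodic task typically invokes this at the end of each job; the
++ * call returns once the next job has been released. See also
++ * sys_wait_for_job_release() below, which avoids oversleeping after
++ * an overrun.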
++ * returns 0 on successful wakeup ++ * returns EPERM if current conditions do not permit such sleep ++ * returns EINVAL if current task is not able to go to sleep ++ */ ++asmlinkage long sys_sleep_next_period(void) ++{ ++ int retval = -EPERM; ++ if (!is_realtime(current)) { ++ retval = -EINVAL; ++ goto out; ++ } ++ /* Task with negative or zero period cannot sleep */ ++ if (get_rt_period(current) <= 0) { ++ retval = -EINVAL; ++ goto out; ++ } ++ /* The plugin has to put the task into an ++ * appropriate queue and call schedule ++ */ ++ retval = curr_sched_plugin->sleep_next_period(); ++ out: ++ return retval; ++} ++ ++/* This is an "improved" version of sys_sleep_next_period() that ++ * addresses the problem of unintentionally missing a job after ++ * an overrun. ++ * ++ * returns 0 on successful wakeup ++ * returns EPERM if current conditions do not permit such sleep ++ * returns EINVAL if current task is not able to go to sleep ++ */ ++asmlinkage long sys_wait_for_job_release(unsigned int job) ++{ ++ int retval = -EPERM; ++ if (!is_realtime(current)) { ++ retval = -EINVAL; ++ goto out; ++ } ++ ++ /* Task with negative or zero period cannot sleep */ ++ if (get_rt_period(current) <= 0) { ++ retval = -EINVAL; ++ goto out; ++ } ++ ++ retval = 0; ++ ++ /* first wait until we have "reached" the desired job ++ * ++ * This implementation has at least two problems: ++ * ++ * 1) It doesn't gracefully handle the wrap around of ++ * job_no. Since LITMUS is a prototype, this is not much ++ * of a problem right now. ++ * ++ * 2) It is theoretically racy if a job release occurs ++ * between checking job_no and calling sleep_next_period(). ++ * A proper solution would requiring adding another callback ++ * in the plugin structure and testing the condition with ++ * interrupts disabled. ++ * ++ * FIXME: At least problem 2 should be taken care of eventually. ++ */ ++ while (!retval && job > current->rt_param.job_params.job_no) ++ /* If the last job overran then job <= job_no and we ++ * don't send the task to sleep. ++ */ ++ retval = curr_sched_plugin->sleep_next_period(); ++ out: ++ return retval; ++} ++ ++/* This is a helper syscall to query the current job sequence number. ++ * ++ * returns 0 on successful query ++ * returns EPERM if task is not a real-time task. ++ * returns EFAULT if &job is not a valid pointer. 
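++ * For example (sketch), a task can query its current job number and
++ * then call sys_wait_for_job_release(job_no + 1) to synchronize with
++ * the next release without oversleeping after an overrun.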
++ */ ++asmlinkage long sys_query_job_no(unsigned int __user *job) ++{ ++ int retval = -EPERM; ++ if (is_realtime(current)) ++ retval = put_user(current->rt_param.job_params.job_no, job); ++ ++ return retval; ++} ++ ++struct sched_sig { ++ struct list_head list; ++ struct task_struct* task; ++ unsigned int signal:31; ++ int force:1; ++}; ++ ++static void __scheduler_signal(struct task_struct *t, unsigned int signo, ++ int force) ++{ ++ struct sched_sig* sig; ++ ++ sig = kmalloc(GFP_ATOMIC, sizeof(struct sched_sig)); ++ if (!sig) { ++ TRACE_TASK(t, "dropping signal: %u\n", t); ++ return; ++ } ++ ++ spin_lock(&sched_sig_list_lock); ++ ++ sig->signal = signo; ++ sig->force = force; ++ sig->task = t; ++ get_task_struct(t); ++ list_add(&sig->list, &sched_sig_list); ++ ++ spin_unlock(&sched_sig_list_lock); ++} ++ ++void scheduler_signal(struct task_struct *t, unsigned int signo) ++{ ++ __scheduler_signal(t, signo, 0); ++} ++ ++void force_scheduler_signal(struct task_struct *t, unsigned int signo) ++{ ++ __scheduler_signal(t, signo, 1); ++} ++ ++/* FIXME: get rid of the locking and do this on a per-processor basis */ ++void send_scheduler_signals(void) ++{ ++ unsigned long flags; ++ struct list_head *p, *extra; ++ struct siginfo info; ++ struct sched_sig* sig; ++ struct task_struct* t; ++ struct list_head claimed; ++ ++ if (spin_trylock_irqsave(&sched_sig_list_lock, flags)) { ++ if (list_empty(&sched_sig_list)) ++ p = NULL; ++ else { ++ p = sched_sig_list.next; ++ list_del(&sched_sig_list); ++ INIT_LIST_HEAD(&sched_sig_list); ++ } ++ spin_unlock_irqrestore(&sched_sig_list_lock, flags); ++ ++ /* abort if there are no signals */ ++ if (!p) ++ return; ++ ++ /* take signal list we just obtained */ ++ list_add(&claimed, p); ++ ++ list_for_each_safe(p, extra, &claimed) { ++ list_del(p); ++ sig = list_entry(p, struct sched_sig, list); ++ t = sig->task; ++ info.si_signo = sig->signal; ++ info.si_errno = 0; ++ info.si_code = SI_KERNEL; ++ info.si_pid = 1; ++ info.si_uid = 0; ++ TRACE("sending signal %d to %d\n", info.si_signo, ++ t->pid); ++ if (sig->force) ++ force_sig_info(sig->signal, &info, t); ++ else ++ send_sig_info(sig->signal, &info, t); ++ put_task_struct(t); ++ kfree(sig); ++ } ++ } ++ ++} ++ ++static inline void np_mem_error(struct task_struct* t, const char* reason) ++{ ++ if (t->state != TASK_DEAD && !(t->flags & PF_EXITING)) { ++ TRACE("np section: %s => %s/%d killed\n", ++ reason, t->comm, t->pid); ++ force_scheduler_signal(t, SIGKILL); ++ } ++} ++ ++/* sys_register_np_flag() allows real-time tasks to register an ++ * np section indicator. ++ * returns 0 if the flag was successfully registered ++ * returns EINVAL if current task is not a real-time task ++ * returns EFAULT if *flag couldn't be written ++ */ ++asmlinkage long sys_register_np_flag(short __user *flag) ++{ ++ int retval = -EINVAL; ++ short test_val = RT_PREEMPTIVE; ++ ++ /* avoid races with the scheduler */ ++ preempt_disable(); ++ TRACE("reg_np_flag(%p) for %s/%d\n", flag, ++ current->comm, current->pid); ++ ++ /* Let's first try to write to the address. ++ * That way it is initialized and any bugs ++ * involving dangling pointers will caught ++ * early. ++ * NULL indicates disabling np section support ++ * and should not be tested. 
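++ * The flag is a pair of shorts shared with user space: the task
++ * stores RT_NON_PREEMPTIVE or RT_PREEMPTIVE in the first short (read
++ * by is_np() below), while the kernel writes RT_EXIT_NP_REQUESTED to
++ * the adjacent short when it wants the task to call sys_exit_np()
++ * (see request_exit_np()).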
++ */ ++ if (flag) ++ retval = poke_kernel_address(test_val, flag); ++ else ++ retval = 0; ++ TRACE("reg_np_flag: retval=%d\n", retval); ++ if (unlikely(0 != retval)) ++ np_mem_error(current, "np flag: not writable"); ++ else ++ /* the pointer is ok */ ++ current->rt_param.np_flag = flag; ++ ++ preempt_enable(); ++ return retval; ++} ++ ++ ++void request_exit_np(struct task_struct *t) ++{ ++ int ret; ++ short flag; ++ ++ /* We can only do this if t is actually currently scheduled on this CPU ++ * because otherwise we are in the wrong address space. Thus make sure ++ * to check. ++ */ ++ BUG_ON(t != current); ++ ++ if (unlikely(!is_realtime(t) || !t->rt_param.np_flag)) { ++ TRACE_TASK(t, "request_exit_np(): BAD TASK!\n"); ++ return; ++ } ++ ++ flag = RT_EXIT_NP_REQUESTED; ++ ret = poke_kernel_address(flag, t->rt_param.np_flag + 1); ++ TRACE("request_exit_np(%s/%d)\n", t->comm, t->pid); ++ if (unlikely(0 != ret)) ++ np_mem_error(current, "request_exit_np(): flag not writable"); ++ ++} ++ ++ ++int is_np(struct task_struct* t) ++{ ++ int ret; ++ unsigned short flag = 0x5858; /* = XX, looks nicer in debug*/ ++ ++ BUG_ON(t != current); ++ ++ if (unlikely(t->rt_param.kernel_np)) ++ return 1; ++ else if (unlikely(t->rt_param.np_flag == NULL) || ++ t->flags & PF_EXITING || ++ t->state == TASK_DEAD) ++ return 0; ++ else { ++ /* This is the tricky part. The process has registered a ++ * non-preemptive section marker. We now need to check whether ++ * it is set to to NON_PREEMPTIVE. Along the way we could ++ * discover that the pointer points to an unmapped region (=> ++ * kill the task) or that the location contains some garbage ++ * value (=> also kill the task). Killing the task in any case ++ * forces userspace to play nicely. Any bugs will be discovered ++ * immediately. ++ */ ++ ret = probe_kernel_address(t->rt_param.np_flag, flag); ++ if (0 == ret && (flag == RT_NON_PREEMPTIVE || ++ flag == RT_PREEMPTIVE)) ++ return flag != RT_PREEMPTIVE; ++ else { ++ /* either we could not read from the address or ++ * it contained garbage => kill the process ++ * FIXME: Should we cause a SEGFAULT instead? ++ */ ++ TRACE("is_np: ret=%d flag=%c%c (%x)\n", ret, ++ flag & 0xff, (flag >> 8) & 0xff, flag); ++ np_mem_error(t, "is_np() could not read"); ++ return 0; ++ } ++ } ++} ++ ++/* ++ * sys_exit_np() allows real-time tasks to signal that it left a ++ * non-preemptable section. It will be called after the kernel requested a ++ * callback in the preemption indicator flag. ++ * returns 0 if the signal was valid and processed. ++ * returns EINVAL if current task is not a real-time task ++ */ ++asmlinkage long sys_exit_np(void) ++{ ++ int retval = -EINVAL; ++ ++ TS_EXIT_NP_START; ++ ++ if (!is_realtime(current)) ++ goto out; ++ ++ TRACE("sys_exit_np(%s/%d)\n", current->comm, current->pid); ++ /* force rescheduling so that we can be preempted */ ++ set_tsk_need_resched(current); ++ retval = 0; ++ out: ++ ++ TS_EXIT_NP_END; ++ return retval; ++} ++ ++void __setscheduler(struct task_struct *, int, int); ++ ++/* p is a real-time task. Re-init its state as a best-effort task. */ ++static void reinit_litmus_state(struct task_struct* p, int restore) ++{ ++ struct rt_task user_config = {}; ++ __user short *np_flag = NULL; ++ ++ if (restore) { ++ /* Safe user-space provided configuration data. ++ * FIXME: This is missing service levels for adaptive tasks. 
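++ * (With restore set, task_params and np_flag survive the reset, as
++ * needed by the RT -> BE transition in transition_to_be();
++ * litmus_fork() passes restore = 0 to discard all inherited state.)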
++ */ ++ user_config = p->rt_param.task_params; ++ np_flag = p->rt_param.np_flag; ++ } ++ ++ /* We probably should not be inheriting any task's priority ++ * at this point in time. ++ */ ++ WARN_ON(p->rt_param.inh_task); ++ ++ /* We need to restore the priority of the task. */ ++ __setscheduler(p, p->rt_param.old_policy, p->rt_param.old_prio); ++ ++ /* Cleanup everything else. */ ++ memset(&p->rt_param, 0, sizeof(struct rt_task)); ++ ++ /* Restore preserved fields. */ ++ if (restore) { ++ p->rt_param.task_params = user_config; ++ p->rt_param.np_flag = np_flag; ++ } ++} ++ ++long transition_to_rt(struct task_struct* tsk) ++{ ++ long retval; ++ long flags; ++ ++ BUG_ON(is_realtime(tsk)); ++ ++ if (get_rt_period(tsk) == 0 || ++ get_exec_cost(tsk) > get_rt_period(tsk)) { ++ TRACE_TASK(tsk, "litmus prepare: invalid task parameters " ++ "(%lu, %lu)\n", ++ get_exec_cost(tsk), get_rt_period(tsk)); ++ return -EINVAL; ++ } ++ ++ if (!cpu_online(get_partition(tsk))) ++ { ++ TRACE_TASK(tsk, "litmus prepare: cpu %d is not online\n", ++ get_partition(tsk)); ++ return -EINVAL; ++ } ++ ++ tsk->rt_param.old_prio = tsk->rt_priority; ++ tsk->rt_param.old_policy = tsk->policy; ++ INIT_LIST_HEAD(&tsk->rt_list); ++ ++ /* avoid scheduler plugin changing underneath us */ ++ spin_lock_irqsave(&task_transition_lock, flags); ++ retval = curr_sched_plugin->prepare_task(tsk); ++ ++ if (!retval) { ++ atomic_inc(&rt_task_count); ++ __setscheduler(tsk, SCHED_FIFO, MAX_RT_PRIO - 1); ++ tsk->rt_param.is_realtime = 1; ++ tsk->rt_param.litmus_controlled = 1; ++ } ++ spin_unlock_irqrestore(&task_transition_lock, flags); ++ ++ return retval; ++} ++ ++long transition_to_be(struct task_struct* tsk) ++{ ++ BUG_ON(!is_realtime(tsk)); ++ ++ curr_sched_plugin->tear_down(tsk); ++ atomic_dec(&rt_task_count); ++ reinit_litmus_state(tsk, 1); ++ return 0; ++} ++ ++ ++/* Switching a plugin in use is tricky. ++ * We must watch out that no real-time tasks exists ++ * (and that none is created in parallel) and that the plugin is not ++ * currently in use on any processor (in theory). ++ * ++ * For now, we don't enforce the second part since it is unlikely to cause ++ * any trouble by itself as long as we don't unload modules. ++ */ ++int switch_sched_plugin(struct sched_plugin* plugin) ++{ ++ long flags; ++ int ret = 0; ++ ++ BUG_ON(!plugin); ++ ++ /* stop task transitions */ ++ spin_lock_irqsave(&task_transition_lock, flags); ++ ++ /* don't switch if there are active real-time tasks */ ++ if (atomic_read(&rt_task_count) == 0) { ++ printk(KERN_INFO "Switching to LITMUS^RT plugin %s.\n", plugin->plugin_name); ++ curr_sched_plugin = plugin; ++ } else ++ ret = -EBUSY; ++ ++ spin_unlock_irqrestore(&task_transition_lock, flags); ++ return ret; ++} ++ ++/* Called upon fork. ++ * p is the newly forked task. ++ */ ++void litmus_fork(struct task_struct* p) ++{ ++ if (is_realtime(p)) ++ /* clean out any litmus related state, don't preserve anything*/ ++ reinit_litmus_state(p, 0); ++} ++ ++/* Called upon execve(). ++ * current is doing the exec. ++ * Don't let address space specific stuff leak. 
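++ * In particular, np_flag points into the old user address space and
++ * would dangle after execve(), so it is cleared below.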
++ */ ++void litmus_exec(void) ++{ ++ struct task_struct* p = current; ++ ++ if (is_realtime(p)) { ++ WARN_ON(p->rt_param.inh_task); ++ p->rt_param.np_flag = NULL; ++ } ++} ++ ++void exit_litmus(struct task_struct *dead_tsk) ++{ ++ if (is_realtime(dead_tsk)) ++ transition_to_be(dead_tsk); ++} ++ ++ ++void list_qsort(struct list_head* list, list_cmp_t less_than) ++{ ++ struct list_head lt; ++ struct list_head geq; ++ struct list_head *pos, *extra, *pivot; ++ int n_lt = 0, n_geq = 0; ++ BUG_ON(!list); ++ ++ if (list->next == list) ++ return; ++ ++ INIT_LIST_HEAD(<); ++ INIT_LIST_HEAD(&geq); ++ ++ pivot = list->next; ++ list_del(pivot); ++ list_for_each_safe(pos, extra, list) { ++ list_del(pos); ++ if (less_than(pos, pivot)) { ++ list_add(pos, <); ++ n_lt++; ++ } else { ++ list_add(pos, &geq); ++ n_geq++; ++ } ++ } ++ if (n_lt < n_geq) { ++ list_qsort(<, less_than); ++ list_qsort(&geq, less_than); ++ } else { ++ list_qsort(&geq, less_than); ++ list_qsort(<, less_than); ++ } ++ list_splice(&geq, list); ++ list_add(pivot, list); ++ list_splice(<, list); ++} ++ ++#ifdef CONFIG_MAGIC_SYSRQ ++int sys_kill(int pid, int sig); ++ ++static void sysrq_handle_kill_rt_tasks(int key, struct tty_struct *tty) ++{ ++ struct task_struct *t; ++ read_lock(&tasklist_lock); ++ for_each_process(t) { ++ if (is_realtime(t)) { ++ sys_kill(t->pid, SIGKILL); ++ } ++ } ++ read_unlock(&tasklist_lock); ++} ++ ++static struct sysrq_key_op sysrq_kill_rt_tasks_op = { ++ .handler = sysrq_handle_kill_rt_tasks, ++ .help_msg = "Quit-rt-tasks", ++ .action_msg = "sent SIGKILL to all real-time tasks", ++}; ++#endif ++ ++static int proc_read_stats(char *page, char **start, ++ off_t off, int count, ++ int *eof, void *data) ++{ ++ int len; ++ ++ len = snprintf(page, PAGE_SIZE, ++ "real-time task count = %d\n", ++ atomic_read(&rt_task_count)); ++ return len; ++} ++ ++static int proc_read_plugins(char *page, char **start, ++ off_t off, int count, ++ int *eof, void *data) ++{ ++ int len; ++ ++ len = print_sched_plugins(page, PAGE_SIZE); ++ return len; ++} ++ ++static int proc_read_curr(char *page, char **start, ++ off_t off, int count, ++ int *eof, void *data) ++{ ++ int len; ++ ++ len = snprintf(page, PAGE_SIZE, "%s\n", curr_sched_plugin->plugin_name); ++ return len; ++} ++ ++static int proc_write_curr(struct file *file, ++ const char *buffer, ++ unsigned long count, ++ void *data) ++{ ++ int len, ret; ++ char name[65]; ++ struct sched_plugin* found; ++ ++ if(count > 64) ++ len = 64; ++ else ++ len = count; ++ ++ if(copy_from_user(name, buffer, len)) ++ return -EFAULT; ++ ++ name[len] = '\0'; ++ /* chomp name */ ++ if (len > 1 && name[len - 1] == '\n') ++ name[len - 1] = '\0'; ++ ++ found = find_sched_plugin(name); ++ ++ if (found) { ++ ret = switch_sched_plugin(found); ++ if (ret != 0) ++ printk(KERN_INFO "Could not switch plugin: %d\n", ret); ++ } else ++ printk(KERN_INFO "Plugin '%s' is unknown.\n", name); ++ ++ return len; ++} ++ ++ ++static struct proc_dir_entry *litmus_dir = NULL, ++ *curr_file = NULL, ++ *stat_file = NULL, ++ *plugs_file = NULL; ++ ++static int __init init_litmus_proc(void) ++{ ++ litmus_dir = proc_mkdir("litmus", NULL); ++ if (!litmus_dir) { ++ printk(KERN_ERR "Could not allocate LITMUS^RT procfs entry.\n"); ++ return -ENOMEM; ++ } ++ litmus_dir->owner = THIS_MODULE; ++ ++ curr_file = create_proc_entry("active_plugin", ++ 0644, litmus_dir); ++ if (!curr_file) { ++ printk(KERN_ERR "Could not allocate active_plugin " ++ "procfs entry.\n"); ++ return -ENOMEM; ++ } ++ curr_file->owner = THIS_MODULE; ++ 
curr_file->read_proc = proc_read_curr; ++ curr_file->write_proc = proc_write_curr; ++ ++ stat_file = create_proc_read_entry("stats", 0444, litmus_dir, ++ proc_read_stats, NULL); ++ ++ plugs_file = create_proc_read_entry("plugins", 0444, litmus_dir, ++ proc_read_plugins, NULL); ++ ++ return 0; ++} ++ ++static void exit_litmus_proc(void) ++{ ++ if (plugs_file) ++ remove_proc_entry("plugins", litmus_dir); ++ if (stat_file) ++ remove_proc_entry("stats", litmus_dir); ++ if (curr_file) ++ remove_proc_entry("active_plugin", litmus_dir); ++ if (litmus_dir) ++ remove_proc_entry("litmus", NULL); ++} ++ ++extern struct sched_plugin linux_sched_plugin; ++ ++static int __init _init_litmus(void) ++{ ++ /* Common initializers, ++ * mode change lock is used to enforce single mode change ++ * operation. ++ */ ++ printk("Starting LITMUS^RT kernel\n"); ++ ++ register_sched_plugin(&linux_sched_plugin); ++ ++#ifdef CONFIG_MAGIC_SYSRQ ++ /* offer some debugging help */ ++ if (!register_sysrq_key('q', &sysrq_kill_rt_tasks_op)) ++ printk("Registered kill rt tasks magic sysrq.\n"); ++ else ++ printk("Could not register kill rt tasks magic sysrq.\n"); ++#endif ++ ++ init_litmus_proc(); ++ ++ return 0; ++} ++ ++static void _exit_litmus(void) ++{ ++ exit_litmus_proc(); ++} ++ ++module_init(_init_litmus); ++module_exit(_exit_litmus); +diff --git a/litmus/litmus_sem.c b/litmus/litmus_sem.c +new file mode 100644 +index 0000000..7179b43 +--- /dev/null ++++ b/litmus/litmus_sem.c +@@ -0,0 +1,551 @@ ++/* ++ * PI semaphores and SRP implementations. ++ * Much of the code here is borrowed from include/asm-i386/semaphore.h. ++ * ++ * NOTE: This implementation is very much a prototype and horribly insecure. It ++ * is intended to be a proof of concept, not a feature-complete solution. ++ */ ++ ++#include <asm/atomic.h> ++#include <asm/semaphore.h> ++#include <linux/sched.h> ++#include <linux/wait.h> ++#include <linux/spinlock.h> ++#include <litmus/litmus.h> ++#include <litmus/sched_plugin.h> ++#include <litmus/edf_common.h> ++ ++#include <litmus/fdso.h> ++ ++#include <litmus/trace.h> ++ ++/* ************************************************************************** */ ++/* PRIORITY INHERITANCE */ ++/* ************************************************************************** */ ++ ++static void* create_pi_semaphore(void) ++{ ++ struct pi_semaphore* sem; ++ int i; ++ ++ sem = kmalloc(sizeof(struct pi_semaphore), GFP_KERNEL); ++ if (!sem) ++ return NULL; ++ atomic_set(&sem->count, 1); ++ sem->sleepers = 0; ++ init_waitqueue_head(&sem->wait); ++ sem->hp.task = NULL; ++ sem->holder = NULL; ++ for (i = 0; i < NR_CPUS; i++) ++ sem->hp.cpu_task[i] = NULL; ++ return sem; ++} ++ ++static void destroy_pi_semaphore(void* sem) ++{ ++ /* XXX assert invariants */ ++ kfree(sem); ++} ++ ++struct fdso_ops pi_sem_ops = { ++ .create = create_pi_semaphore, ++ .destroy = destroy_pi_semaphore ++}; ++ ++struct wq_pair { ++ struct task_struct* tsk; ++ struct pi_semaphore* sem; ++}; ++ ++static int rt_pi_wake_up(wait_queue_t *wait, unsigned mode, int sync, ++ void *key) ++{ ++ struct wq_pair* wqp = (struct wq_pair*) wait->private; ++ set_rt_flags(wqp->tsk, RT_F_EXIT_SEM); ++ curr_sched_plugin->inherit_priority(wqp->sem, wqp->tsk); ++ TRACE_TASK(wqp->tsk, ++ "woken up by rt_pi_wake_up() (RT_F_SEM_EXIT, PI)\n"); ++ /* point to task for default_wake_function() */ ++ wait->private = wqp->tsk; ++ default_wake_function(wait, mode, sync, key); ++ ++ /* Always return true since we know that if we encountered a task ++ * that was already running the wake_up 
raced with the schedule in ++ * rt_pi_down(). In that case the task in rt_pi_down() will be scheduled ++ * immediately and own the lock. We must not wake up another task in ++ * any case. ++ */ ++ return 1; ++} ++ ++/* caller is responsible for locking */ ++int set_hp_task(struct pi_semaphore *sem, prio_cmp_t higher_prio) ++{ ++ struct list_head *tmp, *next; ++ struct task_struct *queued; ++ int ret = 0; ++ ++ sem->hp.task = NULL; ++ list_for_each_safe(tmp, next, &sem->wait.task_list) { ++ queued = ((struct wq_pair*) ++ list_entry(tmp, wait_queue_t, ++ task_list)->private)->tsk; ++ ++ /* Compare task prios, find high prio task. */ ++ if (higher_prio(queued, sem->hp.task)) { ++ sem->hp.task = queued; ++ ret = 1; ++ } ++ } ++ return ret; ++} ++ ++/* caller is responsible for locking */ ++int set_hp_cpu_task(struct pi_semaphore *sem, int cpu, prio_cmp_t higher_prio) ++{ ++ struct list_head *tmp, *next; ++ struct task_struct *queued; ++ int ret = 0; ++ ++ sem->hp.cpu_task[cpu] = NULL; ++ list_for_each_safe(tmp, next, &sem->wait.task_list) { ++ queued = ((struct wq_pair*) ++ list_entry(tmp, wait_queue_t, ++ task_list)->private)->tsk; ++ ++ /* Compare task prios, find high prio task. */ ++ if (get_partition(queued) == cpu && ++ higher_prio(queued, sem->hp.cpu_task[cpu])) { ++ sem->hp.cpu_task[cpu] = queued; ++ ret = 1; ++ } ++ } ++ return ret; ++} ++ ++int do_pi_down(struct pi_semaphore* sem) ++{ ++ unsigned long flags; ++ struct task_struct *tsk = current; ++ struct wq_pair pair; ++ int suspended = 1; ++ wait_queue_t wait = { ++ .private = &pair, ++ .func = rt_pi_wake_up, ++ .task_list = {NULL, NULL} ++ }; ++ ++ pair.tsk = tsk; ++ pair.sem = sem; ++ spin_lock_irqsave(&sem->wait.lock, flags); ++ ++ if (atomic_dec_return(&sem->count) < 0 || ++ waitqueue_active(&sem->wait)) { ++ /* we need to suspend */ ++ tsk->state = TASK_UNINTERRUPTIBLE; ++ add_wait_queue_exclusive_locked(&sem->wait, &wait); ++ ++ TRACE_CUR("suspends on PI lock %p\n", sem); ++ curr_sched_plugin->pi_block(sem, tsk); ++ ++ /* release lock before sleeping */ ++ spin_unlock_irqrestore(&sem->wait.lock, flags); ++ ++ TS_PI_DOWN_END; ++ preempt_enable_no_resched(); ++ ++ ++ /* we depend on the FIFO order ++ * Thus, we don't need to recheck when we wake up, we ++ * are guaranteed to have the lock since there is only one ++ * wake up per release ++ */ ++ schedule(); ++ ++ TRACE_CUR("woke up, now owns PI lock %p\n", sem); ++ ++ /* try_to_wake_up() set our state to TASK_RUNNING, ++ * all we need to do is to remove our wait queue entry ++ */ ++ remove_wait_queue(&sem->wait, &wait); ++ } else { ++ /* no priority inheritance necessary, since there are no queued ++ * tasks. 
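++ * The caller immediately becomes both the holder and the
++ * semaphore's highest-priority task (sem->hp.task).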
++ */ ++ suspended = 0; ++ TRACE_CUR("acquired PI lock %p, no contention\n", sem); ++ sem->holder = tsk; ++ sem->hp.task = tsk; ++ curr_sched_plugin->inherit_priority(sem, tsk); ++ spin_unlock_irqrestore(&sem->wait.lock, flags); ++ } ++ return suspended; ++} ++ ++void do_pi_up(struct pi_semaphore* sem) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&sem->wait.lock, flags); ++ ++ TRACE_CUR("releases PI lock %p\n", sem); ++ curr_sched_plugin->return_priority(sem); ++ sem->holder = NULL; ++ if (atomic_inc_return(&sem->count) < 1) ++ /* there is a task queued */ ++ wake_up_locked(&sem->wait); ++ ++ spin_unlock_irqrestore(&sem->wait.lock, flags); ++} ++ ++asmlinkage long sys_pi_down(int sem_od) ++{ ++ long ret = 0; ++ struct pi_semaphore * sem; ++ int suspended = 0; ++ ++ preempt_disable(); ++ TS_PI_DOWN_START; ++ ++ sem = lookup_pi_sem(sem_od); ++ if (sem) ++ suspended = do_pi_down(sem); ++ else ++ ret = -EINVAL; ++ ++ if (!suspended) { ++ TS_PI_DOWN_END; ++ preempt_enable(); ++ } ++ ++ return ret; ++} ++ ++asmlinkage long sys_pi_up(int sem_od) ++{ ++ long ret = 0; ++ struct pi_semaphore * sem; ++ ++ preempt_disable(); ++ TS_PI_UP_START; ++ ++ sem = lookup_pi_sem(sem_od); ++ if (sem) ++ do_pi_up(sem); ++ else ++ ret = -EINVAL; ++ ++ ++ TS_PI_UP_END; ++ preempt_enable(); ++ ++ return ret; ++} ++ ++ ++/* ************************************************************************** */ ++/* STACK RESOURCE POLICY */ ++/* ************************************************************************** */ ++ ++ ++struct srp_priority { ++ struct list_head list; ++ unsigned int period; ++ pid_t pid; ++}; ++ ++#define list2prio(l) list_entry(l, struct srp_priority, list) ++ ++/* SRP task priority comparison function. Smaller periods have highest ++ * priority, tie-break is PID. Special case: period == 0 <=> no priority ++ */ ++static int srp_higher_prio(struct srp_priority* first, ++ struct srp_priority* second) ++{ ++ if (!first->period) ++ return 0; ++ else ++ return !second->period || ++ first->period < second->period || ( ++ first->period == second->period && ++ first->pid < second->pid); ++} ++ ++struct srp { ++ struct list_head ceiling; ++ wait_queue_head_t ceiling_blocked; ++}; ++ ++ ++atomic_t srp_objects_in_use = ATOMIC_INIT(0); ++ ++DEFINE_PER_CPU(struct srp, srp); ++ ++ ++/* Initialize SRP semaphores at boot time. 
*/ ++static int __init srp_init(void) ++{ ++ int i; ++ ++ printk("Initializing SRP per-CPU ceilings..."); ++ for (i = 0; i < NR_CPUS; i++) { ++ init_waitqueue_head(&per_cpu(srp, i).ceiling_blocked); ++ INIT_LIST_HEAD(&per_cpu(srp, i).ceiling); ++ } ++ printk(" done!\n"); ++ ++ return 0; ++} ++module_init(srp_init); ++ ++ ++#define system_ceiling(srp) list2prio(srp->ceiling.next) ++ ++ ++#define UNDEF_SEM -2 ++ ++ ++/* struct for uniprocessor SRP "semaphore" */ ++struct srp_semaphore { ++ struct srp_priority ceiling; ++ struct task_struct* owner; ++ int cpu; /* cpu associated with this "semaphore" and resource */ ++}; ++ ++#define ceiling2sem(c) container_of(c, struct srp_semaphore, ceiling) ++ ++static int srp_exceeds_ceiling(struct task_struct* first, ++ struct srp* srp) ++{ ++ return list_empty(&srp->ceiling) || ++ get_rt_period(first) < system_ceiling(srp)->period || ++ (get_rt_period(first) == system_ceiling(srp)->period && ++ first->pid < system_ceiling(srp)->pid) || ++ ceiling2sem(system_ceiling(srp))->owner == first; ++} ++ ++static void srp_add_prio(struct srp* srp, struct srp_priority* prio) ++{ ++ struct list_head *pos; ++ if (in_list(&prio->list)) { ++ printk(KERN_CRIT "WARNING: SRP violation detected, prio is already in " ++ "ceiling list! cpu=%d, srp=%p\n", smp_processor_id(), ceiling2sem(prio)); ++ return; ++ } ++ list_for_each(pos, &srp->ceiling) ++ if (unlikely(srp_higher_prio(prio, list2prio(pos)))) { ++ __list_add(&prio->list, pos->prev, pos); ++ return; ++ } ++ ++ list_add_tail(&prio->list, &srp->ceiling); ++} ++ ++ ++static void* create_srp_semaphore(void) ++{ ++ struct srp_semaphore* sem; ++ ++ sem = kmalloc(sizeof(*sem), GFP_KERNEL); ++ if (!sem) ++ return NULL; ++ ++ INIT_LIST_HEAD(&sem->ceiling.list); ++ sem->ceiling.period = 0; ++ sem->cpu = UNDEF_SEM; ++ sem->owner = NULL; ++ atomic_inc(&srp_objects_in_use); ++ return sem; ++} ++ ++static noinline int open_srp_semaphore(struct od_table_entry* entry, void* __user arg) ++{ ++ struct srp_semaphore* sem = (struct srp_semaphore*) entry->obj->obj; ++ int ret = 0; ++ struct task_struct* t = current; ++ struct srp_priority t_prio; ++ ++ TRACE("opening SRP semaphore %p, cpu=%d\n", sem, sem->cpu); ++ if (!srp_active()) ++ return -EBUSY; ++ ++ if (sem->cpu == UNDEF_SEM) ++ sem->cpu = get_partition(t); ++ else if (sem->cpu != get_partition(t)) ++ ret = -EPERM; ++ ++ if (ret == 0) { ++ t_prio.period = get_rt_period(t); ++ t_prio.pid = t->pid; ++ if (srp_higher_prio(&t_prio, &sem->ceiling)) { ++ sem->ceiling.period = t_prio.period; ++ sem->ceiling.pid = t_prio.pid; ++ } ++ } ++ ++ return ret; ++} ++ ++static void destroy_srp_semaphore(void* sem) ++{ ++ /* XXX invariants */ ++ atomic_dec(&srp_objects_in_use); ++ kfree(sem); ++} ++ ++struct fdso_ops srp_sem_ops = { ++ .create = create_srp_semaphore, ++ .open = open_srp_semaphore, ++ .destroy = destroy_srp_semaphore ++}; ++ ++ ++void do_srp_down(struct srp_semaphore* sem) ++{ ++ /* Update ceiling. */ ++ srp_add_prio(&__get_cpu_var(srp), &sem->ceiling); ++ WARN_ON(sem->owner != NULL); ++ sem->owner = current; ++ TRACE_CUR("acquired srp 0x%p\n", sem); ++} ++ ++void do_srp_up(struct srp_semaphore* sem) ++{ ++ /* Determine new system priority ceiling for this CPU. */ ++ WARN_ON(!in_list(&sem->ceiling.list)); ++ if (in_list(&sem->ceiling.list)) ++ list_del(&sem->ceiling.list); ++ ++ sem->owner = NULL; ++ ++ /* Wake tasks on this CPU, if they exceed current ceiling. 
*/ ++ TRACE_CUR("released srp 0x%p\n", sem); ++ wake_up_all(&__get_cpu_var(srp).ceiling_blocked); ++} ++ ++/* Adjust the system-wide priority ceiling if resource is claimed. */ ++asmlinkage long sys_srp_down(int sem_od) ++{ ++ int cpu; ++ int ret = -EINVAL; ++ struct srp_semaphore* sem; ++ ++ /* disabling preemptions is sufficient protection since ++ * SRP is strictly per CPU and we don't interfere with any ++ * interrupt handlers ++ */ ++ preempt_disable(); ++ TS_SRP_DOWN_START; ++ ++ cpu = smp_processor_id(); ++ sem = lookup_srp_sem(sem_od); ++ if (sem && sem->cpu == cpu) { ++ do_srp_down(sem); ++ ret = 0; ++ } ++ ++ TS_SRP_DOWN_END; ++ preempt_enable(); ++ return ret; ++} ++ ++/* Adjust the system-wide priority ceiling if resource is freed. */ ++asmlinkage long sys_srp_up(int sem_od) ++{ ++ int cpu; ++ int ret = -EINVAL; ++ struct srp_semaphore* sem; ++ ++ preempt_disable(); ++ TS_SRP_UP_START; ++ ++ cpu = smp_processor_id(); ++ sem = lookup_srp_sem(sem_od); ++ ++ if (sem && sem->cpu == cpu) { ++ do_srp_up(sem); ++ ret = 0; ++ } ++ ++ TS_SRP_UP_END; ++ preempt_enable(); ++ return ret; ++} ++ ++asmlinkage long sys_reg_task_srp_sem(int sem_od) ++{ ++ /* unused */ ++ return 0; ++} ++ ++static int srp_wake_up(wait_queue_t *wait, unsigned mode, int sync, ++ void *key) ++{ ++ int cpu = smp_processor_id(); ++ struct task_struct *tsk = wait->private; ++ if (cpu != get_partition(tsk)) ++ TRACE_TASK(tsk, "srp_wake_up on wrong cpu, partition is %d\b", ++ get_partition(tsk)); ++ else if (srp_exceeds_ceiling(tsk, &__get_cpu_var(srp))) ++ return default_wake_function(wait, mode, sync, key); ++ return 0; ++} ++ ++ ++ ++static void do_ceiling_block(struct task_struct *tsk) ++{ ++ wait_queue_t wait = { ++ .private = tsk, ++ .func = srp_wake_up, ++ .task_list = {NULL, NULL} ++ }; ++ ++ tsk->state = TASK_UNINTERRUPTIBLE; ++ add_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait); ++ tsk->rt_param.srp_non_recurse = 1; ++ preempt_enable_no_resched(); ++ schedule(); ++ preempt_disable(); ++ tsk->rt_param.srp_non_recurse = 0; ++ remove_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait); ++} ++ ++/* Wait for current task priority to exceed system-wide priority ceiling. ++ */ ++void srp_ceiling_block(void) ++{ ++ struct task_struct *tsk = current; ++ ++ TS_SRPT_START; ++ ++ /* Only applies to real-time tasks, but optimize for RT tasks. */ ++ if (unlikely(!is_realtime(tsk))) ++ return; ++ ++ /* Avoid recursive ceiling blocking. */ ++ if (unlikely(tsk->rt_param.srp_non_recurse)) ++ return; ++ ++ /* Bail out early if there aren't any SRP resources around. */ ++ if (likely(!atomic_read(&srp_objects_in_use))) ++ return; ++ ++ preempt_disable(); ++ if (!srp_exceeds_ceiling(tsk, &__get_cpu_var(srp))) { ++ TRACE_CUR("is priority ceiling blocked.\n"); ++ TS_SRPT_END; ++ while (!srp_exceeds_ceiling(tsk, &__get_cpu_var(srp))) ++ do_ceiling_block(tsk); ++ TRACE_CUR("finally exceeds system ceiling.\n"); ++ } else { ++ TS_SRPT_END; ++ TRACE_CUR("is not priority ceiling blocked\n"); ++ } ++ preempt_enable(); ++} ++ ++/* ************************************************************************** */ ++ ++ ++ +diff --git a/litmus/pcp.c b/litmus/pcp.c +new file mode 100644 +index 0000000..06030d4 +--- /dev/null ++++ b/litmus/pcp.c +@@ -0,0 +1,764 @@ ++/* pcp.c -- Implementations of the PCP, D-PCP, and M-PCP. 
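++ * Overview: the uniprocessor PCP is realized per CPU via
++ * local_pcp_down()/local_pcp_up() and a per-CPU system ceiling; the
++ * M-PCP uses globally-shared semaphores with priority-ordered wait
++ * queues (global_pcp_down()/global_pcp_up()); the D-PCP hands
++ * requests to an agent task that executes them on the caller's
++ * behalf (dpcp_invoke()/dpcp_agent()).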
++ * ++ */ ++#include <asm/uaccess.h> ++#include <linux/wait.h> ++#include <linux/list.h> ++#include <linux/sched.h> ++#include <linux/spinlock.h> ++#include <linux/completion.h> ++ ++#include <litmus/sched_plugin.h> ++#include <litmus/litmus.h> ++#include <litmus/rm_common.h> ++#include <litmus/fdso.h> ++#include <litmus/trace.h> ++ ++/* from sched_rm.c */ ++void rm_set_prio(struct task_struct *t, struct pcp_priority* new_prio); ++ ++#define GLOBAL_SEM -1 ++#define UNDEF_SEM -2 ++ ++#define get_prio(t) ((t)->rt_param.cur_prio) ++#define get_base_prio(t) (&((t)->rt_param.pcp_prio)) ++ ++ ++struct dpcp_request { ++ struct list_head list; ++ struct completion done; ++ long arg; ++ lt_t prio; ++ int pid; ++}; ++ ++struct pcp_semaphore { ++ int cpu; ++ ++ /* waiting tasks */ ++ wait_queue_head_t blocked; ++ struct pcp_priority* blocked_prio; ++ ++ /* system ceiling support */ ++ struct list_head list; ++ struct pcp_priority ceiling; ++ ++ /* task_struct owned_semaphore list */ ++ struct list_head owned_list; ++ ++ /* Current lock holder. ++ * NULL implies unlocked. ++ */ ++ struct task_struct* holder; ++ ++ /* D-PCP support */ ++ spinlock_t dpcp_lock; ++ struct list_head dpcp_requests; ++ int dpcp_count; ++ struct dpcp_request* dpcp_current; ++ struct completion dpcp_job; ++ struct task_struct* dpcp_agent; ++}; ++ ++static DEFINE_PER_CPU(spinlock_t, pcp_lock); ++static DEFINE_PER_CPU(struct list_head, sys_ceiling); ++ ++static noinline void init_pcp_sem(struct pcp_semaphore *sem, int cpu) ++{ ++ sem->cpu = cpu; ++ init_waitqueue_head(&sem->blocked); ++ INIT_LIST_HEAD(&sem->list); ++ INIT_LIST_HEAD(&sem->owned_list); ++ INIT_LIST_HEAD(&sem->dpcp_requests); ++ sem->holder = NULL; ++ sem->dpcp_current = NULL; ++ sem->blocked_prio = NULL; ++ sem->ceiling = (struct pcp_priority) {ULLONG_MAX, 0, INT_MAX}; ++ init_completion(&sem->dpcp_job); ++ spin_lock_init(&sem->dpcp_lock); ++ sem->dpcp_count = 0; ++ sem->dpcp_agent = NULL; ++} ++ ++static noinline int tsk_pcp_higher_prio(struct task_struct* t, ++ struct pcp_priority* p2) ++{ ++ return _rm_higher_prio(t->rt_param.cur_prio, p2); ++} ++ ++static noinline struct pcp_semaphore* get_ceiling(int cpu) ++{ ++ struct list_head *ceil_list = &per_cpu(sys_ceiling, cpu); ++ if (list_empty(ceil_list)) ++ return NULL; ++ return list_entry(ceil_list->next, struct pcp_semaphore, list); ++} ++ ++static noinline void raise_ceiling(struct pcp_semaphore* sem, int cpu) ++{ ++ struct list_head *ceil_list = &per_cpu(sys_ceiling, cpu); ++ list_add(&sem->list, ceil_list); ++} ++ ++static noinline int exceeds_ceiling(struct task_struct* t, ++ struct pcp_semaphore* ceil) ++{ ++ return !ceil || ceil->holder == t || ++ tsk_pcp_higher_prio(t, &ceil->ceiling); ++} ++ ++static noinline void give_priority(struct task_struct* t, struct pcp_semaphore* sem) ++{ ++ struct pcp_semaphore* next; ++ /* sem->blocked_prio can be NULL, but _rm_higher_prio() handles that */ ++ ++ /* only update if we actually exceed existing priorities */ ++ if (_rm_higher_prio(get_prio(t), sem->blocked_prio) && ++ _rm_higher_prio(get_prio(t), get_base_prio(sem->holder))) { ++ /* we need to register our priority */ ++ sem->blocked_prio = get_prio(t); ++ ++ /* only update task if it results in a priority increase */ ++ if (_rm_higher_prio(get_prio(t), get_prio(sem->holder))) { ++ /* update prio */ ++ TRACE("PCP: %s/%d inherits from %s/%d\n", ++ sem->holder->comm, sem->holder->pid, ++ t->comm, t->pid); ++ rm_set_prio(sem->holder, get_prio(t)); ++ /* check if recipient is blocked, too */ ++ next = 
sem->holder->rt_param.blocked_on; ++ if (next) ++ /* Transitive priority inheritance. ++ * Recurse. ++ */ ++ give_priority(sem->holder, next); ++ } ++ } ++} ++ ++static noinline long local_pcp_down(struct pcp_semaphore *sem) ++{ ++ long ret = 0; ++ struct task_struct* t = current; ++ struct pcp_semaphore* ceiling; ++ int cpu; ++ int ceiling_passed = 0; ++ ++ /* don't allow recursive locking */ ++ if (sem->holder == t) ++ return -EINVAL; ++ ++ cpu = smp_processor_id(); ++ if (cpu != sem->cpu) { ++ preempt_enable(); ++ return -EPERM; ++ } ++ ++ ++ /* first we need to pass the local system ceiling */ ++ while (!ceiling_passed) { ++ ceiling = get_ceiling(cpu); ++ TRACE_TASK(t, "PCP: I want %p, ceiling is %p\n", sem, ceiling); ++ ceiling_passed = exceeds_ceiling(t, ceiling); ++ if (!ceiling_passed) { ++ /* block on sys_ceiling */ ++ DECLARE_WAITQUEUE(waitq, t); ++ TRACE_TASK(t, "blocks on PCP system ceiling\n"); ++ add_wait_queue(&ceiling->blocked, &waitq); ++ /* initiate priority inheritance */ ++ give_priority(t, ceiling); ++ t->rt_param.blocked_on = ceiling; ++ t->state = TASK_UNINTERRUPTIBLE; ++ preempt_enable_no_resched(); ++ TS_PCP1_DOWN_END; ++ schedule(); ++ preempt_disable(); ++ t->rt_param.blocked_on = NULL; ++ remove_wait_queue(&ceiling->blocked, &waitq); ++ } else { ++ if (ceiling) ++ TRACE_TASK(t, ++ "system ceiling passed: {%llu, %d, %d} < " ++ "{%llu, %d, %d}\n", ++ ceiling->ceiling.prio, ++ ceiling->ceiling.in_global_cs, ++ ceiling->ceiling.pid, ++ t->rt_param.cur_prio->prio, ++ t->rt_param.cur_prio->in_global_cs, ++ t->rt_param.cur_prio->pid ++ ); ++ else ++ TRACE_TASK(t, ++ "system ceiling passed: NULL < " ++ "{%llu, %d, %d}\n", ++ t->rt_param.cur_prio->prio, ++ t->rt_param.cur_prio->in_global_cs, ++ t->rt_param.cur_prio->pid ++ ); ++ TS_PCP1_DOWN_END; ++ } ++ } ++ ++ TS_PCP2_DOWN_START; ++ /* Since we have passed the priority ceiling the semaphore cannot be ++ * in use. If it were in use then the ceiling would be at least as high ++ * as our priority. ++ */ ++ WARN_ON(sem->holder); ++ ++ TRACE_TASK(t, "taking PCP semaphore 0x%p, owner:%p\n", sem, sem->holder); ++ ++ /* We can become the owner. */ ++ sem->holder = t; ++ list_add(&sem->owned_list, &t->rt_param.owned_semaphores); ++ ++ /* We need to update the system ceiling, but only ++ * if the new ceiling is higher than the old. 
++ */ ++ ceiling = get_ceiling(cpu); ++ /* if the priorities are equal then t already owns ceiling, ++ * otherwise it would not have gotten past the system ceiling ++ */ ++ if (!ceiling || _rm_higher_prio(&sem->ceiling, &ceiling->ceiling)) { ++ raise_ceiling(sem, cpu); ++ TRACE_TASK(t, "raised ceiling on %d\n", cpu); ++ } ++ ++ TS_PCP2_DOWN_END; ++ return ret; ++} ++ ++static noinline struct pcp_priority* fetch_highest_prio(struct task_struct *t) ++{ ++ struct pcp_priority *prio; ++ struct list_head* pos; ++ struct pcp_semaphore* sem; ++ ++ /* base case is that the task uses its normal priority */ ++ prio = get_base_prio(t); ++ ++ /* now search the list of semaphores that we own for a higher priority ++ * to inherit ++ */ ++ list_for_each(pos, &t->rt_param.owned_semaphores) { ++ sem = list_entry(pos, struct pcp_semaphore, owned_list); ++ /* sem->blocked_prio could be NULL */ ++ if (!_rm_higher_prio(prio, sem->blocked_prio)) ++ prio = sem->blocked_prio; ++ } ++ return prio; ++} ++ ++static noinline long local_pcp_up(struct pcp_semaphore *sem) ++{ ++ long ret = 0; ++ struct task_struct* t = current; ++ int cpu; ++ ++ cpu = smp_processor_id(); ++ ++ if (cpu != sem->cpu) ++ return -EPERM; ++ ++ if (sem->holder == t) { ++ TRACE_TASK(t, "giving up PCP semaphore 0x%p.\n", sem); ++ ++ /* we need to unblock all tasks in the wait_queue */ ++ wake_up_all(&sem->blocked); ++ ++ /* unlock semaphore */ ++ sem->holder = NULL; ++ list_del(&sem->owned_list); ++ ++ /* remove from system ceiling list */ ++ if (in_list(&sem->list)) ++ list_del(&sem->list); ++ ++ if (sem->blocked_prio == get_prio(t)) { ++ /* We are currently inheriting from this ++ * semaphore. We need to figure out which priority ++ * we should fall back to. ++ */ ++ TRACE_TASK(t, "giving up inherited prio.\n"); ++ rm_set_prio(t, fetch_highest_prio(t)); ++ } ++ /* reset semaphore priority inheritance */ ++ sem->blocked_prio = NULL; ++ } else { ++ TRACE_TASK(t, "local_pcp_up EINVAL 0x%p.\n", sem); ++ ret = -EINVAL; ++ } ++ ++ TS_PCP_UP_END; ++ return ret; ++} ++ ++static noinline struct task_struct* wqlist2task(struct list_head* l) ++{ ++ return (struct task_struct*) ++ list_entry(l, wait_queue_t, task_list)->private; ++} ++ ++static noinline int wait_order(struct list_head* la, struct list_head* lb) ++{ ++ return rm_higher_prio(wqlist2task(la), wqlist2task(lb)); ++} ++ ++/* The default function is too picky. ++ * We really only want to wake up one task. ++ */ ++int single_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key) ++{ ++ int ret = default_wake_function(wait, mode, sync, key); ++ if (!ret) ++ TRACE("Overriding default_wake_function() return code.\n"); ++ return 1; ++} ++ ++static noinline long global_pcp_down(struct pcp_semaphore* sem) ++{ ++ unsigned long flags; ++ long ret = 0; ++ struct task_struct* t = current; ++ ++ /* don't allow recursive locking */ ++ if (sem->holder == t) ++ return -EINVAL; ++ ++ spin_lock_irqsave(&sem->blocked.lock, flags); ++ ++ /* Get the global priority. Do this before ++ * we block, so that we wake up as a high-priority task. ++ */ ++ t->rt_param.pcp_prio.in_global_cs = 1; ++ rm_set_prio(t, &t->rt_param.pcp_prio); ++ ++ if (sem->holder) { ++ /* semaphore is not free. We need to block. 
*/ ++ DECLARE_WAITQUEUE(waitq, t); ++ TRACE_TASK(t, "blocks on MPCP semaphore %p.\n", sem); ++ waitq.flags = WQ_FLAG_EXCLUSIVE; ++ waitq.func = single_wake_function; ++ /* insert ordered by priority */ ++ list_insert(&waitq.task_list, &sem->blocked.task_list, ++ wait_order); ++ t->state = TASK_UNINTERRUPTIBLE; ++ spin_unlock_irqrestore(&sem->blocked.lock, flags); ++ preempt_enable_no_resched(); ++ TS_MPCP_DOWN_END; ++ ++ schedule(); ++ ++ preempt_disable(); ++ /* once we wake up we are the owner of the lock */ ++ spin_lock_irqsave(&sem->blocked.lock, flags); ++ remove_wait_queue_locked(&sem->blocked, &waitq); ++ } else { ++ /* semaphore is free. We can proceed. */ ++ TS_MPCP_DOWN_END; ++ sem->holder = t; ++ } ++ if (sem->holder != t) { ++ if (sem->holder) ++ TRACE("expected %s/%d, but I am %s/%d\n", ++ sem->holder->comm, sem->holder->pid, t->comm, t->pid); ++ else ++ TRACE("expected NULL, but I am %s/%d\n", ++ t->comm, t->pid); ++ } ++ TRACE_TASK(t, "acquired MPCP semaphore %p.\n", sem); ++ ++ ++ spin_unlock_irqrestore(&sem->blocked.lock, flags); ++ return ret; ++} ++ ++static noinline long global_pcp_up(struct pcp_semaphore* sem) ++{ ++ unsigned long flags; ++ long ret = 0; ++ struct task_struct* t = current; ++ ++ if (sem->holder != t) ++ return -EINVAL; ++ ++ TRACE_TASK(t, "releasing MPCP semaphore %p.\n", sem); ++ ++ spin_lock_irqsave(&sem->blocked.lock, flags); ++ if (waitqueue_active(&sem->blocked)) { ++ /* pass ownership on */ ++ sem->holder = wqlist2task(sem->blocked.task_list.next); ++ TRACE_TASK(t, "waking up next (=%s/%d) on MPCP semaphore %p.\n", ++ sem->holder->comm, sem->holder->pid, sem); ++ /* wake up first */ ++ wake_up_locked(&sem->blocked); ++ } else ++ sem->holder = NULL; ++ ++ /* restore our own priority */ ++ t->rt_param.pcp_prio.in_global_cs = 0; ++ rm_set_prio(t, &t->rt_param.pcp_prio); ++ ++ TS_MPCP_UP_END; ++ spin_unlock_irqrestore(&sem->blocked.lock, flags); ++ return ret; ++} ++ ++static noinline int request_order(struct list_head* la, struct list_head* lb) ++{ ++ struct dpcp_request *a, *b; ++ a = list_entry(la, struct dpcp_request, list); ++ b = list_entry(lb, struct dpcp_request, list); ++ return a->prio < b->prio; ++} ++ ++static noinline long dpcp_invoke(struct pcp_semaphore* sem, long arg) ++{ ++ unsigned long flags; ++ long ret = 0; ++ struct task_struct* t = current, *a; ++ struct dpcp_request req; ++ ++ spin_lock_irqsave(&sem->dpcp_lock, flags); ++ ++ init_completion(&req.done); ++ req.arg = arg; ++ req.prio = t->rt_param.pcp_prio.prio; ++ req.pid = t->rt_param.pcp_prio.pid; ++ ++ list_insert(&req.list, &sem->dpcp_requests, ++ request_order); ++ ++ if (!(sem->dpcp_count++)) { ++ /* agent needs to be awakened */ ++ TRACE_TASK(t, "waking DPCP agent for %p.\n", sem); ++ if (sem->dpcp_agent) { ++ a = sem->dpcp_agent; ++ /* set agent priority */ ++ a->rt_param.pcp_prio.in_global_cs = 1; ++ a->rt_param.pcp_prio.prio = req.prio; ++ rm_set_prio(a, &a->rt_param.pcp_prio); ++ } ++ complete(&sem->dpcp_job); ++ } ++ ++ spin_unlock_irqrestore(&sem->dpcp_lock, flags); ++ TRACE_TASK(t, "blocking on DPCP sem %p.\n", sem); ++ preempt_enable_no_resched(); ++ TS_DPCP_INVOKE_END; ++ ++ wait_for_completion(&req.done); ++ ++ preempt_disable(); ++ /* we don't need to clean up, the remote agent did that for us */ ++ return ret; ++} ++ ++static noinline long dpcp_agent(struct pcp_semaphore* sem, long flags, long *arg) ++{ ++ unsigned long spinflags; ++ long ret = 0; ++ struct task_struct* t = current; ++ ++ spin_lock_irqsave(&sem->dpcp_lock, spinflags); ++ ++ /* defend against 
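
Aside: global_pcp_down() above queues blocked tasks in priority order and wakes exactly one of them at a time (WQ_FLAG_EXCLUSIVE plus single_wake_function), so the head of sem->blocked is always the next owner that global_pcp_up() hands the lock to. The short user-space sketch below is not from the patch; list_insert() is a LITMUS helper whose implementation is not shown here, so this only illustrates the ordered-insert idea with a plain singly linked list.

    #include <stdio.h>

    struct waiter {
        int prio;               /* smaller value = higher priority (RM period) */
        struct waiter *next;
    };

    /* insert w so the queue stays sorted by priority; returns the new head */
    static struct waiter *insert_ordered(struct waiter *head, struct waiter *w)
    {
        struct waiter **link = &head;
        while (*link && (*link)->prio <= w->prio)   /* FIFO among equal prios */
            link = &(*link)->next;
        w->next = *link;
        *link = w;
        return head;
    }

    int main(void)
    {
        struct waiter a = { 30, NULL }, b = { 10, NULL }, c = { 20, NULL };
        struct waiter *q = NULL;
        q = insert_ordered(q, &a);
        q = insert_ordered(q, &b);
        q = insert_ordered(q, &c);
        for (struct waiter *w = q; w; w = w->next)
            printf("%d ", w->prio);                 /* prints: 10 20 30 */
        printf("\n");
        return 0;
    }
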
multiple concurrent agents */ ++ if (sem->dpcp_agent && sem->dpcp_agent != t) { ++ spin_unlock_irqrestore(&sem->dpcp_lock, spinflags); ++ return -EBUSY; ++ } else ++ sem->dpcp_agent = t; ++ ++ if (sem->cpu != get_partition(t)) { ++ int cpu = smp_processor_id(); ++ spin_unlock_irqrestore(&sem->dpcp_lock, spinflags); ++ printk(KERN_CRIT ++ "dpcp_agent: sem->cpu: %d, but agent " ++ "is on %d, and part=%d\n", ++ sem->cpu, cpu, get_partition(t)); ++ return -EINVAL; ++ } ++ ++ if ((flags & DPCP_COMPLETE) && sem->dpcp_current) { ++ TRACE_TASK(t, "completing DPCP sem %p.\n", sem); ++ /* we need to release the holder */ ++ complete(&sem->dpcp_current->done); ++ sem->dpcp_count--; ++ sem->dpcp_current = NULL; ++ } ++ ++ if (flags & DPCP_WAIT) { ++ do { ++ if (sem->dpcp_count) { ++ /* pass ownership on */ ++ sem->dpcp_current = list_entry( ++ sem->dpcp_requests.next, ++ struct dpcp_request, list); ++ list_del(sem->dpcp_requests.next); ++ t->rt_param.pcp_prio.in_global_cs = 1; ++ t->rt_param.pcp_prio.prio = ++ sem->dpcp_current->prio; ++ t->rt_param.pcp_prio.pid = sem->dpcp_current->pid; ++ rm_set_prio(t, &t->rt_param.pcp_prio); ++ TS_DPCP_AGENT2_END; ++ } else { ++ /* need to wait */ ++ spin_unlock_irqrestore(&sem->dpcp_lock, ++ spinflags); ++ TRACE_TASK(t, "agent waiting for " ++ "DPCP sem %p.\n", sem); ++ ++ preempt_enable_no_resched(); ++ TS_DPCP_AGENT2_END; ++ ret = wait_for_completion_interruptible(&sem->dpcp_job); ++ preempt_disable(); ++ TRACE_TASK(t, "got DPCP job on sem %p, " ++ "ret=%d.\n", sem, ret); ++ spin_lock_irqsave(&sem->dpcp_lock, spinflags); ++ if (ret != 0) { ++ /* FIXME: set priority */ ++ break; ++ } ++ } ++ } while (!sem->dpcp_current); ++ if (ret == 0) ++ *arg = sem->dpcp_current->arg; ++ } else { ++ /* restore our own priority */ ++ t->rt_param.pcp_prio.in_global_cs = 0; ++ t->rt_param.pcp_prio.prio = ULLONG_MAX; ++ rm_set_prio(t, &t->rt_param.pcp_prio); ++ sem->dpcp_agent = NULL; ++ } ++ ++ spin_unlock_irqrestore(&sem->dpcp_lock, spinflags); ++ return ret; ++} ++ ++ ++/* system calls */ ++ ++asmlinkage long sys_pcp_down(int sem_od) ++{ ++ long ret = 0; ++ struct pcp_semaphore * sem; ++ ++ preempt_disable(); ++ TS_MPCP_DOWN_START; ++ TS_PCP1_DOWN_START; ++ ++ if (!is_realtime(current)) { ++ ret = -EPERM; ++ goto out; ++ } ++ ++ sem = lookup_pcp_sem(sem_od); ++ if (sem) { ++ if (sem->cpu != GLOBAL_SEM) ++ ret = local_pcp_down(sem); ++ else ++ ret = global_pcp_down(sem); ++ } else ++ ret = -EINVAL; ++ ++out: ++ preempt_enable(); ++ return ret; ++} ++ ++asmlinkage long sys_pcp_up(int sem_od) ++{ ++ long ret = 0; ++ struct pcp_semaphore * sem; ++ ++ preempt_disable(); ++ TS_PCP_UP_START; ++ TS_MPCP_UP_START; ++ ++ if (!is_realtime(current)) { ++ ret = -EPERM; ++ goto out; ++ } ++ ++ sem = lookup_pcp_sem(sem_od); ++ if (sem) { ++ if (sem->cpu != GLOBAL_SEM) ++ ret = local_pcp_up(sem); ++ else ++ ret = global_pcp_up(sem); ++ } else ++ ret = -EINVAL; ++ ++out: ++ preempt_enable(); ++ return ret; ++} ++ ++ ++asmlinkage long sys_dpcp_invoke(int sem_od, long arg) ++{ ++ long ret = 0; ++ struct pcp_semaphore * sem; ++ ++ preempt_disable(); ++ TS_DPCP_INVOKE_START; ++ ++ if (!is_realtime(current)) { ++ ret = -EPERM; ++ goto out; ++ } ++ ++ sem = lookup_pcp_sem(sem_od); ++ if (sem) { ++ ret = dpcp_invoke(sem, arg); ++ } else ++ ret = -EINVAL; ++ ++out: ++ preempt_enable(); ++ return ret; ++} ++ ++asmlinkage long sys_dpcp_agent(int sem_od, long flags, long __user *__arg) ++{ ++ long ret = 0; ++ long arg; ++ struct pcp_semaphore * sem; ++ ++ preempt_disable(); ++ TS_DPCP_AGENT1_START; ++ ++ 
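
Aside: dpcp_invoke() and dpcp_agent() above implement the D-PCP handshake: clients enqueue requests ordered by priority and sleep on a completion, while the remote agent serves the head request and temporarily runs at that request's priority. The user-space sketch below is not from the patch and all names are invented for the illustration; it shows only the queueing and priority-inheritance part, whereas the real code also synchronizes through dpcp_lock and completions.

    #include <stdio.h>

    #define MAX_REQ 8

    struct dpcp_req { unsigned long long prio; int pid; long arg; };

    static struct dpcp_req queue[MAX_REQ];
    static int nreq;

    static void enqueue(struct dpcp_req r)
    {
        int i = nreq++;
        /* keep the array sorted: smaller prio value = more urgent */
        while (i > 0 && queue[i - 1].prio > r.prio) {
            queue[i] = queue[i - 1];
            i--;
        }
        queue[i] = r;
    }

    /* agent side: pick the most urgent request and inherit its priority */
    static struct dpcp_req serve_next(unsigned long long *agent_prio)
    {
        struct dpcp_req r = queue[0];
        for (int i = 1; i < nreq; i++)
            queue[i - 1] = queue[i];
        nreq--;
        *agent_prio = r.prio;   /* analogous to rm_set_prio(agent, request prio) */
        return r;
    }

    int main(void)
    {
        unsigned long long agent_prio = ~0ULL;  /* idle agent: lowest priority */
        enqueue((struct dpcp_req){ 50, 1, 111 });
        enqueue((struct dpcp_req){ 20, 2, 222 });
        struct dpcp_req r = serve_next(&agent_prio);
        printf("serving pid %d at prio %llu\n", r.pid, agent_prio); /* pid 2, prio 20 */
        return 0;
    }
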
if (!is_realtime(current)) { ++ ret = -EPERM; ++ goto out; ++ } ++ ++ sem = lookup_pcp_sem(sem_od); ++ if (sem) { ++ TS_DPCP_AGENT1_END; ++ if (flags & DPCP_COMPLETE) { ++ TS_PCP_UP_START; ++ local_pcp_up(sem); ++ } ++ TS_DPCP_AGENT2_START; ++ ret = dpcp_agent(sem, flags, &arg); ++ if (ret == 0 && (flags & DPCP_WAIT)) { ++ ret = put_user(arg, __arg); ++ if (ret == 0) { ++ TS_PCP1_DOWN_START; ++ local_pcp_down(sem); ++ } ++ } ++ } else ++ ret = -EINVAL; ++ ++out: ++ preempt_enable(); ++ return ret; ++} ++ ++ ++/* FDSO callbacks */ ++ ++static noinline void* create_pcp_semaphore(void) ++{ ++ struct pcp_semaphore* sem; ++ ++ sem = kmalloc(sizeof(struct pcp_semaphore), GFP_KERNEL); ++ if (!sem) ++ return NULL; ++ init_pcp_sem(sem, UNDEF_SEM); ++ TRACE("allocated PCP semaphore %p\n", sem); ++ return sem; ++} ++ ++static noinline void destroy_pcp_semaphore(void* obj) ++{ ++ struct pcp_semaphore* sem = (struct pcp_semaphore*) obj; ++ WARN_ON(sem->holder); ++ WARN_ON(in_list(&sem->list)); ++ kfree(sem); ++} ++ ++static noinline void update_pcp_ceiling(struct pcp_semaphore* sem, struct task_struct* t, int global) ++{ ++ struct pcp_priority prio = {get_rt_period(t), 1, t->pid}; ++ if (global && !sem->ceiling.in_global_cs) ++ sem->ceiling.in_global_cs = 1; ++ if (_rm_higher_prio(&prio, &sem->ceiling)) ++ sem->ceiling = prio; ++} ++ ++static noinline int open_pcp_semaphore(struct od_table_entry* entry, void __user *__arg) ++{ ++ struct pcp_semaphore* sem = (struct pcp_semaphore*) entry->obj->obj; ++ int *arg = (int*) __arg; ++ struct task_struct* t = current; ++ int cpu= get_partition(t); ++ ++ TRACE("opening PCP semaphore %p, cpu=%d\n", sem, sem->cpu); ++ if (!pcp_active()) ++ return -EBUSY; ++ ++ if (arg && get_user(cpu, arg) != 0) ++ return -EFAULT; ++ ++ if (sem->cpu == UNDEF_SEM) ++ sem->cpu = cpu; ++ ++ update_pcp_ceiling(sem, t, sem->cpu != get_partition(t)); ++ ++ return 0; ++} ++ ++static noinline void update_mpcp_ceiling(struct pcp_semaphore* sem, struct task_struct* t) ++{ ++ struct pcp_priority prio = {get_rt_period(t), 1, t->pid}; ++ if (_rm_higher_prio(&prio, &sem->ceiling)) ++ sem->ceiling = prio; ++} ++ ++static noinline int open_mpcp_semaphore(struct od_table_entry* entry, void* __user arg) ++{ ++ struct pcp_semaphore* sem = (struct pcp_semaphore*) entry->obj->obj; ++ int ret = 0; ++ struct task_struct* t = current; ++ ++ if (!pcp_active()) ++ return -EBUSY; ++ ++ if (sem->cpu == UNDEF_SEM) ++ sem->cpu = GLOBAL_SEM; ++ ++ update_mpcp_ceiling(sem, t); ++ ++ return ret; ++} ++ ++struct fdso_ops pcp_sem_ops = { ++ .create = create_pcp_semaphore, ++ .destroy = destroy_pcp_semaphore, ++ .open = open_pcp_semaphore ++}; ++ ++struct fdso_ops mpcp_sem_ops = { ++ .create = create_pcp_semaphore, ++ .destroy = destroy_pcp_semaphore, ++ .open = open_mpcp_semaphore ++}; ++ ++static noinline int __init pcp_boot_init(void) ++{ ++ int i; ++ ++ printk("Initializing PCP per-CPU ceilings..."); ++ for (i = 0; i < NR_CPUS; i++) { ++ INIT_LIST_HEAD(&per_cpu(sys_ceiling, i)); ++ per_cpu(pcp_lock, i) = __SPIN_LOCK_UNLOCKED(pcp_lock); ++ } ++ printk(" done!\n"); ++ ++ return 0; ++} ++ ++module_init(pcp_boot_init); +diff --git a/litmus/rm_common.c b/litmus/rm_common.c +new file mode 100644 +index 0000000..9bf21fd +--- /dev/null ++++ b/litmus/rm_common.c +@@ -0,0 +1,76 @@ ++/* ++ * litmus/rm_common.c ++ * ++ * Common functions for RM based schedulers. 
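
Aside: create_pcp_semaphore()/destroy_pcp_semaphore()/open_pcp_semaphore() above are FDSO callbacks; each semaphore type exports a small ops table (pcp_sem_ops, mpcp_sem_ops) through which the generic file-descriptor-attached-shared-object layer in litmus/fdso.c (not shown in this excerpt) dispatches. The miniature user-space sketch below only illustrates that dispatch pattern, with made-up names.

    #include <stdio.h>
    #include <stdlib.h>

    struct fdso_ops {
        void *(*create)(void);
        void  (*destroy)(void *obj);
        int   (*open)(void *obj, void *arg);
    };

    /* a trivial "semaphore" object standing in for struct pcp_semaphore */
    struct toy_sem { int cpu; };

    static void *toy_create(void)     { return calloc(1, sizeof(struct toy_sem)); }
    static void  toy_destroy(void *o) { free(o); }
    static int   toy_open(void *o, void *arg)
    {
        struct toy_sem *sem = o;
        sem->cpu = arg ? *(int *)arg : -1;   /* -1 plays the role of GLOBAL_SEM */
        return 0;
    }

    static const struct fdso_ops toy_sem_ops = {
        .create  = toy_create,
        .destroy = toy_destroy,
        .open    = toy_open,
    };

    int main(void)
    {
        int cpu = 2;
        void *obj = toy_sem_ops.create();
        toy_sem_ops.open(obj, &cpu);
        printf("opened toy semaphore on cpu %d\n", ((struct toy_sem *)obj)->cpu);
        toy_sem_ops.destroy(obj);
        return 0;
    }
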
++ * ++ * FIXME: Too much code duplication with edf_common.c ++ */ ++ ++#include <linux/percpu.h> ++#include <linux/sched.h> ++#include <linux/list.h> ++ ++#include <litmus/litmus.h> ++#include <litmus/sched_plugin.h> ++#include <litmus/sched_trace.h> ++ ++ ++#include <litmus/rm_common.h> ++ ++/* rm_higher_prio - returns true if first has a higher RM priority ++ * than second. Period ties are broken by PID. ++ * ++ * first first must not be NULL and a real-time task. ++ * second may be NULL or a non-rt task. ++ */ ++int rm_higher_prio(struct task_struct* first, ++ struct task_struct* second) ++{ ++ struct pcp_priority *p1, *p2; ++ ++ /* verify assumptions in DEBUG build */ ++ BUG_ON(!first); ++ BUG_ON(!is_realtime(first)); ++ BUG_ON(second && !is_realtime(second) && second->rt_param.cur_prio); ++ ++ p1 = first->rt_param.cur_prio; ++ ++ /* if second is not a real-time task, then cur_prio is NULL */ ++ p2 = second ? second->rt_param.cur_prio : NULL; ++ return _rm_higher_prio(p1, p2); ++} ++ ++int rm_ready_order(struct list_head* a, struct list_head* b) ++{ ++ return rm_higher_prio( ++ list_entry(a, struct task_struct, rt_list), ++ list_entry(b, struct task_struct, rt_list)); ++} ++ ++ ++void rm_domain_init(rt_domain_t* rt, check_resched_needed_t resched) ++{ ++ rt_domain_init(rt, resched, rm_ready_order); ++} ++ ++/* need_to_preempt - check whether the task t needs to be preempted ++ * call only with irqs disabled and with ready_lock acquired ++ * THIS DOES NOT TAKE NON-PREEMPTIVE SECTIONS INTO ACCOUNT! ++ */ ++int rm_preemption_needed(rt_domain_t* rt, struct task_struct *t) ++{ ++ /* we need the read lock for edf_ready_queue */ ++ /* no need to preempt if there is nothing pending */ ++ if (!ready_jobs_pending(rt)) ++ return 0; ++ /* we need to reschedule if t doesn't exist */ ++ if (!t) ++ return 1; ++ ++ /* NOTE: We cannot check for non-preemptibility since we ++ * don't know what address space we're currently in. ++ */ ++ ++ /* make sure to get non-rt stuff out of the way */ ++ return !is_realtime(t) || rm_higher_prio(next_ready(rt), t); ++} +diff --git a/litmus/rt_domain.c b/litmus/rt_domain.c +new file mode 100644 +index 0000000..fe7bd29 +--- /dev/null ++++ b/litmus/rt_domain.c +@@ -0,0 +1,130 @@ ++/* ++ * kernel/rt_domain.c ++ * ++ * LITMUS real-time infrastructure. This file contains the ++ * functions that manipulate RT domains. RT domains are an abstraction ++ * of a ready queue and a release queue. ++ */ ++ ++#include <linux/percpu.h> ++#include <linux/sched.h> ++#include <linux/list.h> ++ ++#include <litmus/litmus.h> ++#include <litmus/sched_plugin.h> ++#include <litmus/sched_trace.h> ++ ++#include <litmus/rt_domain.h> ++ ++ ++static int dummy_resched(rt_domain_t *rt) ++{ ++ return 0; ++} ++ ++static int dummy_order(struct list_head* a, struct list_head* b) ++{ ++ return 0; ++} ++ ++int release_order(struct list_head* a, struct list_head* b) ++{ ++ return earlier_release( ++ list_entry(a, struct task_struct, rt_list), ++ list_entry(b, struct task_struct, rt_list)); ++} ++ ++ ++void rt_domain_init(rt_domain_t *rt, ++ check_resched_needed_t f, ++ list_cmp_t order) ++{ ++ BUG_ON(!rt); ++ if (!f) ++ f = dummy_resched; ++ if (!order) ++ order = dummy_order; ++ INIT_LIST_HEAD(&rt->ready_queue); ++ INIT_LIST_HEAD(&rt->release_queue); ++ rt->ready_lock = RW_LOCK_UNLOCKED; ++ rt->release_lock = SPIN_LOCK_UNLOCKED; ++ rt->check_resched = f; ++ rt->order = order; ++} ++ ++/* add_ready - add a real-time task to the rt ready queue. It must be runnable. 
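
Aside: rm_preemption_needed() above reduces to three checks: nothing pending, nothing real-time currently running, or a higher-priority job at the head of the ready queue. The standalone user-space sketch below is not from the patch; the two-field task is a stand-in for task_struct/rt_param and simply restates that test.

    #include <stdio.h>

    struct toy_task { unsigned long long period; int is_rt; };

    static int rm_higher(const struct toy_task *a, const struct toy_task *b)
    {
        return a->period < b->period;   /* shorter period = higher RM priority */
    }

    static int preemption_needed(const struct toy_task *ready_head,
                                 const struct toy_task *scheduled)
    {
        if (!ready_head)                        /* nothing pending */
            return 0;
        if (!scheduled || !scheduled->is_rt)    /* get non-rt work out of the way */
            return 1;
        return rm_higher(ready_head, scheduled);
    }

    int main(void)
    {
        struct toy_task head = { 10, 1 }, cur = { 25, 1 };
        printf("preempt: %d\n", preemption_needed(&head, &cur));  /* prints 1 */
        printf("preempt: %d\n", preemption_needed(NULL, &cur));   /* prints 0 */
        return 0;
    }
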
++ * @new: the newly released task ++ */ ++void __add_ready(rt_domain_t* rt, struct task_struct *new) ++{ ++ TRACE("rt: adding %s/%d (%llu, %llu) to ready queue at %llu\n", ++ new->comm, new->pid, get_exec_cost(new), get_rt_period(new), ++ sched_clock()); ++ ++ if (!list_insert(&new->rt_list, &rt->ready_queue, rt->order)) ++ rt->check_resched(rt); ++} ++ ++struct task_struct* __take_ready(rt_domain_t* rt) ++{ ++ struct task_struct *t = __peek_ready(rt); ++ ++ /* kick it out of the ready list */ ++ if (t) ++ list_del(&t->rt_list); ++ return t; ++} ++ ++struct task_struct* __peek_ready(rt_domain_t* rt) ++{ ++ if (!list_empty(&rt->ready_queue)) ++ return next_ready(rt); ++ else ++ return NULL; ++} ++ ++/* add_release - add a real-time task to the rt release queue. ++ * @task: the sleeping task ++ */ ++void __add_release(rt_domain_t* rt, struct task_struct *task) ++{ ++ TRACE("rt: adding %s/%d (%llu, %llu) rel=%llu to release queue\n", ++ task->comm, task->pid, get_exec_cost(task), get_rt_period(task), ++ get_release(task)); ++ ++ list_insert(&task->rt_list, &rt->release_queue, release_order); ++} ++ ++void __release_pending(rt_domain_t* rt) ++{ ++ struct list_head *pos, *save; ++ struct task_struct *queued; ++ lt_t now = sched_clock(); ++ list_for_each_safe(pos, save, &rt->release_queue) { ++ queued = list_entry(pos, struct task_struct, rt_list); ++ if (likely(is_released(queued, now))) { ++ /* this one is ready to go*/ ++ list_del(pos); ++ set_rt_flags(queued, RT_F_RUNNING); ++ ++ sched_trace_job_release(queued); ++ ++ /* now it can be picked up */ ++ barrier(); ++ add_ready(rt, queued); ++ } ++ else ++ /* the release queue is ordered */ ++ break; ++ } ++} ++ ++void try_release_pending(rt_domain_t* rt) ++{ ++ unsigned long flags; ++ ++ if (spin_trylock_irqsave(&rt->release_lock, flags)) { ++ __release_pending(rt); ++ spin_unlock_irqrestore(&rt->release_lock, flags); ++ } ++} +diff --git a/litmus/sched_gsn_edf.c b/litmus/sched_gsn_edf.c +new file mode 100644 +index 0000000..314f8a1 +--- /dev/null ++++ b/litmus/sched_gsn_edf.c +@@ -0,0 +1,733 @@ ++/* ++ * kernel/sched_gsn_edf.c ++ * ++ * Implementation of the GSN-EDF scheduling algorithm. ++ * ++ * This version uses the simple approach and serializes all scheduling ++ * decisions by the use of a queue lock. This is probably not the ++ * best way to do it, but it should suffice for now. ++ */ ++ ++#include <linux/spinlock.h> ++#include <linux/percpu.h> ++#include <linux/sched.h> ++#include <linux/list.h> ++ ++#include <litmus/litmus.h> ++#include <litmus/jobs.h> ++#include <litmus/sched_plugin.h> ++#include <litmus/edf_common.h> ++#include <litmus/sched_trace.h> ++ ++#include <linux/module.h> ++ ++/* Overview of GSN-EDF operations. ++ * ++ * For a detailed explanation of GSN-EDF have a look at the FMLP paper. This ++ * description only covers how the individual operations are implemented in ++ * LITMUS. ++ * ++ * link_task_to_cpu(T, cpu) - Low-level operation to update the linkage ++ * structure (NOT the actually scheduled ++ * task). If there is another linked task To ++ * already it will set To->linked_on = NO_CPU ++ * (thereby removing its association with this ++ * CPU). However, it will not requeue the ++ * previously linked task (if any). It will set ++ * T's state to RT_F_RUNNING and check whether ++ * it is already running somewhere else. If T ++ * is scheduled somewhere else it will link ++ * it to that CPU instead (and pull the linked ++ * task to cpu). T may be NULL. 
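
Aside: __release_pending() above exploits the fact that the release queue is ordered by release time: jobs are moved to the ready queue from the front until the first job whose release still lies in the future, at which point the scan stops early. The user-space sketch below is not from the patch; it shows the same early-exit scan over a sorted array.

    #include <stdio.h>

    #define N 4

    struct toy_job { const char *name; unsigned long long release; };

    int main(void)
    {
        /* release queue, ordered by release time */
        struct toy_job release_q[N] = {
            { "A", 100 }, { "B", 150 }, { "C", 400 }, { "D", 900 },
        };
        unsigned long long now = 200;   /* analogous to sched_clock() */
        int i;

        for (i = 0; i < N; i++) {
            if (release_q[i].release > now)
                break;                  /* queue is ordered: nothing later is ready */
            printf("releasing job %s (release=%llu)\n",
                   release_q[i].name, release_q[i].release);
            /* the real code re-inserts the job into the ready queue here */
        }
        printf("%d job(s) still waiting\n", N - i);
        return 0;
    }
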
++ * ++ * unlink(T) - Unlink removes T from all scheduler data ++ * structures. If it is linked to some CPU it ++ * will link NULL to that CPU. If it is ++ * currently queued in the gsnedf queue it will ++ * be removed from the T->rt_list. It is safe to ++ * call unlink(T) if T is not linked. T may not ++ * be NULL. ++ * ++ * requeue(T) - Requeue will insert T into the appropriate ++ * queue. If the system is in real-time mode and ++ * the T is released already, it will go into the ++ * ready queue. If the system is not in ++ * real-time mode is T, then T will go into the ++ * release queue. If T's release time is in the ++ * future, it will go into the release ++ * queue. That means that T's release time/job ++ * no/etc. has to be updated before requeu(T) is ++ * called. It is not safe to call requeue(T) ++ * when T is already queued. T may not be NULL. ++ * ++ * gsnedf_job_arrival(T) - This is the catch all function when T enters ++ * the system after either a suspension or at a ++ * job release. It will queue T (which means it ++ * is not safe to call gsnedf_job_arrival(T) if ++ * T is already queued) and then check whether a ++ * preemption is necessary. If a preemption is ++ * necessary it will update the linkage ++ * accordingly and cause scheduled to be called ++ * (either with an IPI or need_resched). It is ++ * safe to call gsnedf_job_arrival(T) if T's ++ * next job has not been actually released yet ++ * (releast time in the future). T will be put ++ * on the release queue in that case. ++ * ++ * job_completion(T) - Take care of everything that needs to be done ++ * to prepare T for its next release and place ++ * it in the right queue with ++ * gsnedf_job_arrival(). ++ * ++ * ++ * When we now that T is linked to CPU then link_task_to_cpu(NULL, CPU) is ++ * equivalent to unlink(T). Note that if you unlink a task from a CPU none of ++ * the functions will automatically propagate pending task from the ready queue ++ * to a linked task. This is the job of the calling function ( by means of ++ * __take_ready). ++ */ ++ ++ ++/* cpu_entry_t - maintain the linked and scheduled state ++ */ ++typedef struct { ++ int cpu; ++ struct task_struct* linked; /* only RT tasks */ ++ struct task_struct* scheduled; /* only RT tasks */ ++ struct list_head list; ++ atomic_t will_schedule; /* prevent unneeded IPIs */ ++} cpu_entry_t; ++DEFINE_PER_CPU(cpu_entry_t, gsnedf_cpu_entries); ++ ++#define set_will_schedule() \ ++ (atomic_set(&__get_cpu_var(gsnedf_cpu_entries).will_schedule, 1)) ++#define clear_will_schedule() \ ++ (atomic_set(&__get_cpu_var(gsnedf_cpu_entries).will_schedule, 0)) ++#define test_will_schedule(cpu) \ ++ (atomic_read(&per_cpu(gsnedf_cpu_entries, cpu).will_schedule)) ++ ++ ++#define NO_CPU 0xffffffff ++ ++/* The gsnedf_lock is used to serialize all scheduling events. ++ * It protects ++ */ ++static DEFINE_SPINLOCK(gsnedf_lock); ++/* the cpus queue themselves according to priority in here */ ++static LIST_HEAD(gsnedf_cpu_queue); ++ ++static rt_domain_t gsnedf; ++ ++ ++/* update_cpu_position - Move the cpu entry to the correct place to maintain ++ * order in the cpu queue. Caller must hold gsnedf lock. ++ * ++ * This really should be a heap. 
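
Aside: the cpu_entry_t queue introduced above (and maintained by update_cpu_position() just below) keeps CPUs ordered by the priority of their linked job, so the tail is always the CPU running the lowest-priority work, i.e. the preemption candidate for a newly arrived job. The user-space sketch below is not from the patch and uses a linear scan instead of the ordered list; as the comment notes, a heap would serve the same purpose.

    #include <stdio.h>

    #define NCPU 4

    struct toy_cpu {
        int cpu;
        unsigned long long linked_deadline;  /* ~0ULL means "idle" */
    };

    /* return the index of the CPU to preempt: an idle CPU if there is one,
     * otherwise the CPU whose linked job has the latest (lowest-priority)
     * deadline */
    static int find_preemption_target(struct toy_cpu cpus[], int n)
    {
        int worst = 0;
        for (int i = 1; i < n; i++)
            if (cpus[i].linked_deadline > cpus[worst].linked_deadline)
                worst = i;
        return worst;
    }

    int main(void)
    {
        struct toy_cpu cpus[NCPU] = {
            { 0, 100 }, { 1, 400 }, { 2, ~0ULL /* idle */ }, { 3, 250 },
        };
        unsigned long long new_job_deadline = 300;
        int target = find_preemption_target(cpus, NCPU);

        if (new_job_deadline < cpus[target].linked_deadline)
            printf("link new job to cpu %d\n", cpus[target].cpu);  /* cpu 2 */
        else
            printf("new job waits in the ready queue\n");
        return 0;
    }
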
++ */ ++static void update_cpu_position(cpu_entry_t *entry) ++{ ++ cpu_entry_t *other; ++ struct list_head *pos; ++ ++ if (likely(in_list(&entry->list))) ++ list_del(&entry->list); ++ /* if we do not execute real-time jobs we just move ++ * to the end of the queue ++ */ ++ if (entry->linked) { ++ list_for_each(pos, &gsnedf_cpu_queue) { ++ other = list_entry(pos, cpu_entry_t, list); ++ if (edf_higher_prio(entry->linked, other->linked)) { ++ __list_add(&entry->list, pos->prev, pos); ++ return; ++ } ++ } ++ } ++ /* if we get this far we have the lowest priority job */ ++ list_add_tail(&entry->list, &gsnedf_cpu_queue); ++} ++ ++/* link_task_to_cpu - Update the link of a CPU. ++ * Handles the case where the to-be-linked task is already ++ * scheduled on a different CPU. ++ */ ++static noinline void link_task_to_cpu(struct task_struct* linked, ++ cpu_entry_t *entry) ++{ ++ cpu_entry_t *sched; ++ struct task_struct* tmp; ++ int on_cpu; ++ ++ BUG_ON(linked && !is_realtime(linked)); ++ ++ /* Currently linked task is set to be unlinked. */ ++ if (entry->linked) { ++ entry->linked->rt_param.linked_on = NO_CPU; ++ } ++ ++ /* Link new task to CPU. */ ++ if (linked) { ++ set_rt_flags(linked, RT_F_RUNNING); ++ /* handle task is already scheduled somewhere! */ ++ on_cpu = linked->rt_param.scheduled_on; ++ if (on_cpu != NO_CPU) { ++ sched = &per_cpu(gsnedf_cpu_entries, on_cpu); ++ /* this should only happen if not linked already */ ++ BUG_ON(sched->linked == linked); ++ ++ /* If we are already scheduled on the CPU to which we ++ * wanted to link, we don't need to do the swap -- ++ * we just link ourselves to the CPU and depend on ++ * the caller to get things right. ++ */ ++ if (entry != sched) { ++ tmp = sched->linked; ++ linked->rt_param.linked_on = sched->cpu; ++ sched->linked = linked; ++ update_cpu_position(sched); ++ linked = tmp; ++ } ++ } ++ if (linked) /* might be NULL due to swap */ ++ linked->rt_param.linked_on = entry->cpu; ++ } ++ entry->linked = linked; ++ update_cpu_position(entry); ++} ++ ++/* unlink - Make sure a task is not linked any longer to an entry ++ * where it was linked before. Must hold gsnedf_lock. ++ */ ++static noinline void unlink(struct task_struct* t) ++{ ++ cpu_entry_t *entry; ++ ++ if (unlikely(!t)) { ++ TRACE_BUG_ON(!t); ++ return; ++ } ++ ++ if (t->rt_param.linked_on != NO_CPU) { ++ /* unlink */ ++ entry = &per_cpu(gsnedf_cpu_entries, t->rt_param.linked_on); ++ t->rt_param.linked_on = NO_CPU; ++ link_task_to_cpu(NULL, entry); ++ } else if (in_list(&t->rt_list)) { ++ /* This is an interesting situation: t is scheduled, ++ * but was just recently unlinked. It cannot be ++ * linked anywhere else (because then it would have ++ * been relinked to this CPU), thus it must be in some ++ * queue. We must remove it from the list in this ++ * case. ++ */ ++ list_del(&t->rt_list); ++ } ++} ++ ++ ++/* preempt - force a CPU to reschedule ++ */ ++static noinline void preempt(cpu_entry_t *entry) ++{ ++ /* We cannot make the is_np() decision here if it is a remote CPU ++ * because requesting exit_np() requires that we currently use the ++ * address space of the task. Thus, in the remote case we just send ++ * the IPI and let schedule() handle the problem. 
++ */ ++ ++ if (smp_processor_id() == entry->cpu) { ++ if (entry->scheduled && is_np(entry->scheduled)) ++ request_exit_np(entry->scheduled); ++ else ++ set_tsk_need_resched(current); ++ } else ++ /* in case that it is a remote CPU we have to defer the ++ * the decision to the remote CPU ++ * FIXME: We could save a few IPI's here if we leave the flag ++ * set when we are waiting for a np_exit(). ++ */ ++ if (!test_will_schedule(entry->cpu)) ++ smp_send_reschedule(entry->cpu); ++} ++ ++/* requeue - Put an unlinked task into gsn-edf domain. ++ * Caller must hold gsnedf_lock. ++ */ ++static noinline void requeue(struct task_struct* task) ++{ ++ BUG_ON(!task); ++ /* sanity check rt_list before insertion */ ++ BUG_ON(in_list(&task->rt_list)); ++ ++ if (get_rt_flags(task) == RT_F_SLEEP) { ++ /* this task has expired ++ * _schedule has already taken care of updating ++ * the release and ++ * deadline. We just must check if it has been released. ++ */ ++ if (is_released(task, sched_clock())) ++ __add_ready(&gsnedf, task); ++ else { ++ /* it has got to wait */ ++ __add_release(&gsnedf, task); ++ } ++ ++ } else ++ /* this is a forced preemption ++ * thus the task stays in the ready_queue ++ * we only must make it available to others ++ */ ++ __add_ready(&gsnedf, task); ++} ++ ++/* gsnedf_job_arrival: task is either resumed or released */ ++static noinline void gsnedf_job_arrival(struct task_struct* task) ++{ ++ cpu_entry_t* last; ++ ++ BUG_ON(list_empty(&gsnedf_cpu_queue)); ++ BUG_ON(!task); ++ ++ /* first queue arriving job */ ++ requeue(task); ++ ++ /* then check for any necessary preemptions */ ++ last = list_entry(gsnedf_cpu_queue.prev, cpu_entry_t, list); ++ if (edf_preemption_needed(&gsnedf, last->linked)) { ++ /* preemption necessary */ ++ task = __take_ready(&gsnedf); ++ TRACE("job_arrival: task %d linked to %d\n", ++ task->pid, last->cpu); ++ if (last->linked) ++ requeue(last->linked); ++ ++ link_task_to_cpu(task, last); ++ preempt(last); ++ } ++} ++ ++/* check for current job releases */ ++static noinline void gsnedf_release_jobs(void) ++{ ++ struct list_head *pos, *save; ++ struct task_struct *queued; ++ lt_t now = sched_clock(); ++ ++ ++ list_for_each_safe(pos, save, &gsnedf.release_queue) { ++ queued = list_entry(pos, struct task_struct, rt_list); ++ if (likely(is_released(queued, now))) { ++ /* this one is ready to go*/ ++ list_del(pos); ++ set_rt_flags(queued, RT_F_RUNNING); ++ ++ sched_trace_job_release(queued); ++ gsnedf_job_arrival(queued); ++ } ++ else ++ /* the release queue is ordered */ ++ break; ++ } ++} ++ ++/* gsnedf_scheduler_tick - this function is called for every local timer ++ * interrupt. 
++ * ++ * checks whether the current task has expired and checks ++ * whether we need to preempt it if it has not expired ++ */ ++static void gsnedf_scheduler_tick(void) ++{ ++ unsigned long flags; ++ struct task_struct* t = current; ++ ++ if (is_realtime(t) && budget_exhausted(t)) { ++ if (!is_np(t)) { ++ /* np tasks will be preempted when they become ++ * preemptable again ++ */ ++ set_tsk_need_resched(t); ++ set_will_schedule(); ++ TRACE("gsnedf_scheduler_tick: " ++ "%d is preemptable " ++ " => FORCE_RESCHED\n", t->pid); ++ } else { ++ TRACE("gsnedf_scheduler_tick: " ++ "%d is non-preemptable, " ++ "preemption delayed.\n", t->pid); ++ request_exit_np(t); ++ } ++ } ++ ++ /* only the first CPU needs to release jobs */ ++ if (smp_processor_id() == 0) { ++ spin_lock_irqsave(&gsnedf_lock, flags); ++ ++ /* Try to release pending jobs */ ++ gsnedf_release_jobs(); ++ ++ /* We don't need to check linked != scheduled since ++ * set_tsk_need_resched has been set by preempt() if necessary. ++ */ ++ ++ spin_unlock_irqrestore(&gsnedf_lock, flags); ++ } ++} ++ ++/* caller holds gsnedf_lock */ ++static noinline void job_completion(struct task_struct *t) ++{ ++ BUG_ON(!t); ++ ++ sched_trace_job_completion(t); ++ ++ TRACE_TASK(t, "job_completion().\n"); ++ ++ /* set flags */ ++ set_rt_flags(t, RT_F_SLEEP); ++ /* prepare for next period */ ++ prepare_for_next_period(t); ++ /* unlink */ ++ unlink(t); ++ /* requeue ++ * But don't requeue a blocking task. */ ++ if (is_running(t)) ++ gsnedf_job_arrival(t); ++} ++ ++ ++/* Getting schedule() right is a bit tricky. schedule() may not make any ++ * assumptions on the state of the current task since it may be called for a ++ * number of reasons. The reasons include a scheduler_tick() determined that it ++ * was necessary, because sys_exit_np() was called, because some Linux ++ * subsystem determined so, or even (in the worst case) because there is a bug ++ * hidden somewhere. Thus, we must take extreme care to determine what the ++ * current state is. ++ * ++ * The CPU could currently be scheduling a task (or not), be linked (or not). ++ * ++ * The following assertions for the scheduled task could hold: ++ * ++ * - !is_running(scheduled) // the job blocks ++ * - scheduled->timeslice == 0 // the job completed (forcefully) ++ * - get_rt_flag() == RT_F_SLEEP // the job completed (by syscall) ++ * - linked != scheduled // we need to reschedule (for any reason) ++ * - is_np(scheduled) // rescheduling must be delayed, ++ * sys_exit_np must be requested ++ * ++ * Any of these can occur together. ++ */ ++static int gsnedf_schedule(struct task_struct * prev, ++ struct task_struct ** next) ++{ ++ cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries); ++ int out_of_time, sleep, preempt, np, exists, blocks; ++ ++ /* Will be released in finish_switch. */ ++ spin_lock(&gsnedf_lock); ++ clear_will_schedule(); ++ ++ /* sanity checking */ ++ BUG_ON(entry->scheduled && entry->scheduled != prev); ++ BUG_ON(entry->scheduled && !is_realtime(prev)); ++ BUG_ON(is_realtime(prev) && !entry->scheduled); ++ ++ /* (0) Determine state */ ++ exists = entry->scheduled != NULL; ++ blocks = exists && !is_running(entry->scheduled); ++ out_of_time = exists && budget_exhausted(entry->scheduled); ++ np = exists && is_np(entry->scheduled); ++ sleep = exists && get_rt_flags(entry->scheduled) == RT_F_SLEEP; ++ preempt = entry->scheduled != entry->linked; ++ ++ /* If a task blocks we have no choice but to reschedule. 
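
Aside: gsnedf_schedule() above boils its decision down to six boolean state flags. The simplified user-space sketch below is not from the patch; it only replays the flag combinations and the actions they trigger, whereas the real function additionally manipulates the link structure and ready queue under gsnedf_lock.

    #include <stdio.h>

    struct sched_state {
        int exists;       /* a real-time job is currently scheduled */
        int blocks;       /* ...and it just blocked */
        int out_of_time;  /* ...and its budget is exhausted */
        int np;           /* ...and it is in a non-preemptive section */
        int sleep;        /* ...and it signalled job completion */
        int preempt;      /* linked job differs from scheduled job */
    };

    static void decide(struct sched_state s)
    {
        if (s.np && (s.out_of_time || s.preempt || s.sleep))
            printf("ask the job to leave its non-preemptive section\n");
        if (!s.np && (s.out_of_time || s.sleep))
            printf("complete the job and prepare its next release\n");
        if ((!s.np || s.blocks) && s.preempt)
            printf("switch to the linked job\n");
        else if (s.exists)
            printf("keep running the current job\n");
        else
            printf("let Linux schedule a background task\n");
    }

    int main(void)
    {
        decide((struct sched_state){ 1, 0, 0, 0, 1, 1 });  /* job completed */
        decide((struct sched_state){ 1, 0, 1, 1, 0, 1 });  /* np job over budget */
        return 0;
    }
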
++ */ ++ if (blocks) ++ unlink(entry->scheduled); ++ ++ /* Request a sys_exit_np() call if we would like to preempt but cannot. ++ * We need to make sure to update the link structure anyway in case ++ * that we are still linked. Multiple calls to request_exit_np() don't ++ * hurt. ++ */ ++ if (np && (out_of_time || preempt || sleep)) { ++ unlink(entry->scheduled); ++ request_exit_np(entry->scheduled); ++ } ++ ++ /* Any task that is preemptable and either exhausts its execution ++ * budget or wants to sleep completes. We may have to reschedule after ++ * this. ++ */ ++ if (!np && (out_of_time || sleep)) ++ job_completion(entry->scheduled); ++ ++ /* Link pending task if we became unlinked. ++ */ ++ if (!entry->linked) ++ link_task_to_cpu(__take_ready(&gsnedf), entry); ++ ++ /* The final scheduling decision. Do we need to switch for some reason? ++ * If linked different from scheduled select linked as next. ++ */ ++ if ((!np || blocks) && ++ entry->linked != entry->scheduled) { ++ /* Schedule a linked job? */ ++ if (entry->linked) ++ *next = entry->linked; ++ } else ++ /* Only override Linux scheduler if we have real-time task ++ * scheduled that needs to continue. ++ */ ++ if (exists) ++ *next = prev; ++ ++ spin_unlock(&gsnedf_lock); ++ ++ /* don't race with a concurrent switch */ ++ if (*next && prev != *next) ++ while ((*next)->rt_param.scheduled_on != NO_CPU) ++ cpu_relax(); ++ return 0; ++} ++ ++ ++/* _finish_switch - we just finished the switch away from prev ++ */ ++static void gsnedf_finish_switch(struct task_struct *prev) ++{ ++ cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries); ++ ++ entry->scheduled = is_realtime(current) ? current : NULL; ++ ++ prev->rt_param.scheduled_on = NO_CPU; ++ current->rt_param.scheduled_on = smp_processor_id(); ++} ++ ++ ++/* Prepare a task for running in RT mode ++ * Enqueues the task into master queue data structure ++ * returns ++ * -EPERM if task is not TASK_STOPPED ++ */ ++static long gsnedf_prepare_task(struct task_struct * t) ++{ ++ unsigned long flags; ++ TRACE("gsn edf: prepare task %d\n", t->pid); ++ ++ if (t->state == TASK_STOPPED) { ++ t->rt_param.scheduled_on = NO_CPU; ++ t->rt_param.linked_on = NO_CPU; ++ ++ /* delay by 1ms */ ++ release_at(t, sched_clock() + 1000000); ++ ++ /* The task should be running in the queue, otherwise signal ++ * code will try to wake it up with fatal consequences. ++ */ ++ t->state = TASK_RUNNING; ++ spin_lock_irqsave(&gsnedf_lock, flags); ++ t->rt_param.litmus_controlled = 1; ++ requeue(t); ++ spin_unlock_irqrestore(&gsnedf_lock, flags); ++ return 0; ++ } ++ else ++ return -EPERM; ++} ++ ++static void gsnedf_wake_up_task(struct task_struct *task) ++{ ++ unsigned long flags; ++ lt_t now; ++ /* We must determine whether task should go into the release ++ * queue or into the ready queue. It may enter the ready queue ++ * if it has credit left in its time slice and has not yet reached ++ * its deadline. If it is now passed its deadline we assume this the ++ * arrival of a new sporadic job and thus put it in the ready queue ++ * anyway.If it has zero budget and the next release is in the future ++ * it has to go to the release queue. ++ */ ++ TRACE("gsnedf: %d unsuspends with budget=%d\n", ++ task->pid, task->time_slice); ++ ++ spin_lock_irqsave(&gsnedf_lock, flags); ++ if (!task->rt_param.litmus_controlled) { ++ task->rt_param.litmus_controlled = 1; ++ /* We need to take suspensions because of semaphores into ++ * account! 
If a job resumes after being suspended due to acquiring ++ * a semaphore, it should never be treated as a new job release. ++ */ ++ if (get_rt_flags(task) == RT_F_EXIT_SEM) { ++ set_rt_flags(task, RT_F_RUNNING); ++ } else { ++ now = sched_clock(); ++ if (is_tardy(task, now)) { ++ /* new sporadic release */ ++ release_at(task, now); ++ sched_trace_job_release(task); ++ } ++ else if (task->time_slice) ++ /* came back in time before deadline ++ */ ++ set_rt_flags(task, RT_F_RUNNING); ++ } ++ task->state = TASK_RUNNING; ++ gsnedf_job_arrival(task); ++ } ++ spin_unlock_irqrestore(&gsnedf_lock, flags); ++} ++ ++static void gsnedf_task_blocks(struct task_struct *t) ++{ ++ unsigned long flags; ++ ++ /* unlink if necessary */ ++ spin_lock_irqsave(&gsnedf_lock, flags); ++ unlink(t); ++ t->rt_param.litmus_controlled = 0; ++ spin_unlock_irqrestore(&gsnedf_lock, flags); ++ ++ BUG_ON(!is_realtime(t)); ++ TRACE("task %d suspends with budget=%d\n", t->pid, t->time_slice); ++ BUG_ON(t->rt_list.next != LIST_POISON1); ++ BUG_ON(t->rt_list.prev != LIST_POISON2); ++} ++ ++ ++/* When _tear_down is called, the task should not be in any queue any more ++ * as it must have blocked first. We don't have any internal state for the task, ++ * it is all in the task_struct. ++ */ ++static long gsnedf_tear_down(struct task_struct * t) ++{ ++ BUG_ON(!is_realtime(t)); ++ TRACE_TASK(t, "RIP\n"); ++ BUG_ON(t->array); ++ BUG_ON(t->rt_list.next != LIST_POISON1); ++ BUG_ON(t->rt_list.prev != LIST_POISON2); ++ return 0; ++} ++ ++static long gsnedf_pi_block(struct pi_semaphore *sem, ++ struct task_struct *new_waiter) ++{ ++ /* This callback has to handle the situation where a new waiter is ++ * added to the wait queue of the semaphore. ++ * ++ * We must check if has a higher priority than the currently ++ * highest-priority task, and then potentially reschedule. ++ */ ++ ++ BUG_ON(!new_waiter); ++ ++ if (edf_higher_prio(new_waiter, sem->hp.task)) { ++ TRACE_TASK(new_waiter, " boosts priority\n"); ++ /* called with IRQs disabled */ ++ spin_lock(&gsnedf_lock); ++ /* store new highest-priority task */ ++ sem->hp.task = new_waiter; ++ if (sem->holder) { ++ /* let holder inherit */ ++ sem->holder->rt_param.inh_task = new_waiter; ++ unlink(sem->holder); ++ gsnedf_job_arrival(sem->holder); ++ } ++ spin_unlock(&gsnedf_lock); ++ } ++ ++ return 0; ++} ++ ++static long gsnedf_inherit_priority(struct pi_semaphore *sem, ++ struct task_struct *new_owner) ++{ ++ /* We don't need to acquire the gsnedf_lock since at the time of this ++ * call new_owner isn't actually scheduled yet (it's still sleeping) ++ * and since the calling function already holds sem->wait.lock, which ++ * prevents concurrent sem->hp.task changes. ++ */ ++ ++ if (sem->hp.task && sem->hp.task != new_owner) { ++ new_owner->rt_param.inh_task = sem->hp.task; ++ TRACE_TASK(new_owner, "inherited priority from %s/%d\n", ++ sem->hp.task->comm, sem->hp.task->pid); ++ } else ++ TRACE_TASK(new_owner, ++ "cannot inherit priority, " ++ "no higher priority job waits.\n"); ++ return 0; ++} ++ ++/* This function is called on a semaphore release, and assumes that ++ * the current task is also the semaphore holder. ++ */ ++static long gsnedf_return_priority(struct pi_semaphore *sem) ++{ ++ struct task_struct* t = current; ++ int ret = 0; ++ ++ /* Find new highest-priority semaphore task ++ * if holder task is the current hp.task. ++ * ++ * Calling function holds sem->wait.lock. 
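
Aside: the PI callbacks above (gsnedf_pi_block(), gsnedf_inherit_priority()) and gsnedf_return_priority() just below all revolve around the rt_param.inh_task field: a semaphore holder runs at the priority of the recorded highest-priority waiter until the inheritance is cleared on release. The user-space sketch below is not from the patch and uses toy types; it shows only that bookkeeping.

    #include <stdio.h>

    struct toy_task {
        const char *name;
        unsigned long long deadline;   /* EDF: earlier deadline = higher prio */
        struct toy_task *inh_task;     /* task we currently inherit from */
    };

    static unsigned long long effective_deadline(const struct toy_task *t)
    {
        if (t->inh_task && t->inh_task->deadline < t->deadline)
            return t->inh_task->deadline;
        return t->deadline;
    }

    int main(void)
    {
        struct toy_task holder = { "holder", 500, NULL };
        struct toy_task waiter = { "waiter", 120, NULL };

        /* pi_block: a higher-priority task starts waiting on the semaphore */
        holder.inh_task = &waiter;
        printf("%s runs at deadline %llu\n", holder.name,
               effective_deadline(&holder));      /* 120 */

        /* return_priority: the semaphore is released */
        holder.inh_task = NULL;
        printf("%s runs at deadline %llu\n", holder.name,
               effective_deadline(&holder));      /* 500 */
        return 0;
    }
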
++ */ ++ if (t == sem->hp.task) ++ set_hp_task(sem, edf_higher_prio); ++ ++ TRACE_CUR("gsnedf_return_priority for lock %p\n", sem); ++ ++ if (t->rt_param.inh_task) { ++ /* interrupts already disabled by PI code */ ++ spin_lock(&gsnedf_lock); ++ ++ /* Reset inh_task to NULL. */ ++ t->rt_param.inh_task = NULL; ++ ++ /* Check if rescheduling is necessary */ ++ unlink(t); ++ gsnedf_job_arrival(t); ++ spin_unlock(&gsnedf_lock); ++ } ++ ++ return ret; ++} ++ ++/* Plugin object */ ++static struct sched_plugin gsn_edf_plugin __cacheline_aligned_in_smp = { ++ .plugin_name = "GSN-EDF", ++ .scheduler_tick = gsnedf_scheduler_tick, ++ .prepare_task = gsnedf_prepare_task, ++ .sleep_next_period = complete_job, ++ .tear_down = gsnedf_tear_down, ++ .schedule = gsnedf_schedule, ++ .finish_switch = gsnedf_finish_switch, ++ .wake_up_task = gsnedf_wake_up_task, ++ .task_blocks = gsnedf_task_blocks, ++ .inherit_priority = gsnedf_inherit_priority, ++ .return_priority = gsnedf_return_priority, ++ .pi_block = gsnedf_pi_block ++}; ++ ++ ++static int __init init_gsn_edf(void) ++{ ++ int cpu; ++ cpu_entry_t *entry; ++ ++ /* initialize CPU state */ ++ for (cpu = 0; cpu < NR_CPUS; cpu++) { ++ entry = &per_cpu(gsnedf_cpu_entries, cpu); ++ atomic_set(&entry->will_schedule, 0); ++ entry->linked = NULL; ++ entry->scheduled = NULL; ++ entry->cpu = cpu; ++ INIT_LIST_HEAD(&entry->list); ++ } ++ ++ edf_domain_init(&gsnedf, NULL); ++ return register_sched_plugin(&gsn_edf_plugin); ++} ++ ++ ++module_init(init_gsn_edf); +diff --git a/litmus/sched_plugin.c b/litmus/sched_plugin.c +new file mode 100644 +index 0000000..f05fc56 +--- /dev/null ++++ b/litmus/sched_plugin.c +@@ -0,0 +1,169 @@ ++/* sched_plugin.c -- core infrastructure for the scheduler plugin system ++ * ++ * This file includes the initialization of the plugin system, the no-op Linux ++ * scheduler plugin and some dummy functions. ++ */ ++ ++#include <linux/list.h> ++#include <linux/spinlock.h> ++ ++#include <litmus/litmus.h> ++#include <litmus/sched_plugin.h> ++ ++ ++/************************************************************* ++ * Dummy plugin functions * ++ *************************************************************/ ++ ++static void litmus_dummy_finish_switch(struct task_struct * prev) ++{ ++} ++ ++static int litmus_dummy_schedule(struct task_struct * prev, ++ struct task_struct** next) ++{ ++ return 0; ++} ++ ++static void litmus_dummy_scheduler_tick(void) ++{ ++} ++ ++static long litmus_dummy_prepare_task(struct task_struct *t) ++{ ++ return -ENOSYS; ++} ++ ++static void litmus_dummy_wake_up_task(struct task_struct *task) ++{ ++ printk(KERN_WARNING "task %d: unhandled real-time wake up!\n", ++ task->pid); ++} ++ ++static void litmus_dummy_task_blocks(struct task_struct *task) ++{ ++} ++ ++static long litmus_dummy_tear_down(struct task_struct *task) ++{ ++ return 0; ++} ++ ++static long litmus_dummy_sleep_next_period(void) ++{ ++ return -ENOSYS; ++} ++ ++static long litmus_dummy_inherit_priority(struct pi_semaphore *sem, ++ struct task_struct *new_owner) ++{ ++ return -ENOSYS; ++} ++ ++static long litmus_dummy_return_priority(struct pi_semaphore *sem) ++{ ++ return -ENOSYS; ++} ++ ++static long litmus_dummy_pi_block(struct pi_semaphore *sem, ++ struct task_struct *new_waiter) ++{ ++ return -ENOSYS; ++} ++ ++ ++/* The default scheduler plugin. It doesn't do anything and lets Linux do its ++ * job. 
++ */ ++struct sched_plugin linux_sched_plugin = { ++ .plugin_name = "Linux", ++ .scheduler_tick = litmus_dummy_scheduler_tick, ++ .prepare_task = litmus_dummy_prepare_task, ++ .tear_down = litmus_dummy_tear_down, ++ .wake_up_task = litmus_dummy_wake_up_task, ++ .task_blocks = litmus_dummy_task_blocks, ++ .sleep_next_period = litmus_dummy_sleep_next_period, ++ .schedule = litmus_dummy_schedule, ++ .finish_switch = litmus_dummy_finish_switch, ++ .inherit_priority = litmus_dummy_inherit_priority, ++ .return_priority = litmus_dummy_return_priority, ++ .pi_block = litmus_dummy_pi_block ++}; ++ ++/* ++ * The reference to current plugin that is used to schedule tasks within ++ * the system. It stores references to actual function implementations ++ * Should be initialized by calling "init_***_plugin()" ++ */ ++struct sched_plugin *curr_sched_plugin = &linux_sched_plugin; ++ ++/* the list of registered scheduling plugins */ ++static LIST_HEAD(sched_plugins); ++static DEFINE_SPINLOCK(sched_plugins_lock); ++ ++#define CHECK(func) {\ ++ if (!plugin->func) \ ++ plugin->func = litmus_dummy_ ## func;} ++ ++/* FIXME: get reference to module */ ++int register_sched_plugin(struct sched_plugin* plugin) ++{ ++ printk(KERN_INFO "Registering LITMUS^RT plugin %s.\n", ++ plugin->plugin_name); ++ ++ /* make sure we don't trip over null pointers later */ ++ CHECK(finish_switch); ++ CHECK(schedule); ++ CHECK(scheduler_tick); ++ CHECK(wake_up_task); ++ CHECK(tear_down); ++ CHECK(task_blocks); ++ CHECK(prepare_task); ++ CHECK(sleep_next_period); ++ CHECK(inherit_priority); ++ CHECK(return_priority); ++ CHECK(pi_block); ++ ++ spin_lock(&sched_plugins_lock); ++ list_add(&plugin->list, &sched_plugins); ++ spin_unlock(&sched_plugins_lock); ++ ++ return 0; ++} ++ ++ ++/* FIXME: reference counting, etc. */ ++struct sched_plugin* find_sched_plugin(const char* name) ++{ ++ struct list_head *pos; ++ struct sched_plugin *plugin; ++ ++ spin_lock(&sched_plugins_lock); ++ list_for_each(pos, &sched_plugins) { ++ plugin = list_entry(pos, struct sched_plugin, list); ++ if (!strcmp(plugin->plugin_name, name)) ++ goto out_unlock; ++ } ++ plugin = NULL; ++ ++out_unlock: ++ spin_unlock(&sched_plugins_lock); ++ return plugin; ++} ++ ++int print_sched_plugins(char* buf, int max) ++{ ++ int count = 0; ++ struct list_head *pos; ++ struct sched_plugin *plugin; ++ ++ spin_lock(&sched_plugins_lock); ++ list_for_each(pos, &sched_plugins) { ++ plugin = list_entry(pos, struct sched_plugin, list); ++ count += snprintf(buf + count, max - count, "%s\n", plugin->plugin_name); ++ if (max - count <= 0) ++ break; ++ } ++ spin_unlock(&sched_plugins_lock); ++ return count; ++} +diff --git a/litmus/sched_psn_edf.c b/litmus/sched_psn_edf.c +new file mode 100644 +index 0000000..27f4b5c +--- /dev/null ++++ b/litmus/sched_psn_edf.c +@@ -0,0 +1,458 @@ ++ ++/* ++ * kernel/sched_psn_edf.c ++ * ++ * Implementation of the PSN-EDF scheduler plugin. ++ * Based on kern/sched_part_edf.c and kern/sched_gsn_edf.c. ++ * ++ * Suspensions and non-preemptable sections are supported. ++ * Priority inheritance is not supported. 
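
Aside: register_sched_plugin() above uses the CHECK() macro to point every callback a plugin left NULL at a harmless litmus_dummy_* fallback before the plugin is published, so callers never need NULL checks. The user-space sketch below is not from the patch and its names are invented; it shows the same default-filling idiom for a two-entry ops table.

    #include <stdio.h>

    struct toy_plugin {
        const char *name;
        void (*tick)(void);
        long (*prepare)(int pid);
    };

    static void dummy_tick(void)       { }
    static long dummy_prepare(int pid) { (void)pid; return -38; /* -ENOSYS */ }

    #define CHECK(p, field, dflt) do { if (!(p)->field) (p)->field = (dflt); } while (0)

    static void register_toy_plugin(struct toy_plugin *p)
    {
        CHECK(p, tick, dummy_tick);
        CHECK(p, prepare, dummy_prepare);
        printf("registered plugin %s\n", p->name);
    }

    int main(void)
    {
        struct toy_plugin partial = { .name = "partial", .tick = NULL, .prepare = NULL };
        register_toy_plugin(&partial);
        partial.tick();                       /* safe: the dummy fills the gap */
        printf("prepare() -> %ld\n", partial.prepare(1));
        return 0;
    }
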
++ */ ++ ++#include <linux/percpu.h> ++#include <linux/sched.h> ++#include <linux/list.h> ++#include <linux/spinlock.h> ++ ++#include <linux/module.h> ++ ++#include <litmus/litmus.h> ++#include <litmus/jobs.h> ++#include <litmus/sched_plugin.h> ++#include <litmus/edf_common.h> ++ ++ ++typedef struct { ++ rt_domain_t domain; ++ int cpu; ++ struct task_struct* scheduled; /* only RT tasks */ ++ spinlock_t lock; /* protects the domain and ++ * serializes scheduling decisions ++ */ ++} psnedf_domain_t; ++ ++DEFINE_PER_CPU(psnedf_domain_t, psnedf_domains); ++ ++#define local_edf (&__get_cpu_var(psnedf_domains).domain) ++#define local_pedf (&__get_cpu_var(psnedf_domains)) ++#define remote_edf(cpu) (&per_cpu(psnedf_domains, cpu).domain) ++#define remote_pedf(cpu) (&per_cpu(psnedf_domains, cpu)) ++#define task_edf(task) remote_edf(get_partition(task)) ++#define task_pedf(task) remote_pedf(get_partition(task)) ++ ++ ++static void psnedf_domain_init(psnedf_domain_t* pedf, ++ check_resched_needed_t check, ++ int cpu) ++{ ++ edf_domain_init(&pedf->domain, check); ++ pedf->cpu = cpu; ++ pedf->lock = SPIN_LOCK_UNLOCKED; ++ pedf->scheduled = NULL; ++} ++ ++static void requeue(struct task_struct* t, rt_domain_t *edf) ++{ ++ /* only requeue if t is actually running */ ++ BUG_ON(!is_running(t)); ++ ++ if (t->state != TASK_RUNNING) ++ TRACE_TASK(t, "requeue: !TASK_RUNNING"); ++ ++ set_rt_flags(t, RT_F_RUNNING); ++ if (is_released(t, sched_clock())) ++ __add_ready(edf, t); ++ else ++ __add_release(edf, t); /* it has got to wait */ ++} ++ ++/* we assume the lock is being held */ ++static void preempt(psnedf_domain_t *pedf) ++{ ++ if (smp_processor_id() == pedf->cpu) { ++ if (pedf->scheduled && is_np(pedf->scheduled)) ++ request_exit_np(pedf->scheduled); ++ else ++ set_tsk_need_resched(current); ++ } else ++ /* in case that it is a remote CPU we have to defer the ++ * the decision to the remote CPU ++ */ ++ smp_send_reschedule(pedf->cpu); ++} ++ ++/* This check is trivial in partioned systems as we only have to consider ++ * the CPU of the partition. ++ */ ++static int psnedf_check_resched(rt_domain_t *edf) ++{ ++ psnedf_domain_t *pedf = container_of(edf, psnedf_domain_t, domain); ++ int ret = 0; ++ ++ /* because this is a callback from rt_domain_t we already hold ++ * the necessary lock for the ready queue ++ */ ++ if (edf_preemption_needed(edf, pedf->scheduled)) { ++ preempt(pedf); ++ ret = 1; ++ } ++ return ret; ++} ++ ++ ++static void psnedf_scheduler_tick(void) ++{ ++ unsigned long flags; ++ struct task_struct *t = current; ++ rt_domain_t *edf = local_edf; ++ psnedf_domain_t *pedf = local_pedf; ++ ++ /* Check for inconsistency. 
We don't need the lock for this since ++ * ->scheduled is only changed in schedule, which obviously is not ++ * executing in parallel on this CPU ++ */ ++ BUG_ON(is_realtime(t) && t != pedf->scheduled); ++ ++ if (is_realtime(t) && budget_exhausted(t)) { ++ if (!is_np(t)) ++ set_tsk_need_resched(t); ++ else { ++ TRACE("psnedf_scheduler_tick: " ++ "%d is non-preemptable, " ++ "preemption delayed.\n", t->pid); ++ request_exit_np(t); ++ } ++ } ++ ++ spin_lock_irqsave(&pedf->lock, flags); ++ __release_pending(edf); ++ if (edf_preemption_needed(edf, t)) ++ set_tsk_need_resched(t); ++ spin_unlock_irqrestore(&pedf->lock, flags); ++} ++ ++static void job_completion(struct task_struct* t) ++{ ++ TRACE_TASK(t, "job_completion().\n"); ++ set_rt_flags(t, RT_F_SLEEP); ++ prepare_for_next_period(t); ++} ++ ++static int psnedf_schedule(struct task_struct * prev, ++ struct task_struct ** next) ++{ ++ psnedf_domain_t* pedf = local_pedf; ++ rt_domain_t* edf = &pedf->domain; ++ ++ int out_of_time, sleep, preempt, ++ np, exists, blocks, resched; ++ ++ spin_lock(&pedf->lock); ++ ++ /* sanity checking */ ++ BUG_ON(pedf->scheduled && pedf->scheduled != prev); ++ BUG_ON(pedf->scheduled && !is_realtime(prev)); ++ ++ /* (0) Determine state */ ++ exists = pedf->scheduled != NULL; ++ blocks = exists && !is_running(pedf->scheduled); ++ out_of_time = exists && budget_exhausted(pedf->scheduled); ++ np = exists && is_np(pedf->scheduled); ++ sleep = exists && get_rt_flags(pedf->scheduled) == RT_F_SLEEP; ++ preempt = edf_preemption_needed(edf, prev); ++ ++ /* If we need to preempt do so. ++ * The following checks set resched to 1 in case of special ++ * circumstances. ++ */ ++ resched = preempt; ++ ++ /* If a task blocks we have no choice but to reschedule. ++ */ ++ if (blocks) ++ resched = 1; ++ ++ /* Request a sys_exit_np() call if we would like to preempt but cannot. ++ * Multiple calls to request_exit_np() don't hurt. ++ */ ++ if (np && (out_of_time || preempt || sleep)) ++ request_exit_np(pedf->scheduled); ++ ++ /* Any task that is preemptable and either exhausts its execution ++ * budget or wants to sleep completes. We may have to reschedule after ++ * this. ++ */ ++ if (!np && (out_of_time || sleep)) { ++ job_completion(pedf->scheduled); ++ resched = 1; ++ } ++ ++ /* The final scheduling decision. Do we need to switch for some reason? ++ * Switch if we are in RT mode and have no task or if we need to ++ * resched. ++ */ ++ *next = NULL; ++ if ((!np || blocks) && (resched || !exists)) { ++ /* Take care of a previously scheduled ++ * job by taking it out of the Linux runqueue. ++ */ ++ if (pedf->scheduled) { ++ /* as opposed to global schedulers that switch without ++ * a lock being held we can requeue already here since ++ * no other CPU will schedule from this domain. ++ */ ++ if (!blocks) ++ requeue(pedf->scheduled, edf); ++ } ++ *next = __take_ready(edf); ++ } else ++ /* Only override Linux scheduler if we have a real-time task ++ * scheduled that needs to continue. 
++ */ ++ if (exists) ++ *next = prev; ++ ++ if (*next) ++ set_rt_flags(*next, RT_F_RUNNING); ++ ++ pedf->scheduled = *next; ++ spin_unlock(&pedf->lock); ++ return 0; ++} ++ ++ ++/* Prepare a task for running in RT mode ++ * Enqueues the task into master queue data structure ++ * returns ++ * -EPERM if task is not TASK_STOPPED ++ */ ++static long psnedf_prepare_task(struct task_struct * t) ++{ ++ rt_domain_t* edf = task_edf(t); ++ psnedf_domain_t* pedf = task_pedf(t); ++ unsigned long flags; ++ ++ TRACE("[%d] psn edf: prepare task %d on CPU %d\n", ++ smp_processor_id(), t->pid, get_partition(t)); ++ if (t->state == TASK_STOPPED) { ++ ++ /* 1ms delay */ ++ release_at(t, sched_clock() + 1000000); ++ ++ /* The task should be running in the queue, otherwise signal ++ * code will try to wake it up with fatal consequences. ++ */ ++ t->state = TASK_RUNNING; ++ spin_lock_irqsave(&pedf->lock, flags); ++ t->rt_param.litmus_controlled = 1; ++ __add_release(edf, t); ++ spin_unlock_irqrestore(&pedf->lock, flags); ++ return 0; ++ } else ++ return -EPERM; ++} ++ ++static void psnedf_wake_up_task(struct task_struct *task) ++{ ++ unsigned long flags; ++ psnedf_domain_t* pedf = task_pedf(task); ++ rt_domain_t* edf = task_edf(task); ++ lt_t now; ++ ++ TRACE("psnedf: %d unsuspends with budget=%d\n", ++ task->pid, task->time_slice); ++ ++ spin_lock_irqsave(&pedf->lock, flags); ++ if (!task->rt_param.litmus_controlled) { ++ BUG_ON(in_list(&task->rt_list)); ++ task->rt_param.litmus_controlled = 1; ++ /* We need to take suspensions because of semaphores into ++ * account! If a job resumes after being suspended due to acquiring ++ * a semaphore, it should never be treated as a new job release. ++ */ ++ now = sched_clock(); ++ if (is_tardy(task, now) && ++ get_rt_flags(task) != RT_F_EXIT_SEM) { ++ /* new sporadic release */ ++ release_at(task, now); ++ sched_trace_job_release(task); ++ } ++ task->state = TASK_RUNNING; ++ requeue(task, edf); ++ } ++ spin_unlock_irqrestore(&pedf->lock, flags); ++} ++ ++static void psnedf_task_blocks(struct task_struct *t) ++{ ++ BUG_ON(!is_realtime(t)); ++ /* not really anything to do since it can only block if ++ * it is running, and when it is not running it is not in any ++ * queue anyway. ++ */ ++ TRACE("task %d blocks with budget=%d\n", t->pid, t->time_slice); ++ BUG_ON(in_list(&t->rt_list)); ++ t->rt_param.litmus_controlled = 0; ++} ++ ++ ++/* When _tear_down is called, the task should not be in any queue any more ++ * as it must have blocked first. We don't have any internal state for the task, ++ * it is all in the task_struct. 
++ */ ++static long psnedf_tear_down(struct task_struct * t) ++{ ++ BUG_ON(!is_realtime(t)); ++ TRACE_TASK(t, "tear down called"); ++ BUG_ON(t->array); ++ BUG_ON(in_list(&t->rt_list)); ++ return 0; ++} ++ ++static long psnedf_pi_block(struct pi_semaphore *sem, ++ struct task_struct *new_waiter) ++{ ++ psnedf_domain_t* pedf; ++ rt_domain_t* edf; ++ struct task_struct* t; ++ int cpu = get_partition(new_waiter); ++ ++ BUG_ON(!new_waiter); ++ ++ if (edf_higher_prio(new_waiter, sem->hp.cpu_task[cpu])) { ++ TRACE_TASK(new_waiter, " boosts priority\n"); ++ pedf = task_pedf(new_waiter); ++ edf = task_edf(new_waiter); ++ ++ /* interrupts already disabled */ ++ spin_lock(&pedf->lock); ++ ++ /* store new highest-priority task */ ++ sem->hp.cpu_task[cpu] = new_waiter; ++ if (sem->holder && ++ get_partition(sem->holder) == get_partition(new_waiter)) { ++ /* let holder inherit */ ++ sem->holder->rt_param.inh_task = new_waiter; ++ t = sem->holder; ++ if (in_list(&t->rt_list)) { ++ /* queued in domain*/ ++ list_del(&t->rt_list); ++ /* readd to make priority change take place */ ++ if (is_released(t, sched_clock())) ++ __add_ready(edf, t); ++ else ++ __add_release(edf, t); ++ } ++ } ++ ++ /* check if we need to reschedule */ ++ if (edf_preemption_needed(edf, current)) ++ preempt(pedf); ++ ++ spin_unlock(&pedf->lock); ++ } ++ ++ return 0; ++} ++ ++static long psnedf_inherit_priority(struct pi_semaphore *sem, ++ struct task_struct *new_owner) ++{ ++ int cpu = get_partition(new_owner); ++ ++ /* FIXME: This doesn't look correct at all! ++ * Why do we inherit in any case??? ++ */ ++ new_owner->rt_param.inh_task = sem->hp.cpu_task[cpu]; ++ if (sem->hp.cpu_task[cpu] && new_owner != sem->hp.cpu_task[cpu]) { ++ TRACE_TASK(new_owner, ++ "inherited priority from %s/%d\n", ++ sem->hp.cpu_task[cpu]->comm, ++ sem->hp.cpu_task[cpu]->pid); ++ } else ++ TRACE_TASK(new_owner, ++ "cannot inherit priority: " ++ "no higher priority job waits on this CPU!\n"); ++ /* make new owner non-preemptable as required by FMLP under ++ * PSN-EDF. ++ */ ++ make_np(new_owner); ++ return 0; ++} ++ ++ ++/* This function is called on a semaphore release, and assumes that ++ * the current task is also the semaphore holder. ++ */ ++static long psnedf_return_priority(struct pi_semaphore *sem) ++{ ++ struct task_struct* t = current; ++ psnedf_domain_t* pedf = task_pedf(t); ++ rt_domain_t* edf = task_edf(t); ++ int ret = 0; ++ int cpu = get_partition(current); ++ ++ ++ /* Find new highest-priority semaphore task ++ * if holder task is the current hp.cpu_task[cpu]. ++ * ++ * Calling function holds sem->wait.lock. ++ */ ++ if (t == sem->hp.cpu_task[cpu]) ++ set_hp_cpu_task(sem, cpu, edf_higher_prio); ++ ++ take_np(t); ++ if (current->rt_param.inh_task) { ++ TRACE_CUR("return priority of %s/%d\n", ++ current->rt_param.inh_task->comm, ++ current->rt_param.inh_task->pid); ++ spin_lock(&pedf->lock); ++ ++ /* Reset inh_task to NULL. 
*/ ++ current->rt_param.inh_task = NULL; ++ ++ /* check if we need to reschedule */ ++ if (edf_preemption_needed(edf, current)) ++ preempt(pedf); ++ ++ spin_unlock(&pedf->lock); ++ } else ++ TRACE_CUR(" no priority to return %p\n", sem); ++ ++ return ret; ++} ++ ++ ++/* Plugin object */ ++static struct sched_plugin psn_edf_plugin __cacheline_aligned_in_smp = { ++ .plugin_name = "PSN-EDF", ++ .srp_active = 1, ++ .scheduler_tick = psnedf_scheduler_tick, ++ .prepare_task = psnedf_prepare_task, ++ .sleep_next_period = complete_job, ++ .tear_down = psnedf_tear_down, ++ .schedule = psnedf_schedule, ++ .wake_up_task = psnedf_wake_up_task, ++ .task_blocks = psnedf_task_blocks, ++ .pi_block = psnedf_pi_block, ++ .inherit_priority = psnedf_inherit_priority, ++ .return_priority = psnedf_return_priority ++}; ++ ++ ++static int __init init_psn_edf(void) ++{ ++ int i; ++ ++ for (i = 0; i < NR_CPUS; i++) ++ { ++ psnedf_domain_init(remote_pedf(i), ++ psnedf_check_resched, i); ++ printk("PSN-EDF: CPU partition %d initialized.\n", i); ++ } ++ return register_sched_plugin(&psn_edf_plugin); ++} ++ ++ ++ ++module_init(init_psn_edf); +diff --git a/litmus/sched_rm.c b/litmus/sched_rm.c +new file mode 100644 +index 0000000..57acde4 +--- /dev/null ++++ b/litmus/sched_rm.c +@@ -0,0 +1,397 @@ ++ ++/* RM implementation. ++ * Will support the M-PCP eventually. ++ */ ++ ++#include <linux/percpu.h> ++#include <linux/sched.h> ++#include <linux/list.h> ++#include <linux/spinlock.h> ++ ++#include <linux/module.h> ++ ++#include <litmus/litmus.h> ++#include <litmus/jobs.h> ++#include <litmus/sched_plugin.h> ++#include <litmus/rm_common.h> ++ ++ ++typedef struct { ++ rt_domain_t domain; ++ int cpu; ++ struct task_struct* scheduled; /* only RT tasks */ ++ spinlock_t lock; /* protects the domain and ++ * serializes scheduling decisions ++ */ ++} rm_domain_t; ++ ++DEFINE_PER_CPU(rm_domain_t, rm_domains); ++ ++#define local_dom (&__get_cpu_var(rm_domains).domain) ++#define local_part (&__get_cpu_var(rm_domains)) ++#define remote_dom(cpu) (&per_cpu(rm_domains, cpu).domain) ++#define remote_part(cpu) (&per_cpu(rm_domains, cpu)) ++#define task_dom(task) remote_dom(get_partition(task)) ++#define task_part(task) remote_part(get_partition(task)) ++ ++ ++static void prm_domain_init(rm_domain_t* part, ++ check_resched_needed_t check, ++ int cpu) ++{ ++ rm_domain_init(&part->domain, check); ++ part->cpu = cpu; ++ part->lock = SPIN_LOCK_UNLOCKED; ++ part->scheduled = NULL; ++} ++ ++static void requeue(struct task_struct* t, rt_domain_t *dom) ++{ ++ /* only requeue if t is actually running */ ++ BUG_ON(!is_running(t)); ++ ++ if (t->state != TASK_RUNNING) ++ TRACE_TASK(t, "requeue: !TASK_RUNNING"); ++ ++ set_rt_flags(t, RT_F_RUNNING); ++ if (is_released(t, sched_clock())) ++ __add_ready(dom, t); ++ else ++ __add_release(dom, t); /* it has got to wait */ ++} ++ ++/* we assume the lock is being held */ ++static void preempt(rm_domain_t *part) ++{ ++ if (smp_processor_id() == part->cpu) { ++ if (part->scheduled && is_np(part->scheduled)) ++ request_exit_np(part->scheduled); ++ else ++ set_tsk_need_resched(current); ++ } else ++ /* in case that it is a remote CPU we have to defer the ++ * the decision to the remote CPU ++ */ ++ smp_send_reschedule(part->cpu); ++} ++ ++/* This check is trivial in partioned systems as we only have to consider ++ * the CPU of the partition. 
++ */ ++static int rm_check_resched(rt_domain_t *dom) ++{ ++ rm_domain_t *part = container_of(dom, rm_domain_t, domain); ++ int ret = 0; ++ ++ /* because this is a callback from rt_domain_t we already hold ++ * the necessary lock for the ready queue ++ */ ++ if (rm_preemption_needed(dom, part->scheduled)) { ++ preempt(part); ++ ret = 1; ++ } ++ return ret; ++} ++ ++static void __rm_set_prio(struct task_struct *t, struct pcp_priority* new_prio, ++ rm_domain_t* part) ++{ ++ t->rt_param.cur_prio = new_prio; ++ if (in_list(&t->rt_list)) { ++ list_del(&t->rt_list); ++ requeue(t, &part->domain); ++ } else ++ rm_check_resched(&part->domain); ++} ++ ++/* call only with IRQs disabled */ ++void rm_set_prio(struct task_struct *t, struct pcp_priority* new_prio) ++{ ++ unsigned long flags; ++ rm_domain_t *part = task_part(t); ++ ++ BUG_ON(!is_realtime(t)); ++ spin_lock_irqsave(&part->lock, flags); ++ __rm_set_prio(t, new_prio, part); ++ spin_unlock_irqrestore(&part->lock, flags); ++} ++ ++static void rm_scheduler_tick(void) ++{ ++ unsigned long flags; ++ struct task_struct *t = current; ++ rt_domain_t *dom = local_dom; ++ rm_domain_t *part = local_part; ++ ++ /* Check for inconsistency. We don't need the lock for this since ++ * ->scheduled is only changed in schedule, which obviously is not ++ * executing in parallel on this CPU ++ */ ++ BUG_ON(is_realtime(t) && t != part->scheduled); ++ ++/* if (is_realtime(t) && budget_exhausted(t)) { ++ if (!is_np(t)) ++ set_tsk_need_resched(t); ++ else { ++ TRACE("rm_scheduler_tick: " ++ "%d is non-preemptable, " ++ "preemption delayed.\n", t->pid); ++ request_exit_np(t); ++ } ++ } ++*/ ++ spin_lock_irqsave(&part->lock, flags); ++ __release_pending(dom); ++ if (rm_preemption_needed(dom, t)) ++ set_tsk_need_resched(t); ++ spin_unlock_irqrestore(&part->lock, flags); ++} ++ ++static void job_completion(struct task_struct* t) ++{ ++ TRACE_TASK(t, "job_completion().\n"); ++ set_rt_flags(t, RT_F_SLEEP); ++ prepare_for_next_period(t); ++} ++ ++static int rm_schedule(struct task_struct * prev, ++ struct task_struct ** next) ++{ ++ rm_domain_t* part = local_part; ++ rt_domain_t* dom = &part->domain; ++ ++ int sleep, preempt, ++ np, exists, blocks, resched; ++// int out_of_time; ++ ++ spin_lock(&part->lock); ++ ++ /* sanity checking */ ++ BUG_ON(part->scheduled && part->scheduled != prev); ++ BUG_ON(part->scheduled && !is_realtime(prev)); ++ ++ /* (0) Determine state */ ++ exists = part->scheduled != NULL; ++ blocks = exists && !is_running(part->scheduled); ++// out_of_time = exists && budget_exhausted(part->scheduled); ++#define out_of_time 0 ++ np = exists && is_np(part->scheduled); ++ sleep = exists && get_rt_flags(part->scheduled) == RT_F_SLEEP; ++ preempt = rm_preemption_needed(dom, prev); ++ ++ /* If we need to preempt do so. ++ * The following checks set resched to 1 in case of special ++ * circumstances. ++ */ ++ resched = preempt; ++ ++ /* If a task blocks we have no choice but to reschedule. ++ */ ++ if (blocks) ++ resched = 1; ++ ++ /* Request a sys_exit_np() call if we would like to preempt but cannot. ++ * Multiple calls to request_exit_np() don't hurt. ++ */ ++ if (np && (out_of_time || preempt || sleep)) ++ request_exit_np(part->scheduled); ++ ++ /* Any task that is preemptable and either exhausts its execution ++ * budget or wants to sleep completes. We may have to reschedule after ++ * this. ++ */ ++ if (!np && (out_of_time || sleep)) { ++ job_completion(part->scheduled); ++ resched = 1; ++ } ++ ++ /* The final scheduling decision. 
Do we need to switch for some reason? ++ * Switch if we are in RT mode and have no task or if we need to ++ * resched. ++ */ ++ *next = NULL; ++ if ((!np || blocks) && (resched || !exists)) { ++ /* Take care of a previously scheduled ++ * job by taking it out of the Linux runqueue. ++ */ ++ if (part->scheduled) { ++ /* as opposed to global schedulers that switch without ++ * a lock being held we can requeue already here since ++ * no other CPU will schedule from this domain. ++ */ ++ if (!blocks) ++ requeue(part->scheduled, dom); ++ } ++ *next = __take_ready(dom); ++ } else ++ /* Only override Linux scheduler if we have a real-time task ++ * scheduled that needs to continue. ++ */ ++ if (exists) ++ *next = prev; ++ ++ if (*next) ++ set_rt_flags(*next, RT_F_RUNNING); ++ ++ part->scheduled = *next; ++ spin_unlock(&part->lock); ++ return 0; ++} ++ ++ ++/* Prepare a task for running in RT mode ++ * Enqueues the task into master queue data structure ++ * returns ++ * -EPERM if task is not TASK_STOPPED ++ */ ++static long rm_prepare_task(struct task_struct * t) ++{ ++ rt_domain_t* dom = task_dom(t); ++ rm_domain_t* part = task_part(t); ++ unsigned long flags; ++ ++ TRACE("[%d] P-RM: prepare task %d on CPU %d\n", ++ smp_processor_id(), t->pid, get_partition(t)); ++ if (t->state == TASK_STOPPED) { ++//FIXME if (!t->rt_param.task_params.prio) { ++ TRACE_TASK(t, "using rate-monotonic prio assignment\n"); ++ t->rt_param.pcp_prio.prio = get_rt_period(t); ++// } else { ++// TRACE_TASK(t, "using user-defined static prio assignment\n"); ++// t->rt_param.pcp_prio.prio = t->rt_param.task_params.prio; ++// } ++ t->rt_param.pcp_prio.in_global_cs = 0; ++ t->rt_param.pcp_prio.pid = t->pid; ++ t->rt_param.cur_prio = &t->rt_param.pcp_prio; ++ INIT_LIST_HEAD(&t->rt_param.owned_semaphores); ++ /* 1ms delay */ ++ release_at(t, sched_clock() + 1000000); ++ ++ /* The task should be running in the queue, otherwise signal ++ * code will try to wake it up with fatal consequences. ++ */ ++ t->state = TASK_RUNNING; ++ ++ spin_lock_irqsave(&part->lock, flags); ++ t->rt_param.litmus_controlled = 1; ++ __add_release(dom, t); ++ spin_unlock_irqrestore(&part->lock, flags); ++ return 0; ++ } else ++ return -EPERM; ++} ++ ++static void rm_wake_up_task(struct task_struct *task) ++{ ++ unsigned long flags; ++ rm_domain_t* part = task_part(task); ++ rt_domain_t* dom = task_dom(task); ++ ++ TRACE_TASK(task, "P-RM: %d unsuspends.\n"); ++ ++ spin_lock_irqsave(&part->lock, flags); ++ if (!task->rt_param.litmus_controlled) { ++ BUG_ON(in_list(&task->rt_list)); ++ task->rt_param.litmus_controlled = 1; ++ task->state = TASK_RUNNING; ++ requeue(task, dom); ++ } ++ spin_unlock_irqrestore(&part->lock, flags); ++} ++ ++static void rm_task_blocks(struct task_struct *t) ++{ ++ BUG_ON(!is_realtime(t)); ++ /* not really anything to do since it can only block if ++ * it is running, and when it is not running it is not in any ++ * queue anyway. ++ */ ++ TRACE("task %d blocks with budget=%d\n", t->pid, t->time_slice); ++ BUG_ON(in_list(&t->rt_list)); ++ t->rt_param.litmus_controlled = 0; ++} ++ ++ ++/* When _tear_down is called, the task should not be in any queue any more ++ * as it must have blocked first. We don't have any internal state for the task, ++ * it is all in the task_struct. 
++ */ ++static long rm_tear_down(struct task_struct * t) ++{ ++ BUG_ON(!is_realtime(t)); ++ TRACE_TASK(t, "tear down called"); ++ BUG_ON(t->array); ++ BUG_ON(in_list(&t->rt_list)); ++ return 0; ++} ++ ++static struct pcp_priority boosted = {0, 1, INT_MAX}; ++ ++static long rm_pi_block(struct pi_semaphore *sem, ++ struct task_struct *new_waiter) ++{ ++ return 0; ++} ++ ++static long rm_inherit_priority(struct pi_semaphore *sem, ++ struct task_struct *new_owner) ++{ ++ rm_set_prio(new_owner, &boosted); ++ TRACE_TASK(new_owner, "priority boosted"); ++ make_np(new_owner); ++ return 0; ++} ++ ++ ++/* This function is called on a semaphore release, and assumes that ++ * the current task is also the semaphore holder. ++ */ ++static long rm_return_priority(struct pi_semaphore *sem) ++{ ++ struct task_struct* t = current; ++ ++ take_np(t); ++ /* reset prio to trigger resched if required */ ++ rm_set_prio(t, &t->rt_param.pcp_prio); ++ TRACE_TASK(t, "prio boost ended"); ++ return 0; ++} ++ ++/* Plugin object */ ++static struct sched_plugin p_rm_plugin __cacheline_aligned_in_smp = { ++ .plugin_name = "P-RM", ++ /* PCP and SRP don't really work together, but this is something the ++ * user has to get right for the moment. ++ * System will not crash and burn, but timing correctness is not ensured. ++ * Just don't use both APIs at the same time for now. ++ */ ++ .pcp_active = 1, ++ .srp_active = 1, ++ .scheduler_tick = rm_scheduler_tick, ++ .prepare_task = rm_prepare_task, ++ .sleep_next_period = complete_job, ++ .tear_down = rm_tear_down, ++ .schedule = rm_schedule, ++ .wake_up_task = rm_wake_up_task, ++ .task_blocks = rm_task_blocks, ++ .pi_block = rm_pi_block, ++ .inherit_priority = rm_inherit_priority, ++ .return_priority = rm_return_priority ++}; ++ ++static int __init init_rm(void) ++{ ++ int i; ++ ++ for (i = 0; i < NR_CPUS; i++) ++ { ++ prm_domain_init(remote_part(i), ++ rm_check_resched, i); ++ printk("P-RM: CPU partition %d initialized.\n", i); ++ } ++ return register_sched_plugin(&p_rm_plugin); ++} ++ ++ ++ ++module_init(init_rm); +diff --git a/litmus/sched_trace.c b/litmus/sched_trace.c +new file mode 100644 +index 0000000..0976e83 +--- /dev/null ++++ b/litmus/sched_trace.c +@@ -0,0 +1,541 @@ ++/* sched_trace.c -- record scheduling events to a byte stream. ++ * ++ * TODO: Move ring buffer to a lockfree implementation. ++ */ ++ ++#include <linux/spinlock.h> ++#include <linux/fs.h> ++#include <linux/cdev.h> ++#include <asm/semaphore.h> ++#include <asm/uaccess.h> ++#include <linux/module.h> ++ ++#include <litmus/sched_trace.h> ++#include <litmus/litmus.h> ++ ++ ++typedef struct { ++ /* guard read and write pointers */ ++ spinlock_t lock; ++ /* guard against concurrent freeing of buffer */ ++ rwlock_t del_lock; ++ ++ /* memory allocated for ring buffer */ ++ unsigned long order; ++ char* buf; ++ char* end; ++ ++ /* Read/write pointer. May not cross. ++ * They point to the position of next write and ++ * last read. 
++ */ ++ char* writep; ++ char* readp; ++ ++} ring_buffer_t; ++ ++#define EMPTY_RING_BUFFER { \ ++ .lock = SPIN_LOCK_UNLOCKED, \ ++ .del_lock = RW_LOCK_UNLOCKED, \ ++ .buf = NULL, \ ++ .end = NULL, \ ++ .writep = NULL, \ ++ .readp = NULL \ ++} ++ ++void rb_init(ring_buffer_t* buf) ++{ ++ *buf = (ring_buffer_t) EMPTY_RING_BUFFER; ++} ++ ++int rb_alloc_buf(ring_buffer_t* buf, unsigned long order) ++{ ++ unsigned long flags; ++ int error = 0; ++ char *mem; ++ ++ /* do memory allocation while not atomic */ ++ mem = (char *) __get_free_pages(GFP_KERNEL, order); ++ if (!mem) ++ return -ENOMEM; ++ write_lock_irqsave(&buf->del_lock, flags); ++ BUG_ON(buf->buf); ++ buf->buf = mem; ++ buf->end = buf->buf + PAGE_SIZE * (1 << order) - 1; ++ memset(buf->buf, 0xff, buf->end - buf->buf); ++ buf->order = order; ++ buf->writep = buf->buf + 1; ++ buf->readp = buf->buf; ++ write_unlock_irqrestore(&buf->del_lock, flags); ++ return error; ++} ++ ++int rb_free_buf(ring_buffer_t* buf) ++{ ++ unsigned long flags; ++ int error = 0; ++ write_lock_irqsave(&buf->del_lock, flags); ++ BUG_ON(!buf->buf); ++ free_pages((unsigned long) buf->buf, buf->order); ++ buf->buf = NULL; ++ buf->end = NULL; ++ buf->writep = NULL; ++ buf->readp = NULL; ++ write_unlock_irqrestore(&buf->del_lock, flags); ++ return error; ++} ++ ++/* Assumption: concurrent writes are serialized externally ++ * ++ * Will only succeed if there is enough space for all len bytes. ++ */ ++int rb_put(ring_buffer_t* buf, char* mem, size_t len) ++{ ++ unsigned long flags; ++ char* r , *w; ++ int error = 0; ++ read_lock_irqsave(&buf->del_lock, flags); ++ if (!buf->buf) { ++ error = -ENODEV; ++ goto out; ++ } ++ spin_lock(&buf->lock); ++ r = buf->readp; ++ w = buf->writep; ++ spin_unlock(&buf->lock); ++ if (r < w && buf->end - w >= len - 1) { ++ /* easy case: there is enough space in the buffer ++ * to write it in one continous chunk*/ ++ memcpy(w, mem, len); ++ w += len; ++ if (w > buf->end) ++ /* special case: fit exactly into buffer ++ * w is now buf->end + 1 ++ */ ++ w = buf->buf; ++ } else if (w < r && r - w >= len) { /* >= len because may not cross */ ++ /* we are constrained by the read pointer but we there ++ * is enough space ++ */ ++ memcpy(w, mem, len); ++ w += len; ++ } else if (r <= w && buf->end - w < len - 1) { ++ /* the wrap around case: there may or may not be space */ ++ if ((buf->end - w) + (r - buf->buf) >= len - 1) { ++ /* copy chunk that fits at the end */ ++ memcpy(w, mem, buf->end - w + 1); ++ mem += buf->end - w + 1; ++ len -= (buf->end - w + 1); ++ w = buf->buf; ++ /* copy the rest */ ++ memcpy(w, mem, len); ++ w += len; ++ } ++ else ++ error = -ENOMEM; ++ } else { ++ error = -ENOMEM; ++ } ++ if (!error) { ++ spin_lock(&buf->lock); ++ buf->writep = w; ++ spin_unlock(&buf->lock); ++ } ++ out: ++ read_unlock_irqrestore(&buf->del_lock, flags); ++ return error; ++} ++ ++/* Assumption: concurrent reads are serialized externally */ ++int rb_get(ring_buffer_t* buf, char* mem, size_t len) ++{ ++ unsigned long flags; ++ char* r , *w; ++ int error = 0; ++ read_lock_irqsave(&buf->del_lock, flags); ++ if (!buf->buf) { ++ error = -ENODEV; ++ goto out; ++ } ++ spin_lock(&buf->lock); ++ r = buf->readp; ++ w = buf->writep; ++ spin_unlock(&buf->lock); ++ ++ if (w <= r && buf->end - r >= len) { ++ /* easy case: there is enough data in the buffer ++ * to get it in one chunk*/ ++ memcpy(mem, r + 1, len); ++ r += len; ++ error = len; ++ ++ } else if (r + 1 < w && w - r - 1 >= len) { ++ /* we are constrained by the write pointer but ++ * there is enough data 
++ */ ++ memcpy(mem, r + 1, len); ++ r += len; ++ error = len; ++ ++ } else if (r + 1 < w && w - r - 1 < len) { ++ /* we are constrained by the write pointer and there ++ * there is not enough data ++ */ ++ memcpy(mem, r + 1, w - r - 1); ++ error = w - r - 1; ++ r += w - r - 1; ++ ++ } else if (w <= r && buf->end - r < len) { ++ /* the wrap around case: there may or may not be enough data ++ * first let's get what is available ++ */ ++ memcpy(mem, r + 1, buf->end - r); ++ error += (buf->end - r); ++ mem += (buf->end - r); ++ len -= (buf->end - r); ++ r += (buf->end - r); ++ ++ if (w > buf->buf) { ++ /* there is more to get */ ++ r = buf->buf - 1; ++ if (w - r >= len) { ++ /* plenty */ ++ memcpy(mem, r + 1, len); ++ error += len; ++ r += len; ++ } else { ++ memcpy(mem, r + 1, w - r - 1); ++ error += w - r - 1; ++ r += w - r - 1; ++ } ++ } ++ } /* nothing available */ ++ ++ if (error > 0) { ++ spin_lock(&buf->lock); ++ buf->readp = r; ++ spin_unlock(&buf->lock); ++ } ++ out: ++ read_unlock_irqrestore(&buf->del_lock, flags); ++ return error; ++} ++ ++ ++ ++/******************************************************************************/ ++/* DEVICE FILE DRIVER */ ++/******************************************************************************/ ++ ++ ++ ++/* Allocate a buffer of about 1 MB per CPU. ++ * ++ */ ++#define BUFFER_ORDER 8 ++ ++typedef struct { ++ ring_buffer_t buf; ++ atomic_t reader_cnt; ++ struct semaphore reader_mutex; ++} trace_buffer_t; ++ ++ ++/* This does not initialize the semaphore!! */ ++ ++#define EMPTY_TRACE_BUFFER \ ++ { .buf = EMPTY_RING_BUFFER, .reader_cnt = ATOMIC_INIT(0)} ++ ++static DEFINE_PER_CPU(trace_buffer_t, trace_buffer); ++ ++#ifdef CONFIG_SCHED_DEBUG_TRACE ++static spinlock_t log_buffer_lock = SPIN_LOCK_UNLOCKED; ++#endif ++static trace_buffer_t log_buffer = EMPTY_TRACE_BUFFER; ++ ++static void init_buffers(void) ++{ ++ int i; ++ ++ for (i = 0; i < NR_CPUS; i++) { ++ rb_init(&per_cpu(trace_buffer, i).buf); ++ init_MUTEX(&per_cpu(trace_buffer, i).reader_mutex); ++ atomic_set(&per_cpu(trace_buffer, i).reader_cnt, 0); ++ } ++ /* only initialize the mutex, the rest was initialized as part ++ * of the static initialization macro ++ */ ++ init_MUTEX(&log_buffer.reader_mutex); ++} ++ ++static int trace_release(struct inode *in, struct file *filp) ++{ ++ int error = -EINVAL; ++ trace_buffer_t* buf = filp->private_data; ++ ++ BUG_ON(!filp->private_data); ++ ++ if (down_interruptible(&buf->reader_mutex)) { ++ error = -ERESTARTSYS; ++ goto out; ++ } ++ ++ /* last release must deallocate buffers */ ++ if (atomic_dec_return(&buf->reader_cnt) == 0) { ++ error = rb_free_buf(&buf->buf); ++ } ++ ++ up(&buf->reader_mutex); ++ out: ++ return error; ++} ++ ++static ssize_t trace_read(struct file *filp, char __user *to, size_t len, ++ loff_t *f_pos) ++{ ++ /* we ignore f_pos, this is strictly sequential */ ++ ++ ssize_t error = -EINVAL; ++ char* mem; ++ trace_buffer_t *buf = filp->private_data; ++ ++ if (down_interruptible(&buf->reader_mutex)) { ++ error = -ERESTARTSYS; ++ goto out; ++ } ++ ++ if (len > 64 * 1024) ++ len = 64 * 1024; ++ mem = kmalloc(len, GFP_KERNEL); ++ if (!mem) { ++ error = -ENOMEM; ++ goto out_unlock; ++ } ++ ++ error = rb_get(&buf->buf, mem, len); ++ while (!error) { ++ set_current_state(TASK_INTERRUPTIBLE); ++ schedule_timeout(110); ++ if (signal_pending(current)) ++ error = -ERESTARTSYS; ++ else ++ error = rb_get(&buf->buf, mem, len); ++ } ++ ++ if (error > 0 && copy_to_user(to, mem, error)) ++ error = -EFAULT; ++ ++ kfree(mem); ++ out_unlock: ++ 
up(&buf->reader_mutex); ++ out: ++ return error; ++} ++ ++ ++/* trace_open - Open one of the per-CPU sched_trace buffers. ++ */ ++static int trace_open(struct inode *in, struct file *filp) ++{ ++ int error = -EINVAL; ++ int cpu = MINOR(in->i_rdev); ++ trace_buffer_t* buf; ++ ++ if (!cpu_online(cpu)) { ++ printk(KERN_WARNING "sched trace: " ++ "CPU #%d is not online. (open failed)\n", cpu); ++ error = -ENODEV; ++ goto out; ++ } ++ ++ buf = &per_cpu(trace_buffer, cpu); ++ ++ if (down_interruptible(&buf->reader_mutex)) { ++ error = -ERESTARTSYS; ++ goto out; ++ } ++ ++ /* first open must allocate buffers */ ++ if (atomic_inc_return(&buf->reader_cnt) == 1) { ++ if ((error = rb_alloc_buf(&buf->buf, BUFFER_ORDER))) ++ { ++ atomic_dec(&buf->reader_cnt); ++ goto out_unlock; ++ } ++ } ++ ++ error = 0; ++ filp->private_data = buf; ++ ++ out_unlock: ++ up(&buf->reader_mutex); ++ out: ++ return error; ++} ++ ++/* log_open - open the global log message ring buffer. ++ */ ++static int log_open(struct inode *in, struct file *filp) ++{ ++ int error = -EINVAL; ++ trace_buffer_t* buf; ++ ++ buf = &log_buffer; ++ ++ if (down_interruptible(&buf->reader_mutex)) { ++ error = -ERESTARTSYS; ++ goto out; ++ } ++ ++ /* first open must allocate buffers */ ++ if (atomic_inc_return(&buf->reader_cnt) == 1) { ++ if ((error = rb_alloc_buf(&buf->buf, BUFFER_ORDER))) ++ { ++ atomic_dec(&buf->reader_cnt); ++ goto out_unlock; ++ } ++ } ++ ++ error = 0; ++ filp->private_data = buf; ++ ++ out_unlock: ++ up(&buf->reader_mutex); ++ out: ++ return error; ++} ++ ++/******************************************************************************/ ++/* Device Registration */ ++/******************************************************************************/ ++ ++/* the major numbes are from the unassigned/local use block ++ * ++ * This should be converted to dynamic allocation at some point... ++ */ ++#define TRACE_MAJOR 250 ++#define LOG_MAJOR 251 ++ ++/* trace_fops - The file operations for accessing the per-CPU scheduling event ++ * trace buffers. ++ */ ++struct file_operations trace_fops = { ++ .owner = THIS_MODULE, ++ .open = trace_open, ++ .release = trace_release, ++ .read = trace_read, ++}; ++ ++/* log_fops - The file operations for accessing the global LITMUS log message ++ * buffer. ++ * ++ * Except for opening the device file it uses the same operations as trace_fops. 
++ */ ++struct file_operations log_fops = { ++ .owner = THIS_MODULE, ++ .open = log_open, ++ .release = trace_release, ++ .read = trace_read, ++}; ++ ++static int __init register_buffer_dev(const char* name, ++ struct file_operations* fops, ++ int major, int count) ++{ ++ dev_t trace_dev; ++ struct cdev *cdev; ++ int error = 0; ++ ++ trace_dev = MKDEV(major, 0); ++ error = register_chrdev_region(trace_dev, count, name); ++ if (error) ++ { ++ printk(KERN_WARNING "sched trace: " ++ "Could not register major/minor number %d\n", major); ++ return error; ++ } ++ cdev = cdev_alloc(); ++ if (!cdev) { ++ printk(KERN_WARNING "sched trace: " ++ "Could not get a cdev for %s.\n", name); ++ return -ENOMEM; ++ } ++ cdev->owner = THIS_MODULE; ++ cdev->ops = fops; ++ error = cdev_add(cdev, trace_dev, count); ++ if (error) { ++ printk(KERN_WARNING "sched trace: " ++ "add_cdev failed for %s.\n", name); ++ return -ENOMEM; ++ } ++ return error; ++ ++} ++ ++static int __init init_sched_trace(void) ++{ ++ int error1 = 0, error2 = 0; ++ ++ printk("Initializing scheduler trace device\n"); ++ init_buffers(); ++ ++ error1 = register_buffer_dev("schedtrace", &trace_fops, ++ TRACE_MAJOR, NR_CPUS); ++ ++ error2 = register_buffer_dev("litmus_log", &log_fops, ++ LOG_MAJOR, 1); ++ if (error1 || error2) ++ return min(error1, error2); ++ else ++ return 0; ++} ++ ++module_init(init_sched_trace); ++ ++/******************************************************************************/ ++/* KERNEL API */ ++/******************************************************************************/ ++ ++/* The per-CPU LITMUS log buffer. Don't put it on the stack, it is too big for ++ * that and the kernel gets very picky with nested interrupts and small stacks. ++ */ ++ ++#ifdef CONFIG_SCHED_DEBUG_TRACE ++ ++#define MSG_SIZE 255 ++static DEFINE_PER_CPU(char[MSG_SIZE], fmt_buffer); ++ ++/* sched_trace_log_message - This is the only function that accesses the the ++ * log buffer inside the kernel for writing. ++ * Concurrent access to it is serialized via the ++ * log_buffer_lock. ++ * ++ * The maximum length of a formatted message is 255. ++ */ ++void sched_trace_log_message(const char* fmt, ...) ++{ ++ unsigned long flags; ++ va_list args; ++ size_t len; ++ char* buf; ++ ++ va_start(args, fmt); ++ local_irq_save(flags); ++ ++ /* format message */ ++ buf = __get_cpu_var(fmt_buffer); ++ len = vscnprintf(buf, MSG_SIZE, fmt, args); ++ ++ spin_lock(&log_buffer_lock); ++ /* Don't copy the trailing null byte, we don't want null bytes ++ * in a text file. ++ */ ++ rb_put(&log_buffer.buf, buf, len); ++ spin_unlock(&log_buffer_lock); ++ ++ local_irq_restore(flags); ++ va_end(args); ++} ++ ++#endif ++ +diff --git a/litmus/sync.c b/litmus/sync.c +new file mode 100644 +index 0000000..4405228 +--- /dev/null ++++ b/litmus/sync.c +@@ -0,0 +1,84 @@ ++/* litmus/sync.c - Support for synchronous and asynchronous task system releases. ++ * ++ * ++ */ ++ ++#include <asm/atomic.h> ++#include <asm/uaccess.h> ++#include <linux/spinlock.h> ++#include <linux/list.h> ++#include <linux/sched.h> ++#include <linux/completion.h> ++ ++#include <litmus/litmus.h> ++#include <litmus/jobs.h> ++ ++static DECLARE_COMPLETION(ts_release); ++ ++static long do_wait_for_ts_release(void) ++{ ++ long ret = 0; ++ ++ /* If the interruption races with a release, the completion object ++ * may have a non-zero counter. To avoid this problem, this should ++ * be replaced by wait_for_completion(). ++ * ++ * For debugging purposes, this is interruptible for now. 
++ */ ++ ret = wait_for_completion_interruptible(&ts_release); ++ ++ return ret; ++} ++ ++ ++static long do_release_ts(lt_t start) ++{ ++ int task_count = 0; ++ long flags; ++ struct list_head *pos; ++ struct task_struct *t; ++ ++ ++ spin_lock_irqsave(&ts_release.wait.lock, flags); ++ ++ list_for_each(pos, &ts_release.wait.task_list) { ++ t = (struct task_struct*) list_entry(pos, ++ struct __wait_queue, ++ task_list)->private; ++ task_count++; ++ release_at(t, start + t->rt_param.task_params.phase); ++ } ++ ++ spin_unlock_irqrestore(&ts_release.wait.lock, flags); ++ ++ complete_n(&ts_release, task_count); ++ ++ return task_count; ++} ++ ++ ++asmlinkage long sys_wait_for_ts_release(void) ++{ ++ long ret = -EPERM; ++ struct task_struct *t = current; ++ ++ if (is_realtime(t)) ++ ret = do_wait_for_ts_release(); ++ ++ return ret; ++} ++ ++ ++asmlinkage long sys_release_ts(lt_t __user *__delay) ++{ ++ long ret; ++ lt_t delay; ++ ++ /* FIXME: check capabilities... */ ++ ++ ret = copy_from_user(&delay, __delay, sizeof(lt_t)); ++ if (ret == 0) ++ ret = do_release_ts(sched_clock() + delay); ++ ++ return ret; ++} +diff --git a/litmus/trace.c b/litmus/trace.c +new file mode 100644 +index 0000000..bcdf103 +--- /dev/null ++++ b/litmus/trace.c +@@ -0,0 +1,302 @@ ++#include <linux/fs.h> ++#include <linux/cdev.h> ++#include <asm/semaphore.h> ++#include <asm/uaccess.h> ++#include <linux/module.h> ++ ++#include <litmus/trace.h> ++ ++/******************************************************************************/ ++/* Allocation */ ++/******************************************************************************/ ++ ++struct ft_buffer* trace_ts_buf = NULL; ++ ++static unsigned int ts_seq_no = 0; ++ ++feather_callback void save_timestamp(unsigned long event) ++{ ++ unsigned int seq_no = fetch_and_inc((int *) &ts_seq_no); ++ struct timestamp *ts; ++ if (ft_buffer_start_write(trace_ts_buf, (void**) &ts)) { ++ ts->event = event; ++ ts->timestamp = ft_read_tsc(); ++ ts->seq_no = seq_no; ++ ts->cpu = raw_smp_processor_id(); ++ ft_buffer_finish_write(trace_ts_buf, ts); ++ } ++} ++ ++static struct ft_buffer* alloc_ft_buffer(unsigned int count, size_t size) ++{ ++ struct ft_buffer* buf; ++ size_t total = (size + 1) * count; ++ char* mem; ++ int order = 0, pages = 1; ++ ++ buf = kmalloc(sizeof(struct ft_buffer), GFP_KERNEL); ++ if (!buf) ++ return NULL; ++ ++ total = (total / PAGE_SIZE) + (total % PAGE_SIZE != 0); ++ while (pages < total) { ++ order++; ++ pages *= 2; ++ } ++ ++ mem = (char*) __get_free_pages(GFP_KERNEL, order); ++ if (!mem) { ++ kfree(buf); ++ return NULL; ++ } ++ ++ if (!init_ft_buffer(buf, count, size, ++ mem + (count * size), /* markers at the end */ ++ mem)) { /* buffer objects */ ++ free_pages((unsigned long) mem, order); ++ kfree(buf); ++ return NULL; ++ } ++ return buf; ++} ++ ++static void free_ft_buffer(struct ft_buffer* buf) ++{ ++ int order = 0, pages = 1; ++ size_t total; ++ ++ if (buf) { ++ total = (buf->slot_size + 1) * buf->slot_count; ++ total = (total / PAGE_SIZE) + (total % PAGE_SIZE != 0); ++ while (pages < total) { ++ order++; ++ pages *= 2; ++ } ++ free_pages((unsigned long) buf->buffer_mem, order); ++ kfree(buf); ++ } ++} ++ ++ ++/******************************************************************************/ ++/* DEVICE FILE DRIVER */ ++/******************************************************************************/ ++ ++#define NO_TIMESTAMPS 262144 ++ ++static DECLARE_MUTEX(feather_lock); ++static int use_count = 0; ++ ++static int trace_release(struct inode *in, struct file 
*filp) ++{ ++ int err = -EINVAL; ++ ++ if (down_interruptible(&feather_lock)) { ++ err = -ERESTARTSYS; ++ goto out; ++ } ++ ++ printk(KERN_ALERT "%s/%d disconnects from feather trace device. " ++ "use_count=%d\n", ++ current->comm, current->pid, use_count); ++ ++ if (use_count == 1) { ++ /* disable events */ ++ ft_disable_all_events(); ++ ++ /* wait for any pending events to complete */ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ schedule_timeout(HZ); ++ ++ printk(KERN_ALERT "Failed trace writes: %u\n", ++ trace_ts_buf->failed_writes); ++ ++ free_ft_buffer(trace_ts_buf); ++ trace_ts_buf = NULL; ++ } ++ ++ use_count--; ++ up(&feather_lock); ++out: ++ return err; ++} ++ ++ ++static ssize_t trace_read(struct file *filp, char __user *to, size_t len, ++ loff_t *f_pos) ++{ ++ /* we ignore f_pos, this is strictly sequential */ ++ ssize_t error = 0; ++ struct timestamp ts; ++ ++ if (down_interruptible(&feather_lock)) { ++ error = -ERESTARTSYS; ++ goto out; ++ } ++ ++ ++ while (len >= sizeof(struct timestamp)) { ++ if (ft_buffer_read(trace_ts_buf, &ts)) { ++ if (copy_to_user(to, &ts, sizeof(struct timestamp))) { ++ error = -EFAULT; ++ break; ++ } else { ++ len -= sizeof(struct timestamp); ++ to += sizeof(struct timestamp); ++ error += sizeof(struct timestamp); ++ } ++ } else { ++ set_current_state(TASK_INTERRUPTIBLE); ++ schedule_timeout(50); ++ if (signal_pending(current)) { ++ error = -ERESTARTSYS; ++ break; ++ } ++ } ++ } ++ up(&feather_lock); ++out: ++ return error; ++} ++ ++#define ENABLE_CMD 0 ++#define DISABLE_CMD 1 ++ ++static ssize_t trace_write(struct file *filp, const char __user *from, ++ size_t len, loff_t *f_pos) ++{ ++ ssize_t error = -EINVAL; ++ unsigned long cmd; ++ unsigned long id; ++ ++ if (len % sizeof(long) || len < 2 * sizeof(long)) ++ goto out; ++ ++ if (copy_from_user(&cmd, from, sizeof(long))) { ++ error = -EFAULT; ++ goto out; ++ } ++ len -= sizeof(long); ++ from += sizeof(long); ++ ++ if (cmd != ENABLE_CMD && cmd != DISABLE_CMD) ++ goto out; ++ ++ if (down_interruptible(&feather_lock)) { ++ error = -ERESTARTSYS; ++ goto out; ++ } ++ ++ error = sizeof(long); ++ while (len) { ++ if (copy_from_user(&id, from, sizeof(long))) { ++ error = -EFAULT; ++ goto out; ++ } ++ len -= sizeof(long); ++ from += sizeof(long); ++ if (cmd) { ++ printk(KERN_INFO ++ "Disabling feather-trace event %lu.\n", id); ++ ft_disable_event(id); ++ } else { ++ printk(KERN_INFO ++ "Enabling feather-trace event %lu.\n", id); ++ ft_enable_event(id); ++ } ++ error += sizeof(long); ++ } ++ ++ up(&feather_lock); ++ out: ++ return error; ++} ++ ++static int trace_open(struct inode *in, struct file *filp) ++{ ++ int err = 0; ++ unsigned int count = NO_TIMESTAMPS; ++ ++ if (down_interruptible(&feather_lock)) { ++ err = -ERESTARTSYS; ++ goto out; ++ } ++ ++ while (count && !trace_ts_buf) { ++ printk("trace: trying to allocate %u time stamps.\n", count); ++ trace_ts_buf = alloc_ft_buffer(count, sizeof(struct timestamp)); ++ count /= 2; ++ } ++ if (!trace_ts_buf) ++ err = -ENOMEM; ++ else ++ use_count++; ++ ++ up(&feather_lock); ++out: ++ return err; ++} ++ ++/******************************************************************************/ ++/* Device Registration */ ++/******************************************************************************/ ++ ++#define FT_TRACE_MAJOR 252 ++ ++struct file_operations ft_trace_fops = { ++ .owner = THIS_MODULE, ++ .open = trace_open, ++ .release = trace_release, ++ .write = trace_write, ++ .read = trace_read, ++}; ++ ++ ++static int __init register_buffer_dev(const char* 
name,
++ struct file_operations* fops,
++ int major, int count)
++{
++ dev_t trace_dev;
++ struct cdev *cdev;
++ int error = 0;
++
++ trace_dev = MKDEV(major, 0);
++ error = register_chrdev_region(trace_dev, count, name);
++ if (error)
++ {
++ printk(KERN_WARNING "trace: "
++ "Could not register major/minor number %d\n", major);
++ return error;
++ }
++ cdev = cdev_alloc();
++ if (!cdev) {
++ printk(KERN_WARNING "trace: "
++ "Could not get a cdev for %s.\n", name);
++ return -ENOMEM;
++ }
++ cdev->owner = THIS_MODULE;
++ cdev->ops = fops;
++ error = cdev_add(cdev, trace_dev, count);
++ if (error) {
++ printk(KERN_WARNING "trace: "
++ "add_cdev failed for %s.\n", name);
++ return -ENOMEM;
++ }
++ return error;
++
++}
++
++static int __init init_sched_trace(void)
++{
++ int error = 0;
++
++ printk("Initializing Feather-Trace device\n");
++ /* dummy entry to make linker happy */
++ ft_event0(666, save_timestamp);
++
++ error = register_buffer_dev("ft_trace", &ft_trace_fops,
++ FT_TRACE_MAJOR, 1);
++ return error;
++}
++
++module_init(init_sched_trace);
diff --git a/index.html b/index.html
index 4e251e9..af6bef4 100644
--- a/index.html
+++ b/index.html
@@ -126,6 +126,23 @@
 <cite>Proceedings of the 14th IEEE International Conference on Embedded
 and Real-Time Computing Systems and Applications</cite>, to appear, August
 2008. <a href="http://www.cs.unc.edu/~anderson/papers/rtcsa08.ps">Postscript</a>. <a
 href="http://www.cs.unc.edu/~anderson/papers/rtcsa08.pdf">PDF</a>. </p>
+ <p><strong>Note:</strong> The work described in this paper took place in a branch that is currently not part of
+ the main distribution. For reference, we provide the branch as a separate download:
+ </p>
+ <ul>
+ <li>
+ <a href="download/RTCSA08/litmus-rt-RTCSA08.patch">litmus-rt-RTCSA08.patch</a>
+ </li>
+ <li>
+ <a href="download/RTCSA08/liblitmus-RTCSA08.tgz">liblitmus-RTCSA08.tgz</a>
+ </li>
+ <li><a href="download/RTCSA08/SHA256SUMS">SHA256 checksums</a>
+ </li>
+ </ul>
+ <p>Please don't use this version for active development. If you are interested in this work, it would be best
+ to first port the desired features to LITMUS<sup>RT</sup> 2008 and merge them into the main distribution.
+ </p>
+
 </li>
 <li>
-- 
cgit v1.2.2