From 0d769b3bb0fa07600a7d36d4e0b045e404f7e753 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Bj=C3=B6rn=20B=2E=20Brandenburg?=
Date: Thu, 21 Oct 2010 16:08:46 -0400
Subject: Add 2010.2 release
---
download/2010.2/SHA256SUMS | 3 +
download/2010.2/ft_tools-2010.2.tgz | Bin 0 -> 5577 bytes
download/2010.2/liblitmus-2010.2.tgz | Bin 0 -> 17962 bytes
download/2010.2/litmus-rt-2010.2.patch | 11076 +++++++++++++++++++++++++++++++
index.html | 44 +-
5 files changed, 11119 insertions(+), 4 deletions(-)
create mode 100644 download/2010.2/SHA256SUMS
create mode 100644 download/2010.2/ft_tools-2010.2.tgz
create mode 100644 download/2010.2/liblitmus-2010.2.tgz
create mode 100644 download/2010.2/litmus-rt-2010.2.patch
diff --git a/download/2010.2/SHA256SUMS b/download/2010.2/SHA256SUMS
new file mode 100644
index 0000000..19f5f12
--- /dev/null
+++ b/download/2010.2/SHA256SUMS
@@ -0,0 +1,3 @@
+b911c0a77b0bfd4d73928404338f6a1d98279340d9288a32deb0c5c1e4281469 ft_tools-2010.2.tgz
+d2b772cd6c3a03c1329b259ad4a2bfbf9f7268a5699b8f988b85aa1eafe7600a liblitmus-2010.2.tgz
+c460952c4c91076392e889ef457cf231d5ecbcf7fbf72257ff84c0e63be7f9da litmus-rt-2010.2.patch
diff --git a/download/2010.2/ft_tools-2010.2.tgz b/download/2010.2/ft_tools-2010.2.tgz
new file mode 100644
index 0000000..4d95abb
Binary files /dev/null and b/download/2010.2/ft_tools-2010.2.tgz differ
diff --git a/download/2010.2/liblitmus-2010.2.tgz b/download/2010.2/liblitmus-2010.2.tgz
new file mode 100644
index 0000000..abeb6c2
Binary files /dev/null and b/download/2010.2/liblitmus-2010.2.tgz differ
diff --git a/download/2010.2/litmus-rt-2010.2.patch b/download/2010.2/litmus-rt-2010.2.patch
new file mode 100644
index 0000000..6dcfc56
--- /dev/null
+++ b/download/2010.2/litmus-rt-2010.2.patch
@@ -0,0 +1,11076 @@
+ Makefile | 4 +-
+ arch/x86/Kconfig | 8 +
+ arch/x86/include/asm/entry_arch.h | 1 +
+ arch/x86/include/asm/feather_trace.h | 17 +
+ arch/x86/include/asm/feather_trace_32.h | 79 +++
+ arch/x86/include/asm/feather_trace_64.h | 67 +++
+ arch/x86/include/asm/hw_irq.h | 3 +
+ arch/x86/include/asm/irq_vectors.h | 5 +
+ arch/x86/include/asm/processor.h | 4 +
+ arch/x86/include/asm/unistd_32.h | 6 +-
+ arch/x86/include/asm/unistd_64.h | 4 +
+ arch/x86/kernel/Makefile | 2 +
+ arch/x86/kernel/cpu/intel_cacheinfo.c | 17 +
+ arch/x86/kernel/entry_64.S | 2 +
+ arch/x86/kernel/ft_event.c | 118 ++++
+ arch/x86/kernel/irqinit.c | 3 +
+ arch/x86/kernel/smp.c | 28 +
+ arch/x86/kernel/syscall_table_32.S | 14 +
+ fs/exec.c | 13 +-
+ fs/inode.c | 2 +
+ include/linux/completion.h | 1 +
+ include/linux/fs.h | 21 +-
+ include/linux/hrtimer.h | 32 ++
+ include/linux/sched.h | 17 +-
+ include/linux/smp.h | 5 +
+ include/linux/tick.h | 5 +
+ include/litmus/bheap.h | 77 +++
+ include/litmus/budget.h | 8 +
+ include/litmus/edf_common.h | 27 +
+ include/litmus/fdso.h | 70 +++
+ include/litmus/feather_buffer.h | 94 ++++
+ include/litmus/feather_trace.h | 65 +++
+ include/litmus/ftdev.h | 49 ++
+ include/litmus/jobs.h | 9 +
+ include/litmus/litmus.h | 267 +++++++++
+ include/litmus/rt_domain.h | 182 +++++++
+ include/litmus/rt_param.h | 196 +++++++
+ include/litmus/sched_plugin.h | 162 ++++++
+ include/litmus/sched_trace.h | 192 +++++++
+ include/litmus/trace.h | 113 ++++
+ include/litmus/unistd_32.h | 23 +
+ include/litmus/unistd_64.h | 37 ++
+ kernel/exit.c | 4 +
+ kernel/fork.c | 7 +
+ kernel/hrtimer.c | 95 ++++
+ kernel/printk.c | 14 +-
+ kernel/sched.c | 106 ++++-
+ kernel/sched_fair.c | 2 +-
+ kernel/sched_rt.c | 2 +-
+ kernel/time/tick-sched.c | 48 ++-
+ litmus/Kconfig | 134 +++++
+ litmus/Makefile | 25 +
+ litmus/bheap.c | 314 +++++++++++
+ litmus/budget.c | 109 ++++
+ litmus/ctrldev.c | 150 +++++
+ litmus/edf_common.c | 102 ++++
+ litmus/fdso.c | 281 ++++++++++
+ litmus/fmlp.c | 268 +++++++++
+ litmus/ft_event.c | 43 ++
+ litmus/ftdev.c | 360 +++++++++++++
+ litmus/jobs.c | 43 ++
+ litmus/litmus.c | 799 +++++++++++++++++++++++++++
+ litmus/rt_domain.c | 355 ++++++++++++
+ litmus/sched_cedf.c | 773 ++++++++++++++++++++++++++
+ litmus/sched_gsn_edf.c | 842 +++++++++++++++++++++++++++++
+ litmus/sched_litmus.c | 315 +++++++++++
+ litmus/sched_pfair.c | 897 +++++++++++++++++++++++++++++++
+ litmus/sched_plugin.c | 265 +++++++++
+ litmus/sched_psn_edf.c | 482 +++++++++++++++++
+ litmus/sched_task_trace.c | 204 +++++++
+ litmus/sched_trace.c | 378 +++++++++++++
+ litmus/srp.c | 318 +++++++++++
+ litmus/sync.c | 104 ++++
+ litmus/trace.c | 103 ++++
+ 74 files changed, 9954 insertions(+), 37 deletions(-)
+
+diff --git a/Makefile b/Makefile
+index ebc8225..316557d 100644
+--- a/Makefile
++++ b/Makefile
+@@ -1,7 +1,7 @@
+ VERSION = 2
+ PATCHLEVEL = 6
+ SUBLEVEL = 34
+-EXTRAVERSION =
++EXTRAVERSION =-litmus2010
+ NAME = Sheep on Meth
+
+ # *DOCUMENTATION*
+@@ -650,7 +650,7 @@ export mod_strip_cmd
+
+
+ ifeq ($(KBUILD_EXTMOD),)
+-core-y += kernel/ mm/ fs/ ipc/ security/ crypto/ block/
++core-y += kernel/ mm/ fs/ ipc/ security/ crypto/ block/ litmus/
+
+ vmlinux-dirs := $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \
+ $(core-y) $(core-m) $(drivers-y) $(drivers-m) \
+diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
+index 9458685..7b2c8db 100644
+--- a/arch/x86/Kconfig
++++ b/arch/x86/Kconfig
+@@ -2125,3 +2125,11 @@ source "crypto/Kconfig"
+ source "arch/x86/kvm/Kconfig"
+
+ source "lib/Kconfig"
++
++config ARCH_HAS_FEATHER_TRACE
++ def_bool y
++
++config ARCH_HAS_SEND_PULL_TIMERS
++ def_bool y
++
++source "litmus/Kconfig"
+diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
+index 8e8ec66..5d07dea 100644
+--- a/arch/x86/include/asm/entry_arch.h
++++ b/arch/x86/include/asm/entry_arch.h
+@@ -13,6 +13,7 @@
+ BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR)
+ BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR)
+ BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR)
++BUILD_INTERRUPT(pull_timers_interrupt,PULL_TIMERS_VECTOR)
+ BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
+ BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR)
+
+diff --git a/arch/x86/include/asm/feather_trace.h b/arch/x86/include/asm/feather_trace.h
+new file mode 100644
+index 0000000..4fd3163
+--- /dev/null
++++ b/arch/x86/include/asm/feather_trace.h
+@@ -0,0 +1,17 @@
++#ifndef _ARCH_FEATHER_TRACE_H
++#define _ARCH_FEATHER_TRACE_H
++
++#include <asm/msr.h>
++
++static inline unsigned long long ft_timestamp(void)
++{
++ return __native_read_tsc();
++}
++
++#ifdef CONFIG_X86_32
++#include "feather_trace_32.h"
++#else
++#include "feather_trace_64.h"
++#endif
++
++#endif
+diff --git a/arch/x86/include/asm/feather_trace_32.h b/arch/x86/include/asm/feather_trace_32.h
+new file mode 100644
+index 0000000..70202f9
+--- /dev/null
++++ b/arch/x86/include/asm/feather_trace_32.h
+@@ -0,0 +1,79 @@
++/* Do not directly include this file. Include feather_trace.h instead */
++
++#define feather_callback __attribute__((regparm(0)))
++
++/*
++ * make the compiler reload any register that is not saved in
++ * a cdecl function call
++ */
++#define CLOBBER_LIST "memory", "cc", "eax", "ecx", "edx"
++
++#define ft_event(id, callback) \
++ __asm__ __volatile__( \
++ "1: jmp 2f \n\t" \
++ " call " #callback " \n\t" \
++ ".section __event_table, \"aw\" \n\t" \
++ ".long " #id ", 0, 1b, 2f \n\t" \
++ ".previous \n\t" \
++ "2: \n\t" \
++ : : : CLOBBER_LIST)
++
++#define ft_event0(id, callback) \
++ __asm__ __volatile__( \
++ "1: jmp 2f \n\t" \
++ " subl $4, %%esp \n\t" \
++ " movl $" #id ", (%%esp) \n\t" \
++ " call " #callback " \n\t" \
++ " addl $4, %%esp \n\t" \
++ ".section __event_table, \"aw\" \n\t" \
++ ".long " #id ", 0, 1b, 2f \n\t" \
++ ".previous \n\t" \
++ "2: \n\t" \
++ : : : CLOBBER_LIST)
++
++#define ft_event1(id, callback, param) \
++ __asm__ __volatile__( \
++ "1: jmp 2f \n\t" \
++ " subl $8, %%esp \n\t" \
++ " movl %0, 4(%%esp) \n\t" \
++ " movl $" #id ", (%%esp) \n\t" \
++ " call " #callback " \n\t" \
++ " addl $8, %%esp \n\t" \
++ ".section __event_table, \"aw\" \n\t" \
++ ".long " #id ", 0, 1b, 2f \n\t" \
++ ".previous \n\t" \
++ "2: \n\t" \
++ : : "r" (param) : CLOBBER_LIST)
++
++#define ft_event2(id, callback, param, param2) \
++ __asm__ __volatile__( \
++ "1: jmp 2f \n\t" \
++ " subl $12, %%esp \n\t" \
++ " movl %1, 8(%%esp) \n\t" \
++ " movl %0, 4(%%esp) \n\t" \
++ " movl $" #id ", (%%esp) \n\t" \
++ " call " #callback " \n\t" \
++ " addl $12, %%esp \n\t" \
++ ".section __event_table, \"aw\" \n\t" \
++ ".long " #id ", 0, 1b, 2f \n\t" \
++ ".previous \n\t" \
++ "2: \n\t" \
++ : : "r" (param), "r" (param2) : CLOBBER_LIST)
++
++
++#define ft_event3(id, callback, p, p2, p3) \
++ __asm__ __volatile__( \
++ "1: jmp 2f \n\t" \
++ " subl $16, %%esp \n\t" \
++ " movl %2, 12(%%esp) \n\t" \
++ " movl %1, 8(%%esp) \n\t" \
++ " movl %0, 4(%%esp) \n\t" \
++ " movl $" #id ", (%%esp) \n\t" \
++ " call " #callback " \n\t" \
++ " addl $16, %%esp \n\t" \
++ ".section __event_table, \"aw\" \n\t" \
++ ".long " #id ", 0, 1b, 2f \n\t" \
++ ".previous \n\t" \
++ "2: \n\t" \
++ : : "r" (p), "r" (p2), "r" (p3) : CLOBBER_LIST)
++
+diff --git a/arch/x86/include/asm/feather_trace_64.h b/arch/x86/include/asm/feather_trace_64.h
+new file mode 100644
+index 0000000..54ac2ae
+--- /dev/null
++++ b/arch/x86/include/asm/feather_trace_64.h
+@@ -0,0 +1,67 @@
++/* Do not directly include this file. Include feather_trace.h instead */
++
++/* regparm is the default on x86_64 */
++#define feather_callback
++
++# define _EVENT_TABLE(id,from,to) \
++ ".section __event_table, \"aw\"\n\t" \
++ ".balign 8\n\t" \
++ ".quad " #id ", 0, " #from ", " #to " \n\t" \
++ ".previous \n\t"
++
++/*
++ * x86_64 callee only owns rbp, rbx, r12 -> r15
++ * the called can freely modify the others
++ */
++#define CLOBBER_LIST "memory", "cc", "rdi", "rsi", "rdx", "rcx", \
++ "r8", "r9", "r10", "r11", "rax"
++
++#define ft_event(id, callback) \
++ __asm__ __volatile__( \
++ "1: jmp 2f \n\t" \
++ " call " #callback " \n\t" \
++ _EVENT_TABLE(id,1b,2f) \
++ "2: \n\t" \
++ : : : CLOBBER_LIST)
++
++#define ft_event0(id, callback) \
++ __asm__ __volatile__( \
++ "1: jmp 2f \n\t" \
++ " movq $" #id ", %%rdi \n\t" \
++ " call " #callback " \n\t" \
++ _EVENT_TABLE(id,1b,2f) \
++ "2: \n\t" \
++ : : : CLOBBER_LIST)
++
++#define ft_event1(id, callback, param) \
++ __asm__ __volatile__( \
++ "1: jmp 2f \n\t" \
++ " movq %0, %%rsi \n\t" \
++ " movq $" #id ", %%rdi \n\t" \
++ " call " #callback " \n\t" \
++ _EVENT_TABLE(id,1b,2f) \
++ "2: \n\t" \
++ : : "r" (param) : CLOBBER_LIST)
++
++#define ft_event2(id, callback, param, param2) \
++ __asm__ __volatile__( \
++ "1: jmp 2f \n\t" \
++ " movq %1, %%rdx \n\t" \
++ " movq %0, %%rsi \n\t" \
++ " movq $" #id ", %%rdi \n\t" \
++ " call " #callback " \n\t" \
++ _EVENT_TABLE(id,1b,2f) \
++ "2: \n\t" \
++ : : "r" (param), "r" (param2) : CLOBBER_LIST)
++
++#define ft_event3(id, callback, p, p2, p3) \
++ __asm__ __volatile__( \
++ "1: jmp 2f \n\t" \
++ " movq %2, %%rcx \n\t" \
++ " movq %1, %%rdx \n\t" \
++ " movq %0, %%rsi \n\t" \
++ " movq $" #id ", %%rdi \n\t" \
++ " call " #callback " \n\t" \
++ _EVENT_TABLE(id,1b,2f) \
++ "2: \n\t" \
++ : : "r" (p), "r" (p2), "r" (p3) : CLOBBER_LIST)
+diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
+index 46c0fe0..c174115 100644
+--- a/arch/x86/include/asm/hw_irq.h
++++ b/arch/x86/include/asm/hw_irq.h
+@@ -53,6 +53,8 @@ extern void threshold_interrupt(void);
+ extern void call_function_interrupt(void);
+ extern void call_function_single_interrupt(void);
+
++extern void pull_timers_interrupt(void);
++
+ /* IOAPIC */
+ #define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1<<(x)) & io_apic_irqs))
+ extern unsigned long io_apic_irqs;
+@@ -122,6 +124,7 @@ extern asmlinkage void smp_irq_move_cleanup_interrupt(void);
+ extern void smp_reschedule_interrupt(struct pt_regs *);
+ extern void smp_call_function_interrupt(struct pt_regs *);
+ extern void smp_call_function_single_interrupt(struct pt_regs *);
++extern void smp_pull_timers_interrupt(struct pt_regs *);
+ #ifdef CONFIG_X86_32
+ extern void smp_invalidate_interrupt(struct pt_regs *);
+ #else
+diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
+index 8767d99..bb5318b 100644
+--- a/arch/x86/include/asm/irq_vectors.h
++++ b/arch/x86/include/asm/irq_vectors.h
+@@ -109,6 +109,11 @@
+ #define LOCAL_TIMER_VECTOR 0xef
+
+ /*
++ * LITMUS^RT pull timers IRQ vector
++ */
++#define PULL_TIMERS_VECTOR 0xee
++
++/*
+ * Generic system vector for platform specific use
+ */
+ #define X86_PLATFORM_IPI_VECTOR 0xed
+diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
+index b753ea5..48426f9 100644
+--- a/arch/x86/include/asm/processor.h
++++ b/arch/x86/include/asm/processor.h
+@@ -173,6 +173,10 @@ extern void print_cpu_info(struct cpuinfo_x86 *);
+ extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
+ extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
+ extern unsigned short num_cache_leaves;
++#ifdef CONFIG_SYSFS
++extern int get_shared_cpu_map(cpumask_var_t mask,
++ unsigned int cpu, int index);
++#endif
+
+ extern void detect_extended_topology(struct cpuinfo_x86 *c);
+ extern void detect_ht(struct cpuinfo_x86 *c);
+diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
+index beb9b5f..987e523 100644
+--- a/arch/x86/include/asm/unistd_32.h
++++ b/arch/x86/include/asm/unistd_32.h
+@@ -344,9 +344,13 @@
+ #define __NR_perf_event_open 336
+ #define __NR_recvmmsg 337
+
++#define __NR_LITMUS 338
++
++#include "litmus/unistd_32.h"
++
+ #ifdef __KERNEL__
+
+-#define NR_syscalls 338
++#define NR_syscalls 338 + NR_litmus_syscalls
+
+ #define __ARCH_WANT_IPC_PARSE_VERSION
+ #define __ARCH_WANT_OLD_READDIR
+diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
+index ff4307b..b21c3b2 100644
+--- a/arch/x86/include/asm/unistd_64.h
++++ b/arch/x86/include/asm/unistd_64.h
+@@ -664,6 +664,10 @@ __SYSCALL(__NR_perf_event_open, sys_perf_event_open)
+ #define __NR_recvmmsg 299
+ __SYSCALL(__NR_recvmmsg, sys_recvmmsg)
+
++#define __NR_LITMUS 299
++
++#include "litmus/unistd_64.h"
++
+ #ifndef __NO_STUBS
+ #define __ARCH_WANT_OLD_READDIR
+ #define __ARCH_WANT_OLD_STAT
+diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
+index 4c58352..d09934e 100644
+--- a/arch/x86/kernel/Makefile
++++ b/arch/x86/kernel/Makefile
+@@ -117,6 +117,8 @@ obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
+
+ obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
+
++obj-$(CONFIG_FEATHER_TRACE) += ft_event.o
++
+ ###
+ # 64 bit specific files
+ ifeq ($(CONFIG_X86_64),y)
+diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
+index 95962a9..94d8e47 100644
+--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
++++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
+@@ -632,6 +632,23 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
+ static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info);
+ #define CPUID4_INFO_IDX(x, y) (&((per_cpu(ici_cpuid4_info, x))[y]))
+
++/* returns CPUs that share the index cache with cpu */
++int get_shared_cpu_map(cpumask_var_t mask, unsigned int cpu, int index)
++{
++ int ret = 0;
++ struct _cpuid4_info *this_leaf;
++
++ if (index >= num_cache_leaves) {
++ index = num_cache_leaves - 1;
++ ret = index;
++ }
++
++ this_leaf = CPUID4_INFO_IDX(cpu,index);
++ cpumask_copy(mask, to_cpumask(this_leaf->shared_cpu_map));
++
++ return ret;
++}
++
+ #ifdef CONFIG_SMP
+ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
+ {
+diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
+index 0697ff1..b9ec6cd 100644
+--- a/arch/x86/kernel/entry_64.S
++++ b/arch/x86/kernel/entry_64.S
+@@ -1016,6 +1016,8 @@ apicinterrupt CALL_FUNCTION_VECTOR \
+ call_function_interrupt smp_call_function_interrupt
+ apicinterrupt RESCHEDULE_VECTOR \
+ reschedule_interrupt smp_reschedule_interrupt
++apicinterrupt PULL_TIMERS_VECTOR \
++ pull_timers_interrupt smp_pull_timers_interrupt
+ #endif
+
+ apicinterrupt ERROR_APIC_VECTOR \
+diff --git a/arch/x86/kernel/ft_event.c b/arch/x86/kernel/ft_event.c
+new file mode 100644
+index 0000000..37cc332
+--- /dev/null
++++ b/arch/x86/kernel/ft_event.c
+@@ -0,0 +1,118 @@
++#include <linux/types.h>
++
++#include <asm/feather_trace.h>
++
++/* the feather trace management functions assume
++ * exclusive access to the event table
++ */
++
++#ifndef CONFIG_DEBUG_RODATA
++
++#define BYTE_JUMP 0xeb
++#define BYTE_JUMP_LEN 0x02
++
++/* for each event, there is an entry in the event table */
++struct trace_event {
++ long id;
++ long count;
++ long start_addr;
++ long end_addr;
++};
++
++extern struct trace_event __start___event_table[];
++extern struct trace_event __stop___event_table[];
++
++/* Workaround: if no events are defined, then the event_table section does not
++ * exist and the above references cause linker errors. This could probably be
++ * fixed by adjusting the linker script, but it is easier to maintain for us if
++ * we simply create a dummy symbol in the event table section.
++ */
++int __event_table_dummy[0] __attribute__ ((section("__event_table")));
++
++int ft_enable_event(unsigned long id)
++{
++ struct trace_event* te = __start___event_table;
++ int count = 0;
++ char* delta;
++ unsigned char* instr;
++
++ while (te < __stop___event_table) {
++ if (te->id == id && ++te->count == 1) {
++ instr = (unsigned char*) te->start_addr;
++ /* make sure we don't clobber something wrong */
++ if (*instr == BYTE_JUMP) {
++ delta = (((unsigned char*) te->start_addr) + 1);
++ *delta = 0;
++ }
++ }
++ if (te->id == id)
++ count++;
++ te++;
++ }
++
++ printk(KERN_DEBUG "ft_enable_event: enabled %d events\n", count);
++ return count;
++}
++
++int ft_disable_event(unsigned long id)
++{
++ struct trace_event* te = __start___event_table;
++ int count = 0;
++ char* delta;
++ unsigned char* instr;
++
++ while (te < __stop___event_table) {
++ if (te->id == id && --te->count == 0) {
++ instr = (unsigned char*) te->start_addr;
++ if (*instr == BYTE_JUMP) {
++ delta = (((unsigned char*) te->start_addr) + 1);
++ *delta = te->end_addr - te->start_addr -
++ BYTE_JUMP_LEN;
++ }
++ }
++ if (te->id == id)
++ count++;
++ te++;
++ }
++
++ printk(KERN_DEBUG "ft_disable_event: disabled %d events\n", count);
++ return count;
++}
++
++int ft_disable_all_events(void)
++{
++ struct trace_event* te = __start___event_table;
++ int count = 0;
++ char* delta;
++ unsigned char* instr;
++
++ while (te < __stop___event_table) {
++ if (te->count) {
++ instr = (unsigned char*) te->start_addr;
++ if (*instr == BYTE_JUMP) {
++ delta = (((unsigned char*) te->start_addr)
++ + 1);
++ *delta = te->end_addr - te->start_addr -
++ BYTE_JUMP_LEN;
++ te->count = 0;
++ count++;
++ }
++ }
++ te++;
++ }
++ return count;
++}
++
++int ft_is_event_enabled(unsigned long id)
++{
++ struct trace_event* te = __start___event_table;
++
++ while (te < __stop___event_table) {
++ if (te->id == id)
++ return te->count;
++ te++;
++ }
++ return 0;
++}
++
++#endif
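For context, the management interface defined above is what the ftdev layer (added later in this patch) uses to turn events on and off from user space. The sketch below is not part of the patch; the event id is made up and the function only illustrates the enable/disable reference counting.

/* Illustrative usage sketch. */
#include <litmus/feather_trace.h>
#include <linux/kernel.h>

static void demo_toggle(void)
{
	int n = ft_enable_event(100);	/* patches every trace point with id 100 */
	printk(KERN_DEBUG "enabled %d trace points\n", n);

	if (ft_is_event_enabled(100))
		ft_disable_event(100);	/* restores the jump once the count reaches 0 */
}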
+diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
+index 0ed2d30..a760ce1 100644
+--- a/arch/x86/kernel/irqinit.c
++++ b/arch/x86/kernel/irqinit.c
+@@ -189,6 +189,9 @@ static void __init smp_intr_init(void)
+ alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
+ call_function_single_interrupt);
+
++ /* IPI for hrtimer pulling on remote cpus */
++ alloc_intr_gate(PULL_TIMERS_VECTOR, pull_timers_interrupt);
++
+ /* Low priority IPI to cleanup after moving an irq */
+ set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
+ set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
+diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
+index d801210..97af589 100644
+--- a/arch/x86/kernel/smp.c
++++ b/arch/x86/kernel/smp.c
+@@ -23,6 +23,9 @@
+ #include <linux/cpu.h>
+ #include <linux/gfp.h>
+
++#include <litmus/litmus.h>
++#include <litmus/trace.h>
++
+ #include <asm/mtrr.h>
+ #include <asm/tlbflush.h>
+ #include <asm/mmu_context.h>
+@@ -118,6 +121,7 @@ static void native_smp_send_reschedule(int cpu)
+ WARN_ON(1);
+ return;
+ }
++ TS_SEND_RESCHED_START(cpu);
+ apic->send_IPI_mask(cpumask_of(cpu), RESCHEDULE_VECTOR);
+ }
+
+@@ -147,6 +151,16 @@ void native_send_call_func_ipi(const struct cpumask *mask)
+ free_cpumask_var(allbutself);
+ }
+
++/* trigger timers on remote cpu */
++void smp_send_pull_timers(int cpu)
++{
++ if (unlikely(cpu_is_offline(cpu))) {
++ WARN_ON(1);
++ return;
++ }
++ apic->send_IPI_mask(cpumask_of(cpu), PULL_TIMERS_VECTOR);
++}
++
+ /*
+ * this function calls the 'stop' function on all other CPUs in the system.
+ */
+@@ -198,7 +212,12 @@ static void native_smp_send_stop(void)
+ void smp_reschedule_interrupt(struct pt_regs *regs)
+ {
+ ack_APIC_irq();
++ /* LITMUS^RT needs this interrupt to proper reschedule
++ * on this cpu
++ */
++ set_tsk_need_resched(current);
+ inc_irq_stat(irq_resched_count);
++ TS_SEND_RESCHED_END;
+ /*
+ * KVM uses this interrupt to force a cpu out of guest mode
+ */
+@@ -222,6 +241,15 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
+ irq_exit();
+ }
+
++extern void hrtimer_pull(void);
++
++void smp_pull_timers_interrupt(struct pt_regs *regs)
++{
++ ack_APIC_irq();
++ TRACE("pull timer interrupt\n");
++ hrtimer_pull();
++}
++
+ struct smp_ops smp_ops = {
+ .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
+ .smp_prepare_cpus = native_smp_prepare_cpus,
+diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
+index 8b37293..5da9a68 100644
+--- a/arch/x86/kernel/syscall_table_32.S
++++ b/arch/x86/kernel/syscall_table_32.S
+@@ -337,3 +337,17 @@ ENTRY(sys_call_table)
+ .long sys_rt_tgsigqueueinfo /* 335 */
+ .long sys_perf_event_open
+ .long sys_recvmmsg
++ .long sys_set_rt_task_param /* LITMUS^RT 338 */
++ .long sys_get_rt_task_param
++ .long sys_complete_job
++ .long sys_od_open
++ .long sys_od_close
++ .long sys_fmlp_down
++ .long sys_fmlp_up
++ .long sys_srp_down
++ .long sys_srp_up
++ .long sys_query_job_no
++ .long sys_wait_for_job_release
++ .long sys_wait_for_ts_release
++ .long sys_release_ts
++ .long sys_null_call
+diff --git a/fs/exec.c b/fs/exec.c
+index e6e94c6..0293087 100644
+--- a/fs/exec.c
++++ b/fs/exec.c
+@@ -19,7 +19,7 @@
+ * current->executable is only used by the procfs. This allows a dispatch
+ * table to check for several different types of binary formats. We keep
+ * trying until we recognize the file or we run out of supported binary
+- * formats.
++ * formats.
+ */
+
+ #include <linux/slab.h>
+@@ -56,6 +56,8 @@
+ #include <linux/fs_struct.h>
+ #include <linux/pipe_fs_i.h>
+
++#include <litmus/litmus.h>
++
+ #include <asm/uaccess.h>
+ #include <asm/mmu_context.h>
+ #include <asm/tlb.h>
+@@ -79,7 +81,7 @@ int __register_binfmt(struct linux_binfmt * fmt, int insert)
+ insert ? list_add(&fmt->lh, &formats) :
+ list_add_tail(&fmt->lh, &formats);
+ write_unlock(&binfmt_lock);
+- return 0;
++ return 0;
+ }
+
+ EXPORT_SYMBOL(__register_binfmt);
+@@ -1045,7 +1047,7 @@ void setup_new_exec(struct linux_binprm * bprm)
+ group */
+
+ current->self_exec_id++;
+-
++
+ flush_signal_handlers(current, 0);
+ flush_old_files(current->files);
+ }
+@@ -1135,8 +1137,8 @@ int check_unsafe_exec(struct linux_binprm *bprm)
+ return res;
+ }
+
+-/*
+- * Fill the binprm structure from the inode.
++/*
++ * Fill the binprm structure from the inode.
+ * Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
+ *
+ * This may be called multiple times for binary chains (scripts for example).
+@@ -1348,6 +1350,7 @@ int do_execve(char * filename,
+ goto out_unmark;
+
+ sched_exec();
++ litmus_exec();
+
+ bprm->file = file;
+ bprm->filename = filename;
+diff --git a/fs/inode.c b/fs/inode.c
+index 407bf39..aaaaf09 100644
+--- a/fs/inode.c
++++ b/fs/inode.c
+@@ -271,6 +271,8 @@ void inode_init_once(struct inode *inode)
+ #ifdef CONFIG_FSNOTIFY
+ INIT_HLIST_HEAD(&inode->i_fsnotify_mark_entries);
+ #endif
++ INIT_LIST_HEAD(&inode->i_obj_list);
++ mutex_init(&inode->i_obj_mutex);
+ }
+ EXPORT_SYMBOL(inode_init_once);
+
+diff --git a/include/linux/completion.h b/include/linux/completion.h
+index 4a6b604..258bec1 100644
+--- a/include/linux/completion.h
++++ b/include/linux/completion.h
+@@ -88,6 +88,7 @@ extern bool completion_done(struct completion *x);
+
+ extern void complete(struct completion *);
+ extern void complete_all(struct completion *);
++extern void complete_n(struct completion *, int n);
+
+ /**
+ * INIT_COMPLETION: - reinitialize a completion structure
+diff --git a/include/linux/fs.h b/include/linux/fs.h
+index 44f35ae..8949184 100644
+--- a/include/linux/fs.h
++++ b/include/linux/fs.h
+@@ -15,8 +15,8 @@
+ * nr_file rlimit, so it's safe to set up a ridiculously high absolute
+ * upper limit on files-per-process.
+ *
+- * Some programs (notably those using select()) may have to be
+- * recompiled to take full advantage of the new limits..
++ * Some programs (notably those using select()) may have to be
++ * recompiled to take full advantage of the new limits..
+ */
+
+ /* Fixed constants first: */
+@@ -173,7 +173,7 @@ struct inodes_stat_t {
+ #define SEL_EX 4
+
+ /* public flags for file_system_type */
+-#define FS_REQUIRES_DEV 1
++#define FS_REQUIRES_DEV 1
+ #define FS_BINARY_MOUNTDATA 2
+ #define FS_HAS_SUBTYPE 4
+ #define FS_REVAL_DOT 16384 /* Check the paths ".", ".." for staleness */
+@@ -471,7 +471,7 @@ struct iattr {
+ */
+ #include <linux/quota.h>
+
+-/**
++/**
+ * enum positive_aop_returns - aop return codes with specific semantics
+ *
+ * @AOP_WRITEPAGE_ACTIVATE: Informs the caller that page writeback has
+@@ -481,7 +481,7 @@ struct iattr {
+ * be a candidate for writeback again in the near
+ * future. Other callers must be careful to unlock
+ * the page if they get this return. Returned by
+- * writepage();
++ * writepage();
+ *
+ * @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked page has
+ * unlocked it and the page might have been truncated.
+@@ -720,6 +720,7 @@ static inline int mapping_writably_mapped(struct address_space *mapping)
+
+ struct posix_acl;
+ #define ACL_NOT_CACHED ((void *)(-1))
++struct inode_obj_id_table;
+
+ struct inode {
+ struct hlist_node i_hash;
+@@ -788,6 +789,8 @@ struct inode {
+ struct posix_acl *i_acl;
+ struct posix_acl *i_default_acl;
+ #endif
++ struct list_head i_obj_list;
++ struct mutex i_obj_mutex;
+ void *i_private; /* fs or device private pointer */
+ };
+
+@@ -1000,10 +1003,10 @@ static inline int file_check_writeable(struct file *filp)
+
+ #define MAX_NON_LFS ((1UL<<31) - 1)
+
+-/* Page cache limit. The filesystems should put that into their s_maxbytes
+- limits, otherwise bad things can happen in VM. */
++/* Page cache limit. The filesystems should put that into their s_maxbytes
++ limits, otherwise bad things can happen in VM. */
+ #if BITS_PER_LONG==32
+-#define MAX_LFS_FILESIZE (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1)
++#define MAX_LFS_FILESIZE (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1)
+ #elif BITS_PER_LONG==64
+ #define MAX_LFS_FILESIZE 0x7fffffffffffffffUL
+ #endif
+@@ -2129,7 +2132,7 @@ extern int may_open(struct path *, int, int);
+
+ extern int kernel_read(struct file *, loff_t, char *, unsigned long);
+ extern struct file * open_exec(const char *);
+-
++
+ /* fs/dcache.c -- generic fs support functions */
+ extern int is_subdir(struct dentry *, struct dentry *);
+ extern int path_is_under(struct path *, struct path *);
+diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
+index 5d86fb2..9470a9e 100644
+--- a/include/linux/hrtimer.h
++++ b/include/linux/hrtimer.h
+@@ -167,6 +167,7 @@ struct hrtimer_clock_base {
+ * @nr_retries: Total number of hrtimer interrupt retries
+ * @nr_hangs: Total number of hrtimer interrupt hangs
+ * @max_hang_time: Maximum time spent in hrtimer_interrupt
++ * @to_pull: LITMUS^RT list of timers to be pulled on this cpu
+ */
+ struct hrtimer_cpu_base {
+ raw_spinlock_t lock;
+@@ -180,8 +181,32 @@ struct hrtimer_cpu_base {
+ unsigned long nr_hangs;
+ ktime_t max_hang_time;
+ #endif
++ struct list_head to_pull;
+ };
+
++#ifdef CONFIG_ARCH_HAS_SEND_PULL_TIMERS
++
++#define HRTIMER_START_ON_INACTIVE 0
++#define HRTIMER_START_ON_QUEUED 1
++
++/*
++ * struct hrtimer_start_on_info - save timer info on remote cpu
++ * @list: list of hrtimer_start_on_info on remote cpu (to_pull)
++ * @timer: timer to be triggered on remote cpu
++ * @time: time event
++ * @mode: timer mode
++ * @state: activity flag
++ */
++struct hrtimer_start_on_info {
++ struct list_head list;
++ struct hrtimer *timer;
++ ktime_t time;
++ enum hrtimer_mode mode;
++ atomic_t state;
++};
++
++#endif
++
+ static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
+ {
+ timer->_expires = time;
+@@ -348,6 +373,13 @@ __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
+ unsigned long delta_ns,
+ const enum hrtimer_mode mode, int wakeup);
+
++#ifdef CONFIG_ARCH_HAS_SEND_PULL_TIMERS
++extern void hrtimer_start_on_info_init(struct hrtimer_start_on_info *info);
++extern int hrtimer_start_on(int cpu, struct hrtimer_start_on_info *info,
++ struct hrtimer *timer, ktime_t time,
++ const enum hrtimer_mode mode);
++#endif
++
+ extern int hrtimer_cancel(struct hrtimer *timer);
+ extern int hrtimer_try_to_cancel(struct hrtimer *timer);
+
+diff --git a/include/linux/sched.h b/include/linux/sched.h
+index 2b7b81d..225347d 100644
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -38,6 +38,7 @@
+ #define SCHED_BATCH 3
+ /* SCHED_ISO: reserved but not implemented yet */
+ #define SCHED_IDLE 5
++#define SCHED_LITMUS 6
+ /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
+ #define SCHED_RESET_ON_FORK 0x40000000
+
+@@ -94,6 +95,8 @@ struct sched_param {
+
+ #include <asm/processor.h>
+
++#include <litmus/rt_param.h>
++
+ struct exec_domain;
+ struct futex_pi_state;
+ struct robust_list_head;
+@@ -1166,6 +1169,7 @@ struct sched_rt_entity {
+ };
+
+ struct rcu_node;
++struct od_table_entry;
+
+ struct task_struct {
+ volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
+@@ -1250,9 +1254,9 @@ struct task_struct {
+ unsigned long stack_canary;
+ #endif
+
+- /*
++ /*
+ * pointers to (original) parent process, youngest child, younger sibling,
+- * older sibling, respectively. (p->father can be replaced with
++ * older sibling, respectively. (p->father can be replaced with
+ * p->real_parent->pid)
+ */
+ struct task_struct *real_parent; /* real parent process */
+@@ -1464,6 +1468,13 @@ struct task_struct {
+ int make_it_fail;
+ #endif
+ struct prop_local_single dirties;
++
++ /* LITMUS RT parameters and state */
++ struct rt_param rt_param;
++
++ /* references to PI semaphores, etc. */
++ struct od_table_entry *od_table;
++
+ #ifdef CONFIG_LATENCYTOP
+ int latency_record_count;
+ struct latency_record latency_record[LT_SAVECOUNT];
+@@ -2018,7 +2029,7 @@ static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, s
+ spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
+
+ return ret;
+-}
++}
+
+ extern void block_all_signals(int (*notifier)(void *priv), void *priv,
+ sigset_t *mask);
+diff --git a/include/linux/smp.h b/include/linux/smp.h
+index cfa2d20..f86d407 100644
+--- a/include/linux/smp.h
++++ b/include/linux/smp.h
+@@ -80,6 +80,11 @@ int smp_call_function_any(const struct cpumask *mask,
+ void (*func)(void *info), void *info, int wait);
+
+ /*
++ * sends a 'pull timer' event to a remote CPU
++ */
++extern void smp_send_pull_timers(int cpu);
++
++/*
+ * Generic and arch helpers
+ */
+ #ifdef CONFIG_USE_GENERIC_SMP_HELPERS
+diff --git a/include/linux/tick.h b/include/linux/tick.h
+index d2ae79e..25d0cf4 100644
+--- a/include/linux/tick.h
++++ b/include/linux/tick.h
+@@ -73,6 +73,11 @@ extern int tick_is_oneshot_available(void);
+ extern struct tick_device *tick_get_device(int cpu);
+
+ # ifdef CONFIG_HIGH_RES_TIMERS
++/* LITMUS^RT tick alignment */
++#define LINUX_DEFAULT_TICKS 0
++#define LITMUS_ALIGNED_TICKS 1
++#define LITMUS_STAGGERED_TICKS 2
++
+ extern int tick_init_highres(void);
+ extern int tick_program_event(ktime_t expires, int force);
+ extern void tick_setup_sched_timer(void);
+diff --git a/include/litmus/bheap.h b/include/litmus/bheap.h
+new file mode 100644
+index 0000000..cf4864a
+--- /dev/null
++++ b/include/litmus/bheap.h
+@@ -0,0 +1,77 @@
++/* bheaps.h -- Binomial Heaps
++ *
++ * (c) 2008, 2009 Bjoern Brandenburg
++ */
++
++#ifndef BHEAP_H
++#define BHEAP_H
++
++#define NOT_IN_HEAP UINT_MAX
++
++struct bheap_node {
++ struct bheap_node* parent;
++ struct bheap_node* next;
++ struct bheap_node* child;
++
++ unsigned int degree;
++ void* value;
++ struct bheap_node** ref;
++};
++
++struct bheap {
++ struct bheap_node* head;
++ /* We cache the minimum of the heap.
++ * This speeds up repeated peek operations.
++ */
++ struct bheap_node* min;
++};
++
++typedef int (*bheap_prio_t)(struct bheap_node* a, struct bheap_node* b);
++
++void bheap_init(struct bheap* heap);
++void bheap_node_init(struct bheap_node** ref_to_bheap_node_ptr, void* value);
++
++static inline int bheap_node_in_heap(struct bheap_node* h)
++{
++ return h->degree != NOT_IN_HEAP;
++}
++
++static inline int bheap_empty(struct bheap* heap)
++{
++ return heap->head == NULL && heap->min == NULL;
++}
++
++/* insert (and reinitialize) a node into the heap */
++void bheap_insert(bheap_prio_t higher_prio,
++ struct bheap* heap,
++ struct bheap_node* node);
++
++/* merge addition into target */
++void bheap_union(bheap_prio_t higher_prio,
++ struct bheap* target,
++ struct bheap* addition);
++
++struct bheap_node* bheap_peek(bheap_prio_t higher_prio,
++ struct bheap* heap);
++
++struct bheap_node* bheap_take(bheap_prio_t higher_prio,
++ struct bheap* heap);
++
++void bheap_uncache_min(bheap_prio_t higher_prio, struct bheap* heap);
++int bheap_decrease(bheap_prio_t higher_prio, struct bheap_node* node);
++
++void bheap_delete(bheap_prio_t higher_prio,
++ struct bheap* heap,
++ struct bheap_node* node);
++
++/* allocate from memcache */
++struct bheap_node* bheap_node_alloc(int gfp_flags);
++void bheap_node_free(struct bheap_node* hn);
++
++/* allocate a heap node for value and insert into the heap */
++int bheap_add(bheap_prio_t higher_prio, struct bheap* heap,
++ void* value, int gfp_flags);
++
++void* bheap_take_del(bheap_prio_t higher_prio,
++ struct bheap* heap);
++#endif
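As a usage note, the heap above is what the rt_domain ready queues are built on. None of the code below is part of the patch: the demo_job type, the ordering function, and the helpers are invented purely to show the API (a queue must first be set up with bheap_init()).

/* Illustrative API sketch. */
#include <litmus/bheap.h>
#include <linux/errno.h>

struct demo_job {
	unsigned long long deadline;
	struct bheap_node *hn;	/* back-reference kept up to date through ->ref */
};

static int demo_edf_order(struct bheap_node *a, struct bheap_node *b)
{
	struct demo_job *ja = a->value, *jb = b->value;
	return ja->deadline < jb->deadline;	/* non-zero: a has higher priority */
}

static int demo_enqueue(struct bheap *queue, struct demo_job *job, int gfp_flags)
{
	job->hn = bheap_node_alloc(gfp_flags);
	if (!job->hn)
		return -ENOMEM;
	bheap_node_init(&job->hn, job);		/* bind node and value */
	bheap_insert(demo_edf_order, queue, job->hn);
	return 0;
}

static struct demo_job *demo_dequeue(struct bheap *queue)
{
	struct bheap_node *hn = bheap_take(demo_edf_order, queue);
	return hn ? hn->value : NULL;
}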
+diff --git a/include/litmus/budget.h b/include/litmus/budget.h
+new file mode 100644
+index 0000000..732530e
+--- /dev/null
++++ b/include/litmus/budget.h
+@@ -0,0 +1,8 @@
++#ifndef _LITMUS_BUDGET_H_
++#define _LITMUS_BUDGET_H_
++
++/* Update the per-processor enforcement timer (arm/reprogram/cancel) for
++ * the next task. */
++void update_enforcement_timer(struct task_struct* t);
++
++#endif
+diff --git a/include/litmus/edf_common.h b/include/litmus/edf_common.h
+new file mode 100644
+index 0000000..80d4321
+--- /dev/null
++++ b/include/litmus/edf_common.h
+@@ -0,0 +1,27 @@
++/*
++ * EDF common data structures and utility functions shared by all EDF
++ * based scheduler plugins
++ */
++
++/* CLEANUP: Add comments and make it less messy.
++ *
++ */
++
++#ifndef __UNC_EDF_COMMON_H__
++#define __UNC_EDF_COMMON_H__
++
++#include <litmus/rt_domain.h>
++
++void edf_domain_init(rt_domain_t* rt, check_resched_needed_t resched,
++ release_jobs_t release);
++
++int edf_higher_prio(struct task_struct* first,
++ struct task_struct* second);
++
++int edf_ready_order(struct bheap_node* a, struct bheap_node* b);
++
++int edf_preemption_needed(rt_domain_t* rt, struct task_struct *t);
++
++int edf_set_hp_task(struct pi_semaphore *sem);
++int edf_set_hp_cpu_task(struct pi_semaphore *sem, int cpu);
++#endif
+diff --git a/include/litmus/fdso.h b/include/litmus/fdso.h
+new file mode 100644
+index 0000000..61f1b5b
+--- /dev/null
++++ b/include/litmus/fdso.h
+@@ -0,0 +1,70 @@
++/* fdso.h - file descriptor attached shared objects
++ *
++ * (c) 2007 B. Brandenburg, LITMUS^RT project
++ */
++
++#ifndef _LINUX_FDSO_H_
++#define _LINUX_FDSO_H_
++
++#include <linux/list.h>
++#include <asm/atomic.h>
++
++#include <linux/fs.h>
++#include <linux/slab.h>
++
++#define MAX_OBJECT_DESCRIPTORS 32
++
++typedef enum {
++ MIN_OBJ_TYPE = 0,
++
++ FMLP_SEM = 0,
++ SRP_SEM = 1,
++
++ MAX_OBJ_TYPE = 1
++} obj_type_t;
++
++struct inode_obj_id {
++ struct list_head list;
++ atomic_t count;
++ struct inode* inode;
++
++ obj_type_t type;
++ void* obj;
++ unsigned int id;
++};
++
++
++struct od_table_entry {
++ unsigned int used;
++
++ struct inode_obj_id* obj;
++ void* extra;
++};
++
++struct fdso_ops {
++ void* (*create) (void);
++ void (*destroy)(void*);
++ int (*open) (struct od_table_entry*, void* __user);
++ int (*close) (struct od_table_entry*);
++};
++
++/* translate a userspace supplied od into the raw table entry
++ * returns NULL if od is invalid
++ */
++struct od_table_entry* __od_lookup(int od);
++
++/* translate a userspace supplied od into the associated object
++ * returns NULL if od is invalid
++ */
++static inline void* od_lookup(int od, obj_type_t type)
++{
++ struct od_table_entry* e = __od_lookup(od);
++ return e && e->obj->type == type ? e->obj->obj : NULL;
++}
++
++#define lookup_fmlp_sem(od)((struct pi_semaphore*) od_lookup(od, FMLP_SEM))
++#define lookup_srp_sem(od) ((struct srp_semaphore*) od_lookup(od, SRP_SEM))
++#define lookup_ics(od) ((struct ics*) od_lookup(od, ICS_ID))
++
++
++#endif
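To show how these descriptors are meant to be consumed, here is a sketch of a system-call body that resolves a user-supplied od into its FMLP semaphore. It is not part of the patch; demo_fmlp_down() is a made-up name, and the real implementations are in the litmus/ files added by this patch (fdso.c, fmlp.c, litmus.c).

/* Illustrative lookup sketch. */
#include <litmus/fdso.h>
#include <linux/errno.h>

struct pi_semaphore;	/* opaque here; defined by the FMLP code in this patch */

long demo_fmlp_down(int od)
{
	struct pi_semaphore *sem = lookup_fmlp_sem(od);

	if (!sem)
		return -EINVAL;	/* od not open, or not attached to an FMLP_SEM */

	/* ... block on sem according to the FMLP protocol ... */
	return 0;
}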
+diff --git a/include/litmus/feather_buffer.h b/include/litmus/feather_buffer.h
+new file mode 100644
+index 0000000..6c18277
+--- /dev/null
++++ b/include/litmus/feather_buffer.h
+@@ -0,0 +1,94 @@
++#ifndef _FEATHER_BUFFER_H_
++#define _FEATHER_BUFFER_H_
++
++/* requires UINT_MAX and memcpy */
++
++#define SLOT_FREE 0
++#define SLOT_BUSY 1
++#define SLOT_READY 2
++
++struct ft_buffer {
++ unsigned int slot_count;
++ unsigned int slot_size;
++
++ int free_count;
++ unsigned int write_idx;
++ unsigned int read_idx;
++
++ char* slots;
++ void* buffer_mem;
++ unsigned int failed_writes;
++};
++
++static inline int init_ft_buffer(struct ft_buffer* buf,
++ unsigned int slot_count,
++ unsigned int slot_size,
++ char* slots,
++ void* buffer_mem)
++{
++ int i = 0;
++ if (!slot_count || UINT_MAX % slot_count != slot_count - 1) {
++ /* The slot count must divide UINT_MAX + 1 so that when it
++ * wraps around the index correctly points to 0.
++ */
++ return 0;
++ } else {
++ buf->slot_count = slot_count;
++ buf->slot_size = slot_size;
++ buf->slots = slots;
++ buf->buffer_mem = buffer_mem;
++ buf->free_count = slot_count;
++ buf->write_idx = 0;
++ buf->read_idx = 0;
++ buf->failed_writes = 0;
++ for (i = 0; i < slot_count; i++)
++ buf->slots[i] = SLOT_FREE;
++ return 1;
++ }
++}
++
++static inline int ft_buffer_start_write(struct ft_buffer* buf, void **ptr)
++{
++ int free = fetch_and_dec(&buf->free_count);
++ unsigned int idx;
++ if (free <= 0) {
++ fetch_and_inc(&buf->free_count);
++ *ptr = 0;
++ fetch_and_inc(&buf->failed_writes);
++ return 0;
++ } else {
++ idx = fetch_and_inc((int*) &buf->write_idx) % buf->slot_count;
++ buf->slots[idx] = SLOT_BUSY;
++ *ptr = ((char*) buf->buffer_mem) + idx * buf->slot_size;
++ return 1;
++ }
++}
++
++static inline void ft_buffer_finish_write(struct ft_buffer* buf, void *ptr)
++{
++ unsigned int idx = ((char*) ptr - (char*) buf->buffer_mem) / buf->slot_size;
++ buf->slots[idx] = SLOT_READY;
++}
++
++
++/* exclusive reader access is assumed */
++static inline int ft_buffer_read(struct ft_buffer* buf, void* dest)
++{
++ unsigned int idx;
++ if (buf->free_count == buf->slot_count)
++ /* nothing available */
++ return 0;
++ idx = buf->read_idx % buf->slot_count;
++ if (buf->slots[idx] == SLOT_READY) {
++ memcpy(dest, ((char*) buf->buffer_mem) + idx * buf->slot_size,
++ buf->slot_size);
++ buf->slots[idx] = SLOT_FREE;
++ buf->read_idx++;
++ fetch_and_inc(&buf->free_count);
++ return 1;
++ } else
++ return 0;
++}
++
++
++#endif
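To make the intended producer/consumer use concrete, here is a sketch that is not part of the patch; the record type and the 128-slot sizing are invented. Note that slot_count must divide UINT_MAX + 1 (hence a power of two) and that fetch_and_inc()/fetch_and_dec() are expected to come from litmus/feather_trace.h.

/* Illustrative producer/consumer sketch. */
#include <litmus/feather_trace.h>	/* fetch_and_inc(), fetch_and_dec() */
#include <litmus/feather_buffer.h>
#include <linux/kernel.h>		/* UINT_MAX */
#include <linux/string.h>		/* memcpy() */

struct demo_rec {
	unsigned long long timestamp;
	unsigned int cpu;
};

#define DEMO_SLOTS 128	/* power of two so the 32-bit index wraps cleanly */

static char demo_flags[DEMO_SLOTS];
static struct demo_rec demo_mem[DEMO_SLOTS];
static struct ft_buffer demo_buf;

static void demo_setup(void)
{
	init_ft_buffer(&demo_buf, DEMO_SLOTS, sizeof(struct demo_rec),
		       demo_flags, demo_mem);
}

/* producer side, e.g. called from a trace callback */
static void demo_log(unsigned long long ts, unsigned int cpu)
{
	struct demo_rec *slot;
	if (ft_buffer_start_write(&demo_buf, (void **) &slot)) {
		slot->timestamp = ts;
		slot->cpu = cpu;
		ft_buffer_finish_write(&demo_buf, slot);
	} /* else: record dropped, failed_writes was incremented */
}

/* single-consumer side, e.g. the ftdev read() path */
static int demo_drain(struct demo_rec *dest)
{
	return ft_buffer_read(&demo_buf, dest);
}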
+diff --git a/include/litmus/feather_trace.h b/include/litmus/feather_trace.h
+new file mode 100644
+index 0000000..028dfb2
+--- /dev/null
++++ b/include/litmus/feather_trace.h
+@@ -0,0 +1,65 @@
++#ifndef _FEATHER_TRACE_H_
++#define _FEATHER_TRACE_H_
++
++#include <asm/atomic.h>
++
++int ft_enable_event(unsigned long id);
++int ft_disable_event(unsigned long id);
++int ft_is_event_enabled(unsigned long id);
++int ft_disable_all_events(void);
++
++/* atomic_* functions are inline anyway */
++static inline int fetch_and_inc(int *val)
++{
++ return atomic_add_return(1, (atomic_t*) val) - 1;
++}
++
++static inline int fetch_and_dec(int *val)
++{
++ return atomic_sub_return(1, (atomic_t*) val) + 1;
++}
++
++/* Don't use rewriting implementation if kernel text pages are read-only.
++ * Ftrace gets around this by using the identity mapping, but that's more
++ * effort than is warranted right now for Feather-Trace.
++ * Eventually, it may make sense to replace Feather-Trace with ftrace.
++ */
++#if defined(CONFIG_ARCH_HAS_FEATHER_TRACE) && !defined(CONFIG_DEBUG_RODATA)
++
++#include <asm/feather_trace.h>
++
++#else /* !__ARCH_HAS_FEATHER_TRACE */
++
++/* provide default implementation */
++
++#include <asm/timex.h> /* for get_cycles() */
++
++static inline unsigned long long ft_timestamp(void)
++{
++ return get_cycles();
++}
++
++#define feather_callback
++
++#define MAX_EVENTS 1024
++
++extern int ft_events[MAX_EVENTS];
++
++#define ft_event(id, callback) \
++ if (ft_events[id]) callback();
++
++#define ft_event0(id, callback) \
++ if (ft_events[id]) callback(id);
++
++#define ft_event1(id, callback, param) \
++ if (ft_events[id]) callback(id, param);
++
++#define ft_event2(id, callback, param, param2) \
++ if (ft_events[id]) callback(id, param, param2);
++
++#define ft_event3(id, callback, p, p2, p3) \
++ if (ft_events[id]) callback(id, p, p2, p3);
++
++#endif /* __ARCH_HAS_FEATHER_TRACE */
++
++#endif
+diff --git a/include/litmus/ftdev.h b/include/litmus/ftdev.h
+new file mode 100644
+index 0000000..7697b46
+--- /dev/null
++++ b/include/litmus/ftdev.h
+@@ -0,0 +1,49 @@
++#ifndef _LITMUS_FTDEV_H_
++#define _LITMUS_FTDEV_H_
++
++#include <litmus/feather_trace.h>
++#include <litmus/feather_buffer.h>
++#include <linux/mutex.h>
++#include <linux/cdev.h>
++
++#define MAX_FTDEV_MINORS NR_CPUS
++
++#define FTDEV_ENABLE_CMD 0
++#define FTDEV_DISABLE_CMD 1
++
++struct ftdev;
++
++/* return 0 if buffer can be opened, otherwise -$REASON */
++typedef int (*ftdev_can_open_t)(struct ftdev* dev, unsigned int buf_no);
++/* return 0 on success, otherwise -$REASON */
++typedef int (*ftdev_alloc_t)(struct ftdev* dev, unsigned int buf_no);
++typedef void (*ftdev_free_t)(struct ftdev* dev, unsigned int buf_no);
++
++
++struct ftdev_event;
++
++struct ftdev_minor {
++ struct ft_buffer* buf;
++ unsigned int readers;
++ struct mutex lock;
++ /* FIXME: filter for authorized events */
++ struct ftdev_event* events;
++};
++
++struct ftdev {
++ struct cdev cdev;
++ /* FIXME: don't waste memory, allocate dynamically */
++ struct ftdev_minor minor[MAX_FTDEV_MINORS];
++ unsigned int minor_cnt;
++ ftdev_alloc_t alloc;
++ ftdev_free_t free;
++ ftdev_can_open_t can_open;
++};
++
++struct ft_buffer* alloc_ft_buffer(unsigned int count, size_t size);
++void free_ft_buffer(struct ft_buffer* buf);
++
++void ftdev_init(struct ftdev* ftdev, struct module* owner);
++int register_ftdev(struct ftdev* ftdev, const char* name, int major);
++
++#endif
+diff --git a/include/litmus/jobs.h b/include/litmus/jobs.h
+new file mode 100644
+index 0000000..9bd361e
+--- /dev/null
++++ b/include/litmus/jobs.h
+@@ -0,0 +1,9 @@
++#ifndef __LITMUS_JOBS_H__
++#define __LITMUS_JOBS_H__
++
++void prepare_for_next_period(struct task_struct *t);
++void release_at(struct task_struct *t, lt_t start);
++long complete_job(void);
++
++#endif
++
+diff --git a/include/litmus/litmus.h b/include/litmus/litmus.h
+new file mode 100644
+index 0000000..5d20276
+--- /dev/null
++++ b/include/litmus/litmus.h
+@@ -0,0 +1,267 @@
++/*
++ * Constant definitions related to
++ * scheduling policy.
++ */
++
++#ifndef _LINUX_LITMUS_H_
++#define _LINUX_LITMUS_H_
++
++#include <linux/jiffies.h>
++#include <litmus/sched_trace.h>
++
++#ifdef CONFIG_RELEASE_MASTER
++extern atomic_t release_master_cpu;
++#endif
++
++extern atomic_t __log_seq_no;
++
++#define TRACE(fmt, args...) \
++ sched_trace_log_message("%d P%d: " fmt, atomic_add_return(1, &__log_seq_no), \
++ raw_smp_processor_id(), ## args)
++
++#define TRACE_TASK(t, fmt, args...) \
++ TRACE("(%s/%d) " fmt, (t)->comm, (t)->pid, ##args)
++
++#define TRACE_CUR(fmt, args...) \
++ TRACE_TASK(current, fmt, ## args)
++
++#define TRACE_BUG_ON(cond) \
++ do { if (cond) TRACE("BUG_ON(%s) at %s:%d " \
++ "called from %p current=%s/%d state=%d " \
++ "flags=%x partition=%d cpu=%d rtflags=%d"\
++ " job=%u timeslice=%u\n", \
++ #cond, __FILE__, __LINE__, __builtin_return_address(0), current->comm, \
++ current->pid, current->state, current->flags, \
++ get_partition(current), smp_processor_id(), get_rt_flags(current), \
++ current->rt_param.job_params.job_no, \
++ current->rt.time_slice\
++ ); } while(0);
++
++
++/* in_list - is a given list_head queued on some list?
++ */
++static inline int in_list(struct list_head* list)
++{
++ return !( /* case 1: deleted */
++ (list->next == LIST_POISON1 &&
++ list->prev == LIST_POISON2)
++ ||
++ /* case 2: initialized */
++ (list->next == list &&
++ list->prev == list)
++ );
++}
++
++#define NO_CPU 0xffffffff
++
++void litmus_fork(struct task_struct *tsk);
++void litmus_exec(void);
++/* clean up real-time state of a task */
++void exit_litmus(struct task_struct *dead_tsk);
++
++long litmus_admit_task(struct task_struct *tsk);
++void litmus_exit_task(struct task_struct *tsk);
++
++#define is_realtime(t) ((t)->policy == SCHED_LITMUS)
++#define rt_transition_pending(t) \
++ ((t)->rt_param.transition_pending)
++
++#define tsk_rt(t) (&(t)->rt_param)
++
++/* Realtime utility macros */
++#define get_rt_flags(t) (tsk_rt(t)->flags)
++#define set_rt_flags(t,f) (tsk_rt(t)->flags=(f))
++#define get_exec_cost(t) (tsk_rt(t)->task_params.exec_cost)
++#define get_exec_time(t) (tsk_rt(t)->job_params.exec_time)
++#define get_rt_period(t) (tsk_rt(t)->task_params.period)
++#define get_rt_phase(t) (tsk_rt(t)->task_params.phase)
++#define get_partition(t) (tsk_rt(t)->task_params.cpu)
++#define get_deadline(t) (tsk_rt(t)->job_params.deadline)
++#define get_release(t) (tsk_rt(t)->job_params.release)
++#define get_class(t) (tsk_rt(t)->task_params.cls)
++
++inline static int budget_exhausted(struct task_struct* t)
++{
++ return get_exec_time(t) >= get_exec_cost(t);
++}
++
++inline static lt_t budget_remaining(struct task_struct* t)
++{
++ if (!budget_exhausted(t))
++ return get_exec_cost(t) - get_exec_time(t);
++ else
++ /* avoid overflow */
++ return 0;
++}
++
++#define budget_enforced(t) (tsk_rt(t)->task_params.budget_policy != NO_ENFORCEMENT)
++
++#define budget_precisely_enforced(t) (tsk_rt(t)->task_params.budget_policy \
++ == PRECISE_ENFORCEMENT)
++
++#define is_hrt(t) \
++ (tsk_rt(t)->task_params.cls == RT_CLASS_HARD)
++#define is_srt(t) \
++ (tsk_rt(t)->task_params.cls == RT_CLASS_SOFT)
++#define is_be(t) \
++ (tsk_rt(t)->task_params.cls == RT_CLASS_BEST_EFFORT)
++
++/* Our notion of time within LITMUS: kernel monotonic time. */
++static inline lt_t litmus_clock(void)
++{
++ return ktime_to_ns(ktime_get());
++}
++
++/* A macro to convert from nanoseconds to ktime_t. */
++#define ns_to_ktime(t) ktime_add_ns(ktime_set(0, 0), t)
++
++#define get_domain(t) (tsk_rt(t)->domain)
++
++/* Honor the flag in the preempt_count variable that is set
++ * when scheduling is in progress.
++ */
++#define is_running(t) \
++ ((t)->state == TASK_RUNNING || \
++ task_thread_info(t)->preempt_count & PREEMPT_ACTIVE)
++
++#define is_blocked(t) \
++ (!is_running(t))
++#define is_released(t, now) \
++ (lt_before_eq(get_release(t), now))
++#define is_tardy(t, now) \
++ (lt_before_eq(tsk_rt(t)->job_params.deadline, now))
++
++/* real-time comparison macros */
++#define earlier_deadline(a, b) (lt_before(\
++ (a)->rt_param.job_params.deadline,\
++ (b)->rt_param.job_params.deadline))
++#define earlier_release(a, b) (lt_before(\
++ (a)->rt_param.job_params.release,\
++ (b)->rt_param.job_params.release))
++
++void preempt_if_preemptable(struct task_struct* t, int on_cpu);
++
++#ifdef CONFIG_SRP
++void srp_ceiling_block(void);
++#else
++#define srp_ceiling_block() /* nothing */
++#endif
++
++#define bheap2task(hn) ((struct task_struct*) hn->value)
++
++#ifdef CONFIG_NP_SECTION
++
++static inline int is_kernel_np(struct task_struct *t)
++{
++ return tsk_rt(t)->kernel_np;
++}
++
++static inline int is_user_np(struct task_struct *t)
++{
++ return tsk_rt(t)->ctrl_page ? tsk_rt(t)->ctrl_page->np_flag : 0;
++}
++
++static inline void request_exit_np(struct task_struct *t)
++{
++ if (is_user_np(t)) {
++ /* Set the flag that tells user space to call
++ * into the kernel at the end of a critical section. */
++ if (likely(tsk_rt(t)->ctrl_page)) {
++ TRACE_TASK(t, "setting delayed_preemption flag\n");
++ tsk_rt(t)->ctrl_page->delayed_preemption = 1;
++ }
++ }
++}
++
++static inline void clear_exit_np(struct task_struct *t)
++{
++ if (likely(tsk_rt(t)->ctrl_page))
++ tsk_rt(t)->ctrl_page->delayed_preemption = 0;
++}
++
++static inline void make_np(struct task_struct *t)
++{
++ tsk_rt(t)->kernel_np++;
++}
++
++/* Caller should check if preemption is necessary when
++ * the function return 0.
++ */
++static inline int take_np(struct task_struct *t)
++{
++ return --tsk_rt(t)->kernel_np;
++}
++
++#else
++
++static inline int is_kernel_np(struct task_struct* t)
++{
++ return 0;
++}
++
++static inline int is_user_np(struct task_struct* t)
++{
++ return 0;
++}
++
++static inline void request_exit_np(struct task_struct *t)
++{
++ /* request_exit_np() shouldn't be called if !CONFIG_NP_SECTION */
++ BUG();
++}
++
++static inline void clear_exit_np(struct task_struct* t)
++{
++}
++
++#endif
++
++static inline int is_np(struct task_struct *t)
++{
++#ifdef CONFIG_SCHED_DEBUG_TRACE
++ int kernel, user;
++ kernel = is_kernel_np(t);
++ user = is_user_np(t);
++ if (kernel || user)
++ TRACE_TASK(t, " is non-preemptive: kernel=%d user=%d\n",
++
++ kernel, user);
++ return kernel || user;
++#else
++ return unlikely(is_kernel_np(t) || is_user_np(t));
++#endif
++}
++
++static inline int is_present(struct task_struct* t)
++{
++ return t && tsk_rt(t)->present;
++}
++
++
++/* make the unit explicit */
++typedef unsigned long quanta_t;
++
++enum round {
++ FLOOR,
++ CEIL
++};
++
++
++/* Tick period is used to convert ns-specified execution
++ * costs and periods into tick-based equivalents.
++ */
++extern ktime_t tick_period;
++
++static inline quanta_t time2quanta(lt_t time, enum round round)
++{
++ s64 quantum_length = ktime_to_ns(tick_period);
++
++ if (do_div(time, quantum_length) && round == CEIL)
++ time++;
++ return (quanta_t) time;
++}
++
++/* By how much is cpu staggered behind CPU 0? */
++u64 cpu_stagger_offset(int cpu);
++
++#endif
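The helpers above are what the plugins later in this patch use on their scheduling paths; as one concrete but hypothetical example, a quantum-based budget check might look like the sketch below. demo_tick() is not part of the patch.

/* Illustrative budget-check sketch. */
#include <linux/sched.h>
#include <litmus/litmus.h>

static void demo_tick(struct task_struct *t)
{
	if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) {
		/* The job has consumed its exec_cost; under QUANTUM_ENFORCEMENT
		 * this is only noticed here, on a quantum boundary. */
		TRACE_TASK(t, "budget exhausted, %llu ns over\n",
			   get_exec_time(t) - get_exec_cost(t));
		set_tsk_need_resched(t);
	}
}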
+diff --git a/include/litmus/rt_domain.h b/include/litmus/rt_domain.h
+new file mode 100644
+index 0000000..ac24929
+--- /dev/null
++++ b/include/litmus/rt_domain.h
+@@ -0,0 +1,182 @@
++/* CLEANUP: Add comments and make it less messy.
++ *
++ */
++
++#ifndef __UNC_RT_DOMAIN_H__
++#define __UNC_RT_DOMAIN_H__
++
++#include <litmus/bheap.h>
++
++#define RELEASE_QUEUE_SLOTS 127 /* prime */
++
++struct _rt_domain;
++
++typedef int (*check_resched_needed_t)(struct _rt_domain *rt);
++typedef void (*release_jobs_t)(struct _rt_domain *rt, struct bheap* tasks);
++
++struct release_queue {
++ /* each slot maintains a list of release heaps sorted
++ * by release time */
++ struct list_head slot[RELEASE_QUEUE_SLOTS];
++};
++
++typedef struct _rt_domain {
++ /* runnable rt tasks are in here */
++ raw_spinlock_t ready_lock;
++ struct bheap ready_queue;
++
++ /* real-time tasks waiting for release are in here */
++ raw_spinlock_t release_lock;
++ struct release_queue release_queue;
++
++#ifdef CONFIG_RELEASE_MASTER
++ int release_master;
++#endif
++
++ /* for moving tasks to the release queue */
++ raw_spinlock_t tobe_lock;
++ struct list_head tobe_released;
++
++ /* how do we check if we need to kick another CPU? */
++ check_resched_needed_t check_resched;
++
++ /* how do we release jobs? */
++ release_jobs_t release_jobs;
++
++ /* how are tasks ordered in the ready queue? */
++ bheap_prio_t order;
++} rt_domain_t;
++
++struct release_heap {
++ /* list_head for per-time-slot list */
++ struct list_head list;
++ lt_t release_time;
++ /* all tasks to be released at release_time */
++ struct bheap heap;
++ /* used to trigger the release */
++ struct hrtimer timer;
++
++#ifdef CONFIG_RELEASE_MASTER
++ /* used to delegate releases */
++ struct hrtimer_start_on_info info;
++#endif
++ /* required for the timer callback */
++ rt_domain_t* dom;
++};
++
++
++static inline struct task_struct* __next_ready(rt_domain_t* rt)
++{
++ struct bheap_node *hn = bheap_peek(rt->order, &rt->ready_queue);
++ if (hn)
++ return bheap2task(hn);
++ else
++ return NULL;
++}
++
++void rt_domain_init(rt_domain_t *rt, bheap_prio_t order,
++ check_resched_needed_t check,
++ release_jobs_t release);
++
++void __add_ready(rt_domain_t* rt, struct task_struct *new);
++void __merge_ready(rt_domain_t* rt, struct bheap *tasks);
++void __add_release(rt_domain_t* rt, struct task_struct *task);
++
++static inline struct task_struct* __take_ready(rt_domain_t* rt)
++{
++ struct bheap_node* hn = bheap_take(rt->order, &rt->ready_queue);
++ if (hn)
++ return bheap2task(hn);
++ else
++ return NULL;
++}
++
++static inline struct task_struct* __peek_ready(rt_domain_t* rt)
++{
++ struct bheap_node* hn = bheap_peek(rt->order, &rt->ready_queue);
++ if (hn)
++ return bheap2task(hn);
++ else
++ return NULL;
++}
++
++static inline int is_queued(struct task_struct *t)
++{
++ BUG_ON(!tsk_rt(t)->heap_node);
++ return bheap_node_in_heap(tsk_rt(t)->heap_node);
++}
++
++static inline void remove(rt_domain_t* rt, struct task_struct *t)
++{
++ bheap_delete(rt->order, &rt->ready_queue, tsk_rt(t)->heap_node);
++}
++
++static inline void add_ready(rt_domain_t* rt, struct task_struct *new)
++{
++ unsigned long flags;
++ /* first we need the write lock for rt_ready_queue */
++ raw_spin_lock_irqsave(&rt->ready_lock, flags);
++ __add_ready(rt, new);
++ raw_spin_unlock_irqrestore(&rt->ready_lock, flags);
++}
++
++static inline void merge_ready(rt_domain_t* rt, struct bheap* tasks)
++{
++ unsigned long flags;
++ raw_spin_lock_irqsave(&rt->ready_lock, flags);
++ __merge_ready(rt, tasks);
++ raw_spin_unlock_irqrestore(&rt->ready_lock, flags);
++}
++
++static inline struct task_struct* take_ready(rt_domain_t* rt)
++{
++ unsigned long flags;
++ struct task_struct* ret;
++ /* first we need the write lock for rt_ready_queue */
++ raw_spin_lock_irqsave(&rt->ready_lock, flags);
++ ret = __take_ready(rt);
++ raw_spin_unlock_irqrestore(&rt->ready_lock, flags);
++ return ret;
++}
++
++
++static inline void add_release(rt_domain_t* rt, struct task_struct *task)
++{
++ unsigned long flags;
++ raw_spin_lock_irqsave(&rt->tobe_lock, flags);
++ __add_release(rt, task);
++ raw_spin_unlock_irqrestore(&rt->tobe_lock, flags);
++}
++
++#ifdef CONFIG_RELEASE_MASTER
++void __add_release_on(rt_domain_t* rt, struct task_struct *task,
++ int target_cpu);
++
++static inline void add_release_on(rt_domain_t* rt,
++ struct task_struct *task,
++ int target_cpu)
++{
++ unsigned long flags;
++ raw_spin_lock_irqsave(&rt->tobe_lock, flags);
++ __add_release_on(rt, task, target_cpu);
++ raw_spin_unlock_irqrestore(&rt->tobe_lock, flags);
++}
++#endif
++
++static inline int __jobs_pending(rt_domain_t* rt)
++{
++ return !bheap_empty(&rt->ready_queue);
++}
++
++static inline int jobs_pending(rt_domain_t* rt)
++{
++ unsigned long flags;
++ int ret;
++ /* first we need the write lock for rt_ready_queue */
++ raw_spin_lock_irqsave(&rt->ready_lock, flags);
++ ret = !bheap_empty(&rt->ready_queue);
++ raw_spin_unlock_irqrestore(&rt->ready_lock, flags);
++ return ret;
++}
++
++#endif
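Putting the pieces together, a plugin typically owns one rt_domain_t (global, per cluster, or per processor) and wires it up roughly as sketched below. None of this is part of the patch: the demo_* callbacks are stubs, and edf_domain_init() comes from the edf_common.h header added above.

/* Illustrative wiring sketch. */
#include <linux/sched.h>
#include <litmus/rt_domain.h>
#include <litmus/edf_common.h>
#include <litmus/litmus.h>

static rt_domain_t demo_domain;

static int demo_check_resched(rt_domain_t *rt)
{
	/* called when a newly added job may require a preemption */
	return 0;
}

static void demo_release_jobs(rt_domain_t *rt, struct bheap *tasks)
{
	/* a release timer fired: move the released jobs to the ready queue */
	merge_ready(rt, tasks);
}

static void demo_init(void)
{
	edf_domain_init(&demo_domain, demo_check_resched, demo_release_jobs);
}

static void demo_job_arrival(struct task_struct *t)
{
	if (is_released(t, litmus_clock()))
		add_ready(&demo_domain, t);	/* deadline-ordered ready queue */
	else
		add_release(&demo_domain, t);	/* armed via a release_heap timer */
}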
+diff --git a/include/litmus/rt_param.h b/include/litmus/rt_param.h
+new file mode 100644
+index 0000000..a7a183f
+--- /dev/null
++++ b/include/litmus/rt_param.h
+@@ -0,0 +1,196 @@
++/*
++ * Definition of the scheduler plugin interface.
++ *
++ */
++#ifndef _LINUX_RT_PARAM_H_
++#define _LINUX_RT_PARAM_H_
++
++/* Litmus time type. */
++typedef unsigned long long lt_t;
++
++static inline int lt_after(lt_t a, lt_t b)
++{
++ return ((long long) b) - ((long long) a) < 0;
++}
++#define lt_before(a, b) lt_after(b, a)
++
++static inline int lt_after_eq(lt_t a, lt_t b)
++{
++ return ((long long) a) - ((long long) b) >= 0;
++}
++#define lt_before_eq(a, b) lt_after_eq(b, a)
++
++/* different types of clients */
++typedef enum {
++ RT_CLASS_HARD,
++ RT_CLASS_SOFT,
++ RT_CLASS_BEST_EFFORT
++} task_class_t;
++
++typedef enum {
++ NO_ENFORCEMENT, /* job may overrun unhindered */
++ QUANTUM_ENFORCEMENT, /* budgets are only checked on quantum boundaries */
++ PRECISE_ENFORCEMENT /* NOT IMPLEMENTED - enforced with hrtimers */
++} budget_policy_t;
++
++struct rt_task {
++ lt_t exec_cost;
++ lt_t period;
++ lt_t phase;
++ unsigned int cpu;
++ task_class_t cls;
++ budget_policy_t budget_policy; /* ignored by pfair */
++};
++
++/* The definition of the data that is shared between the kernel and real-time
++ * tasks via a shared page (see litmus/ctrldev.c).
++ *
++ * WARNING: User space can write to this, so don't trust
++ * the correctness of the fields!
++ *
++ * This serves two purposes: to enable efficient signaling
++ * of non-preemptive sections (user->kernel) and
++ * delayed preemptions (kernel->user), and to export
++ * some real-time relevant statistics such as preemption and
++ * migration data to user space. We can't use a device to export
++ * statistics because we want to avoid system call overhead when
++ * determining preemption/migration overheads.
++ */
++struct control_page {
++ /* Is the task currently in a non-preemptive section? */
++ int np_flag;
++ /* Should the task call into the kernel when it leaves
++ * its non-preemptive section? */
++ int delayed_preemption;
++
++ /* to be extended */
++};
++
++/* don't export internal data structures to user space (liblitmus) */
++#ifdef __KERNEL__
++
++struct _rt_domain;
++struct bheap_node;
++struct release_heap;
++
++struct rt_job {
++ /* Time instant the job was or will be released. */
++ lt_t release;
++ /* What is the current deadline? */
++ lt_t deadline;
++
++ /* How much service has this job received so far? */
++ lt_t exec_time;
++
++ /* Which job is this? This is used to let user space
++ * specify which job to wait for, which is important if jobs
++ * overrun. If we just call sys_sleep_next_period() then we
++ * will unintentionally miss jobs after an overrun.
++ *
++ * Increase this sequence number when a job is released.
++ */
++ unsigned int job_no;
++};
++
++struct pfair_param;
++
++/* RT task parameters for scheduling extensions
++ * These parameters are inherited during clone and therefore must
++ * be explicitly set up before the task set is launched.
++ */
++struct rt_param {
++ /* is the task sleeping? */
++ unsigned int flags:8;
++
++ /* do we need to check for srp blocking? */
++ unsigned int srp_non_recurse:1;
++
++ /* is the task present? (true if it can be scheduled) */
++ unsigned int present:1;
++
++ /* user controlled parameters */
++ struct rt_task task_params;
++
++ /* timing parameters */
++ struct rt_job job_params;
++
++ /* task representing the current "inherited" task
++ * priority, assigned by inherit_priority and
++ * return_priority in the scheduler plugins.
++ * Could point to self if PI does not result in
++ * an increased task priority.
++ */
++ struct task_struct* inh_task;
++
++#ifdef CONFIG_NP_SECTION
++ /* For the FMLP under PSN-EDF, it is required to make the task
++ * non-preemptive from kernel space. In order not to interfere with
++ * user space, this counter indicates the kernel space np setting.
++ * kernel_np > 0 => task is non-preemptive
++ */
++ unsigned int kernel_np;
++#endif
++
++ /* This field can be used by plugins to store where the task
++ * is currently scheduled. It is the responsibility of the
++ * plugin to avoid race conditions.
++ *
++ * This is used by GSN-EDF and PFAIR.
++ */
++ volatile int scheduled_on;
++
++ /* Is the stack of the task currently in use? This is updated by
++ * the LITMUS core.
++ *
++ * Be careful to avoid deadlocks!
++ */
++ volatile int stack_in_use;
++
++ /* This field can be used by plugins to store where the task
++ * is currently linked. It is the responsibility of the plugin
++ * to avoid race conditions.
++ *
++ * Used by GSN-EDF.
++ */
++ volatile int linked_on;
++
++ /* PFAIR/PD^2 state. Allocated on demand. */
++ struct pfair_param* pfair;
++
++ /* Fields saved before BE->RT transition.
++ */
++ int old_policy;
++ int old_prio;
++
++ /* ready queue for this task */
++ struct _rt_domain* domain;
++
++ /* heap element for this task
++ *
++ * Warning: Don't statically allocate this node. The heap
++ * implementation swaps these between tasks, thus after
++ * dequeuing from a heap you may end up with a different node
++ * than the one you had when enqueuing the task. For the same
++ * reason, don't obtain and store references to this node
++ * other than this pointer (which is updated by the heap
++ * implementation).
++ */
++ struct bheap_node* heap_node;
++ struct release_heap* rel_heap;
++
++ /* Used by rt_domain to queue task in release list.
++ */
++ struct list_head list;
++
++ /* Pointer to the page shared between userspace and kernel. */
++ struct control_page * ctrl_page;
++};
++
++/* Possible RT flags */
++#define RT_F_RUNNING 0x00000000
++#define RT_F_SLEEP 0x00000001
++#define RT_F_EXIT_SEM 0x00000008
++
++#endif
++
++#endif
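
The control_page handshake described above is easiest to see from the user-space side. The following is a minimal, hypothetical sketch (not part of this patch) of how a task could mark a non-preemptive section and honor a delayed preemption. It assumes struct control_page (as defined above) is visible to user space and that ctrl points at the page mapped via litmus/ctrldev.c (introduced further below); sched_yield() stands in for whatever kernel entry point the plugin expects, which this excerpt does not pin down.

#include <sched.h>              /* sched_yield() */

static void np_enter(volatile struct control_page *ctrl)
{
        ctrl->np_flag = 1;              /* kernel: please do not preempt me */
        __sync_synchronize();           /* publish the flag before the critical section */
}

static void np_exit(volatile struct control_page *ctrl)
{
        ctrl->np_flag = 0;
        __sync_synchronize();
        if (ctrl->delayed_preemption)
                /* the kernel deferred a preemption while we were non-preemptive;
                 * give the scheduler a chance to act now (placeholder call) */
                sched_yield();
}
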
+diff --git a/include/litmus/sched_plugin.h b/include/litmus/sched_plugin.h
+new file mode 100644
+index 0000000..9c1c9f2
+--- /dev/null
++++ b/include/litmus/sched_plugin.h
+@@ -0,0 +1,162 @@
++/*
++ * Definition of the scheduler plugin interface.
++ *
++ */
++#ifndef _LINUX_SCHED_PLUGIN_H_
++#define _LINUX_SCHED_PLUGIN_H_
++
++#include
++
++/* struct for semaphore with priority inheritance */
++struct pi_semaphore {
++ atomic_t count;
++ int sleepers;
++ wait_queue_head_t wait;
++ struct {
++ /* highest-prio holder/waiter */
++ struct task_struct *task;
++ struct task_struct* cpu_task[NR_CPUS];
++ } hp;
++ /* current lock holder */
++ struct task_struct *holder;
++};
++
++/************************ setup/tear down ********************/
++
++typedef long (*activate_plugin_t) (void);
++typedef long (*deactivate_plugin_t) (void);
++
++
++
++/********************* scheduler invocation ******************/
++
++/* Plugin-specific realtime tick handler */
++typedef void (*scheduler_tick_t) (struct task_struct *cur);
++/* Plugin-specific scheduling decision function */
++typedef struct task_struct* (*schedule_t)(struct task_struct * prev);
++/* Clean up after the task switch has occurred.
++ * This function is called after every (even non-rt) task switch.
++ */
++typedef void (*finish_switch_t)(struct task_struct *prev);
++
++
++/********************* task state changes ********************/
++
++/* Called to setup a new real-time task.
++ * Release the first job, enqueue, etc.
++ * Task may already be running.
++ */
++typedef void (*task_new_t) (struct task_struct *task,
++ int on_rq,
++ int running);
++
++/* Called to re-introduce a task after blocking.
++ * Can potentially be called multiple times.
++ */
++typedef void (*task_wake_up_t) (struct task_struct *task);
++/* called to notify the plugin of a blocking real-time task
++ * it will only be called for real-time tasks and before schedule is called */
++typedef void (*task_block_t) (struct task_struct *task);
++/* Called when a real-time task exits or changes to a different scheduling
++ * class.
++ * Free any allocated resources
++ */
++typedef void (*task_exit_t) (struct task_struct *);
++
++/* Called when the new_owner is released from the wait queue.
++ * It should now inherit the priority from sem, _before_ it gets
++ * re-added to any queue.
++ */
++typedef long (*inherit_priority_t) (struct pi_semaphore *sem,
++ struct task_struct *new_owner);
++
++/* Called when the current task releases a semaphore from which it might
++ * have inherited a priority.
++ */
++typedef long (*return_priority_t) (struct pi_semaphore *sem);
++
++/* Called when a task tries to acquire a semaphore and fails. Check if its
++ * priority is higher than that of the current holder.
++ */
++typedef long (*pi_block_t) (struct pi_semaphore *sem, struct task_struct *t);
++
++
++
++
++/********************* sys call backends ********************/
++/* This function causes the caller to sleep until the next release */
++typedef long (*complete_job_t) (void);
++
++typedef long (*admit_task_t)(struct task_struct* tsk);
++
++typedef void (*release_at_t)(struct task_struct *t, lt_t start);
++
++struct sched_plugin {
++ struct list_head list;
++ /* basic info */
++ char *plugin_name;
++
++ /* setup */
++ activate_plugin_t activate_plugin;
++ deactivate_plugin_t deactivate_plugin;
++
++#ifdef CONFIG_SRP
++ unsigned int srp_active;
++#endif
++
++ /* scheduler invocation */
++ scheduler_tick_t tick;
++ schedule_t schedule;
++ finish_switch_t finish_switch;
++
++ /* syscall backend */
++ complete_job_t complete_job;
++ release_at_t release_at;
++
++ /* task state changes */
++ admit_task_t admit_task;
++
++ task_new_t task_new;
++ task_wake_up_t task_wake_up;
++ task_block_t task_block;
++ task_exit_t task_exit;
++
++#ifdef CONFIG_FMLP
++ /* priority inheritance */
++ unsigned int fmlp_active;
++ inherit_priority_t inherit_priority;
++ return_priority_t return_priority;
++ pi_block_t pi_block;
++#endif
++} __attribute__ ((__aligned__(SMP_CACHE_BYTES)));
++
++
++extern struct sched_plugin *litmus;
++
++/* cluster size: cache_index = 2 means L2-based clusters, cache_index = 3 means L3-based clusters */
++extern int cluster_cache_index;
++
++int register_sched_plugin(struct sched_plugin* plugin);
++struct sched_plugin* find_sched_plugin(const char* name);
++int print_sched_plugins(char* buf, int max);
++
++static inline int srp_active(void)
++{
++#ifdef CONFIG_SRP
++ return litmus->srp_active;
++#else
++ return 0;
++#endif
++}
++static inline int fmlp_active(void)
++{
++#ifdef CONFIG_FMLP
++ return litmus->fmlp_active;
++#else
++ return 0;
++#endif
++}
++
++extern struct sched_plugin linux_sched_plugin;
++
++#endif
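
To make the callback table concrete, here is a minimal, hypothetical sketch (not taken from this patch) of how a plugin might populate struct sched_plugin and register itself. The callback bodies are placeholders; a real plugin must implement actual scheduling logic and also provide the remaining callbacks (complete_job, task_new, etc.) that the LITMUS^RT core invokes.

#include <linux/module.h>
#include <litmus/sched_plugin.h>

static void demo_tick(struct task_struct *cur) { /* no-op */ }
static void demo_finish_switch(struct task_struct *prev) { /* no-op */ }

static struct task_struct* demo_schedule(struct task_struct *prev)
{
        /* nothing to schedule: let the stock Linux classes pick a task */
        return NULL;
}

static long demo_admit_task(struct task_struct *tsk)
{
        /* accept every task; a real plugin would validate rt_params here */
        return 0;
}

static struct sched_plugin demo_plugin = {
        .plugin_name   = "DEMO",
        .tick          = demo_tick,
        .schedule      = demo_schedule,
        .finish_switch = demo_finish_switch,
        .admit_task    = demo_admit_task,
        /* remaining callbacks omitted for brevity */
};

static int __init init_demo_plugin(void)
{
        return register_sched_plugin(&demo_plugin);
}
module_init(init_demo_plugin);
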
+diff --git a/include/litmus/sched_trace.h b/include/litmus/sched_trace.h
+new file mode 100644
+index 0000000..e1b0c97
+--- /dev/null
++++ b/include/litmus/sched_trace.h
+@@ -0,0 +1,192 @@
++/*
++ * sched_trace.h -- record scheduler events to a byte stream for offline analysis.
++ */
++#ifndef _LINUX_SCHED_TRACE_H_
++#define _LINUX_SCHED_TRACE_H_
++
++/* all times in nanoseconds */
++
++struct st_trace_header {
++ u8 type; /* Of what type is this record? */
++ u8 cpu; /* On which CPU was it recorded? */
++ u16 pid; /* PID of the task. */
++ u32 job; /* The job sequence number. */
++};
++
++#define ST_NAME_LEN 16
++struct st_name_data {
++ char cmd[ST_NAME_LEN];/* The name of the executable of this process. */
++};
++
++struct st_param_data { /* regular params */
++ u32 wcet;
++ u32 period;
++ u32 phase;
++ u8 partition;
++ u8 __unused[3];
++};
++
++struct st_release_data { /* A job was/is going to be released. */
++ u64 release; /* What's the release time? */
++ u64 deadline; /* By when must it finish? */
++};
++
++struct st_assigned_data { /* A job was assigned to a CPU. */
++ u64 when;
++ u8 target; /* Where should it execute? */
++ u8 __unused[3];
++};
++
++struct st_switch_to_data { /* A process was switched to on a given CPU. */
++ u64 when; /* When did this occur? */
++ u32 exec_time; /* Time the current job has executed. */
++
++};
++
++struct st_switch_away_data { /* A process was switched away from on a given CPU. */
++ u64 when;
++ u64 exec_time;
++};
++
++struct st_completion_data { /* A job completed. */
++ u64 when;
++ u8 forced:1; /* Set to 1 if job overran and kernel advanced to the
++ * next task automatically; set to 0 otherwise.
++ */
++ u8 __uflags:7;
++ u8 __unused[3];
++};
++
++struct st_block_data { /* A task blocks. */
++ u64 when;
++ u64 __unused;
++};
++
++struct st_resume_data { /* A task resumes. */
++ u64 when;
++ u64 __unused;
++};
++
++struct st_sys_release_data {
++ u64 when;
++ u64 release;
++};
++
++#define DATA(x) struct st_ ## x ## _data x;
++
++typedef enum {
++ ST_NAME = 1, /* Start at one, so that we can spot
++ * uninitialized records. */
++ ST_PARAM,
++ ST_RELEASE,
++ ST_ASSIGNED,
++ ST_SWITCH_TO,
++ ST_SWITCH_AWAY,
++ ST_COMPLETION,
++ ST_BLOCK,
++ ST_RESUME,
++ ST_SYS_RELEASE,
++} st_event_record_type_t;
++
++struct st_event_record {
++ struct st_trace_header hdr;
++ union {
++ u64 raw[2];
++
++ DATA(name);
++ DATA(param);
++ DATA(release);
++ DATA(assigned);
++ DATA(switch_to);
++ DATA(switch_away);
++ DATA(completion);
++ DATA(block);
++ DATA(resume);
++ DATA(sys_release);
++
++ } data;
++};
++
++#undef DATA
++
++#ifdef __KERNEL__
++
++#include
++#include
++
++#ifdef CONFIG_SCHED_TASK_TRACE
++
++#define SCHED_TRACE(id, callback, task) \
++ ft_event1(id, callback, task)
++#define SCHED_TRACE2(id, callback, task, xtra) \
++ ft_event2(id, callback, task, xtra)
++
++/* provide prototypes; needed on sparc64 */
++#ifndef NO_TASK_TRACE_DECLS
++feather_callback void do_sched_trace_task_name(unsigned long id,
++ struct task_struct* task);
++feather_callback void do_sched_trace_task_param(unsigned long id,
++ struct task_struct* task);
++feather_callback void do_sched_trace_task_release(unsigned long id,
++ struct task_struct* task);
++feather_callback void do_sched_trace_task_switch_to(unsigned long id,
++ struct task_struct* task);
++feather_callback void do_sched_trace_task_switch_away(unsigned long id,
++ struct task_struct* task);
++feather_callback void do_sched_trace_task_completion(unsigned long id,
++ struct task_struct* task,
++ unsigned long forced);
++feather_callback void do_sched_trace_task_block(unsigned long id,
++ struct task_struct* task);
++feather_callback void do_sched_trace_task_resume(unsigned long id,
++ struct task_struct* task);
++feather_callback void do_sched_trace_sys_release(unsigned long id,
++ lt_t* start);
++#endif
++
++#else
++
++#define SCHED_TRACE(id, callback, task) /* no tracing */
++#define SCHED_TRACE2(id, callback, task, xtra) /* no tracing */
++
++#endif
++
++
++#define SCHED_TRACE_BASE_ID 500
++
++
++#define sched_trace_task_name(t) \
++ SCHED_TRACE(SCHED_TRACE_BASE_ID + 1, do_sched_trace_task_name, t)
++#define sched_trace_task_param(t) \
++ SCHED_TRACE(SCHED_TRACE_BASE_ID + 2, do_sched_trace_task_param, t)
++#define sched_trace_task_release(t) \
++ SCHED_TRACE(SCHED_TRACE_BASE_ID + 3, do_sched_trace_task_release, t)
++#define sched_trace_task_switch_to(t) \
++ SCHED_TRACE(SCHED_TRACE_BASE_ID + 4, do_sched_trace_task_switch_to, t)
++#define sched_trace_task_switch_away(t) \
++ SCHED_TRACE(SCHED_TRACE_BASE_ID + 5, do_sched_trace_task_switch_away, t)
++#define sched_trace_task_completion(t, forced) \
++ SCHED_TRACE2(SCHED_TRACE_BASE_ID + 6, do_sched_trace_task_completion, t, \
++ (unsigned long) forced)
++#define sched_trace_task_block(t) \
++ SCHED_TRACE(SCHED_TRACE_BASE_ID + 7, do_sched_trace_task_block, t)
++#define sched_trace_task_resume(t) \
++ SCHED_TRACE(SCHED_TRACE_BASE_ID + 8, do_sched_trace_task_resume, t)
++/* when is a pointer, it does not need an explicit cast to unsigned long */
++#define sched_trace_sys_release(when) \
++ SCHED_TRACE(SCHED_TRACE_BASE_ID + 9, do_sched_trace_sys_release, when)
++
++#define sched_trace_quantum_boundary() /* NOT IMPLEMENTED */
++
++#ifdef CONFIG_SCHED_DEBUG_TRACE
++void sched_trace_log_message(const char* fmt, ...);
++void dump_trace_buffer(int max);
++#else
++
++#define sched_trace_log_message(fmt, ...)
++
++#endif
++
++#endif /* __KERNEL__ */
++
++#endif
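
For orientation, a hypothetical user-space decoder (not part of this patch) that walks an array of records laid out as above and prints ST_RELEASE events. It mirrors the struct layout with <stdint.h> types and assumes the records were read verbatim from the Feather-Trace device; the real ft_tools utilities do this more thoroughly.

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

struct trace_header { uint8_t type; uint8_t cpu; uint16_t pid; uint32_t job; };
struct event_record { struct trace_header hdr; uint64_t data[2]; };

#define ST_RELEASE 3   /* third entry of st_event_record_type_t above */

static void print_releases(const struct event_record *rec, size_t n)
{
        size_t i;
        for (i = 0; i < n; i++)
                if (rec[i].hdr.type == ST_RELEASE)
                        /* data[0] = release time, data[1] = deadline (ns) */
                        printf("pid=%u job=%u release=%llu deadline=%llu\n",
                               rec[i].hdr.pid, rec[i].hdr.job,
                               (unsigned long long) rec[i].data[0],
                               (unsigned long long) rec[i].data[1]);
}
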
+diff --git a/include/litmus/trace.h b/include/litmus/trace.h
+new file mode 100644
+index 0000000..b32c711
+--- /dev/null
++++ b/include/litmus/trace.h
+@@ -0,0 +1,113 @@
++#ifndef _SYS_TRACE_H_
++#define _SYS_TRACE_H_
++
++#ifdef CONFIG_SCHED_OVERHEAD_TRACE
++
++#include
++#include
++
++
++/*********************** TIMESTAMPS ************************/
++
++enum task_type_marker {
++ TSK_BE,
++ TSK_RT,
++ TSK_UNKNOWN
++};
++
++struct timestamp {
++ uint64_t timestamp;
++ uint32_t seq_no;
++ uint8_t cpu;
++ uint8_t event;
++ uint8_t task_type;
++};
++
++/* tracing callbacks */
++feather_callback void save_timestamp(unsigned long event);
++feather_callback void save_timestamp_def(unsigned long event, unsigned long type);
++feather_callback void save_timestamp_task(unsigned long event, unsigned long t_ptr);
++feather_callback void save_timestamp_cpu(unsigned long event, unsigned long cpu);
++
++
++#define TIMESTAMP(id) ft_event0(id, save_timestamp)
++
++#define DTIMESTAMP(id, def) ft_event1(id, save_timestamp_def, (unsigned long) def)
++
++#define TTIMESTAMP(id, task) \
++ ft_event1(id, save_timestamp_task, (unsigned long) task)
++
++#define CTIMESTAMP(id, cpu) \
++ ft_event1(id, save_timestamp_cpu, (unsigned long) cpu)
++
++#else /* !CONFIG_SCHED_OVERHEAD_TRACE */
++
++#define TIMESTAMP(id) /* no tracing */
++
++#define DTIMESTAMP(id, def) /* no tracing */
++
++#define TTIMESTAMP(id, task) /* no tracing */
++
++#define CTIMESTAMP(id, cpu) /* no tracing */
++
++#endif
++
++
++/* Convention for timestamps
++ * =========================
++ *
++ * In order to process the trace files with a common tool, we use the following
++ * convention to measure execution times: The end time id of a code segment is
++ * always the next number after the start time event id.
++ */
++
++#define TS_SCHED_START DTIMESTAMP(100, TSK_UNKNOWN) /* we only
++ * care
++ * about
++ * next */
++#define TS_SCHED_END(t) TTIMESTAMP(101, t)
++#define TS_SCHED2_START(t) TTIMESTAMP(102, t)
++#define TS_SCHED2_END(t) TTIMESTAMP(103, t)
++
++#define TS_CXS_START(t) TTIMESTAMP(104, t)
++#define TS_CXS_END(t) TTIMESTAMP(105, t)
++
++#define TS_RELEASE_START DTIMESTAMP(106, TSK_RT)
++#define TS_RELEASE_END DTIMESTAMP(107, TSK_RT)
++
++#define TS_TICK_START(t) TTIMESTAMP(110, t)
++#define TS_TICK_END(t) TTIMESTAMP(111, t)
++
++
++#define TS_PLUGIN_SCHED_START /* TIMESTAMP(120) */ /* currently unused */
++#define TS_PLUGIN_SCHED_END /* TIMESTAMP(121) */
++
++#define TS_PLUGIN_TICK_START /* TIMESTAMP(130) */
++#define TS_PLUGIN_TICK_END /* TIMESTAMP(131) */
++
++#define TS_ENTER_NP_START TIMESTAMP(140)
++#define TS_ENTER_NP_END TIMESTAMP(141)
++
++#define TS_EXIT_NP_START TIMESTAMP(150)
++#define TS_EXIT_NP_END TIMESTAMP(151)
++
++#define TS_SRP_UP_START TIMESTAMP(160)
++#define TS_SRP_UP_END TIMESTAMP(161)
++#define TS_SRP_DOWN_START TIMESTAMP(162)
++#define TS_SRP_DOWN_END TIMESTAMP(163)
++
++#define TS_PI_UP_START TIMESTAMP(170)
++#define TS_PI_UP_END TIMESTAMP(171)
++#define TS_PI_DOWN_START TIMESTAMP(172)
++#define TS_PI_DOWN_END TIMESTAMP(173)
++
++#define TS_FIFO_UP_START TIMESTAMP(180)
++#define TS_FIFO_UP_END TIMESTAMP(181)
++#define TS_FIFO_DOWN_START TIMESTAMP(182)
++#define TS_FIFO_DOWN_END TIMESTAMP(183)
++
++#define TS_SEND_RESCHED_START(c) CTIMESTAMP(190, c)
++#define TS_SEND_RESCHED_END DTIMESTAMP(191, TSK_UNKNOWN)
++
++
++#endif /* !_SYS_TRACE_H_ */
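
The start/end pairing convention above (end event id = start event id + 1) is what offline tools rely on. A simplified, hypothetical sketch (not from this patch) of summing the lengths of one overhead type, e.g. event id 100 (TS_SCHED_START) paired with 101 (TS_SCHED_END); a real tool would additionally filter by CPU and sequence number.

#include <stdint.h>
#include <stddef.h>

/* user-space mirror of struct timestamp above */
struct ts { uint64_t timestamp; uint32_t seq_no; uint8_t cpu; uint8_t event; uint8_t task_type; };

static uint64_t total_overhead(const struct ts *t, size_t n, uint8_t start_id)
{
        uint64_t total = 0, start = 0;
        int have_start = 0;
        size_t i;

        for (i = 0; i < n; i++) {
                if (t[i].event == start_id) {
                        start = t[i].timestamp;
                        have_start = 1;
                } else if (have_start && t[i].event == (uint8_t)(start_id + 1)) {
                        total += t[i].timestamp - start;
                        have_start = 0;
                }
        }
        return total;
}
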
+diff --git a/include/litmus/unistd_32.h b/include/litmus/unistd_32.h
+new file mode 100644
+index 0000000..dbddc65
+--- /dev/null
++++ b/include/litmus/unistd_32.h
+@@ -0,0 +1,23 @@
++/*
++ * included from arch/x86/include/asm/unistd_32.h
++ *
++ * LITMUS^RT syscalls with "relative" numbers
++ */
++#define __LSC(x) (__NR_LITMUS + x)
++
++#define __NR_set_rt_task_param __LSC(0)
++#define __NR_get_rt_task_param __LSC(1)
++#define __NR_complete_job __LSC(2)
++#define __NR_od_open __LSC(3)
++#define __NR_od_close __LSC(4)
++#define __NR_fmlp_down __LSC(5)
++#define __NR_fmlp_up __LSC(6)
++#define __NR_srp_down __LSC(7)
++#define __NR_srp_up __LSC(8)
++#define __NR_query_job_no __LSC(9)
++#define __NR_wait_for_job_release __LSC(10)
++#define __NR_wait_for_ts_release __LSC(11)
++#define __NR_release_ts __LSC(12)
++#define __NR_null_call __LSC(13)
++
++#define NR_litmus_syscalls 14
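
As a rough illustration (not part of this patch), a LITMUS^RT system call can be issued directly by number from user space; liblitmus normally wraps these. This assumes the header above is visible to user space and that __NR_LITMUS is supplied by the architecture's unistd header, so __NR_complete_job resolves to __NR_LITMUS + 2.

#include <unistd.h>
#include <sys/syscall.h>
#include <litmus/unistd_32.h>   /* assumption: exported to user space */

static long complete_job(void)
{
        /* block until the next job release (complete_job syscall backend) */
        return syscall(__NR_complete_job);
}
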
+diff --git a/include/litmus/unistd_64.h b/include/litmus/unistd_64.h
+new file mode 100644
+index 0000000..f0618e7
+--- /dev/null
++++ b/include/litmus/unistd_64.h
+@@ -0,0 +1,37 @@
++/*
++ * included from arch/x86/include/asm/unistd_64.h
++ *
++ * LITMUS^RT syscalls with "relative" numbers
++ */
++#define __LSC(x) (__NR_LITMUS + x)
++
++#define __NR_set_rt_task_param __LSC(0)
++__SYSCALL(__NR_set_rt_task_param, sys_set_rt_task_param)
++#define __NR_get_rt_task_param __LSC(1)
++__SYSCALL(__NR_get_rt_task_param, sys_get_rt_task_param)
++#define __NR_complete_job __LSC(2)
++__SYSCALL(__NR_complete_job, sys_complete_job)
++#define __NR_od_open __LSC(3)
++__SYSCALL(__NR_od_open, sys_od_open)
++#define __NR_od_close __LSC(4)
++__SYSCALL(__NR_od_close, sys_od_close)
++#define __NR_fmlp_down __LSC(5)
++__SYSCALL(__NR_fmlp_down, sys_fmlp_down)
++#define __NR_fmlp_up __LSC(6)
++__SYSCALL(__NR_fmlp_up, sys_fmlp_up)
++#define __NR_srp_down __LSC(7)
++__SYSCALL(__NR_srp_down, sys_srp_down)
++#define __NR_srp_up __LSC(8)
++__SYSCALL(__NR_srp_up, sys_srp_up)
++#define __NR_query_job_no __LSC(9)
++__SYSCALL(__NR_query_job_no, sys_query_job_no)
++#define __NR_wait_for_job_release __LSC(10)
++__SYSCALL(__NR_wait_for_job_release, sys_wait_for_job_release)
++#define __NR_wait_for_ts_release __LSC(11)
++__SYSCALL(__NR_wait_for_ts_release, sys_wait_for_ts_release)
++#define __NR_release_ts __LSC(12)
++__SYSCALL(__NR_release_ts, sys_release_ts)
++#define __NR_null_call __LSC(13)
++__SYSCALL(__NR_null_call, sys_null_call)
++
++#define NR_litmus_syscalls 14
+diff --git a/kernel/exit.c b/kernel/exit.c
+index 7f2683a..256ce8c 100644
+--- a/kernel/exit.c
++++ b/kernel/exit.c
+@@ -57,6 +57,8 @@
+ #include
+ #include "cred-internals.h"
+
++extern void exit_od_table(struct task_struct *t);
++
+ static void exit_mm(struct task_struct * tsk);
+
+ static void __unhash_process(struct task_struct *p)
+@@ -968,6 +970,8 @@ NORET_TYPE void do_exit(long code)
+ if (unlikely(tsk->audit_context))
+ audit_free(tsk);
+
++ exit_od_table(tsk);
++
+ tsk->exit_code = code;
+ taskstats_exit(tsk, group_dead);
+
+diff --git a/kernel/fork.c b/kernel/fork.c
+index 4c14942..166eb78 100644
+--- a/kernel/fork.c
++++ b/kernel/fork.c
+@@ -75,6 +75,9 @@
+
+ #include
+
++#include
++#include
++
+ /*
+ * Protected counters by write_lock_irq(&tasklist_lock)
+ */
+@@ -171,6 +174,7 @@ void __put_task_struct(struct task_struct *tsk)
+ WARN_ON(atomic_read(&tsk->usage));
+ WARN_ON(tsk == current);
+
++ exit_litmus(tsk);
+ exit_creds(tsk);
+ delayacct_tsk_free(tsk);
+
+@@ -253,6 +257,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
+
+ tsk->stack = ti;
+
++ /* Don't let the new task be a real-time task. */
++ litmus_fork(tsk);
++
+ err = prop_local_init_single(&tsk->dirties);
+ if (err)
+ goto out;
+diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
+index 0086628..fdf9596 100644
+--- a/kernel/hrtimer.c
++++ b/kernel/hrtimer.c
+@@ -46,6 +46,8 @@
+ #include
+ #include
+
++#include
++
+ #include
+
+ #include
+@@ -1041,6 +1043,98 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
+ }
+ EXPORT_SYMBOL_GPL(hrtimer_start);
+
++#ifdef CONFIG_ARCH_HAS_SEND_PULL_TIMERS
++
++/**
++ * hrtimer_start_on_info_init - Initialize hrtimer_start_on_info
++ */
++void hrtimer_start_on_info_init(struct hrtimer_start_on_info *info)
++{
++ memset(info, 0, sizeof(struct hrtimer_start_on_info));
++ atomic_set(&info->state, HRTIMER_START_ON_INACTIVE);
++}
++
++/**
++ * hrtimer_pull - PULL_TIMERS_VECTOR callback on remote cpu
++ */
++void hrtimer_pull(void)
++{
++ struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
++ struct hrtimer_start_on_info *info;
++ struct list_head *pos, *safe, list;
++
++ raw_spin_lock(&base->lock);
++ list_replace_init(&base->to_pull, &list);
++ raw_spin_unlock(&base->lock);
++
++ list_for_each_safe(pos, safe, &list) {
++ info = list_entry(pos, struct hrtimer_start_on_info, list);
++ TRACE("pulled timer 0x%x\n", info->timer);
++ list_del(pos);
++ hrtimer_start(info->timer, info->time, info->mode);
++ }
++}
++
++/**
++ * hrtimer_start_on - trigger timer arming on remote cpu
++ * @cpu: remote cpu
++ * @info: save timer information for enqueuing on remote cpu
++ * @timer: timer to be pulled
++ * @time: expire time
++ * @mode: timer mode
++ */
++int hrtimer_start_on(int cpu, struct hrtimer_start_on_info* info,
++ struct hrtimer *timer, ktime_t time,
++ const enum hrtimer_mode mode)
++{
++ unsigned long flags;
++ struct hrtimer_cpu_base* base;
++ int in_use = 0, was_empty;
++
++ /* serialize access to info through the timer base */
++ lock_hrtimer_base(timer, &flags);
++
++ in_use = (atomic_read(&info->state) != HRTIMER_START_ON_INACTIVE);
++ if (!in_use) {
++ INIT_LIST_HEAD(&info->list);
++ info->timer = timer;
++ info->time = time;
++ info->mode = mode;
++ /* mark as in use */
++ atomic_set(&info->state, HRTIMER_START_ON_QUEUED);
++ }
++
++ unlock_hrtimer_base(timer, &flags);
++
++ if (!in_use) {
++ /* initiate pull */
++ preempt_disable();
++ if (cpu == smp_processor_id()) {
++ /* start timer locally; we may get called
++ * with rq->lock held, do not wake up anything
++ */
++ TRACE("hrtimer_start_on: starting on local CPU\n");
++ __hrtimer_start_range_ns(info->timer, info->time,
++ 0, info->mode, 0);
++ } else {
++ TRACE("hrtimer_start_on: pulling to remote CPU\n");
++ base = &per_cpu(hrtimer_bases, cpu);
++ raw_spin_lock_irqsave(&base->lock, flags);
++ was_empty = list_empty(&base->to_pull);
++ list_add(&info->list, &base->to_pull);
++ raw_spin_unlock_irqrestore(&base->lock, flags);
++ if (was_empty)
++ /* only send an IPI if no one else
++ * has done so already
++ */
++ smp_send_pull_timers(cpu);
++ }
++ preempt_enable();
++ }
++ return in_use;
++}
++
++#endif
+
+ /**
+ * hrtimer_try_to_cancel - try to deactivate a timer
+@@ -1631,6 +1725,7 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
+ cpu_base->clock_base[i].cpu_base = cpu_base;
+
+ hrtimer_init_hres(cpu_base);
++ INIT_LIST_HEAD(&cpu_base->to_pull);
+ }
+
+ #ifdef CONFIG_HOTPLUG_CPU
+diff --git a/kernel/printk.c b/kernel/printk.c
+index 75077ad..ee54355 100644
+--- a/kernel/printk.c
++++ b/kernel/printk.c
+@@ -71,6 +71,13 @@ int console_printk[4] = {
+ };
+
+ /*
++ * divert printk() messages when there is a LITMUS^RT debug listener
++ */
++#include
++int trace_override = 0;
++int trace_recurse = 0;
++
++/*
+ * Low level drivers may need that to know if they can schedule in
+ * their unblank() callback or not. So let's export it.
+ */
+@@ -708,6 +715,9 @@ asmlinkage int vprintk(const char *fmt, va_list args)
+ /* Emit the output into the temporary buffer */
+ printed_len += vscnprintf(printk_buf + printed_len,
+ sizeof(printk_buf) - printed_len, fmt, args);
++ /* if LITMUS^RT tracer is active divert printk() msgs */
++ if (trace_override && !trace_recurse)
++ TRACE("%s", printk_buf);
+
+
+ p = printk_buf;
+@@ -777,7 +787,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
+ * Try to acquire and then immediately release the
+ * console semaphore. The release will do all the
+ * actual magic (print out buffers, wake up klogd,
+- * etc).
++ * etc).
+ *
+ * The acquire_console_semaphore_for_printk() function
+ * will release 'logbuf_lock' regardless of whether it
+@@ -1014,7 +1024,7 @@ int printk_needs_cpu(int cpu)
+
+ void wake_up_klogd(void)
+ {
+- if (waitqueue_active(&log_wait))
++ if (!trace_override && waitqueue_active(&log_wait))
+ __raw_get_cpu_var(printk_pending) = 1;
+ }
+
+diff --git a/kernel/sched.c b/kernel/sched.c
+index 3c2a54f..5e3c509 100644
+--- a/kernel/sched.c
++++ b/kernel/sched.c
+@@ -78,6 +78,9 @@
+
+ #include "sched_cpupri.h"
+
++#include
++#include
++
+ #define CREATE_TRACE_POINTS
+ #include
+
+@@ -450,6 +453,12 @@ struct rt_rq {
+ #endif
+ };
+
++/* Litmus related fields in a runqueue */
++struct litmus_rq {
++ unsigned long nr_running;
++ struct task_struct *prev;
++};
++
+ #ifdef CONFIG_SMP
+
+ /*
+@@ -512,6 +521,7 @@ struct rq {
+
+ struct cfs_rq cfs;
+ struct rt_rq rt;
++ struct litmus_rq litmus;
+
+ #ifdef CONFIG_FAIR_GROUP_SCHED
+ /* list of leaf cfs_rq on this cpu: */
+@@ -1833,7 +1843,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
+
+ static const struct sched_class rt_sched_class;
+
+-#define sched_class_highest (&rt_sched_class)
++#define sched_class_highest (&litmus_sched_class)
+ #define for_each_class(class) \
+ for (class = sched_class_highest; class; class = class->next)
+
+@@ -1932,6 +1942,7 @@ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep)
+ #include "sched_idletask.c"
+ #include "sched_fair.c"
+ #include "sched_rt.c"
++#include "../litmus/sched_litmus.c"
+ #ifdef CONFIG_SCHED_DEBUG
+ # include "sched_debug.c"
+ #endif
+@@ -2372,6 +2383,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
+ unsigned long flags;
+ struct rq *rq;
+
++ if (is_realtime(p))
++ TRACE_TASK(p, "try_to_wake_up() state:%d\n", p->state);
++
+ if (!sched_feat(SYNC_WAKEUPS))
+ wake_flags &= ~WF_SYNC;
+
+@@ -2390,7 +2404,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
+ orig_cpu = cpu;
+
+ #ifdef CONFIG_SMP
+- if (unlikely(task_running(rq, p)))
++ if (unlikely(task_running(rq, p)) || is_realtime(p))
+ goto out_activate;
+
+ /*
+@@ -2497,6 +2511,8 @@ out_running:
+ }
+ #endif
+ out:
++ if (is_realtime(p))
++ TRACE_TASK(p, "try_to_wake_up() done state:%d\n", p->state);
+ task_rq_unlock(rq, &flags);
+ put_cpu();
+
+@@ -2814,6 +2830,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
+ */
+ prev_state = prev->state;
+ finish_arch_switch(prev);
++ litmus->finish_switch(prev);
++ prev->rt_param.stack_in_use = NO_CPU;
+ #ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+ local_irq_disable();
+ #endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
+@@ -2843,6 +2861,15 @@ static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
+ {
+ if (prev->sched_class->pre_schedule)
+ prev->sched_class->pre_schedule(rq, prev);
++
++ /* LITMUS^RT not very clean hack: we need to save the prev task
++ * as our scheduling decision relies on it (as we drop the rq lock
++ * something in prev can change...); there is no way to escape
++ * this hack apart from modifying pick_next_task(rq, _prev_) or
++ * falling back on the previous solution of decoupling
++ * scheduling decisions
++ */
++ rq->litmus.prev = prev;
+ }
+
+ /* rq->lock is NOT held, but preemption is disabled */
+@@ -3520,18 +3547,26 @@ void scheduler_tick(void)
+
+ sched_clock_tick();
+
++ TS_TICK_START(current);
++
+ raw_spin_lock(&rq->lock);
+ update_rq_clock(rq);
+ update_cpu_load(rq);
+ curr->sched_class->task_tick(rq, curr, 0);
++
++ /* litmus_tick may force current to resched */
++ litmus_tick(rq, curr);
++
+ raw_spin_unlock(&rq->lock);
+
+ perf_event_task_tick(curr);
+
+ #ifdef CONFIG_SMP
+ rq->idle_at_tick = idle_cpu(cpu);
+- trigger_load_balance(rq, cpu);
++ if (!is_realtime(current))
++ trigger_load_balance(rq, cpu);
+ #endif
++ TS_TICK_END(current);
+ }
+
+ notrace unsigned long get_parent_ip(unsigned long addr)
+@@ -3672,12 +3707,20 @@ pick_next_task(struct rq *rq)
+ /*
+ * Optimization: we know that if all tasks are in
+ * the fair class we can call that function directly:
+- */
+- if (likely(rq->nr_running == rq->cfs.nr_running)) {
++
++ * NOT IN LITMUS^RT!
++
++ * This breaks many assumptions in the plugins.
++ * Do not uncomment without thinking long and hard
++ * about how this affects global plugins such as GSN-EDF.
++
++ if (rq->nr_running == rq->cfs.nr_running) {
++ TRACE("taking shortcut in pick_next_task()\n");
+ p = fair_sched_class.pick_next_task(rq);
+ if (likely(p))
+ return p;
+ }
++ */
+
+ class = sched_class_highest;
+ for ( ; ; ) {
+@@ -3712,6 +3755,8 @@ need_resched:
+
+ release_kernel_lock(prev);
+ need_resched_nonpreemptible:
++ TS_SCHED_START;
++ sched_trace_task_switch_away(prev);
+
+ schedule_debug(prev);
+
+@@ -3746,15 +3791,22 @@ need_resched_nonpreemptible:
+ rq->curr = next;
+ ++*switch_count;
+
++ TS_SCHED_END(next);
++ TS_CXS_START(next);
+ context_switch(rq, prev, next); /* unlocks the rq */
++ TS_CXS_END(current);
+ /*
+ * the context switch might have flipped the stack from under
+ * us, hence refresh the local variables.
+ */
+ cpu = smp_processor_id();
+ rq = cpu_rq(cpu);
+- } else
++ } else {
++ TS_SCHED_END(prev);
+ raw_spin_unlock_irq(&rq->lock);
++ }
++
++ sched_trace_task_switch_to(current);
+
+ post_schedule(rq);
+
+@@ -3767,6 +3819,9 @@ need_resched_nonpreemptible:
+ preempt_enable_no_resched();
+ if (need_resched())
+ goto need_resched;
++
++ if (srp_active())
++ srp_ceiling_block();
+ }
+ EXPORT_SYMBOL(schedule);
+
+@@ -4043,6 +4098,17 @@ void complete_all(struct completion *x)
+ }
+ EXPORT_SYMBOL(complete_all);
+
++void complete_n(struct completion *x, int n)
++{
++ unsigned long flags;
++
++ spin_lock_irqsave(&x->wait.lock, flags);
++ x->done += n;
++ __wake_up_common(&x->wait, TASK_NORMAL, n, 0, NULL);
++ spin_unlock_irqrestore(&x->wait.lock, flags);
++}
++EXPORT_SYMBOL(complete_n);
++
+ static inline long __sched
+ do_wait_for_common(struct completion *x, long timeout, int state)
+ {
+@@ -4471,7 +4537,9 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
+ p->normal_prio = normal_prio(p);
+ /* we are holding p->pi_lock already */
+ p->prio = rt_mutex_getprio(p);
+- if (rt_prio(p->prio))
++ if (p->policy == SCHED_LITMUS)
++ p->sched_class = &litmus_sched_class;
++ else if (rt_prio(p->prio))
+ p->sched_class = &rt_sched_class;
+ else
+ p->sched_class = &fair_sched_class;
+@@ -4516,7 +4584,7 @@ recheck:
+
+ if (policy != SCHED_FIFO && policy != SCHED_RR &&
+ policy != SCHED_NORMAL && policy != SCHED_BATCH &&
+- policy != SCHED_IDLE)
++ policy != SCHED_IDLE && policy != SCHED_LITMUS)
+ return -EINVAL;
+ }
+
+@@ -4531,6 +4599,8 @@ recheck:
+ return -EINVAL;
+ if (rt_policy(policy) != (param->sched_priority != 0))
+ return -EINVAL;
++ if (policy == SCHED_LITMUS && policy == p->policy)
++ return -EINVAL;
+
+ /*
+ * Allow unprivileged RT tasks to decrease priority:
+@@ -4585,6 +4655,12 @@ recheck:
+ return retval;
+ }
+
++ if (policy == SCHED_LITMUS) {
++ retval = litmus_admit_task(p);
++ if (retval)
++ return retval;
++ }
++
+ /*
+ * make sure no PI-waiters arrive (or leave) while we are
+ * changing the priority of the task:
+@@ -4612,10 +4688,19 @@ recheck:
+
+ p->sched_reset_on_fork = reset_on_fork;
+
++ if (p->policy == SCHED_LITMUS)
++ litmus_exit_task(p);
++
+ oldprio = p->prio;
+ prev_class = p->sched_class;
+ __setscheduler(rq, p, policy, param->sched_priority);
+
++ if (policy == SCHED_LITMUS) {
++ p->rt_param.stack_in_use = running ? rq->cpu : NO_CPU;
++ p->rt_param.present = running;
++ litmus->task_new(p, on_rq, running);
++ }
++
+ if (running)
+ p->sched_class->set_curr_task(rq);
+ if (on_rq) {
+@@ -4785,10 +4870,11 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+ rcu_read_lock();
+
+ p = find_process_by_pid(pid);
+- if (!p) {
++ /* Don't set affinity if the task was not found or is a LITMUS^RT task */
++ if (!p || is_realtime(p)) {
+ rcu_read_unlock();
+ put_online_cpus();
+- return -ESRCH;
++ return p ? -EPERM : -ESRCH;
+ }
+
+ /* Prevent p going away */
+diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
+index 5a5ea2c..b1af6d4 100644
+--- a/kernel/sched_fair.c
++++ b/kernel/sched_fair.c
+@@ -1708,7 +1708,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
+ int sync = wake_flags & WF_SYNC;
+ int scale = cfs_rq->nr_running >= sched_nr_latency;
+
+- if (unlikely(rt_prio(p->prio)))
++ if (unlikely(rt_prio(p->prio)) || p->policy == SCHED_LITMUS)
+ goto preempt;
+
+ if (unlikely(p->sched_class != &fair_sched_class))
+diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
+index b5b920a..c2fbb02 100644
+--- a/kernel/sched_rt.c
++++ b/kernel/sched_rt.c
+@@ -1014,7 +1014,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
+ */
+ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
+ {
+- if (p->prio < rq->curr->prio) {
++ if (p->prio < rq->curr->prio || p->policy == SCHED_LITMUS) {
+ resched_task(rq->curr);
+ return;
+ }
+diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
+index f992762..0adc54b 100644
+--- a/kernel/time/tick-sched.c
++++ b/kernel/time/tick-sched.c
+@@ -721,6 +721,46 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
+ }
+
+ /**
++ * tick_set_quanta_type - get the quanta type as a boot option
++ * Default is standard setup with ticks staggered over first
++ * half of tick period.
++ */
++int quanta_type = LINUX_DEFAULT_TICKS;
++static int __init tick_set_quanta_type(char *str)
++{
++ if (strcmp("aligned", str) == 0) {
++ quanta_type = LITMUS_ALIGNED_TICKS;
++ printk(KERN_INFO "LITMUS^RT: setting aligned quanta\n");
++ }
++ else if (strcmp("staggered", str) == 0) {
++ quanta_type = LITMUS_STAGGERED_TICKS;
++ printk(KERN_INFO "LITMUS^RT: setting staggered quanta\n");
++ }
++ return 1;
++}
++__setup("quanta=", tick_set_quanta_type);
++
++u64 cpu_stagger_offset(int cpu)
++{
++ u64 offset = 0;
++ switch (quanta_type) {
++ case LITMUS_ALIGNED_TICKS:
++ offset = 0;
++ break;
++ case LITMUS_STAGGERED_TICKS:
++ offset = ktime_to_ns(tick_period);
++ do_div(offset, num_possible_cpus());
++ offset *= cpu;
++ break;
++ default:
++ offset = ktime_to_ns(tick_period) >> 1;
++ do_div(offset, num_possible_cpus());
++ offset *= cpu;
++ }
++ return offset;
++}
++
++/**
+ * tick_setup_sched_timer - setup the tick emulation timer
+ */
+ void tick_setup_sched_timer(void)
+@@ -737,9 +777,11 @@ void tick_setup_sched_timer(void)
+
+ /* Get the next period (per cpu) */
+ hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
+- offset = ktime_to_ns(tick_period) >> 1;
+- do_div(offset, num_possible_cpus());
+- offset *= smp_processor_id();
++
++ /* Offset must be set correctly to achieve desired quanta type. */
++ offset = cpu_stagger_offset(smp_processor_id());
++
++ /* Add the correct offset to expiration time */
+ hrtimer_add_expires_ns(&ts->sched_timer, offset);
+
+ for (;;) {
+diff --git a/litmus/Kconfig b/litmus/Kconfig
+new file mode 100644
+index 0000000..9888589
+--- /dev/null
++++ b/litmus/Kconfig
+@@ -0,0 +1,134 @@
++menu "LITMUS^RT"
++
++menu "Scheduling"
++
++config PLUGIN_CEDF
++ bool "Clustered-EDF"
++ depends on X86 && SYSFS
++ default y
++ help
++ Include the Clustered EDF (C-EDF) plugin in the kernel.
++ This is appropriate for large platforms with shared caches.
++ On smaller platforms (e.g., ARM PB11MPCore), using C-EDF
++ makes little sense since there aren't any shared caches.
++
++config PLUGIN_PFAIR
++ bool "PFAIR"
++ depends on HIGH_RES_TIMERS && !NO_HZ
++ default y
++ help
++ Include the PFAIR plugin (i.e., the PD^2 scheduler) in the kernel.
++ The PFAIR plugin requires high resolution timers (for staggered quanta)
++ and does not support NO_HZ (quanta could be missed when the system is idle).
++
++ If unsure, say Yes.
++
++config RELEASE_MASTER
++ bool "Release-master Support"
++ depends on ARCH_HAS_SEND_PULL_TIMERS
++ default n
++ help
++ Allow one processor to act as a dedicated interrupt processor
++ that services all timer interrupts, but that does not schedule
++ real-time tasks. See RTSS'09 paper for details
++ (http://www.cs.unc.edu/~anderson/papers.html).
++ Currently only supported by GSN-EDF.
++
++endmenu
++
++menu "Real-Time Synchronization"
++
++config NP_SECTION
++ bool "Non-preemptive section support"
++ default n
++ help
++ Allow tasks to become non-preemptable.
++ Note that plugins still need to explicitly support non-preemptivity.
++ Currently, only GSN-EDF and PSN-EDF have such support.
++
++ This is required to support the FMLP.
++ If disabled, all tasks will be considered preemptable at all times.
++
++config SRP
++ bool "Stack Resource Policy (SRP)"
++ default n
++ help
++ Include support for Baker's Stack Resource Policy.
++
++ Say Yes if you want FMLP local long critical section
++ synchronization support.
++
++config FMLP
++ bool "FMLP support"
++ depends on NP_SECTION
++ default n
++ help
++ Include support for deterministic multiprocessor real-time
++ synchronization.
++
++ Say Yes if you want FMLP long critical section
++ synchronization support.
++
++endmenu
++
++menu "Tracing"
++
++config FEATHER_TRACE
++ bool "Feather-Trace Infrastructure"
++ default y
++ help
++ Feather-Trace basic tracing infrastructure. Includes device file
++ driver and instrumentation point support.
++
++ There are actually two implementations of Feather-Trace.
++ 1) A slower, but portable, default implementation.
++ 2) Architecture-specific implementations that rewrite kernel .text at runtime.
++
++ If enabled, Feather-Trace will be based on 2) if available (currently only for x86).
++ However, if DEBUG_RODATA=y, then Feather-Trace will choose option 1) in any case
++ to avoid problems with write-protected .text pages.
++
++ Bottom line: to avoid increased overheads, choose DEBUG_RODATA=n.
++
++ Note that this option only enables the basic Feather-Trace infrastructure;
++ you still need to enable SCHED_TASK_TRACE and/or SCHED_OVERHEAD_TRACE to
++ actually enable any events.
++
++config SCHED_TASK_TRACE
++ bool "Trace real-time tasks"
++ depends on FEATHER_TRACE
++ default y
++ help
++ Include support for the sched_trace_XXX() tracing functions. This
++ allows the collection of real-time task events such as job
++ completions, job releases, early completions, etc. This results in a
++ small overhead in the scheduling code. Disable if the overhead is not
++ acceptable (e.g., benchmarking).
++
++ Say Yes for debugging.
++ Say No for overhead tracing.
++
++config SCHED_OVERHEAD_TRACE
++ bool "Record timestamps for overhead measurements"
++ depends on FEATHER_TRACE
++ default n
++ help
++ Export event stream for overhead tracing.
++ Say Yes for overhead tracing.
++
++config SCHED_DEBUG_TRACE
++ bool "TRACE() debugging"
++ default y
++ help
++ Include support for sched_trace_log_message(), which is used to
++ implement TRACE(). If disabled, no TRACE() messages will be included
++ in the kernel, and no overheads due to debugging statements will be
++ incurred by the scheduler. Disable if the overhead is not acceptable
++ (e.g. benchmarking).
++
++ Say Yes for debugging.
++ Say No for overhead tracing.
++
++endmenu
++
++endmenu
+diff --git a/litmus/Makefile b/litmus/Makefile
+new file mode 100644
+index 0000000..f301d28
+--- /dev/null
++++ b/litmus/Makefile
+@@ -0,0 +1,25 @@
++#
++# Makefile for LITMUS^RT
++#
++
++obj-y = sched_plugin.o litmus.o \
++ budget.o \
++ jobs.o \
++ sync.o \
++ rt_domain.o \
++ edf_common.o \
++ fdso.o \
++ srp.o \
++ fmlp.o \
++ bheap.o \
++ ctrldev.o \
++ sched_gsn_edf.o \
++ sched_psn_edf.o
++
++obj-$(CONFIG_PLUGIN_CEDF) += sched_cedf.o
++obj-$(CONFIG_PLUGIN_PFAIR) += sched_pfair.o
++
++obj-$(CONFIG_FEATHER_TRACE) += ft_event.o ftdev.o
++obj-$(CONFIG_SCHED_TASK_TRACE) += sched_task_trace.o
++obj-$(CONFIG_SCHED_DEBUG_TRACE) += sched_trace.o
++obj-$(CONFIG_SCHED_OVERHEAD_TRACE) += trace.o
+diff --git a/litmus/bheap.c b/litmus/bheap.c
+new file mode 100644
+index 0000000..528af97
+--- /dev/null
++++ b/litmus/bheap.c
+@@ -0,0 +1,314 @@
++#include "linux/kernel.h"
++#include "litmus/bheap.h"
++
++void bheap_init(struct bheap* heap)
++{
++ heap->head = NULL;
++ heap->min = NULL;
++}
++
++void bheap_node_init(struct bheap_node** _h, void* value)
++{
++ struct bheap_node* h = *_h;
++ h->parent = NULL;
++ h->next = NULL;
++ h->child = NULL;
++ h->degree = NOT_IN_HEAP;
++ h->value = value;
++ h->ref = _h;
++}
++
++
++/* make child a subtree of root */
++static void __bheap_link(struct bheap_node* root,
++ struct bheap_node* child)
++{
++ child->parent = root;
++ child->next = root->child;
++ root->child = child;
++ root->degree++;
++}
++
++/* merge root lists */
++static struct bheap_node* __bheap_merge(struct bheap_node* a,
++ struct bheap_node* b)
++{
++ struct bheap_node* head = NULL;
++ struct bheap_node** pos = &head;
++
++ while (a && b) {
++ if (a->degree < b->degree) {
++ *pos = a;
++ a = a->next;
++ } else {
++ *pos = b;
++ b = b->next;
++ }
++ pos = &(*pos)->next;
++ }
++ if (a)
++ *pos = a;
++ else
++ *pos = b;
++ return head;
++}
++
++/* reverse a linked list of nodes. also clears parent pointer */
++static struct bheap_node* __bheap_reverse(struct bheap_node* h)
++{
++ struct bheap_node* tail = NULL;
++ struct bheap_node* next;
++
++ if (!h)
++ return h;
++
++ h->parent = NULL;
++ while (h->next) {
++ next = h->next;
++ h->next = tail;
++ tail = h;
++ h = next;
++ h->parent = NULL;
++ }
++ h->next = tail;
++ return h;
++}
++
++static void __bheap_min(bheap_prio_t higher_prio, struct bheap* heap,
++ struct bheap_node** prev, struct bheap_node** node)
++{
++ struct bheap_node *_prev, *cur;
++ *prev = NULL;
++
++ if (!heap->head) {
++ *node = NULL;
++ return;
++ }
++
++ *node = heap->head;
++ _prev = heap->head;
++ cur = heap->head->next;
++ while (cur) {
++ if (higher_prio(cur, *node)) {
++ *node = cur;
++ *prev = _prev;
++ }
++ _prev = cur;
++ cur = cur->next;
++ }
++}
++
++static void __bheap_union(bheap_prio_t higher_prio, struct bheap* heap,
++ struct bheap_node* h2)
++{
++ struct bheap_node* h1;
++ struct bheap_node *prev, *x, *next;
++ if (!h2)
++ return;
++ h1 = heap->head;
++ if (!h1) {
++ heap->head = h2;
++ return;
++ }
++ h1 = __bheap_merge(h1, h2);
++ prev = NULL;
++ x = h1;
++ next = x->next;
++ while (next) {
++ if (x->degree != next->degree ||
++ (next->next && next->next->degree == x->degree)) {
++ /* nothing to do, advance */
++ prev = x;
++ x = next;
++ } else if (higher_prio(x, next)) {
++ /* x becomes the root of next */
++ x->next = next->next;
++ __bheap_link(x, next);
++ } else {
++ /* next becomes the root of x */
++ if (prev)
++ prev->next = next;
++ else
++ h1 = next;
++ __bheap_link(next, x);
++ x = next;
++ }
++ next = x->next;
++ }
++ heap->head = h1;
++}
++
++static struct bheap_node* __bheap_extract_min(bheap_prio_t higher_prio,
++ struct bheap* heap)
++{
++ struct bheap_node *prev, *node;
++ __bheap_min(higher_prio, heap, &prev, &node);
++ if (!node)
++ return NULL;
++ if (prev)
++ prev->next = node->next;
++ else
++ heap->head = node->next;
++ __bheap_union(higher_prio, heap, __bheap_reverse(node->child));
++ return node;
++}
++
++/* insert (and reinitialize) a node into the heap */
++void bheap_insert(bheap_prio_t higher_prio, struct bheap* heap,
++ struct bheap_node* node)
++{
++ struct bheap_node *min;
++ node->child = NULL;
++ node->parent = NULL;
++ node->next = NULL;
++ node->degree = 0;
++ if (heap->min && higher_prio(node, heap->min)) {
++ /* swap min cache */
++ min = heap->min;
++ min->child = NULL;
++ min->parent = NULL;
++ min->next = NULL;
++ min->degree = 0;
++ __bheap_union(higher_prio, heap, min);
++ heap->min = node;
++ } else
++ __bheap_union(higher_prio, heap, node);
++}
++
++void bheap_uncache_min(bheap_prio_t higher_prio, struct bheap* heap)
++{
++ struct bheap_node* min;
++ if (heap->min) {
++ min = heap->min;
++ heap->min = NULL;
++ bheap_insert(higher_prio, heap, min);
++ }
++}
++
++/* merge addition into target */
++void bheap_union(bheap_prio_t higher_prio,
++ struct bheap* target, struct bheap* addition)
++{
++ /* first insert any cached minima, if necessary */
++ bheap_uncache_min(higher_prio, target);
++ bheap_uncache_min(higher_prio, addition);
++ __bheap_union(higher_prio, target, addition->head);
++ /* this is a destructive merge */
++ addition->head = NULL;
++}
++
++struct bheap_node* bheap_peek(bheap_prio_t higher_prio,
++ struct bheap* heap)
++{
++ if (!heap->min)
++ heap->min = __bheap_extract_min(higher_prio, heap);
++ return heap->min;
++}
++
++struct bheap_node* bheap_take(bheap_prio_t higher_prio,
++ struct bheap* heap)
++{
++ struct bheap_node *node;
++ if (!heap->min)
++ heap->min = __bheap_extract_min(higher_prio, heap);
++ node = heap->min;
++ heap->min = NULL;
++ if (node)
++ node->degree = NOT_IN_HEAP;
++ return node;
++}
++
++int bheap_decrease(bheap_prio_t higher_prio, struct bheap_node* node)
++{
++ struct bheap_node *parent;
++ struct bheap_node** tmp_ref;
++ void* tmp;
++
++ /* bubble up */
++ parent = node->parent;
++ while (parent && higher_prio(node, parent)) {
++ /* swap parent and node */
++ tmp = parent->value;
++ parent->value = node->value;
++ node->value = tmp;
++ /* swap references */
++ *(parent->ref) = node;
++ *(node->ref) = parent;
++ tmp_ref = parent->ref;
++ parent->ref = node->ref;
++ node->ref = tmp_ref;
++ /* step up */
++ node = parent;
++ parent = node->parent;
++ }
++
++ return parent != NULL;
++}
++
++void bheap_delete(bheap_prio_t higher_prio, struct bheap* heap,
++ struct bheap_node* node)
++{
++ struct bheap_node *parent, *prev, *pos;
++ struct bheap_node** tmp_ref;
++ void* tmp;
++
++ if (heap->min != node) {
++ /* bubble up */
++ parent = node->parent;
++ while (parent) {
++ /* swap parent and node */
++ tmp = parent->value;
++ parent->value = node->value;
++ node->value = tmp;
++ /* swap references */
++ *(parent->ref) = node;
++ *(node->ref) = parent;
++ tmp_ref = parent->ref;
++ parent->ref = node->ref;
++ node->ref = tmp_ref;
++ /* step up */
++ node = parent;
++ parent = node->parent;
++ }
++ /* now delete:
++ * first find prev */
++ prev = NULL;
++ pos = heap->head;
++ while (pos != node) {
++ prev = pos;
++ pos = pos->next;
++ }
++ /* we have prev, now remove node */
++ if (prev)
++ prev->next = node->next;
++ else
++ heap->head = node->next;
++ __bheap_union(higher_prio, heap, __bheap_reverse(node->child));
++ } else
++ heap->min = NULL;
++ node->degree = NOT_IN_HEAP;
++}
++
++/* allocate a heap node for value and insert into the heap */
++int bheap_add(bheap_prio_t higher_prio, struct bheap* heap,
++ void* value, int gfp_flags)
++{
++ struct bheap_node* hn = bheap_node_alloc(gfp_flags);
++ if (likely(hn)) {
++ bheap_node_init(&hn, value);
++ bheap_insert(higher_prio, heap, hn);
++ }
++ return hn != NULL;
++}
++
++void* bheap_take_del(bheap_prio_t higher_prio,
++ struct bheap* heap)
++{
++ struct bheap_node* hn = bheap_take(higher_prio, heap);
++ void* ret = NULL;
++ if (hn) {
++ ret = hn->value;
++ bheap_node_free(hn);
++ }
++ return ret;
++}
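
A small, hypothetical usage sketch (not in this patch) of the binomial heap API above. The bheap_prio_t callback decides ordering; here nodes are ordered by the int their value pointer refers to. bheap_node_alloc()/bheap_node_free() are assumed to be backed by the heap-node slab cache provided elsewhere in LITMUS^RT.

#include <linux/slab.h>
#include <linux/errno.h>
#include <litmus/bheap.h>

/* smaller integer = higher priority */
static int int_higher_prio(struct bheap_node *a, struct bheap_node *b)
{
        return *(int*) a->value < *(int*) b->value;
}

static int bheap_demo(void)
{
        struct bheap heap;
        static int x = 10, y = 5;
        struct bheap_node *n;

        bheap_init(&heap);
        if (!bheap_add(int_higher_prio, &heap, &x, GFP_ATOMIC) ||
            !bheap_add(int_higher_prio, &heap, &y, GFP_ATOMIC))
                return -ENOMEM;

        n = bheap_take(int_higher_prio, &heap);  /* yields the node wrapping &y */
        /* the caller owns the extracted node and must release it */
        bheap_node_free(n);
        return 0;
}
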
+diff --git a/litmus/budget.c b/litmus/budget.c
+new file mode 100644
+index 0000000..b99177a
+--- /dev/null
++++ b/litmus/budget.c
+@@ -0,0 +1,109 @@
++#include
++#include
++
++#include
++
++struct enforcement_timer {
++ /* The enforcement timer is used to accurately police
++ * slice budgets. */
++ struct hrtimer timer;
++ int armed;
++};
++
++DEFINE_PER_CPU(struct enforcement_timer, budget_timer);
++
++static enum hrtimer_restart on_enforcement_timeout(struct hrtimer *timer)
++{
++ struct enforcement_timer* et = container_of(timer,
++ struct enforcement_timer,
++ timer);
++ unsigned long flags;
++
++ local_irq_save(flags);
++ TRACE("enforcement timer fired.\n");
++ et->armed = 0;
++ /* activate scheduler */
++ set_tsk_need_resched(current);
++ local_irq_restore(flags);
++
++ return HRTIMER_NORESTART;
++}
++
++/* assumes called with IRQs off */
++static void cancel_enforcement_timer(struct enforcement_timer* et)
++{
++ int ret;
++
++ TRACE("cancelling enforcement timer.\n");
++
++ /* Since interrupts are disabled and et->armed is only
++ * modified locally, we do not need any locks.
++ */
++
++ if (et->armed) {
++ ret = hrtimer_try_to_cancel(&et->timer);
++ /* Should never be inactive. */
++ BUG_ON(ret == 0);
++ /* Should never be running concurrently. */
++ BUG_ON(ret == -1);
++
++ et->armed = 0;
++ }
++}
++
++/* assumes called with IRQs off */
++static void arm_enforcement_timer(struct enforcement_timer* et,
++ struct task_struct* t)
++{
++ lt_t when_to_fire;
++ TRACE_TASK(t, "arming enforcement timer.\n");
++
++ /* Calling this when there is no budget left for the task
++ * makes no sense, unless the task is non-preemptive. */
++ BUG_ON(budget_exhausted(t) && (!is_np(t)));
++
++ /* __hrtimer_start_range_ns() cancels the timer
++ * anyway, so we don't have to check whether it is still armed */
++
++ if (likely(!is_np(t))) {
++ when_to_fire = litmus_clock() + budget_remaining(t);
++ __hrtimer_start_range_ns(&et->timer,
++ ns_to_ktime(when_to_fire),
++ 0 /* delta */,
++ HRTIMER_MODE_ABS_PINNED,
++ 0 /* no wakeup */);
++ et->armed = 1;
++ }
++}
++
++
++/* expects to be called with IRQs off */
++void update_enforcement_timer(struct task_struct* t)
++{
++ struct enforcement_timer* et = &__get_cpu_var(budget_timer);
++
++ if (t && budget_precisely_enforced(t)) {
++ /* Make sure we call into the scheduler when this budget
++ * expires. */
++ arm_enforcement_timer(et, t);
++ } else if (et->armed) {
++ /* Make sure we don't cause unnecessary interrupts. */
++ cancel_enforcement_timer(et);
++ }
++}
++
++
++static int __init init_budget_enforcement(void)
++{
++ int cpu;
++ struct enforcement_timer* et;
++
++ for (cpu = 0; cpu < NR_CPUS; cpu++) {
++ et = &per_cpu(budget_timer, cpu);
++ hrtimer_init(&et->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
++ et->timer.function = on_enforcement_timeout;
++ }
++ return 0;
++}
++
++module_init(init_budget_enforcement);
+diff --git a/litmus/ctrldev.c b/litmus/ctrldev.c
+new file mode 100644
+index 0000000..6677a67
+--- /dev/null
++++ b/litmus/ctrldev.c
+@@ -0,0 +1,150 @@
++#include
++#include
++#include
++#include
++#include
++
++#include
++
++/* only one page for now, but we might want to add a RO version at some point */
++
++#define CTRL_NAME "litmus/ctrl"
++
++/* allocate t->rt_param.ctrl_page*/
++static int alloc_ctrl_page(struct task_struct *t)
++{
++ int err = 0;
++
++ /* only allocate if the task doesn't have one yet */
++ if (!tsk_rt(t)->ctrl_page) {
++ tsk_rt(t)->ctrl_page = (void*) get_zeroed_page(GFP_KERNEL);
++ if (!tsk_rt(t)->ctrl_page)
++ err = -ENOMEM;
++ /* will get de-allocated in task teardown */
++ TRACE_TASK(t, "%s ctrl_page = %p\n", __FUNCTION__,
++ tsk_rt(t)->ctrl_page);
++ }
++ return err;
++}
++
++static int map_ctrl_page(struct task_struct *t, struct vm_area_struct* vma)
++{
++ int err;
++ unsigned long pfn;
++
++ struct page* ctrl = virt_to_page(tsk_rt(t)->ctrl_page);
++
++ /* Increase ref count. Is decreased when vma is destroyed. */
++ get_page(ctrl);
++
++ /* compute page frame number */
++ pfn = page_to_pfn(ctrl);
++
++ TRACE_CUR(CTRL_NAME
++ ": mapping %p (pfn:%lx, %lx) to 0x%lx (prot:%lx)\n",
++ tsk_rt(t)->ctrl_page, pfn, page_to_pfn(ctrl), vma->vm_start,
++ vma->vm_page_prot);
++
++ /* Map it into the vma. Make sure to use PAGE_SHARED, otherwise
++ * userspace actually gets a copy-on-write page. */
++ err = remap_pfn_range(vma, vma->vm_start, pfn, PAGE_SIZE, PAGE_SHARED);
++
++ if (err)
++ TRACE_CUR(CTRL_NAME ": remap_pfn_range() failed (%d)\n", err);
++
++ return err;
++}
++
++static void litmus_ctrl_vm_close(struct vm_area_struct* vma)
++{
++ TRACE_CUR("%s flags=0x%x prot=0x%x\n", __FUNCTION__,
++ vma->vm_flags, vma->vm_page_prot);
++
++ TRACE_CUR(CTRL_NAME
++ ": %p:%p vma:%p vma->vm_private_data:%p closed.\n",
++ (void*) vma->vm_start, (void*) vma->vm_end, vma,
++ vma->vm_private_data, current->comm,
++ current->pid);
++}
++
++static int litmus_ctrl_vm_fault(struct vm_area_struct* vma,
++ struct vm_fault* vmf)
++{
++ /* This function should never be called, since
++ * all pages should have been mapped by mmap()
++ * already. */
++ TRACE_CUR("%s flags=0x%x\n", __FUNCTION__, vma->vm_flags);
++
++ /* nope, you only get one page */
++ return VM_FAULT_SIGBUS;
++}
++
++static struct vm_operations_struct litmus_ctrl_vm_ops = {
++ .close = litmus_ctrl_vm_close,
++ .fault = litmus_ctrl_vm_fault,
++};
++
++static int litmus_ctrl_mmap(struct file* filp, struct vm_area_struct* vma)
++{
++ int err = 0;
++
++ /* first make sure mapper knows what he's doing */
++
++ /* you can only get one page */
++ if (vma->vm_end - vma->vm_start != PAGE_SIZE)
++ return -EINVAL;
++
++ /* you can only map the "first" page */
++ if (vma->vm_pgoff != 0)
++ return -EINVAL;
++
++ /* you can't share it with anyone */
++ if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
++ return -EINVAL;
++
++ vma->vm_ops = &litmus_ctrl_vm_ops;
++ /* this mapping should not be kept across forks,
++ * and cannot be expanded */
++ vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
++
++ err = alloc_ctrl_page(current);
++ if (!err)
++ err = map_ctrl_page(current, vma);
++
++ TRACE_CUR("%s flags=0x%x prot=0x%lx\n",
++ __FUNCTION__, vma->vm_flags, vma->vm_page_prot);
++
++ return err;
++}
++
++static struct file_operations litmus_ctrl_fops = {
++ .owner = THIS_MODULE,
++ .mmap = litmus_ctrl_mmap,
++};
++
++static struct miscdevice litmus_ctrl_dev = {
++ .name = CTRL_NAME,
++ .minor = MISC_DYNAMIC_MINOR,
++ .fops = &litmus_ctrl_fops,
++};
++
++static int __init init_litmus_ctrl_dev(void)
++{
++ int err;
++
++ BUILD_BUG_ON(sizeof(struct control_page) > PAGE_SIZE);
++
++ printk("Initializing LITMUS^RT control device.\n");
++ err = misc_register(&litmus_ctrl_dev);
++ if (err)
++ printk("Could not allocate %s device (%d).\n", CTRL_NAME, err);
++ return err;
++}
++
++static void __exit exit_litmus_ctrl_dev(void)
++{
++ misc_deregister(&litmus_ctrl_dev);
++}
++
++module_init(init_litmus_ctrl_dev);
++module_exit(exit_litmus_ctrl_dev);
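
For context, a hypothetical user-space counterpart (not from this patch) that maps the control page exported above. The device node path is an assumption (misc device named "litmus/ctrl" with a dynamic minor); liblitmus normally performs this mapping on the application's behalf.

#include <fcntl.h>
#include <unistd.h>
#include <sys/mman.h>

static void* map_ctrl_page(void)
{
        void *page = MAP_FAILED;
        int fd = open("/dev/litmus/ctrl", O_RDWR);

        if (fd >= 0) {
                /* must be exactly one page, offset 0, and not MAP_SHARED,
                 * matching the checks in litmus_ctrl_mmap() above */
                page = mmap(NULL, sysconf(_SC_PAGESIZE), PROT_READ | PROT_WRITE,
                            MAP_PRIVATE, fd, 0);
                close(fd);      /* the established mapping remains valid */
        }
        return page == MAP_FAILED ? NULL : page;
}
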
+diff --git a/litmus/edf_common.c b/litmus/edf_common.c
+new file mode 100644
+index 0000000..06daec6
+--- /dev/null
++++ b/litmus/edf_common.c
+@@ -0,0 +1,102 @@
++/*
++ * kernel/edf_common.c
++ *
++ * Common functions for EDF based scheduler.
++ */
++
++#include
++#include
++#include
++
++#include
++#include
++#include
++
++#include
++
++/* edf_higher_prio - returns true if first has a higher EDF priority
++ * than second. Deadline ties are broken by PID.
++ *
++ * both first and second may be NULL
++ */
++int edf_higher_prio(struct task_struct* first,
++ struct task_struct* second)
++{
++ struct task_struct *first_task = first;
++ struct task_struct *second_task = second;
++
++ /* There is no point in comparing a task to itself. */
++ if (first && first == second) {
++ TRACE_TASK(first,
++ "WARNING: pointless edf priority comparison.\n");
++ return 0;
++ }
++
++
++ /* Check for inherited priorities. Change task
++ * used for comparison in such a case.
++ */
++ if (first && first->rt_param.inh_task)
++ first_task = first->rt_param.inh_task;
++ if (second && second->rt_param.inh_task)
++ second_task = second->rt_param.inh_task;
++
++ return
++ /* it has to exist in order to have higher priority */
++ first_task && (
++ /* does the second task exist and is it a real-time task? If
++ * not, the first task (which is a RT task) has higher
++ * priority.
++ */
++ !second_task || !is_realtime(second_task) ||
++
++ /* is the deadline of the first task earlier?
++ * Then it has higher priority.
++ */
++ earlier_deadline(first_task, second_task) ||
++
++ /* Do we have a deadline tie?
++ * Then break by PID.
++ */
++ (get_deadline(first_task) == get_deadline(second_task) &&
++ (first_task->pid < second_task->pid ||
++
++ /* If the PIDs are the same then the task with the inherited
++ * priority wins.
++ */
++ (first_task->pid == second_task->pid &&
++ !second->rt_param.inh_task))));
++}
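
For readability, the nested return expression above is equivalent to the following step-by-step sketch (illustrative only, not code that ships in this patch; it uses the same helpers as the function above and omits the pointless-comparison warning):

static int edf_higher_prio_sketch(struct task_struct* first,
				  struct task_struct* second)
{
	struct task_struct *first_task  = first;
	struct task_struct *second_task = second;

	/* use the inherited priority, if any, for the comparison */
	if (first && first->rt_param.inh_task)
		first_task = first->rt_param.inh_task;
	if (second && second->rt_param.inh_task)
		second_task = second->rt_param.inh_task;

	if (!first_task)
		return 0;	/* a non-existent task cannot win */
	if (!second_task || !is_realtime(second_task))
		return 1;	/* an RT task beats a missing or non-RT task */
	if (earlier_deadline(first_task, second_task))
		return 1;	/* earlier deadline wins */
	if (get_deadline(first_task) == get_deadline(second_task)) {
		if (first_task->pid < second_task->pid)
			return 1;	/* deadline tie: break by PID */
		if (first_task->pid == second_task->pid &&
		    !second->rt_param.inh_task)
			return 1;	/* same PID: the inheriting side wins */
	}
	return 0;
}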
++
++int edf_ready_order(struct bheap_node* a, struct bheap_node* b)
++{
++ return edf_higher_prio(bheap2task(a), bheap2task(b));
++}
++
++void edf_domain_init(rt_domain_t* rt, check_resched_needed_t resched,
++ release_jobs_t release)
++{
++ rt_domain_init(rt, edf_ready_order, resched, release);
++}
++
++/* need_to_preempt - check whether the task t needs to be preempted
++ * call only with irqs disabled and with ready_lock acquired
++ * THIS DOES NOT TAKE NON-PREEMPTIVE SECTIONS INTO ACCOUNT!
++ */
++int edf_preemption_needed(rt_domain_t* rt, struct task_struct *t)
++{
++ /* we need the read lock for edf_ready_queue */
++ /* no need to preempt if there is nothing pending */
++ if (!__jobs_pending(rt))
++ return 0;
++ /* we need to reschedule if t doesn't exist */
++ if (!t)
++ return 1;
++
++ /* NOTE: We cannot check for non-preemptibility since we
++ * don't know what address space we're currently in.
++ */
++
++ /* make sure to get non-rt stuff out of the way */
++ return !is_realtime(t) || edf_higher_prio(__next_ready(rt), t);
++}
+diff --git a/litmus/fdso.c b/litmus/fdso.c
+new file mode 100644
+index 0000000..85be716
+--- /dev/null
++++ b/litmus/fdso.c
+@@ -0,0 +1,281 @@
++/* fdso.c - file descriptor attached shared objects
++ *
++ * (c) 2007 B. Brandenburg, LITMUS^RT project
++ *
++ * Notes:
++ * - object descriptor (OD) tables are not cloned during a fork.
++ * - objects are created on-demand, and freed after the last reference
++ * is dropped.
++ * - for now, object types are hard coded.
++ * - As long as we have live objects, we keep a reference to the inode.
++ */
++
++#include
++#include
++#include
++#include
++#include
++
++#include
++
++extern struct fdso_ops fmlp_sem_ops;
++extern struct fdso_ops srp_sem_ops;
++
++static const struct fdso_ops* fdso_ops[] = {
++ &fmlp_sem_ops,
++ &srp_sem_ops,
++};
++
++static void* fdso_create(obj_type_t type)
++{
++ if (fdso_ops[type]->create)
++ return fdso_ops[type]->create();
++ else
++ return NULL;
++}
++
++static void fdso_destroy(obj_type_t type, void* obj)
++{
++ fdso_ops[type]->destroy(obj);
++}
++
++static int fdso_open(struct od_table_entry* entry, void* __user config)
++{
++ if (fdso_ops[entry->obj->type]->open)
++ return fdso_ops[entry->obj->type]->open(entry, config);
++ else
++ return 0;
++}
++
++static int fdso_close(struct od_table_entry* entry)
++{
++ if (fdso_ops[entry->obj->type]->close)
++ return fdso_ops[entry->obj->type]->close(entry);
++ else
++ return 0;
++}
++
++/* inode must be locked already */
++static struct inode_obj_id* alloc_inode_obj(struct inode* inode,
++ obj_type_t type,
++ unsigned int id)
++{
++ struct inode_obj_id* obj;
++ void* raw_obj;
++
++ raw_obj = fdso_create(type);
++ if (!raw_obj)
++ return NULL;
++
++	obj = kmalloc(sizeof(*obj), GFP_KERNEL);
++	if (!obj) {
++		/* don't leak the object created by fdso_create() */
++		fdso_destroy(type, raw_obj);
++		return NULL;
++	}
++ INIT_LIST_HEAD(&obj->list);
++ atomic_set(&obj->count, 1);
++ obj->type = type;
++ obj->id = id;
++ obj->obj = raw_obj;
++ obj->inode = inode;
++
++ list_add(&obj->list, &inode->i_obj_list);
++ atomic_inc(&inode->i_count);
++
++ printk(KERN_DEBUG "alloc_inode_obj(%p, %d, %d): object created\n", inode, type, id);
++ return obj;
++}
++
++/* inode must be locked already */
++static struct inode_obj_id* get_inode_obj(struct inode* inode,
++ obj_type_t type,
++ unsigned int id)
++{
++ struct list_head* pos;
++ struct inode_obj_id* obj = NULL;
++
++ list_for_each(pos, &inode->i_obj_list) {
++ obj = list_entry(pos, struct inode_obj_id, list);
++ if (obj->id == id && obj->type == type) {
++ atomic_inc(&obj->count);
++ return obj;
++ }
++ }
++ printk(KERN_DEBUG "get_inode_obj(%p, %d, %d): couldn't find object\n", inode, type, id);
++ return NULL;
++}
++
++
++static void put_inode_obj(struct inode_obj_id* obj)
++{
++ struct inode* inode;
++ int let_go = 0;
++
++ inode = obj->inode;
++ if (atomic_dec_and_test(&obj->count)) {
++
++ mutex_lock(&inode->i_obj_mutex);
++ /* no new references can be obtained */
++ if (!atomic_read(&obj->count)) {
++ list_del(&obj->list);
++ fdso_destroy(obj->type, obj->obj);
++ kfree(obj);
++ let_go = 1;
++ }
++ mutex_unlock(&inode->i_obj_mutex);
++ if (let_go)
++ iput(inode);
++ }
++}
++
++static struct od_table_entry* get_od_entry(struct task_struct* t)
++{
++ struct od_table_entry* table;
++ int i;
++
++
++ table = t->od_table;
++ if (!table) {
++ table = kzalloc(sizeof(*table) * MAX_OBJECT_DESCRIPTORS,
++ GFP_KERNEL);
++ t->od_table = table;
++ }
++
++ for (i = 0; table && i < MAX_OBJECT_DESCRIPTORS; i++)
++ if (!table[i].used) {
++ table[i].used = 1;
++ return table + i;
++ }
++ return NULL;
++}
++
++static int put_od_entry(struct od_table_entry* od)
++{
++ put_inode_obj(od->obj);
++ od->used = 0;
++ return 0;
++}
++
++void exit_od_table(struct task_struct* t)
++{
++ int i;
++
++ if (t->od_table) {
++ for (i = 0; i < MAX_OBJECT_DESCRIPTORS; i++)
++ if (t->od_table[i].used)
++ put_od_entry(t->od_table + i);
++ kfree(t->od_table);
++ t->od_table = NULL;
++ }
++}
++
++static int do_sys_od_open(struct file* file, obj_type_t type, int id,
++ void* __user config)
++{
++ int idx = 0, err;
++ struct inode* inode;
++ struct inode_obj_id* obj = NULL;
++ struct od_table_entry* entry;
++
++ inode = file->f_dentry->d_inode;
++
++ entry = get_od_entry(current);
++ if (!entry)
++ return -ENOMEM;
++
++ mutex_lock(&inode->i_obj_mutex);
++ obj = get_inode_obj(inode, type, id);
++ if (!obj)
++ obj = alloc_inode_obj(inode, type, id);
++ if (!obj) {
++ idx = -ENOMEM;
++ entry->used = 0;
++ } else {
++ entry->obj = obj;
++ entry->extra = NULL;
++ idx = entry - current->od_table;
++ }
++
++	mutex_unlock(&inode->i_obj_mutex);
++
++	/* bail out if no object could be found or allocated */
++	if (!obj)
++		return idx;
++
++	err = fdso_open(entry, config);
++ if (err < 0) {
++ /* The class rejected the open call.
++ * We need to clean up and tell user space.
++ */
++ put_od_entry(entry);
++ idx = err;
++ }
++
++ return idx;
++}
++
++
++struct od_table_entry* __od_lookup(int od)
++{
++ struct task_struct *t = current;
++
++ if (!t->od_table)
++ return NULL;
++ if (od < 0 || od >= MAX_OBJECT_DESCRIPTORS)
++ return NULL;
++ if (!t->od_table[od].used)
++ return NULL;
++ return t->od_table + od;
++}
++
++
++asmlinkage long sys_od_open(int fd, int type, int obj_id, void* __user config)
++{
++ int ret = 0;
++ struct file* file;
++
++ /*
++ 1) get file from fd, get inode from file
++ 2) lock inode
++ 3) try to lookup object
++ 4) if not present create and enqueue object, inc inode refcnt
++ 5) increment refcnt of object
++ 6) alloc od_table_entry, setup ptrs
++ 7) unlock inode
++ 8) return offset in od_table as OD
++ */
++
++ if (type < MIN_OBJ_TYPE || type > MAX_OBJ_TYPE) {
++ ret = -EINVAL;
++ goto out;
++ }
++
++ file = fget(fd);
++ if (!file) {
++ ret = -EBADF;
++ goto out;
++ }
++
++ ret = do_sys_od_open(file, type, obj_id, config);
++
++ fput(file);
++
++out:
++ return ret;
++}
++
++
++asmlinkage long sys_od_close(int od)
++{
++ int ret = -EINVAL;
++ struct task_struct *t = current;
++
++ if (od < 0 || od >= MAX_OBJECT_DESCRIPTORS)
++ return ret;
++
++ if (!t->od_table || !t->od_table[od].used)
++ return ret;
++
++
++ /* give the class a chance to reject the close
++ */
++ ret = fdso_close(t->od_table + od);
++ if (ret == 0)
++ ret = put_od_entry(t->od_table + od);
++
++ return ret;
++}
+diff --git a/litmus/fmlp.c b/litmus/fmlp.c
+new file mode 100644
+index 0000000..03fa735
+--- /dev/null
++++ b/litmus/fmlp.c
+@@ -0,0 +1,268 @@
++/*
++ * FMLP implementation.
++ * Much of the code here is borrowed from include/asm-i386/semaphore.h
++ */
++
++#include
++
++#include
++#include
++#include
++#include
++
++#include
++#include
++#include
++
++#include
++
++#include
++
++#ifdef CONFIG_FMLP
++
++static void* create_fmlp_semaphore(void)
++{
++ struct pi_semaphore* sem;
++ int i;
++
++ sem = kmalloc(sizeof(*sem), GFP_KERNEL);
++ if (!sem)
++ return NULL;
++ atomic_set(&sem->count, 1);
++ sem->sleepers = 0;
++ init_waitqueue_head(&sem->wait);
++ sem->hp.task = NULL;
++ sem->holder = NULL;
++ for (i = 0; i < NR_CPUS; i++)
++ sem->hp.cpu_task[i] = NULL;
++ return sem;
++}
++
++static int open_fmlp_semaphore(struct od_table_entry* entry, void* __user arg)
++{
++ if (!fmlp_active())
++ return -EBUSY;
++ return 0;
++}
++
++static void destroy_fmlp_semaphore(void* sem)
++{
++ /* XXX assert invariants */
++ kfree(sem);
++}
++
++struct fdso_ops fmlp_sem_ops = {
++ .create = create_fmlp_semaphore,
++ .open = open_fmlp_semaphore,
++ .destroy = destroy_fmlp_semaphore
++};
++
++struct wq_pair {
++ struct task_struct* tsk;
++ struct pi_semaphore* sem;
++};
++
++static int rt_pi_wake_up(wait_queue_t *wait, unsigned mode, int sync,
++ void *key)
++{
++ struct wq_pair* wqp = (struct wq_pair*) wait->private;
++ set_rt_flags(wqp->tsk, RT_F_EXIT_SEM);
++ litmus->inherit_priority(wqp->sem, wqp->tsk);
++ TRACE_TASK(wqp->tsk,
++ "woken up by rt_pi_wake_up() (RT_F_SEM_EXIT, PI)\n");
++ /* point to task for default_wake_function() */
++ wait->private = wqp->tsk;
++ default_wake_function(wait, mode, sync, key);
++
++ /* Always return true since we know that if we encountered a task
++ * that was already running the wake_up raced with the schedule in
++ * rt_pi_down(). In that case the task in rt_pi_down() will be scheduled
++ * immediately and own the lock. We must not wake up another task in
++ * any case.
++ */
++ return 1;
++}
++
++/* caller is responsible for locking */
++int edf_set_hp_task(struct pi_semaphore *sem)
++{
++ struct list_head *tmp, *next;
++ struct task_struct *queued;
++ int ret = 0;
++
++ sem->hp.task = NULL;
++ list_for_each_safe(tmp, next, &sem->wait.task_list) {
++ queued = ((struct wq_pair*)
++ list_entry(tmp, wait_queue_t,
++ task_list)->private)->tsk;
++
++ /* Compare task prios, find high prio task. */
++ if (edf_higher_prio(queued, sem->hp.task)) {
++ sem->hp.task = queued;
++ ret = 1;
++ }
++ }
++ return ret;
++}
++
++/* caller is responsible for locking */
++int edf_set_hp_cpu_task(struct pi_semaphore *sem, int cpu)
++{
++ struct list_head *tmp, *next;
++ struct task_struct *queued;
++ int ret = 0;
++
++ sem->hp.cpu_task[cpu] = NULL;
++ list_for_each_safe(tmp, next, &sem->wait.task_list) {
++ queued = ((struct wq_pair*)
++ list_entry(tmp, wait_queue_t,
++ task_list)->private)->tsk;
++
++ /* Compare task prios, find high prio task. */
++ if (get_partition(queued) == cpu &&
++ edf_higher_prio(queued, sem->hp.cpu_task[cpu])) {
++ sem->hp.cpu_task[cpu] = queued;
++ ret = 1;
++ }
++ }
++ return ret;
++}
++
++static int do_fmlp_down(struct pi_semaphore* sem)
++{
++ unsigned long flags;
++ struct task_struct *tsk = current;
++ struct wq_pair pair;
++ int suspended = 1;
++ wait_queue_t wait = {
++ .private = &pair,
++ .func = rt_pi_wake_up,
++ .task_list = {NULL, NULL}
++ };
++
++ pair.tsk = tsk;
++ pair.sem = sem;
++ spin_lock_irqsave(&sem->wait.lock, flags);
++
++ if (atomic_dec_return(&sem->count) < 0 ||
++ waitqueue_active(&sem->wait)) {
++ /* we need to suspend */
++ tsk->state = TASK_UNINTERRUPTIBLE;
++ add_wait_queue_exclusive_locked(&sem->wait, &wait);
++
++ TRACE_CUR("suspends on PI lock %p\n", sem);
++ litmus->pi_block(sem, tsk);
++
++ /* release lock before sleeping */
++ spin_unlock_irqrestore(&sem->wait.lock, flags);
++
++ TS_PI_DOWN_END;
++ preempt_enable_no_resched();
++
++
++		/* We depend on the FIFO order of the wait queue.
++		 * Thus, we don't need to recheck when we wake up; we
++		 * are guaranteed to hold the lock since there is only one
++		 * wake-up per release.
++		 */
++ schedule();
++
++ TRACE_CUR("woke up, now owns PI lock %p\n", sem);
++
++ /* try_to_wake_up() set our state to TASK_RUNNING,
++ * all we need to do is to remove our wait queue entry
++ */
++ remove_wait_queue(&sem->wait, &wait);
++ } else {
++ /* no priority inheritance necessary, since there are no queued
++ * tasks.
++ */
++ suspended = 0;
++ TRACE_CUR("acquired PI lock %p, no contention\n", sem);
++ sem->holder = tsk;
++
++ /* don't know if we're global or partitioned. */
++ sem->hp.task = tsk;
++ sem->hp.cpu_task[get_partition(tsk)] = tsk;
++
++ litmus->inherit_priority(sem, tsk);
++ spin_unlock_irqrestore(&sem->wait.lock, flags);
++ }
++ return suspended;
++}
++
++static void do_fmlp_up(struct pi_semaphore* sem)
++{
++ unsigned long flags;
++
++ spin_lock_irqsave(&sem->wait.lock, flags);
++
++ TRACE_CUR("releases PI lock %p\n", sem);
++ litmus->return_priority(sem);
++ sem->holder = NULL;
++ if (atomic_inc_return(&sem->count) < 1)
++ /* there is a task queued */
++ wake_up_locked(&sem->wait);
++
++ spin_unlock_irqrestore(&sem->wait.lock, flags);
++}
++
++asmlinkage long sys_fmlp_down(int sem_od)
++{
++ long ret = 0;
++ struct pi_semaphore * sem;
++ int suspended = 0;
++
++ preempt_disable();
++ TS_PI_DOWN_START;
++
++ sem = lookup_fmlp_sem(sem_od);
++ if (sem)
++ suspended = do_fmlp_down(sem);
++ else
++ ret = -EINVAL;
++
++ if (!suspended) {
++ TS_PI_DOWN_END;
++ preempt_enable();
++ }
++
++ return ret;
++}
++
++asmlinkage long sys_fmlp_up(int sem_od)
++{
++ long ret = 0;
++ struct pi_semaphore * sem;
++
++ preempt_disable();
++ TS_PI_UP_START;
++
++ sem = lookup_fmlp_sem(sem_od);
++ if (sem)
++ do_fmlp_up(sem);
++ else
++ ret = -EINVAL;
++
++
++ TS_PI_UP_END;
++ preempt_enable();
++
++ return ret;
++}
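
For context, the call sequence a user-space task goes through to use an FMLP semaphore, sketched with hypothetical 1:1 wrappers around sys_od_open(), sys_fmlp_down(), sys_fmlp_up() and sys_od_close(); the wrapper names, the FMLP_SEM value, and the namespace file are assumptions, not taken from this patch:

#include <fcntl.h>
#include <unistd.h>

/* hypothetical 1:1 wrappers around the syscalls above (names assumed) */
int od_open(int fd, int type, int obj_id);
int od_close(int od);
int fmlp_down(int od);
int fmlp_up(int od);

#define FMLP_SEM 0   /* assumed obj_type_t value for FMLP semaphores */

int use_shared_resource(void)
{
	int fd, od;

	/* all tasks sharing the semaphore open the same file; the
	 * (inode, type, id) triple names the shared object */
	fd = open("/tmp/fmlp_namespace", O_RDONLY | O_CREAT, 0666);
	if (fd < 0)
		return -1;

	od = od_open(fd, FMLP_SEM, 0 /* obj_id */);
	if (od < 0) {
		close(fd);
		return -1;
	}

	fmlp_down(od);          /* may suspend until the lock is free */
	/* ... critical section ... */
	fmlp_up(od);            /* wakes up the next task in FIFO order */

	od_close(od);
	close(fd);
	return 0;
}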
++
++#else
++
++struct fdso_ops fmlp_sem_ops = {};
++
++asmlinkage long sys_fmlp_down(int sem_od)
++{
++ return -ENOSYS;
++}
++
++asmlinkage long sys_fmlp_up(int sem_od)
++{
++ return -ENOSYS;
++}
++
++#endif
+diff --git a/litmus/ft_event.c b/litmus/ft_event.c
+new file mode 100644
+index 0000000..399a07b
+--- /dev/null
++++ b/litmus/ft_event.c
+@@ -0,0 +1,43 @@
++#include
++
++#include
++
++#if !defined(CONFIG_ARCH_HAS_FEATHER_TRACE) || defined(CONFIG_DEBUG_RODATA)
++/* provide dummy implementation */
++
++int ft_events[MAX_EVENTS];
++
++int ft_enable_event(unsigned long id)
++{
++ if (id < MAX_EVENTS) {
++ ft_events[id]++;
++ return 1;
++ } else
++ return 0;
++}
++
++int ft_disable_event(unsigned long id)
++{
++ if (id < MAX_EVENTS && ft_events[id]) {
++ ft_events[id]--;
++ return 1;
++ } else
++ return 0;
++}
++
++int ft_disable_all_events(void)
++{
++ int i;
++
++ for (i = 0; i < MAX_EVENTS; i++)
++ ft_events[i] = 0;
++
++ return MAX_EVENTS;
++}
++
++int ft_is_event_enabled(unsigned long id)
++{
++ return id < MAX_EVENTS && ft_events[id];
++}
++
++#endif
+diff --git a/litmus/ftdev.c b/litmus/ftdev.c
+new file mode 100644
+index 0000000..51dafae
+--- /dev/null
++++ b/litmus/ftdev.c
+@@ -0,0 +1,360 @@
++#include
++#include
++#include
++#include
++#include
++#include
++
++#include
++#include
++#include
++
++struct ft_buffer* alloc_ft_buffer(unsigned int count, size_t size)
++{
++ struct ft_buffer* buf;
++ size_t total = (size + 1) * count;
++ char* mem;
++ int order = 0, pages = 1;
++
++ buf = kmalloc(sizeof(*buf), GFP_KERNEL);
++ if (!buf)
++ return NULL;
++
++ total = (total / PAGE_SIZE) + (total % PAGE_SIZE != 0);
++ while (pages < total) {
++ order++;
++ pages *= 2;
++ }
++
++ mem = (char*) __get_free_pages(GFP_KERNEL, order);
++ if (!mem) {
++ kfree(buf);
++ return NULL;
++ }
++
++ if (!init_ft_buffer(buf, count, size,
++ mem + (count * size), /* markers at the end */
++ mem)) { /* buffer objects */
++ free_pages((unsigned long) mem, order);
++ kfree(buf);
++ return NULL;
++ }
++ return buf;
++}
++
++void free_ft_buffer(struct ft_buffer* buf)
++{
++ int order = 0, pages = 1;
++ size_t total;
++
++ if (buf) {
++ total = (buf->slot_size + 1) * buf->slot_count;
++ total = (total / PAGE_SIZE) + (total % PAGE_SIZE != 0);
++ while (pages < total) {
++ order++;
++ pages *= 2;
++ }
++ free_pages((unsigned long) buf->buffer_mem, order);
++ kfree(buf);
++ }
++}
++
++struct ftdev_event {
++ int id;
++ struct ftdev_event* next;
++};
++
++static int activate(struct ftdev_event** chain, int id)
++{
++ struct ftdev_event* ev = kmalloc(sizeof(*ev), GFP_KERNEL);
++ if (ev) {
++ printk(KERN_INFO
++ "Enabling feather-trace event %d.\n", (int) id);
++ ft_enable_event(id);
++ ev->id = id;
++ ev->next = *chain;
++ *chain = ev;
++ }
++ return ev ? 0 : -ENOMEM;
++}
++
++static void deactivate(struct ftdev_event** chain, int id)
++{
++ struct ftdev_event **cur = chain;
++ struct ftdev_event *nxt;
++ while (*cur) {
++ if ((*cur)->id == id) {
++ nxt = (*cur)->next;
++ kfree(*cur);
++ *cur = nxt;
++ printk(KERN_INFO
++ "Disabling feather-trace event %d.\n", (int) id);
++ ft_disable_event(id);
++ break;
++ }
++ cur = &(*cur)->next;
++ }
++}
++
++static int ftdev_open(struct inode *in, struct file *filp)
++{
++ struct ftdev* ftdev;
++ struct ftdev_minor* ftdm;
++ unsigned int buf_idx = iminor(in);
++ int err = 0;
++
++ ftdev = container_of(in->i_cdev, struct ftdev, cdev);
++
++ if (buf_idx >= ftdev->minor_cnt) {
++ err = -ENODEV;
++ goto out;
++ }
++ if (ftdev->can_open && (err = ftdev->can_open(ftdev, buf_idx)))
++ goto out;
++
++ ftdm = ftdev->minor + buf_idx;
++ filp->private_data = ftdm;
++
++ if (mutex_lock_interruptible(&ftdm->lock)) {
++ err = -ERESTARTSYS;
++ goto out;
++ }
++
++ if (!ftdm->readers && ftdev->alloc)
++ err = ftdev->alloc(ftdev, buf_idx);
++ if (0 == err)
++ ftdm->readers++;
++
++ mutex_unlock(&ftdm->lock);
++out:
++ return err;
++}
++
++static int ftdev_release(struct inode *in, struct file *filp)
++{
++ struct ftdev* ftdev;
++ struct ftdev_minor* ftdm;
++ unsigned int buf_idx = iminor(in);
++ int err = 0;
++
++ ftdev = container_of(in->i_cdev, struct ftdev, cdev);
++
++ if (buf_idx >= ftdev->minor_cnt) {
++ err = -ENODEV;
++ goto out;
++ }
++ ftdm = ftdev->minor + buf_idx;
++
++ if (mutex_lock_interruptible(&ftdm->lock)) {
++ err = -ERESTARTSYS;
++ goto out;
++ }
++
++ if (ftdm->readers == 1) {
++ while (ftdm->events)
++ deactivate(&ftdm->events, ftdm->events->id);
++
++ /* wait for any pending events to complete */
++ set_current_state(TASK_UNINTERRUPTIBLE);
++ schedule_timeout(HZ);
++
++ printk(KERN_ALERT "Failed trace writes: %u\n",
++ ftdm->buf->failed_writes);
++
++ if (ftdev->free)
++ ftdev->free(ftdev, buf_idx);
++ }
++
++ ftdm->readers--;
++ mutex_unlock(&ftdm->lock);
++out:
++ return err;
++}
++
++/* based on ft_buffer_read
++ * @returns < 0 : page fault
++ * = 0 : no data available
++ * = 1 : one slot copied
++ */
++static int ft_buffer_copy_to_user(struct ft_buffer* buf, char __user *dest)
++{
++ unsigned int idx;
++ int err = 0;
++ if (buf->free_count != buf->slot_count) {
++ /* data available */
++ idx = buf->read_idx % buf->slot_count;
++ if (buf->slots[idx] == SLOT_READY) {
++ err = copy_to_user(dest, ((char*) buf->buffer_mem) +
++ idx * buf->slot_size,
++ buf->slot_size);
++ if (err == 0) {
++ /* copy ok */
++ buf->slots[idx] = SLOT_FREE;
++ buf->read_idx++;
++ fetch_and_inc(&buf->free_count);
++ err = 1;
++ }
++ }
++ }
++ return err;
++}
++
++static ssize_t ftdev_read(struct file *filp,
++ char __user *to, size_t len, loff_t *f_pos)
++{
++ /* we ignore f_pos, this is strictly sequential */
++
++ ssize_t err = 0;
++ size_t chunk;
++ int copied;
++ struct ftdev_minor* ftdm = filp->private_data;
++
++ if (mutex_lock_interruptible(&ftdm->lock)) {
++ err = -ERESTARTSYS;
++ goto out;
++ }
++
++
++ chunk = ftdm->buf->slot_size;
++ while (len >= chunk) {
++ copied = ft_buffer_copy_to_user(ftdm->buf, to);
++ if (copied == 1) {
++ len -= chunk;
++ to += chunk;
++ err += chunk;
++ } else if (err == 0 && copied == 0 && ftdm->events) {
++ /* Only wait if there are any events enabled and only
++ * if we haven't copied some data yet. We cannot wait
++ * here with copied data because that data would get
++ * lost if the task is interrupted (e.g., killed).
++ */
++ set_current_state(TASK_INTERRUPTIBLE);
++ schedule_timeout(50);
++ if (signal_pending(current)) {
++ if (err == 0)
++ /* nothing read yet, signal problem */
++ err = -ERESTARTSYS;
++ break;
++ }
++ } else if (copied < 0) {
++ /* page fault */
++ err = copied;
++ break;
++ } else
++ /* nothing left to get, return to user space */
++ break;
++ }
++ mutex_unlock(&ftdm->lock);
++out:
++ return err;
++}
++
++typedef uint32_t cmd_t;
++
++static ssize_t ftdev_write(struct file *filp, const char __user *from,
++ size_t len, loff_t *f_pos)
++{
++ struct ftdev_minor* ftdm = filp->private_data;
++ ssize_t err = -EINVAL;
++ cmd_t cmd;
++ cmd_t id;
++
++ if (len % sizeof(cmd) || len < 2 * sizeof(cmd))
++ goto out;
++
++ if (copy_from_user(&cmd, from, sizeof(cmd))) {
++ err = -EFAULT;
++ goto out;
++ }
++ len -= sizeof(cmd);
++ from += sizeof(cmd);
++
++ if (cmd != FTDEV_ENABLE_CMD && cmd != FTDEV_DISABLE_CMD)
++ goto out;
++
++ if (mutex_lock_interruptible(&ftdm->lock)) {
++ err = -ERESTARTSYS;
++ goto out;
++ }
++
++ err = sizeof(cmd);
++ while (len) {
++ if (copy_from_user(&id, from, sizeof(cmd))) {
++ err = -EFAULT;
++ goto out_unlock;
++ }
++ /* FIXME: check id against list of acceptable events */
++ len -= sizeof(cmd);
++ from += sizeof(cmd);
++ if (cmd == FTDEV_DISABLE_CMD)
++ deactivate(&ftdm->events, id);
++ else if (activate(&ftdm->events, id) != 0) {
++ err = -ENOMEM;
++ goto out_unlock;
++ }
++ err += sizeof(cmd);
++ }
++
++out_unlock:
++ mutex_unlock(&ftdm->lock);
++out:
++ return err;
++}
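
A user-space sketch of the command stream ftdev_write() parses: one 32-bit command word followed by one or more 32-bit event IDs. The device path handling and the numeric value of FTDEV_ENABLE_CMD below are assumptions, not taken from this patch:

#include <fcntl.h>
#include <stdint.h>
#include <unistd.h>

#define FTDEV_ENABLE_CMD 0x0   /* assumed value; defined by the ftdev header */

int enable_one_event(const char* dev_path, uint32_t event_id)
{
	/* one command word followed by the event ID(s) to enable */
	uint32_t msg[2] = { FTDEV_ENABLE_CMD, event_id };
	int fd;
	ssize_t written;

	fd = open(dev_path, O_WRONLY);
	if (fd < 0)
		return -1;

	written = write(fd, msg, sizeof(msg));
	close(fd);
	return written == (ssize_t) sizeof(msg) ? 0 : -1;
}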
++
++struct file_operations ftdev_fops = {
++ .owner = THIS_MODULE,
++ .open = ftdev_open,
++ .release = ftdev_release,
++ .write = ftdev_write,
++ .read = ftdev_read,
++};
++
++
++void ftdev_init(struct ftdev* ftdev, struct module* owner)
++{
++ int i;
++ cdev_init(&ftdev->cdev, &ftdev_fops);
++ ftdev->cdev.owner = owner;
++ ftdev->cdev.ops = &ftdev_fops;
++ ftdev->minor_cnt = 0;
++ for (i = 0; i < MAX_FTDEV_MINORS; i++) {
++ mutex_init(&ftdev->minor[i].lock);
++ ftdev->minor[i].readers = 0;
++ ftdev->minor[i].buf = NULL;
++ ftdev->minor[i].events = NULL;
++ }
++ ftdev->alloc = NULL;
++ ftdev->free = NULL;
++ ftdev->can_open = NULL;
++}
++
++int register_ftdev(struct ftdev* ftdev, const char* name, int major)
++{
++ dev_t trace_dev;
++ int error = 0;
++
++ if(major) {
++ trace_dev = MKDEV(major, 0);
++ error = register_chrdev_region(trace_dev, ftdev->minor_cnt,
++ name);
++ } else {
++ error = alloc_chrdev_region(&trace_dev, 0, ftdev->minor_cnt,
++ name);
++ major = MAJOR(trace_dev);
++ }
++ if (error)
++ {
++ printk(KERN_WARNING "ftdev(%s): "
++ "Could not register major/minor number %d/%u\n",
++ name, major, ftdev->minor_cnt);
++ return error;
++ }
++ error = cdev_add(&ftdev->cdev, trace_dev, ftdev->minor_cnt);
++ if (error) {
++ printk(KERN_WARNING "ftdev(%s): "
++ "Could not add cdev for major/minor = %d/%u.\n",
++ name, major, ftdev->minor_cnt);
++ return error;
++ }
++ return error;
++}
+diff --git a/litmus/jobs.c b/litmus/jobs.c
+new file mode 100644
+index 0000000..36e3146
+--- /dev/null
++++ b/litmus/jobs.c
+@@ -0,0 +1,43 @@
++/* litmus/jobs.c - common job control code
++ */
++
++#include
++
++#include
++#include
++
++void prepare_for_next_period(struct task_struct *t)
++{
++ BUG_ON(!t);
++ /* prepare next release */
++ t->rt_param.job_params.release = t->rt_param.job_params.deadline;
++ t->rt_param.job_params.deadline += get_rt_period(t);
++ t->rt_param.job_params.exec_time = 0;
++ /* update job sequence number */
++ t->rt_param.job_params.job_no++;
++
++ /* don't confuse Linux */
++ t->rt.time_slice = 1;
++}
++
++void release_at(struct task_struct *t, lt_t start)
++{
++ t->rt_param.job_params.deadline = start;
++ prepare_for_next_period(t);
++ set_rt_flags(t, RT_F_RUNNING);
++}
++
++
++/*
++ * Deactivate current task until the beginning of the next period.
++ */
++long complete_job(void)
++{
++	/* Mark that we do not execute anymore. */
++	set_rt_flags(current, RT_F_SLEEP);
++	/* Call schedule(); this will return when a new job arrives.
++	 * It also takes care of preparing for the next release.
++	 */
++ schedule();
++ return 0;
++}
+diff --git a/litmus/litmus.c b/litmus/litmus.c
+new file mode 100644
+index 0000000..b04a42b
+--- /dev/null
++++ b/litmus/litmus.c
+@@ -0,0 +1,799 @@
++/*
++ * litmus.c -- Implementation of the LITMUS syscalls,
++ *             the LITMUS initialization code,
++ *             and the procfs interface.
++ */
++#include
++#include
++#include
++
++#include
++#include
++#include
++
++#include
++#include
++#include
++
++#include
++
++#include
++
++#include
++
++/* Number of RT tasks that exist in the system */
++atomic_t rt_task_count = ATOMIC_INIT(0);
++static DEFINE_RAW_SPINLOCK(task_transition_lock);
++/* synchronize plugin switching */
++atomic_t cannot_use_plugin = ATOMIC_INIT(0);
++
++/* Give log messages sequential IDs. */
++atomic_t __log_seq_no = ATOMIC_INIT(0);
++
++#ifdef CONFIG_RELEASE_MASTER
++/* current master CPU for handling timer IRQs */
++atomic_t release_master_cpu = ATOMIC_INIT(NO_CPU);
++#endif
++
++static struct kmem_cache * bheap_node_cache;
++extern struct kmem_cache * release_heap_cache;
++
++struct bheap_node* bheap_node_alloc(int gfp_flags)
++{
++ return kmem_cache_alloc(bheap_node_cache, gfp_flags);
++}
++
++void bheap_node_free(struct bheap_node* hn)
++{
++ kmem_cache_free(bheap_node_cache, hn);
++}
++
++struct release_heap* release_heap_alloc(int gfp_flags);
++void release_heap_free(struct release_heap* rh);
++
++/*
++ * sys_set_rt_task_param
++ * @pid: PID of the task whose scheduling parameters are to be changed
++ * @param: New real-time extension parameters such as the execution cost and
++ *         period
++ * Syscall for manipulating task RT extension params
++ * Returns EFAULT if param is NULL.
++ *         ESRCH  if pid does not correspond
++ *                to a valid task.
++ *         EINVAL if either period or execution cost is <= 0
++ *         EBUSY  if the task is already a real-time task
++ *         0      on success
++ *
++ * Only non-real-time tasks may be configured with this system call
++ * to avoid races with the scheduler. In practice, this means that a
++ * task's parameters must be set _before_ calling sys_prepare_rt_task()
++ *
++ * find_task_by_vpid() assumes that we are in the same namespace of the
++ * target.
++ */
++asmlinkage long sys_set_rt_task_param(pid_t pid, struct rt_task __user * param)
++{
++ struct rt_task tp;
++ struct task_struct *target;
++ int retval = -EINVAL;
++
++ printk("Setting up rt task parameters for process %d.\n", pid);
++
++ if (pid < 0 || param == 0) {
++ goto out;
++ }
++ if (copy_from_user(&tp, param, sizeof(tp))) {
++ retval = -EFAULT;
++ goto out;
++ }
++
++ /* Task search and manipulation must be protected */
++ read_lock_irq(&tasklist_lock);
++ if (!(target = find_task_by_vpid(pid))) {
++ retval = -ESRCH;
++ goto out_unlock;
++ }
++
++ if (is_realtime(target)) {
++ /* The task is already a real-time task.
++		 * We cannot allow parameter changes at this point.
++ */
++ retval = -EBUSY;
++ goto out_unlock;
++ }
++
++ if (tp.exec_cost <= 0)
++ goto out_unlock;
++ if (tp.period <= 0)
++ goto out_unlock;
++ if (!cpu_online(tp.cpu))
++ goto out_unlock;
++ if (tp.period < tp.exec_cost)
++ {
++ printk(KERN_INFO "litmus: real-time task %d rejected "
++ "because wcet > period\n", pid);
++ goto out_unlock;
++ }
++ if (tp.budget_policy != NO_ENFORCEMENT &&
++ tp.budget_policy != QUANTUM_ENFORCEMENT &&
++ tp.budget_policy != PRECISE_ENFORCEMENT)
++ {
++ printk(KERN_INFO "litmus: real-time task %d rejected "
++ "because unsupported budget enforcement policy "
++ "specified (%d)\n",
++ pid, tp.budget_policy);
++ goto out_unlock;
++ }
++
++ target->rt_param.task_params = tp;
++
++ retval = 0;
++ out_unlock:
++ read_unlock_irq(&tasklist_lock);
++ out:
++ return retval;
++}
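
A user-space sketch of the expected call, using only the fields that are validated above; the wrapper name, the header install path, and the nanosecond time unit are assumptions:

#include <string.h>
#include <sys/types.h>
#include <litmus/rt_param.h>   /* struct rt_task, NO_ENFORCEMENT, ... (assumed path) */

/* assumed liblitmus-style wrapper around the syscall above */
int set_rt_task_param(pid_t pid, struct rt_task* param);

int configure_rt_task(pid_t pid)
{
	struct rt_task tp;

	memset(&tp, 0, sizeof(tp));
	tp.exec_cost     = 10000000ULL;   /* 10 ms WCET (assuming nanoseconds) */
	tp.period        = 100000000ULL;  /* 100 ms period */
	tp.cpu           = 0;             /* partition / cluster of the task */
	tp.budget_policy = NO_ENFORCEMENT;

	/* must happen while the task is still a best-effort task,
	 * otherwise the syscall above returns -EBUSY */
	return set_rt_task_param(pid, &tp);
}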
++
++/*
++ * Getter of a task's RT params
++ *   returns EINVAL if param is NULL or pid is invalid
++ * returns ESRCH if pid does not correspond to a valid task
++ * returns EFAULT if copying of parameters has failed.
++ *
++ * find_task_by_vpid() assumes that we are in the same namespace of the
++ * target.
++ */
++asmlinkage long sys_get_rt_task_param(pid_t pid, struct rt_task __user * param)
++{
++ int retval = -EINVAL;
++ struct task_struct *source;
++ struct rt_task lp;
++ if (param == 0 || pid < 0)
++ goto out;
++ read_lock(&tasklist_lock);
++ if (!(source = find_task_by_vpid(pid))) {
++ retval = -ESRCH;
++ goto out_unlock;
++ }
++ lp = source->rt_param.task_params;
++ read_unlock(&tasklist_lock);
++ /* Do copying outside the lock */
++ retval =
++ copy_to_user(param, &lp, sizeof(lp)) ? -EFAULT : 0;
++ return retval;
++ out_unlock:
++ read_unlock(&tasklist_lock);
++ out:
++ return retval;
++
++}
++
++/*
++ * This is the crucial function for the periodic task implementation.
++ * It checks whether a task is periodic, whether such a sleep
++ * is permitted, and calls the plugin-specific sleep, which puts the
++ * task into a wait array.
++ * returns 0 on successful wakeup
++ * returns EPERM if current conditions do not permit such sleep
++ * returns EINVAL if current task is not able to go to sleep
++ */
++asmlinkage long sys_complete_job(void)
++{
++ int retval = -EPERM;
++ if (!is_realtime(current)) {
++ retval = -EINVAL;
++ goto out;
++ }
++ /* Task with negative or zero period cannot sleep */
++ if (get_rt_period(current) <= 0) {
++ retval = -EINVAL;
++ goto out;
++ }
++ /* The plugin has to put the task into an
++ * appropriate queue and call schedule
++ */
++ retval = litmus->complete_job();
++ out:
++ return retval;
++}
++
++/* This is an "improved" version of sys_complete_job that
++ * addresses the problem of unintentionally missing a job after
++ * an overrun.
++ *
++ * returns 0 on successful wakeup
++ * returns EPERM if current conditions do not permit such sleep
++ * returns EINVAL if current task is not able to go to sleep
++ */
++asmlinkage long sys_wait_for_job_release(unsigned int job)
++{
++ int retval = -EPERM;
++ if (!is_realtime(current)) {
++ retval = -EINVAL;
++ goto out;
++ }
++
++ /* Task with negative or zero period cannot sleep */
++ if (get_rt_period(current) <= 0) {
++ retval = -EINVAL;
++ goto out;
++ }
++
++ retval = 0;
++
++ /* first wait until we have "reached" the desired job
++ *
++ * This implementation has at least two problems:
++ *
++ * 1) It doesn't gracefully handle the wrap around of
++ * job_no. Since LITMUS is a prototype, this is not much
++ * of a problem right now.
++ *
++ * 2) It is theoretically racy if a job release occurs
++ * between checking job_no and calling sleep_next_period().
++	 * A proper solution would require adding another callback
++ * in the plugin structure and testing the condition with
++ * interrupts disabled.
++ *
++ * FIXME: At least problem 2 should be taken care of eventually.
++ */
++ while (!retval && job > current->rt_param.job_params.job_no)
++ /* If the last job overran then job <= job_no and we
++ * don't send the task to sleep.
++ */
++ retval = litmus->complete_job();
++ out:
++ return retval;
++}
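
A minimal sketch of the periodic job model these syscalls support; the wrapper name sleep_next_period() follows the comment above and is assumed to map onto sys_complete_job():

/* assumed liblitmus-style wrapper around sys_complete_job() */
int sleep_next_period(void);

/* each loop iteration is one job of the periodic task */
void periodic_task_body(int njobs)
{
	int i;

	for (i = 0; i < njobs; i++) {
		/* ... perform one job's worth of work ... */
		sleep_next_period();   /* suspend until the next release */
	}
}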
++
++/* This is a helper syscall to query the current job sequence number.
++ *
++ * returns 0 on successful query
++ * returns EPERM if task is not a real-time task.
++ * returns EFAULT if &job is not a valid pointer.
++ */
++asmlinkage long sys_query_job_no(unsigned int __user *job)
++{
++ int retval = -EPERM;
++ if (is_realtime(current))
++ retval = put_user(current->rt_param.job_params.job_no, job);
++
++ return retval;
++}
++
++/* sys_null_call() is only used for determining raw system call
++ * overheads (kernel entry, kernel exit). It has no useful side effects.
++ * If ts is non-NULL, then the current Feather-Trace time is recorded.
++ */
++asmlinkage long sys_null_call(cycles_t __user *ts)
++{
++ long ret = 0;
++ cycles_t now;
++
++ if (ts) {
++ now = get_cycles();
++ ret = put_user(now, ts);
++ }
++
++ return ret;
++}
++
++/* p is a real-time task. Re-init its state as a best-effort task. */
++static void reinit_litmus_state(struct task_struct* p, int restore)
++{
++ struct rt_task user_config = {};
++ void* ctrl_page = NULL;
++
++ if (restore) {
++		/* Save the user-space provided configuration data
++		 * and the allocated control page. */
++ user_config = p->rt_param.task_params;
++ ctrl_page = p->rt_param.ctrl_page;
++ }
++
++ /* We probably should not be inheriting any task's priority
++ * at this point in time.
++ */
++ WARN_ON(p->rt_param.inh_task);
++
++ /* We need to restore the priority of the task. */
++// __setscheduler(p, p->rt_param.old_policy, p->rt_param.old_prio); XXX why is this commented?
++
++ /* Cleanup everything else. */
++ memset(&p->rt_param, 0, sizeof(p->rt_param));
++
++ /* Restore preserved fields. */
++ if (restore) {
++ p->rt_param.task_params = user_config;
++ p->rt_param.ctrl_page = ctrl_page;
++ }
++}
++
++long litmus_admit_task(struct task_struct* tsk)
++{
++ long retval = 0;
++ unsigned long flags;
++
++ BUG_ON(is_realtime(tsk));
++
++ if (get_rt_period(tsk) == 0 ||
++ get_exec_cost(tsk) > get_rt_period(tsk)) {
++ TRACE_TASK(tsk, "litmus admit: invalid task parameters "
++ "(%lu, %lu)\n",
++ get_exec_cost(tsk), get_rt_period(tsk));
++ retval = -EINVAL;
++ goto out;
++ }
++
++ if (!cpu_online(get_partition(tsk))) {
++ TRACE_TASK(tsk, "litmus admit: cpu %d is not online\n",
++ get_partition(tsk));
++ retval = -EINVAL;
++ goto out;
++ }
++
++ INIT_LIST_HEAD(&tsk_rt(tsk)->list);
++
++ /* avoid scheduler plugin changing underneath us */
++ raw_spin_lock_irqsave(&task_transition_lock, flags);
++
++ /* allocate heap node for this task */
++ tsk_rt(tsk)->heap_node = bheap_node_alloc(GFP_ATOMIC);
++ tsk_rt(tsk)->rel_heap = release_heap_alloc(GFP_ATOMIC);
++
++ if (!tsk_rt(tsk)->heap_node || !tsk_rt(tsk)->rel_heap) {
++ printk(KERN_WARNING "litmus: no more heap node memory!?\n");
++
++ bheap_node_free(tsk_rt(tsk)->heap_node);
++ release_heap_free(tsk_rt(tsk)->rel_heap);
++
++ retval = -ENOMEM;
++ goto out_unlock;
++ } else {
++ bheap_node_init(&tsk_rt(tsk)->heap_node, tsk);
++ }
++
++ retval = litmus->admit_task(tsk);
++
++ if (!retval) {
++ sched_trace_task_name(tsk);
++ sched_trace_task_param(tsk);
++ atomic_inc(&rt_task_count);
++ }
++
++out_unlock:
++ raw_spin_unlock_irqrestore(&task_transition_lock, flags);
++out:
++ return retval;
++}
++
++void litmus_exit_task(struct task_struct* tsk)
++{
++ if (is_realtime(tsk)) {
++ sched_trace_task_completion(tsk, 1);
++
++ litmus->task_exit(tsk);
++
++ BUG_ON(bheap_node_in_heap(tsk_rt(tsk)->heap_node));
++ bheap_node_free(tsk_rt(tsk)->heap_node);
++ release_heap_free(tsk_rt(tsk)->rel_heap);
++
++ atomic_dec(&rt_task_count);
++ reinit_litmus_state(tsk, 1);
++ }
++}
++
++/* IPI callback to synchronize plugin switching */
++static void synch_on_plugin_switch(void* info)
++{
++ while (atomic_read(&cannot_use_plugin))
++ cpu_relax();
++}
++
++/* Switching a plugin in use is tricky.
++ * We must watch out that no real-time tasks exist
++ * (and that none is created in parallel) and that the plugin is not
++ * currently in use on any processor (in theory).
++ */
++int switch_sched_plugin(struct sched_plugin* plugin)
++{
++ unsigned long flags;
++ int ret = 0;
++
++ BUG_ON(!plugin);
++
++ /* forbid other cpus to use the plugin */
++ atomic_set(&cannot_use_plugin, 1);
++ /* send IPI to force other CPUs to synch with us */
++ smp_call_function(synch_on_plugin_switch, NULL, 0);
++
++ /* stop task transitions */
++ raw_spin_lock_irqsave(&task_transition_lock, flags);
++
++ /* don't switch if there are active real-time tasks */
++ if (atomic_read(&rt_task_count) == 0) {
++ ret = litmus->deactivate_plugin();
++ if (0 != ret)
++ goto out;
++ ret = plugin->activate_plugin();
++ if (0 != ret) {
++ printk(KERN_INFO "Can't activate %s (%d).\n",
++ plugin->plugin_name, ret);
++ plugin = &linux_sched_plugin;
++ }
++ printk(KERN_INFO "Switching to LITMUS^RT plugin %s.\n", plugin->plugin_name);
++ litmus = plugin;
++ } else
++ ret = -EBUSY;
++out:
++ raw_spin_unlock_irqrestore(&task_transition_lock, flags);
++ atomic_set(&cannot_use_plugin, 0);
++ return ret;
++}
++
++/* Called upon fork.
++ * p is the newly forked task.
++ */
++void litmus_fork(struct task_struct* p)
++{
++ if (is_realtime(p))
++ /* clean out any litmus related state, don't preserve anything */
++ reinit_litmus_state(p, 0);
++ else
++ /* non-rt tasks might have ctrl_page set */
++ tsk_rt(p)->ctrl_page = NULL;
++
++ /* od tables are never inherited across a fork */
++ p->od_table = NULL;
++}
++
++/* Called upon execve().
++ * current is doing the exec.
++ * Don't let address space specific stuff leak.
++ */
++void litmus_exec(void)
++{
++ struct task_struct* p = current;
++
++ if (is_realtime(p)) {
++ WARN_ON(p->rt_param.inh_task);
++ if (tsk_rt(p)->ctrl_page) {
++ free_page((unsigned long) tsk_rt(p)->ctrl_page);
++ tsk_rt(p)->ctrl_page = NULL;
++ }
++ }
++}
++
++void exit_litmus(struct task_struct *dead_tsk)
++{
++ /* We also allow non-RT tasks to
++	 * allocate control pages so that
++	 * measurements can include non-RT tasks.
++	 * So check in any case whether we need
++	 * to free the page.
++ */
++ if (tsk_rt(dead_tsk)->ctrl_page) {
++ TRACE_TASK(dead_tsk,
++ "freeing ctrl_page %p\n",
++ tsk_rt(dead_tsk)->ctrl_page);
++ free_page((unsigned long) tsk_rt(dead_tsk)->ctrl_page);
++ }
++
++ /* main cleanup only for RT tasks */
++ if (is_realtime(dead_tsk))
++ litmus_exit_task(dead_tsk);
++}
++
++
++#ifdef CONFIG_MAGIC_SYSRQ
++int sys_kill(int pid, int sig);
++
++static void sysrq_handle_kill_rt_tasks(int key, struct tty_struct *tty)
++{
++ struct task_struct *t;
++ read_lock(&tasklist_lock);
++ for_each_process(t) {
++ if (is_realtime(t)) {
++ sys_kill(t->pid, SIGKILL);
++ }
++ }
++ read_unlock(&tasklist_lock);
++}
++
++static struct sysrq_key_op sysrq_kill_rt_tasks_op = {
++ .handler = sysrq_handle_kill_rt_tasks,
++ .help_msg = "quit-rt-tasks(X)",
++ .action_msg = "sent SIGKILL to all LITMUS^RT real-time tasks",
++};
++#endif
++
++/* in litmus/sync.c */
++int count_tasks_waiting_for_release(void);
++
++static int proc_read_stats(char *page, char **start,
++ off_t off, int count,
++ int *eof, void *data)
++{
++ int len;
++
++ len = snprintf(page, PAGE_SIZE,
++ "real-time tasks = %d\n"
++ "ready for release = %d\n",
++ atomic_read(&rt_task_count),
++ count_tasks_waiting_for_release());
++ return len;
++}
++
++static int proc_read_plugins(char *page, char **start,
++ off_t off, int count,
++ int *eof, void *data)
++{
++ int len;
++
++ len = print_sched_plugins(page, PAGE_SIZE);
++ return len;
++}
++
++static int proc_read_curr(char *page, char **start,
++ off_t off, int count,
++ int *eof, void *data)
++{
++ int len;
++
++ len = snprintf(page, PAGE_SIZE, "%s\n", litmus->plugin_name);
++ return len;
++}
++
++static int proc_write_curr(struct file *file,
++ const char *buffer,
++ unsigned long count,
++ void *data)
++{
++ int len, ret;
++ char name[65];
++ struct sched_plugin* found;
++
++ if(count > 64)
++ len = 64;
++ else
++ len = count;
++
++ if(copy_from_user(name, buffer, len))
++ return -EFAULT;
++
++ name[len] = '\0';
++ /* chomp name */
++ if (len > 1 && name[len - 1] == '\n')
++ name[len - 1] = '\0';
++
++ found = find_sched_plugin(name);
++
++ if (found) {
++ ret = switch_sched_plugin(found);
++ if (ret != 0)
++ printk(KERN_INFO "Could not switch plugin: %d\n", ret);
++ } else
++ printk(KERN_INFO "Plugin '%s' is unknown.\n", name);
++
++ return len;
++}
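
A sketch of the user-space side of this handler: writing a plugin name to /proc/litmus/active_plugin (the path follows from init_litmus_proc() below) selects the active plugin; the plugin name used here is only an example:

#include <stdio.h>

/* returns 0 on success; the switch fails in the kernel (-EBUSY)
 * while real-time tasks still exist */
int select_plugin(const char* name)
{
	FILE* f = fopen("/proc/litmus/active_plugin", "w");
	int ok;

	if (!f)
		return -1;
	ok = fprintf(f, "%s\n", name) > 0;   /* trailing newline is chomped */
	return (fclose(f) == 0 && ok) ? 0 : -1;
}

/* e.g.: select_plugin("PSN-EDF"); the name must match a registered plugin */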
++
++static int proc_read_cluster_size(char *page, char **start,
++ off_t off, int count,
++ int *eof, void *data)
++{
++ int len;
++ if (cluster_cache_index == 2)
++ len = snprintf(page, PAGE_SIZE, "L2\n");
++ else if (cluster_cache_index == 3)
++ len = snprintf(page, PAGE_SIZE, "L3\n");
++ else if (cluster_cache_index == 1)
++ len = snprintf(page, PAGE_SIZE, "L1\n");
++ else
++ len = snprintf(page, PAGE_SIZE, "ALL\n");
++
++ return len;
++}
++
++static int proc_write_cluster_size(struct file *file,
++ const char *buffer,
++ unsigned long count,
++ void *data)
++{
++ int len;
++ /* L2, L3 */
++ char cache_name[33];
++
++ if(count > 32)
++ len = 32;
++ else
++ len = count;
++
++ if(copy_from_user(cache_name, buffer, len))
++ return -EFAULT;
++
++ cache_name[len] = '\0';
++ /* chomp name */
++ if (len > 1 && cache_name[len - 1] == '\n')
++ cache_name[len - 1] = '\0';
++
++ /* do a quick and dirty comparison to find the cluster size */
++ if (!strcmp(cache_name, "L2"))
++ cluster_cache_index = 2;
++ else if (!strcmp(cache_name, "L3"))
++ cluster_cache_index = 3;
++ else if (!strcmp(cache_name, "L1"))
++ cluster_cache_index = 1;
++ else if (!strcmp(cache_name, "ALL"))
++ cluster_cache_index = num_online_cpus();
++ else
++ printk(KERN_INFO "Cluster '%s' is unknown.\n", cache_name);
++
++ return len;
++}
++
++#ifdef CONFIG_RELEASE_MASTER
++static int proc_read_release_master(char *page, char **start,
++ off_t off, int count,
++ int *eof, void *data)
++{
++ int len, master;
++ master = atomic_read(&release_master_cpu);
++ if (master == NO_CPU)
++ len = snprintf(page, PAGE_SIZE, "NO_CPU\n");
++ else
++ len = snprintf(page, PAGE_SIZE, "%d\n", master);
++ return len;
++}
++
++static int proc_write_release_master(struct file *file,
++ const char *buffer,
++ unsigned long count,
++ void *data)
++{
++ int cpu, err, online = 0;
++ char msg[64];
++
++ if (count > 63)
++ return -EINVAL;
++
++ if (copy_from_user(msg, buffer, count))
++ return -EFAULT;
++
++ /* terminate */
++ msg[count] = '\0';
++ /* chomp */
++ if (count > 1 && msg[count - 1] == '\n')
++ msg[count - 1] = '\0';
++
++ if (strcmp(msg, "NO_CPU") == 0) {
++ atomic_set(&release_master_cpu, NO_CPU);
++ return count;
++ } else {
++ err = sscanf(msg, "%d", &cpu);
++ if (err == 1 && cpu >= 0 && (online = cpu_online(cpu))) {
++ atomic_set(&release_master_cpu, cpu);
++ return count;
++ } else {
++ TRACE("invalid release master: '%s' "
++ "(err:%d cpu:%d online:%d)\n",
++ msg, err, cpu, online);
++ return -EINVAL;
++ }
++ }
++}
++#endif
++
++static struct proc_dir_entry *litmus_dir = NULL,
++ *curr_file = NULL,
++ *stat_file = NULL,
++ *plugs_file = NULL,
++#ifdef CONFIG_RELEASE_MASTER
++ *release_master_file = NULL,
++#endif
++ *clus_cache_idx_file = NULL;
++
++static int __init init_litmus_proc(void)
++{
++ litmus_dir = proc_mkdir("litmus", NULL);
++ if (!litmus_dir) {
++ printk(KERN_ERR "Could not allocate LITMUS^RT procfs entry.\n");
++ return -ENOMEM;
++ }
++
++ curr_file = create_proc_entry("active_plugin",
++ 0644, litmus_dir);
++ if (!curr_file) {
++ printk(KERN_ERR "Could not allocate active_plugin "
++ "procfs entry.\n");
++ return -ENOMEM;
++ }
++ curr_file->read_proc = proc_read_curr;
++ curr_file->write_proc = proc_write_curr;
++
++#ifdef CONFIG_RELEASE_MASTER
++ release_master_file = create_proc_entry("release_master",
++ 0644, litmus_dir);
++ if (!release_master_file) {
++ printk(KERN_ERR "Could not allocate release_master "
++ "procfs entry.\n");
++ return -ENOMEM;
++ }
++ release_master_file->read_proc = proc_read_release_master;
++ release_master_file->write_proc = proc_write_release_master;
++#endif
++
++ clus_cache_idx_file = create_proc_entry("cluster_cache",
++ 0644, litmus_dir);
++ if (!clus_cache_idx_file) {
++ printk(KERN_ERR "Could not allocate cluster_cache "
++ "procfs entry.\n");
++ return -ENOMEM;
++ }
++ clus_cache_idx_file->read_proc = proc_read_cluster_size;
++ clus_cache_idx_file->write_proc = proc_write_cluster_size;
++
++ stat_file = create_proc_read_entry("stats", 0444, litmus_dir,
++ proc_read_stats, NULL);
++
++ plugs_file = create_proc_read_entry("plugins", 0444, litmus_dir,
++ proc_read_plugins, NULL);
++
++ return 0;
++}
++
++static void exit_litmus_proc(void)
++{
++ if (plugs_file)
++ remove_proc_entry("plugins", litmus_dir);
++ if (stat_file)
++ remove_proc_entry("stats", litmus_dir);
++ if (curr_file)
++ remove_proc_entry("active_plugin", litmus_dir);
++ if (clus_cache_idx_file)
++ remove_proc_entry("cluster_cache", litmus_dir);
++#ifdef CONFIG_RELEASE_MASTER
++ if (release_master_file)
++ remove_proc_entry("release_master", litmus_dir);
++#endif
++ if (litmus_dir)
++ remove_proc_entry("litmus", NULL);
++}
++
++extern struct sched_plugin linux_sched_plugin;
++
++static int __init _init_litmus(void)
++{
++ /* Common initializers,
++ * mode change lock is used to enforce single mode change
++ * operation.
++ */
++ printk("Starting LITMUS^RT kernel\n");
++
++ register_sched_plugin(&linux_sched_plugin);
++
++ bheap_node_cache = KMEM_CACHE(bheap_node, SLAB_PANIC);
++ release_heap_cache = KMEM_CACHE(release_heap, SLAB_PANIC);
++
++#ifdef CONFIG_MAGIC_SYSRQ
++ /* offer some debugging help */
++ if (!register_sysrq_key('x', &sysrq_kill_rt_tasks_op))
++ printk("Registered kill rt tasks magic sysrq.\n");
++ else
++ printk("Could not register kill rt tasks magic sysrq.\n");
++#endif
++
++ init_litmus_proc();
++
++ return 0;
++}
++
++static void _exit_litmus(void)
++{
++ exit_litmus_proc();
++ kmem_cache_destroy(bheap_node_cache);
++ kmem_cache_destroy(release_heap_cache);
++}
++
++module_init(_init_litmus);
++module_exit(_exit_litmus);
+diff --git a/litmus/rt_domain.c b/litmus/rt_domain.c
+new file mode 100644
+index 0000000..81a5ac1
+--- /dev/null
++++ b/litmus/rt_domain.c
+@@ -0,0 +1,355 @@
++/*
++ * litmus/rt_domain.c
++ *
++ * LITMUS real-time infrastructure. This file contains the
++ * functions that manipulate RT domains. RT domains are an abstraction
++ * of a ready queue and a release queue.
++ */
++
++#include
++#include
++#include
++#include
++
++#include
++#include
++#include
++
++#include
++
++#include
++
++#include
++
++/* Uncomment when debugging timer races... */
++#if 0
++#define VTRACE_TASK TRACE_TASK
++#define VTRACE TRACE
++#else
++#define VTRACE_TASK(t, fmt, args...) /* shut up */
++#define VTRACE(fmt, args...) /* be quiet already */
++#endif
++
++static int dummy_resched(rt_domain_t *rt)
++{
++ return 0;
++}
++
++static int dummy_order(struct bheap_node* a, struct bheap_node* b)
++{
++ return 0;
++}
++
++/* default implementation: use default lock */
++static void default_release_jobs(rt_domain_t* rt, struct bheap* tasks)
++{
++ merge_ready(rt, tasks);
++}
++
++static unsigned int time2slot(lt_t time)
++{
++ return (unsigned int) time2quanta(time, FLOOR) % RELEASE_QUEUE_SLOTS;
++}
++
++static enum hrtimer_restart on_release_timer(struct hrtimer *timer)
++{
++ unsigned long flags;
++ struct release_heap* rh;
++
++ VTRACE("on_release_timer(0x%p) starts.\n", timer);
++
++ TS_RELEASE_START;
++
++ rh = container_of(timer, struct release_heap, timer);
++
++ raw_spin_lock_irqsave(&rh->dom->release_lock, flags);
++ VTRACE("CB has the release_lock 0x%p\n", &rh->dom->release_lock);
++ /* remove from release queue */
++ list_del(&rh->list);
++ raw_spin_unlock_irqrestore(&rh->dom->release_lock, flags);
++ VTRACE("CB returned release_lock 0x%p\n", &rh->dom->release_lock);
++
++ /* call release callback */
++ rh->dom->release_jobs(rh->dom, &rh->heap);
++ /* WARNING: rh can be referenced from other CPUs from now on. */
++
++ TS_RELEASE_END;
++
++ VTRACE("on_release_timer(0x%p) ends.\n", timer);
++
++ return HRTIMER_NORESTART;
++}
++
++/* allocated in litmus.c */
++struct kmem_cache * release_heap_cache;
++
++struct release_heap* release_heap_alloc(int gfp_flags)
++{
++ struct release_heap* rh;
++ rh= kmem_cache_alloc(release_heap_cache, gfp_flags);
++ if (rh) {
++ /* initialize timer */
++ hrtimer_init(&rh->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
++ rh->timer.function = on_release_timer;
++ }
++ return rh;
++}
++
++void release_heap_free(struct release_heap* rh)
++{
++ /* make sure timer is no longer in use */
++ hrtimer_cancel(&rh->timer);
++ kmem_cache_free(release_heap_cache, rh);
++}
++
++/* Caller must hold release lock.
++ * Will return heap for given time. If no such heap exists prior to
++ * the invocation it will be created.
++ */
++static struct release_heap* get_release_heap(rt_domain_t *rt,
++ struct task_struct* t,
++ int use_task_heap)
++{
++ struct list_head* pos;
++ struct release_heap* heap = NULL;
++ struct release_heap* rh;
++ lt_t release_time = get_release(t);
++ unsigned int slot = time2slot(release_time);
++
++ /* initialize pos for the case that the list is empty */
++ pos = rt->release_queue.slot[slot].next;
++ list_for_each(pos, &rt->release_queue.slot[slot]) {
++ rh = list_entry(pos, struct release_heap, list);
++ if (release_time == rh->release_time) {
++ /* perfect match -- this happens on hyperperiod
++ * boundaries
++ */
++ heap = rh;
++ break;
++ } else if (lt_before(release_time, rh->release_time)) {
++ /* we need to insert a new node since rh is
++ * already in the future
++ */
++ break;
++ }
++ }
++ if (!heap && use_task_heap) {
++ /* use pre-allocated release heap */
++ rh = tsk_rt(t)->rel_heap;
++
++ rh->dom = rt;
++ rh->release_time = release_time;
++
++ /* add to release queue */
++ list_add(&rh->list, pos->prev);
++ heap = rh;
++ }
++ return heap;
++}
++
++static void reinit_release_heap(struct task_struct* t)
++{
++ struct release_heap* rh;
++
++ /* use pre-allocated release heap */
++ rh = tsk_rt(t)->rel_heap;
++
++ /* Make sure it is safe to use. The timer callback could still
++ * be executing on another CPU; hrtimer_cancel() will wait
++ * until the timer callback has completed. However, under no
++ * circumstances should the timer be active (= yet to be
++ * triggered).
++ *
++ * WARNING: If the CPU still holds the release_lock at this point,
++ * deadlock may occur!
++ */
++ BUG_ON(hrtimer_cancel(&rh->timer));
++
++ /* initialize */
++ bheap_init(&rh->heap);
++#ifdef CONFIG_RELEASE_MASTER
++ atomic_set(&rh->info.state, HRTIMER_START_ON_INACTIVE);
++#endif
++}
++/* arm_release_timer() - start local release timer or trigger
++ * remote timer (pull timer)
++ *
++ * Called by add_release() with:
++ * - tobe_lock taken
++ * - IRQ disabled
++ */
++#ifdef CONFIG_RELEASE_MASTER
++#define arm_release_timer(t) arm_release_timer_on((t), NO_CPU)
++static void arm_release_timer_on(rt_domain_t *_rt , int target_cpu)
++#else
++static void arm_release_timer(rt_domain_t *_rt)
++#endif
++{
++ rt_domain_t *rt = _rt;
++ struct list_head list;
++ struct list_head *pos, *safe;
++ struct task_struct* t;
++ struct release_heap* rh;
++
++ VTRACE("arm_release_timer() at %llu\n", litmus_clock());
++ list_replace_init(&rt->tobe_released, &list);
++
++ list_for_each_safe(pos, safe, &list) {
++		/* pick task off the work list */
++ t = list_entry(pos, struct task_struct, rt_param.list);
++ sched_trace_task_release(t);
++ list_del(pos);
++
++ /* put into release heap while holding release_lock */
++ raw_spin_lock(&rt->release_lock);
++ VTRACE_TASK(t, "I have the release_lock 0x%p\n", &rt->release_lock);
++
++ rh = get_release_heap(rt, t, 0);
++ if (!rh) {
++ /* need to use our own, but drop lock first */
++ raw_spin_unlock(&rt->release_lock);
++ VTRACE_TASK(t, "Dropped release_lock 0x%p\n",
++ &rt->release_lock);
++
++ reinit_release_heap(t);
++ VTRACE_TASK(t, "release_heap ready\n");
++
++ raw_spin_lock(&rt->release_lock);
++ VTRACE_TASK(t, "Re-acquired release_lock 0x%p\n",
++ &rt->release_lock);
++
++ rh = get_release_heap(rt, t, 1);
++ }
++ bheap_insert(rt->order, &rh->heap, tsk_rt(t)->heap_node);
++ VTRACE_TASK(t, "arm_release_timer(): added to release heap\n");
++
++ raw_spin_unlock(&rt->release_lock);
++ VTRACE_TASK(t, "Returned the release_lock 0x%p\n", &rt->release_lock);
++
++ /* To avoid arming the timer multiple times, we only let the
++ * owner do the arming (which is the "first" task to reference
++ * this release_heap anyway).
++ */
++ if (rh == tsk_rt(t)->rel_heap) {
++ VTRACE_TASK(t, "arming timer 0x%p\n", &rh->timer);
++ /* we cannot arm the timer using hrtimer_start()
++ * as it may deadlock on rq->lock
++ *
++ * PINNED mode is ok on both local and remote CPU
++ */
++#ifdef CONFIG_RELEASE_MASTER
++ if (rt->release_master == NO_CPU &&
++ target_cpu == NO_CPU)
++#endif
++ __hrtimer_start_range_ns(&rh->timer,
++ ns_to_ktime(rh->release_time),
++ 0, HRTIMER_MODE_ABS_PINNED, 0);
++#ifdef CONFIG_RELEASE_MASTER
++ else
++ hrtimer_start_on(
++ /* target_cpu overrides release master */
++ (target_cpu != NO_CPU ?
++ target_cpu : rt->release_master),
++ &rh->info, &rh->timer,
++ ns_to_ktime(rh->release_time),
++ HRTIMER_MODE_ABS_PINNED);
++#endif
++ } else
++ VTRACE_TASK(t, "0x%p is not my timer\n", &rh->timer);
++ }
++}
++
++void rt_domain_init(rt_domain_t *rt,
++ bheap_prio_t order,
++ check_resched_needed_t check,
++ release_jobs_t release
++ )
++{
++ int i;
++
++ BUG_ON(!rt);
++ if (!check)
++ check = dummy_resched;
++ if (!release)
++ release = default_release_jobs;
++ if (!order)
++ order = dummy_order;
++
++#ifdef CONFIG_RELEASE_MASTER
++ rt->release_master = NO_CPU;
++#endif
++
++ bheap_init(&rt->ready_queue);
++ INIT_LIST_HEAD(&rt->tobe_released);
++ for (i = 0; i < RELEASE_QUEUE_SLOTS; i++)
++ INIT_LIST_HEAD(&rt->release_queue.slot[i]);
++
++ raw_spin_lock_init(&rt->ready_lock);
++ raw_spin_lock_init(&rt->release_lock);
++ raw_spin_lock_init(&rt->tobe_lock);
++
++ rt->check_resched = check;
++ rt->release_jobs = release;
++ rt->order = order;
++}
++
++/* add_ready - add a real-time task to the rt ready queue. It must be runnable.
++ * @new: the newly released task
++ */
++void __add_ready(rt_domain_t* rt, struct task_struct *new)
++{
++ TRACE("rt: adding %s/%d (%llu, %llu) rel=%llu to ready queue at %llu\n",
++ new->comm, new->pid, get_exec_cost(new), get_rt_period(new),
++ get_release(new), litmus_clock());
++
++ BUG_ON(bheap_node_in_heap(tsk_rt(new)->heap_node));
++
++ bheap_insert(rt->order, &rt->ready_queue, tsk_rt(new)->heap_node);
++ rt->check_resched(rt);
++}
++
++/* merge_ready - Add a sorted set of tasks to the rt ready queue. They must be runnable.
++ * @tasks - the newly released tasks
++ */
++void __merge_ready(rt_domain_t* rt, struct bheap* tasks)
++{
++ bheap_union(rt->order, &rt->ready_queue, tasks);
++ rt->check_resched(rt);
++}
++
++
++#ifdef CONFIG_RELEASE_MASTER
++void __add_release_on(rt_domain_t* rt, struct task_struct *task,
++ int target_cpu)
++{
++ TRACE_TASK(task, "add_release_on(), rel=%llu, target=%d\n",
++ get_release(task), target_cpu);
++ list_add(&tsk_rt(task)->list, &rt->tobe_released);
++ task->rt_param.domain = rt;
++
++ /* start release timer */
++ TS_SCHED2_START(task);
++
++ arm_release_timer_on(rt, target_cpu);
++
++ TS_SCHED2_END(task);
++}
++#endif
++
++/* add_release - add a real-time task to the rt release queue.
++ * @task: the sleeping task
++ */
++void __add_release(rt_domain_t* rt, struct task_struct *task)
++{
++ TRACE_TASK(task, "add_release(), rel=%llu\n", get_release(task));
++ list_add(&tsk_rt(task)->list, &rt->tobe_released);
++ task->rt_param.domain = rt;
++
++ /* start release timer */
++ TS_SCHED2_START(task);
++
++ arm_release_timer(rt);
++
++ TS_SCHED2_END(task);
++}
++
+diff --git a/litmus/sched_cedf.c b/litmus/sched_cedf.c
+new file mode 100644
+index 0000000..f5b7708
+--- /dev/null
++++ b/litmus/sched_cedf.c
+@@ -0,0 +1,773 @@
++/*
++ * litmus/sched_cedf.c
++ *
++ * Implementation of the C-EDF scheduling algorithm.
++ *
++ * This implementation is based on G-EDF:
++ * - CPUs are clustered around L2 or L3 caches.
++ * - Cluster topology is automatically detected (this is arch-dependent
++ *   and works only on x86 at the moment --- and only with modern
++ *   CPUs that export cpuid4 information).
++ * - The plugin _does not_ attempt to put tasks in the right cluster, i.e.,
++ *   the programmer needs to be aware of the topology to place tasks
++ *   in the desired cluster.
++ * - Default clustering is around the L2 cache (cache index = 2);
++ *   supported clusters are: L1 (private cache: pedf), L2, L3, ALL (all
++ *   online cpus are placed in a single cluster).
++ *
++ * For details on functions, take a look at sched_gsn_edf.c
++ *
++ * Currently, we do not support changes in the number of online cpus.
++ * If the num_online_cpus() dynamically changes, the plugin is broken.
++ *
++ * This version uses the simple approach and serializes all scheduling
++ * decisions by the use of a queue lock. This is probably not the
++ * best way to do it, but it should suffice for now.
++ */
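
For illustration (an assumed topology, not taken from the patch): on an 8-CPU machine where each pair of CPUs shares an L2 cache and each group of four shares an L3 cache, selecting "L2" yields four 2-CPU clusters, "L3" yields two 4-CPU clusters, "L1" degenerates to partitioned EDF (eight single-CPU clusters), and "ALL" places all 8 CPUs in one cluster (i.e., G-EDF-like behavior).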
++
++#include
++#include
++#include
++#include
++
++#include
++#include
++#include
++#include
++#include
++
++#include
++
++#include
++
++/* forward declaration... a funny thing with C ;) */
++struct clusterdomain;
++
++/* cpu_entry_t - maintain the linked and scheduled state
++ *
++ * A cpu also contains a pointer to the cedf_domain_t cluster
++ * that owns it (struct clusterdomain*)
++ */
++typedef struct {
++ int cpu;
++ struct clusterdomain* cluster; /* owning cluster */
++ struct task_struct* linked; /* only RT tasks */
++ struct task_struct* scheduled; /* only RT tasks */
++ atomic_t will_schedule; /* prevent unneeded IPIs */
++ struct bheap_node* hn;
++} cpu_entry_t;
++
++/* one cpu_entry_t per CPU */
++DEFINE_PER_CPU(cpu_entry_t, cedf_cpu_entries);
++
++#define set_will_schedule() \
++ (atomic_set(&__get_cpu_var(cedf_cpu_entries).will_schedule, 1))
++#define clear_will_schedule() \
++ (atomic_set(&__get_cpu_var(cedf_cpu_entries).will_schedule, 0))
++#define test_will_schedule(cpu) \
++ (atomic_read(&per_cpu(cedf_cpu_entries, cpu).will_schedule))
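++
++/* will_schedule is set in cedf_tick() right before a forced reschedule and
++ * cleared at the top of cedf_schedule(); it marks CPUs that are about to
++ * reschedule anyway (cf. the will_schedule field of cpu_entry_t above).
++ */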
++
++/*
++ * In C-EDF there is a cedf domain _per_ cluster.
++ * The number of clusters is determined dynamically according to the
++ * total number of CPUs and the cluster size.
++ */
++typedef struct clusterdomain {
++ /* rt_domain for this cluster */
++ rt_domain_t domain;
++ /* cpus in this cluster */
++ cpu_entry_t* *cpus;
++ /* map of this cluster's cpus */
++ cpumask_var_t cpu_map;
++ /* the cpus queue themselves according to priority in here */
++ struct bheap_node *heap_node;
++ struct bheap cpu_heap;
++ /* lock for this cluster */
++#define lock domain.ready_lock
++} cedf_domain_t;
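++
++/* Note: the #define above makes cluster->lock expand to
++ * cluster->domain.ready_lock, i.e., all locking below goes through the
++ * rt_domain's ready lock. */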
++
++/* a cedf_domain per cluster; allocation is done at init/activation time */
++cedf_domain_t *cedf;
++
++#define remote_cluster(cpu) ((cedf_domain_t *) per_cpu(cedf_cpu_entries, cpu).cluster)
++#define task_cpu_cluster(task) remote_cluster(get_partition(task))
++
++/* Uncomment WANT_ALL_SCHED_EVENTS if you want to see all scheduling
++ * decisions in the TRACE() log; uncomment VERBOSE_INIT for verbose
++ * information during the initialization of the plugin (e.g., topology)
++#define WANT_ALL_SCHED_EVENTS
++ */
++#define VERBOSE_INIT
++
++static int cpu_lower_prio(struct bheap_node *_a, struct bheap_node *_b)
++{
++ cpu_entry_t *a, *b;
++ a = _a->value;
++ b = _b->value;
++ /* Note that a and b are inverted: we want the lowest-priority CPU at
++ * the top of the heap.
++ */
++ return edf_higher_prio(b->linked, a->linked);
++}
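++
++/* Example with hypothetical deadlines: if the tasks linked to cpu0, cpu1 and
++ * cpu2 have deadlines 10, 40 and 25, the inverted comparison above puts cpu1
++ * (the latest deadline, i.e., the lowest priority) at the top, so
++ *
++ *	bheap_peek(cpu_lower_prio, &cluster->cpu_heap)
++ *
++ * in lowest_prio_cpu() below returns the best preemption candidate.
++ */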
++
++/* update_cpu_position - Move the cpu entry to the correct place to maintain
++ * order in the cpu queue. Caller must hold cedf lock.
++ */
++static void update_cpu_position(cpu_entry_t *entry)
++{
++ cedf_domain_t *cluster = entry->cluster;
++
++ if (likely(bheap_node_in_heap(entry->hn)))
++ bheap_delete(cpu_lower_prio,
++ &cluster->cpu_heap,
++ entry->hn);
++
++ bheap_insert(cpu_lower_prio, &cluster->cpu_heap, entry->hn);
++}
++
++/* caller must hold cedf lock */
++static cpu_entry_t* lowest_prio_cpu(cedf_domain_t *cluster)
++{
++ struct bheap_node* hn;
++ hn = bheap_peek(cpu_lower_prio, &cluster->cpu_heap);
++ return hn->value;
++}
++
++
++/* link_task_to_cpu - Update the link of a CPU.
++ * Handles the case where the to-be-linked task is already
++ * scheduled on a different CPU.
++ */
++static noinline void link_task_to_cpu(struct task_struct* linked,
++ cpu_entry_t *entry)
++{
++ cpu_entry_t *sched;
++ struct task_struct* tmp;
++ int on_cpu;
++
++ BUG_ON(linked && !is_realtime(linked));
++
++ /* Currently linked task is set to be unlinked. */
++ if (entry->linked) {
++ entry->linked->rt_param.linked_on = NO_CPU;
++ }
++
++ /* Link new task to CPU. */
++ if (linked) {
++ set_rt_flags(linked, RT_F_RUNNING);
++ /* handle the case that the task is already scheduled somewhere! */
++ on_cpu = linked->rt_param.scheduled_on;
++ if (on_cpu != NO_CPU) {
++ sched = &per_cpu(cedf_cpu_entries, on_cpu);
++ /* this should only happen if not linked already */
++ BUG_ON(sched->linked == linked);
++
++ /* If we are already scheduled on the CPU to which we
++ * wanted to link, we don't need to do the swap --
++ * we just link ourselves to the CPU and depend on
++ * the caller to get things right.
++ */
++ if (entry != sched) {
++ TRACE_TASK(linked,
++ "already scheduled on %d, updating link.\n",
++ sched->cpu);
++ tmp = sched->linked;
++ linked->rt_param.linked_on = sched->cpu;
++ sched->linked = linked;
++ update_cpu_position(sched);
++ linked = tmp;
++ }
++ }
++ if (linked) /* might be NULL due to swap */
++ linked->rt_param.linked_on = entry->cpu;
++ }
++ entry->linked = linked;
++#ifdef WANT_ALL_SCHED_EVENTS
++ if (linked)
++ TRACE_TASK(linked, "linked to %d.\n", entry->cpu);
++ else
++ TRACE("NULL linked to %d.\n", entry->cpu);
++#endif
++ update_cpu_position(entry);
++}
++
++/* unlink - Make sure a task is not linked any longer to an entry
++ * where it was linked before. Must hold cedf_lock.
++ */
++static noinline void unlink(struct task_struct* t)
++{
++ cpu_entry_t *entry;
++
++ if (unlikely(!t)) {
++ TRACE_BUG_ON(!t);
++ return;
++ }
++
++
++ if (t->rt_param.linked_on != NO_CPU) {
++ /* unlink */
++ entry = &per_cpu(cedf_cpu_entries, t->rt_param.linked_on);
++ t->rt_param.linked_on = NO_CPU;
++ link_task_to_cpu(NULL, entry);
++ } else if (is_queued(t)) {
++ /* This is an interesting situation: t is scheduled,
++ * but was just recently unlinked. It cannot be
++ * linked anywhere else (because then it would have
++ * been relinked to this CPU), thus it must be in some
++ * queue. We must remove it from the list in this
++ * case.
++ *
++ * In the C-EDF case it should be somewhere in the queue of
++ * its cluster's domain; we can therefore obtain the domain
++ * via task_cpu_cluster().
++ */
++ remove(&(task_cpu_cluster(t))->domain, t);
++ }
++}
++
++
++/* preempt - force a CPU to reschedule
++ */
++static void preempt(cpu_entry_t *entry)
++{
++ preempt_if_preemptable(entry->scheduled, entry->cpu);
++}
++
++/* requeue - Put an unlinked task into its cluster's C-EDF domain.
++ * Caller must hold cedf_lock.
++ */
++static noinline void requeue(struct task_struct* task)
++{
++ cedf_domain_t *cluster = task_cpu_cluster(task);
++ BUG_ON(!task);
++ /* sanity check before insertion */
++ BUG_ON(is_queued(task));
++
++ if (is_released(task, litmus_clock()))
++ __add_ready(&cluster->domain, task);
++ else {
++ /* it has got to wait */
++ add_release(&cluster->domain, task);
++ }
++}
++
++/* check for any necessary preemptions */
++static void check_for_preemptions(cedf_domain_t *cluster)
++{
++ struct task_struct *task;
++ cpu_entry_t* last;
++
++ for(last = lowest_prio_cpu(cluster);
++ edf_preemption_needed(&cluster->domain, last->linked);
++ last = lowest_prio_cpu(cluster)) {
++ /* preemption necessary */
++ task = __take_ready(&cluster->domain);
++ TRACE("check_for_preemptions: attempting to link task %d to %d\n",
++ task->pid, last->cpu);
++ if (last->linked)
++ requeue(last->linked);
++ link_task_to_cpu(task, last);
++ preempt(last);
++ }
++}
++
++/* cedf_job_arrival: task is either resumed or released */
++static noinline void cedf_job_arrival(struct task_struct* task)
++{
++ cedf_domain_t *cluster = task_cpu_cluster(task);
++ BUG_ON(!task);
++
++ requeue(task);
++ check_for_preemptions(cluster);
++}
++
++static void cedf_release_jobs(rt_domain_t* rt, struct bheap* tasks)
++{
++ cedf_domain_t* cluster = container_of(rt, cedf_domain_t, domain);
++ unsigned long flags;
++
++ raw_spin_lock_irqsave(&cluster->lock, flags);
++
++ __merge_ready(&cluster->domain, tasks);
++ check_for_preemptions(cluster);
++
++ raw_spin_unlock_irqrestore(&cluster->lock, flags);
++}
++
++/* caller holds cedf_lock */
++static noinline void job_completion(struct task_struct *t, int forced)
++{
++ BUG_ON(!t);
++
++ sched_trace_task_completion(t, forced);
++
++ TRACE_TASK(t, "job_completion().\n");
++
++ /* set flags */
++ set_rt_flags(t, RT_F_SLEEP);
++ /* prepare for next period */
++ prepare_for_next_period(t);
++ if (is_released(t, litmus_clock()))
++ sched_trace_task_release(t);
++ /* unlink */
++ unlink(t);
++ /* requeue
++ * But don't requeue a blocking task. */
++ if (is_running(t))
++ cedf_job_arrival(t);
++}
++
++/* cedf_tick - this function is called for every local timer
++ * interrupt.
++ *
++ * Checks whether the currently scheduled task has exhausted its budget
++ * and, if so, either forces a reschedule or, if the task is in a
++ * non-preemptive section, requests that it call sys_exit_np().
++ */
++static void cedf_tick(struct task_struct* t)
++{
++ if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) {
++ if (!is_np(t)) {
++ /* np tasks will be preempted when they become
++ * preemptable again
++ */
++ set_tsk_need_resched(t);
++ set_will_schedule();
++ TRACE("cedf_scheduler_tick: "
++ "%d is preemptable "
++ " => FORCE_RESCHED\n", t->pid);
++ } else if (is_user_np(t)) {
++ TRACE("cedf_scheduler_tick: "
++ "%d is non-preemptable, "
++ "preemption delayed.\n", t->pid);
++ request_exit_np(t);
++ }
++ }
++}
++
++/* Getting schedule() right is a bit tricky. schedule() may not make any
++ * assumptions on the state of the current task since it may be called for a
++ * number of reasons. The reasons include that scheduler_tick() determined a
++ * reschedule was necessary, that sys_exit_np() was called, that some Linux
++ * subsystem determined so, or even (in the worst case) that there is a bug
++ * hidden somewhere. Thus, we must take extreme care to determine what the
++ * current state is.
++ *
++ * The CPU could currently be scheduling a task (or not), be linked (or not).
++ *
++ * The following assertions for the scheduled task could hold:
++ *
++ * - !is_running(scheduled) // the job blocks
++ * - scheduled->timeslice == 0 // the job completed (forcefully)
++ * - get_rt_flag() == RT_F_SLEEP // the job completed (by syscall)
++ * - linked != scheduled // we need to reschedule (for any reason)
++ * - is_np(scheduled) // rescheduling must be delayed,
++ * sys_exit_np must be requested
++ *
++ * Any of these can occur together.
++ */
++static struct task_struct* cedf_schedule(struct task_struct * prev)
++{
++ cpu_entry_t* entry = &__get_cpu_var(cedf_cpu_entries);
++ cedf_domain_t *cluster = entry->cluster;
++ int out_of_time, sleep, preempt, np, exists, blocks;
++ struct task_struct* next = NULL;
++
++ raw_spin_lock(&cluster->lock);
++ clear_will_schedule();
++
++ /* sanity checking */
++ BUG_ON(entry->scheduled && entry->scheduled != prev);
++ BUG_ON(entry->scheduled && !is_realtime(prev));
++ BUG_ON(is_realtime(prev) && !entry->scheduled);
++
++ /* (0) Determine state */
++ exists = entry->scheduled != NULL;
++ blocks = exists && !is_running(entry->scheduled);
++ out_of_time = exists &&
++ budget_enforced(entry->scheduled) &&
++ budget_exhausted(entry->scheduled);
++ np = exists && is_np(entry->scheduled);
++ sleep = exists && get_rt_flags(entry->scheduled) == RT_F_SLEEP;
++ preempt = entry->scheduled != entry->linked;
++
++#ifdef WANT_ALL_SCHED_EVENTS
++ TRACE_TASK(prev, "invoked cedf_schedule.\n");
++#endif
++
++ if (exists)
++ TRACE_TASK(prev,
++ "blocks:%d out_of_time:%d np:%d sleep:%d preempt:%d "
++ "state:%d sig:%d\n",
++ blocks, out_of_time, np, sleep, preempt,
++ prev->state, signal_pending(prev));
++ if (entry->linked && preempt)
++ TRACE_TASK(prev, "will be preempted by %s/%d\n",
++ entry->linked->comm, entry->linked->pid);
++
++
++ /* If a task blocks we have no choice but to reschedule.
++ */
++ if (blocks)
++ unlink(entry->scheduled);
++
++ /* Request a sys_exit_np() call if we would like to preempt but cannot.
++ * We need to make sure to update the link structure anyway in case
++ * that we are still linked. Multiple calls to request_exit_np() don't
++ * hurt.
++ */
++ if (np && (out_of_time || preempt || sleep)) {
++ unlink(entry->scheduled);
++ request_exit_np(entry->scheduled);
++ }
++
++ /* Any task that is preemptable and either exhausts its execution
++ * budget or wants to sleep completes. We may have to reschedule after
++ * this. Don't do a job completion if we block (can't have timers running
++ * for blocked jobs). Preemptions go first for the same reason.
++ */
++ if (!np && (out_of_time || sleep) && !blocks && !preempt)
++ job_completion(entry->scheduled, !sleep);
++
++ /* Link pending task if we became unlinked.
++ */
++ if (!entry->linked)
++ link_task_to_cpu(__take_ready(&cluster->domain), entry);
++
++ /* The final scheduling decision. Do we need to switch for some reason?
++ * If linked is different from scheduled, then select linked as next.
++ */
++ if ((!np || blocks) &&
++ entry->linked != entry->scheduled) {
++ /* Schedule a linked job? */
++ if (entry->linked) {
++ entry->linked->rt_param.scheduled_on = entry->cpu;
++ next = entry->linked;
++ }
++ if (entry->scheduled) {
++ /* not gonna be scheduled soon */
++ entry->scheduled->rt_param.scheduled_on = NO_CPU;
++ TRACE_TASK(entry->scheduled, "scheduled_on = NO_CPU\n");
++ }
++ } else
++ /* Only override Linux scheduler if we have a real-time task
++ * scheduled that needs to continue.
++ */
++ if (exists)
++ next = prev;
++
++ raw_spin_unlock(&cluster->lock);
++
++#ifdef WANT_ALL_SCHED_EVENTS
++ TRACE("cedf_lock released, next=0x%p\n", next);
++
++ if (next)
++ TRACE_TASK(next, "scheduled at %llu\n", litmus_clock());
++ else if (exists && !next)
++ TRACE("becomes idle at %llu.\n", litmus_clock());
++#endif
++
++
++ return next;
++}
++
++
++/* _finish_switch - we just finished the switch away from prev
++ */
++static void cedf_finish_switch(struct task_struct *prev)
++{
++ cpu_entry_t* entry = &__get_cpu_var(cedf_cpu_entries);
++
++ entry->scheduled = is_realtime(current) ? current : NULL;
++#ifdef WANT_ALL_SCHED_EVENTS
++ TRACE_TASK(prev, "switched away from\n");
++#endif
++}
++
++
++/* Prepare a task for running in RT mode
++ */
++static void cedf_task_new(struct task_struct * t, int on_rq, int running)
++{
++ unsigned long flags;
++ cpu_entry_t* entry;
++ cedf_domain_t* cluster;
++
++ TRACE("gsn edf: task new %d\n", t->pid);
++
++ /* the cluster doesn't change even if t is running */
++ cluster = task_cpu_cluster(t);
++
++ raw_spin_lock_irqsave(&cluster->domain.ready_lock, flags);
++
++ /* setup job params */
++ release_at(t, litmus_clock());
++
++ if (running) {
++ entry = &per_cpu(cedf_cpu_entries, task_cpu(t));
++ BUG_ON(entry->scheduled);
++
++ entry->scheduled = t;
++ tsk_rt(t)->scheduled_on = task_cpu(t);
++ } else {
++ t->rt_param.scheduled_on = NO_CPU;
++ }
++ t->rt_param.linked_on = NO_CPU;
++
++ cedf_job_arrival(t);
++ raw_spin_unlock_irqrestore(&(cluster->domain.ready_lock), flags);
++}
++
++static void cedf_task_wake_up(struct task_struct *task)
++{
++ unsigned long flags;
++ lt_t now;
++ cedf_domain_t *cluster;
++
++ TRACE_TASK(task, "wake_up at %llu\n", litmus_clock());
++
++ cluster = task_cpu_cluster(task);
++
++ raw_spin_lock_irqsave(&cluster->lock, flags);
++ /* We need to take suspensions because of semaphores into
++ * account! If a job resumes after being suspended due to acquiring
++ * a semaphore, it should never be treated as a new job release.
++ */
++ if (get_rt_flags(task) == RT_F_EXIT_SEM) {
++ set_rt_flags(task, RT_F_RUNNING);
++ } else {
++ now = litmus_clock();
++ if (is_tardy(task, now)) {
++ /* new sporadic release */
++ release_at(task, now);
++ sched_trace_task_release(task);
++ }
++ else {
++ if (task->rt.time_slice) {
++ /* came back in time before deadline
++ */
++ set_rt_flags(task, RT_F_RUNNING);
++ }
++ }
++ }
++ cedf_job_arrival(task);
++ raw_spin_unlock_irqrestore(&cluster->lock, flags);
++}
++
++static void cedf_task_block(struct task_struct *t)
++{
++ unsigned long flags;
++ cedf_domain_t *cluster;
++
++ TRACE_TASK(t, "block at %llu\n", litmus_clock());
++
++ cluster = task_cpu_cluster(t);
++
++ /* unlink if necessary */
++ raw_spin_lock_irqsave(&cluster->lock, flags);
++ unlink(t);
++ raw_spin_unlock_irqrestore(&cluster->lock, flags);
++
++ BUG_ON(!is_realtime(t));
++}
++
++
++static void cedf_task_exit(struct task_struct * t)
++{
++ unsigned long flags;
++ cedf_domain_t *cluster = task_cpu_cluster(t);
++
++ /* unlink if necessary */
++ raw_spin_lock_irqsave(&cluster->lock, flags);
++ unlink(t);
++ if (tsk_rt(t)->scheduled_on != NO_CPU) {
++ cluster->cpus[tsk_rt(t)->scheduled_on]->scheduled = NULL;
++ tsk_rt(t)->scheduled_on = NO_CPU;
++ }
++ raw_spin_unlock_irqrestore(&cluster->lock, flags);
++
++ BUG_ON(!is_realtime(t));
++ TRACE_TASK(t, "RIP\n");
++}
++
++static long cedf_admit_task(struct task_struct* tsk)
++{
++ return task_cpu(tsk) == tsk->rt_param.task_params.cpu ? 0 : -EINVAL;
++}
++
++/* total number of clusters */
++static int num_clusters;
++/* we do not support clusters of different sizes */
++static unsigned int cluster_size;
++
++#ifdef VERBOSE_INIT
++static void print_cluster_topology(cpumask_var_t mask, int cpu)
++{
++ int chk;
++ char buf[255];
++
++ chk = cpulist_scnprintf(buf, 254, mask);
++ buf[chk] = '\0';
++ printk(KERN_INFO "CPU = %d, shared cpu(s) = %s\n", cpu, buf);
++
++}
++#endif
++
++static int clusters_allocated = 0;
++
++static void cleanup_cedf(void)
++{
++ int i;
++
++ if (clusters_allocated) {
++ for (i = 0; i < num_clusters; i++) {
++ kfree(cedf[i].cpus);
++ kfree(cedf[i].heap_node);
++ free_cpumask_var(cedf[i].cpu_map);
++ }
++
++ kfree(cedf);
++ }
++}
++
++static long cedf_activate_plugin(void)
++{
++ int i, j, cpu, ccpu, cpu_count;
++ cpu_entry_t *entry;
++
++ cpumask_var_t mask;
++ int chk = 0;
++
++ /* de-allocate old clusters, if any */
++ cleanup_cedf();
++
++ printk(KERN_INFO "C-EDF: Activate Plugin, cache index = %d\n",
++ cluster_cache_index);
++
++ /* need to get cluster_size first */
++ if(!zalloc_cpumask_var(&mask, GFP_ATOMIC))
++ return -ENOMEM;
++
++ if (unlikely(cluster_cache_index == num_online_cpus())) {
++
++ cluster_size = num_online_cpus();
++ } else {
++
++ chk = get_shared_cpu_map(mask, 0, cluster_cache_index);
++ if (chk) {
++ /* if chk != 0 then it is the max allowed index */
++ printk(KERN_INFO "C-EDF: Cannot support cache index = %d\n",
++ cluster_cache_index);
++ printk(KERN_INFO "C-EDF: Using cache index = %d\n",
++ chk);
++ cluster_cache_index = chk;
++ }
++
++ cluster_size = cpumask_weight(mask);
++ }
++
++ if ((num_online_cpus() % cluster_size) != 0) {
++ /* this can't be right, some cpus are left out */
++ printk(KERN_ERR "C-EDF: Trying to group %d cpus in %d!\n",
++ num_online_cpus(), cluster_size);
++ return -1;
++ }
++
++ num_clusters = num_online_cpus() / cluster_size;
++ printk(KERN_INFO "C-EDF: %d cluster(s) of size = %d\n",
++ num_clusters, cluster_size);
++
++ /* initialize clusters */
++ cedf = kmalloc(num_clusters * sizeof(cedf_domain_t), GFP_ATOMIC);
++ for (i = 0; i < num_clusters; i++) {
++
++ cedf[i].cpus = kmalloc(cluster_size * sizeof(cpu_entry_t),
++ GFP_ATOMIC);
++ cedf[i].heap_node = kmalloc(
++ cluster_size * sizeof(struct bheap_node),
++ GFP_ATOMIC);
++ bheap_init(&(cedf[i].cpu_heap));
++ edf_domain_init(&(cedf[i].domain), NULL, cedf_release_jobs);
++
++ if(!zalloc_cpumask_var(&cedf[i].cpu_map, GFP_ATOMIC))
++ return -ENOMEM;
++ }
++
++ /* cycle through clusters and add cpus to them */
++ for (i = 0; i < num_clusters; i++) {
++
++ for_each_online_cpu(cpu) {
++ /* check if the cpu is already in a cluster */
++ for (j = 0; j < num_clusters; j++)
++ if (cpumask_test_cpu(cpu, cedf[j].cpu_map))
++ break;
++ /* if it is in a cluster go to next cpu */
++ if (cpumask_test_cpu(cpu, cedf[j].cpu_map))
++ continue;
++
++ /* this cpu isn't in any cluster */
++ /* get the shared cpus */
++ if (unlikely(cluster_cache_index == num_online_cpus()))
++ cpumask_copy(mask, cpu_online_mask);
++ else
++ get_shared_cpu_map(mask, cpu, cluster_cache_index);
++
++ cpumask_copy(cedf[i].cpu_map, mask);
++#ifdef VERBOSE_INIT
++ print_cluster_topology(mask, cpu);
++#endif
++ /* add cpus to current cluster and init cpu_entry_t */
++ cpu_count = 0;
++ for_each_cpu(ccpu, cedf[i].cpu_map) {
++
++ entry = &per_cpu(cedf_cpu_entries, ccpu);
++ cedf[i].cpus[cpu_count] = entry;
++ atomic_set(&entry->will_schedule, 0);
++ entry->cpu = ccpu;
++ entry->cluster = &cedf[i];
++ entry->hn = &(cedf[i].heap_node[cpu_count]);
++ bheap_node_init(&entry->hn, entry);
++
++ cpu_count++;
++
++ entry->linked = NULL;
++ entry->scheduled = NULL;
++ update_cpu_position(entry);
++ }
++ /* done with this cluster */
++ break;
++ }
++ }
++
++ free_cpumask_var(mask);
++ clusters_allocated = 1;
++ return 0;
++}
++
++/* Plugin object */
++static struct sched_plugin cedf_plugin __cacheline_aligned_in_smp = {
++ .plugin_name = "C-EDF",
++ .finish_switch = cedf_finish_switch,
++ .tick = cedf_tick,
++ .task_new = cedf_task_new,
++ .complete_job = complete_job,
++ .task_exit = cedf_task_exit,
++ .schedule = cedf_schedule,
++ .task_wake_up = cedf_task_wake_up,
++ .task_block = cedf_task_block,
++ .admit_task = cedf_admit_task,
++ .activate_plugin = cedf_activate_plugin,
++};
++
++
++static int __init init_cedf(void)
++{
++ return register_sched_plugin(&cedf_plugin);
++}
++
++static void clean_cedf(void)
++{
++ cleanup_cedf();
++}
++
++module_init(init_cedf);
++module_exit(clean_cedf);
+diff --git a/litmus/sched_gsn_edf.c b/litmus/sched_gsn_edf.c
+new file mode 100644
+index 0000000..e101768
+--- /dev/null
++++ b/litmus/sched_gsn_edf.c
+@@ -0,0 +1,842 @@
++/*
++ * litmus/sched_gsn_edf.c
++ *
++ * Implementation of the GSN-EDF scheduling algorithm.
++ *
++ * This version uses the simple approach and serializes all scheduling
++ * decisions by the use of a queue lock. This is probably not the
++ * best way to do it, but it should suffice for now.
++ */
++
++#include <linux/spinlock.h>
++#include <linux/percpu.h>
++#include <linux/sched.h>
++
++#include <litmus/litmus.h>
++#include <litmus/jobs.h>
++#include <litmus/sched_plugin.h>
++#include <litmus/edf_common.h>
++#include <litmus/sched_trace.h>
++
++#include <litmus/bheap.h>
++
++#include <linux/module.h>
++
++/* Overview of GSN-EDF operations.
++ *
++ * For a detailed explanation of GSN-EDF have a look at the FMLP paper. This
++ * description only covers how the individual operations are implemented in
++ * LITMUS.
++ *
++ * link_task_to_cpu(T, cpu) - Low-level operation to update the linkage
++ * structure (NOT the actually scheduled
++ * task). If there is another linked task To
++ * already it will set To->linked_on = NO_CPU
++ * (thereby removing its association with this
++ * CPU). However, it will not requeue the
++ * previously linked task (if any). It will set
++ * T's state to RT_F_RUNNING and check whether
++ * it is already running somewhere else. If T
++ * is scheduled somewhere else it will link
++ * it to that CPU instead (and pull that CPU's previously
++ * linked task over to this CPU). T may be NULL.
++ *
++ * unlink(T) - Unlink removes T from all scheduler data
++ * structures. If it is linked to some CPU it
++ * will link NULL to that CPU. If it is
++ * currently queued in the gsnedf queue it will
++ * be removed from the rt_domain. It is safe to
++ * call unlink(T) if T is not linked. T may not
++ * be NULL.
++ *
++ * requeue(T) - Requeue will insert T into the appropriate
++ * queue. If the system is in real-time mode and
++ * T is released already, it will go into the
++ * ready queue. If the system is not in
++ * real-time mode, then T will go into the
++ * release queue. If T's release time is in the
++ * future, it will go into the release
++ * queue. That means that T's release time/job
++ * no/etc. has to be updated before requeue(T) is
++ * called. It is not safe to call requeue(T)
++ * when T is already queued. T may not be NULL.
++ *
++ * gsnedf_job_arrival(T) - This is the catch all function when T enters
++ * the system after either a suspension or at a
++ * job release. It will queue T (which means it
++ * is not safe to call gsnedf_job_arrival(T) if
++ * T is already queued) and then check whether a
++ * preemption is necessary. If a preemption is
++ * necessary it will update the linkage
++ * accordingly and cause scheduled to be called
++ * (either with an IPI or need_resched). It is
++ * safe to call gsnedf_job_arrival(T) if T's
++ * next job has not been actually released yet
++ * (release time in the future). T will be put
++ * on the release queue in that case.
++ *
++ * job_completion(T) - Take care of everything that needs to be done
++ * to prepare T for its next release and place
++ * it in the right queue with
++ * gsnedf_job_arrival().
++ *
++ *
++ * When we know that T is linked to a CPU, then link_task_to_cpu(NULL, CPU) is
++ * equivalent to unlink(T). Note that if you unlink a task from a CPU none of
++ * the functions will automatically propagate pending tasks from the ready queue
++ * to a linked task. This is the job of the calling function (by means of
++ * __take_ready).
++ */
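++
++/* Sketch of the typical call chain for a job release (for orientation only;
++ * the functions below are authoritative):
++ *
++ *	gsnedf_release_jobs(rt, tasks)
++ *	  -> __merge_ready(rt, tasks)
++ *	  -> check_for_preemptions()
++ *	       -> last = lowest_prio_cpu()
++ *	       -> task = __take_ready(&gsnedf)
++ *	       -> link_task_to_cpu(task, last)
++ *	       -> preempt(last)		// forces a reschedule / IPI if needed
++ */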
++
++
++/* cpu_entry_t - maintain the linked and scheduled state
++ */
++typedef struct {
++ int cpu;
++ struct task_struct* linked; /* only RT tasks */
++ struct task_struct* scheduled; /* only RT tasks */
++ atomic_t will_schedule; /* prevent unneeded IPIs */
++ struct bheap_node* hn;
++} cpu_entry_t;
++DEFINE_PER_CPU(cpu_entry_t, gsnedf_cpu_entries);
++
++cpu_entry_t* gsnedf_cpus[NR_CPUS];
++
++#define set_will_schedule() \
++ (atomic_set(&__get_cpu_var(gsnedf_cpu_entries).will_schedule, 1))
++#define clear_will_schedule() \
++ (atomic_set(&__get_cpu_var(gsnedf_cpu_entries).will_schedule, 0))
++#define test_will_schedule(cpu) \
++ (atomic_read(&per_cpu(gsnedf_cpu_entries, cpu).will_schedule))
++
++
++/* the cpus queue themselves according to priority in here */
++static struct bheap_node gsnedf_heap_node[NR_CPUS];
++static struct bheap gsnedf_cpu_heap;
++
++static rt_domain_t gsnedf;
++#define gsnedf_lock (gsnedf.ready_lock)
++
++
++/* Uncomment this if you want to see all scheduling decisions in the
++ * TRACE() log.
++#define WANT_ALL_SCHED_EVENTS
++ */
++
++static int cpu_lower_prio(struct bheap_node *_a, struct bheap_node *_b)
++{
++ cpu_entry_t *a, *b;
++ a = _a->value;
++ b = _b->value;
++ /* Note that a and b are inverted: we want the lowest-priority CPU at
++ * the top of the heap.
++ */
++ return edf_higher_prio(b->linked, a->linked);
++}
++
++/* update_cpu_position - Move the cpu entry to the correct place to maintain
++ * order in the cpu queue. Caller must hold gsnedf lock.
++ */
++static void update_cpu_position(cpu_entry_t *entry)
++{
++ if (likely(bheap_node_in_heap(entry->hn)))
++ bheap_delete(cpu_lower_prio, &gsnedf_cpu_heap, entry->hn);
++ bheap_insert(cpu_lower_prio, &gsnedf_cpu_heap, entry->hn);
++}
++
++/* caller must hold gsnedf lock */
++static cpu_entry_t* lowest_prio_cpu(void)
++{
++ struct bheap_node* hn;
++ hn = bheap_peek(cpu_lower_prio, &gsnedf_cpu_heap);
++ return hn->value;
++}
++
++
++/* link_task_to_cpu - Update the link of a CPU.
++ * Handles the case where the to-be-linked task is already
++ * scheduled on a different CPU.
++ */
++static noinline void link_task_to_cpu(struct task_struct* linked,
++ cpu_entry_t *entry)
++{
++ cpu_entry_t *sched;
++ struct task_struct* tmp;
++ int on_cpu;
++
++ BUG_ON(linked && !is_realtime(linked));
++
++ /* Currently linked task is set to be unlinked. */
++ if (entry->linked) {
++ entry->linked->rt_param.linked_on = NO_CPU;
++ }
++
++ /* Link new task to CPU. */
++ if (linked) {
++ set_rt_flags(linked, RT_F_RUNNING);
++ /* handle the case that the task is already scheduled somewhere! */
++ on_cpu = linked->rt_param.scheduled_on;
++ if (on_cpu != NO_CPU) {
++ sched = &per_cpu(gsnedf_cpu_entries, on_cpu);
++ /* this should only happen if not linked already */
++ BUG_ON(sched->linked == linked);
++
++ /* If we are already scheduled on the CPU to which we
++ * wanted to link, we don't need to do the swap --
++ * we just link ourselves to the CPU and depend on
++ * the caller to get things right.
++ */
++ if (entry != sched) {
++ TRACE_TASK(linked,
++ "already scheduled on %d, updating link.\n",
++ sched->cpu);
++ tmp = sched->linked;
++ linked->rt_param.linked_on = sched->cpu;
++ sched->linked = linked;
++ update_cpu_position(sched);
++ linked = tmp;
++ }
++ }
++ if (linked) /* might be NULL due to swap */
++ linked->rt_param.linked_on = entry->cpu;
++ }
++ entry->linked = linked;
++#ifdef WANT_ALL_SCHED_EVENTS
++ if (linked)
++ TRACE_TASK(linked, "linked to %d.\n", entry->cpu);
++ else
++ TRACE("NULL linked to %d.\n", entry->cpu);
++#endif
++ update_cpu_position(entry);
++}
++
++/* unlink - Make sure a task is not linked any longer to an entry
++ * where it was linked before. Must hold gsnedf_lock.
++ */
++static noinline void unlink(struct task_struct* t)
++{
++ cpu_entry_t *entry;
++
++ if (unlikely(!t)) {
++ TRACE_BUG_ON(!t);
++ return;
++ }
++
++ if (t->rt_param.linked_on != NO_CPU) {
++ /* unlink */
++ entry = &per_cpu(gsnedf_cpu_entries, t->rt_param.linked_on);
++ t->rt_param.linked_on = NO_CPU;
++ link_task_to_cpu(NULL, entry);
++ } else if (is_queued(t)) {
++ /* This is an interesting situation: t is scheduled,
++ * but was just recently unlinked. It cannot be
++ * linked anywhere else (because then it would have
++ * been relinked to this CPU), thus it must be in some
++ * queue. We must remove it from the list in this
++ * case.
++ */
++ remove(&gsnedf, t);
++ }
++}
++
++
++/* preempt - force a CPU to reschedule
++ */
++static void preempt(cpu_entry_t *entry)
++{
++ preempt_if_preemptable(entry->scheduled, entry->cpu);
++}
++
++/* requeue - Put an unlinked task into gsn-edf domain.
++ * Caller must hold gsnedf_lock.
++ */
++static noinline void requeue(struct task_struct* task)
++{
++ BUG_ON(!task);
++ /* sanity check before insertion */
++ BUG_ON(is_queued(task));
++
++ if (is_released(task, litmus_clock()))
++ __add_ready(&gsnedf, task);
++ else {
++ /* it has got to wait */
++ add_release(&gsnedf, task);
++ }
++}
++
++/* check for any necessary preemptions */
++static void check_for_preemptions(void)
++{
++ struct task_struct *task;
++ cpu_entry_t* last;
++
++ for(last = lowest_prio_cpu();
++ edf_preemption_needed(&gsnedf, last->linked);
++ last = lowest_prio_cpu()) {
++ /* preemption necessary */
++ task = __take_ready(&gsnedf);
++ TRACE("check_for_preemptions: attempting to link task %d to %d\n",
++ task->pid, last->cpu);
++ if (last->linked)
++ requeue(last->linked);
++ link_task_to_cpu(task, last);
++ preempt(last);
++ }
++}
++
++/* gsnedf_job_arrival: task is either resumed or released */
++static noinline void gsnedf_job_arrival(struct task_struct* task)
++{
++ BUG_ON(!task);
++
++ requeue(task);
++ check_for_preemptions();
++}
++
++static void gsnedf_release_jobs(rt_domain_t* rt, struct bheap* tasks)
++{
++ unsigned long flags;
++
++ raw_spin_lock_irqsave(&gsnedf_lock, flags);
++
++ __merge_ready(rt, tasks);
++ check_for_preemptions();
++
++ raw_spin_unlock_irqrestore(&gsnedf_lock, flags);
++}
++
++/* caller holds gsnedf_lock */
++static noinline void job_completion(struct task_struct *t, int forced)
++{
++ BUG_ON(!t);
++
++ sched_trace_task_completion(t, forced);
++
++ TRACE_TASK(t, "job_completion().\n");
++
++ /* set flags */
++ set_rt_flags(t, RT_F_SLEEP);
++ /* prepare for next period */
++ prepare_for_next_period(t);
++ if (is_released(t, litmus_clock()))
++ sched_trace_task_release(t);
++ /* unlink */
++ unlink(t);
++ /* requeue
++ * But don't requeue a blocking task. */
++ if (is_running(t))
++ gsnedf_job_arrival(t);
++}
++
++/* gsnedf_tick - this function is called for every local timer
++ * interrupt.
++ *
++ * Checks whether the currently scheduled task has exhausted its budget
++ * and, if so, either forces a reschedule or, if the task is in a
++ * non-preemptive section, requests that it call sys_exit_np().
++ */
++static void gsnedf_tick(struct task_struct* t)
++{
++ if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) {
++ if (!is_np(t)) {
++ /* np tasks will be preempted when they become
++ * preemptable again
++ */
++ set_tsk_need_resched(t);
++ set_will_schedule();
++ TRACE("gsnedf_scheduler_tick: "
++ "%d is preemptable "
++ " => FORCE_RESCHED\n", t->pid);
++ } else if (is_user_np(t)) {
++ TRACE("gsnedf_scheduler_tick: "
++ "%d is non-preemptable, "
++ "preemption delayed.\n", t->pid);
++ request_exit_np(t);
++ }
++ }
++}
++
++/* Getting schedule() right is a bit tricky. schedule() may not make any
++ * assumptions on the state of the current task since it may be called for a
++ * number of reasons. The reasons include that scheduler_tick() determined a
++ * reschedule was necessary, that sys_exit_np() was called, that some Linux
++ * subsystem determined so, or even (in the worst case) that there is a bug
++ * hidden somewhere. Thus, we must take extreme care to determine what the
++ * current state is.
++ *
++ * The CPU could currently be scheduling a task (or not), be linked (or not).
++ *
++ * The following assertions for the scheduled task could hold:
++ *
++ * - !is_running(scheduled) // the job blocks
++ * - scheduled->timeslice == 0 // the job completed (forcefully)
++ * - get_rt_flag() == RT_F_SLEEP // the job completed (by syscall)
++ * - linked != scheduled // we need to reschedule (for any reason)
++ * - is_np(scheduled) // rescheduling must be delayed,
++ * sys_exit_np must be requested
++ *
++ * Any of these can occur together.
++ */
++static struct task_struct* gsnedf_schedule(struct task_struct * prev)
++{
++ cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries);
++ int out_of_time, sleep, preempt, np, exists, blocks;
++ struct task_struct* next = NULL;
++
++#ifdef CONFIG_RELEASE_MASTER
++ /* Bail out early if we are the release master.
++ * The release master never schedules any real-time tasks.
++ */
++ if (gsnedf.release_master == entry->cpu)
++ return NULL;
++#endif
++
++ raw_spin_lock(&gsnedf_lock);
++ clear_will_schedule();
++
++ /* sanity checking */
++ BUG_ON(entry->scheduled && entry->scheduled != prev);
++ BUG_ON(entry->scheduled && !is_realtime(prev));
++ BUG_ON(is_realtime(prev) && !entry->scheduled);
++
++ /* (0) Determine state */
++ exists = entry->scheduled != NULL;
++ blocks = exists && !is_running(entry->scheduled);
++ out_of_time = exists &&
++ budget_enforced(entry->scheduled) &&
++ budget_exhausted(entry->scheduled);
++ np = exists && is_np(entry->scheduled);
++ sleep = exists && get_rt_flags(entry->scheduled) == RT_F_SLEEP;
++ preempt = entry->scheduled != entry->linked;
++
++#ifdef WANT_ALL_SCHED_EVENTS
++ TRACE_TASK(prev, "invoked gsnedf_schedule.\n");
++#endif
++
++ if (exists)
++ TRACE_TASK(prev,
++ "blocks:%d out_of_time:%d np:%d sleep:%d preempt:%d "
++ "state:%d sig:%d\n",
++ blocks, out_of_time, np, sleep, preempt,
++ prev->state, signal_pending(prev));
++ if (entry->linked && preempt)
++ TRACE_TASK(prev, "will be preempted by %s/%d\n",
++ entry->linked->comm, entry->linked->pid);
++
++
++ /* If a task blocks we have no choice but to reschedule.
++ */
++ if (blocks)
++ unlink(entry->scheduled);
++
++ /* Request a sys_exit_np() call if we would like to preempt but cannot.
++ * We need to make sure to update the link structure anyway in case
++ * that we are still linked. Multiple calls to request_exit_np() don't
++ * hurt.
++ */
++ if (np && (out_of_time || preempt || sleep)) {
++ unlink(entry->scheduled);
++ request_exit_np(entry->scheduled);
++ }
++
++ /* Any task that is preemptable and either exhausts its execution
++ * budget or wants to sleep completes. We may have to reschedule after
++ * this. Don't do a job completion if we block (can't have timers running
++ * for blocked jobs). Preemptions go first for the same reason.
++ */
++ if (!np && (out_of_time || sleep) && !blocks && !preempt)
++ job_completion(entry->scheduled, !sleep);
++
++ /* Link pending task if we became unlinked.
++ */
++ if (!entry->linked)
++ link_task_to_cpu(__take_ready(&gsnedf), entry);
++
++ /* The final scheduling decision. Do we need to switch for some reason?
++ * If linked is different from scheduled, then select linked as next.
++ */
++ if ((!np || blocks) &&
++ entry->linked != entry->scheduled) {
++ /* Schedule a linked job? */
++ if (entry->linked) {
++ entry->linked->rt_param.scheduled_on = entry->cpu;
++ next = entry->linked;
++ }
++ if (entry->scheduled) {
++ /* not gonna be scheduled soon */
++ entry->scheduled->rt_param.scheduled_on = NO_CPU;
++ TRACE_TASK(entry->scheduled, "scheduled_on = NO_CPU\n");
++ }
++ } else
++ /* Only override Linux scheduler if we have a real-time task
++ * scheduled that needs to continue.
++ */
++ if (exists)
++ next = prev;
++
++ raw_spin_unlock(&gsnedf_lock);
++
++#ifdef WANT_ALL_SCHED_EVENTS
++ TRACE("gsnedf_lock released, next=0x%p\n", next);
++
++ if (next)
++ TRACE_TASK(next, "scheduled at %llu\n", litmus_clock());
++ else if (exists && !next)
++ TRACE("becomes idle at %llu.\n", litmus_clock());
++#endif
++
++
++ return next;
++}
++
++
++/* _finish_switch - we just finished the switch away from prev
++ */
++static void gsnedf_finish_switch(struct task_struct *prev)
++{
++ cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries);
++
++ entry->scheduled = is_realtime(current) ? current : NULL;
++#ifdef WANT_ALL_SCHED_EVENTS
++ TRACE_TASK(prev, "switched away from\n");
++#endif
++}
++
++
++/* Prepare a task for running in RT mode
++ */
++static void gsnedf_task_new(struct task_struct * t, int on_rq, int running)
++{
++ unsigned long flags;
++ cpu_entry_t* entry;
++
++ TRACE("gsn edf: task new %d\n", t->pid);
++
++ raw_spin_lock_irqsave(&gsnedf_lock, flags);
++
++ /* setup job params */
++ release_at(t, litmus_clock());
++
++ if (running) {
++ entry = &per_cpu(gsnedf_cpu_entries, task_cpu(t));
++ BUG_ON(entry->scheduled);
++
++#ifdef CONFIG_RELEASE_MASTER
++ if (entry->cpu != gsnedf.release_master) {
++#endif
++ entry->scheduled = t;
++ tsk_rt(t)->scheduled_on = task_cpu(t);
++#ifdef CONFIG_RELEASE_MASTER
++ } else {
++ /* do not schedule on release master */
++ preempt(entry); /* force resched */
++ tsk_rt(t)->scheduled_on = NO_CPU;
++ }
++#endif
++ } else {
++ t->rt_param.scheduled_on = NO_CPU;
++ }
++ t->rt_param.linked_on = NO_CPU;
++
++ gsnedf_job_arrival(t);
++ raw_spin_unlock_irqrestore(&gsnedf_lock, flags);
++}
++
++static void gsnedf_task_wake_up(struct task_struct *task)
++{
++ unsigned long flags;
++ lt_t now;
++
++ TRACE_TASK(task, "wake_up at %llu\n", litmus_clock());
++
++ raw_spin_lock_irqsave(&gsnedf_lock, flags);
++ /* We need to take suspensions because of semaphores into
++ * account! If a job resumes after being suspended due to acquiring
++ * a semaphore, it should never be treated as a new job release.
++ */
++ if (get_rt_flags(task) == RT_F_EXIT_SEM) {
++ set_rt_flags(task, RT_F_RUNNING);
++ } else {
++ now = litmus_clock();
++ if (is_tardy(task, now)) {
++ /* new sporadic release */
++ release_at(task, now);
++ sched_trace_task_release(task);
++ }
++ else {
++ if (task->rt.time_slice) {
++ /* came back in time before deadline
++ */
++ set_rt_flags(task, RT_F_RUNNING);
++ }
++ }
++ }
++ gsnedf_job_arrival(task);
++ raw_spin_unlock_irqrestore(&gsnedf_lock, flags);
++}
++
++static void gsnedf_task_block(struct task_struct *t)
++{
++ unsigned long flags;
++
++ TRACE_TASK(t, "block at %llu\n", litmus_clock());
++
++ /* unlink if necessary */
++ raw_spin_lock_irqsave(&gsnedf_lock, flags);
++ unlink(t);
++ raw_spin_unlock_irqrestore(&gsnedf_lock, flags);
++
++ BUG_ON(!is_realtime(t));
++}
++
++
++static void gsnedf_task_exit(struct task_struct * t)
++{
++ unsigned long flags;
++
++ /* unlink if necessary */
++ raw_spin_lock_irqsave(&gsnedf_lock, flags);
++ unlink(t);
++ if (tsk_rt(t)->scheduled_on != NO_CPU) {
++ gsnedf_cpus[tsk_rt(t)->scheduled_on]->scheduled = NULL;
++ tsk_rt(t)->scheduled_on = NO_CPU;
++ }
++ raw_spin_unlock_irqrestore(&gsnedf_lock, flags);
++
++ BUG_ON(!is_realtime(t));
++ TRACE_TASK(t, "RIP\n");
++}
++
++#ifdef CONFIG_FMLP
++
++/* Update the queue position of a task whose priority was boosted via
++ * priority inheritance. */
++static void update_queue_position(struct task_struct *holder)
++{
++ /* We don't know whether holder is in the ready queue. It should, but
++ * on a budget overrun it may already be in a release queue. Hence,
++ * calling unlink() is not possible since it assumes that the task is
++ * not in a release queue. However, we can safely check whether
++ * sem->holder is currently in a queue or scheduled after locking both
++ * the release and the ready queue lock. */
++
++ /* Assumption: caller holds gsnedf_lock */
++
++ int check_preempt = 0;
++
++ if (tsk_rt(holder)->linked_on != NO_CPU) {
++ TRACE_TASK(holder, "%s: linked on %d\n",
++ __FUNCTION__, tsk_rt(holder)->linked_on);
++ /* Holder is scheduled; need to re-order CPUs.
++ * We can't use heap_decrease() here since
++ * the cpu_heap is ordered in reverse direction, so
++ * it is actually an increase. */
++ bheap_delete(cpu_lower_prio, &gsnedf_cpu_heap,
++ gsnedf_cpus[tsk_rt(holder)->linked_on]->hn);
++ bheap_insert(cpu_lower_prio, &gsnedf_cpu_heap,
++ gsnedf_cpus[tsk_rt(holder)->linked_on]->hn);
++ } else {
++ /* holder may be queued: first stop queue changes */
++ raw_spin_lock(&gsnedf.release_lock);
++ if (is_queued(holder)) {
++ TRACE_TASK(holder, "%s: is queued\n",
++ __FUNCTION__);
++ /* We need to update the position
++ * of holder in some heap. Note that this
++ * may be a release heap. */
++ check_preempt =
++ !bheap_decrease(edf_ready_order,
++ tsk_rt(holder)->heap_node);
++ } else {
++ /* Nothing to do: if it is not queued and not linked
++ * then it is currently being moved by other code
++ * (e.g., a timer interrupt handler) that will use the
++ * correct priority when enqueuing the task. */
++ TRACE_TASK(holder, "%s: is NOT queued => Done.\n",
++ __FUNCTION__);
++ }
++ raw_spin_unlock(&gsnedf.release_lock);
++
++ /* If holder was enqueued in a release heap, then the following
++ * preemption check is pointless, but we can't easily detect
++ * that case. If you want to fix this, then consider that
++ * simply adding a state flag requires O(n) time to update when
++ * releasing n tasks, which conflicts with the goal to have
++ * O(log n) merges. */
++ if (check_preempt) {
++ /* heap_decrease() hit the top level of the heap: make
++ * sure preemption checks get the right task, not the
++ * potentially stale cache. */
++ bheap_uncache_min(edf_ready_order,
++ &gsnedf.ready_queue);
++ check_for_preemptions();
++ }
++ }
++}
++
++static long gsnedf_pi_block(struct pi_semaphore *sem,
++ struct task_struct *new_waiter)
++{
++ /* This callback has to handle the situation where a new waiter is
++ * added to the wait queue of the semaphore.
++ *
++ * We must check if it has a higher priority than the currently
++ * highest-priority task, and then potentially reschedule.
++ */
++
++ BUG_ON(!new_waiter);
++
++ if (edf_higher_prio(new_waiter, sem->hp.task)) {
++ TRACE_TASK(new_waiter, " boosts priority via %p\n", sem);
++ /* called with IRQs disabled */
++ raw_spin_lock(&gsnedf_lock);
++ /* store new highest-priority task */
++ sem->hp.task = new_waiter;
++ if (sem->holder) {
++ TRACE_TASK(sem->holder,
++ " holds %p and will inherit from %s/%d\n",
++ sem,
++ new_waiter->comm, new_waiter->pid);
++ /* let holder inherit */
++ sem->holder->rt_param.inh_task = new_waiter;
++ update_queue_position(sem->holder);
++ }
++ raw_spin_unlock(&gsnedf_lock);
++ }
++
++ return 0;
++}
++
++static long gsnedf_inherit_priority(struct pi_semaphore *sem,
++ struct task_struct *new_owner)
++{
++ /* We don't need to acquire the gsnedf_lock since at the time of this
++ * call new_owner isn't actually scheduled yet (it's still sleeping)
++ * and since the calling function already holds sem->wait.lock, which
++ * prevents concurrent sem->hp.task changes.
++ */
++
++ if (sem->hp.task && sem->hp.task != new_owner) {
++ new_owner->rt_param.inh_task = sem->hp.task;
++ TRACE_TASK(new_owner, "inherited priority from %s/%d\n",
++ sem->hp.task->comm, sem->hp.task->pid);
++ } else
++ TRACE_TASK(new_owner,
++ "cannot inherit priority, "
++ "no higher priority job waits.\n");
++ return 0;
++}
++
++/* This function is called on a semaphore release, and assumes that
++ * the current task is also the semaphore holder.
++ */
++static long gsnedf_return_priority(struct pi_semaphore *sem)
++{
++ struct task_struct* t = current;
++ int ret = 0;
++
++ /* Find new highest-priority semaphore task
++ * if holder task is the current hp.task.
++ *
++ * Calling function holds sem->wait.lock.
++ */
++ if (t == sem->hp.task)
++ edf_set_hp_task(sem);
++
++ TRACE_CUR("gsnedf_return_priority for lock %p\n", sem);
++
++ if (t->rt_param.inh_task) {
++ /* interrupts already disabled by PI code */
++ raw_spin_lock(&gsnedf_lock);
++
++ /* Reset inh_task to NULL. */
++ t->rt_param.inh_task = NULL;
++
++ /* Check if rescheduling is necessary */
++ unlink(t);
++ gsnedf_job_arrival(t);
++ raw_spin_unlock(&gsnedf_lock);
++ }
++
++ return ret;
++}
++
++#endif
++
++static long gsnedf_admit_task(struct task_struct* tsk)
++{
++ return 0;
++}
++
++static long gsnedf_activate_plugin(void)
++{
++ int cpu;
++ cpu_entry_t *entry;
++
++ bheap_init(&gsnedf_cpu_heap);
++#ifdef CONFIG_RELEASE_MASTER
++ gsnedf.release_master = atomic_read(&release_master_cpu);
++#endif
++
++ for_each_online_cpu(cpu) {
++ entry = &per_cpu(gsnedf_cpu_entries, cpu);
++ bheap_node_init(&entry->hn, entry);
++ atomic_set(&entry->will_schedule, 0);
++ entry->linked = NULL;
++ entry->scheduled = NULL;
++#ifdef CONFIG_RELEASE_MASTER
++ if (cpu != gsnedf.release_master) {
++#endif
++ TRACE("GSN-EDF: Initializing CPU #%d.\n", cpu);
++ update_cpu_position(entry);
++#ifdef CONFIG_RELEASE_MASTER
++ } else {
++ TRACE("GSN-EDF: CPU %d is release master.\n", cpu);
++ }
++#endif
++ }
++ return 0;
++}
++
++/* Plugin object */
++static struct sched_plugin gsn_edf_plugin __cacheline_aligned_in_smp = {
++ .plugin_name = "GSN-EDF",
++ .finish_switch = gsnedf_finish_switch,
++ .tick = gsnedf_tick,
++ .task_new = gsnedf_task_new,
++ .complete_job = complete_job,
++ .task_exit = gsnedf_task_exit,
++ .schedule = gsnedf_schedule,
++ .task_wake_up = gsnedf_task_wake_up,
++ .task_block = gsnedf_task_block,
++#ifdef CONFIG_FMLP
++ .fmlp_active = 1,
++ .pi_block = gsnedf_pi_block,
++ .inherit_priority = gsnedf_inherit_priority,
++ .return_priority = gsnedf_return_priority,
++#endif
++ .admit_task = gsnedf_admit_task,
++ .activate_plugin = gsnedf_activate_plugin,
++};
++
++
++static int __init init_gsn_edf(void)
++{
++ int cpu;
++ cpu_entry_t *entry;
++
++ bheap_init(&gsnedf_cpu_heap);
++ /* initialize CPU state */
++ for (cpu = 0; cpu < NR_CPUS; cpu++) {
++ entry = &per_cpu(gsnedf_cpu_entries, cpu);
++ gsnedf_cpus[cpu] = entry;
++ atomic_set(&entry->will_schedule, 0);
++ entry->cpu = cpu;
++ entry->hn = &gsnedf_heap_node[cpu];
++ bheap_node_init(&entry->hn, entry);
++ }
++ edf_domain_init(&gsnedf, NULL, gsnedf_release_jobs);
++ return register_sched_plugin(&gsn_edf_plugin);
++}
++
++
++module_init(init_gsn_edf);
+diff --git a/litmus/sched_litmus.c b/litmus/sched_litmus.c
+new file mode 100644
+index 0000000..3ed713b
+--- /dev/null
++++ b/litmus/sched_litmus.c
+@@ -0,0 +1,315 @@
++/* This file is included from kernel/sched.c */
++
++#include <litmus/litmus.h>
++#include <litmus/budget.h>
++#include <litmus/sched_plugin.h>
++
++static void update_time_litmus(struct rq *rq, struct task_struct *p)
++{
++ u64 delta = rq->clock - p->se.exec_start;
++ if (unlikely((s64)delta < 0))
++ delta = 0;
++ /* per job counter */
++ p->rt_param.job_params.exec_time += delta;
++ /* task counter */
++ p->se.sum_exec_runtime += delta;
++ /* sched_clock() */
++ p->se.exec_start = rq->clock;
++ cpuacct_charge(p, delta);
++}
++
++static void double_rq_lock(struct rq *rq1, struct rq *rq2);
++static void double_rq_unlock(struct rq *rq1, struct rq *rq2);
++
++/*
++ * litmus_tick gets called by scheduler_tick() with HZ freq
++ * Interrupts are disabled
++ */
++static void litmus_tick(struct rq *rq, struct task_struct *p)
++{
++ TS_PLUGIN_TICK_START;
++
++ if (is_realtime(p))
++ update_time_litmus(rq, p);
++
++ /* plugin tick */
++ litmus->tick(p);
++
++ TS_PLUGIN_TICK_END;
++
++ return;
++}
++
++static struct task_struct *
++litmus_schedule(struct rq *rq, struct task_struct *prev)
++{
++ struct rq* other_rq;
++ struct task_struct *next;
++
++ long was_running;
++ lt_t _maybe_deadlock = 0;
++
++ /* let the plugin schedule */
++ next = litmus->schedule(prev);
++
++ /* check if a global plugin pulled a task from a different RQ */
++ if (next && task_rq(next) != rq) {
++ /* we need to migrate the task */
++ other_rq = task_rq(next);
++ TRACE_TASK(next, "migrate from %d\n", other_rq->cpu);
++
++ /* while we drop the lock, the prev task could change its
++ * state
++ */
++ was_running = is_running(prev);
++ mb();
++ raw_spin_unlock(&rq->lock);
++
++ /* Don't race with a concurrent switch. This could deadlock in
++ * the case of cross or circular migrations. It's the job of
++ * the plugin to make sure that doesn't happen.
++ */
++ TRACE_TASK(next, "stack_in_use=%d\n",
++ next->rt_param.stack_in_use);
++ if (next->rt_param.stack_in_use != NO_CPU) {
++ TRACE_TASK(next, "waiting to deschedule\n");
++ _maybe_deadlock = litmus_clock();
++ }
++ while (next->rt_param.stack_in_use != NO_CPU) {
++ cpu_relax();
++ mb();
++ if (next->rt_param.stack_in_use == NO_CPU)
++ TRACE_TASK(next,"descheduled. Proceeding.\n");
++
++ if (lt_before(_maybe_deadlock + 10000000,
++ litmus_clock())) {
++ /* We've been spinning for 10ms.
++ * Something can't be right!
++ * Let's abandon the task and bail out; at least
++ * we will have debug info instead of a hard
++ * deadlock.
++ */
++ TRACE_TASK(next,"stack too long in use. "
++ "Deadlock?\n");
++ next = NULL;
++
++ /* bail out */
++ raw_spin_lock(&rq->lock);
++ return next;
++ }
++ }
++#ifdef __ARCH_WANT_UNLOCKED_CTXSW
++ if (next->oncpu)
++ TRACE_TASK(next, "waiting for !oncpu");
++ while (next->oncpu) {
++ cpu_relax();
++ mb();
++ }
++#endif
++ double_rq_lock(rq, other_rq);
++ mb();
++ if (is_realtime(prev) && is_running(prev) != was_running) {
++ TRACE_TASK(prev,
++ "state changed while we dropped"
++ " the lock: is_running=%d, was_running=%d\n",
++ is_running(prev), was_running);
++ if (is_running(prev) && !was_running) {
++ /* prev task became unblocked
++ * we need to simulate normal sequence of events
++ * to scheduler plugins.
++ */
++ litmus->task_block(prev);
++ litmus->task_wake_up(prev);
++ }
++ }
++
++ set_task_cpu(next, smp_processor_id());
++
++ /* DEBUG: now that we have the lock we need to make sure a
++ * couple of things still hold:
++ * - it is still a real-time task
++ * - it is still runnable (could have been stopped)
++ * If either is violated, then the active plugin is
++ * doing something wrong.
++ */
++ if (!is_realtime(next) || !is_running(next)) {
++ /* BAD BAD BAD */
++ TRACE_TASK(next,"BAD: migration invariant FAILED: "
++ "rt=%d running=%d\n",
++ is_realtime(next),
++ is_running(next));
++ /* drop the task */
++ next = NULL;
++ }
++ /* release the other CPU's runqueue, but keep ours */
++ raw_spin_unlock(&other_rq->lock);
++ }
++ if (next) {
++ next->rt_param.stack_in_use = rq->cpu;
++ next->se.exec_start = rq->clock;
++ }
++
++ update_enforcement_timer(next);
++ return next;
++}
++
++static void enqueue_task_litmus(struct rq *rq, struct task_struct *p,
++ int wakeup, bool head)
++{
++ if (wakeup) {
++ sched_trace_task_resume(p);
++ tsk_rt(p)->present = 1;
++ /* LITMUS^RT plugins need to update the state
++ * _before_ making it available in global structures.
++ * Linux gets away with being lazy about the task state
++ * update. We can't do that, hence we update the task
++ * state already here.
++ *
++ * WARNING: this needs to be re-evaluated when porting
++ * to newer kernel versions.
++ */
++ p->state = TASK_RUNNING;
++ litmus->task_wake_up(p);
++
++ rq->litmus.nr_running++;
++ } else
++ TRACE_TASK(p, "ignoring an enqueue, not a wake up.\n");
++}
++
++static void dequeue_task_litmus(struct rq *rq, struct task_struct *p, int sleep)
++{
++ if (sleep) {
++ litmus->task_block(p);
++ tsk_rt(p)->present = 0;
++ sched_trace_task_block(p);
++
++ rq->litmus.nr_running--;
++ } else
++ TRACE_TASK(p, "ignoring a dequeue, not going to sleep.\n");
++}
++
++static void yield_task_litmus(struct rq *rq)
++{
++ BUG_ON(rq->curr != current);
++ /* sched_yield() is called to trigger delayed preemptions.
++ * Thus, mark the current task as needing to be rescheduled.
++ * This will cause the scheduler plugin to be invoked, which can
++ * then determine if a preemption is still required.
++ */
++ clear_exit_np(current);
++ set_tsk_need_resched(current);
++}
++
++/* Plugins are responsible for this.
++ */
++static void check_preempt_curr_litmus(struct rq *rq, struct task_struct *p, int flags)
++{
++}
++
++static void put_prev_task_litmus(struct rq *rq, struct task_struct *p)
++{
++}
++
++static void pre_schedule_litmus(struct rq *rq, struct task_struct *prev)
++{
++ update_time_litmus(rq, prev);
++ if (!is_running(prev))
++ tsk_rt(prev)->present = 0;
++}
++
++/* pick_next_task_litmus() - litmus_schedule() function
++ *
++ * return the next task to be scheduled
++ */
++static struct task_struct *pick_next_task_litmus(struct rq *rq)
++{
++ /* get the to-be-switched-out task (prev) */
++ struct task_struct *prev = rq->litmus.prev;
++ struct task_struct *next;
++
++ /* if not called from schedule() but from somewhere
++ * else (e.g., migration), return now!
++ */
++ if(!rq->litmus.prev)
++ return NULL;
++
++ rq->litmus.prev = NULL;
++
++ TS_PLUGIN_SCHED_START;
++ next = litmus_schedule(rq, prev);
++ TS_PLUGIN_SCHED_END;
++
++ return next;
++}
++
++static void task_tick_litmus(struct rq *rq, struct task_struct *p, int queued)
++{
++ /* nothing to do; tick related tasks are done by litmus_tick() */
++ return;
++}
++
++static void switched_to_litmus(struct rq *rq, struct task_struct *p, int running)
++{
++}
++
++static void prio_changed_litmus(struct rq *rq, struct task_struct *p,
++ int oldprio, int running)
++{
++}
++
++unsigned int get_rr_interval_litmus(struct rq *rq, struct task_struct *p)
++{
++ /* return infinity */
++ return 0;
++}
++
++/* This is called when a task became a real-time task, either due to a SCHED_*
++ * class transition or due to PI mutex inheritance. We don't handle Linux PI
++ * mutex inheritance yet (and probably never will). Use LITMUS provided
++ * synchronization primitives instead.
++ */
++static void set_curr_task_litmus(struct rq *rq)
++{
++ rq->curr->se.exec_start = rq->clock;
++}
++
++
++#ifdef CONFIG_SMP
++/* execve tries to rebalance the task in this scheduling domain.
++ * We don't care about the scheduling domain; this can get called from
++ * exec, fork, and wakeup.
++ */
++static int select_task_rq_litmus(struct task_struct *p, int sd_flag, int flags)
++{
++ /* preemption is already disabled.
++ * We don't want to change cpu here
++ */
++ return task_cpu(p);
++}
++#endif
++
++static const struct sched_class litmus_sched_class = {
++ .next = &rt_sched_class,
++ .enqueue_task = enqueue_task_litmus,
++ .dequeue_task = dequeue_task_litmus,
++ .yield_task = yield_task_litmus,
++
++ .check_preempt_curr = check_preempt_curr_litmus,
++
++ .pick_next_task = pick_next_task_litmus,
++ .put_prev_task = put_prev_task_litmus,
++
++#ifdef CONFIG_SMP
++ .select_task_rq = select_task_rq_litmus,
++
++ .pre_schedule = pre_schedule_litmus,
++#endif
++
++ .set_curr_task = set_curr_task_litmus,
++ .task_tick = task_tick_litmus,
++
++ .get_rr_interval = get_rr_interval_litmus,
++
++ .prio_changed = prio_changed_litmus,
++ .switched_to = switched_to_litmus,
++};
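++
++/* Note: this class sits above rt_sched_class in the scheduling-class list
++ * (via .next), so runnable LITMUS^RT tasks are picked before any
++ * SCHED_FIFO/SCHED_RR or CFS task on the same runqueue.
++ */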
+diff --git a/litmus/sched_pfair.c b/litmus/sched_pfair.c
+new file mode 100644
+index 0000000..ea77d32
+--- /dev/null
++++ b/litmus/sched_pfair.c
+@@ -0,0 +1,897 @@
++/*
++ * litmus/sched_pfair.c
++ *
++ * Implementation of the (global) Pfair scheduling algorithm.
++ *
++ */
++
++#include <asm/div64.h>
++#include <linux/delay.h>
++#include <linux/module.h>
++#include <linux/spinlock.h>
++#include <linux/percpu.h>
++#include <linux/sched.h>
++#include <linux/list.h>
++#include <linux/slab.h>
++
++#include <litmus/litmus.h>
++#include <litmus/jobs.h>
++#include <litmus/rt_domain.h>
++#include <litmus/sched_plugin.h>
++#include <litmus/sched_trace.h>
++
++#include <litmus/bheap.h>
++
++struct subtask {
++ /* measured in quanta relative to job release */
++ quanta_t release;
++ quanta_t deadline;
++ quanta_t overlap; /* called "b bit" by PD^2 */
++ quanta_t group_deadline;
++};
++
++struct pfair_param {
++ quanta_t quanta; /* number of subtasks */
++ quanta_t cur; /* index of current subtask */
++
++ quanta_t release; /* in quanta */
++ quanta_t period; /* in quanta */
++
++ quanta_t last_quantum; /* when scheduled last */
++ int last_cpu; /* where scheduled last */
++
++ unsigned int sporadic_release; /* On wakeup, new sporadic release? */
++
++ struct subtask subtasks[0]; /* allocate together with pfair_param */
++};
++
++#define tsk_pfair(tsk) ((tsk)->rt_param.pfair)
++
++struct pfair_state {
++ int cpu;
++ volatile quanta_t cur_tick; /* updated by the CPU that is advancing
++ * the time */
++ volatile quanta_t local_tick; /* What tick is the local CPU currently
++ * executing? Updated only by the local
++ * CPU. In QEMU, this may lag behind the
++ * current tick. In a real system, with
++ * proper timers and aligned quanta,
++ * that should only be the
++ * case for a very short time after the
++ * time advanced. With staggered quanta,
++ * it will lag for the duration of the
++ * offset.
++ */
++
++ struct task_struct* linked; /* the task that should be executing */
++ struct task_struct* local; /* the local copy of linked */
++ struct task_struct* scheduled; /* what is actually scheduled */
++
++ unsigned long missed_quanta;
++ lt_t offset; /* stagger offset */
++};
++
++/* Currently, we limit the maximum period of any task to 2000 quanta.
++ * The reason is that it makes the implementation easier since we do not
++ * need to reallocate the release wheel on task arrivals.
++ * In the future, the wheel could be sized dynamically to lift this limit.
++ */
++#define PFAIR_MAX_PERIOD 2000
++
++/* This is the release queue wheel. It is indexed by pfair_time %
++ * PFAIR_MAX_PERIOD. Each heap is ordered by PFAIR priority, so that it can be
++ * merged with the ready queue.
++ */
++static struct bheap release_queue[PFAIR_MAX_PERIOD];
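++
++/* Example (PFAIR_MAX_PERIOD == 2000): quanta 1999, 2000 and 2001 map to
++ * wheel slots 1999, 0 and 1, since relq() below computes
++ *
++ *	&release_queue[t % PFAIR_MAX_PERIOD]
++ *
++ * The period bound above ensures that all pending releases fit within one
++ * revolution of the wheel.
++ */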
++
++DEFINE_PER_CPU(struct pfair_state, pfair_state);
++struct pfair_state* *pstate; /* short cut */
++
++static quanta_t pfair_time = 0; /* the "official" PFAIR clock */
++static quanta_t merge_time = 0; /* Updated after the release queue has been
++ * merged. Used by drop_all_references().
++ */
++
++static rt_domain_t pfair;
++
++/* The pfair_lock is used to serialize all scheduling events.
++ */
++#define pfair_lock pfair.ready_lock
++
++/* Enable for lots of trace info.
++ * #define PFAIR_DEBUG
++ */
++
++#ifdef PFAIR_DEBUG
++#define PTRACE_TASK(t, f, args...) TRACE_TASK(t, f, ## args)
++#define PTRACE(f, args...) TRACE(f, ## args)
++#else
++#define PTRACE_TASK(t, f, args...)
++#define PTRACE(f, args...)
++#endif
++
++/* gcc will inline all of these accessor functions... */
++static struct subtask* cur_subtask(struct task_struct* t)
++{
++ return tsk_pfair(t)->subtasks + tsk_pfair(t)->cur;
++}
++
++static quanta_t cur_deadline(struct task_struct* t)
++{
++ return cur_subtask(t)->deadline + tsk_pfair(t)->release;
++}
++
++
++static quanta_t cur_sub_release(struct task_struct* t)
++{
++ return cur_subtask(t)->release + tsk_pfair(t)->release;
++}
++
++static quanta_t cur_release(struct task_struct* t)
++{
++#ifdef EARLY_RELEASE
++ /* only the release of the first subtask counts when we early
++ * release */
++ return tsk_pfair(t)->release;
++#else
++ return cur_sub_release(t);
++#endif
++}
++
++static quanta_t cur_overlap(struct task_struct* t)
++{
++ return cur_subtask(t)->overlap;
++}
++
++static quanta_t cur_group_deadline(struct task_struct* t)
++{
++ quanta_t gdl = cur_subtask(t)->group_deadline;
++ if (gdl)
++ return gdl + tsk_pfair(t)->release;
++ else
++ return gdl;
++}
++
++
++static int pfair_higher_prio(struct task_struct* first,
++ struct task_struct* second)
++{
++ return /* first task must exist */
++ first && (
++ /* Does the second task exist and is it a real-time task? If
++ * not, the first task (which is a RT task) has higher
++ * priority.
++ */
++ !second || !is_realtime(second) ||
++
++ /* Is the (subtask) deadline of the first task earlier?
++ * Then it has higher priority.
++ */
++ time_before(cur_deadline(first), cur_deadline(second)) ||
++
++ /* Do we have a deadline tie?
++ * Then break by B-bit.
++ */
++ (cur_deadline(first) == cur_deadline(second) &&
++ (cur_overlap(first) > cur_overlap(second) ||
++
++ /* Do we have a B-bit tie?
++ * Then break by group deadline.
++ */
++ (cur_overlap(first) == cur_overlap(second) &&
++ (time_after(cur_group_deadline(first),
++ cur_group_deadline(second)) ||
++
++			 /* Do we have a group deadline tie?
++			  * Then break by PID, which is unique.
++			  */
++ (cur_group_deadline(first) ==
++ cur_group_deadline(second) &&
++ first->pid < second->pid))))));
++}
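++
++/* For illustration of the tie-breaking chain above: two subtasks with equal
++ * deadlines are ordered by their b-bits (a set b-bit wins over a clear one);
++ * on a b-bit tie, the later group deadline wins; if those are equal as well,
++ * the lower PID wins, which makes the order total for distinct tasks.
++ */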
++
++int pfair_ready_order(struct bheap_node* a, struct bheap_node* b)
++{
++ return pfair_higher_prio(bheap2task(a), bheap2task(b));
++}
++
++/* return the proper release queue for time t */
++static struct bheap* relq(quanta_t t)
++{
++ struct bheap* rq = &release_queue[t % PFAIR_MAX_PERIOD];
++ return rq;
++}
++
++static void prepare_release(struct task_struct* t, quanta_t at)
++{
++ tsk_pfair(t)->release = at;
++ tsk_pfair(t)->cur = 0;
++}
++
++static void __pfair_add_release(struct task_struct* t, struct bheap* queue)
++{
++ bheap_insert(pfair_ready_order, queue,
++ tsk_rt(t)->heap_node);
++}
++
++static void pfair_add_release(struct task_struct* t)
++{
++ BUG_ON(bheap_node_in_heap(tsk_rt(t)->heap_node));
++ __pfair_add_release(t, relq(cur_release(t)));
++}
++
++/* pull released tasks from the release queue */
++static void poll_releases(quanta_t time)
++{
++ __merge_ready(&pfair, relq(time));
++ merge_time = time;
++}
++
++static void check_preempt(struct task_struct* t)
++{
++ int cpu = NO_CPU;
++ if (tsk_rt(t)->linked_on != tsk_rt(t)->scheduled_on &&
++ tsk_rt(t)->present) {
++ /* the task can be scheduled and
++ * is not scheduled where it ought to be scheduled
++ */
++ cpu = tsk_rt(t)->linked_on != NO_CPU ?
++ tsk_rt(t)->linked_on :
++ tsk_rt(t)->scheduled_on;
++ PTRACE_TASK(t, "linked_on:%d, scheduled_on:%d\n",
++ tsk_rt(t)->linked_on, tsk_rt(t)->scheduled_on);
++ /* preempt */
++ if (cpu == smp_processor_id())
++ set_tsk_need_resched(current);
++ else {
++ smp_send_reschedule(cpu);
++ }
++ }
++}
++
++/* caller must hold pfair_lock */
++static void drop_all_references(struct task_struct *t)
++{
++ int cpu;
++ struct pfair_state* s;
++ struct bheap* q;
++ if (bheap_node_in_heap(tsk_rt(t)->heap_node)) {
++ /* figure out what queue the node is in */
++ if (time_before_eq(cur_release(t), merge_time))
++ q = &pfair.ready_queue;
++ else
++ q = relq(cur_release(t));
++ bheap_delete(pfair_ready_order, q,
++ tsk_rt(t)->heap_node);
++ }
++ for (cpu = 0; cpu < num_online_cpus(); cpu++) {
++ s = &per_cpu(pfair_state, cpu);
++ if (s->linked == t)
++ s->linked = NULL;
++ if (s->local == t)
++ s->local = NULL;
++ if (s->scheduled == t)
++ s->scheduled = NULL;
++ }
++}
++
++/* returns 1 if the task needs to go to the release queue */
++static int advance_subtask(quanta_t time, struct task_struct* t, int cpu)
++{
++ struct pfair_param* p = tsk_pfair(t);
++ int to_relq;
++ p->cur = (p->cur + 1) % p->quanta;
++ if (!p->cur) {
++ sched_trace_task_completion(t, 1);
++ if (tsk_rt(t)->present) {
++ /* we start a new job */
++ prepare_for_next_period(t);
++ sched_trace_task_release(t);
++ get_rt_flags(t) = RT_F_RUNNING;
++ p->release += p->period;
++ } else {
++ /* remove task from system until it wakes */
++ drop_all_references(t);
++ tsk_pfair(t)->sporadic_release = 1;
++ TRACE_TASK(t, "on %d advanced to subtask %lu (not present)\n",
++ cpu, p->cur);
++ return 0;
++ }
++ }
++ to_relq = time_after(cur_release(t), time);
++ TRACE_TASK(t, "on %d advanced to subtask %lu -> to_relq=%d\n",
++ cpu, p->cur, to_relq);
++ return to_relq;
++}
++
++static void advance_subtasks(quanta_t time)
++{
++ int cpu, missed;
++ struct task_struct* l;
++ struct pfair_param* p;
++
++ for_each_online_cpu(cpu) {
++ l = pstate[cpu]->linked;
++ missed = pstate[cpu]->linked != pstate[cpu]->local;
++ if (l) {
++ p = tsk_pfair(l);
++ p->last_quantum = time;
++ p->last_cpu = cpu;
++ if (advance_subtask(time, l, cpu)) {
++ pstate[cpu]->linked = NULL;
++ pfair_add_release(l);
++ }
++ }
++ }
++}
++
++static int target_cpu(quanta_t time, struct task_struct* t, int default_cpu)
++{
++ int cpu;
++ if (tsk_rt(t)->scheduled_on != NO_CPU) {
++ /* always observe scheduled_on linkage */
++ default_cpu = tsk_rt(t)->scheduled_on;
++ } else if (tsk_pfair(t)->last_quantum == time - 1) {
++ /* back2back quanta */
++ /* Only observe last_quantum if no scheduled_on is in the way.
++ * This should only kick in if a CPU missed quanta, and that
++ * *should* only happen in QEMU.
++ */
++ cpu = tsk_pfair(t)->last_cpu;
++ if (!pstate[cpu]->linked ||
++ tsk_rt(pstate[cpu]->linked)->scheduled_on != cpu) {
++ default_cpu = cpu;
++ }
++ }
++ return default_cpu;
++}
++
++/* returns one if linking was redirected */
++static int pfair_link(quanta_t time, int cpu,
++ struct task_struct* t)
++{
++ int target = target_cpu(time, t, cpu);
++ struct task_struct* prev = pstate[cpu]->linked;
++ struct task_struct* other;
++
++ if (target != cpu) {
++ other = pstate[target]->linked;
++ pstate[target]->linked = t;
++ tsk_rt(t)->linked_on = target;
++ if (!other)
++ /* linked ok, but reschedule this CPU */
++ return 1;
++ if (target < cpu) {
++ /* link other to cpu instead */
++ tsk_rt(other)->linked_on = cpu;
++ pstate[cpu]->linked = other;
++ if (prev) {
++ /* prev got pushed back into the ready queue */
++ tsk_rt(prev)->linked_on = NO_CPU;
++ __add_ready(&pfair, prev);
++ }
++ /* we are done with this cpu */
++ return 0;
++ } else {
++			/* re-add other, its original CPU was not considered yet */
++ tsk_rt(other)->linked_on = NO_CPU;
++ __add_ready(&pfair, other);
++ /* reschedule this CPU */
++ return 1;
++ }
++ } else {
++ pstate[cpu]->linked = t;
++ tsk_rt(t)->linked_on = cpu;
++ if (prev) {
++ /* prev got pushed back into the ready queue */
++ tsk_rt(prev)->linked_on = NO_CPU;
++ __add_ready(&pfair, prev);
++ }
++ /* we are done with this CPU */
++ return 0;
++ }
++}
++
++static void schedule_subtasks(quanta_t time)
++{
++ int cpu, retry;
++
++ for_each_online_cpu(cpu) {
++ retry = 1;
++ while (retry) {
++ if (pfair_higher_prio(__peek_ready(&pfair),
++ pstate[cpu]->linked))
++ retry = pfair_link(time, cpu,
++ __take_ready(&pfair));
++ else
++ retry = 0;
++ }
++ }
++}
++
++static void schedule_next_quantum(quanta_t time)
++{
++ int cpu;
++
++ /* called with interrupts disabled */
++ PTRACE("--- Q %lu at %llu PRE-SPIN\n",
++ time, litmus_clock());
++ raw_spin_lock(&pfair_lock);
++ PTRACE("<<< Q %lu at %llu\n",
++ time, litmus_clock());
++
++ sched_trace_quantum_boundary();
++
++ advance_subtasks(time);
++ poll_releases(time);
++ schedule_subtasks(time);
++
++ for (cpu = 0; cpu < num_online_cpus(); cpu++)
++ if (pstate[cpu]->linked)
++ PTRACE_TASK(pstate[cpu]->linked,
++ " linked on %d.\n", cpu);
++ else
++ PTRACE("(null) linked on %d.\n", cpu);
++
++ /* We are done. Advance time. */
++ mb();
++ for (cpu = 0; cpu < num_online_cpus(); cpu++) {
++ if (pstate[cpu]->local_tick != pstate[cpu]->cur_tick) {
++ TRACE("BAD Quantum not acked on %d "
++ "(l:%lu c:%lu p:%lu)\n",
++ cpu,
++ pstate[cpu]->local_tick,
++ pstate[cpu]->cur_tick,
++ pfair_time);
++ pstate[cpu]->missed_quanta++;
++ }
++ pstate[cpu]->cur_tick = time;
++ }
++ PTRACE(">>> Q %lu at %llu\n",
++ time, litmus_clock());
++ raw_spin_unlock(&pfair_lock);
++}
++
++static noinline void wait_for_quantum(quanta_t q, struct pfair_state* state)
++{
++ quanta_t loc;
++
++ goto first; /* skip mb() on first iteration */
++ do {
++ cpu_relax();
++ mb();
++ first: loc = state->cur_tick;
++ /* FIXME: what if loc > cur? */
++ } while (time_before(loc, q));
++ PTRACE("observed cur_tick:%lu >= q:%lu\n",
++ loc, q);
++}
++
++static quanta_t current_quantum(struct pfair_state* state)
++{
++ lt_t t = litmus_clock() - state->offset;
++ return time2quanta(t, FLOOR);
++}
++
++static void catchup_quanta(quanta_t from, quanta_t target,
++ struct pfair_state* state)
++{
++ quanta_t cur = from, time;
++ TRACE("+++< BAD catching up quanta from %lu to %lu\n",
++ from, target);
++ while (time_before(cur, target)) {
++ wait_for_quantum(cur, state);
++ cur++;
++ time = cmpxchg(&pfair_time,
++ cur - 1, /* expected */
++ cur /* next */
++ );
++ if (time == cur - 1)
++ schedule_next_quantum(cur);
++ }
++ TRACE("+++> catching up done\n");
++}
++
++/* pfair_tick - this function is called for every local timer
++ * interrupt.
++ */
++static void pfair_tick(struct task_struct* t)
++{
++ struct pfair_state* state = &__get_cpu_var(pfair_state);
++ quanta_t time, cur;
++ int retry = 10;
++
++ do {
++ cur = current_quantum(state);
++ PTRACE("q %lu at %llu\n", cur, litmus_clock());
++
++ /* Attempt to advance time. First CPU to get here
++ * will prepare the next quantum.
++ */
++ time = cmpxchg(&pfair_time,
++ cur - 1, /* expected */
++ cur /* next */
++ );
++ if (time == cur - 1) {
++ /* exchange succeeded */
++ wait_for_quantum(cur - 1, state);
++ schedule_next_quantum(cur);
++ retry = 0;
++ } else if (time_before(time, cur - 1)) {
++ /* the whole system missed a tick !? */
++ catchup_quanta(time, cur, state);
++ retry--;
++ } else if (time_after(time, cur)) {
++			/* our timer is lagging behind!? */
++ TRACE("BAD pfair_time:%lu > cur:%lu\n", time, cur);
++ retry--;
++ } else {
++ /* Some other CPU already started scheduling
++ * this quantum. Let it do its job and then update.
++ */
++ retry = 0;
++ }
++ } while (retry);
++
++ /* Spin locally until time advances. */
++ wait_for_quantum(cur, state);
++
++ /* copy assignment */
++ /* FIXME: what if we race with a future update? Corrupted state? */
++ state->local = state->linked;
++ /* signal that we are done */
++ mb();
++ state->local_tick = state->cur_tick;
++
++ if (state->local != current
++ && (is_realtime(current) || is_present(state->local)))
++ set_tsk_need_resched(current);
++}
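++
++/* Summary of the quantum handshake in pfair_tick(): in the common case, the
++ * first CPU whose cmpxchg on pfair_time succeeds prepares the next quantum
++ * for everyone via schedule_next_quantum(); the remaining CPUs spin in
++ * wait_for_quantum() until their cur_tick has been advanced and then adopt
++ * the linked task as state->local. catchup_quanta() handles the unlikely
++ * case that whole quanta were missed.
++ */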
++
++static int safe_to_schedule(struct task_struct* t, int cpu)
++{
++ int where = tsk_rt(t)->scheduled_on;
++ if (where != NO_CPU && where != cpu) {
++ TRACE_TASK(t, "BAD: can't be scheduled on %d, "
++ "scheduled already on %d.\n", cpu, where);
++ return 0;
++ } else
++ return tsk_rt(t)->present && get_rt_flags(t) == RT_F_RUNNING;
++}
++
++static struct task_struct* pfair_schedule(struct task_struct * prev)
++{
++ struct pfair_state* state = &__get_cpu_var(pfair_state);
++ int blocks;
++ struct task_struct* next = NULL;
++
++ raw_spin_lock(&pfair_lock);
++
++ blocks = is_realtime(prev) && !is_running(prev);
++
++ if (state->local && safe_to_schedule(state->local, state->cpu))
++ next = state->local;
++
++ if (prev != next) {
++ tsk_rt(prev)->scheduled_on = NO_CPU;
++ if (next)
++ tsk_rt(next)->scheduled_on = state->cpu;
++ }
++
++ raw_spin_unlock(&pfair_lock);
++
++ if (next)
++ TRACE_TASK(next, "scheduled rel=%lu at %lu (%llu)\n",
++ tsk_pfair(next)->release, pfair_time, litmus_clock());
++ else if (is_realtime(prev))
++ TRACE("Becomes idle at %lu (%llu)\n", pfair_time, litmus_clock());
++
++ return next;
++}
++
++static void pfair_task_new(struct task_struct * t, int on_rq, int running)
++{
++ unsigned long flags;
++
++ TRACE("pfair: task new %d state:%d\n", t->pid, t->state);
++
++ raw_spin_lock_irqsave(&pfair_lock, flags);
++ if (running)
++ t->rt_param.scheduled_on = task_cpu(t);
++ else
++ t->rt_param.scheduled_on = NO_CPU;
++
++ prepare_release(t, pfair_time + 1);
++ tsk_pfair(t)->sporadic_release = 0;
++ pfair_add_release(t);
++ check_preempt(t);
++
++ raw_spin_unlock_irqrestore(&pfair_lock, flags);
++}
++
++static void pfair_task_wake_up(struct task_struct *t)
++{
++ unsigned long flags;
++ lt_t now;
++
++ TRACE_TASK(t, "wakes at %llu, release=%lu, pfair_time:%lu\n",
++ litmus_clock(), cur_release(t), pfair_time);
++
++ raw_spin_lock_irqsave(&pfair_lock, flags);
++
++ /* It is a little unclear how to deal with Pfair
++ * tasks that block for a while and then wake. For now,
++ * if a task blocks and wakes before its next job release,
++ * then it may resume if it is currently linked somewhere
++ * (as if it never blocked at all). Otherwise, we have a
++ * new sporadic job release.
++ */
++ if (tsk_pfair(t)->sporadic_release) {
++ now = litmus_clock();
++ release_at(t, now);
++ prepare_release(t, time2quanta(now, CEIL));
++ sched_trace_task_release(t);
++ /* FIXME: race with pfair_time advancing */
++ pfair_add_release(t);
++ tsk_pfair(t)->sporadic_release = 0;
++ }
++
++ check_preempt(t);
++
++ raw_spin_unlock_irqrestore(&pfair_lock, flags);
++ TRACE_TASK(t, "wake up done at %llu\n", litmus_clock());
++}
++
++static void pfair_task_block(struct task_struct *t)
++{
++ BUG_ON(!is_realtime(t));
++ TRACE_TASK(t, "blocks at %llu, state:%d\n",
++ litmus_clock(), t->state);
++}
++
++static void pfair_task_exit(struct task_struct * t)
++{
++ unsigned long flags;
++
++ BUG_ON(!is_realtime(t));
++
++	/* Remove the task from the release or ready queue, and ensure
++	 * that it is not the scheduled task for ANY CPU. We
++	 * do this blanket check because occasionally, when
++	 * tasks exit while blocked, the task_cpu of the task
++	 * might not be the same as the CPU that the PFAIR scheduler
++	 * has chosen for it.
++	 */
++ raw_spin_lock_irqsave(&pfair_lock, flags);
++
++ TRACE_TASK(t, "RIP, state:%d\n", t->state);
++ drop_all_references(t);
++
++ raw_spin_unlock_irqrestore(&pfair_lock, flags);
++
++ kfree(t->rt_param.pfair);
++ t->rt_param.pfair = NULL;
++}
++
++
++static void pfair_release_at(struct task_struct* task, lt_t start)
++{
++ unsigned long flags;
++ quanta_t release;
++
++ BUG_ON(!is_realtime(task));
++
++ raw_spin_lock_irqsave(&pfair_lock, flags);
++ release_at(task, start);
++ release = time2quanta(start, CEIL);
++
++ if (release - pfair_time >= PFAIR_MAX_PERIOD)
++ release = pfair_time + PFAIR_MAX_PERIOD;
++
++ TRACE_TASK(task, "sys release at %lu\n", release);
++
++ drop_all_references(task);
++ prepare_release(task, release);
++ pfair_add_release(task);
++
++ /* Clear sporadic release flag, since this release subsumes any
++ * sporadic release on wake.
++ */
++ tsk_pfair(task)->sporadic_release = 0;
++
++ raw_spin_unlock_irqrestore(&pfair_lock, flags);
++}
++
++static void init_subtask(struct subtask* sub, unsigned long i,
++ lt_t quanta, lt_t period)
++{
++ /* since i is zero-based, the formulas are shifted by one */
++ lt_t tmp;
++
++ /* release */
++ tmp = period * i;
++ do_div(tmp, quanta); /* floor */
++ sub->release = (quanta_t) tmp;
++
++ /* deadline */
++ tmp = period * (i + 1);
++ if (do_div(tmp, quanta)) /* ceil */
++ tmp++;
++ sub->deadline = (quanta_t) tmp;
++
++	/* overlap ("b-bit"): deadline minus the next subtask's release */
++ tmp = period * (i + 1);
++ do_div(tmp, quanta); /* floor */
++ sub->overlap = sub->deadline - (quanta_t) tmp;
++
++ /* Group deadline.
++ * Based on the formula given in Uma's thesis.
++ */
++ if (2 * quanta >= period) {
++ /* heavy */
++ tmp = (sub->deadline - (i + 1)) * period;
++ if (period > quanta &&
++ do_div(tmp, (period - quanta))) /* ceil */
++ tmp++;
++ sub->group_deadline = (quanta_t) tmp;
++ } else
++ sub->group_deadline = 0;
++}
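++
++/* Worked example for the formulas above: a task with quanta = 2 and
++ * period = 5 (weight 2/5) gets the subtasks
++ *	i = 0: release = 0, deadline = 3, overlap (b-bit) = 1, group_deadline = 0
++ *	i = 1: release = 2, deadline = 5, overlap (b-bit) = 0, group_deadline = 0
++ * i.e., windows [0, 3) and [2, 5) that overlap by one quantum. Group
++ * deadlines are non-zero only for heavy tasks (2 * quanta >= period).
++ */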
++
++static void dump_subtasks(struct task_struct* t)
++{
++ unsigned long i;
++ for (i = 0; i < t->rt_param.pfair->quanta; i++)
++ TRACE_TASK(t, "SUBTASK %lu: rel=%lu dl=%lu bbit:%lu gdl:%lu\n",
++ i + 1,
++ t->rt_param.pfair->subtasks[i].release,
++ t->rt_param.pfair->subtasks[i].deadline,
++ t->rt_param.pfair->subtasks[i].overlap,
++ t->rt_param.pfair->subtasks[i].group_deadline);
++}
++
++static long pfair_admit_task(struct task_struct* t)
++{
++ lt_t quanta;
++ lt_t period;
++ s64 quantum_length = ktime_to_ns(tick_period);
++ struct pfair_param* param;
++ unsigned long i;
++
++ /* Pfair is a tick-based method, so the time
++ * of interest is jiffies. Calculate tick-based
++ * times for everything.
++ * (Ceiling of exec cost, floor of period.)
++ */
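++	/* For illustration (hypothetical values): with a 1ms quantum, a task
++	 * with an execution cost of 2.3ms and a period of 10ms is admitted
++	 * with quanta = 3 (ceiling) and period = 10; since 10ms is an exact
++	 * multiple of the quantum, the warning below is not printed.
++	 */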
++
++ quanta = get_exec_cost(t);
++ period = get_rt_period(t);
++
++ quanta = time2quanta(get_exec_cost(t), CEIL);
++
++ if (do_div(period, quantum_length))
++ printk(KERN_WARNING
++ "The period of %s/%d is not a multiple of %llu.\n",
++ t->comm, t->pid, (unsigned long long) quantum_length);
++
++ if (period >= PFAIR_MAX_PERIOD) {
++ printk(KERN_WARNING
++ "PFAIR: Rejecting task %s/%d; its period is too long.\n",
++ t->comm, t->pid);
++ return -EINVAL;
++ }
++
++ if (quanta == period) {
++ /* special case: task has weight 1.0 */
++ printk(KERN_INFO
++ "Admitting weight 1.0 task. (%s/%d, %llu, %llu).\n",
++ t->comm, t->pid, quanta, period);
++ quanta = 1;
++ period = 1;
++ }
++
++ param = kmalloc(sizeof(*param) +
++ quanta * sizeof(struct subtask), GFP_ATOMIC);
++
++ if (!param)
++ return -ENOMEM;
++
++ param->quanta = quanta;
++ param->cur = 0;
++ param->release = 0;
++ param->period = period;
++
++ for (i = 0; i < quanta; i++)
++ init_subtask(param->subtasks + i, i, quanta, period);
++
++ if (t->rt_param.pfair)
++ /* get rid of stale allocation */
++ kfree(t->rt_param.pfair);
++
++ t->rt_param.pfair = param;
++
++ /* spew out some debug info */
++ dump_subtasks(t);
++
++ return 0;
++}
++
++static long pfair_activate_plugin(void)
++{
++ int cpu;
++ struct pfair_state* state;
++
++ state = &__get_cpu_var(pfair_state);
++ pfair_time = current_quantum(state);
++
++ TRACE("Activating PFAIR at q=%lu\n", pfair_time);
++
++ for (cpu = 0; cpu < num_online_cpus(); cpu++) {
++ state = &per_cpu(pfair_state, cpu);
++ state->cur_tick = pfair_time;
++ state->local_tick = pfair_time;
++ state->missed_quanta = 0;
++ state->offset = cpu_stagger_offset(cpu);
++ }
++
++ return 0;
++}
++
++/* Plugin object */
++static struct sched_plugin pfair_plugin __cacheline_aligned_in_smp = {
++ .plugin_name = "PFAIR",
++ .tick = pfair_tick,
++ .task_new = pfair_task_new,
++ .task_exit = pfair_task_exit,
++ .schedule = pfair_schedule,
++ .task_wake_up = pfair_task_wake_up,
++ .task_block = pfair_task_block,
++ .admit_task = pfair_admit_task,
++ .release_at = pfair_release_at,
++ .complete_job = complete_job,
++ .activate_plugin = pfair_activate_plugin,
++};
++
++static int __init init_pfair(void)
++{
++ int cpu, i;
++ struct pfair_state *state;
++
++
++	/*
++	 * Initialize the pstate shortcut array for the per-cpu pfair state.
++	 * Note that CPU hotplug is not handled: CPUs must not be removed
++	 * while this initialization runs, nor added or removed afterwards.
++	 */
++	pstate = kmalloc(sizeof(struct pfair_state*) * num_online_cpus(), GFP_KERNEL);
++	if (!pstate)
++		return -ENOMEM;
++
++ /* initialize release queue */
++ for (i = 0; i < PFAIR_MAX_PERIOD; i++)
++ bheap_init(&release_queue[i]);
++
++ /* initialize CPU state */
++ for (cpu = 0; cpu < num_online_cpus(); cpu++) {
++ state = &per_cpu(pfair_state, cpu);
++ state->cpu = cpu;
++ state->cur_tick = 0;
++ state->local_tick = 0;
++ state->linked = NULL;
++ state->local = NULL;
++ state->scheduled = NULL;
++ state->missed_quanta = 0;
++ state->offset = cpu_stagger_offset(cpu);
++ pstate[cpu] = state;
++ }
++
++ rt_domain_init(&pfair, pfair_ready_order, NULL, NULL);
++ return register_sched_plugin(&pfair_plugin);
++}
++
++static void __exit clean_pfair(void)
++{
++ kfree(pstate);
++}
++
++module_init(init_pfair);
++module_exit(clean_pfair);
+diff --git a/litmus/sched_plugin.c b/litmus/sched_plugin.c
+new file mode 100644
+index 0000000..3543b7b
+--- /dev/null
++++ b/litmus/sched_plugin.c
+@@ -0,0 +1,265 @@
++/* sched_plugin.c -- core infrastructure for the scheduler plugin system
++ *
++ * This file includes the initialization of the plugin system, the no-op Linux
++ * scheduler plugin, some dummy functions, and some helper functions.
++ */
++
++#include
++#include
++
++#include
++#include
++
++#include
++
++/*
++ * Generic function to trigger preemption on either local or remote cpu
++ * from scheduler plugins. The key feature is that this function is
++ * non-preemptive section aware and does not invoke the scheduler / send
++ * IPIs if the to-be-preempted task is actually non-preemptive.
++ */
++void preempt_if_preemptable(struct task_struct* t, int on_cpu)
++{
++	/* t is the real-time task executing on CPU on_cpu. If t is NULL, then
++	 * on_cpu is currently scheduling background work.
++	 */
++
++ int send_ipi;
++
++ if (smp_processor_id() == on_cpu) {
++ /* local CPU case */
++ if (t) {
++ /* check if we need to poke userspace */
++ if (is_user_np(t))
++ /* yes, poke it */
++ request_exit_np(t);
++ else
++ /* no, see if we are allowed to preempt the
++ * currently-executing task */
++ if (!is_kernel_np(t))
++ set_tsk_need_resched(t);
++ } else
++ /* move non-real-time task out of the way */
++ set_tsk_need_resched(current);
++ } else {
++ /* remote CPU case */
++ if (!t)
++ /* currently schedules non-real-time work */
++ send_ipi = 1;
++ else {
++ /* currently schedules real-time work */
++ if (is_user_np(t)) {
++ /* need to notify user space of delayed
++ * preemption */
++
++ /* to avoid a race, set the flag, then test
++ * again */
++ request_exit_np(t);
++ /* make sure it got written */
++ mb();
++ }
++			/* Only send an IPI if the remote task might have raced
++			 * with our request, i.e., send an IPI to make sure
++			 * that it has exited its critical section.
++			 */
++ send_ipi = !is_np(t) && !is_kernel_np(t);
++ }
++ if (likely(send_ipi))
++ smp_send_reschedule(on_cpu);
++ }
++}
++
++
++/*************************************************************
++ * Dummy plugin functions *
++ *************************************************************/
++
++static void litmus_dummy_finish_switch(struct task_struct * prev)
++{
++}
++
++static struct task_struct* litmus_dummy_schedule(struct task_struct * prev)
++{
++ return NULL;
++}
++
++static void litmus_dummy_tick(struct task_struct* tsk)
++{
++}
++
++static long litmus_dummy_admit_task(struct task_struct* tsk)
++{
++ printk(KERN_CRIT "LITMUS^RT: Linux plugin rejects %s/%d.\n",
++ tsk->comm, tsk->pid);
++ return -EINVAL;
++}
++
++static void litmus_dummy_task_new(struct task_struct *t, int on_rq, int running)
++{
++}
++
++static void litmus_dummy_task_wake_up(struct task_struct *task)
++{
++}
++
++static void litmus_dummy_task_block(struct task_struct *task)
++{
++}
++
++static void litmus_dummy_task_exit(struct task_struct *task)
++{
++}
++
++static long litmus_dummy_complete_job(void)
++{
++ return -ENOSYS;
++}
++
++static long litmus_dummy_activate_plugin(void)
++{
++ return 0;
++}
++
++static long litmus_dummy_deactivate_plugin(void)
++{
++ return 0;
++}
++
++#ifdef CONFIG_FMLP
++
++static long litmus_dummy_inherit_priority(struct pi_semaphore *sem,
++ struct task_struct *new_owner)
++{
++ return -ENOSYS;
++}
++
++static long litmus_dummy_return_priority(struct pi_semaphore *sem)
++{
++ return -ENOSYS;
++}
++
++static long litmus_dummy_pi_block(struct pi_semaphore *sem,
++ struct task_struct *new_waiter)
++{
++ return -ENOSYS;
++}
++
++#endif
++
++
++/* The default scheduler plugin. It doesn't do anything and lets Linux do its
++ * job.
++ */
++struct sched_plugin linux_sched_plugin = {
++ .plugin_name = "Linux",
++ .tick = litmus_dummy_tick,
++ .task_new = litmus_dummy_task_new,
++ .task_exit = litmus_dummy_task_exit,
++ .task_wake_up = litmus_dummy_task_wake_up,
++ .task_block = litmus_dummy_task_block,
++ .complete_job = litmus_dummy_complete_job,
++ .schedule = litmus_dummy_schedule,
++ .finish_switch = litmus_dummy_finish_switch,
++ .activate_plugin = litmus_dummy_activate_plugin,
++ .deactivate_plugin = litmus_dummy_deactivate_plugin,
++#ifdef CONFIG_FMLP
++ .inherit_priority = litmus_dummy_inherit_priority,
++ .return_priority = litmus_dummy_return_priority,
++ .pi_block = litmus_dummy_pi_block,
++#endif
++ .admit_task = litmus_dummy_admit_task
++};
++
++/*
++ * The cluster size is needed in C-EDF: it makes sense only to cluster
++ * around L2 or L3 caches, so with cluster_cache_index = 2 (the default) we
++ * cluster all CPUs that share an L2 cache, while with cluster_cache_index = 3
++ * we cluster all CPUs that share an L3 cache.
++ */
++int cluster_cache_index = 2;
++
++/*
++ * The reference to the current plugin that is used to schedule tasks within
++ * the system. It stores references to the actual function implementations
++ * and should be initialized by calling "init_***_plugin()".
++ */
++struct sched_plugin *litmus = &linux_sched_plugin;
++
++/* the list of registered scheduling plugins */
++static LIST_HEAD(sched_plugins);
++static DEFINE_RAW_SPINLOCK(sched_plugins_lock);
++
++#define CHECK(func) {\
++ if (!plugin->func) \
++ plugin->func = litmus_dummy_ ## func;}
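++
++/* For example, a plugin that leaves ->finish_switch or ->complete_job NULL
++ * gets the corresponding litmus_dummy_* implementation substituted by
++ * register_sched_plugin(), so the core never has to test these callbacks
++ * for NULL before invoking them.
++ */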
++
++/* FIXME: get reference to module */
++int register_sched_plugin(struct sched_plugin* plugin)
++{
++ printk(KERN_INFO "Registering LITMUS^RT plugin %s.\n",
++ plugin->plugin_name);
++
++ /* make sure we don't trip over null pointers later */
++ CHECK(finish_switch);
++ CHECK(schedule);
++ CHECK(tick);
++ CHECK(task_wake_up);
++ CHECK(task_exit);
++ CHECK(task_block);
++ CHECK(task_new);
++ CHECK(complete_job);
++ CHECK(activate_plugin);
++ CHECK(deactivate_plugin);
++#ifdef CONFIG_FMLP
++ CHECK(inherit_priority);
++ CHECK(return_priority);
++ CHECK(pi_block);
++#endif
++ CHECK(admit_task);
++
++ if (!plugin->release_at)
++ plugin->release_at = release_at;
++
++ raw_spin_lock(&sched_plugins_lock);
++ list_add(&plugin->list, &sched_plugins);
++ raw_spin_unlock(&sched_plugins_lock);
++
++ return 0;
++}
++
++
++/* FIXME: reference counting, etc. */
++struct sched_plugin* find_sched_plugin(const char* name)
++{
++ struct list_head *pos;
++ struct sched_plugin *plugin;
++
++ raw_spin_lock(&sched_plugins_lock);
++ list_for_each(pos, &sched_plugins) {
++ plugin = list_entry(pos, struct sched_plugin, list);
++ if (!strcmp(plugin->plugin_name, name))
++ goto out_unlock;
++ }
++ plugin = NULL;
++
++out_unlock:
++ raw_spin_unlock(&sched_plugins_lock);
++ return plugin;
++}
++
++int print_sched_plugins(char* buf, int max)
++{
++ int count = 0;
++ struct list_head *pos;
++ struct sched_plugin *plugin;
++
++ raw_spin_lock(&sched_plugins_lock);
++ list_for_each(pos, &sched_plugins) {
++ plugin = list_entry(pos, struct sched_plugin, list);
++ count += snprintf(buf + count, max - count, "%s\n", plugin->plugin_name);
++ if (max - count <= 0)
++ break;
++ }
++ raw_spin_unlock(&sched_plugins_lock);
++ return count;
++}
+diff --git a/litmus/sched_psn_edf.c b/litmus/sched_psn_edf.c
+new file mode 100644
+index 0000000..73f6473
+--- /dev/null
++++ b/litmus/sched_psn_edf.c
+@@ -0,0 +1,482 @@
++/*
++ * kernel/sched_psn_edf.c
++ *
++ * Implementation of the PSN-EDF scheduler plugin.
++ * Based on kern/sched_part_edf.c and kern/sched_gsn_edf.c.
++ *
++ * Suspensions and non-preemptable sections are supported.
++ * Priority inheritance is not supported.
++ */
++
++#include
++#include
++#include
++#include
++
++#include
++
++#include
++#include
++#include
++#include
++
++
++typedef struct {
++ rt_domain_t domain;
++ int cpu;
++ struct task_struct* scheduled; /* only RT tasks */
++/*
++ * scheduling lock slock
++ * protects the domain and serializes scheduling decisions
++ */
++#define slock domain.ready_lock
++
++} psnedf_domain_t;
++
++DEFINE_PER_CPU(psnedf_domain_t, psnedf_domains);
++
++#define local_edf (&__get_cpu_var(psnedf_domains).domain)
++#define local_pedf (&__get_cpu_var(psnedf_domains))
++#define remote_edf(cpu) (&per_cpu(psnedf_domains, cpu).domain)
++#define remote_pedf(cpu) (&per_cpu(psnedf_domains, cpu))
++#define task_edf(task) remote_edf(get_partition(task))
++#define task_pedf(task) remote_pedf(get_partition(task))
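++
++/* For illustration: task_edf(t) resolves to the rt_domain_t of the CPU that
++ * t is partitioned onto (get_partition(t)), so all queue operations for a
++ * task stay within its own partition's domain and are serialized by that
++ * partition's slock.
++ */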
++
++
++static void psnedf_domain_init(psnedf_domain_t* pedf,
++ check_resched_needed_t check,
++ release_jobs_t release,
++ int cpu)
++{
++ edf_domain_init(&pedf->domain, check, release);
++ pedf->cpu = cpu;
++ pedf->scheduled = NULL;
++}
++
++static void requeue(struct task_struct* t, rt_domain_t *edf)
++{
++ if (t->state != TASK_RUNNING)
++ TRACE_TASK(t, "requeue: !TASK_RUNNING\n");
++
++ set_rt_flags(t, RT_F_RUNNING);
++ if (is_released(t, litmus_clock()))
++ __add_ready(edf, t);
++ else
++ add_release(edf, t); /* it has got to wait */
++}
++
++/* we assume the lock is being held */
++static void preempt(psnedf_domain_t *pedf)
++{
++ preempt_if_preemptable(pedf->scheduled, pedf->cpu);
++}
++
++/* This check is trivial in partitioned systems as we only have to consider
++ * the CPU of the partition.
++ */
++static int psnedf_check_resched(rt_domain_t *edf)
++{
++ psnedf_domain_t *pedf = container_of(edf, psnedf_domain_t, domain);
++
++ /* because this is a callback from rt_domain_t we already hold
++ * the necessary lock for the ready queue
++ */
++ if (edf_preemption_needed(edf, pedf->scheduled)) {
++ preempt(pedf);
++ return 1;
++ } else
++ return 0;
++}
++
++static void job_completion(struct task_struct* t, int forced)
++{
++ sched_trace_task_completion(t,forced);
++ TRACE_TASK(t, "job_completion().\n");
++
++ set_rt_flags(t, RT_F_SLEEP);
++ prepare_for_next_period(t);
++}
++
++static void psnedf_tick(struct task_struct *t)
++{
++ psnedf_domain_t *pedf = local_pedf;
++
++ /* Check for inconsistency. We don't need the lock for this since
++ * ->scheduled is only changed in schedule, which obviously is not
++ * executing in parallel on this CPU
++ */
++ BUG_ON(is_realtime(t) && t != pedf->scheduled);
++
++ if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) {
++ if (!is_np(t)) {
++ set_tsk_need_resched(t);
++ TRACE("psnedf_scheduler_tick: "
++ "%d is preemptable "
++ " => FORCE_RESCHED\n", t->pid);
++ } else if (is_user_np(t)) {
++ TRACE("psnedf_scheduler_tick: "
++ "%d is non-preemptable, "
++ "preemption delayed.\n", t->pid);
++ request_exit_np(t);
++ }
++ }
++}
++
++static struct task_struct* psnedf_schedule(struct task_struct * prev)
++{
++ psnedf_domain_t* pedf = local_pedf;
++ rt_domain_t* edf = &pedf->domain;
++ struct task_struct* next;
++
++ int out_of_time, sleep, preempt,
++ np, exists, blocks, resched;
++
++ raw_spin_lock(&pedf->slock);
++
++	/* sanity checking
++	 * unlike gedf, when a task exits (dies),
++	 * pedf->scheduled may be NULL while prev _is_ a real-time task
++	 */
++ BUG_ON(pedf->scheduled && pedf->scheduled != prev);
++ BUG_ON(pedf->scheduled && !is_realtime(prev));
++
++ /* (0) Determine state */
++ exists = pedf->scheduled != NULL;
++ blocks = exists && !is_running(pedf->scheduled);
++ out_of_time = exists &&
++ budget_enforced(pedf->scheduled) &&
++ budget_exhausted(pedf->scheduled);
++ np = exists && is_np(pedf->scheduled);
++ sleep = exists && get_rt_flags(pedf->scheduled) == RT_F_SLEEP;
++ preempt = edf_preemption_needed(edf, prev);
++
++ /* If we need to preempt do so.
++ * The following checks set resched to 1 in case of special
++ * circumstances.
++ */
++ resched = preempt;
++
++ /* If a task blocks we have no choice but to reschedule.
++ */
++ if (blocks)
++ resched = 1;
++
++ /* Request a sys_exit_np() call if we would like to preempt but cannot.
++ * Multiple calls to request_exit_np() don't hurt.
++ */
++ if (np && (out_of_time || preempt || sleep))
++ request_exit_np(pedf->scheduled);
++
++ /* Any task that is preemptable and either exhausts its execution
++ * budget or wants to sleep completes. We may have to reschedule after
++ * this.
++ */
++ if (!np && (out_of_time || sleep) && !blocks) {
++ job_completion(pedf->scheduled, !sleep);
++ resched = 1;
++ }
++
++ /* The final scheduling decision. Do we need to switch for some reason?
++ * Switch if we are in RT mode and have no task or if we need to
++ * resched.
++ */
++ next = NULL;
++ if ((!np || blocks) && (resched || !exists)) {
++ /* When preempting a task that does not block, then
++ * re-insert it into either the ready queue or the
++ * release queue (if it completed). requeue() picks
++ * the appropriate queue.
++ */
++ if (pedf->scheduled && !blocks)
++ requeue(pedf->scheduled, edf);
++ next = __take_ready(edf);
++ } else
++ /* Only override Linux scheduler if we have a real-time task
++ * scheduled that needs to continue.
++ */
++ if (exists)
++ next = prev;
++
++ if (next) {
++ TRACE_TASK(next, "scheduled at %llu\n", litmus_clock());
++ set_rt_flags(next, RT_F_RUNNING);
++ } else {
++ TRACE("becoming idle at %llu\n", litmus_clock());
++ }
++
++ pedf->scheduled = next;
++ raw_spin_unlock(&pedf->slock);
++
++ return next;
++}
++
++
++/* Prepare a task for running in RT mode
++ */
++static void psnedf_task_new(struct task_struct * t, int on_rq, int running)
++{
++ rt_domain_t* edf = task_edf(t);
++ psnedf_domain_t* pedf = task_pedf(t);
++ unsigned long flags;
++
++ TRACE_TASK(t, "psn edf: task new, cpu = %d\n",
++ t->rt_param.task_params.cpu);
++
++ /* setup job parameters */
++ release_at(t, litmus_clock());
++
++ /* The task should be running in the queue, otherwise signal
++ * code will try to wake it up with fatal consequences.
++ */
++ raw_spin_lock_irqsave(&pedf->slock, flags);
++ if (running) {
++ /* there shouldn't be anything else running at the time */
++ BUG_ON(pedf->scheduled);
++ pedf->scheduled = t;
++ } else {
++ requeue(t, edf);
++ /* maybe we have to reschedule */
++ preempt(pedf);
++ }
++ raw_spin_unlock_irqrestore(&pedf->slock, flags);
++}
++
++static void psnedf_task_wake_up(struct task_struct *task)
++{
++ unsigned long flags;
++ psnedf_domain_t* pedf = task_pedf(task);
++ rt_domain_t* edf = task_edf(task);
++ lt_t now;
++
++ TRACE_TASK(task, "wake_up at %llu\n", litmus_clock());
++ raw_spin_lock_irqsave(&pedf->slock, flags);
++ BUG_ON(is_queued(task));
++ /* We need to take suspensions because of semaphores into
++ * account! If a job resumes after being suspended due to acquiring
++ * a semaphore, it should never be treated as a new job release.
++ *
++ * FIXME: This should be done in some more predictable and userspace-controlled way.
++ */
++ now = litmus_clock();
++ if (is_tardy(task, now) &&
++ get_rt_flags(task) != RT_F_EXIT_SEM) {
++ /* new sporadic release */
++ release_at(task, now);
++ sched_trace_task_release(task);
++ }
++
++ /* Only add to ready queue if it is not the currently-scheduled
++ * task. This could be the case if a task was woken up concurrently
++ * on a remote CPU before the executing CPU got around to actually
++ * de-scheduling the task, i.e., wake_up() raced with schedule()
++ * and won.
++ */
++ if (pedf->scheduled != task)
++ requeue(task, edf);
++
++ raw_spin_unlock_irqrestore(&pedf->slock, flags);
++ TRACE_TASK(task, "wake up done\n");
++}
++
++static void psnedf_task_block(struct task_struct *t)
++{
++ /* only running tasks can block, thus t is in no queue */
++ TRACE_TASK(t, "block at %llu, state=%d\n", litmus_clock(), t->state);
++
++ BUG_ON(!is_realtime(t));
++ BUG_ON(is_queued(t));
++}
++
++static void psnedf_task_exit(struct task_struct * t)
++{
++ unsigned long flags;
++ psnedf_domain_t* pedf = task_pedf(t);
++ rt_domain_t* edf;
++
++ raw_spin_lock_irqsave(&pedf->slock, flags);
++ if (is_queued(t)) {
++ /* dequeue */
++ edf = task_edf(t);
++ remove(edf, t);
++ }
++ if (pedf->scheduled == t)
++ pedf->scheduled = NULL;
++
++ TRACE_TASK(t, "RIP, now reschedule\n");
++
++ preempt(pedf);
++ raw_spin_unlock_irqrestore(&pedf->slock, flags);
++}
++
++#ifdef CONFIG_FMLP
++static long psnedf_pi_block(struct pi_semaphore *sem,
++ struct task_struct *new_waiter)
++{
++ psnedf_domain_t* pedf;
++ rt_domain_t* edf;
++ struct task_struct* t;
++ int cpu = get_partition(new_waiter);
++
++ BUG_ON(!new_waiter);
++
++ if (edf_higher_prio(new_waiter, sem->hp.cpu_task[cpu])) {
++ TRACE_TASK(new_waiter, " boosts priority\n");
++ pedf = task_pedf(new_waiter);
++ edf = task_edf(new_waiter);
++
++ /* interrupts already disabled */
++ raw_spin_lock(&pedf->slock);
++
++ /* store new highest-priority task */
++ sem->hp.cpu_task[cpu] = new_waiter;
++ if (sem->holder &&
++ get_partition(sem->holder) == get_partition(new_waiter)) {
++ /* let holder inherit */
++ sem->holder->rt_param.inh_task = new_waiter;
++ t = sem->holder;
++ if (is_queued(t)) {
++ /* queued in domain*/
++ remove(edf, t);
++ /* readd to make priority change take place */
++ /* FIXME: this looks outdated */
++ if (is_released(t, litmus_clock()))
++ __add_ready(edf, t);
++ else
++ add_release(edf, t);
++ }
++ }
++
++ /* check if we need to reschedule */
++ if (edf_preemption_needed(edf, current))
++ preempt(pedf);
++
++ raw_spin_unlock(&pedf->slock);
++ }
++
++ return 0;
++}
++
++static long psnedf_inherit_priority(struct pi_semaphore *sem,
++ struct task_struct *new_owner)
++{
++ int cpu = get_partition(new_owner);
++
++ new_owner->rt_param.inh_task = sem->hp.cpu_task[cpu];
++ if (sem->hp.cpu_task[cpu] && new_owner != sem->hp.cpu_task[cpu]) {
++ TRACE_TASK(new_owner,
++ "inherited priority from %s/%d\n",
++ sem->hp.cpu_task[cpu]->comm,
++ sem->hp.cpu_task[cpu]->pid);
++ } else
++ TRACE_TASK(new_owner,
++ "cannot inherit priority: "
++ "no higher priority job waits on this CPU!\n");
++ /* make new owner non-preemptable as required by FMLP under
++ * PSN-EDF.
++ */
++ make_np(new_owner);
++ return 0;
++}
++
++
++/* This function is called on a semaphore release, and assumes that
++ * the current task is also the semaphore holder.
++ */
++static long psnedf_return_priority(struct pi_semaphore *sem)
++{
++ struct task_struct* t = current;
++ psnedf_domain_t* pedf = task_pedf(t);
++ rt_domain_t* edf = task_edf(t);
++ int ret = 0;
++ int cpu = get_partition(current);
++ int still_np;
++
++
++ /* Find new highest-priority semaphore task
++ * if holder task is the current hp.cpu_task[cpu].
++ *
++ * Calling function holds sem->wait.lock.
++ */
++ if (t == sem->hp.cpu_task[cpu])
++ edf_set_hp_cpu_task(sem, cpu);
++
++ still_np = take_np(current);
++
++ /* Since we don't nest resources, this
++ * should always be zero */
++ BUG_ON(still_np);
++
++ if (current->rt_param.inh_task) {
++ TRACE_CUR("return priority of %s/%d\n",
++ current->rt_param.inh_task->comm,
++ current->rt_param.inh_task->pid);
++ } else
++ TRACE_CUR(" no priority to return %p\n", sem);
++
++
++ /* Always check for delayed preemptions that might have become
++ * necessary due to non-preemptive execution.
++ */
++ raw_spin_lock(&pedf->slock);
++
++ /* Reset inh_task to NULL. */
++ current->rt_param.inh_task = NULL;
++
++ /* check if we need to reschedule */
++ if (edf_preemption_needed(edf, current))
++ preempt(pedf);
++
++ raw_spin_unlock(&pedf->slock);
++
++
++ return ret;
++}
++
++#endif
++
++static long psnedf_admit_task(struct task_struct* tsk)
++{
++ return task_cpu(tsk) == tsk->rt_param.task_params.cpu ? 0 : -EINVAL;
++}
++
++/* Plugin object */
++static struct sched_plugin psn_edf_plugin __cacheline_aligned_in_smp = {
++ .plugin_name = "PSN-EDF",
++#ifdef CONFIG_SRP
++ .srp_active = 1,
++#endif
++ .tick = psnedf_tick,
++ .task_new = psnedf_task_new,
++ .complete_job = complete_job,
++ .task_exit = psnedf_task_exit,
++ .schedule = psnedf_schedule,
++ .task_wake_up = psnedf_task_wake_up,
++ .task_block = psnedf_task_block,
++#ifdef CONFIG_FMLP
++ .fmlp_active = 1,
++ .pi_block = psnedf_pi_block,
++ .inherit_priority = psnedf_inherit_priority,
++ .return_priority = psnedf_return_priority,
++#endif
++ .admit_task = psnedf_admit_task
++};
++
++
++static int __init init_psn_edf(void)
++{
++ int i;
++
++	/* We do not really want to support CPU hotplug, do we? ;)
++	 * However, if we were ever crazy enough to do so,
++	 * we could not use num_online_cpus() here.
++	 */
++ for (i = 0; i < num_online_cpus(); i++) {
++ psnedf_domain_init(remote_pedf(i),
++ psnedf_check_resched,
++ NULL, i);
++ }
++ return register_sched_plugin(&psn_edf_plugin);
++}
++
++module_init(init_psn_edf);
++
+diff --git a/litmus/sched_task_trace.c b/litmus/sched_task_trace.c
+new file mode 100644
+index 0000000..39a543e
+--- /dev/null
++++ b/litmus/sched_task_trace.c
+@@ -0,0 +1,204 @@
++/*
++ * sched_task_trace.c -- record scheduling events to a byte stream
++ */
++
++#define NO_TASK_TRACE_DECLS
++
++#include
++#include
++#include
++
++#include
++#include
++
++#include
++#include
++#include
++
++
++/* set MAJOR to 0 to have it dynamically assigned */
++#define FT_TASK_TRACE_MAJOR 253
++#define NO_EVENTS 4096 /* this is a buffer of 12 4k pages per CPU */
++
++#define now() litmus_clock()
++
++struct local_buffer {
++ struct st_event_record record[NO_EVENTS];
++ char flag[NO_EVENTS];
++ struct ft_buffer ftbuf;
++};
++
++DEFINE_PER_CPU(struct local_buffer, st_event_buffer);
++
++static struct ftdev st_dev;
++
++static int st_dev_can_open(struct ftdev *dev, unsigned int cpu)
++{
++ return cpu_online(cpu) ? 0 : -ENODEV;
++}
++
++static int __init init_sched_task_trace(void)
++{
++ struct local_buffer* buf;
++ int i, ok = 0;
++ ftdev_init(&st_dev, THIS_MODULE);
++ for (i = 0; i < NR_CPUS; i++) {
++ buf = &per_cpu(st_event_buffer, i);
++ ok += init_ft_buffer(&buf->ftbuf, NO_EVENTS,
++ sizeof(struct st_event_record),
++ buf->flag,
++ buf->record);
++ st_dev.minor[i].buf = &buf->ftbuf;
++ }
++ if (ok == NR_CPUS) {
++ st_dev.minor_cnt = NR_CPUS;
++ st_dev.can_open = st_dev_can_open;
++ return register_ftdev(&st_dev, "sched_trace", FT_TASK_TRACE_MAJOR);
++ } else {
++ return -EINVAL;
++ }
++}
++
++module_init(init_sched_task_trace);
++
++
++static inline struct st_event_record* get_record(u8 type, struct task_struct* t)
++{
++ struct st_event_record* rec = NULL;
++ struct local_buffer* buf;
++
++ buf = &get_cpu_var(st_event_buffer);
++ if (ft_buffer_start_write(&buf->ftbuf, (void**) &rec)) {
++ rec->hdr.type = type;
++ rec->hdr.cpu = smp_processor_id();
++ rec->hdr.pid = t ? t->pid : 0;
++ rec->hdr.job = t ? t->rt_param.job_params.job_no : 0;
++ } else {
++ put_cpu_var(st_event_buffer);
++ }
++ /* rec will be NULL if it failed */
++ return rec;
++}
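++
++/* Usage note: a successful get_record() leaves preemption disabled via
++ * get_cpu_var(); it is re-enabled only by the matching put_record() (or by
++ * get_record() itself on failure). Callers must therefore call put_record()
++ * promptly after filling in the record, as the callbacks below do.
++ */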
++
++static inline void put_record(struct st_event_record* rec)
++{
++ struct local_buffer* buf;
++ buf = &__get_cpu_var(st_event_buffer);
++ ft_buffer_finish_write(&buf->ftbuf, rec);
++ put_cpu_var(st_event_buffer);
++}
++
++feather_callback void do_sched_trace_task_name(unsigned long id, unsigned long _task)
++{
++ struct task_struct *t = (struct task_struct*) _task;
++ struct st_event_record* rec = get_record(ST_NAME, t);
++ int i;
++ if (rec) {
++ for (i = 0; i < min(TASK_COMM_LEN, ST_NAME_LEN); i++)
++ rec->data.name.cmd[i] = t->comm[i];
++ put_record(rec);
++ }
++}
++
++feather_callback void do_sched_trace_task_param(unsigned long id, unsigned long _task)
++{
++ struct task_struct *t = (struct task_struct*) _task;
++ struct st_event_record* rec = get_record(ST_PARAM, t);
++ if (rec) {
++ rec->data.param.wcet = get_exec_cost(t);
++ rec->data.param.period = get_rt_period(t);
++ rec->data.param.phase = get_rt_phase(t);
++ rec->data.param.partition = get_partition(t);
++ put_record(rec);
++ }
++}
++
++feather_callback void do_sched_trace_task_release(unsigned long id, unsigned long _task)
++{
++ struct task_struct *t = (struct task_struct*) _task;
++ struct st_event_record* rec = get_record(ST_RELEASE, t);
++ if (rec) {
++ rec->data.release.release = get_release(t);
++ rec->data.release.deadline = get_deadline(t);
++ put_record(rec);
++ }
++}
++
++/* skipped: st_assigned_data, we don't use it atm */
++
++feather_callback void do_sched_trace_task_switch_to(unsigned long id,
++ unsigned long _task)
++{
++ struct task_struct *t = (struct task_struct*) _task;
++ struct st_event_record* rec;
++ if (is_realtime(t)) {
++ rec = get_record(ST_SWITCH_TO, t);
++ if (rec) {
++ rec->data.switch_to.when = now();
++ rec->data.switch_to.exec_time = get_exec_time(t);
++ put_record(rec);
++ }
++ }
++}
++
++feather_callback void do_sched_trace_task_switch_away(unsigned long id,
++ unsigned long _task)
++{
++ struct task_struct *t = (struct task_struct*) _task;
++ struct st_event_record* rec;
++ if (is_realtime(t)) {
++ rec = get_record(ST_SWITCH_AWAY, t);
++ if (rec) {
++ rec->data.switch_away.when = now();
++ rec->data.switch_away.exec_time = get_exec_time(t);
++ put_record(rec);
++ }
++ }
++}
++
++feather_callback void do_sched_trace_task_completion(unsigned long id,
++ unsigned long _task,
++ unsigned long forced)
++{
++ struct task_struct *t = (struct task_struct*) _task;
++ struct st_event_record* rec = get_record(ST_COMPLETION, t);
++ if (rec) {
++ rec->data.completion.when = now();
++ rec->data.completion.forced = forced;
++ put_record(rec);
++ }
++}
++
++feather_callback void do_sched_trace_task_block(unsigned long id,
++ unsigned long _task)
++{
++ struct task_struct *t = (struct task_struct*) _task;
++ struct st_event_record* rec = get_record(ST_BLOCK, t);
++ if (rec) {
++ rec->data.block.when = now();
++ put_record(rec);
++ }
++}
++
++feather_callback void do_sched_trace_task_resume(unsigned long id,
++ unsigned long _task)
++{
++ struct task_struct *t = (struct task_struct*) _task;
++ struct st_event_record* rec = get_record(ST_RESUME, t);
++ if (rec) {
++ rec->data.resume.when = now();
++ put_record(rec);
++ }
++}
++
++feather_callback void do_sched_trace_sys_release(unsigned long id,
++ unsigned long _start)
++{
++ lt_t *start = (lt_t*) _start;
++ struct st_event_record* rec = get_record(ST_SYS_RELEASE, NULL);
++ if (rec) {
++ rec->data.sys_release.when = now();
++ rec->data.sys_release.release = *start;
++ put_record(rec);
++ }
++}
+diff --git a/litmus/sched_trace.c b/litmus/sched_trace.c
+new file mode 100644
+index 0000000..1fa2094
+--- /dev/null
++++ b/litmus/sched_trace.c
+@@ -0,0 +1,378 @@
++/*
++ * sched_trace.c -- record scheduling events to a byte stream.
++ */
++#include
++#include
++
++#include
++#include
++#include
++#include
++#include
++#include
++
++#include
++
++#include
++#include
++
++#define SCHED_TRACE_NAME "litmus/log"
++
++/* Allocate a buffer of about 32k per CPU */
++#define LITMUS_TRACE_BUF_PAGES 8
++#define LITMUS_TRACE_BUF_SIZE (PAGE_SIZE * LITMUS_TRACE_BUF_PAGES * NR_CPUS)
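++
++/* For illustration (hypothetical configuration): with 4 KiB pages and
++ * NR_CPUS = 4, LITMUS_TRACE_BUF_SIZE evaluates to 8 * 4096 * 4 = 128 KiB,
++ * which is already a power of two and hence unaffected by the rounding
++ * performed by kfifo_alloc() (see rb_alloc_buf() below).
++ */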
++
++/* Max length of one read from the buffer */
++#define MAX_READ_LEN (64 * 1024)
++
++/* Max length for one write --- from kernel --- to the buffer */
++#define MSG_SIZE 255
++
++/* Inner ring buffer structure */
++typedef struct {
++ rwlock_t del_lock;
++
++ /* the buffer */
++ struct kfifo kfifo;
++} ring_buffer_t;
++
++/* Main buffer structure */
++typedef struct {
++ ring_buffer_t buf;
++ atomic_t reader_cnt;
++ struct semaphore reader_mutex;
++} trace_buffer_t;
++
++
++/*
++ * Inner buffer management functions
++ */
++void rb_init(ring_buffer_t* buf)
++{
++ rwlock_init(&buf->del_lock);
++}
++
++int rb_alloc_buf(ring_buffer_t* buf, unsigned int size)
++{
++ unsigned long flags;
++ int ret = 0;
++
++ write_lock_irqsave(&buf->del_lock, flags);
++
++	/* The kfifo size must be a power of 2;
++	 * at the moment, kfifo_alloc() automatically rounds the size accordingly.
++	 */
++ ret = kfifo_alloc(&buf->kfifo, size, GFP_ATOMIC);
++
++ write_unlock_irqrestore(&buf->del_lock, flags);
++
++ if(ret < 0)
++ printk(KERN_ERR "kfifo_alloc failed\n");
++
++ return ret;
++}
++
++int rb_free_buf(ring_buffer_t* buf)
++{
++ unsigned long flags;
++
++ write_lock_irqsave(&buf->del_lock, flags);
++
++ BUG_ON(!kfifo_initialized(&buf->kfifo));
++ kfifo_free(&buf->kfifo);
++
++ write_unlock_irqrestore(&buf->del_lock, flags);
++
++ return 0;
++}
++
++/*
++ * Assumption: concurrent writes are serialized externally
++ *
++ * Will only succeed if there is enough space for all len bytes.
++ */
++int rb_put(ring_buffer_t* buf, char* mem, size_t len)
++{
++ unsigned long flags;
++ int error = 0;
++
++ read_lock_irqsave(&buf->del_lock, flags);
++
++ if (!kfifo_initialized(&buf->kfifo)) {
++ error = -ENODEV;
++ goto out;
++ }
++
++ if((kfifo_in(&buf->kfifo, mem, len)) < len) {
++ error = -ENOMEM;
++ goto out;
++ }
++
++ out:
++ read_unlock_irqrestore(&buf->del_lock, flags);
++ return error;
++}
++
++/* Assumption: concurrent reads are serialized externally */
++int rb_get(ring_buffer_t* buf, char* mem, size_t len)
++{
++ unsigned long flags;
++ int error = 0;
++
++ read_lock_irqsave(&buf->del_lock, flags);
++ if (!kfifo_initialized(&buf->kfifo)) {
++ error = -ENODEV;
++ goto out;
++ }
++
++ error = kfifo_out(&buf->kfifo, (unsigned char*)mem, len);
++
++ out:
++ read_unlock_irqrestore(&buf->del_lock, flags);
++ return error;
++}
++
++/*
++ * Device Driver management
++ */
++static DEFINE_RAW_SPINLOCK(log_buffer_lock);
++static trace_buffer_t log_buffer;
++
++static void init_log_buffer(void)
++{
++ rb_init(&log_buffer.buf);
++ atomic_set(&log_buffer.reader_cnt,0);
++ init_MUTEX(&log_buffer.reader_mutex);
++}
++
++static DEFINE_PER_CPU(char[MSG_SIZE], fmt_buffer);
++
++/*
++ * sched_trace_log_message - Write to the trace buffer (log_buffer)
++ *
++ * This is the only function accessing the log_buffer from inside the
++ * kernel for writing.
++ * Concurrent access to sched_trace_log_message must be serialized using
++ * log_buffer_lock.
++ * The maximum length of a formatted message is MSG_SIZE (255).
++ */
++void sched_trace_log_message(const char* fmt, ...)
++{
++ unsigned long flags;
++ va_list args;
++ size_t len;
++ char* buf;
++
++ va_start(args, fmt);
++ local_irq_save(flags);
++
++ /* format message */
++ buf = __get_cpu_var(fmt_buffer);
++ len = vscnprintf(buf, MSG_SIZE, fmt, args);
++
++ raw_spin_lock(&log_buffer_lock);
++ /* Don't copy the trailing null byte, we don't want null bytes
++ * in a text file.
++ */
++ rb_put(&log_buffer.buf, buf, len);
++ raw_spin_unlock(&log_buffer_lock);
++
++ local_irq_restore(flags);
++ va_end(args);
++}
++
++/*
++ * log_read - Read the trace buffer
++ *
++ * This function is called as a file operation from userspace.
++ * Readers can sleep. Access is serialized through reader_mutex
++ */
++static ssize_t log_read(struct file *filp, char __user *to, size_t len,
++ loff_t *f_pos)
++{
++ /* we ignore f_pos, this is strictly sequential */
++
++ ssize_t error = -EINVAL;
++ char* mem;
++ trace_buffer_t *tbuf = filp->private_data;
++
++ if (down_interruptible(&tbuf->reader_mutex)) {
++ error = -ERESTARTSYS;
++ goto out;
++ }
++
++ if (len > MAX_READ_LEN)
++ len = MAX_READ_LEN;
++
++ mem = kmalloc(len, GFP_KERNEL);
++ if (!mem) {
++ error = -ENOMEM;
++ goto out_unlock;
++ }
++
++ error = rb_get(&tbuf->buf, mem, len);
++ while (!error) {
++ set_current_state(TASK_INTERRUPTIBLE);
++ schedule_timeout(110);
++ if (signal_pending(current))
++ error = -ERESTARTSYS;
++ else
++ error = rb_get(&tbuf->buf, mem, len);
++ }
++
++ if (error > 0 && copy_to_user(to, mem, error))
++ error = -EFAULT;
++
++ kfree(mem);
++ out_unlock:
++ up(&tbuf->reader_mutex);
++ out:
++ return error;
++}
++
++/*
++ * Enable redirection of printk() messages to the trace buffer.
++ * Defined in kernel/printk.c
++ */
++extern int trace_override;
++extern int trace_recurse;
++
++/*
++ * log_open - open the global log message ring buffer.
++ */
++static int log_open(struct inode *in, struct file *filp)
++{
++ int error = -EINVAL;
++ trace_buffer_t* tbuf;
++
++ tbuf = &log_buffer;
++
++ if (down_interruptible(&tbuf->reader_mutex)) {
++ error = -ERESTARTSYS;
++ goto out;
++ }
++
++ /* first open must allocate buffers */
++ if (atomic_inc_return(&tbuf->reader_cnt) == 1) {
++ if ((error = rb_alloc_buf(&tbuf->buf, LITMUS_TRACE_BUF_SIZE)))
++ {
++ atomic_dec(&tbuf->reader_cnt);
++ goto out_unlock;
++ }
++ }
++
++ error = 0;
++ filp->private_data = tbuf;
++
++ printk(KERN_DEBUG
++ "sched_trace kfifo with buffer starting at: 0x%p\n",
++ (tbuf->buf.kfifo).buffer);
++
++ /* override printk() */
++ trace_override++;
++
++ out_unlock:
++ up(&tbuf->reader_mutex);
++ out:
++ return error;
++}
++
++static int log_release(struct inode *in, struct file *filp)
++{
++ int error = -EINVAL;
++ trace_buffer_t* tbuf = filp->private_data;
++
++ BUG_ON(!filp->private_data);
++
++ if (down_interruptible(&tbuf->reader_mutex)) {
++ error = -ERESTARTSYS;
++ goto out;
++ }
++
++ /* last release must deallocate buffers */
++ if (atomic_dec_return(&tbuf->reader_cnt) == 0) {
++ error = rb_free_buf(&tbuf->buf);
++ }
++
++ /* release printk() overriding */
++ trace_override--;
++
++ printk(KERN_DEBUG "sched_trace kfifo released\n");
++
++ up(&tbuf->reader_mutex);
++ out:
++ return error;
++}
++
++/*
++ * log_fops - The file operations for accessing the global LITMUS log message
++ * buffer.
++ *
++ * Except for opening the device file it uses the same operations as trace_fops.
++ */
++static struct file_operations log_fops = {
++ .owner = THIS_MODULE,
++ .open = log_open,
++ .release = log_release,
++ .read = log_read,
++};
++
++static struct miscdevice litmus_log_dev = {
++ .name = SCHED_TRACE_NAME,
++ .minor = MISC_DYNAMIC_MINOR,
++ .fops = &log_fops,
++};
++
++#ifdef CONFIG_MAGIC_SYSRQ
++void dump_trace_buffer(int max)
++{
++ char line[80];
++ int len;
++ int count = 0;
++
++ /* potential, but very unlikely, race... */
++ trace_recurse = 1;
++ while ((max == 0 || count++ < max) &&
++ (len = rb_get(&log_buffer.buf, line, sizeof(line) - 1)) > 0) {
++ line[len] = '\0';
++ printk("%s", line);
++ }
++ trace_recurse = 0;
++}
++
++static void sysrq_dump_trace_buffer(int key, struct tty_struct *tty)
++{
++ dump_trace_buffer(100);
++}
++
++static struct sysrq_key_op sysrq_dump_trace_buffer_op = {
++ .handler = sysrq_dump_trace_buffer,
++ .help_msg = "dump-trace-buffer(Y)",
++ .action_msg = "writing content of TRACE() buffer",
++};
++#endif
++
++static int __init init_sched_trace(void)
++{
++ printk("Initializing TRACE() device\n");
++ init_log_buffer();
++
++#ifdef CONFIG_MAGIC_SYSRQ
++ /* offer some debugging help */
++ if (!register_sysrq_key('y', &sysrq_dump_trace_buffer_op))
++ printk("Registered dump-trace-buffer(Y) magic sysrq.\n");
++ else
++ printk("Could not register dump-trace-buffer(Y) magic sysrq.\n");
++#endif
++
++
++ return misc_register(&litmus_log_dev);
++}
++
++static void __exit exit_sched_trace(void)
++{
++ misc_deregister(&litmus_log_dev);
++}
++
++module_init(init_sched_trace);
++module_exit(exit_sched_trace);
+diff --git a/litmus/srp.c b/litmus/srp.c
+new file mode 100644
+index 0000000..71639b9
+--- /dev/null
++++ b/litmus/srp.c
+@@ -0,0 +1,318 @@
++/* ************************************************************************** */
++/* STACK RESOURCE POLICY */
++/* ************************************************************************** */
++
++#include
++#include
++#include
++#include
++
++#include
++
++#include
++
++
++#ifdef CONFIG_SRP
++
++struct srp_priority {
++ struct list_head list;
++ unsigned int period;
++ pid_t pid;
++};
++
++#define list2prio(l) list_entry(l, struct srp_priority, list)
++
++/* SRP task priority comparison function. Smaller periods have higher
++ * priority; ties are broken by PID. Special case: period == 0 <=> no priority.
++ */
++static int srp_higher_prio(struct srp_priority* first,
++ struct srp_priority* second)
++{
++ if (!first->period)
++ return 0;
++ else
++ return !second->period ||
++ first->period < second->period || (
++ first->period == second->period &&
++ first->pid < second->pid);
++}
++
++struct srp {
++ struct list_head ceiling;
++ wait_queue_head_t ceiling_blocked;
++};
++
++
++atomic_t srp_objects_in_use = ATOMIC_INIT(0);
++
++DEFINE_PER_CPU(struct srp, srp);
++
++
++/* Initialize SRP semaphores at boot time. */
++static int __init srp_init(void)
++{
++ int i;
++
++ printk("Initializing SRP per-CPU ceilings...");
++ for (i = 0; i < NR_CPUS; i++) {
++ init_waitqueue_head(&per_cpu(srp, i).ceiling_blocked);
++ INIT_LIST_HEAD(&per_cpu(srp, i).ceiling);
++ }
++ printk(" done!\n");
++
++ return 0;
++}
++module_init(srp_init);
++
++
++#define system_ceiling(srp) list2prio(srp->ceiling.next)
++
++
++#define UNDEF_SEM -2
++
++
++/* struct for uniprocessor SRP "semaphore" */
++struct srp_semaphore {
++ struct srp_priority ceiling;
++ struct task_struct* owner;
++ int cpu; /* cpu associated with this "semaphore" and resource */
++};
++
++#define ceiling2sem(c) container_of(c, struct srp_semaphore, ceiling)
++
++static int srp_exceeds_ceiling(struct task_struct* first,
++ struct srp* srp)
++{
++ return list_empty(&srp->ceiling) ||
++ get_rt_period(first) < system_ceiling(srp)->period ||
++ (get_rt_period(first) == system_ceiling(srp)->period &&
++ first->pid < system_ceiling(srp)->pid) ||
++ ceiling2sem(system_ceiling(srp))->owner == first;
++}
++
++static void srp_add_prio(struct srp* srp, struct srp_priority* prio)
++{
++ struct list_head *pos;
++ if (in_list(&prio->list)) {
++ printk(KERN_CRIT "WARNING: SRP violation detected, prio is already in "
++ "ceiling list! cpu=%d, srp=%p\n", smp_processor_id(), ceiling2sem(prio));
++ return;
++ }
++ list_for_each(pos, &srp->ceiling)
++ if (unlikely(srp_higher_prio(prio, list2prio(pos)))) {
++ __list_add(&prio->list, pos->prev, pos);
++ return;
++ }
++
++ list_add_tail(&prio->list, &srp->ceiling);
++}
++
++
++static void* create_srp_semaphore(void)
++{
++ struct srp_semaphore* sem;
++
++ sem = kmalloc(sizeof(*sem), GFP_KERNEL);
++ if (!sem)
++ return NULL;
++
++ INIT_LIST_HEAD(&sem->ceiling.list);
++ sem->ceiling.period = 0;
++ sem->cpu = UNDEF_SEM;
++ sem->owner = NULL;
++ atomic_inc(&srp_objects_in_use);
++ return sem;
++}
++
++static noinline int open_srp_semaphore(struct od_table_entry* entry, void* __user arg)
++{
++ struct srp_semaphore* sem = (struct srp_semaphore*) entry->obj->obj;
++ int ret = 0;
++ struct task_struct* t = current;
++ struct srp_priority t_prio;
++
++ TRACE("opening SRP semaphore %p, cpu=%d\n", sem, sem->cpu);
++ if (!srp_active())
++ return -EBUSY;
++
++ if (sem->cpu == UNDEF_SEM)
++ sem->cpu = get_partition(t);
++ else if (sem->cpu != get_partition(t))
++ ret = -EPERM;
++
++ if (ret == 0) {
++ t_prio.period = get_rt_period(t);
++ t_prio.pid = t->pid;
++ if (srp_higher_prio(&t_prio, &sem->ceiling)) {
++ sem->ceiling.period = t_prio.period;
++ sem->ceiling.pid = t_prio.pid;
++ }
++ }
++
++ return ret;
++}
++
++static void destroy_srp_semaphore(void* sem)
++{
++ /* XXX: invariants (e.g., that the semaphore is no longer owned) are not checked here */
++ atomic_dec(&srp_objects_in_use);
++ kfree(sem);
++}
++
++struct fdso_ops srp_sem_ops = {
++ .create = create_srp_semaphore,
++ .open = open_srp_semaphore,
++ .destroy = destroy_srp_semaphore
++};
++
++
++static void do_srp_down(struct srp_semaphore* sem)
++{
++ /* Update ceiling. */
++ srp_add_prio(&__get_cpu_var(srp), &sem->ceiling);
++ WARN_ON(sem->owner != NULL);
++ sem->owner = current;
++ TRACE_CUR("acquired srp 0x%p\n", sem);
++}
++
++static void do_srp_up(struct srp_semaphore* sem)
++{
++ /* Determine new system priority ceiling for this CPU. */
++ WARN_ON(!in_list(&sem->ceiling.list));
++ if (in_list(&sem->ceiling.list))
++ list_del(&sem->ceiling.list);
++
++ sem->owner = NULL;
++
++ /* Wake tasks on this CPU, if they exceed current ceiling. */
++ TRACE_CUR("released srp 0x%p\n", sem);
++ wake_up_all(&__get_cpu_var(srp).ceiling_blocked);
++}
++
++/* Adjust the system-wide priority ceiling if resource is claimed. */
++asmlinkage long sys_srp_down(int sem_od)
++{
++ int cpu;
++ int ret = -EINVAL;
++ struct srp_semaphore* sem;
++
++ /* Disabling preemption is sufficient protection since
++ * SRP is strictly per-CPU and we don't interfere with any
++ * interrupt handlers.
++ */
++ preempt_disable();
++ TS_SRP_DOWN_START;
++
++ cpu = smp_processor_id();
++ sem = lookup_srp_sem(sem_od);
++ if (sem && sem->cpu == cpu) {
++ do_srp_down(sem);
++ ret = 0;
++ }
++
++ TS_SRP_DOWN_END;
++ preempt_enable();
++ return ret;
++}
++
++/* Adjust the system-wide priority ceiling if resource is freed. */
++asmlinkage long sys_srp_up(int sem_od)
++{
++ int cpu;
++ int ret = -EINVAL;
++ struct srp_semaphore* sem;
++
++ preempt_disable();
++ TS_SRP_UP_START;
++
++ cpu = smp_processor_id();
++ sem = lookup_srp_sem(sem_od);
++
++ if (sem && sem->cpu == cpu) {
++ do_srp_up(sem);
++ ret = 0;
++ }
++
++ TS_SRP_UP_END;
++ preempt_enable();
++ return ret;
++}
++
++static int srp_wake_up(wait_queue_t *wait, unsigned mode, int sync,
++ void *key)
++{
++ int cpu = smp_processor_id();
++ struct task_struct *tsk = wait->private;
++ if (cpu != get_partition(tsk))
++ TRACE_TASK(tsk, "srp_wake_up on wrong cpu, partition is %d\n",
++ get_partition(tsk));
++ else if (srp_exceeds_ceiling(tsk, &__get_cpu_var(srp)))
++ return default_wake_function(wait, mode, sync, key);
++ return 0;
++}
++
++
++
++static void do_ceiling_block(struct task_struct *tsk)
++{
++ wait_queue_t wait = {
++ .private = tsk,
++ .func = srp_wake_up,
++ .task_list = {NULL, NULL}
++ };
++
++ tsk->state = TASK_UNINTERRUPTIBLE;
++ add_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait);
++ tsk->rt_param.srp_non_recurse = 1;
++ preempt_enable_no_resched();
++ schedule();
++ preempt_disable();
++ tsk->rt_param.srp_non_recurse = 0;
++ remove_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait);
++}
++
++/* Wait for current task priority to exceed system-wide priority ceiling.
++ */
++void srp_ceiling_block(void)
++{
++ struct task_struct *tsk = current;
++
++ /* Only applies to real-time tasks; the unlikely() hint assumes callers are typically real-time tasks. */
++ if (unlikely(!is_realtime(tsk)))
++ return;
++
++ /* Avoid recursive ceiling blocking. */
++ if (unlikely(tsk->rt_param.srp_non_recurse))
++ return;
++
++ /* Bail out early if there aren't any SRP resources around. */
++ if (likely(!atomic_read(&srp_objects_in_use)))
++ return;
++
++ preempt_disable();
++ if (!srp_exceeds_ceiling(tsk, &__get_cpu_var(srp))) {
++ TRACE_CUR("is priority ceiling blocked.\n");
++ while (!srp_exceeds_ceiling(tsk, &__get_cpu_var(srp)))
++ do_ceiling_block(tsk);
++ TRACE_CUR("finally exceeds system ceiling.\n");
++ } else
++ TRACE_CUR("is not priority ceiling blocked\n");
++ preempt_enable();
++}
++
++
++#else
++
++asmlinkage long sys_srp_down(int sem_od)
++{
++ return -ENOSYS;
++}
++
++asmlinkage long sys_srp_up(int sem_od)
++{
++ return -ENOSYS;
++}
++
++struct fdso_ops srp_sem_ops = {};
++
++#endif
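
To illustrate how the SRP object and syscalls above fit together, here is a hedged user-space sketch for a task running under a partitioned plugin. The header name litmus.h and the wrapper names open_srp_sem(), srp_down(), srp_up(), and od_close() are assumptions modeled on liblitmus; consult the liblitmus release bundled with 2010.2 for the exact API.

#include <litmus.h>	/* liblitmus user-space API (assumed header name) */

/* Access a resource protected by SRP semaphore #0. 'fd' is an open file
 * that serves as the shared-object namespace for the fdso layer.
 */
void access_resource(int fd)
{
	/* The semaphore binds to this task's partition on first open
	 * (sem->cpu is UNDEF_SEM until then); opening it from another
	 * partition fails with -EPERM.
	 */
	int od = open_srp_sem(fd, 0);

	srp_down(od);	/* sys_srp_down(): raise this CPU's priority ceiling */
	/* ... critical section ... */
	srp_up(od);	/* sys_srp_up(): restore the ceiling, wake blocked tasks */

	od_close(od);
}

The ceiling test itself is performed by srp_ceiling_block(), which blocks the calling job until its priority exceeds the current system ceiling on its partition.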
+diff --git a/litmus/sync.c b/litmus/sync.c
+new file mode 100644
+index 0000000..bf75fde
+--- /dev/null
++++ b/litmus/sync.c
+@@ -0,0 +1,104 @@
++/* litmus/sync.c - Support for synchronous and asynchronous task system
++ * releases.
++ */
++
++#include
++#include
++#include
++#include
++#include
++#include
++
++#include
++#include
++#include
++
++#include
++
++static DECLARE_COMPLETION(ts_release);
++
++static long do_wait_for_ts_release(void)
++{
++ long ret = 0;
++
++ /* If an interruption (signal) races with a release, the completion
++ * object may be left with a non-zero counter. To avoid this problem,
++ * the call below should be replaced by wait_for_completion();
++ * for debugging purposes, it remains interruptible for now.
++ */
++ ret = wait_for_completion_interruptible(&ts_release);
++
++ return ret;
++}
++
++int count_tasks_waiting_for_release(void)
++{
++ unsigned long flags;
++ int task_count = 0;
++ struct list_head *pos;
++
++ spin_lock_irqsave(&ts_release.wait.lock, flags);
++ list_for_each(pos, &ts_release.wait.task_list) {
++ task_count++;
++ }
++ spin_unlock_irqrestore(&ts_release.wait.lock, flags);
++
++ return task_count;
++}
++
++static long do_release_ts(lt_t start)
++{
++ int task_count = 0;
++ unsigned long flags;
++ struct list_head *pos;
++ struct task_struct *t;
++
++
++ spin_lock_irqsave(&ts_release.wait.lock, flags);
++ TRACE("<<<<<< synchronous task system release >>>>>>\n");
++
++ sched_trace_sys_release(&start);
++ list_for_each(pos, &ts_release.wait.task_list) {
++ t = (struct task_struct*) list_entry(pos,
++ struct __wait_queue,
++ task_list)->private;
++ task_count++;
++ litmus->release_at(t, start + t->rt_param.task_params.phase);
++ sched_trace_task_release(t);
++ }
++
++ spin_unlock_irqrestore(&ts_release.wait.lock, flags);
++
++ complete_n(&ts_release, task_count);
++
++ return task_count;
++}
++
++
++asmlinkage long sys_wait_for_ts_release(void)
++{
++ long ret = -EPERM;
++ struct task_struct *t = current;
++
++ if (is_realtime(t))
++ ret = do_wait_for_ts_release();
++
++ return ret;
++}
++
++
++asmlinkage long sys_release_ts(lt_t __user *__delay)
++{
++ long ret;
++ lt_t delay;
++
++ /* FIXME: check capabilities... */
++
++ ret = copy_from_user(&delay, __delay, sizeof(delay));
++ if (ret == 0)
++ ret = do_release_ts(litmus_clock() + delay);
++
++ return ret;
++}
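
The intended usage of the two syscalls above is that each real-time task blocks in sys_wait_for_ts_release() once it is ready, and a separate launcher triggers sys_release_ts() with a relative delay; every waiting task is then released at the common start time plus its phase. A minimal sketch, assuming the liblitmus wrappers are named wait_for_ts_release() and release_ts():

#include <litmus.h>	/* assumed liblitmus header */

/* In each real-time task, once it has been admitted: */
void task_entry(void)
{
	wait_for_ts_release();	/* blocks on the ts_release completion */
	/* ... the first job runs after the synchronous release ... */
}

/* In the launcher, once all tasks are waiting: */
void launcher(void)
{
	lt_t delay = 1000000000ULL;	/* release one second from now (ns) */

	release_ts(&delay);	/* ends up in do_release_ts(litmus_clock() + delay) */
}

The kernel-side count_tasks_waiting_for_release() above reports how many tasks are currently blocked on the completion, so plugins can verify that all tasks have arrived before a release is triggered.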
+diff --git a/litmus/trace.c b/litmus/trace.c
+new file mode 100644
+index 0000000..4403769
+--- /dev/null
++++ b/litmus/trace.c
+@@ -0,0 +1,103 @@
++#include
++
++#include
++#include
++#include
++
++/******************************************************************************/
++/* Allocation */
++/******************************************************************************/
++
++static struct ftdev overhead_dev;
++
++#define trace_ts_buf overhead_dev.minor[0].buf
++
++static unsigned int ts_seq_no = 0;
++
++static inline void __save_timestamp_cpu(unsigned long event,
++ uint8_t type, uint8_t cpu)
++{
++ unsigned int seq_no;
++ struct timestamp *ts;
++ seq_no = fetch_and_inc((int *) &ts_seq_no);
++ if (ft_buffer_start_write(trace_ts_buf, (void**) &ts)) {
++ ts->event = event;
++ ts->timestamp = ft_timestamp();
++ ts->seq_no = seq_no;
++ ts->cpu = cpu;
++ ts->task_type = type;
++ ft_buffer_finish_write(trace_ts_buf, ts);
++ }
++}
++
++static inline void __save_timestamp(unsigned long event,
++ uint8_t type)
++{
++ __save_timestamp_cpu(event, type, raw_smp_processor_id());
++}
++
++feather_callback void save_timestamp(unsigned long event)
++{
++ __save_timestamp(event, TSK_UNKNOWN);
++}
++
++feather_callback void save_timestamp_def(unsigned long event,
++ unsigned long type)
++{
++ __save_timestamp(event, (uint8_t) type);
++}
++
++feather_callback void save_timestamp_task(unsigned long event,
++ unsigned long t_ptr)
++{
++ int rt = is_realtime((struct task_struct *) t_ptr);
++ __save_timestamp(event, rt ? TSK_RT : TSK_BE);
++}
++
++feather_callback void save_timestamp_cpu(unsigned long event,
++ unsigned long cpu)
++{
++ __save_timestamp_cpu(event, TSK_UNKNOWN, cpu);
++}
++
++/******************************************************************************/
++/* DEVICE FILE DRIVER */
++/******************************************************************************/
++
++/*
++ * Should be 8M; this is the maximum we can request from the buddy system
++ * allocator (MAX_ORDER), and we might not even get that much.
++ */
++#define NO_TIMESTAMPS (2 << 11)
++
++/* set MAJOR to 0 to have it dynamically assigned */
++#define FT_TRACE_MAJOR 252
++
++static int alloc_timestamp_buffer(struct ftdev* ftdev, unsigned int idx)
++{
++ unsigned int count = NO_TIMESTAMPS;
++ /* Fall back to progressively smaller allocations until one succeeds. */
++ while (count && !ftdev->minor[idx].buf) {
++ printk("time stamp buffer: trying to allocate %u time stamps.\n", count);
++ ftdev->minor[idx].buf = alloc_ft_buffer(count, sizeof(struct timestamp));
++ count /= 2;
++ }
++ return ftdev->minor[idx].buf ? 0 : -ENOMEM;
++}
++
++static void free_timestamp_buffer(struct ftdev* ftdev, unsigned int idx)
++{
++ free_ft_buffer(ftdev->minor[idx].buf);
++ ftdev->minor[idx].buf = NULL;
++}
++
++static int __init init_ft_overhead_trace(void)
++{
++ printk("Initializing Feather-Trace overhead tracing device.\n");
++ ftdev_init(&overhead_dev, THIS_MODULE);
++ overhead_dev.minor_cnt = 1; /* only one buffer */
++ overhead_dev.alloc = alloc_timestamp_buffer;
++ overhead_dev.free = free_timestamp_buffer;
++ return register_ftdev(&overhead_dev, "ft_trace", FT_TRACE_MAJOR);
++}
++
++module_init(init_ft_overhead_trace);
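
Overhead samples are normally recorded with ftcat from the accompanying ft_tools package, which activates the requested event IDs and stores the raw samples. For orientation only, a minimal consumer sketch follows; the device node name /dev/litmus/ft_trace0 and the field layout of struct timestamp (defined in include/litmus/trace.h) are assumptions.

#include <stdio.h>
#include <stdint.h>
#include <fcntl.h>
#include <unistd.h>

/* Mirrors the fields written by __save_timestamp_cpu() above; the exact
 * layout and field widths are assumed -- use the definitions shipped with
 * ft_tools for real measurements.
 */
struct timestamp {
	uint64_t	timestamp;
	uint32_t	seq_no;
	uint8_t		cpu;
	uint8_t		event;
	uint8_t		task_type;
};

int main(void)
{
	struct timestamp ts;
	int fd = open("/dev/litmus/ft_trace0", O_RDONLY); /* assumed node name */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	while (read(fd, &ts, sizeof(ts)) == (ssize_t) sizeof(ts))
		printf("event=%u cpu=%u seq=%u cycles=%llu\n",
		       (unsigned) ts.event, (unsigned) ts.cpu,
		       (unsigned) ts.seq_no,
		       (unsigned long long) ts.timestamp);
	close(fd);
	return 0;
}

Note that the allocation loop in alloc_timestamp_buffer() may end up with fewer than NO_TIMESTAMPS slots if memory is fragmented, and that no samples are produced for events that have not been activated.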
diff --git a/index.html b/index.html
index ddb17b8..088c5f2 100644
--- a/index.html
+++ b/index.html
@@ -64,8 +64,8 @@ Have a look at our group's
- The current version of LITMUSRT is 2010.1 and is based on Linux 2.6.32.
- It was released on 05/19/2010 and includes plugins for the following
+ The current version of LITMUSRT is 2010.2 and is based on Linux 2.6.34.
+ It was released on 10/21/2010 and includes plugins for the following
scheduling policies:
@@ -298,15 +298,51 @@ Technology and Applications Symposium, pp. 342-353, April 2008.
it is also available as a git repository (see Development below).
- The current release of LITMUSRT is 2010.1.
+ The current release of LITMUSRT is 2010.2.
It consists of our Linux kernel modifications in the form of
- a patch against Linux 2.6.32 and
+ a patch against Linux 2.6.34 and
liblitmus, the user-space API for real-time
tasks, as well as ft_tools, a collection of tools
used for tracing with Feather-Trace (which is part of the LITMUSRT patch).
+
+ LITMUSRT 2010.2
+
+
+ Based on Linux 2.6.34. Released in October 2010.
+
+
+
Files:
+
+
Major changes since LITMUSRT 2010.1:
+
+ - Rebased LITMUSRT from Linux 2.6.32 to Linux 2.6.34.
+ - Added support for configurable budget enforcement (no enforcement, coarse-grained enforcement on timer ticks, and precise enforcement using high-resolution timers).
+ - Added support for a single cluster spanning all CPUs under C-EDF.
+ - Made some features optional (C-EDF, PFair, release-master mode).
+ - Fixed several link and compile errors.
+
+
+
+
LITMUSRT 2010.1