diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index 0dfee81..da6f1e9 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -1210,6 +1210,7 @@ config KPROBES
a probepoint and specifies the callback. Kprobes is useful
for kernel debugging, non-intrusive instrumentation and testing.
If in doubt, say "N".
+
endmenu
source "arch/i386/Kconfig.debug"
@@ -1259,3 +1260,30 @@ config X86_TRAMPOLINE
config KTIME_SCALAR
bool
default y
+
+
+menu "LITMUS^RT"
+
+
+config SCHED_TASK_TRACE
+ bool "Trace real-time tasks"
+ default y
+ help
+ Include support for the sched_trace_XXX() tracing functions. This
+ allows the collection of real-time task events such as job
+ completions, job releases, early completions, etc. This results in a
+ small overhead in the scheduling code. Disable if the overhead is not
+ acceptable (e.g., benchmarking).
+
+config SCHED_DEBUG_TRACE
+ bool "TRACE() debugging"
+ default y
+ help
+ Include support for sched_trace_log_message(), which is used to
+ implement TRACE(). If disabled, no TRACE() messages will be included
+ in the kernel, and no overheads due to debugging statements will be
+ incurred by the scheduler. Disable if the overhead is not acceptable
+ (e.g. benchmarking).
+
+
+endmenu
diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c
index 776d9be..2e8909f 100644
--- a/arch/i386/kernel/apic.c
+++ b/arch/i386/kernel/apic.c
@@ -26,6 +26,7 @@
#include <linux/sysdev.h>
#include <linux/cpu.h>
#include <linux/module.h>
+#include <linux/litmus.h>
#include <asm/atomic.h>
#include <asm/smp.h>
@@ -43,6 +44,8 @@
#include "io_ports.h"
+#include <linux/trace.h>
+
/*
* cpu_mask that denotes the CPUs that needs timer interrupt coming in as
* IPIs in place of local APIC timers
@@ -54,6 +57,15 @@ static cpumask_t timer_bcast_ipi;
*/
static int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */
+/*
+ * Definitions and variables related to quantum synchronization.
+ */
+#define WAIT_TO_SYNC 30000 /* time after boot until sync */
+static int stagger = 0; /* are we using staggered quanta? */
+static atomic_t qsync_time = ATOMIC_INIT(INITIAL_JIFFIES);
+static atomic_t quantum_sync_barrier = ATOMIC_INIT(0);
+static atomic_t sync_done = ATOMIC_INIT(0);
+
static inline void lapic_disable(void)
{
enable_local_apic = -1;
@@ -786,6 +798,23 @@ static int __init apic_set_verbosity(char *str)
__setup("apic=", apic_set_verbosity);
+/*
+ * Determine whether to use aligned or staggered quanta.
+ */
+
+static int __init apic_synch_type(char *str)
+{
+ if (strcmp("aligned", str) == 0)
+ stagger = 0;
+ else if (strcmp("staggered", str) == 0)
+ stagger = 1;
+ else
+ stagger = 0; /* aligned quanta by default */
+ return 1;
+}
+
+__setup("quanta=", apic_synch_type);
+
static int __init detect_init_APIC (void)
{
u32 h, l, features;
@@ -1198,6 +1227,47 @@ EXPORT_SYMBOL(switch_ipi_to_APIC_timer);
#undef APIC_DIVISOR
/*
+ * This function is called to align all quanta, and to stagger quanta if
+ * necessary. It relies on a barrier to synchronize all processors, so
+ * that they all reset their APIC timers at the same time. If quanta
+ * should be staggered, the appropriate stagger delay is then added at
+ * each processor.
+ */
+
+void synchronize_quanta(void)
+{
+ int cpu = smp_processor_id();
+ int total_cpus = num_online_cpus();
+ int stagger_interval = jiffies_to_usecs(1) / total_cpus;
+
+ /*
+ * Disable APIC timer, wait for all other processors to reach barrier,
+ * and re-enable all timers concurrently.
+ */
+ disable_APIC_timer();
+ atomic_inc(&quantum_sync_barrier);
+ while (atomic_read(&quantum_sync_barrier) < total_cpus) {
+ /* Delay, otherwise atomic_inc's cannot occur. */
+ udelay(1);
+ }
+
+ /* Add necessary stagger for this CPU, if required. */
+ if (stagger) {
+ int stagger_us = cpu * stagger_interval;
+ udelay(stagger_us);
+ }
+
+ /* Re-enable all timers. */
+ __setup_APIC_LVTT(calibration_result);
+ enable_APIC_timer();
+
+ /* The first CPU signals that quantum sync is complete. */
+ if (cpu == 0)
+ atomic_inc(&sync_done);
+}
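+
+/* Worked example (illustrative; assumes HZ=1000 so that one jiffy is 1 ms):
+ * with four online CPUs, jiffies_to_usecs(1) is 1000 us, so stagger_interval
+ * is 250 us. With staggered quanta enabled, CPUs 0-3 therefore delay
+ * re-enabling their APIC timers by 0, 250, 500 and 750 us respectively.
+ */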
+
+
+/*
* Local timer interrupt handler. It does both profiling and
* process statistics/rescheduling.
*
@@ -1209,11 +1279,32 @@ EXPORT_SYMBOL(switch_ipi_to_APIC_timer);
inline void smp_local_timer_interrupt(void)
{
+/* s64 offset; */
+
+ TS_TICK_START;
+
profile_tick(CPU_PROFILING);
#ifdef CONFIG_SMP
update_process_times(user_mode_vm(get_irq_regs()));
#endif
+ /* Print out timing data - can be commented out if necessary. */
+/* offset = get_nsec_offset(); */
+/* TRACE("%d\n", offset); */
+
+ /*
+ * Synchronize quanta if we have reached qsync_time plus wait
+ * interval. The synchronization code itself is placed in its own
+ * (non-inline) function, to avoid issues with creating an inline
+ * function that is too large.
+ */
+ if (unlikely(!atomic_read(&sync_done) &&
+ time_after(jiffies,
+ (unsigned long)(atomic_read(&qsync_time) +
+ msecs_to_jiffies(WAIT_TO_SYNC))))) {
+ synchronize_quanta();
+ }
+
/*
* We take the 'long' return path, and there every subsystem
* grabs the apropriate locks (kernel lock/ irq lock).
@@ -1224,6 +1315,7 @@ inline void smp_local_timer_interrupt(void)
* Currently this isn't too much of an issue (performance wise),
* we can take more than 100K local irqs per second on a 100 MHz P5.
*/
+ TS_TICK_END;
}
/*
diff --git a/arch/i386/kernel/i386_ksyms.c b/arch/i386/kernel/i386_ksyms.c
index e3d4b73..9670f77 100644
--- a/arch/i386/kernel/i386_ksyms.c
+++ b/arch/i386/kernel/i386_ksyms.c
@@ -6,6 +6,7 @@ EXPORT_SYMBOL(__down_failed);
EXPORT_SYMBOL(__down_failed_interruptible);
EXPORT_SYMBOL(__down_failed_trylock);
EXPORT_SYMBOL(__up_wakeup);
+
/* Networking helper routines. */
EXPORT_SYMBOL(csum_partial_copy_generic);
diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index 2697e92..9a5348f 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -319,3 +319,28 @@ ENTRY(sys_call_table)
.long sys_move_pages
.long sys_getcpu
.long sys_epoll_pwait
+ /* LITMUS syscalls */
+ .long sys_sched_setpolicy /* 320 */
+ .long sys_sched_getpolicy
+ .long sys_set_rt_mode
+ .long sys_set_rt_task_param
+ .long sys_get_rt_task_param
+ .long sys_prepare_rt_task /* 325 */
+ .long sys_ni_syscall /* CLEANUP: sys_reset_stat */
+ .long sys_sleep_next_period
+ .long sys_scheduler_setup
+ .long sys_enter_np
+ .long sys_exit_np /* 330 */
+ .long sys_pi_sema_init
+ .long sys_pi_down
+ .long sys_pi_up
+ .long sys_pi_sema_free
+ .long sys_sema_init /* 335 */
+ .long sys_down
+ .long sys_up
+ .long sys_sema_free
+ .long sys_srp_sema_init
+ .long sys_srp_down /* 340 */
+ .long sys_srp_up
+ .long sys_reg_task_srp_sem
+ .long sys_srp_sema_free /* 343 */
diff --git a/include/asm-i386/semaphore.h b/include/asm-i386/semaphore.h
index 4e34a46..7212f4b 100644
--- a/include/asm-i386/semaphore.h
+++ b/include/asm-i386/semaphore.h
@@ -45,6 +45,7 @@ struct semaphore {
atomic_t count;
int sleepers;
wait_queue_head_t wait;
+ int used; /* allows semaphores to be allocated to user space processes */
};
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index 833fa17..ac5756d 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -325,10 +325,36 @@
#define __NR_move_pages 317
#define __NR_getcpu 318
#define __NR_epoll_pwait 319
+/* LITMUS */
+#define __NR_sched_setpolicy 320
+#define __NR_sched_getpolicy 321
+/* Syscall definitions for mode change and task creation and manipulation */
+#define __NR_set_rt_mode 322
+#define __NR_set_rt_task_param 323
+#define __NR_get_rt_task_param 324
+#define __NR_prepare_rt_task 325
+#define __NR_reset_stat 326
+#define __NR_sleep_next_period 327
+#define __NR_scheduler_setup 328
+#define __NR_enter_np 329
+#define __NR_exit_np 330
+#define __NR_pi_sema_init 331
+#define __NR_pi_down 332
+#define __NR_pi_up 333
+#define __NR_pi_sema_free 334
+#define __NR_sema_init 335
+#define __NR_down 336
+#define __NR_up 337
+#define __NR_sema_free 338
+#define __NR_srp_sema_init 339
+#define __NR_srp_down 340
+#define __NR_srp_up 341
+#define __NR_reg_task_srp_sem 342
+#define __NR_srp_sema_free 343
#ifdef __KERNEL__
-#define NR_syscalls 320
+#define NR_syscalls 344
#define __ARCH_WANT_IPC_PARSE_VERSION
#define __ARCH_WANT_OLD_READDIR
diff --git a/include/linux/edf_common.h b/include/linux/edf_common.h
new file mode 100644
index 0000000..6b0eb2f
--- /dev/null
+++ b/include/linux/edf_common.h
@@ -0,0 +1,77 @@
+/* EDF common data structures and utility functions shared by all EDF
+ * based scheduler plugins
+ */
+
+/* CLEANUP: Add comments and make it less messy.
+ *
+ */
+
+#ifndef __UNC_EDF_COMMON_H__
+#define __UNC_EDF_COMMON_H__
+
+struct _edf_domain;
+
+typedef int (*edf_check_resched_needed_t)(struct _edf_domain *edf);
+typedef struct _edf_domain {
+ /* runnable rt tasks are in here */
+ rwlock_t ready_lock;
+ struct list_head ready_queue;
+
+ /* real-time tasks waiting for release are in here */
+ spinlock_t release_lock;
+ struct list_head release_queue;
+
+ /* how do we check if we need to kick another CPU? */
+ edf_check_resched_needed_t check_resched;
+} edf_domain_t;
+
+#define next_ready(edf) \
+ (list_entry((edf)->ready_queue.next, struct task_struct, rt_list))
+
+void edf_domain_init(edf_domain_t *edf, edf_check_resched_needed_t f);
+
+int edf_higher_prio(struct task_struct* first,
+ struct task_struct* second);
+
+void __add_ready(edf_domain_t* edf, struct task_struct *new);
+void __add_release(edf_domain_t* edf, struct task_struct *task);
+
+struct task_struct* __take_ready(edf_domain_t* edf);
+struct task_struct* __peek_ready(edf_domain_t* edf);
+
+
+void try_release_pending(edf_domain_t* edf);
+void __release_pending(edf_domain_t* edf);
+void __prepare_new_release(struct task_struct *t, jiffie_t start);
+#define prepare_new_release(t) __prepare_new_release(t, jiffies)
+void prepare_for_next_period(struct task_struct *t);
+void prepare_new_releases(edf_domain_t *edf, jiffie_t start);
+void __prepare_new_releases(edf_domain_t *edf, jiffie_t start);
+int preemption_needed(edf_domain_t* edf, struct task_struct *t);
+long edf_sleep_next_period(void);
+
+#define job_completed(t) (!is_be(t) && \
+ (t)->rt_param.times.exec_time == (t)->rt_param.basic_params.exec_cost)
+
+static inline void add_ready(edf_domain_t* edf, struct task_struct *new)
+{
+ unsigned long flags;
+ /* first we need the write lock for edf_ready_queue */
+ write_lock_irqsave(&edf->ready_lock, flags);
+ __add_ready(edf, new);
+ write_unlock_irqrestore(&edf->ready_lock, flags);
+}
+
+static inline void add_release(edf_domain_t* edf, struct task_struct *task)
+{
+ unsigned long flags;
+ /* we need the release lock for the release queue */
+ spin_lock_irqsave(&edf->release_lock, flags);
+ __add_release(edf, task);
+ spin_unlock_irqrestore(&edf->release_lock, flags);
+}
+
+int edf_set_hp_task(struct pi_semaphore *sem);
+int edf_set_hp_cpu_task(struct pi_semaphore *sem, int cpu);
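+
+/* Illustrative usage sketch (my_domain, my_check_resched and t are
+ * placeholders supplied by a plugin, not part of this interface):
+ *
+ *	static edf_domain_t my_domain;
+ *
+ *	edf_domain_init(&my_domain, my_check_resched);
+ *	...
+ *	if (is_released(t))
+ *		add_ready(&my_domain, t);
+ *	else
+ *		add_release(&my_domain, t);
+ */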
+
+#endif
diff --git a/include/linux/feather_buffer.h b/include/linux/feather_buffer.h
new file mode 100644
index 0000000..c477772
--- /dev/null
+++ b/include/linux/feather_buffer.h
@@ -0,0 +1,108 @@
+#ifndef _FEATHER_BUFFER_H_
+#define _FEATHER_BUFFER_H_
+
+/* requires UINT_MAX and memcpy */
+
+static inline int fetch_and_inc(int *val)
+{
+ int ret = 1;
+ __asm__ __volatile__("lock; xaddl %0, %1" : "+r" (ret), "+m" (*val) : : "memory" );
+ return ret;
+}
+
+static inline int fetch_and_dec(int *val)
+{
+ int ret = -1;
+ __asm__ __volatile__("lock; xaddl %0, %1" : "+r" (ret), "+m" (*val) : : "memory" );
+ return ret;
+}
+
+#define SLOT_FREE 0
+#define SLOT_BUSY 1
+#define SLOT_READY 2
+
+struct ft_buffer {
+ unsigned int slot_count;
+ unsigned int slot_size;
+
+ int free_count;
+ unsigned int write_idx;
+ unsigned int read_idx;
+
+ char* slots;
+ void* buffer_mem;
+ unsigned int failed_writes;
+};
+
+static inline int init_ft_buffer(struct ft_buffer* buf,
+ unsigned int slot_count,
+ unsigned int slot_size,
+ char* slots,
+ void* buffer_mem)
+{
+ int i = 0;
+ if (!slot_count || UINT_MAX % slot_count != slot_count - 1) {
+ /* The slot count must divide UINT_MAX + 1 so that the write index
+ * correctly wraps around to 0.
+ */
+ return 0;
+ } else {
+ buf->slot_count = slot_count;
+ buf->slot_size = slot_size;
+ buf->slots = slots;
+ buf->buffer_mem = buffer_mem;
+ buf->free_count = slot_count;
+ buf->write_idx = 0;
+ buf->read_idx = 0;
+ buf->failed_writes = 0;
+ for (i = 0; i < slot_count; i++)
+ buf->slots[i] = SLOT_FREE;
+ return 1;
+ }
+}
+
+static inline int ft_buffer_start_write(struct ft_buffer* buf, void **ptr)
+{
+ int free = fetch_and_dec(&buf->free_count);
+ unsigned int idx;
+ if (free <= 0) {
+ fetch_and_inc(&buf->free_count);
+ *ptr = 0;
+ fetch_and_inc(&buf->failed_writes);
+ return 0;
+ } else {
+ idx = fetch_and_inc((int*) &buf->write_idx) % buf->slot_count;
+ buf->slots[idx] = SLOT_BUSY;
+ *ptr = ((char*) buf->buffer_mem) + idx * buf->slot_size;
+ return 1;
+ }
+}
+
+static inline void ft_buffer_finish_write(struct ft_buffer* buf, void *ptr)
+{
+ unsigned int idx = ((char*) ptr - (char*) buf->buffer_mem) / buf->slot_size;
+ buf->slots[idx] = SLOT_READY;
+}
+
+
+/* exclusive reader access is assumed */
+static inline int ft_buffer_read(struct ft_buffer* buf, void* dest)
+{
+ unsigned int idx;
+ if (buf->free_count == buf->slot_count)
+ /* nothing available */
+ return 0;
+ idx = buf->read_idx % buf->slot_count;
+ if (buf->slots[idx] == SLOT_READY) {
+ memcpy(dest, ((char*) buf->buffer_mem) + idx * buf->slot_size,
+ buf->slot_size);
+ buf->slots[idx] = SLOT_FREE;
+ buf->read_idx++;
+ fetch_and_inc(&buf->free_count);
+ return 1;
+ } else
+ return 0;
+}
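+
+/* Illustrative usage sketch (the 16-slot size and the record type are
+ * made-up example values; writers may run concurrently, but the reader
+ * must be exclusive, see ft_buffer_read() above):
+ *
+ *	static struct my_rec { int x; } mem[16];
+ *	static char slots[16];
+ *	static struct ft_buffer buf;
+ *
+ *	init_ft_buffer(&buf, 16, sizeof(struct my_rec), slots, mem);
+ *
+ *	struct my_rec *rec;
+ *	if (ft_buffer_start_write(&buf, (void**) &rec)) {
+ *		rec->x = 42;
+ *		ft_buffer_finish_write(&buf, rec);
+ *	}
+ *
+ *	struct my_rec out;
+ *	while (ft_buffer_read(&buf, &out))
+ *		process(&out);	// hypothetical consumer
+ */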
+
+
+#endif
diff --git a/include/linux/feather_trace.h b/include/linux/feather_trace.h
new file mode 100644
index 0000000..57a21a5
--- /dev/null
+++ b/include/linux/feather_trace.h
@@ -0,0 +1,93 @@
+#ifndef _FEATHER_TRACE_H_
+#define _FEATHER_TRACE_H_
+
+#define feather_callback __attribute__((regparm(0)))
+
+/* make the compiler reload any register that is not saved in
+ * a cdecl function call
+ */
+#define CLOBBER_LIST "memory", "cc", "eax", "ecx", "edx"
+
+#define ft_event(id, callback) \
+ __asm__ __volatile__( \
+ "1: jmp 2f \n\t" \
+ " call " #callback " \n\t" \
+ ".section __event_table, \"aw\" \n\t" \
+ ".long " #id ", 0, 1b, 2f \n\t" \
+ ".previous \n\t" \
+ "2: \n\t" \
+ : : : CLOBBER_LIST)
+
+#define ft_event0(id, callback) \
+ __asm__ __volatile__( \
+ "1: jmp 2f \n\t" \
+ " subl $4, %%esp \n\t" \
+ " movl $" #id ", (%%esp) \n\t" \
+ " call " #callback " \n\t" \
+ " addl $4, %%esp \n\t" \
+ ".section __event_table, \"aw\" \n\t" \
+ ".long " #id ", 0, 1b, 2f \n\t" \
+ ".previous \n\t" \
+ "2: \n\t" \
+ : : : CLOBBER_LIST)
+
+#define ft_event1(id, callback, param) \
+ __asm__ __volatile__( \
+ "1: jmp 2f \n\t" \
+ " subl $8, %%esp \n\t" \
+ " movl %0, 4(%%esp) \n\t" \
+ " movl $" #id ", (%%esp) \n\t" \
+ " call " #callback " \n\t" \
+ " addl $8, %%esp \n\t" \
+ ".section __event_table, \"aw\" \n\t" \
+ ".long " #id ", 0, 1b, 2f \n\t" \
+ ".previous \n\t" \
+ "2: \n\t" \
+ : : "r" (param) : CLOBBER_LIST)
+
+#define ft_event2(id, callback, param, param2) \
+ __asm__ __volatile__( \
+ "1: jmp 2f \n\t" \
+ " subl $12, %%esp \n\t" \
+ " movl %1, 8(%%esp) \n\t" \
+ " movl %0, 4(%%esp) \n\t" \
+ " movl $" #id ", (%%esp) \n\t" \
+ " call " #callback " \n\t" \
+ " addl $12, %%esp \n\t" \
+ ".section __event_table, \"aw\" \n\t" \
+ ".long " #id ", 0, 1b, 2f \n\t" \
+ ".previous \n\t" \
+ "2: \n\t" \
+ : : "r" (param), "r" (param2) : CLOBBER_LIST)
+
+
+#define ft_event3(id, callback, p, p2, p3) \
+ __asm__ __volatile__( \
+ "1: jmp 2f \n\t" \
+ " subl $16, %%esp \n\t" \
+ " movl %2, 12(%%esp) \n\t" \
+ " movl %1, 8(%%esp) \n\t" \
+ " movl %0, 4(%%esp) \n\t" \
+ " movl $" #id ", (%%esp) \n\t" \
+ " call " #callback " \n\t" \
+ " addl $16, %%esp \n\t" \
+ ".section __event_table, \"aw\" \n\t" \
+ ".long " #id ", 0, 1b, 2f \n\t" \
+ ".previous \n\t" \
+ "2: \n\t" \
+ : : "r" (p), "r" (p2), "r" (p3) : CLOBBER_LIST)
+
+
+static inline unsigned long long ft_read_tsc(void)
+{
+ unsigned long long ret;
+ __asm__ __volatile__("rdtsc" : "=A" (ret));
+ return ret;
+}
+
+int ft_enable_event(unsigned long id);
+int ft_disable_event(unsigned long id);
+int ft_is_event_enabled(unsigned long id);
+int ft_disable_all_events(void);
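+
+/* Illustrative usage sketch (event id 42 and my_callback are made-up
+ * placeholders):
+ *
+ *	feather_callback void my_callback(unsigned long id, unsigned long arg)
+ *	{
+ *		// invoked only while event 42 is enabled
+ *	}
+ *
+ *	void probe_site(unsigned long arg)
+ *	{
+ *		ft_event1(42, my_callback, arg);
+ *	}
+ *
+ *	ft_enable_event(42);	// patch the jump so the callback runs
+ *	ft_disable_event(42);	// restore the jump over the call
+ */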
+
+#endif
diff --git a/include/linux/fifo_common.h b/include/linux/fifo_common.h
new file mode 100644
index 0000000..5364e2b
--- /dev/null
+++ b/include/linux/fifo_common.h
@@ -0,0 +1,33 @@
+/* FIFO common definitions and utility functions.
+ */
+#ifndef __UNC_SCHED_FIFO_H__
+#define __UNC_SCHED_FIFO_H__
+
+typedef struct {
+ struct list_head queue;
+ atomic_t count;
+ spinlock_t lock;
+ unsigned int time_slice;
+} fifo_domain_t;
+
+#define FIFO_INIT(name, time_slice) \
+ { LIST_HEAD_INIT(name.queue), \
+ ATOMIC_INIT(0), \
+ SPIN_LOCK_UNLOCKED, \
+ time_slice}
+
+void fifo_domain_init(fifo_domain_t* fifo, unsigned int exec_budget);
+void fifo_enqueue(fifo_domain_t* fifo, struct task_struct* task);
+void fifo_add(fifo_domain_t* fifo, struct task_struct* task);
+void lifo_add(fifo_domain_t* fifo, struct task_struct* task);
+struct task_struct* __fifo_take(fifo_domain_t* fifo);
+struct task_struct* fifo_take(fifo_domain_t* fifo);
+struct task_struct* fifo_take_rq(fifo_domain_t* fifo, runqueue_t* rq, int cpu);
+
+static inline int fifo_jobs_pending(fifo_domain_t* fifo)
+{
+ return atomic_read(&fifo->count) > 0;
+}
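+
+/* Illustrative usage sketch (be_queue, some_task and the 5-quantum budget
+ * are example placeholders):
+ *
+ *	static fifo_domain_t be_queue;
+ *	struct task_struct *next;
+ *
+ *	fifo_domain_init(&be_queue, 5);
+ *	fifo_enqueue(&be_queue, some_task);
+ *	if (fifo_jobs_pending(&be_queue))
+ *		next = fifo_take(&be_queue);
+ */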
+
+
+#endif
diff --git a/include/linux/litmus.h b/include/linux/litmus.h
new file mode 100644
index 0000000..73ea643
--- /dev/null
+++ b/include/linux/litmus.h
@@ -0,0 +1,124 @@
+/*
+ * Constant definitions related to
+ * scheduling policy.
+ */
+
+#ifndef _LINUX_LITMUS_H_
+#define _LINUX_LITMUS_H_
+
+#include <linux/jiffies.h>
+#include <linux/sched_trace.h>
+
+typedef enum {
+ SCHED_BEG = 0,
+ SCHED_LINUX = 0,
+ SCHED_PFAIR = 1,
+ SCHED_PFAIR_STAGGER = 2,
+ SCHED_PART_EDF = 3,
+ SCHED_PART_EEVDF = 4,
+ SCHED_GLOBAL_EDF = 5,
+ SCHED_PFAIR_DESYNC = 6,
+ SCHED_GLOBAL_EDF_NP = 7,
+ SCHED_CUSTOM = 8,
+ SCHED_EDF_HSB = 9,
+ SCHED_GSN_EDF = 10,
+ SCHED_PSN_EDF = 11,
+
+ /* Add your scheduling policy here */
+
+ SCHED_END = 11,
+ SCHED_DEFAULT = 0,
+ SCHED_INVALID = -1,
+} spolicy;
+
+/* no options */
+#define SCHED_NONE 0
+/* make scheduling decisions at quantum boundaries */
+#define SCHED_QUANTUM 1
+/* only schedule RT tasks at slot boundaries */
+#define SCHED_RT_AT_BOUND 2
+/* default slot size - number of 1ms jiffies in a scheduling quantum */
+#define DEFAULT_SLOT_SIZE 1
+/* stagger value for no staggering of slot boundaries */
+#define DEFAULT_NO_STAGGER 0
+/* default stagger - number of 1ms jiffies by which processors
+ * are staggered, modulo the slot size
+ */
+#define DEFAULT_STAGGER 2
+
+/* Runtime modes */
+/* CLEANUP: Should this maybe be an enum? */
+#define MAX_MODES 2
+#define MODE_NON_RT 0
+#define MODE_RT_RUN 1
+
+/* Plugin boot options, for convenience */
+#define PLUGIN_LINUX "linux"
+#define PLUGIN_PFAIR "pfair"
+#define PLUGIN_PART_EDF "part_edf"
+#define PLUGIN_GLOBAL_EDF "global_edf"
+#define PLUGIN_PFAIR_STAGGER "stagger"
+#define PLUGIN_PFAIR_DESYNC "desync"
+#define PLUGIN_GLOBAL_EDF_NP "global_edf_np"
+#define PLUGIN_EDF_HSB "edf_hsb"
+#define PLUGIN_GSN_EDF "gsn_edf"
+#define PLUGIN_PSN_EDF "psn_edf"
+
+
+/* Additional clone flag (CLONE_REALTIME, defined in sched.h):
+ * indicates that the thread is to be used in real-time mode and therefore
+ * should not be woken up in the usual Linux manner; its state is simply set
+ * to TASK_STOPPED. It must be prepared and added to the ready queue
+ * explicitly.
+ */
+
+/* Type definition for our quanta */
+typedef unsigned long long quantum_t;
+
+extern spolicy sched_policy;
+extern int sched_options;
+/* Make this function available to plugins */
+void set_sched_options(int);
+
+extern unsigned long slot_size;
+extern unsigned long stagger_offset;
+
+/* RT mode start time */
+extern volatile unsigned long rt_start_time;
+
+/* Here we store the current mode of the system */
+extern atomic_t rt_mode;
+
+#define get_rt_mode() (atomic_read(&rt_mode))
+#define set_rt_mode(a) atomic_set(&rt_mode,(a))
+
+/* CLEANUP: Should be queue_lock, does it really belong here? */
+extern spinlock_t litmus_task_set_lock;
+
+
+#define TRACE(fmt, args...) \
+ sched_trace_log_message("%d: " fmt, raw_smp_processor_id(), ## args)
+
+#define TRACE_TASK(t, fmt, args...) \
+ TRACE("(%s/%d) " fmt, (t)->comm, (t)->pid, ##args)
+
+#define TRACE_CUR(fmt, args...) \
+ TRACE_TASK(current, fmt, ## args)
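+
+/* Example (illustrative): from within a plugin one might write
+ *
+ *	TRACE_TASK(t, "released, deadline=%lu\n", get_deadline(t));
+ *
+ * which expands to a sched_trace_log_message() call prefixed with the
+ * current CPU and the task's comm/pid.
+ */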
+
+/* in_list - is a given list_head queued on some list?
+ */
+static inline int in_list(struct list_head* list)
+{
+ return !( /* case 1: deleted */
+ (list->next == LIST_POISON1 &&
+ list->prev == LIST_POISON2)
+ ||
+ /* case 2: initialized */
+ (list->next == list &&
+ list->prev == list)
+ );
+}
+
+
+#endif
diff --git a/include/linux/pfair_common.h b/include/linux/pfair_common.h
new file mode 100644
index 0000000..67e18c6
--- /dev/null
+++ b/include/linux/pfair_common.h
@@ -0,0 +1,40 @@
+/* PFAIR common data structures and utility functions shared by all PFAIR
+ * based scheduler plugins
+ */
+
+#ifndef __UNC_PFAIR_COMMON_H__
+#define __UNC_PFAIR_COMMON_H__
+
+#include <linux/queuelock.h>
+#include <linux/cpumask.h>
+
+typedef struct _pfair_domain {
+ /* Global lock to protect the data structures */
+ queuelock_t pfair_lock;
+ /* runnable rt tasks are in here */
+ struct list_head ready_queue;
+
+ /* real-time tasks waiting for release are in here */
+ struct list_head release_queue;
+
+ /* CPU's in the domain */
+ cpumask_t domain_cpus;
+
+} pfair_domain_t;
+
+#define next_ready(pfair) \
+ (list_entry((pfair)->ready_queue.next, struct task_struct, rt_list))
+void pfair_domain_init(pfair_domain_t *pfair);
+void pfair_add_ready(pfair_domain_t* pfair, struct task_struct *new);
+struct task_struct* __pfair_take_ready(pfair_domain_t* pfair);
+void pfair_add_release(pfair_domain_t* pfair, struct task_struct *task);
+void pfair_try_release_pending(pfair_domain_t* pfair);
+void __pfair_prepare_new_release(struct task_struct *t, jiffie_t start);
+
+void pfair_prepare_next_job(struct task_struct *t);
+void pfair_prepare_next_subtask(struct task_struct *t);
+
+void pfair_prepare_new_releases(pfair_domain_t *pfair, jiffie_t start);
+
+#endif
+
diff --git a/include/linux/pfair_math.h b/include/linux/pfair_math.h
new file mode 100644
index 0000000..dab1778
--- /dev/null
+++ b/include/linux/pfair_math.h
@@ -0,0 +1,77 @@
+/* PFAIR Mathematical functions */
+#ifndef __UNC_PFAIR_MATH_H__
+#define __UNC_PFAIR_MATH_H__
+
+#include <linux/rt_param.h>
+#include <asm/div64.h>
+#include <linux/litmus.h>
+#include <linux/sched.h>
+
+/*
+* This file defines mathematical functions "ceiling", "floor",
+* and PFAIR specific functions for computing the release and
+* the deadline of a subtask, as well as tie breakers:
+* b-bit and group deadline.
+*/
+static inline quantum_t FLOOR(quantum_t a, unsigned long b)
+{
+ BUG_ON( b == 0);
+ do_div(a, b);
+ return a;
+}
+static inline quantum_t CEIL(quantum_t a, unsigned long b)
+{
+ quantum_t t = FLOOR(a, b);
+ return (quantum_t)((t * b == a) ? t : (t + 1));
+}
+
+
+/*
+* invariant - i-1=get_passed_quanta(t)
+*
+* release time of i-th subtask of j-th job is
+* r_{ij}+\lfloor i-1/wt(T) \rfloor
+* This operation should be robust to wrap-around
+* so we can compare the result with jiffies safely
+*/
+static inline quantum_t release_time(struct task_struct * t)
+{
+ quantum_t e = get_exec_cost(t);
+ quantum_t p = get_rt_period(t);
+ return FLOOR((get_passed_quanta(t)) * p, e);
+}
+/*
+* deadline time of i-th subtask of j-th job is
+* r_{ij}+\lceil i/wt(T) \rceil
+* This operation should be robust to wrap-around
+* so we can compare the result with jiffies safely
+*/
+static inline quantum_t pfair_deadline(struct task_struct * t)
+{
+ quantum_t e = get_exec_cost(t);
+ quantum_t p = get_rt_period(t);
+ return CEIL((get_passed_quanta(t) + 1) * p, e);
+}
+/* In PFAIR b-bit is defined as
+* \lceil i/wt(T) \rceil-\lfloor i/wt(T) \rfloor
+*/
+static inline int b_bit(struct task_struct *t)
+{
+ quantum_t e = get_exec_cost(t);
+ quantum_t p = get_rt_period(t);
+ return CEIL((get_passed_quanta(t) + 1) * p, e)-
+ FLOOR((get_passed_quanta(t) + 1) * p, e);
+}
+/*
+* Group deadline
+*/
+static inline quantum_t group_deadline(struct task_struct * t)
+{
+ quantum_t p = get_rt_period(t);
+ quantum_t e = get_exec_cost(t);
+ quantum_t stage1 = CEIL((get_passed_quanta(t) + 1) * p, e);
+ quantum_t stage2 = CEIL(stage1 * (p - e), p);
+ return CEIL(stage2 * p, p - e);
+}
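+
+/* Worked example (illustrative): consider a task with exec_cost e = 2 and
+ * period p = 5, i.e., weight 2/5. For its first subtask (passed quanta = 0):
+ *	release  = FLOOR(0 * 5, 2) = 0
+ *	deadline = CEIL(1 * 5, 2)  = 3
+ *	b-bit    = CEIL(5, 2) - FLOOR(5, 2) = 3 - 2 = 1
+ * For the second subtask (passed quanta = 1):
+ *	release  = FLOOR(1 * 5, 2) = 2
+ *	deadline = CEIL(2 * 5, 2)  = 5
+ *	b-bit    = CEIL(10, 2) - FLOOR(10, 2) = 0
+ */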
+
+#endif /* __UNC_PFAIR_MATH_H__ */
diff --git a/include/linux/queuelock.h b/include/linux/queuelock.h
new file mode 100644
index 0000000..454ff81
--- /dev/null
+++ b/include/linux/queuelock.h
@@ -0,0 +1,98 @@
+#ifndef _UNC_QUEUELOCK_H_
+#define _UNC_QUEUELOCK_H_
+/**
+* Queue lock
+*
+* This is an implementation of T. Anderson's queue lock.
+* It strives to follow the normal Linux locking conventions
+* as much as possible. The rules for acquiring a lock are:
+*
+* 1) The caller must ensure interrupts and preemptions are disabled.
+*
+* 2) The caller _cannot_ recursively acquire the lock.
+*
+* 3) The caller may not sleep while holding the lock. This is currently
+* not enforced, but sleeping while holding the lock will not work.
+*/
+
+#include <linux/cache.h>
+#include <asm/atomic.h>
+#include <linux/smp.h>
+
+typedef struct {
+ /* pad the values being spun on to make sure
+ that they are cache local
+ */
+ union {
+ volatile enum {
+ MUST_WAIT,
+ HAS_LOCK
+ } val;
+ char padding[SMP_CACHE_BYTES];
+ } slots[NR_CPUS];
+
+ /* since spin_slot is not being spun on it can be
+ * in a shared cache line. next_slot will be evicted
+ * anyway on every attempt to acquire the lock.
+ */
+ int spin_slot[NR_CPUS];
+
+ /* The next slot that will be available.
+ */
+ atomic_t next_slot;
+} queuelock_t;
+
+
+static inline void queue_lock_init(queuelock_t *lock)
+{
+ int i;
+ for (i = 0; i < NR_CPUS; i++) {
+ lock->slots[i].val = MUST_WAIT;
+ lock->spin_slot[i] = i;
+ }
+ lock->slots[0].val = HAS_LOCK;
+ atomic_set(&lock->next_slot, 0);
+}
+
+
+static inline void queue_lock(queuelock_t *lock)
+{
+ int me = smp_processor_id();
+ volatile int* spin_var;
+ /* Get slot to spin on. atomic_inc_return() returns the incremented
+ * value, so take one off again
+ */
+ lock->spin_slot[me] = atomic_inc_return(&lock->next_slot) - 1;
+ /* check for wrap-around
+ * This could probably be optimized away if we ensure that NR_CPUS divides
+ * INT_MAX...
+ */
+ if (unlikely(lock->spin_slot[me] == NR_CPUS - 1))
+ atomic_add(-NR_CPUS, &lock->next_slot);
+ /* range limit*/
+ lock->spin_slot[me] %= NR_CPUS;
+ /* spin until you acquire the lock */
+ spin_var = (int*) &lock->slots[lock->spin_slot[me]].val;
+ while (*spin_var == MUST_WAIT)
+ cpu_relax();
+
+ /* reset the lock */
+ lock->slots[lock->spin_slot[me]].val = MUST_WAIT;
+ barrier();
+}
+
+
+static inline void queue_unlock(queuelock_t *lock)
+{
+ int me = smp_processor_id();
+ barrier();
+ lock->slots[(lock->spin_slot[me] + 1) % NR_CPUS].val = HAS_LOCK;
+}
+
+#define queue_lock_irqsave(lock, flags) \
+ do { local_irq_save(flags); queue_lock(lock); } while (0);
+
+#define queue_unlock_irqrestore(lock, flags) \
+ do { queue_unlock(lock); local_irq_restore(flags); } while (0);
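+
+/* Illustrative usage sketch (state_lock is a placeholder; interrupts and
+ * preemption must be off while the lock is held, per the rules above):
+ *
+ *	static queuelock_t state_lock;
+ *	unsigned long flags;
+ *
+ *	queue_lock_init(&state_lock);
+ *	...
+ *	queue_lock_irqsave(&state_lock, flags);
+ *	// critical section
+ *	queue_unlock_irqrestore(&state_lock, flags);
+ */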
+
+#endif /* _UNC_QUEUELOCK_H_ */
diff --git a/include/linux/rt_param.h b/include/linux/rt_param.h
new file mode 100644
index 0000000..a305619
--- /dev/null
+++ b/include/linux/rt_param.h
@@ -0,0 +1,174 @@
+/*
+ * Definitions of the LITMUS real-time task parameters.
+ *
+ */
+#ifndef _LINUX_RT_PARAM_H_
+#define _LINUX_RT_PARAM_H_
+
+#include <linux/wait.h>
+
+typedef unsigned long jiffie_t;
+
+/* different types of clients */
+typedef enum {
+ RT_CLASS_HARD,
+ RT_CLASS_SOFT,
+ RT_CLASS_BEST_EFFORT
+} task_class_t;
+
+typedef struct rt_param {
+ unsigned long exec_cost;
+ unsigned long period;
+ unsigned int cpu;
+ task_class_t class;
+} rt_param_t;
+
+typedef struct {
+ /* when will this task be released the next time? */
+ jiffie_t release;
+ /* time instant the last job was released */
+ jiffie_t last_release;
+ /* what is the current deadline? */
+ jiffie_t deadline;
+ /* b-bit tie breaker for PFAIR, it is ignored in EDF */
+ int b_bit;
+ /* group deadline tie breaker, it is ignored in EDF */
+ jiffie_t group_deadline;
+ /* how long has this task executed so far?
+ * In case of capacity sharing a job completion cannot be
+ * detected by checking time_slice == 0 as the job may have
+ * executed while using another capacity. Use this counter
+ * to keep track of the time spent on a CPU by a job.
+ *
+ * In other words: The number of consumed quanta since the
+ * last job release.
+ */
+ unsigned int exec_time;
+} in_times_t;
+
+
+/* RT task parameters for scheduling extensions
+ * These parameters are inherited during clone and therefore must
+ * be explicitly set up before the task set is launched.
+ */
+typedef struct task_rt_param {
+ /* Real-time marker */
+ int is_realtime;
+ /* user controlled parameters */
+ rt_param_t basic_params;
+ /* is the task sleeping? */
+ unsigned int flags;
+ /* Task representing the current "inherited" task
+ * priority, assigned by the inherit_priority and
+ * return_priority callbacks in the scheduler plugins.
+ * May point to self if PI does not result in
+ * an increased task priority.
+ */
+ struct task_struct* inh_task;
+
+ unsigned int is_non_preemptable;
+
+ /* put information for feedback control stuff and
+ * information about the performance of the task here
+ */
+ struct {
+ /* How many non-tardy jobs since the last tardy job? */
+ unsigned int nontardy_jobs_ctr;
+ } stats;
+
+ in_times_t times;
+ in_times_t backup;
+
+ /* is this task under control of litmus?
+ *
+ * this is necessary because otherwise signal delivery code
+ * may try to wake up a task that is already queued in plugin
+ * data structures.
+ */
+ int litmus_controlled:1;
+ int subject_to_srp:1;
+
+
+ /* This field can be used by plugins to store where the task
+ * is currently scheduled. It is the responsibility of the
+ * plugin to avoid race conditions.
+ */
+ int scheduled_on;
+
+ /* This field can be used by plugins to store where the task
+ * is currently linked. It is the responsibility of the plugin
+ * to avoid race conditions.
+ */
+ int linked_on;
+} task_rt_param_t;
+
+/* Possible RT flags */
+#define RT_F_RUNNING 0x00000000
+#define RT_F_SLEEP 0x00000001
+#define RT_F_EXP_QUANTA 0x00000002
+#define RT_F_NON_PREEMTABLE 0x00000004
+#define RT_F_EXIT_SEM 0x00000008
+
+/* Realtime utility macros */
+#define get_passed_quanta(t) ((t)->rt_param.times.exec_time)
+#define inc_passed_quanta(t) ((t)->rt_param.times.exec_time += 1)
+#define get_rt_flags(t) ((t)->rt_param.flags)
+#define set_rt_flags(t,f) (t)->rt_param.flags=(f)
+#define get_exec_cost(t) ((t)->rt_param.basic_params.exec_cost)
+#define get_rt_period(t) ((t)->rt_param.basic_params.period)
+#define set_rt_period(t,p) (t)->rt_param.basic_params.period=(p)
+#define set_exec_cost(t,e) (t)->rt_param.basic_params.exec_cost=(e)
+#define get_partition(t) (t)->rt_param.basic_params.cpu
+#define get_deadline(t) ((t)->rt_param.times.deadline)
+#define get_class(t) ((t)->rt_param.basic_params.class)
+
+#define is_realtime(t) ((t)->rt_param.is_realtime)
+#define is_subject_to_srp(t) ((t)->rt_param.subject_to_srp)
+#define is_hrt(t) \
+ ((t)->rt_param.basic_params.class == RT_CLASS_HARD)
+#define is_srt(t) \
+ ((t)->rt_param.basic_params.class == RT_CLASS_SOFT)
+#define is_be(t) \
+ ((t)->rt_param.basic_params.class == RT_CLASS_BEST_EFFORT)
+#define is_np(t) ((t)->rt_param.is_non_preemptable)
+
+#define clear_rt_params(t) \
+memset(&(t)->rt_param,0, sizeof(struct task_rt_param))
+
+#define get_last_release_time(t) ((t)->rt_param.times.last_release)
+#define set_last_release_time(t,r) ((t)->rt_param.times.last_release=(r))
+
+#define get_release(t) ((t)->rt_param.times.release)
+#define set_release(t,r) ((t)->rt_param.times.release=(r))
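+
+/* Example (illustrative; assumes HZ=1000 so that one jiffy is 1 ms):
+ * a task with a 10 ms execution cost and a 40 ms period could be set up as
+ *
+ *	set_exec_cost(t, 10);
+ *	set_rt_period(t, 40);
+ *	t->rt_param.basic_params.class = RT_CLASS_HARD;
+ */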
+
+/* honor the flag that is set when scheduling is in progress
+ * This is some dirty hack in Linux that creates race conditions in our code
+ * if we don't pay attention to it.
+ */
+#define is_running(t) \
+ ((t)->state == TASK_RUNNING || \
+ (t)->thread_info->preempt_count & PREEMPT_ACTIVE)
+
+#define is_blocked(t) (!is_running(t))
+#define is_released(t) (time_before_eq((t)->rt_param.times.release, jiffies))
+#define is_tardy(t) (time_before_eq((t)->rt_param.times.deadline, jiffies))
+#define task_slack(t) ( (int) (t)->rt_param.times.deadline - (int) jiffies - \
+ (int) ((t)->rt_param.basic_params.exec_cost - \
+ (t)->rt_param.times.exec_time))
+
+
+/* real-time comparison macros */
+#define earlier_deadline(a, b) (time_before(\
+ (a)->rt_param.times.deadline,\
+ (b)->rt_param.times.deadline))
+#define earlier_release(a, b) (time_before(\
+ (a)->rt_param.times.release,\
+ (b)->rt_param.times.release))
+
+#define backup_times(t) do { (t)->rt_param.backup=(t)->rt_param.times; \
+ } while(0);
+#define restore_times(t) do { (t)->rt_param.times=(t)->rt_param.backup; \
+ } while(0);
+
+
+#endif
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4463735..f533ae3 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -3,6 +3,8 @@
#include <linux/auxvec.h> /* For AT_VECTOR_SIZE */
+#include <linux/rt_param.h>
+
/*
* cloning flags:
*/
@@ -26,6 +28,8 @@
#define CLONE_STOPPED 0x02000000 /* Start in stopped state */
#define CLONE_NEWUTS 0x04000000 /* New utsname group? */
#define CLONE_NEWIPC 0x08000000 /* New ipcs */
+#define CLONE_REALTIME 0x10000000 /* LITMUS real-time task creation */
+
/*
* Scheduling policies
@@ -1051,6 +1055,12 @@ struct task_struct {
#ifdef CONFIG_FAULT_INJECTION
int make_it_fail;
#endif
+ /* litmus parameters and state */
+ task_rt_param_t rt_param;
+
+ /* allow scheduler plugins to queue in release lists, etc. */
+ struct list_head rt_list;
+
};
static inline pid_t process_group(struct task_struct *tsk)
diff --git a/include/linux/sched_plugin.h b/include/linux/sched_plugin.h
new file mode 100644
index 0000000..6f09512
--- /dev/null
+++ b/include/linux/sched_plugin.h
@@ -0,0 +1,168 @@
+/*
+ * Definition of the scheduler plugin interface.
+ *
+ */
+#ifndef _LINUX_SCHED_PLUGIN_H_
+#define _LINUX_SCHED_PLUGIN_H_
+
+#include <linux/sched.h>
+
+/* struct for semaphore with priority inheritance */
+struct pi_semaphore {
+ atomic_t count;
+ int sleepers;
+ wait_queue_head_t wait;
+ union {
+ /* highest-prio holder/waiter */
+ struct task_struct *task;
+ struct task_struct* cpu_task[NR_CPUS];
+ } hp;
+ /* current lock holder */
+ struct task_struct *holder;
+ /* is the semaphore being used? */
+ int used;
+};
+
+
+/* Enforce runqueues to be opaque objects.
+ *
+ * This allows us to pass around pointers to runqueues,
+ * without actually having to rip it out of sched.c. It
+ * also discourages plugins from trying to be
+ * overly clever.
+ */
+typedef void runqueue_t;
+
+/********************* real-time callbacks ********************/
+
+/* Special plugin shutdown hook that clears plugin data structures.
+ Currently not supported.
+*/
+typedef void (*plugin_shutdown_hook_t) (void);
+
+
+/********************* scheduler invocation ******************/
+
+typedef enum {
+ NO_RESCHED = 0,
+ FORCE_RESCHED = 1
+} reschedule_check_t;
+
+
+/* Plugin-specific realtime tick handler */
+typedef reschedule_check_t (*scheduler_tick_t) (void);
+/* The plugin's main scheduling decision function */
+typedef int (*schedule_t) (struct task_struct * prev,
+ struct task_struct ** next,
+ runqueue_t * rq);
+/* Clean up after the task switch has occurred.
+ * This function is called after every (even non-rt) task switch.
+ */
+typedef void (*finish_switch_t)(struct task_struct *prev);
+
+
+/********************* task state changes ********************/
+
+/* called to setup a new real-time task */
+typedef long (*prepare_task_t) (struct task_struct *task);
+/* called to re-introduce a task after blocking */
+typedef void (*wake_up_task_t) (struct task_struct *task);
+/* called to notify the plugin of a blocking real-time task
+ * it will only be called for real-time tasks and before schedule is called */
+typedef void (*task_blocks_t) (struct task_struct *task);
+/* called when a real-time task exits. Free any allocated resources */
+typedef long (*tear_down_t) (struct task_struct *);
+
+/* called when a real-time task wants to enter a non-preemptable section */
+typedef long (*enter_np_t) (struct task_struct *);
+/* called when a real-time task wants to leave a non-preemptable section */
+typedef long (*exit_np_t) (struct task_struct *);
+
+
+/* Called when the new_owner is released from the wait queue;
+ * it should now inherit the priority from sem _before_ it gets re-added
+ * to any queue.
+ */
+typedef long (*inherit_priority_t) (struct pi_semaphore *sem,
+ struct task_struct *new_owner);
+
+/* Called when the current task releases a semaphore from which it might
+ * have inherited a priority.
+ */
+typedef long (*return_priority_t) (struct pi_semaphore *sem);
+
+/* Called when a task tries to acquire a semaphore and fails. Check if its
+ * priority is higher than that of the current holder.
+ */
+typedef long (*pi_block_t) (struct pi_semaphore *sem, struct task_struct *t);
+
+
+/********************* sys call backends ********************/
+/* This function causes the caller to sleep until the next release */
+typedef long (*sleep_next_period_t) (void);
+
+typedef int (*scheduler_setup_t) (int cmd, void __user *parameter);
+
+typedef int (*mode_change_t) (int);
+
+struct sched_plugin {
+ /* basic info */
+ char *plugin_name;
+ int ready_to_use;
+
+ /* management interface */
+ plugin_shutdown_hook_t shutdown_hook; /* currently unsupported */
+ mode_change_t mode_change;
+
+ /* scheduler invocation */
+ scheduler_tick_t scheduler_tick;
+ scheduler_tick_t algo_scheduler_tick;
+ schedule_t schedule;
+ finish_switch_t finish_switch;
+
+ /* syscall backend */
+ sleep_next_period_t sleep_next_period;
+ scheduler_setup_t scheduler_setup;
+
+ /* task state changes */
+ prepare_task_t prepare_task;
+ wake_up_task_t wake_up_task;
+ task_blocks_t task_blocks;
+ tear_down_t tear_down;
+
+ /* non-preemptable sections */
+ enter_np_t enter_np;
+ exit_np_t exit_np;
+
+ /* priority inheritance */
+ inherit_priority_t inherit_priority;
+ return_priority_t return_priority;
+ pi_block_t pi_block;
+} __attribute__ ((__aligned__(SMP_CACHE_BYTES)));
+
+typedef struct sched_plugin sched_plugin_t;
+
+extern sched_plugin_t *curr_sched_plugin;
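+
+/* Illustrative sketch of how a plugin might populate this structure (the
+ * plugin name and the my_* callbacks are made-up placeholders; which
+ * callbacks may be left NULL is plugin-specific):
+ *
+ *	static sched_plugin_t my_plugin = {
+ *		.plugin_name    = "my_edf",
+ *		.ready_to_use   = 1,
+ *		.scheduler_tick = my_scheduler_tick,
+ *		.schedule       = my_schedule,
+ *		.finish_switch  = my_finish_switch,
+ *		.prepare_task   = my_prepare_task,
+ *		.wake_up_task   = my_wake_up_task,
+ *		.task_blocks    = my_task_blocks,
+ *		.tear_down      = my_tear_down,
+ *	};
+ *
+ * curr_sched_plugin points at whichever plugin is currently active.
+ */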
+
+
+/* common scheduler tick */
+reschedule_check_t rt_scheduler_tick(void);
+
+
+/* Don't pull in our definitions on top of the real ones
+ * in sched.c!
+ */
+#ifndef __SCHED_C__
+
+/* External linux scheduler facilities */
+void deactivate_task(struct task_struct *, runqueue_t *);
+/* This function is defined in sched.c. We need access to it for
+ * indirect switching.
+ */
+void __activate_task(struct task_struct *, runqueue_t *);
+void __setscheduler(struct task_struct *, int, int);
+
+#endif
+
+extern int get_sched_options(void);
+#endif
diff --git a/include/linux/sched_trace.h b/include/linux/sched_trace.h
new file mode 100644
index 0000000..47cd4ed
--- /dev/null
+++ b/include/linux/sched_trace.h
@@ -0,0 +1,150 @@
+/* sched_trace.h -- record scheduler events to a byte stream for offline analysis.
+ */
+#ifndef _LINUX_SCHED_TRACE_H_
+#define _LINUX_SCHED_TRACE_H_
+
+#include <linux/sched.h>
+
+typedef enum {
+ ST_INVOCATION = 0,
+ ST_ARRIVAL = 1,
+ ST_DEPARTURE = 2,
+ ST_PREEMPTION = 3,
+ ST_SCHEDULED = 4,
+ ST_JOB_RELEASE = 5,
+ ST_JOB_COMPLETION = 6,
+ ST_CAPACITY_RELEASE = 7,
+ ST_CAPACITY_ALLOCATION = 8,
+} trace_type_t;
+
+typedef struct {
+ trace_type_t trace:8;
+ unsigned long long timestamp;
+} trace_header_t;
+
+
+typedef struct {
+ unsigned int is_rt:1;
+ unsigned int is_server:1;
+ task_class_t class:4;
+ unsigned int budget:24;
+ u32 deadline;
+
+ pid_t pid;
+} task_info_t;
+
+typedef struct {
+ trace_header_t header;
+ unsigned long flags;
+} invocation_record_t;
+
+typedef struct {
+ trace_header_t header;
+ task_info_t task;
+} arrival_record_t;
+
+typedef struct {
+ trace_header_t header;
+ task_info_t task;
+} departure_record_t;
+
+typedef struct {
+ trace_header_t header;
+ task_info_t task;
+ task_info_t by;
+} preemption_record_t;
+
+typedef struct {
+ trace_header_t header;
+ task_info_t task;
+} scheduled_record_t;
+
+typedef struct {
+ trace_header_t header;
+ task_info_t task;
+ u16 period;
+ u16 wcet;
+} release_record_t;
+
+typedef struct {
+ trace_header_t header;
+ task_info_t task;
+ u16 period;
+ u16 wcet;
+ int tardiness;
+} completion_record_t;
+
+typedef struct {
+ trace_header_t header;
+ task_info_t task;
+} cap_release_record_t;
+
+typedef struct {
+ trace_header_t header;
+ task_info_t task;
+ u16 budget;
+ u32 deadline;
+ pid_t donor;
+} cap_allocation_record_t;
+
+#ifdef CONFIG_SCHED_TASK_TRACE
+void sched_trace_scheduler_invocation(void);
+
+void sched_trace_task_arrival(struct task_struct *t);
+void sched_trace_task_departure(struct task_struct *t);
+void sched_trace_task_preemption(struct task_struct *t,
+ struct task_struct* by);
+void sched_trace_task_scheduled(struct task_struct *);
+
+void sched_trace_job_release(struct task_struct *t);
+void sched_trace_job_completion(struct task_struct *t);
+
+void sched_trace_capacity_release(struct task_struct *t);
+void sched_trace_capacity_allocation(struct task_struct *t,
+ u16 budget, u32 deadline, pid_t donor);
+
+void sched_trace_capacity_alloc_srv(pid_t srv, u32 srv_dl, task_class_t cls,
+ u16 srv_budget,
+ u16 budget, u32 deadline, pid_t donor);
+
+void sched_trace_server_release(int id, unsigned int wcet,
+ unsigned int period,
+ task_class_t class);
+
+void sched_trace_server_completion(int id, unsigned int budget,
+ jiffie_t deadline,
+ task_class_t class);
+
+void sched_trace_server_scheduled(int id, task_class_t class,
+ unsigned int budget, jiffie_t deadline);
+
+#else
+#define sched_trace_scheduler_invocation(x)
+
+#define sched_trace_task_arrival(t)
+#define sched_trace_task_departure(t)
+#define sched_trace_task_preemption(t, by)
+#define sched_trace_task_scheduled(t)
+#define sched_trace_job_release(t)
+#define sched_trace_job_completion(t)
+#define sched_trace_capacity_release(t)
+#define sched_trace_capacity_allocation(t, budget, deadline, donor)
+#define sched_trace_capacity_alloc_srv(srv, srv_dl, cls, srv_budget,\
+ budget, deadline, donor)
+#define sched_trace_server_release(id, wcet, period, class)
+#define sched_trace_server_completion(id, budget, deadline, class)
+#define sched_trace_server_scheduled(id, class, budget, deadline)
+#endif
+
+
+#ifdef CONFIG_SCHED_DEBUG_TRACE
+void sched_trace_log_message(const char* fmt, ...);
+
+#else
+
+#define sched_trace_log_message(fmt, ...)
+
+#endif
+
+
+#endif
diff --git a/include/linux/trace.h b/include/linux/trace.h
new file mode 100644
index 0000000..9e457aa
--- /dev/null
+++ b/include/linux/trace.h
@@ -0,0 +1,74 @@
+
+#ifndef _SYS_TRACE_H_
+#define _SYS_TRACE_H_
+
+#include <linux/feather_trace.h>
+#include <linux/feather_buffer.h>
+
+
+/*********************** TIMESTAMPS ************************/
+
+struct timestamp {
+ unsigned long event;
+ unsigned long long timestamp;
+ unsigned int seq_no;
+ int cpu;
+};
+
+
+/* buffer holding time stamps - will be provided by driver */
+extern struct ft_buffer* trace_ts_buf;
+
+/* save_timestamp: stores current time as struct timestamp
+ * in trace_ts_buf
+ */
+asmlinkage void save_timestamp(unsigned long event);
+
+#define TIMESTAMP(id) ft_event0(id, save_timestamp)
+
+/* Convention for timestamps
+ * =========================
+ *
+ * In order to process the trace files with a common tool, we use the following
+ * convention to measure execution times: The end time id of a code segment is
+ * always the next number after the start time event id.
+ */
+
+#define TS_SCHED_START TIMESTAMP(100)
+#define TS_SCHED_END TIMESTAMP(101)
+#define TS_CXS_START TIMESTAMP(102)
+#define TS_CXS_END TIMESTAMP(103)
+
+#define TS_TICK_START TIMESTAMP(110)
+#define TS_TICK_END TIMESTAMP(111)
+
+#define TS_PLUGIN_SCHED_START TIMESTAMP(120)
+#define TS_PLUGIN_SCHED_END TIMESTAMP(121)
+
+#define TS_PLUGIN_TICK_START TIMESTAMP(130)
+#define TS_PLUGIN_TICK_END TIMESTAMP(131)
+
+#define TS_ENTER_NP_START TIMESTAMP(140)
+#define TS_ENTER_NP_END TIMESTAMP(141)
+
+#define TS_EXIT_NP_START TIMESTAMP(150)
+#define TS_EXIT_NP_END TIMESTAMP(151)
+
+#define TS_SRP_UP_START TIMESTAMP(160)
+#define TS_SRP_UP_END TIMESTAMP(161)
+#define TS_SRP_DOWN_START TIMESTAMP(162)
+#define TS_SRP_DOWN_END TIMESTAMP(163)
+
+#define TS_PI_UP_START TIMESTAMP(170)
+#define TS_PI_UP_END TIMESTAMP(171)
+#define TS_PI_DOWN_START TIMESTAMP(172)
+#define TS_PI_DOWN_END TIMESTAMP(173)
+
+#define TS_FIFO_UP_START TIMESTAMP(180)
+#define TS_FIFO_UP_END TIMESTAMP(181)
+#define TS_FIFO_DOWN_START TIMESTAMP(182)
+#define TS_FIFO_DOWN_END TIMESTAMP(183)
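+
+/* Example (illustrative; ids 190/191 are made up and unused above): to time
+ * a new code section, pick an unused start id and the following id as its
+ * end, per the convention above:
+ *
+ *	#define TS_MY_SECTION_START	TIMESTAMP(190)
+ *	#define TS_MY_SECTION_END	TIMESTAMP(191)
+ *
+ *	TS_MY_SECTION_START;
+ *	// code being measured
+ *	TS_MY_SECTION_END;
+ */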
+
+
+
+#endif /* !_SYS_TRACE_H_ */
diff --git a/include/linux/wait.h b/include/linux/wait.h
index e820d00..c7e96b6 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -161,6 +161,8 @@ wait_queue_head_t *FASTCALL(bit_waitqueue(void *, int));
#define wake_up_locked(x) __wake_up_locked((x), TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE)
#define wake_up_interruptible_sync(x) __wake_up_sync((x),TASK_INTERRUPTIBLE, 1)
+#define pi_wake_up(x) __pi_wake_up(x, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1, NULL)
+
#define __wait_event(wq, condition) \
do { \
DEFINE_WAIT(__wait); \
diff --git a/kernel/Makefile b/kernel/Makefile
index 14f4d45..ce9dfa0 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -8,7 +8,12 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
signal.o sys.o kmod.o workqueue.o pid.o \
rcupdate.o extable.o params.o posix-timers.o \
kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
- hrtimer.o rwsem.o latency.o nsproxy.o srcu.o
+ hrtimer.o rwsem.o latency.o nsproxy.o srcu.o \
+ sched_plugin.o litmus.o sched_trace.o \
+ edf_common.o fifo_common.o pfair_common.o\
+ sched_global_edf.o sched_part_edf.o sched_edf_hsb.o sched_pfair.o \
+ sched_gsn_edf.o sched_psn_edf.o litmus_sem.o \
+ trace.o ft_event.o
obj-$(CONFIG_STACKTRACE) += stacktrace.o
obj-y += time/
diff --git a/kernel/edf_common.c b/kernel/edf_common.c
new file mode 100644
index 0000000..fa83450
--- /dev/null
+++ b/kernel/edf_common.c
@@ -0,0 +1,299 @@
+/*
+ * kernel/edf_common.c
+ *
+ * Common functions for EDF based scheduler.
+ */
+
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+
+#include <linux/litmus.h>
+#include <linux/sched_plugin.h>
+#include <linux/sched_trace.h>
+
+#include <linux/edf_common.h>
+
+
+static int dummy_resched(edf_domain_t *edf)
+{
+ return 0;
+}
+
+void edf_domain_init(edf_domain_t *edf, edf_check_resched_needed_t f)
+{
+ BUG_ON(!edf);
+ if (!f)
+ f = dummy_resched;
+ INIT_LIST_HEAD(&edf->ready_queue);
+ INIT_LIST_HEAD(&edf->release_queue);
+ edf->ready_lock = RW_LOCK_UNLOCKED;
+ edf->release_lock = SPIN_LOCK_UNLOCKED;
+ edf->check_resched = f;
+}
+
+
+/* edf_higher_prio - returns true if first has a higher EDF priority
+ * than second. Deadline ties are broken by PID.
+ *
+ * first must not be NULL and must be a real-time task.
+ * second may be NULL or a non-rt task.
+ */
+int edf_higher_prio(struct task_struct* first,
+ struct task_struct* second)
+{
+ struct task_struct *first_task = first;
+ struct task_struct *second_task = second;
+
+ /* Check for inherited priorities. Change task
+ * used for comparison in such a case.
+ */
+ if (first && first->rt_param.inh_task)
+ first_task = first->rt_param.inh_task;
+ if (second && second->rt_param.inh_task)
+ second_task = second->rt_param.inh_task;
+
+ return
+ /* does the second task exist and is it a real-time task? If
+ * not, the first task (which is a RT task) has higher
+ * priority.
+ */
+ !second_task || !is_realtime(second_task) ||
+
+ /* is the deadline of the first task earlier?
+ * Then it has higher priority.
+ */
+ earlier_deadline(first_task, second_task) ||
+
+ /* Do we have a deadline tie?
+ * Then break by PID.
+ */
+ (get_deadline(first_task) == get_deadline(second_task) &&
+ (first_task->pid < second_task->pid ||
+
+ /* If the PIDs are the same then the task with the inherited
+ * priority wins.
+ */
+ (first_task->pid == second_task->pid &&
+ !second->rt_param.inh_task)));
+}
+
+
+/* add_ready - add a real-time task to the edf ready queue. It must be runnable.
+ * @new: the newly released task
+ */
+void __add_ready(edf_domain_t* edf, struct task_struct *new)
+{
+ struct list_head *pos;
+ struct task_struct *queued;
+ unsigned int passed = 0;
+
+ BUG_ON(!new);
+ TRACE("edf: adding %s/%d (%u, %u) to ready queue\n",
+ new->comm, new->pid, get_exec_cost(new), get_rt_period(new));
+
+ /* find a spot where our deadline is earlier than the next */
+ list_for_each(pos, &edf->ready_queue) {
+ queued = list_entry(pos, struct task_struct, rt_list);
+ if (unlikely(edf_higher_prio(new, queued))) {
+ /* the task at pos has a later deadline */
+ /* insert the new task in front of it */
+ __list_add(&new->rt_list, pos->prev, pos);
+ goto out;
+ }
+ passed++;
+ }
+ /* if we get to this point either the list is empty or new has the
+ * lowest priority. Let's add it to the end. */
+ list_add_tail(&new->rt_list, &edf->ready_queue);
+ out:
+ if (!passed)
+ edf->check_resched(edf);
+}
+
+struct task_struct* __take_ready(edf_domain_t* edf)
+{
+ struct task_struct *t = __peek_ready(edf);
+
+ /* kick it out of the ready list */
+ if (t)
+ list_del(&t->rt_list);
+ return t;
+}
+
+
+struct task_struct* __peek_ready(edf_domain_t* edf)
+{
+ struct task_struct *t = NULL;
+ /* either not yet released, preempted, or non-rt */
+ if (!list_empty(&edf->ready_queue))
+ /* take next rt task */
+ t = list_entry(edf->ready_queue.next, struct task_struct,
+ rt_list);
+ return t;
+}
+
+
+/* add_release - add a real-time task to the edf release queue.
+ * @task: the sleeping task
+ */
+void __add_release(edf_domain_t* edf, struct task_struct *task)
+{
+ struct list_head *pos;
+ struct task_struct *queued;
+
+ BUG_ON(!task);
+ /* the caller must hold the lock for edf_release_queue */
+ TRACE("edf: adding %s/%d (%u, %u) to release queue\n",
+ task->comm, task->pid, get_exec_cost(task), get_rt_period(task));
+
+ /* find the last queued task with an earlier release time */
+ list_for_each_prev(pos, &edf->release_queue) {
+ queued = list_entry(pos, struct task_struct, rt_list);
+ if ((unlikely(earlier_release(queued, task)))) {
+ /* the task at pos has an earlier release */
+ /* insert the new task behind it */
+ __list_add(&task->rt_list, pos, pos->next);
+ return;
+ }
+ }
+ /* if we get to this point either the list is empty or task has the
+ * earliest release. Let's add it to the front. */
+ list_add(&task->rt_list, &edf->release_queue);
+}
+
+void __release_pending(edf_domain_t* edf)
+{
+ struct list_head *pos, *save;
+ struct task_struct *queued;
+ list_for_each_safe(pos, save, &edf->release_queue) {
+ queued = list_entry(pos, struct task_struct, rt_list);
+ if (likely(is_released(queued))) {
+ /* this one is ready to go*/
+ list_del(pos);
+ set_rt_flags(queued, RT_F_RUNNING);
+
+ sched_trace_job_release(queued);
+
+ /* now it can be picked up */
+ barrier();
+ add_ready(edf, queued);
+ }
+ else
+ /* the release queue is ordered */
+ break;
+ }
+}
+
+void try_release_pending(edf_domain_t* edf)
+{
+ unsigned long flags;
+
+ if (spin_trylock_irqsave(&edf->release_lock, flags)) {
+ __release_pending(edf);
+ spin_unlock_irqrestore(&edf->release_lock, flags);
+ }
+}
+
+void __prepare_new_release(struct task_struct *t, jiffie_t start) {
+ t->rt_param.times.deadline = start;
+ t->rt_param.stats.nontardy_jobs_ctr = 0xf0000000;
+ prepare_for_next_period(t);
+ set_rt_flags(t, RT_F_RUNNING);
+}
+
+void prepare_for_next_period(struct task_struct *t)
+{
+ BUG_ON(!t);
+ /* update tardy job ctr */
+ if (jiffies > t->rt_param.times.deadline)
+ t->rt_param.stats.nontardy_jobs_ctr = 0;
+ else
+ t->rt_param.stats.nontardy_jobs_ctr++;
+ /* prepare next release */
+ t->rt_param.times.release = t->rt_param.times.deadline;
+ t->rt_param.times.deadline += get_rt_period(t);
+ t->rt_param.times.exec_time = 0;
+ t->time_slice = get_exec_cost(t);
+
+ /* who uses this? statistics? */
+ t->first_time_slice = 0;
+}
+
+void prepare_new_releases(edf_domain_t *edf, jiffie_t start)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&edf->release_lock, flags);
+ write_lock(&edf->ready_lock);
+
+ __prepare_new_releases(edf, start);
+
+ write_unlock(&edf->ready_lock);
+ spin_unlock_irqrestore(&edf->release_lock, flags);
+}
+
+void __prepare_new_releases(edf_domain_t *edf, jiffie_t start)
+{
+
+ struct list_head tmp_list;
+ struct list_head *pos, *n;
+ struct task_struct *t;
+
+ INIT_LIST_HEAD(&tmp_list);
+
+ while (!list_empty(&edf->release_queue)) {
+ pos = edf->release_queue.next;
+ list_del(pos);
+ list_add(pos, &tmp_list);
+ }
+ while (!list_empty(&edf->ready_queue)) {
+ pos = edf->ready_queue.next;
+ list_del(pos);
+ list_add(pos, &tmp_list);
+ }
+
+ list_for_each_safe(pos, n, &tmp_list) {
+ t = list_entry(pos, struct task_struct, rt_list);
+ list_del(pos);
+ __prepare_new_release(t, start);
+ __add_release(edf, t);
+ }
+
+}
+
+/* preemption_needed - check whether the task t needs to be preempted;
+ * call only with irqs disabled and with ready_lock acquired
+ */
+int preemption_needed(edf_domain_t* edf, struct task_struct *t)
+{
+ /* we need the read lock for edf_ready_queue */
+ /* no need to preempt if there is nothing pending */
+ if (list_empty(&edf->ready_queue))
+ return 0;
+ /* we need to reschedule if t doesn't exist */
+ if (!t)
+ return 1;
+ /* don't preempt if t is non-preemptable */
+ if (!is_np(t))
+ /* make sure to get non-rt stuff out of the way */
+ return !is_realtime(t) || edf_higher_prio(next_ready(edf), t);
+ return 0;
+}
+
+
+/*
+ * Deactivate current task until the beginning of the next period.
+ */
+long edf_sleep_next_period(void)
+{
+ /* Mark that we do not execute anymore */
+ set_rt_flags(current, RT_F_SLEEP);
+ /* call schedule(); this will return when a new job arrives.
+ * It also takes care of preparing for the next release.
+ */
+ sched_trace_job_completion(current);
+ schedule();
+ return 0;
+}
+
diff --git a/kernel/fifo_common.c b/kernel/fifo_common.c
new file mode 100644
index 0000000..98186cd
--- /dev/null
+++ b/kernel/fifo_common.c
@@ -0,0 +1,118 @@
+/*
+ * kernel/fifo_common.c
+ *
+ * Fifo helper functions. Could one day be a FIFO plugin if someone
+ * is interested.
+ *
+ * The current FIFO implementation automatically chops Linux tasks into
+ * smaller jobs by assigning a fixed time slice. Once that time slice expires,
+ * it is treated as a new job release (that is queued in the back).
+ *
+ * The result is that it provides FIFO properties on a job level and round-robin
+ * on a task level if the tasks execute continuously.
+ */
+
+#include <asm/uaccess.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+
+#include <linux/litmus.h>
+#include <linux/sched_plugin.h>
+#include <linux/sched_trace.h>
+#include <linux/fifo_common.h>
+
+/* This function is defined in sched.c. We need access to it for
+ * indirect switching.
+ */
+void __activate_task(struct task_struct *p, runqueue_t *rq);
+
+void fifo_domain_init(fifo_domain_t* fifo, unsigned int exec_budget)
+{
+ INIT_LIST_HEAD(&fifo->queue);
+ atomic_set(&fifo->count, 0);
+ fifo->time_slice = exec_budget;
+ fifo->lock = SPIN_LOCK_UNLOCKED;
+}
+
+void fifo_add(fifo_domain_t* fifo, struct task_struct* task)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&fifo->lock, flags);
+
+ list_add_tail(&task->run_list, &fifo->queue);
+ atomic_inc(&fifo->count);
+
+ spin_unlock_irqrestore(&fifo->lock, flags);
+}
+
+void lifo_add(fifo_domain_t* fifo, struct task_struct* task)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&fifo->lock, flags);
+
+ list_add(&task->run_list, &fifo->queue);
+ atomic_inc(&fifo->count);
+
+ spin_unlock_irqrestore(&fifo->lock, flags);
+}
+
+/* This is a best-effort attempt at maintaining FIFO order.
+ * If we re-add a task coming from a preemption, it should go to
+ * the front, as it arrived earlier than the other queued tasks.
+ * Of course, this is not guaranteed to work correctly. Right now,
+ * it is only used for best-effort jobs, so it doesn't really matter
+ * all that much. A correct implementation would have to maintain
+ * arrival times and perform cross-processor preemptions...
+ */
+void fifo_enqueue(fifo_domain_t* fifo, struct task_struct* task)
+{
+ task->array = NULL;
+
+ if (!task->time_slice) {
+ task->time_slice = fifo->time_slice;
+ sched_trace_job_release(task);
+ fifo_add(fifo, task);
+ } else
+ lifo_add(fifo, task);
+}
+
+struct task_struct* __fifo_take(fifo_domain_t* fifo)
+{
+ struct task_struct * task = NULL;
+
+ if (atomic_read(&fifo->count)) {
+ BUG_ON(list_empty(&fifo->queue));
+ task = list_entry(fifo->queue.next, struct task_struct,
+ run_list);
+ list_del(fifo->queue.next);
+ atomic_dec(&fifo->count);
+ }
+
+ return task;
+}
+
+struct task_struct* fifo_take(fifo_domain_t* fifo)
+{
+ unsigned long flags;
+ struct task_struct* t;
+
+ spin_lock_irqsave(&fifo->lock, flags);
+ t = __fifo_take(fifo);
+ spin_unlock_irqrestore(&fifo->lock, flags);
+ return t;
+}
+
+
+struct task_struct* fifo_take_rq(fifo_domain_t* fifo, runqueue_t* rq, int cpu)
+{
+ struct task_struct *task = fifo_take(fifo);
+
+ if (task) {
+ set_task_cpu(task, cpu);
+ __activate_task(task, rq);
+ }
+ return task;
+}
diff --git a/kernel/fork.c b/kernel/fork.c
index d57118d..6874058 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -57,6 +57,9 @@
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
+#include <linux/litmus.h>
+#include <linux/sched_plugin.h>
+
/*
* Protected counters by write_lock_irq(&tasklist_lock)
*/
@@ -118,6 +121,9 @@ void __put_task_struct(struct task_struct *tsk)
WARN_ON(atomic_read(&tsk->usage));
WARN_ON(tsk == current);
+ if (is_realtime(tsk))
+ curr_sched_plugin->tear_down(tsk);
+
security_task_free(tsk);
free_uid(tsk->user);
put_group_info(tsk->group_info);
diff --git a/kernel/ft_event.c b/kernel/ft_event.c
new file mode 100644
index 0000000..10318ee
--- /dev/null
+++ b/kernel/ft_event.c
@@ -0,0 +1,104 @@
+#include <linux/types.h>
+
+#include <linux/feather_trace.h>
+
+/* the feather trace management functions assume
+ * exclusive access to the event table
+ */
+
+
+#define BYTE_JUMP 0xeb
+#define BYTE_JUMP_LEN 0x02
+
+/* for each event, there is an entry in the event table */
+struct trace_event {
+ long id;
+ long count;
+ long start_addr;
+ long end_addr;
+};
+
+extern struct trace_event __start___event_table[];
+extern struct trace_event __stop___event_table[];
+
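+/* Each traced event site is assumed to begin with a two-byte short jump
+ * (opcode 0xeb followed by an 8-bit displacement). Enabling an event
+ * overwrites the displacement with 0 so that execution falls through into
+ * the instrumentation code; disabling it restores a displacement that
+ * jumps to end_addr, skipping the instrumentation entirely.
+ */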
+int ft_enable_event(unsigned long id)
+{
+ struct trace_event* te = __start___event_table;
+ int count = 0;
+ char* delta;
+ unsigned char* instr;
+
+ while (te < __stop___event_table) {
+ if (te->id == id && ++te->count == 1) {
+ instr = (unsigned char*) te->start_addr;
+ /* make sure we don't clobber the wrong instructions */
+ if (*instr == BYTE_JUMP) {
+ delta = (((unsigned char*) te->start_addr) + 1);
+ *delta = 0;
+ }
+ }
+ if (te->id == id)
+ count++;
+ te++;
+ }
+ return count;
+}
+
+int ft_disable_event(unsigned long id)
+{
+ struct trace_event* te = __start___event_table;
+ int count = 0;
+ char* delta;
+ unsigned char* instr;
+
+ while (te < __stop___event_table) {
+ if (te->id == id && --te->count == 0) {
+ instr = (unsigned char*) te->start_addr;
+ if (*instr == BYTE_JUMP) {
+ delta = (((unsigned char*) te->start_addr) + 1);
+ *delta = te->end_addr - te->start_addr -
+ BYTE_JUMP_LEN;
+ }
+ }
+ if (te->id == id)
+ count++;
+ te++;
+ }
+ return count;
+}
+
+int ft_disable_all_events(void)
+{
+ struct trace_event* te = __start___event_table;
+ int count = 0;
+ char* delta;
+ unsigned char* instr;
+
+ while (te < __stop___event_table) {
+ if (te->count) {
+ instr = (unsigned char*) te->start_addr;
+ if (*instr == BYTE_JUMP) {
+ delta = (((unsigned char*) te->start_addr)
+ + 1);
+ *delta = te->end_addr - te->start_addr -
+ BYTE_JUMP_LEN;
+ te->count = 0;
+ count++;
+ }
+ }
+ te++;
+ }
+ return count;
+}
+
+int ft_is_event_enabled(unsigned long id)
+{
+ struct trace_event* te = __start___event_table;
+
+ while (te < __stop___event_table) {
+ if (te->id == id)
+ return te->count;
+ te++;
+ }
+ return 0;
+}
diff --git a/kernel/litmus.c b/kernel/litmus.c
new file mode 100644
index 0000000..02d6851
--- /dev/null
+++ b/kernel/litmus.c
@@ -0,0 +1,523 @@
+/* litmus.c -- Implementation of the LITMUS syscalls, the LITMUS initialization,
+ * and the common tick function.
+ */
+#include <asm/uaccess.h>
+#include <linux/sysrq.h>
+
+#include <linux/queuelock.h>
+#include <linux/litmus.h>
+#include <linux/sched.h>
+#include <linux/sched_plugin.h>
+
+#include <linux/trace.h>
+
+/* Variables that govern the scheduling process */
+spolicy sched_policy = SCHED_DEFAULT;
+int sched_options = 0;
+
+/* avoid races with multiple task wake-ups */
+spinlock_t litmus_task_set_lock = SPIN_LOCK_UNLOCKED;
+
+/* This is a flag for switching the system into RT mode after it has booted.
+ * In RT mode, non-real-time tasks are pushed to the background and scheduled
+ * only in the spare time left by real-time tasks.
+ */
+
+/* The system is booting in non-realtime mode */
+atomic_t rt_mode = ATOMIC_INIT(MODE_NON_RT);
+/* Here we specify a mode change to be made */
+atomic_t new_mode = ATOMIC_INIT(MODE_NON_RT);
+/* Number of RT tasks that exist in the system */
+atomic_t n_rt_tasks = ATOMIC_INIT(0);
+
+/* Only one process can perform mode change */
+static queuelock_t mode_change_lock;
+
+/* A time instant when we switched to RT mode */
+volatile jiffie_t rt_start_time = 0;
+
+/**
+ * sys_set_rt_mode
+ * @newmode: new mode the scheduler must be switched to
+ * External syscall for setting the RT mode flag
+ * Returns EINVAL if mode is not recognized or mode transition is
+ * not permitted
+ * On success 0 is returned
+ *
+ * FIXME: In a "real" OS we cannot just let any user switch the mode...
+ */
+asmlinkage long sys_set_rt_mode(int newmode)
+{
+ if ((newmode == MODE_NON_RT) || (newmode == MODE_RT_RUN)) {
+ printk(KERN_INFO "real-time mode switch to %s\n",
+ (newmode == MODE_RT_RUN ? "rt" : "non-rt"));
+ atomic_set(&new_mode, newmode);
+ return 0;
+ }
+ return -EINVAL;
+}
+
+/*
+ * sys_set_rt_task_param
+ * @pid: Pid of the task which scheduling parameters must be changed
+ * @param: New real-time extension parameters such as the execution cost and
+ * period
+ * Syscall for manipulating a task's RT extension params.
+ * Returns EFAULT if param is NULL.
+ * ESRCH if pid does not correspond to a valid task.
+ * EINVAL if either period or execution cost is <=0
+ * 0 if success
+ *
+ * FIXME: This code is racy during real-time mode.
+ */
+asmlinkage long sys_set_rt_task_param(pid_t pid, rt_param_t __user * param)
+{
+ rt_param_t tp;
+ struct task_struct *target;
+ int retval = -EINVAL;
+
+ printk("Setting up rt task parameters for process %d.\n", pid);
+
+ if (pid < 0 || param == 0) {
+ goto out;
+ }
+ if (copy_from_user(&tp, param, sizeof(tp))) {
+ retval = -EFAULT;
+ goto out;
+ }
+
+ /* Task search and manipulation must be protected */
+ read_lock_irq(&tasklist_lock);
+ if (!(target = find_task_by_pid(pid))) {
+ retval = -ESRCH;
+ goto out_unlock;
+ }
+ if (tp.exec_cost <= 0)
+ goto out_unlock;
+ if (tp.period <= 0)
+ goto out_unlock;
+ if (!cpu_online(tp.cpu))
+ goto out_unlock;
+ if (tp.period < tp.exec_cost)
+ {
+ printk(KERN_INFO "litmus: real-time task %d rejected "
+ "because wcet > period\n", pid);
+ goto out_unlock;
+ }
+
+ /* Assign params */
+ target->rt_param.basic_params = tp;
+
+ retval = 0;
+ out_unlock:
+ read_unlock_irq(&tasklist_lock);
+ out:
+ return retval;
+}
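+
+/* Minimal userspace sketch of the intended call sequence (the wrapper names
+ * are hypothetical; the userspace library is not part of this patch):
+ *
+ *	rt_param_t p = { .exec_cost = 10, .period = 100, .cpu = 0 };
+ *	set_rt_task_param(getpid(), &p);	(sys_set_rt_task_param)
+ *	prepare_rt_task(getpid());		(sys_prepare_rt_task)
+ *	set_rt_mode(MODE_RT_RUN);		(sys_set_rt_mode)
+ *	while (work_left()) {
+ *		do_work();
+ *		sleep_next_period();		(sys_sleep_next_period)
+ *	}
+ */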
+
+/* Getter for a task's RT params.
+ * returns EINVAL if param is NULL or pid is negative
+ * returns ESRCH if pid does not correspond to a valid task
+ * returns EFAULT if copying of parameters has failed.
+ */
+asmlinkage long sys_get_rt_task_param(pid_t pid, rt_param_t __user * param)
+{
+ int retval = -EINVAL;
+ struct task_struct *source;
+ rt_param_t lp;
+ if (param == 0 || pid < 0)
+ goto out;
+ read_lock(&tasklist_lock);
+ if (!(source = find_task_by_pid(pid))) {
+ retval = -ESRCH;
+ goto out_unlock;
+ }
+ lp = source->rt_param.basic_params;
+ read_unlock(&tasklist_lock);
+ /* Do copying outside the lock */
+ retval = copy_to_user(param, &lp, sizeof(lp)) ? -EFAULT : 0;
+ return retval;
+ out_unlock:
+ read_unlock(&tasklist_lock);
+ out:
+ return retval;
+
+}
+
+/*
+ * sys_prepare_rt_task
+ * @pid: Pid of the task we want to prepare for RT mode
+ * Syscall for adding a task to RT queue, plugin dependent.
+ * Must be called before RT tasks are going to start up.
+ * Returns EPERM if current plugin does not define prepare operation
+ * or scheduling policy does not allow the operation.
+ * ESRCH if pid does not correspond to a valid task.
+ * EINVAL if the task is not a real-time task or is in an
+ * invalid state (as reported by the plugin)
+ * EAGAIN if a task is not in the right state
+ * ENOMEM if there is no memory space to handle this task
+ * 0 if success
+ */
+asmlinkage long sys_prepare_rt_task(pid_t pid)
+{
+ int retval = -EINVAL;
+ struct task_struct *target = 0;
+ /* If a plugin does not define preparation mode then nothing to do */
+ if (curr_sched_plugin->prepare_task == 0
+ || sched_policy == SCHED_DEFAULT) {
+ retval = -EPERM;
+ goto out_prepare;
+ }
+ read_lock_irq(&tasklist_lock);
+ if (!(target = find_task_by_pid(pid))) {
+ retval = -ESRCH;
+ goto out_prepare_unlock;
+ }
+ if (!cpu_online(get_partition(target)))
+ {
+ printk(KERN_WARNING "litmus prepare: cpu %d is not online\n",
+ get_partition(target));
+ goto out_prepare_unlock;
+ }
+ retval = curr_sched_plugin->prepare_task(target);
+ if (!retval) {
+ atomic_inc(&n_rt_tasks);
+ target->rt_param.is_realtime = 1;
+ }
+ out_prepare_unlock:
+ read_unlock_irq(&tasklist_lock);
+ out_prepare:
+ return retval;
+}
+
+/* implemented in kernel/litmus_sem.c */
+void srp_ceiling_block(void);
+
+/*
+ * This is the crucial function for periodic task implementation,
+ * It checks if a task is periodic, checks if such kind of sleep
+ * is permitted and calls plugin-specific sleep, which puts the
+ * task into a wait array.
+ * returns 0 on successful wakeup
+ * returns EPERM if current conditions do not permit such sleep
+ * returns EINVAL if current task is not able to go to sleep
+ */
+asmlinkage long sys_sleep_next_period(void)
+{
+ int retval = -EPERM;
+ if (!is_realtime(current)) {
+ retval = -EINVAL;
+ goto out;
+ }
+ /* Task with negative or zero period cannot sleep */
+ if (get_rt_period(current) <= 0) {
+ retval = -EINVAL;
+ goto out;
+ }
+ /* The plugin has to put the task into an
+ * appropriate queue and call schedule
+ */
+ retval = curr_sched_plugin->sleep_next_period();
+ if (!retval && is_subject_to_srp(current))
+ srp_ceiling_block();
+ out:
+ return retval;
+}
+
+/*
+ * sys_enter_np() allows real-time tasks to request to enter a
+ * non-preemptable section.
+ * returns 0 if the request was granted.
+ * returns EPERM if current scheduler plugin does not allow the task to
+ * enter a non-preemptable section
+ * returns EINVAL if current task is not a real-time task
+ */
+asmlinkage long sys_enter_np(void)
+{
+ int retval = -EINVAL;
+ preempt_disable();
+ TS_ENTER_NP_START;
+ if (!is_realtime(current))
+ goto out;
+ /* Let the plugin decide. The default callback will return -EPERM.
+ */
+ retval = curr_sched_plugin->enter_np(current);
+ TRACE("enter_np(%s/%d) => %d and np=%d\n",
+ current->comm, current->pid, retval, is_np(current));
+ out:
+ TS_ENTER_NP_END;
+ preempt_enable();
+ return retval;
+}
+
+/*
+ * sys_exit_np() allows real-time tasks to signal that they leave a
+ * non-preemptable section.
+ * returns 0 if the signal was valid and processed.
+ * returns EPERM if current scheduler plugin does not allow the task to
+ * exit a non-preemptable section at the current time
+ * returns EINVAL if current task is not a real-time task
+ */
+asmlinkage long sys_exit_np(void)
+{
+ int retval = -EINVAL;
+ preempt_disable();
+ TS_EXIT_NP_START;
+ if (!is_realtime(current))
+ goto out;
+ /* Let the plugin decide. The default callback will return -EPERM.
+ */
+ retval = curr_sched_plugin->exit_np(current);
+ TRACE("exit_np(%s/%d) => %d and np=%d\n",
+ current->comm, current->pid, retval, is_np(current));
+ out:
+ TS_EXIT_NP_END;
+ preempt_enable();
+ return retval;
+}
+
+
+/* Set scheduling options for all cpus. */
+void set_sched_options(int options)
+{
+ sched_options = options;
+}
+
+/* The LITMUS tick function. It manages the change to and from real-time mode
+ * and then calls the plugin's tick function.
+ */
+reschedule_check_t __sched rt_scheduler_tick(void)
+{
+ /* Check for mode change */
+ if ((get_rt_mode() != atomic_read(&new_mode))) {
+ queue_lock(&mode_change_lock);
+ /* if the mode is already changed, proceed */
+ if (get_rt_mode() == atomic_read(&new_mode)) {
+ queue_unlock(&mode_change_lock);
+ goto proceed;
+ }
+ /* change the mode */
+ if ((atomic_read(&new_mode) == MODE_RT_RUN)) {
+ /* The deferral of entering real-time mode should be
+ * handled by deferring task releases in the plugin.
+ * The plugin interface does not really need to know
+ * about quanta, that is the plugin's job.
+ */
+
+ /* update rt start time */
+ rt_start_time = jiffies;
+ printk(KERN_INFO "Real-Time mode enabled\n");
+ }
+ if (curr_sched_plugin->mode_change)
+ curr_sched_plugin->
+ mode_change(atomic_read(&new_mode));
+ set_rt_mode(atomic_read(&new_mode));
+ queue_unlock(&mode_change_lock);
+ }
+
+ proceed:
+ /* Call plugin-defined tick handler
+ *
+ * It is the plugin's tick handler's job to detect quantum
+ * boundaries in PFAIR.
+ */
+ return curr_sched_plugin->algo_scheduler_tick();
+}
+
+asmlinkage spolicy sys_sched_setpolicy(spolicy newpolicy)
+{
+ /* Dynamic policy change is disabled at the moment */
+ return SCHED_INVALID;
+}
+
+asmlinkage spolicy sys_sched_getpolicy(void)
+{
+ return sched_policy;
+}
+
+
+asmlinkage int sys_scheduler_setup(int cmd, void __user *parameter)
+{
+ return curr_sched_plugin->scheduler_setup(cmd, parameter);
+}
+
+#ifdef CONFIG_MAGIC_SYSRQ
+/* We offer the possibility to change the real-time mode of the system
+ * with a magic SysRq request. This helps with debugging in case the system
+ * fails to perform its planned switch back to normal mode. This may happen
+ * under total system utilization if the task that is supposed to do the
+ * switch is not a real-time task and is therefore always preempted.
+ */
+
+int sys_kill(int pid, int sig);
+
+static void sysrq_handle_toGgle_rt_mode(int key, struct tty_struct *tty)
+{
+ sys_set_rt_mode(get_rt_mode() == MODE_NON_RT);
+}
+
+static struct sysrq_key_op sysrq_toGgle_rt_mode_op = {
+ .handler = sysrq_handle_toGgle_rt_mode,
+ .help_msg = "toGgle-rt-mode",
+ .action_msg = "real-time mode changed",
+};
+
+static void sysrq_handle_kill_rt_tasks(int key, struct tty_struct *tty)
+{
+ struct task_struct *t;
+ read_lock(&tasklist_lock);
+ for_each_process(t) {
+ if (is_realtime(t)) {
+ sys_kill(t->pid, SIGKILL);
+ }
+ }
+ read_unlock(&tasklist_lock);
+}
+
+static struct sysrq_key_op sysrq_kill_rt_tasks_op = {
+ .handler = sysrq_handle_kill_rt_tasks,
+ .help_msg = "Quit-rt-tasks",
+ .action_msg = "sent SIGKILL to all real-time tasks",
+};
+#endif
+
+/*
+ * Scheduler initialization so that a customized scheduler can be
+ * enabled at boot time
+ * by setting boot option "rtsched=plugin_name", e.g. "rtsched=pfair"
+ */
+
+/* All we need to know about other plugins is their initialization
+ * functions. These functions initialize internal data structures of a
+ * scheduler and return a pointer to initialized sched_plugin data
+ * structure with pointers to scheduling function implementations.
+ * If called repeatedly, these init functions just return an existing
+ * plugin pointer.
+ */
+sched_plugin_t *init_global_edf_plugin(void);
+sched_plugin_t *init_global_edf_np_plugin(void);
+sched_plugin_t *init_part_edf_plugin(void);
+sched_plugin_t *init_edf_hsb_plugin(void);
+sched_plugin_t *init_pfair_plugin(void);
+sched_plugin_t *init_gsn_edf_plugin(void);
+sched_plugin_t *init_psn_edf_plugin(void);
+
+/* keep everything needed to set up plugins in one place */
+
+/* we are lazy, so we use a convention for function naming to fill
+ * a table
+ */
+#define PLUGIN(caps, small) \
+ {PLUGIN_ ## caps, SCHED_ ## caps, init_ ## small ## _plugin}
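+/* For example, PLUGIN(GLOBAL_EDF, global_edf) expands to
+ *	{PLUGIN_GLOBAL_EDF, SCHED_GLOBAL_EDF, init_global_edf_plugin},
+ * i.e., the plugin's name constant, the matching scheduling policy id, and
+ * its init function.
+ */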
+
+#define init_nosetup_plugin 0
+
+static struct {
+ const char *name;
+ const spolicy policy_id;
+ sched_plugin_t *(*init) (void);
+} available_plugins[] = {
+ PLUGIN(LINUX, nosetup),
+ PLUGIN(GLOBAL_EDF_NP, global_edf_np),
+ PLUGIN(GLOBAL_EDF, global_edf),
+ PLUGIN(PART_EDF, part_edf),
+ PLUGIN(EDF_HSB, edf_hsb),
+ PLUGIN(PFAIR, pfair),
+ PLUGIN(GSN_EDF, gsn_edf),
+ PLUGIN(PSN_EDF, psn_edf),
+
+ /*********************************************
+ * Add your custom plugin here
+ **********************************************/
+};
+
+/* Some plugins may leave important functions unused. We define dummies
+ * so that we don't have to check for null pointers all over the place.
+ */
+void litmus_dummy_finish_switch(struct task_struct * prev);
+int litmus_dummy_schedule(struct task_struct * prev, struct task_struct** next,
+ runqueue_t* q);
+reschedule_check_t litmus_dummy_scheduler_tick(void);
+long litmus_dummy_prepare_task(struct task_struct *t);
+void litmus_dummy_wake_up_task(struct task_struct *task);
+void litmus_dummy_task_blocks(struct task_struct *task);
+long litmus_dummy_tear_down(struct task_struct *task);
+int litmus_dummy_scheduler_setup(int cmd, void __user *parameter);
+long litmus_dummy_sleep_next_period(void);
+long litmus_dummy_enter_np(struct task_struct *task);
+long litmus_dummy_exit_np(struct task_struct *task);
+long litmus_dummy_inherit_priority(struct pi_semaphore *sem,
+ struct task_struct *new_owner);
+long litmus_dummy_return_priority(struct pi_semaphore *sem);
+long litmus_dummy_pi_block(struct pi_semaphore *sem,
+ struct task_struct *t);
+
+#define CHECK(func) {\
+ if (!curr_sched_plugin->func) \
+ curr_sched_plugin->func = litmus_dummy_ ## func;}
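+/* For example, CHECK(schedule) installs litmus_dummy_schedule if the plugin
+ * left its schedule callback unset.
+ */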
+
+static int boot_sched_setup(char *plugin_name)
+{
+ int i = 0;
+
+ /* Common initializers,
+ * mode change lock is used to enforce single mode change
+ * operation.
+ */
+ queue_lock_init(&mode_change_lock);
+
+ printk("Starting LITMUS^RT kernel\n");
+
+ /* Look for a matching plugin.
+ */
+ for (i = 0; i < ARRAY_SIZE(available_plugins); i++) {
+ if (!strcmp(plugin_name, available_plugins[i].name)) {
+ printk("Using %s scheduler plugin\n", plugin_name);
+ sched_policy = available_plugins[i].policy_id;
+ if (available_plugins[i].init)
+ curr_sched_plugin = available_plugins[i].init();
+ goto out;
+ }
+ }
+
+
+ /* Otherwise we fall back to the default Linux scheduler */
+ printk("Plugin name %s is unknown, using default %s\n", plugin_name,
+ curr_sched_plugin->plugin_name);
+
+out:
+ /* make sure we don't trip over null pointers later */
+ CHECK(finish_switch);
+ CHECK(schedule);
+ CHECK(scheduler_tick);
+ CHECK(wake_up_task);
+ CHECK(tear_down);
+ CHECK(task_blocks);
+ CHECK(prepare_task);
+ CHECK(scheduler_setup);
+ CHECK(sleep_next_period);
+ CHECK(enter_np);
+ CHECK(exit_np);
+ CHECK(inherit_priority);
+ CHECK(return_priority);
+ CHECK(pi_block);
+
+#ifdef CONFIG_MAGIC_SYSRQ
+ /* offer some debugging help */
+ if (!register_sysrq_key('g', &sysrq_toGgle_rt_mode_op))
+ printk("Registered eXit real-time mode magic sysrq.\n");
+ else
+ printk("Could not register eXit real-time mode magic sysrq.\n");
+ if (!register_sysrq_key('q', &sysrq_kill_rt_tasks_op))
+ printk("Registered kill rt tasks magic sysrq.\n");
+ else
+ printk("Could not register kill rt tasks magic sysrq.\n");
+#endif
+ printk("Litmus setup complete.");
+ return 1;
+}
+
+/* Register for boot option */
+__setup("rtsched=", boot_sched_setup);
diff --git a/kernel/litmus_sem.c b/kernel/litmus_sem.c
new file mode 100644
index 0000000..71233cc
--- /dev/null
+++ b/kernel/litmus_sem.c
@@ -0,0 +1,755 @@
+
+/*
+ * SMP- and interrupt-safe semaphores. Also PI and SRP implementations.
+ * Much of the code here is borrowed from include/asm-i386/semaphore.h.
+ *
+ * NOTE: This implementation is very much a prototype and horribly insecure. It
+ * is intended to be a proof of concept, not a feature-complete solution.
+ */
+
+#include <asm/atomic.h>
+#include <asm/semaphore.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+#include <linux/spinlock.h>
+#include <linux/queuelock.h>
+#include <linux/litmus.h>
+#include <linux/sched_plugin.h>
+#include <linux/edf_common.h>
+
+
+#include <linux/trace.h>
+/* ************************************************************************** */
+/* STANDARD FIFO SEMAPHORES */
+/* ************************************************************************** */
+
+#define MAX_SEMAPHORES 256
+
+struct semaphore sems[MAX_SEMAPHORES]; /* all sems */
+typedef int sema_id; /* Userspace ID of a semaphore */
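+
+/* Expected userspace usage (the wrapper names are hypothetical; they simply
+ * stand for the corresponding syscalls below):
+ *	id = sema_init();	allocate a free semaphore slot
+ *	down(id);		FIFO-ordered acquire (sys_down)
+ *	... critical section ...
+ *	up(id);			release, waking the head waiter (sys_up)
+ *	sema_free(id);		wake all waiters and free the slot
+ */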
+
+static int rt_fifo_wake_up(wait_queue_t *wait, unsigned mode, int sync,
+ void *key)
+{
+ struct task_struct* t = (struct task_struct*) wait->private;
+ set_rt_flags(t, RT_F_EXIT_SEM);
+ TRACE_TASK(t, "woken up by rt_fifo_wake_up(), set RT_F_EXIT_SEM\n");
+ default_wake_function(wait, mode, sync, key);
+ /* for reason why we always return 1 see rt_pi_wake_up() below */
+ return 1;
+}
+
+static fastcall void rt_fifo_up(struct semaphore * sem)
+{
+ TRACE_CUR("releases lock %p\n");
+ preempt_disable();
+ TS_FIFO_UP_START;
+ if (atomic_inc_return(&sem->count) < 1)
+ /* there is a task queued */
+ wake_up(&sem->wait);
+ TS_FIFO_UP_END;
+ preempt_enable();
+}
+
+/* not optimized like the Linux down() implementation, but then
+ * again we incur the cost of a syscall anyway, so this hardly matters
+ */
+static fastcall void rt_fifo_down(struct semaphore * sem)
+{
+ struct task_struct *tsk = current;
+ wait_queue_t wait = {
+ .private = tsk,
+ .func = rt_fifo_wake_up,
+ .task_list = {NULL, NULL}
+ };
+
+ preempt_disable();
+ TS_FIFO_DOWN_START;
+
+ spin_lock(&sem->wait.lock);
+ if (atomic_dec_return(&sem->count) < 0 ||
+ waitqueue_active(&sem->wait)) {
+ /* we need to suspend */
+ tsk->state = TASK_UNINTERRUPTIBLE;
+ add_wait_queue_exclusive_locked(&sem->wait, &wait);
+
+ TRACE_CUR("suspends on lock %p\n", sem);
+
+ /* release lock before sleeping */
+ spin_unlock(&sem->wait.lock);
+
+ TS_FIFO_DOWN_END;
+ preempt_enable();
+
+ /* We depend on the FIFO order; thus, we don't need to recheck
+ * when we wake up: we are guaranteed to hold the lock, since
+ * there is only one wake-up per release.
+ */
+ schedule();
+
+ TRACE_CUR("woke up, now owns lock %p\n", sem);
+
+ /* try_to_wake_up() set our state to TASK_RUNNING,
+ * all we need to do is to remove our wait queue entry
+ */
+ spin_lock(&sem->wait.lock);
+ remove_wait_queue_locked(&sem->wait, &wait);
+ spin_unlock(&sem->wait.lock);
+ } else {
+ TRACE_CUR("acquired lock %p, no contention\n", sem);
+ spin_unlock(&sem->wait.lock);
+ TS_FIFO_DOWN_END;
+ preempt_enable();
+ }
+}
+
+
+
+/* Initialize semaphores at boot time. */
+static int __init sema_boot_init(void)
+{
+ sema_id sem_id;
+
+ printk("Initializing semaphores...");
+ for (sem_id = 0; sem_id < MAX_SEMAPHORES; sem_id++)
+ sems[sem_id].used = 0;
+ printk(" done!\n");
+
+ return 0;
+}
+__initcall(sema_boot_init);
+
+/* Find a free semaphore and return. */
+asmlinkage long sys_sema_init (void)
+{
+ sema_id sem_id;
+
+ for (sem_id = 0; sem_id < MAX_SEMAPHORES; sem_id++) {
+ if (!cmpxchg(&sems[sem_id].used, 0, 1)) {
+ sema_init(&sems[sem_id], 1);
+ return sem_id;
+ }
+ }
+ return -ENOMEM;
+}
+
+asmlinkage long sys_down(sema_id sem_id)
+{
+ if (sem_id < 0 || sem_id >= MAX_SEMAPHORES)
+ return -EINVAL;
+
+ if (!sems[sem_id].used)
+ return -EINVAL;
+ /* This allows for FIFO sems and gives others a chance... */
+ rt_fifo_down(sems + sem_id);
+ return 0;
+}
+
+asmlinkage long sys_up(sema_id sem_id)
+{
+ if (sem_id < 0 || sem_id >= MAX_SEMAPHORES)
+ return -EINVAL;
+
+ if (!sems[sem_id].used)
+ return -EINVAL;
+ rt_fifo_up(sems + sem_id);
+ return 0;
+}
+
+asmlinkage long sys_sema_free(sema_id sem_id)
+{
+ struct list_head *tmp, *next;
+ unsigned long flags;
+
+ if (sem_id < 0 || sem_id >= MAX_SEMAPHORES)
+ return -EINVAL;
+
+ if (!sems[sem_id].used)
+ return -EINVAL;
+
+ spin_lock_irqsave(&sems[sem_id].wait.lock, flags);
+ if (waitqueue_active(&sems[sem_id].wait)) {
+ list_for_each_safe(tmp, next, &sems[sem_id].wait.task_list) {
+ wait_queue_t *curr = list_entry(tmp, wait_queue_t,
+ task_list);
+ list_del(tmp);
+ set_rt_flags((struct task_struct*)curr->private,
+ RT_F_EXIT_SEM);
+ curr->func(curr,
+ TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
+ 0, NULL);
+ }
+ }
+
+ spin_unlock_irqrestore(&sems[sem_id].wait.lock, flags);
+ sems[sem_id].used = 0;
+
+ return 0;
+}
+
+
+
+
+/* ************************************************************************** */
+/* PRIORITY INHERITANCE */
+/* ************************************************************************** */
+
+
+#define MAX_PI_SEMAPHORES 256
+
+struct pi_semaphore pi_sems[MAX_PI_SEMAPHORES]; /* all PI sems */
+typedef int pi_sema_id; /* Userspace ID of a pi_semaphore */
+
+struct wq_pair {
+ struct task_struct* tsk;
+ struct pi_semaphore* sem;
+};
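+
+/* Rough overview of the PI protocol as implemented below: a task that blocks
+ * in sys_pi_down() is announced to the plugin via pi_block(); when the
+ * semaphore is released, rt_pi_wake_up() lets the plugin pass the waiter's
+ * priority to the new owner via inherit_priority(), and sys_pi_up() calls
+ * return_priority() first so the releasing task drops any inherited priority.
+ */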
+
+static int rt_pi_wake_up(wait_queue_t *wait, unsigned mode, int sync,
+ void *key)
+{
+ struct wq_pair* wqp = (struct wq_pair*) wait->private;
+ set_rt_flags(wqp->tsk, RT_F_EXIT_SEM);
+ curr_sched_plugin->inherit_priority(wqp->sem, wqp->tsk);
+ TRACE_TASK(wqp->tsk,
+ "woken up by rt_pi_wake_up() (RT_F_SEM_EXIT, PI)\n");
+ /* point to task for default_wake_function() */
+ wait->private = wqp->tsk;
+ default_wake_function(wait, mode, sync, key);
+
+ /* Always return true since we know that if we encountered a task
+ * that was already running the wake_up raced with the schedule in
+ * rt_pi_down(). In that case the task in rt_pi_down() will be scheduled
+ * immediately and own the lock. We must not wake up another task in
+ * any case.
+ */
+ return 1;
+}
+
+
+/* caller is responsible for locking */
+int edf_set_hp_task(struct pi_semaphore *sem)
+{
+ struct list_head *tmp, *next;
+ struct task_struct *queued;
+ int ret = 0;
+
+ sem->hp.task = NULL;
+ list_for_each_safe(tmp, next, &sem->wait.task_list) {
+ queued = ((struct wq_pair*)
+ list_entry(tmp, wait_queue_t,
+ task_list)->private)->tsk;
+
+ /* Compare task prios, find high prio task. */
+ if (edf_higher_prio(queued, sem->hp.task)) {
+ sem->hp.task = queued;
+ ret = 1;
+ }
+ }
+ return ret;
+}
+
+
+/* caller is responsible for locking */
+int edf_set_hp_cpu_task(struct pi_semaphore *sem, int cpu)
+{
+ struct list_head *tmp, *next;
+ struct task_struct *queued;
+ int ret = 0;
+
+ sem->hp.cpu_task[cpu] = NULL;
+ list_for_each_safe(tmp, next, &sem->wait.task_list) {
+ queued = ((struct wq_pair*)
+ list_entry(tmp, wait_queue_t,
+ task_list)->private)->tsk;
+
+ /* Compare task prios, find high prio task. */
+ if (get_partition(queued) == cpu &&
+ edf_higher_prio(queued, sem->hp.cpu_task[cpu])) {
+ sem->hp.cpu_task[cpu] = queued;
+ ret = 1;
+ }
+ }
+ return ret;
+}
+
+
+/* Initialize PI semaphores at boot time. */
+static int __init pi_sema_boot_init(void)
+{
+ pi_sema_id sem_id;
+
+ printk("Initializing PI semaphores...");
+ for (sem_id = 0; sem_id < MAX_PI_SEMAPHORES; sem_id++)
+ pi_sems[sem_id].used = 0;
+ printk(" done!\n");
+
+ return 0;
+}
+__initcall(pi_sema_boot_init);
+
+/* Find a free semaphore and return. */
+asmlinkage long sys_pi_sema_init (void)
+{
+ pi_sema_id sem_id;
+ int i = 0;
+
+ for (sem_id = 0; sem_id < MAX_PI_SEMAPHORES; sem_id++) {
+ if (!cmpxchg(&pi_sems[sem_id].used, 0, 1)) {
+ atomic_set(&pi_sems[sem_id].count, 1);
+ pi_sems[sem_id].sleepers = 0;
+ init_waitqueue_head(&pi_sems[sem_id].wait);
+ pi_sems[sem_id].hp.task = NULL;
+ pi_sems[sem_id].holder = NULL;
+ for (i = 0; i < NR_CPUS; i++)
+ pi_sems[sem_id].hp.cpu_task[i] = NULL;
+ return sem_id;
+ }
+ }
+ return -ENOMEM;
+}
+
+asmlinkage long sys_pi_down(pi_sema_id sem_id)
+{
+ struct pi_semaphore * sem;
+ unsigned long flags;
+ struct task_struct *tsk = current;
+ struct wq_pair pair;
+ long ret = -EINVAL;
+ wait_queue_t wait = {
+ .private = &pair,
+ .func = rt_pi_wake_up,
+ .task_list = {NULL, NULL}
+ };
+
+ preempt_disable();
+ TS_PI_DOWN_START;
+
+ if (sem_id < 0 || sem_id >= MAX_PI_SEMAPHORES)
+ goto out;
+
+ if (!pi_sems[sem_id].used)
+ goto out;
+
+ sem = pi_sems + sem_id;
+ pair.tsk = tsk;
+ pair.sem = sem;
+ spin_lock_irqsave(&sem->wait.lock, flags);
+
+ if (atomic_dec_return(&sem->count) < 0 ||
+ waitqueue_active(&sem->wait)) {
+ /* we need to suspend */
+ tsk->state = TASK_UNINTERRUPTIBLE;
+ add_wait_queue_exclusive_locked(&sem->wait, &wait);
+
+ TRACE_CUR("suspends on PI lock %p\n", sem);
+ curr_sched_plugin->pi_block(sem, tsk);
+
+ /* release lock before sleeping */
+ spin_unlock_irqrestore(&sem->wait.lock, flags);
+
+ TS_PI_DOWN_END;
+ preempt_enable();
+
+
+ /* We depend on the FIFO order; thus, we don't need to recheck
+ * when we wake up: we are guaranteed to hold the lock, since
+ * there is only one wake-up per release.
+ */
+ schedule();
+
+ TRACE_CUR("woke up, now owns PI lock %p\n", sem);
+
+ /* try_to_wake_up() set our state to TASK_RUNNING,
+ * all we need to do is to remove our wait queue entry
+ */
+ remove_wait_queue(&sem->wait, &wait);
+ } else {
+ /* no priority inheritance necessary, since there are no queued
+ * tasks.
+ */
+ TRACE_CUR("acquired PI lock %p, no contention\n", sem);
+ sem->holder = tsk;
+ sem->hp.task = tsk;
+ curr_sched_plugin->inherit_priority(sem, tsk);
+ spin_unlock_irqrestore(&sem->wait.lock, flags);
+ TS_PI_DOWN_END;
+ preempt_enable();
+ }
+ ret = 0;
+ return ret;
+ out:
+ /* invalid semaphore id: still balance preempt_disable() and
+ * return the error
+ */
+ TS_PI_DOWN_END;
+ preempt_enable();
+ return ret;
+}
+
+asmlinkage long sys_pi_up(pi_sema_id sem_id)
+{
+ unsigned long flags;
+ long ret = -EINVAL;
+ struct pi_semaphore * sem;
+
+ preempt_disable();
+ TS_PI_UP_START;
+
+ if (sem_id < 0 || sem_id >= MAX_PI_SEMAPHORES)
+ goto out;
+
+ if (!pi_sems[sem_id].used)
+ goto out;
+
+ sem = pi_sems + sem_id;
+ spin_lock_irqsave(&sem->wait.lock, flags);
+
+ TRACE_CUR("releases PI lock %p\n", sem);
+ curr_sched_plugin->return_priority(sem);
+ sem->holder = NULL;
+ if (atomic_inc_return(&sem->count) < 1)
+ /* there is a task queued */
+ wake_up_locked(&sem->wait);
+
+ spin_unlock_irqrestore(&sem->wait.lock, flags);
+
+ ret = 0;
+ out:
+ TS_PI_UP_END;
+ preempt_enable();
+ return ret;
+}
+
+/* Clear the wait queue, wake up waiting tasks, and free the semaphore. */
+asmlinkage long sys_pi_sema_free(pi_sema_id sem_id)
+{
+ struct list_head *tmp, *next;
+ unsigned long flags;
+
+ if (sem_id < 0 || sem_id >= MAX_PI_SEMAPHORES)
+ return -EINVAL;
+
+ if (!pi_sems[sem_id].used)
+ return -EINVAL;
+
+ spin_lock_irqsave(&pi_sems[sem_id].wait.lock, flags);
+ if (waitqueue_active(&pi_sems[sem_id].wait)) {
+ list_for_each_safe(tmp, next,
+ &pi_sems[sem_id].wait.task_list) {
+ wait_queue_t *curr = list_entry(tmp, wait_queue_t,
+ task_list);
+ list_del(tmp);
+ set_rt_flags((struct task_struct*)curr->private,
+ RT_F_EXIT_SEM);
+ curr->func(curr,
+ TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
+ 0, NULL);
+ }
+ }
+
+ spin_unlock_irqrestore(&pi_sems[sem_id].wait.lock, flags);
+ pi_sems[sem_id].used = 0;
+
+ return 0;
+}
+
+
+
+
+/* ************************************************************************** */
+/* STACK RESOURCE POLICY */
+/* ************************************************************************** */
+
+#define MAX_SRP_SEMAPHORES 256
+
+struct srp_priority {
+ struct list_head list;
+ unsigned int period;
+ pid_t pid;
+};
+
+#define list2prio(l) list_entry(l, struct srp_priority, list)
+
+static int srp_higher_prio(struct srp_priority* first,
+ struct srp_priority* second)
+{
+ if (!first->period)
+ return 0;
+ else
+ return !second->period ||
+ first->period < second->period || (
+ first->period == second->period &&
+ first->pid < second->pid);
+}
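+/* Note: a period of 0 acts as a sentinel for "no priority": a freshly
+ * initialized ceiling (period == 0) never takes precedence over one that
+ * has been raised by sys_reg_task_srp_sem() below.
+ */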
+
+struct srp {
+ struct list_head ceiling;
+ wait_queue_head_t ceiling_blocked;
+};
+
+#define system_ceiling(srp) list2prio(srp->ceiling.next)
+
+static int srp_exceeds_ceiling(struct task_struct* first,
+ struct srp* srp)
+{
+ return list_empty(&srp->ceiling) ||
+ get_rt_period(first) < system_ceiling(srp)->period ||
+ (get_rt_period(first) == system_ceiling(srp)->period &&
+ first->pid < system_ceiling(srp)->pid);
+}
+
+static void srp_add_prio(struct srp* srp, struct srp_priority* prio)
+{
+ struct list_head *pos;
+ if (in_list(&prio->list)) {
+ TRACE_CUR("WARNING: SRP violation detected, prio is already in "
+ "ceiling list!\n");
+ return;
+ }
+ list_for_each(pos, &srp->ceiling)
+ if (unlikely(srp_higher_prio(prio, list2prio(pos)))) {
+ __list_add(&prio->list, pos->prev, pos);
+ return;
+ }
+
+ list_add_tail(&prio->list, &srp->ceiling);
+}
+
+/* struct for uniprocessor SRP "semaphore" */
+struct srp_semaphore {
+ struct srp_priority ceiling;
+ int cpu; /* cpu associated with this "semaphore" and resource */
+ int claimed; /* is the resource claimed (ceiling should be used)? */
+ int used; /* is the semaphore being used? */
+};
+
+
+struct srp_semaphore srp_sems[MAX_SRP_SEMAPHORES]; /* all SRP sems */
+typedef int srp_sema_id; /* Userspace ID of a srp_semaphore */
+
+DEFINE_PER_CPU(struct srp, srp);
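+
+/* SRP usage in a nutshell: every task registers, a priori, each SRP
+ * semaphore it will use via sys_reg_task_srp_sem(), which raises that
+ * semaphore's static ceiling. At run time, sys_srp_down()/sys_srp_up()
+ * merely raise/lower the per-CPU system ceiling; the actual blocking is
+ * performed by srp_ceiling_block(), which sys_sleep_next_period() invokes
+ * for tasks subject to the SRP before they start their next job.
+ */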
+
+/* Initialize SRP semaphores at boot time. */
+static int __init srp_sema_boot_init(void)
+{
+ srp_sema_id sem_id;
+ int i;
+
+ printk("Initializing SRP semaphores...");
+ for (sem_id = 0; sem_id < MAX_SRP_SEMAPHORES; sem_id++) {
+ srp_sems[sem_id].used = 0;
+ srp_sems[sem_id].claimed = 0;
+ srp_sems[sem_id].cpu = -1;
+ INIT_LIST_HEAD(&srp_sems[sem_id].ceiling.list);
+ }
+ for (i = 0; i < NR_CPUS; i++) {
+ init_waitqueue_head(&per_cpu(srp, i).ceiling_blocked);
+ INIT_LIST_HEAD(&per_cpu(srp, i).ceiling);
+ }
+ printk(" done!\n");
+
+ return 0;
+}
+__initcall(srp_sema_boot_init);
+
+/* Find a free semaphore and return. */
+asmlinkage long sys_srp_sema_init (void)
+{
+ srp_sema_id sem_id;
+
+ if (!is_realtime(current))
+ return -EPERM;
+
+ for (sem_id = 0; sem_id < MAX_SRP_SEMAPHORES; sem_id++) {
+ if (!cmpxchg(&srp_sems[sem_id].used, 0, 1)) {
+ srp_sems[sem_id].ceiling.period = 0;
+ srp_sems[sem_id].cpu = get_partition(current);
+ return sem_id;
+ }
+ }
+ return -ENOMEM;
+}
+
+/* SRP task priority rule: smaller periods have higher priority; ties are
+ * broken by PID (see srp_higher_prio() above).
+ */
+
+/* Adjust the system-wide priority ceiling if resource is claimed. */
+asmlinkage long sys_srp_down(srp_sema_id sem_id)
+{
+ int cpu;
+ int ret = -EINVAL;
+
+ /* disabling preemptions is sufficient protection since
+ * SRP is strictly per CPU and we don't interfere with any
+ * interrupt handlers
+ */
+ preempt_disable();
+ TS_SRP_DOWN_START;
+
+
+ cpu = smp_processor_id();
+
+ if (sem_id < 0 || sem_id >= MAX_SRP_SEMAPHORES ||
+ srp_sems[sem_id].cpu != cpu)
+ goto out;
+
+ if (!srp_sems[sem_id].used)
+ goto out;
+
+ /* claim... */
+ srp_sems[sem_id].claimed = 1;
+ /* ...and update ceiling */
+ srp_add_prio(&__get_cpu_var(srp), &srp_sems[sem_id].ceiling);
+
+ ret = 0;
+ out:
+ TS_SRP_DOWN_END;
+ preempt_enable();
+ return ret;
+}
+
+/* Adjust the system-wide priority ceiling if resource is freed. */
+asmlinkage long sys_srp_up(srp_sema_id sem_id)
+{
+ int cpu;
+ int ret = -EINVAL;
+
+ preempt_disable();
+ TS_SRP_UP_START;
+
+ cpu = smp_processor_id();
+
+ if (sem_id < 0 || sem_id >= MAX_SRP_SEMAPHORES ||
+ srp_sems[sem_id].cpu != cpu)
+ goto out;
+
+ if (!srp_sems[sem_id].used)
+ goto out;
+
+ srp_sems[sem_id].claimed = 0;
+ /* Determine new system priority ceiling for this CPU. */
+ if (in_list(&srp_sems[sem_id].ceiling.list))
+ list_del(&srp_sems[sem_id].ceiling.list);
+ else
+ TRACE_CUR("WARNING: SRP violation detected, prio not in ceiling"
+ " list!\n");
+
+ /* Wake tasks on this CPU, if they exceed current ceiling. */
+ wake_up_all(&__get_cpu_var(srp).ceiling_blocked);
+ ret = 0;
+ out:
+ TS_SRP_UP_END;
+ preempt_enable();
+ return ret;
+}
+
+/* Indicate that a task will use the resource associated with a given
+ * semaphore. This should be done *a priori*, before the RT task system
+ * starts executing, so it does *not* update the system priority
+ * ceiling! (The ceiling would be meaningless anyway, as the SRP
+ * breaks without this a priori knowledge.)
+ */
+asmlinkage long sys_reg_task_srp_sem(srp_sema_id sem_id, pid_t t_pid)
+{
+ struct pid *task_pid;
+ struct task_struct *t;
+ struct srp_priority t_prio;
+
+ if (sem_id < 0 || sem_id >= MAX_SRP_SEMAPHORES)
+ return -EINVAL;
+
+ task_pid = find_get_pid(t_pid);
+ if (!task_pid)
+ return -EINVAL;
+
+ t = get_pid_task(task_pid, PIDTYPE_PID);
+ if (!t)
+ return -EINVAL;
+
+ if (!is_realtime(t))
+ return -EPERM;
+
+ if (!srp_sems[sem_id].used)
+ return -EINVAL;
+
+ if (srp_sems[sem_id].cpu != get_partition(t))
+ return -EINVAL;
+
+ preempt_disable();
+ t->rt_param.subject_to_srp = 1;
+ t_prio.period = get_rt_period(t);
+ t_prio.pid = t->pid;
+ if (srp_higher_prio(&t_prio, &srp_sems[sem_id].ceiling)) {
+ srp_sems[sem_id].ceiling.period = t_prio.period;
+ srp_sems[sem_id].ceiling.pid = t_prio.pid;
+ }
+
+ preempt_enable();
+
+ return 0;
+}
+
+static int srp_wake_up(wait_queue_t *wait, unsigned mode, int sync,
+ void *key)
+{
+ int cpu = smp_processor_id();
+ struct task_struct *tsk = wait->private;
+ if (cpu != get_partition(tsk))
+ TRACE_TASK(tsk, "srp_wake_up() on wrong cpu, partition is %d\n",
+ get_partition(tsk));
+ else if (srp_exceeds_ceiling(tsk, &__get_cpu_var(srp)))
+ return default_wake_function(wait, mode, sync, key);
+ return 0;
+}
+
+
+/* Wait for current task priority to exceed system-wide priority ceiling.
+ * Can be used to determine when it is safe to run a job after its release.
+ */
+void srp_ceiling_block(void)
+{
+ struct task_struct *tsk = current;
+ wait_queue_t wait = {
+ .private = tsk,
+ .func = srp_wake_up,
+ .task_list = {NULL, NULL}
+ };
+
+ preempt_disable();
+ if (!srp_exceeds_ceiling(tsk, &__get_cpu_var(srp))) {
+ tsk->state = TASK_UNINTERRUPTIBLE;
+ add_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait);
+ TRACE_CUR("is priority ceiling blocked.\n");
+ preempt_enable();
+ schedule();
+ TRACE_CUR("finally exceeds system ceiling.\n");
+ remove_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait);
+ } else {
+ TRACE_CUR("is not priority ceiling blocked\n");
+ preempt_enable();
+ }
+}
+
+/* Free the semaphore, marking it as unclaimed and unused. */
+asmlinkage long sys_srp_sema_free(srp_sema_id sem_id)
+{
+ int cpu;
+
+ preempt_disable();
+ cpu = smp_processor_id();
+
+ if (sem_id < 0 || sem_id >= MAX_SRP_SEMAPHORES ||
+ srp_sems[sem_id].cpu != cpu)
+ return -EINVAL;
+
+ srp_sems[sem_id].claimed = 0;
+ srp_sems[sem_id].used = 0;
+
+ preempt_enable();
+ return 0;
+}
+
+
+
+/* ************************************************************************** */
+
+
+
diff --git a/kernel/pfair_common.c b/kernel/pfair_common.c
new file mode 100644
index 0000000..a9e636d
--- /dev/null
+++ b/kernel/pfair_common.c
@@ -0,0 +1,241 @@
+/*
+ * Common functions for PFAIR based scheduler.
+ */
+
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+
+#include <linux/litmus.h>
+#include <linux/sched_plugin.h>
+#include <linux/sched_trace.h>
+
+#include <linux/pfair_common.h>
+#include <linux/pfair_math.h>
+/* Comparison of two tasks: returns nonzero if
+ * the lhs has higher priority than the rhs. */
+int is_pfair_hp(struct task_struct *lhs, struct task_struct *rhs)
+{
+ /* Favor subtasks with earlier deadlines */
+ if(time_before(get_deadline(lhs), get_deadline(rhs)))
+ return 1;
+ if(get_deadline(lhs) == get_deadline(rhs)) {
+ /* If deadlines are equal,
+ * favor non-zero b-bit (a heavy task) */
+ if(lhs->rt_param.times.b_bit > rhs->rt_param.times.b_bit)
+ return 1;
+
+ if(lhs->rt_param.times.b_bit == rhs->rt_param.times.b_bit &&
+ lhs->rt_param.times.b_bit == 1)
+ /* If b-bit is 1, favor tasks with later
+ * group deadline */
+ return time_after(lhs->rt_param.times.group_deadline,
+ rhs->rt_param.times.group_deadline);
+
+ }
+ return 0;
+}
+
+void pfair_domain_init(pfair_domain_t *pfair)
+{
+ BUG_ON(!pfair);
+ INIT_LIST_HEAD(&pfair->ready_queue);
+ INIT_LIST_HEAD(&pfair->release_queue);
+ queue_lock_init(&pfair->pfair_lock);
+ cpus_setall(pfair->domain_cpus);
+ /* Use cpu 0 to keep the system alive.
+ * TODO: Remove later or make it configurable.
+ */
+ cpu_clear(0, pfair->domain_cpus);
+}
+
+
+/* add_ready - add a real-time task to the PFAIR ready queue.
+ * It must be runnable. Global domain lock must be held before
+ * calling this function.
+ *
+ * @new: the newly released task
+ */
+void pfair_add_ready(pfair_domain_t* pfair, struct task_struct *new)
+{
+ struct list_head *pos;
+ struct task_struct *queued;
+
+ BUG_ON(!new);
+ /* find a spot where the new task has higher PFAIR priority */
+ list_for_each(pos, &pfair->ready_queue) {
+ queued = list_entry(pos, struct task_struct, rt_list);
+ if (unlikely(is_pfair_hp(new, queued))) {
+ /* the task at pos has lower priority */
+ /* insert the new task in front of it */
+ __list_add(&new->rt_list, pos->prev, pos);
+ return;
+ }
+ }
+ /* if we get to this point either the list is empty or new has the
+ * lowest priority. Let's add it to the end. */
+ list_add_tail(&new->rt_list, &pfair->ready_queue);
+}
+/**
+ * Extraction function: take the highest-priority task off the ready queue.
+ */
+struct task_struct* __pfair_take_ready(pfair_domain_t* pfair)
+{
+ struct task_struct *t = NULL;
+ /* either not yet released, preempted, or non-rt */
+ if (!list_empty(&pfair->ready_queue)) {
+
+ /* take next rt task */
+ t = list_entry(pfair->ready_queue.next, struct task_struct,
+ rt_list);
+
+ /* kick it out of the ready list */
+ list_del(&t->rt_list);
+ }
+ return t;
+}
+
+
+/* add_release - add a real-time task to the PFAIR release queue.
+ * Domain lock must be acquired before the function is called.
+ *
+ * @task: the sleeping task
+ */
+void pfair_add_release(pfair_domain_t* pfair, struct task_struct *task)
+{
+ struct list_head *pos;
+ struct task_struct *queued;
+
+ BUG_ON(!task);
+ /* find a spot where the queued task has an earlier release time */
+ list_for_each_prev(pos, &pfair->release_queue) {
+ queued = list_entry(pos, struct task_struct, rt_list);
+ if ((unlikely(time_before(queued->rt_param.times.release,
+ task->rt_param.times.release)))) {
+ /* the task at pos has an earlier release */
+ /* insert the new task behind it */
+ __list_add(&task->rt_list, pos, pos->next);
+ return;
+ }
+ }
+ /* if we get to this point either the list is empty or task has the
+ * earliest release. Let's add it to the front. */
+ list_add(&task->rt_list, &pfair->release_queue);
+}
+/**
+ * This function is called from the tick handler; it acquires the lock
+ * automatically. Only one processor effectively merges the queues.
+ */
+void pfair_try_release_pending(pfair_domain_t* pfair)
+{
+ unsigned long flags;
+ struct list_head *pos, *save;
+ struct task_struct *queued;
+ queue_lock_irqsave(&pfair->pfair_lock, flags);
+
+ list_for_each_safe(pos, save, &pfair->release_queue) {
+ queued = list_entry(pos, struct task_struct, rt_list);
+ if (likely(time_before_eq(
+ queued->rt_param.times.release, jiffies))) {
+ /* this one is ready to go*/
+ list_del(pos);
+ set_rt_flags(queued, RT_F_RUNNING);
+
+ sched_trace_job_release(queued);
+ /* now it can be picked up */
+ barrier();
+ pfair_add_ready(pfair, queued);
+ }
+ else
+ /* the release queue is ordered */
+ break;
+ }
+ queue_unlock_irqrestore(&pfair->pfair_lock, flags);
+}
+/*
+ * Subtask preparation. Assuming that last_release
+ * denotes the time when the job was released.
+ */
+void pfair_prepare_next_subtask(struct task_struct *t)
+{
+ BUG_ON(!t);
+ /* assign subtask release time, deadline, b-bit,
+ * and group deadline
+ */
+ t->rt_param.times.release = t->rt_param.times.last_release
+ +release_time(t);
+ t->rt_param.times.deadline = t->rt_param.times.last_release
+ +pfair_deadline(t);
+ t->rt_param.times.b_bit = b_bit(t);
+ t->rt_param.times.group_deadline = t->rt_param.times.last_release
+ +group_deadline(t);
+}
+
+void pfair_prepare_next_job(struct task_struct *t)
+{
+ BUG_ON(!t);
+ /* update tardy job ctr */
+ if (jiffies > t->rt_param.times.deadline)
+ t->rt_param.stats.nontardy_jobs_ctr = 0;
+ else
+ t->rt_param.stats.nontardy_jobs_ctr++;
+
+ /* prepare next job release */
+ /* reset the consumed quanta so that we can compute the new release times
+ * and deadlines for the subtasks correctly
+ */
+ t->rt_param.times.exec_time = 0;
+ /* assign job-wide release time,
+ * this is the starting point to
+ * compute subtask releases, deadlines and group deadlines
+ */
+ t->rt_param.times.last_release = t->rt_param.times.last_release
+ +get_rt_period(t);
+ /* Release the first subtask. */
+ pfair_prepare_next_subtask(t);
+ t->first_time_slice = 0;
+}
+
+void __pfair_prepare_new_release(struct task_struct *t, jiffie_t start)
+{
+ t->rt_param.times.release = start;
+ t->rt_param.times.last_release = start;
+ t->rt_param.stats.nontardy_jobs_ctr = 0xf0000000;
+ t->rt_param.times.exec_time = 0;
+ t->first_time_slice = 0;
+ pfair_prepare_next_subtask(t);
+ set_rt_flags(t, RT_F_RUNNING);
+}
+
+void pfair_prepare_new_releases(pfair_domain_t *pfair, jiffie_t start)
+{
+ unsigned long flags;
+ struct list_head tmp_list;
+ struct list_head *pos, *n;
+ struct task_struct *t;
+
+ INIT_LIST_HEAD(&tmp_list);
+
+ queue_lock_irqsave(&pfair->pfair_lock, flags);
+
+
+ while (!list_empty(&pfair->release_queue)) {
+ pos = pfair->release_queue.next;
+ list_del(pos);
+ list_add(pos, &tmp_list);
+ }
+ while (!list_empty(&pfair->ready_queue)) {
+ pos = pfair->ready_queue.next;
+ list_del(pos);
+ list_add(pos, &tmp_list);
+ }
+
+ list_for_each_safe(pos, n, &tmp_list) {
+ t = list_entry(pos, struct task_struct, rt_list);
+ list_del(pos);
+ __pfair_prepare_new_release(t, start);
+ pfair_add_release(pfair, t);
+ }
+ queue_unlock_irqrestore(&pfair->pfair_lock, flags);
+}
+
diff --git a/kernel/sched.c b/kernel/sched.c
index cca93cc..40cf184 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -56,6 +56,14 @@
#include <asm/unistd.h>
+#include <linux/litmus.h>
+#define __SCHED_C__
+#include <linux/sched_plugin.h>
+#include <linux/sched_trace.h>
+#include <linux/rt_param.h>
+
+#include <linux/trace.h>
+
/*
* Convert user-nice values [ -20 ... 0 ... 19 ]
* to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -836,7 +844,7 @@ static int effective_prio(struct task_struct *p)
* keep the priority unchanged. Otherwise, update priority
* to the normal priority:
*/
- if (!rt_prio(p->prio))
+ if (!rt_prio(p->prio) && !is_realtime(p))
return p->normal_prio;
return p->prio;
}
@@ -844,7 +852,7 @@ static int effective_prio(struct task_struct *p)
/*
* __activate_task - move a task to the runqueue.
*/
-static void __activate_task(struct task_struct *p, struct rq *rq)
+void __activate_task(struct task_struct *p, struct rq *rq)
{
struct prio_array *target = rq->active;
@@ -999,7 +1007,7 @@ out:
/*
* deactivate_task - remove a task from the runqueue.
*/
-static void deactivate_task(struct task_struct *p, struct rq *rq)
+void deactivate_task(struct task_struct *p, struct rq *rq)
{
dec_nr_running(p, rq);
dequeue_task(p, p->array);
@@ -1408,13 +1416,44 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
#endif
rq = task_rq_lock(p, &flags);
+
+ if (is_realtime(p))
+ TRACE("try_to_wake_up(%s/%d)\n", p->comm, p->pid);
+
old_state = p->state;
if (!(old_state & state))
- goto out;
+ goto out;
if (p->array)
goto out_running;
+
+ spin_lock(&litmus_task_set_lock);
+ if (p->rt_param.litmus_controlled) {
+ /* Already included. This can happen
+ * if the task dropped all locks to call
+ * schedule() but a wake up raced and came in
+ * early.
+ */
+
+ spin_unlock(&litmus_task_set_lock);
+ goto out_running;
+ }
+
+ sched_trace_task_arrival(p);
+ if (is_realtime(p)) {
+ p->rt_param.litmus_controlled = 1;
+ curr_sched_plugin->wake_up_task(p);
+
+ spin_unlock(&litmus_task_set_lock);
+ goto out_running;
+ }
+
+ p->rt_param.litmus_controlled = 0;
+ spin_unlock(&litmus_task_set_lock);
+
+
+
cpu = task_cpu(p);
this_cpu = smp_processor_id();
@@ -1580,6 +1619,7 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags)
cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
#endif
set_task_cpu(p, cpu);
+ clear_rt_params(p);
/*
* We mark the process as running here, but have not actually
@@ -1595,6 +1635,10 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags)
p->prio = current->normal_prio;
INIT_LIST_HEAD(&p->run_list);
+ INIT_LIST_HEAD(&p->rt_list);
+ p->rt_param.basic_params.class = RT_CLASS_BEST_EFFORT;
+ p->rt_param.litmus_controlled = 0;
+ p->rt_param.inh_task = NULL;
p->array = NULL;
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
if (unlikely(sched_info_on()))
@@ -1647,6 +1691,13 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
unsigned long flags;
int this_cpu, cpu;
+ if (clone_flags & CLONE_REALTIME) {
+ /* just mark the task as stopped */
+ /* CLEANUP: Do we have to remove the task from the rq? */
+ p->state = TASK_STOPPED;
+ return;
+ }
+
rq = task_rq_lock(p, &flags);
BUG_ON(p->state != TASK_RUNNING);
this_cpu = smp_processor_id();
@@ -1730,6 +1781,9 @@ void fastcall sched_exit(struct task_struct *p)
unsigned long flags;
struct rq *rq;
+ if (is_realtime(p))
+ return;
+
/*
* If the child was a (relative-) CPU hog then decrease
* the sleep_avg of the parent as well.
@@ -1801,6 +1855,13 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
*/
prev_state = prev->state;
finish_arch_switch(prev);
+ /* Requeue the previous real-time task before we drop the rq lock, because
+ * dropping it may lead to a preemption.
+ */
+ curr_sched_plugin->finish_switch(prev);
+ sched_trace_task_scheduled(current);
+ /* trace before IRQs are enabled */
+ TS_CXS_END;
finish_lock_switch(rq, prev);
if (mm)
mmdrop(mm);
@@ -1811,7 +1872,7 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
*/
kprobe_flush_task(prev);
put_task_struct(prev);
- }
+ }
}
/**
@@ -2990,7 +3051,7 @@ static inline void idle_balance(int cpu, struct rq *rq)
static inline void wake_priority_sleeper(struct rq *rq)
{
#ifdef CONFIG_SCHED_SMT
- if (!rq->nr_running)
+ if (!rq->nr_running || get_rt_mode() == MODE_RT_RUN)
return;
spin_lock(&rq->lock);
@@ -3220,14 +3281,29 @@ void scheduler_tick(void)
update_cpu_clock(p, rq, now);
- if (p == rq->idle)
- /* Task on the idle queue */
- wake_priority_sleeper(rq);
- else
- task_running_tick(rq, p);
+ /* check whether the RT scheduler plugin requires a call to
+ * schedule
+ */
+ TS_PLUGIN_TICK_START;
+ if (curr_sched_plugin->scheduler_tick() == FORCE_RESCHED)
+ set_tsk_need_resched(p);
+ TS_PLUGIN_TICK_END;
+
+ /* real-time accounting is done by the plugin
+ * call linux functions only for background tasks
+ */
+ if (!is_realtime(p)) {
+ if (p == rq->idle)
+ /* Task on the idle queue */
+ wake_priority_sleeper(rq);
+ else
+ task_running_tick(rq, p);
+ }
+
#ifdef CONFIG_SMP
update_load(rq);
- if (time_after_eq(jiffies, rq->next_balance))
+ if (time_after_eq(jiffies, rq->next_balance) &&
+ get_rt_mode() == MODE_NON_RT)
raise_softirq(SCHED_SOFTIRQ);
#endif
}
@@ -3420,6 +3496,7 @@ asmlinkage void __sched schedule(void)
long *switch_count;
struct rq *rq;
+
/*
* Test if we are atomic. Since do_exit() needs to call into
* schedule() atomically, we ignore that path for now.
@@ -3427,8 +3504,9 @@ asmlinkage void __sched schedule(void)
*/
if (unlikely(in_atomic() && !current->exit_state)) {
printk(KERN_ERR "BUG: scheduling while atomic: "
- "%s/0x%08x/%d\n",
- current->comm, preempt_count(), current->pid);
+ "%s/0x%08x/%d %s\n",
+ current->comm, preempt_count(), current->pid,
+ is_realtime(current) ? "rt" : "non-rt");
debug_show_held_locks(current);
if (irqs_disabled())
print_irqtrace_events(current);
@@ -3438,6 +3516,7 @@ asmlinkage void __sched schedule(void)
need_resched:
preempt_disable();
+ TS_SCHED_START;
prev = current;
release_kernel_lock(prev);
need_resched_nonpreemptible:
@@ -3470,6 +3549,7 @@ need_resched_nonpreemptible:
spin_lock_irq(&rq->lock);
switch_count = &prev->nivcsw;
+ /* check for blocking tasks */
if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
switch_count = &prev->nvcsw;
if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
@@ -3478,13 +3558,66 @@ need_resched_nonpreemptible:
else {
if (prev->state == TASK_UNINTERRUPTIBLE)
rq->nr_uninterruptible++;
+ /* we need to remove real-time tasks from the runqueue */
+
+ /* protect against races with signal delivery and IO
+ * interrupts on other CPUs
+ *
+ * FIXME: This is probably not sufficient,
+ * as (in theory) after
+ * unlocking the task_set_lock this task could
+ * be scheduled elsewere before we switched away
+ * from it. This has not been observed
+ * yet. To get this locking right is tricky.
+ */
+ spin_lock(&litmus_task_set_lock);
+ if (prev->rt_param.litmus_controlled)
+ prev->rt_param.litmus_controlled = 0;
+ spin_unlock(&litmus_task_set_lock);
+
+ if (is_realtime(prev)) {
+ TRACE("schedule: %s/%d blocks. state = %d\n",
+ prev->comm, prev->pid, prev->state);
+ curr_sched_plugin->task_blocks(prev);
+ /* Enable this for all tasks to get _a lot_ of
+ * data. Can be helpful for debugging.
+ */
+ sched_trace_task_departure(prev);
+ }
+
+ /* only indirect switching is supported in the current
+ * version of LITMUS
+ */
deactivate_task(prev, rq);
}
}
+ next = NULL;
+
+ /* consult the real-time plugin */
+ TS_PLUGIN_SCHED_START;
+ curr_sched_plugin->schedule(prev, &next, rq);
+ TS_PLUGIN_SCHED_END;
+ /* If the real-time plugin wants to switch to a specific task
+ * it'll be on the rq and have the highest priority. There will
+ * be exactly one such task, thus the selection of the next task
+ * is unambiguous and the following code can only get
+ * triggered if there are no RT tasks pending (on this CPU). Thus,
+ * we may as well skip it.
+ */
+ if (next)
+ goto switch_tasks;
+
cpu = smp_processor_id();
if (unlikely(!rq->nr_running)) {
- idle_balance(cpu, rq);
+ /* only load-balance if we are not in RT mode
+ *
+ * TODO: Maybe this can be relaxed by modifying the
+ * load-balancing routines in such a way that they never touch
+ * real-time tasks.
+ */
+ if (get_rt_mode() == MODE_NON_RT)
+ idle_balance(cpu, rq);
if (!rq->nr_running) {
next = rq->idle;
rq->expired_timestamp = 0;
@@ -3528,7 +3661,7 @@ need_resched_nonpreemptible:
}
}
next->sleep_type = SLEEP_NORMAL;
- if (dependent_sleeper(cpu, rq, next))
+ if (get_rt_mode() == MODE_NON_RT && dependent_sleeper(cpu, rq, next))
next = rq->idle;
switch_tasks:
if (next == rq->idle)
@@ -3546,7 +3679,11 @@ switch_tasks:
prev->timestamp = prev->last_ran = now;
sched_info_switch(prev, next);
+ TS_SCHED_END;
if (likely(prev != next)) {
+ TS_CXS_START;
+ if (is_running(prev))
+ sched_trace_task_preemption(prev, next);
next->timestamp = now;
rq->nr_switches++;
rq->curr = next;
@@ -3560,9 +3697,10 @@ switch_tasks:
* CPUs since it called schedule(), thus the 'rq' on its stack
* frame will be invalid.
*/
- finish_task_switch(this_rq(), prev);
- } else
+ finish_task_switch(this_rq(), prev);
+ } else {
spin_unlock_irq(&rq->lock);
+ }
prev = current;
if (unlikely(reacquire_kernel_lock(prev) < 0))
@@ -3691,6 +3829,7 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
}
}
+
/**
* __wake_up - wake up threads blocked on a waitqueue.
* @q: the waitqueue
@@ -3709,6 +3848,7 @@ void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
}
EXPORT_SYMBOL(__wake_up);
+
/*
* Same as __wake_up but called with the spinlock in wait_queue_head_t held.
*/
@@ -3717,6 +3857,7 @@ void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
__wake_up_common(q, mode, 1, 0, NULL);
}
+
/**
* __wake_up_sync - wake up threads blocked on a waitqueue.
* @q: the waitqueue
@@ -4175,7 +4316,7 @@ static inline struct task_struct *find_process_by_pid(pid_t pid)
}
/* Actually do priority change: must hold rq lock. */
-static void __setscheduler(struct task_struct *p, int policy, int prio)
+void __setscheduler(struct task_struct *p, int policy, int prio)
{
BUG_ON(p->array);
@@ -5397,7 +5538,7 @@ static struct notifier_block __cpuinitdata migration_notifier = {
.priority = 10
};
-int __init migration_init(void)
+int __init linux_migration_init(void)
{
void *cpu = (void *)(long)smp_processor_id();
int err;
@@ -6859,7 +7000,7 @@ static int update_sched_domains(struct notifier_block *nfb,
return NOTIFY_OK;
}
-void __init sched_init_smp(void)
+void __init linux_sched_init_smp(void)
{
cpumask_t non_isolated_cpus;
@@ -6877,7 +7018,7 @@ void __init sched_init_smp(void)
BUG();
}
#else
-void __init sched_init_smp(void)
+void __init linux_sched_init_smp(void)
{
}
#endif /* CONFIG_SMP */
@@ -6892,7 +7033,7 @@ int in_sched_functions(unsigned long addr)
&& addr < (unsigned long)__sched_text_end);
}
-void __init sched_init(void)
+void __init linux_sched_init(void)
{
int i, j, k;
diff --git a/kernel/sched_edf_hsb.c b/kernel/sched_edf_hsb.c
new file mode 100644
index 0000000..d190426
--- /dev/null
+++ b/kernel/sched_edf_hsb.c
@@ -0,0 +1,1802 @@
+/*
+ * kernel/sched_edf_hsb.c
+ *
+ * Implementation of the EDF-HSB scheduler plugin.
+ *
+ */
+
+#include <asm/uaccess.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+
+#include <linux/litmus.h>
+#include <linux/sched_plugin.h>
+#include <linux/edf_common.h>
+#include <linux/fifo_common.h>
+#include <linux/sched_trace.h>
+
+/* undefine to remove capacity sharing */
+#define HSB_CAP_SHARE_ENABLED
+
+/* fake server PIDs */
+#define HRT_BASE_PID 50000
+#define SRT_BASE_PID 60000
+
+
+/******************************************************************************/
+/* Capacity queue */
+/******************************************************************************/
+
+int cap_check_resched(jiffie_t deadline);
+
+typedef struct {
+ int budget;
+ jiffie_t deadline;
+ pid_t donor;
+
+ struct list_head list;
+} capacity_t;
+
+typedef struct {
+ spinlock_t lock;
+ struct list_head queue;
+} capacity_queue_t;
+
+#define next_cap(q) list_entry((q)->queue.next, capacity_t, list)
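+
+/* The capacity queue is kept ordered by non-decreasing deadline, so
+ * next_cap(q) yields the capacity with the earliest deadline. Expired
+ * capacities are discarded lazily whenever the queue is inspected.
+ */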
+
+void capacity_queue_init(capacity_queue_t* queue)
+{
+ queue->lock = SPIN_LOCK_UNLOCKED;
+ INIT_LIST_HEAD(&queue->queue);
+}
+
+void __add_capacity(capacity_queue_t* queue, capacity_t *cap)
+{
+ struct list_head* pos;
+ capacity_t* queued;
+
+ list_for_each_prev(pos, &queue->queue) {
+ queued = list_entry(pos, capacity_t, list);
+		if (time_before_eq(queued->deadline, cap->deadline)) {
+ __list_add(&cap->list, pos, pos->next);
+ return;
+ }
+ }
+ list_add(&cap->list, &queue->queue);
+}
+
+int __capacity_available(capacity_queue_t* queue)
+{
+ capacity_t *cap;
+
+ while (!list_empty(&queue->queue)) {
+ cap = list_entry(queue->queue.next, capacity_t, list);
+
+
+ if (time_before_eq(cap->deadline, jiffies)) {
+ list_del(queue->queue.next);
+ kfree(cap);
+ cap = NULL;
+ } else
+ break;
+ }
+
+ return !list_empty(&queue->queue);
+}
+
+void __return_capacity(capacity_queue_t* queue, capacity_t *cap)
+{
+ if (!cap->budget || time_before_eq(cap->deadline, jiffies))
+ kfree(cap);
+ else
+ __add_capacity(queue, cap);
+}
+
+
+void return_capacity(capacity_queue_t* queue, capacity_t *cap)
+{
+ unsigned long flags;
+
+ if (!cap->budget || time_before_eq(cap->deadline, jiffies))
+ kfree(cap);
+ else {
+ spin_lock_irqsave(&queue->lock, flags);
+ __add_capacity(queue, cap);
+ spin_unlock_irqrestore(&queue->lock, flags);
+ }
+}
+
+
+#define MIN_TIME_DELTA 1
+#define MIN_BUDGET 1
+
+#ifdef HSB_CAP_SHARE_ENABLED
+void release_capacity(capacity_queue_t* queue, unsigned int budget,
+ jiffie_t deadline, struct task_struct* t)
+{
+ capacity_t* cap;
+ unsigned long flags;
+
+ if (deadline >= jiffies + MIN_TIME_DELTA && budget >= MIN_BUDGET) {
+ cap = kmalloc(sizeof(capacity_t), GFP_ATOMIC);
+ if (cap) {
+ cap->budget = budget;
+ cap->deadline = deadline;
+ if (t)
+ cap->donor = t->pid;
+ else
+ cap->donor = 0;
+ spin_lock_irqsave(&queue->lock, flags);
+ __add_capacity(queue, cap);
+ cap_check_resched(next_cap(queue)->deadline);
+ spin_unlock_irqrestore(&queue->lock, flags);
+ if (t)
+ sched_trace_capacity_release(t);
+ }
+ }
+}
+
+void __release_capacity(capacity_queue_t* queue, unsigned int budget,
+ jiffie_t deadline, struct task_struct* t)
+{
+ capacity_t* cap;
+
+ if (deadline >= jiffies + MIN_TIME_DELTA && budget >= MIN_BUDGET) {
+ cap = kmalloc(sizeof(capacity_t), GFP_ATOMIC);
+ if (cap) {
+ cap->budget = budget;
+ cap->deadline = deadline;
+ if (t)
+ cap->donor = t->pid;
+ else
+ cap->donor = 0;
+ /* no locking, no resched check -- called from schedule */
+ __add_capacity(queue, cap);
+ if (t)
+ sched_trace_capacity_release(t);
+ }
+ }
+}
+
+
+capacity_t* __take_capacity(capacity_queue_t* queue, jiffie_t deadline, int deadline_matters)
+{
+ capacity_t* cap = NULL;
+
+ while (!list_empty(&queue->queue)) {
+ cap = list_entry(queue->queue.next, capacity_t, list);
+
+ if (deadline_matters && time_before(deadline, cap->deadline)) {
+ cap = NULL;
+ break;
+ }
+
+ list_del(queue->queue.next);
+ if (cap->deadline > jiffies) {
+ if (cap->deadline - jiffies < cap->budget)
+ cap->budget = cap->deadline - jiffies;
+ break;
+ }
+ kfree(cap);
+ cap = NULL;
+ }
+
+ return cap;
+}
+#else
+
+/* no capacity sharing */
+void release_capacity(capacity_queue_t* queue, unsigned int budget,
+ jiffie_t deadline, struct task_struct* t)
+{
+}
+
+capacity_t* __take_capacity(capacity_queue_t* queue, jiffie_t deadline, int deadline_matters)
+{
+ return NULL;
+}
+#endif
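+
+/*
+ * For illustration, a donated capacity moves through a capacity_queue_t q
+ * roughly as follows (a sketch using the helpers above, locking omitted;
+ * this is not code that runs from here):
+ *
+ *	// a job finishes 3 ticks early with its deadline at jiffies + 10
+ *	release_capacity(&q, 3, jiffies + 10, current);
+ *
+ *	// a consumer whose deadline is no earlier than jiffies + 10
+ *	// may pick the capacity up
+ *	cap = __take_capacity(&q, jiffies + 10, 1);
+ *
+ *	// any unused, not yet expired remainder goes back to the queue
+ *	return_capacity(&q, cap);
+ *
+ * __take_capacity() clips the budget so that a capacity can never be
+ * consumed past its own deadline.
+ */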
+
+
+/******************************************************************************/
+/* server abstractions */
+/******************************************************************************/
+
+
+/* hrt_server_t - Abstraction of a hard real-time server.
+ *
+ * One HRT server per CPU. If it is unused, period and wcet may be zero.
+ * HRT servers are strictly periodic and retain their budget.
+ */
+typedef struct {
+ edf_domain_t domain;
+
+ unsigned int period;
+ unsigned int wcet;
+
+ jiffie_t deadline;
+ int budget;
+} hrt_server_t;
+
+/* be_server_t - Abstraction of best-effort server.
+ *
+ * This is pretty much only an accounting abstraction.
+ */
+typedef struct {
+ unsigned int period;
+ unsigned int wcet;
+
+ jiffie_t deadline;
+ jiffie_t release;
+ int budget;
+
+ struct list_head list;
+ pid_t pid;
+} be_server_t;
+
+/* cast to int to allow for negative slack, i.e. tardiness */
+#define server_slack(srv) \
+ ( ((int) (srv)->deadline - (int) jiffies) - (int) (srv)->budget )
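+
+/*
+ * Worked example: with jiffies = 100, a server with deadline = 110 and
+ * budget = 4 has server_slack() = (110 - 100) - 4 = 6.  A negative value
+ * means the server is tardy: even running continuously it cannot consume
+ * its remaining budget before its deadline.
+ */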
+
+typedef struct {
+ int cpu;
+
+ hrt_server_t hrt;
+ be_server_t* be;
+ capacity_t* cap;
+
+ task_class_t exec_class;
+ jiffie_t cur_deadline;
+ atomic_t will_schedule;
+
+ struct list_head list;
+ spinlock_t lock;
+} cpu_state_t;
+
+
+DEFINE_PER_CPU(cpu_state_t, hsb_cpu_state);
+
+#define hrt_dom(cpu) (&per_cpu(hsb_cpu_state, cpu).hrt.domain)
+
+#define set_will_schedule() \
+ (atomic_set(&__get_cpu_var(hsb_cpu_state).will_schedule, 1))
+#define clear_will_schedule() \
+ (atomic_set(&__get_cpu_var(hsb_cpu_state).will_schedule, 0))
+#define test_will_schedule(cpu) \
+ (atomic_read(&per_cpu(hsb_cpu_state, cpu).will_schedule))
+
+
+static void prepare_hrt_release(hrt_server_t *srv, jiffie_t start)
+{
+ if (srv->period && srv->wcet) {
+ srv->deadline = start;
+ srv->budget = 0;
+ }
+}
+
+static void check_for_hrt_release(hrt_server_t *srv) {
+ if (srv->wcet && srv->period &&
+ time_before_eq(srv->deadline, jiffies)) {
+ srv->deadline += srv->period;
+ srv->budget = srv->wcet;
+ sched_trace_server_release(HRT_BASE_PID + smp_processor_id(),
+ srv->budget, srv->period, RT_CLASS_HARD);
+ }
+}
+
+/* An HRT client is eligible if either its deadline is before the
+ * server deadline or if the server has no slack left. The server
+ * must have budget left.
+ */
+static inline int hrt_client_eligible(hrt_server_t *srv)
+{
+ if (!list_empty(&srv->domain.ready_queue))
+ return srv->budget && (
+ time_before(get_deadline(next_ready(&srv->domain)),
+ srv->deadline)
+ || server_slack(srv) <= 0);
+ else
+ return 0;
+}
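+
+/*
+ * Example: with jiffies = 100, srv->deadline = 120, srv->budget = 5 and the
+ * earliest queued HRT job having deadline 110, the client is eligible
+ * (110 < 120).  A queued job with deadline 130 would only become eligible
+ * once server_slack(srv) reaches 0, i.e. at jiffies = 115.
+ */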
+
+static void hsb_cpu_state_init(cpu_state_t* cpu_state,
+ edf_check_resched_needed_t check,
+ int cpu)
+{
+ edf_domain_init(&cpu_state->hrt.domain, check);
+ cpu_state->hrt.budget = 0;
+ cpu_state->hrt.deadline = 0;
+ cpu_state->hrt.period = 0;
+ cpu_state->hrt.wcet = 0;
+
+ cpu_state->be = NULL;
+ cpu_state->cap = NULL;
+
+ cpu_state->cur_deadline = 0;
+ cpu_state->cpu = cpu;
+ cpu_state->lock = SPIN_LOCK_UNLOCKED;
+ cpu_state->exec_class = RT_CLASS_BEST_EFFORT;
+
+ atomic_set(&cpu_state->will_schedule, 0);
+ INIT_LIST_HEAD(&cpu_state->list);
+}
+
+/******************************************************************************/
+/* BE queue functions - mostly like edf_common.c */
+/******************************************************************************/
+
+#define be_earlier_deadline(a, b) (time_before(\
+ (a)->deadline, (b)->deadline))
+#define be_earlier_release(a, b) (time_before(\
+ (a)->release, (b)->release))
+
+
+static void be_add_ready(edf_domain_t* edf, be_server_t *new)
+{
+ unsigned long flags;
+ struct list_head *pos;
+ be_server_t *queued;
+ unsigned int passed = 0;
+
+ BUG_ON(!new);
+ /* first we need the write lock for edf_ready_queue */
+ write_lock_irqsave(&edf->ready_lock, flags);
+ /* find a spot where our deadline is earlier than the next */
+ list_for_each(pos, &edf->ready_queue) {
+ queued = list_entry(pos, be_server_t, list);
+ if (unlikely(be_earlier_deadline(new, queued))) {
+ __list_add(&new->list, pos->prev, pos);
+ goto out;
+ }
+ passed++;
+ }
+ /* if we get to this point either the list is empty or new has the
+ * lowest priority. Let's add it to the end. */
+ list_add_tail(&new->list, &edf->ready_queue);
+ out:
+ if (!passed)
+ edf->check_resched(edf);
+ write_unlock_irqrestore(&edf->ready_lock, flags);
+}
+
+static be_server_t* be_take_ready(edf_domain_t* edf)
+{
+ be_server_t *t = NULL;
+
+ if (!list_empty(&edf->ready_queue)) {
+ t = list_entry(edf->ready_queue.next, be_server_t, list);
+ /* kick it out of the ready list */
+ list_del(&t->list);
+ }
+ return t;
+}
+
+/*static be_server_t* get_be_server(edf_domain_t* edf)
+{
+ be_server_t *t = NULL;
+
+ spin_lock(&edf->release_lock);
+ write_lock(&edf->ready_lock);
+ t = be_take_ready(edf);
+
+ if (!t && !list_empty(&edf->release_queue)) {
+ t = list_entry(edf->release_queue.next, be_server_t, list);
+
+ list_del(&t->list);
+ }
+
+ write_unlock(&edf->ready_lock);
+ spin_unlock(&edf->release_lock);
+ return t;
+}*/
+
+static void be_add_release(edf_domain_t* edf, be_server_t *srv)
+{
+ unsigned long flags;
+ struct list_head *pos;
+ be_server_t *queued;
+
+ spin_lock_irqsave(&edf->release_lock, flags);
+ list_for_each_prev(pos, &edf->release_queue) {
+ queued = list_entry(pos, be_server_t, list);
+ if ((unlikely(be_earlier_release(queued, srv)))) {
+ /* the task at pos has an earlier release */
+ /* insert the new task in behind it */
+ __list_add(&srv->list, pos, pos->next);
+ goto out;
+ }
+ }
+
+ list_add(&srv->list, &edf->release_queue);
+ out:
+ spin_unlock_irqrestore(&edf->release_lock, flags);
+}
+
+static void be_try_release_pending(edf_domain_t* edf)
+{
+ unsigned long flags;
+ struct list_head *pos, *save;
+ be_server_t *queued;
+
+ if (spin_trylock_irqsave(&edf->release_lock, flags)) {
+ list_for_each_safe(pos, save, &edf->release_queue) {
+ queued = list_entry(pos, be_server_t, list);
+ if (likely(time_before_eq(
+ queued->release,
+ jiffies))) {
+ list_del(pos);
+ be_add_ready(edf, queued);
+ sched_trace_server_release(
+ queued->pid, queued->budget,
+ queued->period, RT_CLASS_BEST_EFFORT);
+ } else
+ /* the release queue is ordered */
+ break;
+ }
+ spin_unlock_irqrestore(&edf->release_lock, flags);
+ }
+}
+
+static void be_prepare_new_release(be_server_t *t, jiffie_t start) {
+ t->release = start;
+ t->deadline = t->release + t->period;
+ t->budget = t->wcet;
+}
+
+static void be_prepare_new_releases(edf_domain_t *edf, jiffie_t start)
+{
+ unsigned long flags;
+ struct list_head tmp_list;
+ struct list_head *pos, *n;
+ be_server_t *t;
+
+ INIT_LIST_HEAD(&tmp_list);
+
+ spin_lock_irqsave(&edf->release_lock, flags);
+ write_lock(&edf->ready_lock);
+
+
+ while (!list_empty(&edf->release_queue)) {
+ pos = edf->release_queue.next;
+ list_del(pos);
+ list_add(pos, &tmp_list);
+ }
+
+ while (!list_empty(&edf->ready_queue)) {
+ pos = edf->ready_queue.next;
+ list_del(pos);
+ list_add(pos, &tmp_list);
+
+ }
+
+ write_unlock(&edf->ready_lock);
+ spin_unlock_irqrestore(&edf->release_lock, flags);
+
+ list_for_each_safe(pos, n, &tmp_list) {
+ t = list_entry(pos, be_server_t, list);
+ list_del(pos);
+ be_prepare_new_release(t, start);
+ be_add_release(edf, t);
+ }
+
+}
+
+static void be_prepare_for_next_period(be_server_t *t)
+{
+ BUG_ON(!t);
+ /* prepare next release */
+ t->release = t->deadline;
+ t->deadline += t->period;
+ t->budget = t->wcet;
+}
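+
+/*
+ * Example: a BE server with (wcet = 20, period = 100) whose current deadline
+ * is 300 is replenished to release = 300, deadline = 400, budget = 20, i.e.
+ * it behaves like a periodic task whose budget is fully restored each period.
+ */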
+
+#define be_next_ready(edf) \
+ list_entry((edf)->ready_queue.next, be_server_t, list)
+
+
+/* be_preemption_needed - check whether the work currently running on this
+ * CPU needs to be preempted by a queued best-effort server.
+ */
+static inline int be_preemption_needed(edf_domain_t* edf, cpu_state_t* state)
+{
+ /* we need the read lock for edf_ready_queue */
+ if (!list_empty(&edf->ready_queue))
+ {
+
+ if (state->exec_class == RT_CLASS_SOFT) {
+ if (state->cap)
+ return time_before(
+ be_next_ready(edf)->deadline,
+ state->cap->deadline);
+ else
+ return time_before(
+ be_next_ready(edf)->deadline,
+ state->cur_deadline);
+ } else
+ return 1;
+ }
+ return 0;
+}
+
+static void be_enqueue(edf_domain_t* edf, be_server_t* srv)
+{
+ int new_release = 0;
+ if (!srv->budget) {
+ be_prepare_for_next_period(srv);
+ new_release = 1;
+ }
+
+ if (time_before_eq(srv->release, jiffies) &&
+ get_rt_mode() == MODE_RT_RUN) {
+ be_add_ready(edf, srv);
+ if (new_release)
+ sched_trace_server_release(
+ srv->pid, srv->budget,
+ srv->period, RT_CLASS_BEST_EFFORT);
+ } else
+ be_add_release(edf, srv);
+}
+
+static void be_preempt(edf_domain_t *be, cpu_state_t *state)
+{
+ be_server_t *srv;
+
+ spin_lock(&state->lock);
+ srv = state->be;
+ state->be = NULL;
+ spin_unlock(&state->lock);
+
+ /* add outside of lock to avoid deadlock */
+ if (srv)
+ be_enqueue(be, srv);
+}
+
+
+/******************************************************************************/
+/* Actual HSB implementation */
+/******************************************************************************/
+
+/* always acquire the cpu lock as the last lock to avoid deadlocks */
+static spinlock_t hsb_cpu_lock = SPIN_LOCK_UNLOCKED;
+/* the cpus queue themselves according to priority in here */
+static LIST_HEAD(hsb_cpu_queue);
+
+
+/* the global soft real-time domain */
+static edf_domain_t srt;
+/* the global best-effort server domain
+ * belongs conceptually to the srt domain, but has
+ * be_server_t* queued instead of task_t*
+ */
+static edf_domain_t be;
+
+static fifo_domain_t hsb_fifo;
+
+static capacity_queue_t cap_queue;
+
+
+
+
+/* adjust_cpu_queue - Move the cpu entry to the correct place to maintain
+ * order in the cpu queue.
+ *
+ */
+static void adjust_cpu_queue(task_class_t class, jiffie_t deadline,
+ be_server_t *be)
+{
+ struct list_head *pos;
+ cpu_state_t *other;
+ cpu_state_t *entry;
+
+ spin_lock(&hsb_cpu_lock);
+
+ entry = &__get_cpu_var(hsb_cpu_state);
+
+ spin_lock(&entry->lock);
+ entry->exec_class = class;
+ entry->cur_deadline = deadline;
+ entry->be = be;
+
+ spin_unlock(&entry->lock);
+
+
+
+ if (be)
+ sched_trace_server_scheduled(
+ be->pid, RT_CLASS_BEST_EFFORT, be->budget,
+ be->deadline);
+ else if (class == RT_CLASS_HARD)
+ sched_trace_server_scheduled(
+ HRT_BASE_PID + smp_processor_id(), RT_CLASS_HARD,
+ entry->hrt.budget, entry->hrt.deadline);
+
+ list_del(&entry->list);
+	/* If we do not execute real-time jobs we just move
+	 * to the end of the queue.
+	 * If we execute hard real-time jobs we move to the start
+	 * of the queue.
+ */
+
+ switch (entry->exec_class) {
+ case RT_CLASS_HARD:
+ list_add(&entry->list, &hsb_cpu_queue);
+ break;
+
+ case RT_CLASS_SOFT:
+ list_for_each(pos, &hsb_cpu_queue) {
+ other = list_entry(pos, cpu_state_t, list);
+ if (other->exec_class > RT_CLASS_SOFT ||
+ time_before_eq(entry->cur_deadline,
+ other->cur_deadline))
+ {
+ __list_add(&entry->list, pos->prev, pos);
+ goto out;
+ }
+ }
+ /* possible fall through if lowest SRT priority */
+
+ case RT_CLASS_BEST_EFFORT:
+ list_add_tail(&entry->list, &hsb_cpu_queue);
+ break;
+
+ default:
+ /* something wrong in the variable */
+ BUG();
+ }
+ out:
+ spin_unlock(&hsb_cpu_lock);
+}
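+
+/*
+ * Resulting CPU order (front to back): CPUs serving HRT work first, then
+ * CPUs running SRT jobs, BE servers or capacities (class RT_CLASS_SOFT)
+ * ordered by increasing deadline, then CPUs doing best-effort/background
+ * work at the tail.  The *_check_resched() functions below therefore only
+ * ever inspect the tail of hsb_cpu_queue.
+ */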
+
+
+/* hrt_check_resched - check whether the HRT server on given CPU needs to
+ * preempt the running task.
+ */
+static int hrt_check_resched(edf_domain_t *edf)
+{
+ hrt_server_t *srv = container_of(edf, hrt_server_t, domain);
+ cpu_state_t *state = container_of(srv, cpu_state_t, hrt);
+ int ret = 0;
+
+ spin_lock(&state->lock);
+
+ if (hrt_client_eligible(srv)) {
+ if (state->exec_class > RT_CLASS_HARD ||
+ time_before(
+ get_deadline(next_ready(edf)),
+ state->cur_deadline)
+ ) {
+ if (state->cpu == smp_processor_id())
+ set_tsk_need_resched(current);
+ else
+				smp_send_reschedule(state->cpu);
+			ret = 1;
+		}
+ }
+
+ spin_unlock(&state->lock);
+ return ret;
+}
+
+
+/* srt_check_resched - Check whether another CPU needs to switch to an SRT task.
+ *
+ * The function only checks and kicks the last CPU. The kicked CPU will
+ * reschedule and kick the next one if necessary, and so on. The caller is
+ * responsible for making sure that it is not the last entry or that a
+ * reschedule is not necessary.
+ *
+ * Caller must hold edf->ready_lock!
+ */
+static int srt_check_resched(edf_domain_t *edf)
+{
+ cpu_state_t *last;
+ int ret = 0;
+
+ spin_lock(&hsb_cpu_lock);
+
+ if (!list_empty(&srt.ready_queue)) {
+ last = list_entry(hsb_cpu_queue.prev, cpu_state_t, list);
+ /* guard against concurrent updates */
+ spin_lock(&last->lock);
+ if (last->exec_class == RT_CLASS_BEST_EFFORT || (
+ last->exec_class == RT_CLASS_SOFT &&
+ time_before(get_deadline(next_ready(&srt)),
+ last->cur_deadline)))
+ {
+ if (smp_processor_id() == last->cpu)
+ set_tsk_need_resched(current);
+ else
+ if (!test_will_schedule(last->cpu))
+ smp_send_reschedule(last->cpu);
+ ret = 1;
+ }
+ spin_unlock(&last->lock);
+ }
+
+ spin_unlock(&hsb_cpu_lock);
+ return ret;
+}
+
+
+/* be_check_resched - Check whether another CPU needs to switch to a BE server.
+ *
+ * Caller must hold edf->ready_lock!
+ */
+static int be_check_resched(edf_domain_t *edf)
+{
+ cpu_state_t *last;
+ int soft, bg;
+ int ret = 0;
+
+ spin_lock(&hsb_cpu_lock);
+
+ if (!list_empty(&be.ready_queue)) {
+ last = list_entry(hsb_cpu_queue.prev, cpu_state_t, list);
+ /* guard against concurrent updates */
+ spin_lock(&last->lock);
+
+ bg = last->exec_class == RT_CLASS_BEST_EFFORT;
+ soft = last->exec_class == RT_CLASS_SOFT;
+
+ if (bg || (soft && time_before(be_next_ready(&be)->deadline,
+ last->cur_deadline)))
+ {
+ if (smp_processor_id() == last->cpu)
+ set_tsk_need_resched(current);
+ else
+ if (!test_will_schedule(last->cpu))
+ smp_send_reschedule(last->cpu);
+ ret = 1;
+ }
+
+ spin_unlock(&last->lock);
+ }
+
+ spin_unlock(&hsb_cpu_lock);
+ return ret;
+}
+
+
+int cap_check_resched(jiffie_t deadline)
+{
+ unsigned long flags;
+ cpu_state_t *last;
+ int soft, bg;
+ int ret = 0;
+
+
+
+ if (get_rt_mode() == MODE_RT_RUN) {
+ spin_lock_irqsave(&hsb_cpu_lock, flags);
+
+ last = list_entry(hsb_cpu_queue.prev, cpu_state_t, list);
+ /* guard against concurrent updates */
+ spin_lock(&last->lock);
+
+ bg = last->exec_class == RT_CLASS_BEST_EFFORT;
+ soft = last->exec_class == RT_CLASS_SOFT;
+
+ if (bg || (soft && time_before(deadline,
+ last->cur_deadline)))
+ {
+ if (smp_processor_id() == last->cpu)
+ set_tsk_need_resched(current);
+ else
+ if (!test_will_schedule(last->cpu))
+ smp_send_reschedule(last->cpu);
+ ret = 1;
+ }
+
+ spin_unlock(&last->lock);
+
+ spin_unlock_irqrestore(&hsb_cpu_lock, flags);
+ }
+ return ret;
+}
+
+int fifo_check_resched(void)
+{
+ unsigned long flags;
+ cpu_state_t *last;
+ int ret = 0;
+
+ if (get_rt_mode() == MODE_RT_RUN) {
+ spin_lock_irqsave(&hsb_cpu_lock, flags);
+
+
+ last = list_entry(hsb_cpu_queue.prev, cpu_state_t, list);
+ /* guard against concurrent updates */
+
+ spin_lock(&last->lock);
+
+ if (last->exec_class == RT_CLASS_BEST_EFFORT)
+ {
+ if (smp_processor_id() == last->cpu)
+ set_tsk_need_resched(current);
+ else
+ if (!test_will_schedule(last->cpu))
+ smp_send_reschedule(last->cpu);
+ ret = 1;
+ }
+
+ spin_unlock(&last->lock);
+
+ spin_unlock_irqrestore(&hsb_cpu_lock, flags);
+ }
+ return ret;
+}
+
+
+
+static inline int hsb_preemption_needed(edf_domain_t* edf, cpu_state_t* state)
+{
+ /* we need the read lock for edf_ready_queue */
+ if (!list_empty(&edf->ready_queue))
+ {
+ if (state->exec_class == RT_CLASS_SOFT) {
+ if (state->cap)
+ return time_before(get_deadline(next_ready(edf))
+ , state->cap->deadline);
+ else
+ return time_before(get_deadline(next_ready(edf))
+ , state->cur_deadline);
+ } else
+ return 1;
+ }
+ return 0;
+}
+
+static inline int cap_preemption_needed(capacity_queue_t* q, cpu_state_t* state)
+{
+ /* we need the read lock for edf_ready_queue */
+ if (!list_empty(&q->queue))
+ {
+ if (state->exec_class == RT_CLASS_SOFT) {
+ if (state->cap)
+ return time_before(next_cap(q)->deadline
+ , state->cap->deadline);
+ else
+ return time_before(next_cap(q)->deadline
+ , state->cur_deadline);
+ } else
+ return 1;
+ }
+ return 0;
+}
+
+/* hsb_scheduler_tick - this function is called for every local timer
+ * interrupt.
+ *
+ * checks whether the current task has expired and checks
+ * whether we need to preempt it if it has not expired
+ */
+static reschedule_check_t hsb_scheduler_tick(void)
+{
+ unsigned long flags;
+ struct task_struct *t = current;
+ int resched = 0;
+
+ cpu_state_t *state = &__get_cpu_var(hsb_cpu_state);
+
+ /* expire tasks even if not in real-time mode
+ * this makes sure that at the end of real-time mode
+ * no tasks "run away forever".
+ */
+
+ /* charge BE server only if we are not running on a spare capacity */
+ if (state->be && !state->cap && --state->be->budget <= 0) {
+ sched_trace_server_completion(state->be->pid, 0,
+ state->be->deadline,
+ RT_CLASS_BEST_EFFORT);
+ be_preempt(&be, state);
+ resched = 1;
+ }
+
+ if (state->cap)
+ if (--state->cap->budget <= 0 ||
+ time_before_eq(state->cap->deadline, jiffies)) {
+ kfree(state->cap);
+ state->cap = NULL;
+ resched = 1;
+ }
+
+ if (is_realtime(t)) {
+ if (is_hrt(t) && (--state->hrt.budget <= 0)) {
+ sched_trace_server_completion(
+ HRT_BASE_PID + smp_processor_id(), 0,
+ state->hrt.deadline, RT_CLASS_HARD);
+ resched = 1;
+ }
+
+ /* account for received service... */
+ t->rt_param.times.exec_time++;
+
+ /* ...and charge current budget */
+ if (!state->cap) {
+ --t->time_slice;
+			/* a task should always be able to finish its job */
+ BUG_ON(!is_be(t) && !t->time_slice && !job_completed(t));
+ }
+
+ if (job_completed(t) || (is_be(t) && !t->time_slice)) {
+ sched_trace_job_completion(t);
+ set_rt_flags(t, RT_F_SLEEP);
+ resched = 1;
+ }
+ }
+
+
+ if (get_rt_mode() == MODE_RT_RUN)
+ {
+ try_release_pending(&state->hrt.domain);
+ check_for_hrt_release(&state->hrt);
+ try_release_pending(&srt);
+ be_try_release_pending(&be);
+
+ if (!resched)
+ switch (state->exec_class) {
+ case RT_CLASS_HARD:
+ read_lock_irqsave(&state->hrt.domain.ready_lock,
+ flags);
+ resched = preemption_needed(&state->hrt.domain,
+ t);
+ read_unlock_irqrestore(
+ &state->hrt.domain.ready_lock, flags);
+ break;
+
+ case RT_CLASS_SOFT:
+ case RT_CLASS_BEST_EFFORT:
+ local_irq_save(flags);
+
+ /* check for HRT jobs */
+ read_lock(&state->hrt.domain.ready_lock);
+ resched = hrt_client_eligible(&state->hrt);
+ read_unlock(&state->hrt.domain.ready_lock);
+
+ /* check for spare capacities */
+ if (!resched) {
+ spin_lock(&cap_queue.lock);
+ resched =
+ cap_preemption_needed(&cap_queue,
+ state);
+ spin_unlock(&cap_queue.lock);
+ }
+
+ /* check for SRT jobs */
+ if (!resched) {
+ read_lock(&srt.ready_lock);
+ resched = hsb_preemption_needed(
+ &srt, state);
+ read_unlock(&srt.ready_lock);
+ }
+
+ /* check for BE jobs */
+ if (!resched) {
+ read_lock(&be.ready_lock);
+ resched = be_preemption_needed(
+ &be, state);
+ read_unlock(&be.ready_lock);
+ }
+
+ /* check for background jobs */
+ if (!resched && !is_realtime(current))
+ resched = fifo_jobs_pending(&hsb_fifo);
+ local_irq_restore(flags);
+ break;
+
+ default:
+ /* something wrong in the variable */
+ BUG();
+ }
+ }
+
+ if (resched) {
+ set_will_schedule();
+ return FORCE_RESCHED;
+ } else
+ return NO_RESCHED;
+}
+
+static int schedule_hrt(struct task_struct * prev,
+ struct task_struct ** next, runqueue_t * rq)
+{
+ unsigned long flags;
+ int deactivate = 1;
+ cpu_state_t *state;
+
+
+ state = &__get_cpu_var(hsb_cpu_state);
+
+ write_lock_irqsave(&state->hrt.domain.ready_lock, flags);
+
+
+ if (state->cap) {
+ /* hrt_schedule does not have the cap_queue lock */
+ return_capacity(&cap_queue, state->cap);
+ state->cap = NULL;
+ }
+
+ if (is_hrt(prev) && is_released(prev) && is_running(prev)
+ && !preemption_needed(&state->hrt.domain, prev)) {
+ /* This really should only happen if the task has
+ * 100% utilization or when we got a bogus/delayed
+ * resched IPI.
+ */
+ TRACE("HRT: prev will be next, already released\n");
+ *next = prev;
+ deactivate = 0;
+ } else {
+ /* either not yet released, preempted, or non-rt */
+ *next = __take_ready(&state->hrt.domain);
+ /* the logic in hsb_schedule makes sure *next must exist
+ * if we get here */
+ BUG_ON(!*next);
+ /* stick the task into the runqueue */
+ __activate_task(*next, rq);
+ set_task_cpu(*next, smp_processor_id());
+ }
+
+ set_rt_flags(*next, RT_F_RUNNING);
+ adjust_cpu_queue(RT_CLASS_HARD, get_deadline(*next), NULL);
+ clear_will_schedule();
+
+ write_unlock_irqrestore(&state->hrt.domain.ready_lock, flags);
+ return deactivate;
+}
+
+
+static struct task_struct* find_min_slack_task(struct task_struct *prev,
+ edf_domain_t* edf)
+{
+ struct list_head *pos;
+ struct task_struct* tsk = NULL;
+ struct task_struct* cur;
+
+ if (is_realtime(prev) && is_running(prev) &&
+ get_rt_flags(prev) != RT_F_SLEEP)
+ tsk = prev;
+ list_for_each(pos, &edf->ready_queue) {
+ cur = list_entry(pos, struct task_struct, rt_list);
+ if (!tsk || task_slack(tsk) > task_slack(cur))
+ tsk = cur;
+ }
+ return tsk;
+}
+
+static struct task_struct* null_heuristic(struct task_struct *prev,
+ edf_domain_t* edf,
+ fifo_domain_t* fifo)
+{
+	if (fifo_jobs_pending(fifo))
+ return NULL;
+ else if (!list_empty(&edf->ready_queue))
+ return list_entry(edf->ready_queue.next,
+ struct task_struct, rt_list);
+ else
+ return NULL;
+}
+
+/*static struct task_struct* history_heuristic(struct task_struct *prev, edf_domain_t* edf)
+{
+ struct list_head *pos;
+ struct task_struct* tsk = NULL;
+ struct task_struct* cur;
+
+ if (is_realtime(prev) && is_running(prev) &&
+ get_rt_flags(prev) != RT_F_SLEEP)
+ tsk = prev;
+ list_for_each(pos, &edf->ready_queue) {
+ cur = list_entry(pos, struct task_struct, rt_list);
+ if (!tsk ||
+ tsk->rt_param.stats.nontardy_jobs_ctr >
+ cur->rt_param.stats.nontardy_jobs_ctr)
+ tsk = cur;
+ }
+ if (tsk && tsk->rt_param.stats.nontardy_jobs_ctr < 5)
+ return tsk;
+ else
+ return NULL;
+}
+*/
+/* TODO: write slack heuristic.*/
+/*static struct task_struct* slack_heuristic(struct task_struct *prev, edf_domain_t* edf)
+{
+ struct list_head *pos;
+ struct task_struct* tsk = NULL;
+ struct task_struct* cur;
+
+ if (is_realtime(prev) && is_running(prev) &&
+ get_rt_flags(prev) != RT_F_SLEEP)
+ tsk = prev;
+ list_for_each(pos, &edf->ready_queue) {
+ cur = list_entry(pos, struct task_struct, rt_list);
+ if (!tsk ||
+ tsk->rt_param.stats.nontardy_job_ctr >
+ cur->rt_param.stats.nontardy_job_ctr)
+ tsk = cur;
+ }
+ if (tsk && tsk->rt_param.stats.nontardy_job_ctr < 5)
+ return tsk;
+ else
+ return NULL;
+}*/
+
+
+/* caller holds all locks
+ */
+
+static int schedule_capacity(struct task_struct *prev,
+ struct task_struct **next, runqueue_t *rq)
+{
+ cpu_state_t *state = &__get_cpu_var(hsb_cpu_state);
+ capacity_t* old;
+
+ if (state->cap) {
+ old = state->cap;
+ state->cap = __take_capacity(&cap_queue, old->deadline, 1);
+ if (!state->cap)
+ state->cap = old;
+ else
+ __return_capacity(&cap_queue, old);
+ } else
+ state->cap = __take_capacity(&cap_queue, 0, 0);
+
+
+ /* pick a task likely to be tardy */
+ *next = find_min_slack_task(prev, &srt);
+
+ /* only give away spare capacities if there is no task that
+ * is going to be tardy
+ */
+ if (*next && task_slack(*next) >= 0)
+ *next = null_heuristic(prev, &srt, &hsb_fifo);
+ if (*next && *next != prev)
+ list_del(&(*next)->rt_list);
+
+
+ /* if there is none pick a BE job */
+ if (!*next) {
+ if (is_realtime(prev) && is_be(prev) && is_running(prev) &&
+ get_rt_flags(prev) != RT_F_SLEEP)
+ *next = prev;
+ else
+ *next = fifo_take(&hsb_fifo);
+ }
+
+ if (state->be)
+ be_preempt(&be, state);
+ BUG_ON(!state->cap);
+ if (*next && state->cap->donor) {
+ sched_trace_capacity_allocation(
+ *next, state->cap->budget, state->cap->deadline,
+ state->cap->donor);
+ }
+
+ return *next != prev;
+}
+
+
+
+#define BG 0
+#define SRT 1
+#define BE 2
+#define CAP 3
+
+static inline int what_first(edf_domain_t *be, edf_domain_t *srt, capacity_queue_t* q)
+{
+	jiffie_t sdl = 0, bdl = 0, cdl = 0, cur;
+ int _srt = !list_empty(&srt->ready_queue);
+ int _be = !list_empty(&be->ready_queue);
+ int _cap = __capacity_available(q);
+
+
+ int ret = BG; /* nothing ready => background mode*/
+ cur = 0;
+
+ if (_srt)
+ sdl = get_deadline(next_ready(srt));
+ if (_be)
+ bdl = be_next_ready(be)->deadline;
+ if (_cap)
+ cdl = next_cap(q)->deadline;
+
+
+
+ if (_cap) {
+ ret = CAP;
+ cur = cdl;
+ }
+ if (_srt && (time_before(sdl, cur) || !ret)) {
+ ret = SRT;
+ cur = sdl;
+ }
+ if (_be && (time_before(bdl, cur) || !ret)) {
+ ret = BE;
+ cur = bdl;
+ }
+ return ret;
+}
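+
+/*
+ * Example: with a queued capacity with deadline 95, a BE server with
+ * deadline 90 and an SRT job with deadline 100, what_first() returns BE;
+ * without the BE server it returns CAP; with nothing queued at all it
+ * returns BG and the CPU falls back to background (FIFO) work.
+ */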
+
+
+
+static int schedule_srt_be_cap(struct task_struct *prev,
+ struct task_struct **next, runqueue_t *rq)
+{
+ task_class_t class = RT_CLASS_BEST_EFFORT;
+ jiffie_t deadline = 0;
+ unsigned long flags;
+ int deactivate = 1;
+ be_server_t* bes;
+ cpu_state_t* state;
+	int type = BG; /* no queue consulted yet, e.g. when prev is kept */
+
+reschedule:
+ write_lock_irqsave(&srt.ready_lock, flags);
+ write_lock(&be.ready_lock);
+ spin_lock(&cap_queue.lock);
+
+
+ state = &__get_cpu_var(hsb_cpu_state);
+ bes = NULL;
+
+ clear_will_schedule();
+
+ if (is_realtime(prev) && (is_released(prev) || is_be(prev)) &&
+ is_running(prev) && !hsb_preemption_needed(&srt, state) &&
+ !be_preemption_needed(&be, state)
+ ) {
+ /* Our current task's next job has already been
+ * released and has higher priority than the highest
+		 * priority waiting task; in other words: it is tardy.
+ * We just keep it.
+ */
+ TRACE("prev will be next, already released\n");
+ *next = prev;
+ class = prev->rt_param.basic_params.class;
+ deadline = get_deadline(*next);
+ deactivate = 0;
+ } else {
+ /* either not yet released, preempted, or non-rt */
+ type = what_first(&be, &srt, &cap_queue);
+ switch (type) {
+ case CAP:
+ /* capacity */
+ deactivate = schedule_capacity(prev, next, rq);
+ deadline = state->cap->deadline;
+ if (*next)
+ class = RT_CLASS_SOFT;
+ else
+ class = RT_CLASS_BEST_EFFORT;
+ break;
+ case BE:
+ /* be */
+ *next = NULL;
+ bes = be_take_ready(&be);
+ if (bes) {
+ class = RT_CLASS_SOFT;
+ deadline = bes->deadline;
+ *next = fifo_take(&hsb_fifo);
+ if (!*next) {
+ /* deactivate */
+ __release_capacity(&cap_queue,
+ bes->budget,
+ bes->deadline, NULL);
+ bes->budget = 0;
+ barrier();
+ spin_unlock(&cap_queue.lock);
+ write_unlock(&be.ready_lock);
+ write_unlock_irqrestore(&srt.ready_lock,
+ flags);
+ be_enqueue(&be, bes);
+ goto reschedule;
+ }
+ }
+ break;
+ case SRT:
+ /* srt */
+ *next = __take_ready(&srt);
+ if (*next) {
+ class = RT_CLASS_SOFT;
+ deadline = get_deadline(*next);
+ }
+ break;
+ case BG:
+ /* background server mode */
+ class = RT_CLASS_BEST_EFFORT;
+ deadline = 0;
+ *next = fifo_take(&hsb_fifo);
+ break;
+ }
+
+
+ /* give back capacities */
+ if (type != CAP && state->cap) {
+ __return_capacity(&cap_queue, state->cap);
+ state->cap = NULL;
+ }
+ if (*next && deactivate) {
+ /* mark the task as executing on this cpu */
+ set_task_cpu(*next, smp_processor_id());
+ /* stick the task into the runqueue */
+ __activate_task(*next, rq);
+ }
+ }
+
+ adjust_cpu_queue(class, deadline, bes);
+
+ switch (type) {
+ case BG:
+ break;
+ case BE:
+ be.check_resched(&be);
+ break;
+ case SRT:
+ srt.check_resched(&srt);
+ break;
+ case CAP:
+ if (!list_empty(&cap_queue.queue))
+ cap_check_resched(list_entry(cap_queue.queue.next,
+ capacity_t, list)->deadline);
+ break;
+ }
+
+
+ if(*next)
+ set_rt_flags(*next, RT_F_RUNNING);
+
+ spin_unlock(&cap_queue.lock);
+ write_unlock(&be.ready_lock);
+ write_unlock_irqrestore(&srt.ready_lock, flags);
+ return deactivate;
+}
+
+
+static int hsb_schedule(struct task_struct * prev, struct task_struct ** next,
+ runqueue_t * rq)
+{
+ int need_deactivate = 1;
+ cpu_state_t *state = NULL;
+
+ preempt_disable();
+
+ state = &__get_cpu_var(hsb_cpu_state);
+
+ be_preempt(&be, state);
+
+
+ if (is_realtime(prev) && !is_be(prev) &&
+ get_rt_flags(prev) == RT_F_SLEEP)
+ {
+ TRACE("preparing %d for next period\n", prev->pid);
+ release_capacity(&cap_queue, prev->time_slice,
+ prev->rt_param.times.deadline, prev);
+ prepare_for_next_period(prev);
+ }
+
+ if (get_rt_mode() == MODE_RT_RUN) {
+		/* we need to schedule HRT if an HRT job is pending or when
+		 * we have a non-expired HRT job on the cpu
+ */
+
+ if (hrt_client_eligible(&state->hrt) ||
+ unlikely((is_hrt(prev) && is_running(prev) &&
+ get_rt_flags(prev) != RT_F_SLEEP))) {
+ if (state->cap) {
+ return_capacity(&cap_queue, state->cap);
+ state->cap = NULL;
+ }
+ need_deactivate = schedule_hrt(prev, next, rq);
+ } else
+ need_deactivate = schedule_srt_be_cap(prev, next, rq);
+
+ }
+
+ if (is_realtime(prev) && need_deactivate && prev->array) {
+ /* take it out of the run queue */
+ deactivate_task(prev, rq);
+ }
+
+ preempt_enable();
+
+ return 0;
+}
+
+/* put task into correct queue */
+static inline void hsb_add_release(struct task_struct *t)
+{
+ if (is_hrt(t))
+ add_release(hrt_dom(get_partition(t)), t);
+ else if (is_srt(t))
+ add_release(&srt, t);
+ else if (is_be(t)) {
+ t->time_slice = 0;
+ fifo_enqueue(&hsb_fifo, t);
+ fifo_check_resched();
+ } else
+ BUG();
+
+}
+
+/* put task into correct queue */
+static inline void hsb_add_ready(struct task_struct *t)
+{
+ if (is_hrt(t))
+ add_ready(hrt_dom(get_partition(t)), t);
+ else if (is_srt(t))
+ add_ready(&srt, t);
+ else if (is_be(t)) {
+ fifo_enqueue(&hsb_fifo, t);
+ fifo_check_resched();
+ }
+ else
+ BUG();
+}
+
+
+/* _finish_switch - we just finished the switch away from prev
+ * it is now safe to requeue the task
+ */
+static void hsb_finish_switch(struct task_struct *prev)
+{
+ if (!is_realtime(prev) || !is_running(prev))
+ return;
+
+ TRACE("finish switch for %d\n", prev->pid);
+
+ if (is_be(prev)) {
+ fifo_enqueue(&hsb_fifo, prev);
+ return;
+ }
+
+ if (get_rt_flags(prev) == RT_F_SLEEP ||
+ get_rt_mode() != MODE_RT_RUN) {
+ /* this task has expired
+ * _schedule has already taken care of updating
+ * the release and
+		 * deadline. We just need to check whether it has been released.
+ */
+ if (is_released(prev) && get_rt_mode() == MODE_RT_RUN) {
+ sched_trace_job_release(prev);
+ hsb_add_ready(prev);
+ TRACE("%d goes straight to ready queue\n", prev->pid);
+ }
+ else
+ /* it has got to wait */
+ hsb_add_release(prev);
+ }
+ else {
+ /* this is a forced preemption
+ * thus the task stays in the ready_queue
+ * we only must make it available to other cpus
+ */
+ hsb_add_ready(prev);
+ }
+}
+
+
+/* Prepare a task for running in RT mode
+ * Enqueues the task into master queue data structure
+ * returns
+ * -EPERM if task is not TASK_STOPPED
+ */
+static long hsb_prepare_task(struct task_struct * t)
+{
+ TRACE("edf-hsb: prepare task %d\n", t->pid);
+
+ if (t->state == TASK_STOPPED) {
+ __setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1);
+
+ if (get_rt_mode() == MODE_RT_RUN && !is_be(t))
+			/* Real-time mode is already active.
+			 * Prepare an immediate release.
+ */
+ prepare_new_release(t);
+		/* The task state must be TASK_RUNNING while it sits in our
+		 * queues, otherwise the signal code will try to wake it up
+		 * with fatal consequences.
+ */
+ t->state = TASK_RUNNING;
+ if (is_be(t))
+ t->rt_param.times.deadline = 0;
+ hsb_add_release(t);
+ return 0;
+ }
+ else
+ return -EPERM;
+}
+
+static void hsb_wake_up_task(struct task_struct *task)
+{
+ /* We must determine whether task should go into the release
+ * queue or into the ready queue. It may enter the ready queue
+ * if it has credit left in its time slice and has not yet reached
+	 * its deadline. If it is now past its deadline we assume this is the
+	 * arrival of a new sporadic job and thus put it in the ready queue
+	 * anyway. If it has zero budget and the next release is in the future
+ * it has to go to the release queue.
+ */
+ TRACE("edf-hsb: wake up %d with budget=%d\n",
+ task->pid, task->time_slice);
+ task->state = TASK_RUNNING;
+
+ if (is_be(task)) {
+ hsb_add_release(task);
+ }
+ else if (is_tardy(task)) {
+ /* new sporadic release */
+ prepare_new_release(task);
+ sched_trace_job_release(task);
+ hsb_add_ready(task);
+ }
+ else if (task->time_slice) {
+ /* came back in time before deadline
+ * TODO: clip budget to fit into period, otherwise it could
+ * cause a deadline overrun in the next period, i.e.
+ * over allocation in the next period.
+ */
+ set_rt_flags(task, RT_F_RUNNING);
+ hsb_add_ready(task);
+ }
+ else {
+ hsb_add_release(task);
+ }
+
+}
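+
+/*
+ * Wake-up summary: a tardy task (deadline already passed) is treated as a
+ * new sporadic release and becomes ready immediately; a task that wakes up
+ * before its deadline with budget left simply resumes; a task with no
+ * budget left waits in the release queue for its next period.  BE tasks
+ * always go through hsb_add_release(), which feeds them to the FIFO queue.
+ */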
+
+static void hsb_task_blocks(struct task_struct *t)
+{
+	/* CLEANUP: The BUG_ON actually triggered in a really weird case when a
+	 *          BEST_EFFORT task gets caught in a migration right after execv.
+ * The next version of Litmus should deal with this more gracefully.
+ */
+
+ /*BUG_ON(!is_realtime(t));*/
+ /* not really anything to do since it can only block if
+ * it is running, and when it is not running it is not in any
+ * queue anyway.
+ *
+ * TODO: Check whether the assumption is correct for SIGKILL and
+ * SIGSTOP.
+ */
+ TRACE("task %d blocks with budget=%d\n", t->pid, t->time_slice);
+ /*BUG_ON(t->rt_list.next != LIST_POISON1);*/
+ /*BUG_ON(t->rt_list.prev != LIST_POISON2);*/
+
+ if (is_be(t))
+ sched_trace_job_completion(t);
+}
+
+
+/* When _tear_down is called, the task should not be in any queue any more
+ * as it must have blocked first. We don't have any internal state for the task,
+ * it is all in the task_struct.
+ */
+static long hsb_tear_down(struct task_struct * t)
+{
+ /* CLEANUP: see hsb_task_blocks */
+ /*BUG_ON(!is_realtime(t));
+ TRACE("edf-hsb: tear down called for %d \n", t->pid);
+ BUG_ON(t->array);
+ BUG_ON(t->rt_list.next != LIST_POISON1);
+ BUG_ON(t->rt_list.prev != LIST_POISON2);*/
+ return 0;
+}
+
+static int hsb_mode_change(int new_mode)
+{
+ int cpu;
+ cpu_state_t *entry;
+ jiffie_t start;
+
+ TRACE("[%d] edf-hsb: mode changed to %d\n", smp_processor_id(),
+ new_mode);
+ if (new_mode == MODE_RT_RUN) {
+ start = jiffies + 20;
+ prepare_new_releases(&srt, start);
+ be_prepare_new_releases(&be, start);
+
+ /* initialize per CPU state
+ * we can't do this at boot time because we don't know
+ * which CPUs will be online and we can't put non-existing
+ * cpus into the queue
+ */
+ spin_lock(&hsb_cpu_lock);
+ /* get old cruft out of the way in case we reenter real-time
+ * mode for a second time
+ */
+ while (!list_empty(&hsb_cpu_queue))
+ list_del(hsb_cpu_queue.next);
+ /* reinitialize */
+ for_each_online_cpu(cpu) {
+ entry = &per_cpu(hsb_cpu_state, cpu);
+ atomic_set(&entry->will_schedule, 0);
+ entry->exec_class = RT_CLASS_BEST_EFFORT;
+ entry->cur_deadline = 0;
+ list_add(&entry->list, &hsb_cpu_queue);
+
+ prepare_new_releases(&entry->hrt.domain, start);
+ prepare_hrt_release(&entry->hrt, start);
+ }
+ spin_unlock(&hsb_cpu_lock);
+
+ }
+ TRACE("[%d] edf-hsb: mode change done\n", smp_processor_id());
+ return 0;
+}
+
+
+typedef enum {
+ EDF_HSB_SET_HRT,
+ EDF_HSB_GET_HRT,
+ EDF_HSB_CREATE_BE
+} edf_hsb_setup_cmds_t;
+
+typedef struct {
+ int cpu;
+ unsigned int wcet;
+ unsigned int period;
+} setup_hrt_param_t;
+
+typedef struct {
+ unsigned int wcet;
+ unsigned int period;
+} create_be_param_t;
+
+typedef struct {
+ union {
+ setup_hrt_param_t setup_hrt;
+ create_be_param_t create_be;
+ };
+} param_t;
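+
+/*
+ * Illustrative use of the setup commands (however user space reaches the
+ * plugin's scheduler_setup() hook): EDF_HSB_SET_HRT with a setup_hrt_param_t
+ * of { .cpu = 1, .wcet = 2, .period = 10 } reserves an HRT server of 2 ticks
+ * every 10 ticks on CPU 1; EDF_HSB_CREATE_BE with a create_be_param_t of
+ * { .wcet = 5, .period = 50 } adds one global best-effort server.
+ */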
+
+static pid_t next_be_server_pid = SRT_BASE_PID;
+
+static int hsb_scheduler_setup(int cmd, void __user* up)
+{
+ unsigned long flags;
+ int error = -EINVAL;
+ cpu_state_t* state;
+ be_server_t* srv;
+ param_t param;
+
+ switch (cmd) {
+ case EDF_HSB_SET_HRT:
+		if (copy_from_user(&param, up, sizeof(setup_hrt_param_t))) {
+ error = -EFAULT;
+ goto out;
+ }
+ if (!cpu_online(param.setup_hrt.cpu)) {
+ printk(KERN_WARNING "scheduler setup: "
+ "CPU %d is not online!\n", param.setup_hrt.cpu);
+ error = -EINVAL;
+ goto out;
+ }
+ if (param.setup_hrt.period < param.setup_hrt.wcet) {
+ printk(KERN_WARNING "period < wcet!\n");
+ error = -EINVAL;
+ goto out;
+ }
+
+ state = &per_cpu(hsb_cpu_state, param.setup_hrt.cpu);
+ spin_lock_irqsave(&state->lock, flags);
+
+ state->hrt.wcet = param.setup_hrt.wcet;
+ state->hrt.period = param.setup_hrt.period;
+
+ spin_unlock_irqrestore(&state->lock, flags);
+
+ printk(KERN_WARNING "edf-hsb: set HRT #%d to (%d, %d)\n",
+ param.setup_hrt.cpu, param.setup_hrt.wcet,
+ param.setup_hrt.period);
+
+ error = 0;
+
+ break;
+
+ case EDF_HSB_GET_HRT:
+		if (copy_from_user(&param, up, sizeof(setup_hrt_param_t))) {
+ error = -EFAULT;
+ goto out;
+ }
+ if (!cpu_online(param.setup_hrt.cpu)) {
+ error = -EINVAL;
+ goto out;
+ }
+ state = &per_cpu(hsb_cpu_state, param.setup_hrt.cpu);
+ spin_lock_irqsave(&state->lock, flags);
+
+ param.setup_hrt.wcet = state->hrt.wcet;
+ param.setup_hrt.period = state->hrt.period;
+
+ spin_unlock_irqrestore(&state->lock, flags);
+
+		if (copy_to_user(up, &param, sizeof(setup_hrt_param_t))) {
+ error = -EFAULT;
+ goto out;
+ }
+ error = 0;
+ break;
+
+ case EDF_HSB_CREATE_BE:
+		if (copy_from_user(&param, up, sizeof(create_be_param_t))) {
+ error = -EFAULT;
+ goto out;
+ }
+ if (param.create_be.period < param.create_be.wcet ||
+ !param.create_be.period || !param.create_be.wcet) {
+ error = -EINVAL;
+ goto out;
+ }
+ srv = (be_server_t*) kmalloc(sizeof(be_server_t), GFP_KERNEL);
+ if (!srv) {
+ error = -ENOMEM;
+ goto out;
+ }
+ srv->wcet = param.create_be.wcet;
+ srv->period = param.create_be.period;
+ srv->pid = next_be_server_pid++;
+ INIT_LIST_HEAD(&srv->list);
+ be_prepare_new_release(srv, jiffies);
+ be_enqueue(&be, srv);
+
+ printk(KERN_WARNING "edf-hsb: created a BE with (%d, %d)\n",
+ param.create_be.wcet, param.create_be.period);
+
+ error = 0;
+ break;
+
+ default:
+ printk(KERN_WARNING "edf-hsb: unknown command %d\n", cmd);
+ }
+
+out:
+ return error;
+}
+
+/* Plugin object */
+static sched_plugin_t s_plugin __cacheline_aligned_in_smp = {
+ .ready_to_use = 0
+};
+
+
+/*
+ * Plugin initialization code.
+ */
+#define INIT_SCHED_PLUGIN (struct sched_plugin){\
+ .plugin_name = "EDF-HSB",\
+ .ready_to_use = 1,\
+ .algo_scheduler_tick = hsb_scheduler_tick,\
+ .scheduler_tick = rt_scheduler_tick,\
+ .prepare_task = hsb_prepare_task,\
+ .sleep_next_period = edf_sleep_next_period,\
+ .tear_down = hsb_tear_down,\
+ .shutdown_hook = 0,\
+ .schedule = hsb_schedule,\
+ .finish_switch = hsb_finish_switch,\
+ .mode_change = hsb_mode_change,\
+ .wake_up_task = hsb_wake_up_task,\
+ .task_blocks = hsb_task_blocks, \
+ .scheduler_setup = hsb_scheduler_setup \
+}
+
+
+sched_plugin_t *__init init_edf_hsb_plugin(void)
+{
+ int i;
+
+ if (!s_plugin.ready_to_use)
+ {
+ set_sched_options(SCHED_NONE);
+ capacity_queue_init(&cap_queue);
+ edf_domain_init(&srt, srt_check_resched);
+ edf_domain_init(&be, be_check_resched);
+ fifo_domain_init(&hsb_fifo, 50);
+ for (i = 0; i < NR_CPUS; i++)
+ {
+ hsb_cpu_state_init(&per_cpu(hsb_cpu_state, i),
+ hrt_check_resched, i);
+ printk("HRT server %d initialized.\n", i);
+ }
+ s_plugin = INIT_SCHED_PLUGIN;
+ }
+ return &s_plugin;
+}
diff --git a/kernel/sched_global_edf.c b/kernel/sched_global_edf.c
new file mode 100644
index 0000000..0781de1
--- /dev/null
+++ b/kernel/sched_global_edf.c
@@ -0,0 +1,565 @@
+/*
+ * kernel/sched_global_edf.c
+ *
+ * Re-Implementation of the Global EDF scheduler.
+ *
+ * This version works without using the struct queue. It uses the
+ * builtin kernel lists.
+ */
+
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+
+#include <linux/litmus.h>
+#include <linux/sched_plugin.h>
+#include <linux/edf_common.h>
+#include <linux/sched_trace.h>
+
+
+/* cpu_entry_t - maintain state of the priority of cpu's current task
+ * this is needed to check for priority inversions.
+ */
+typedef struct {
+ int cpu;
+ int executes_realtime;
+ jiffie_t cur_deadline;
+ struct list_head list;
+ atomic_t will_schedule;
+} cpu_entry_t;
+DEFINE_PER_CPU(cpu_entry_t, gedf_cpu_entries);
+
+#define set_will_schedule() \
+ (atomic_set(&__get_cpu_var(gedf_cpu_entries).will_schedule, 1))
+#define clear_will_schedule() \
+ (atomic_set(&__get_cpu_var(gedf_cpu_entries).will_schedule, 0))
+#define test_will_schedule(cpu) \
+ (atomic_read(&per_cpu(gedf_cpu_entries, cpu).will_schedule))
+
+
+/* always acquire the cpu lock as the last lock to avoid deadlocks */
+static spinlock_t gedf_cpu_lock = SPIN_LOCK_UNLOCKED;
+/* the cpus queue themselves according to priority in here */
+static LIST_HEAD(gedf_cpu_queue);
+
+
+static edf_domain_t gedf;
+
+#define DUMP(args...) TRACE(args)
+
+/* adjust_cpu_queue - Move the cpu entry to the correct place to maintain
+ * order in the cpu queue. Caller must hold ready write lock.
+ *
+ */
+static void adjust_cpu_queue(int exec_rt, jiffie_t deadline)
+{
+ struct list_head *pos;
+ cpu_entry_t *other;
+ cpu_entry_t *entry;
+
+ spin_lock(&gedf_cpu_lock);
+
+ entry = &__get_cpu_var(gedf_cpu_entries);
+ entry->executes_realtime = exec_rt;
+ entry->cur_deadline = deadline;
+
+ /* TODO: move instead of del+reinsert */
+ list_del(&entry->list);
+ /* if we do not execute real-time jobs we just move
+ * to the end of the queue
+ */
+ if (entry->executes_realtime)
+ list_for_each(pos, &gedf_cpu_queue) {
+ other = list_entry(pos, cpu_entry_t, list);
+ if (!other->executes_realtime ||
+ time_before_eq(entry->cur_deadline,
+ other->cur_deadline))
+ {
+ __list_add(&entry->list, pos->prev, pos);
+ goto out;
+ }
+ }
+ /* if we get this far we have the lowest priority task */
+ list_add_tail(&entry->list, &gedf_cpu_queue);
+
+ out:
+ spin_unlock(&gedf_cpu_lock);
+}
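+
+/*
+ * Example: with CPU 0 idle, CPU 1 running a job with deadline 200 and CPU 2
+ * running a job with deadline 150, the queue reads CPU2 -> CPU1 -> CPU0.
+ * gedf_check_resched() below only has to look at the tail (CPU 0 here) to
+ * find the cheapest CPU to preempt.
+ */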
+
+
+/* gedf_check_resched - Check whether another CPU needs to reschedule.
+ *
+ * The function only checks and kicks the last CPU. The kicked CPU will
+ * reschedule and kick the next one if necessary, and so on. The caller is
+ * responsible for making sure that it is not the last entry or that a
+ * reschedule is not necessary.
+ *
+ * TODO: This function is probably way too trigger happy. It should only send
+ * IPIs if the other CPU is not going to reschedule anyway. But that is
+ * hard to detect reliably. Too many schedules will hurt performance
+ * but do not cause incorrect schedules.
+ */
+static int gedf_check_resched(edf_domain_t *edf)
+{
+ cpu_entry_t *last;
+ int ret = 0;
+
+ spin_lock(&gedf_cpu_lock);
+
+ if (!list_empty(&edf->ready_queue)) {
+ last = list_entry(gedf_cpu_queue.prev, cpu_entry_t, list);
+ if (!last->executes_realtime ||
+ time_before(next_ready(edf)->rt_param.times.deadline,
+ last->cur_deadline))
+ {
+ if (smp_processor_id() == last->cpu)
+ set_tsk_need_resched(current);
+ else
+ if (!test_will_schedule(last->cpu))
+ smp_send_reschedule(last->cpu);
+ ret = 1;
+ }
+ }
+
+ spin_unlock(&gedf_cpu_lock);
+ return ret;
+}
+
+
+
+/* gedf_scheduler_tick - this function is called for every local timer
+ * interrupt.
+ *
+ * checks whether the current task has expired and checks
+ * whether we need to preempt it if it has not expired
+ */
+static reschedule_check_t gedf_scheduler_tick(void)
+{
+ unsigned long flags;
+ struct task_struct *t = current;
+ reschedule_check_t want_resched = NO_RESCHED;
+
+ /* expire tasks even if not in real-time mode
+ * this makes sure that at the end of real-time mode
+ * no tasks "run away forever".
+ */
+ BUG_ON(is_realtime(t) && t->time_slice > 100000);
+ if (is_realtime(t) && (!--t->time_slice)) {
+ /* this task has exhausted its budget in this period */
+ set_rt_flags(t, RT_F_SLEEP);
+ want_resched = FORCE_RESCHED;
+ set_will_schedule();
+ sched_trace_job_completion(t);
+ }
+ if (get_rt_mode() == MODE_RT_RUN)
+ {
+ /* check whether anything is waiting to be released
+ * this could probably be moved to the global timer
+ * interrupt handler since the state will only change
+ * once per jiffie
+ */
+ try_release_pending(&gedf);
+ if (want_resched != FORCE_RESCHED)
+ {
+ read_lock_irqsave(&gedf.ready_lock, flags);
+ if (preemption_needed(&gedf, t))
+ {
+ want_resched = FORCE_RESCHED;
+ set_will_schedule();
+ }
+ read_unlock_irqrestore(&gedf.ready_lock, flags);
+ }
+ }
+ return want_resched;
+}
+
+/* This is the main Global EDF schedule function.
+ *
+ * Assumes the caller holds the lock for rq and that irqs are disabled.
+ * This function only works for indirect switching.
+ */
+static int gedf_schedule(struct task_struct * prev,
+ struct task_struct ** next,
+ runqueue_t * rq)
+{
+ int need_deactivate = 1;
+ int rt;
+ jiffie_t deadline;
+ unsigned long flags;
+
+
+ if (is_realtime(prev) && get_rt_flags(prev) == RT_F_SLEEP)
+ {
+ DUMP("preparing %d for next period\n", prev->pid);
+ prepare_for_next_period(prev);
+ }
+
+ if (get_rt_mode() == MODE_RT_RUN) {
+ write_lock_irqsave(&gedf.ready_lock, flags);
+
+ clear_will_schedule();
+
+ if (is_realtime(prev) && is_released(prev) && is_running(prev)
+ && !preemption_needed(&gedf, prev)) {
+ /* Our current task's next job has already been
+ * released and has higher priority than the highest
+			 * priority waiting task; in other words: it is tardy.
+ * We just keep it.
+ */
+ DUMP("prev will be next, already released\n");
+ *next = prev;
+ rt = 1;
+ deadline = prev->rt_param.times.deadline;
+ need_deactivate = 0;
+ } else {
+ /* either not yet released, preempted, or non-rt */
+ *next = __take_ready(&gedf);
+ if (*next) {
+ /* mark the task as executing on this cpu */
+ set_task_cpu(*next, smp_processor_id());
+
+ /* stick the task into the runqueue */
+ __activate_task(*next, rq);
+ rt = 1;
+ deadline = (*next)->rt_param.times.deadline;
+ }
+ else
+ rt = deadline = 0;
+ }
+
+ adjust_cpu_queue(rt, deadline);
+
+ if (rt) {
+ set_rt_flags(*next, RT_F_RUNNING);
+ gedf.check_resched(&gedf);
+ }
+ write_unlock_irqrestore(&gedf.ready_lock, flags);
+ }
+
+ if (is_realtime(prev) && need_deactivate && prev->array) {
+ /* take it out of the run queue */
+ deactivate_task(prev, rq);
+ }
+
+ /* don't put back into release yet.
+ * We first need to actually switch
+ * stacks before we can execute it
+ * on a different CPU */
+
+ /* in the current implementation nobody cares about the return value */
+ return 0;
+}
+
+
+/* _finish_switch - we just finished the switch away from prev
+ * it is now safe to requeue the task
+ */
+static void gedf_finish_switch(struct task_struct *prev)
+{
+ if (!is_realtime(prev) || !is_running(prev))
+ return;
+
+ /*printk(KERN_INFO "gedf finish switch for %d\n", prev->pid);*/
+ if (get_rt_flags(prev) == RT_F_SLEEP ||
+ get_rt_mode() != MODE_RT_RUN) {
+ /* this task has expired
+ * _schedule has already taken care of updating
+ * the release and
+		 * deadline. We just need to check whether it has been released.
+ */
+ if (time_before_eq(prev->rt_param.times.release, jiffies)
+ && get_rt_mode() == MODE_RT_RUN) {
+ /* already released */
+ add_ready(&gedf, prev);
+ DUMP("%d goes straight to ready queue\n", prev->pid);
+ }
+ else
+ /* it has got to wait */
+ add_release(&gedf, prev);
+ }
+ else {
+ /* this is a forced preemption
+ * thus the task stays in the ready_queue
+ * we only must make it available to others
+ */
+ add_ready(&gedf, prev);
+ }
+}
+
+
+/* Prepare a task for running in RT mode
+ * Enqueues the task into master queue data structure
+ * returns
+ * -EPERM if task is not TASK_STOPPED
+ */
+static long gedf_prepare_task(struct task_struct * t)
+{
+ TRACE("global edf: prepare task %d\n", t->pid);
+
+ if (t->state == TASK_STOPPED) {
+ __setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1);
+
+ if (get_rt_mode() == MODE_RT_RUN)
+			/* Real-time mode is already active.
+			 * Prepare an immediate release.
+ */
+ prepare_new_release(t);
+		/* The task state must be TASK_RUNNING while it sits in our
+		 * queues, otherwise the signal code will try to wake it up
+		 * with fatal consequences.
+ */
+ t->state = TASK_RUNNING;
+ add_release(&gedf, t);
+ return 0;
+ }
+ else
+ return -EPERM;
+}
+
+static void gedf_wake_up_task(struct task_struct *task)
+{
+ /* We must determine whether task should go into the release
+ * queue or into the ready queue. It may enter the ready queue
+ * if it has credit left in its time slice and has not yet reached
+	 * its deadline. If it is now past its deadline we assume this is the
+	 * arrival of a new sporadic job and thus put it in the ready queue
+	 * anyway. If it has zero budget and the next release is in the future
+ * it has to go to the release queue.
+ */
+ TRACE("global edf: wake up %d with budget=%d\n",
+ task->pid, task->time_slice);
+ task->state = TASK_RUNNING;
+ if (is_tardy(task)) {
+ /* new sporadic release */
+ prepare_new_release(task);
+ sched_trace_job_release(task);
+ add_ready(&gedf, task);
+ }
+ else if (task->time_slice) {
+ /* came back in time before deadline
+ * TODO: clip budget to fit into period, otherwise it could
+ * cause a deadline overrun in the next period, i.e.
+ * over allocation in the next period.
+ */
+ set_rt_flags(task, RT_F_RUNNING);
+ add_ready(&gedf, task);
+ }
+ else {
+ add_release(&gedf, task);
+ }
+
+}
+
+static void gedf_task_blocks(struct task_struct *t)
+{
+ BUG_ON(!is_realtime(t));
+ /* not really anything to do since it can only block if
+ * it is running, and when it is not running it is not in any
+ * queue anyway.
+ *
+ * TODO: Check whether the assumption is correct for SIGKILL and
+ * SIGSTOP.
+ */
+ TRACE("task %d blocks with budget=%d\n", t->pid, t->time_slice);
+ BUG_ON(t->rt_list.next != LIST_POISON1);
+ BUG_ON(t->rt_list.prev != LIST_POISON2);
+}
+
+
+/* When _tear_down is called, the task should not be in any queue any more
+ * as it must have blocked first. We don't have any internal state for the task,
+ * it is all in the task_struct.
+ */
+static long gedf_tear_down(struct task_struct * t)
+{
+ BUG_ON(!is_realtime(t));
+ TRACE("global edf: tear down called for %d \n", t->pid);
+ BUG_ON(t->array);
+ BUG_ON(t->rt_list.next != LIST_POISON1);
+ BUG_ON(t->rt_list.prev != LIST_POISON2);
+ return 0;
+}
+
+
+static int gedf_mode_change(int new_mode)
+{
+ int cpu;
+ cpu_entry_t *entry;
+
+/* printk(KERN_INFO "[%d] global edf: mode changed to %d\n", smp_processor_id(),
+ new_mode);*/
+ if (new_mode == MODE_RT_RUN) {
+ prepare_new_releases(&gedf, jiffies + 10);
+
+ /* initialize per CPU state
+ * we can't do this at boot time because we don't know
+ * which CPUs will be online and we can't put non-existing
+ * cpus into the queue
+ */
+ spin_lock(&gedf_cpu_lock);
+ /* get old cruft out of the way in case we reenter real-time
+ * mode for a second time
+ */
+ while (!list_empty(&gedf_cpu_queue))
+ list_del(gedf_cpu_queue.next);
+ /* reinitialize */
+ for_each_online_cpu(cpu) {
+ entry = &per_cpu(gedf_cpu_entries, cpu);
+ atomic_set(&entry->will_schedule, 0);
+ entry->executes_realtime = 0;
+ entry->cur_deadline = 0;
+ entry->cpu = cpu;
+ list_add(&entry->list, &gedf_cpu_queue);
+ }
+ spin_unlock(&gedf_cpu_lock);
+ }
+ /*printk(KERN_INFO "[%d] global edf: mode change done\n", smp_processor_id()); */
+ return 0;
+}
+
+
+/* Plugin object */
+static sched_plugin_t s_plugin __cacheline_aligned_in_smp = {
+ .ready_to_use = 0
+};
+
+
+/*
+ * Plugin initialization code.
+ */
+#define INIT_SCHED_PLUGIN (struct sched_plugin){\
+ .plugin_name = "Global EDF",\
+ .ready_to_use = 1,\
+ .algo_scheduler_tick = gedf_scheduler_tick,\
+ .scheduler_tick = rt_scheduler_tick,\
+ .prepare_task = gedf_prepare_task,\
+ .sleep_next_period = edf_sleep_next_period,\
+ .tear_down = gedf_tear_down,\
+ .shutdown_hook = 0,\
+ .schedule = gedf_schedule,\
+ .finish_switch = gedf_finish_switch,\
+ .mode_change = gedf_mode_change,\
+ .wake_up_task = gedf_wake_up_task,\
+ .task_blocks = gedf_task_blocks \
+ }
+
+
+sched_plugin_t *__init init_global_edf_plugin(void)
+{
+ if (!s_plugin.ready_to_use)
+ {
+ set_sched_options(SCHED_NONE);
+ edf_domain_init(&gedf, gedf_check_resched);
+ s_plugin = INIT_SCHED_PLUGIN;
+ }
+ return &s_plugin;
+}
+
+
+
+/*****************************************************************************/
+/*****************************************************************************/
+/*****************************************************************************/
+/* NON-PREEMPTIVE GLOBAL EDF */
+
+
+/* gedf_np_scheduler_tick - this function is called for every local timer
+ * interrupt.
+ *
+ * checks whether the current task has expired and checks
+ * whether we need to preempt it if it has not expired
+ */
+static reschedule_check_t gedf_np_scheduler_tick(void)
+{
+ if (get_rt_mode() == MODE_RT_RUN)
+ {
+ /* check whether anything is waiting to be released
+ * this could probably be moved to the global timer
+ * interrupt handler since the state will only change
+ * once per jiffie
+ */
+ try_release_pending(&gedf);
+ }
+
+ /* expire tasks even if not in real-time mode
+ * this makes sure that at the end of real-time mode
+ * no tasks "run away forever".
+ */
+ BUG_ON(current->time_slice > 1000);
+ if (is_realtime(current) && (!--current->time_slice)) {
+ /* this task has exhausted its budget in this period */
+ set_rt_flags(current, RT_F_SLEEP);
+ return FORCE_RESCHED;
+ }
+ else
+ return NO_RESCHED;
+}
+
+/* gedf_np_check_resched - Check whether another CPU needs to reschedule.
+ *
+ * The function only checks and kicks the last CPU. The kicked CPU will
+ * reschedule and kick the next one if necessary, and so on. The caller is
+ * responsible for making sure that it is not the last entry or that a
+ * reschedule is not necessary.
+ *
+ */
+static int gedf_np_check_resched(edf_domain_t *edf)
+{
+ cpu_entry_t *last;
+ int ret = 0;
+
+ spin_lock(&gedf_cpu_lock);
+
+ if (!list_empty(&edf->ready_queue)) {
+ last = list_entry(gedf_cpu_queue.prev, cpu_entry_t, list);
+ /* preemption happens only for non-realtime tasks */
+ if (!last->executes_realtime)
+ {
+ if (smp_processor_id() == last->cpu)
+ set_tsk_need_resched(current);
+ else
+ smp_send_reschedule(last->cpu);
+ ret = 1;
+ goto out;
+ }
+ }
+
+ out:
+ spin_unlock(&gedf_cpu_lock);
+ return ret;
+}
+
+
+/* non-preemptive global EDF
+ *
+ * Non-preemptive EDF is almost the same as normal EDF. We only have to
+ * adjust the scheduler tick and the resched function.
+ */
+#define INIT_SCHED_PLUGIN_NP (struct sched_plugin){\
+ .plugin_name = "Non-Preemptive Global EDF",\
+ .ready_to_use = 1,\
+ .algo_scheduler_tick = gedf_np_scheduler_tick,\
+ .scheduler_tick = rt_scheduler_tick,\
+ .prepare_task = gedf_prepare_task,\
+ .sleep_next_period = edf_sleep_next_period,\
+ .tear_down = gedf_tear_down,\
+ .shutdown_hook = 0,\
+ .schedule = gedf_schedule,\
+ .finish_switch = gedf_finish_switch,\
+ .mode_change = gedf_mode_change,\
+ .wake_up_task = gedf_wake_up_task,\
+ .task_blocks = gedf_task_blocks \
+ }
+
+
+/* As we only set the plugin at boot time,
+ * we use the same structure as preemptive EDF. This simplifies a lot
+ * of the functions.
+ */
+sched_plugin_t* __init init_global_edf_np_plugin(void)
+{
+ if (!s_plugin.ready_to_use)
+ {
+ set_sched_options(SCHED_NONE);
+ edf_domain_init(&gedf, gedf_np_check_resched);
+ s_plugin = INIT_SCHED_PLUGIN_NP;
+ }
+ return &s_plugin;
+}
diff --git a/kernel/sched_gsn_edf.c b/kernel/sched_gsn_edf.c
new file mode 100644
index 0000000..9042588
--- /dev/null
+++ b/kernel/sched_gsn_edf.c
@@ -0,0 +1,760 @@
+/*
+ * kernel/sched_gsn_edf.c
+ *
+ * Implementation of the GSN-EDF scheduling algorithm.
+ *
+ * This version uses the simple approach and serializes all scheduling
+ * decisions by the use of a queue lock. This is probably not the
+ * best way to do it, but it should suffice for now. It should not
+ * affect the benchmarks since all synchronization primitives will
+ * take the same performance hit, if any.
+ */
+
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+
+#include <linux/queuelock.h>
+#include <linux/litmus.h>
+#include <linux/sched_plugin.h>
+#include <linux/edf_common.h>
+#include <linux/sched_trace.h>
+
+int in_gsnedf_schedule[NR_CPUS] = {0, 0, 0, 0};
+int in_gsnedf_scheduler_tick[NR_CPUS] = {0, 0, 0, 0};
+int in_gsnedf_finish_switch[NR_CPUS] = {0, 0, 0, 0};
+
+/* cpu_entry_t - maintain the linked and scheduled state
+ */
+typedef struct {
+ int cpu;
+ struct task_struct* linked; /* only RT tasks */
+ struct task_struct* scheduled; /* only RT tasks */
+ struct list_head list;
+ atomic_t will_schedule; /* prevent unneeded IPIs */
+} cpu_entry_t;
+DEFINE_PER_CPU(cpu_entry_t, gsnedf_cpu_entries);
+
+#define set_will_schedule() \
+ (atomic_set(&__get_cpu_var(gsnedf_cpu_entries).will_schedule, 1))
+#define clear_will_schedule() \
+ (atomic_set(&__get_cpu_var(gsnedf_cpu_entries).will_schedule, 0))
+#define test_will_schedule(cpu) \
+ (atomic_read(&per_cpu(gsnedf_cpu_entries, cpu).will_schedule))
+
+
+#define NO_CPU 0xffffffff
+
+/* The gsnedf_lock is used to serialize all scheduling events. It protects
+ * the EDF domain (gsnedf), the CPU queue (gsnedf_cpu_queue), and the
+ * per-CPU linked/scheduled state.
+ */
+static queuelock_t gsnedf_lock;
+/* the cpus queue themselves according to priority in here */
+static LIST_HEAD(gsnedf_cpu_queue);
+
+static edf_domain_t gsnedf;
+
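+/* Locking convention (sketch of the code below, no new primitives assumed):
+ * most callbacks bracket their critical sections with
+ * queue_lock_irqsave()/queue_unlock_irqrestore(). The exception is the
+ * schedule()/finish_switch() pair: gsnedf_schedule() acquires gsnedf_lock
+ * and, unless no real-time task is affected or no actual switch happens,
+ * leaves it held so that gsnedf_finish_switch() can update ->scheduled
+ * under the same lock before releasing it.
+ */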
+
+/* update_cpu_position - Move the cpu entry to the correct place to maintain
+ * order in the cpu queue. Caller must hold gsnedf lock.
+ */
+static void update_cpu_position(cpu_entry_t *entry)
+{
+ cpu_entry_t *other;
+ struct list_head *pos;
+ list_del(&entry->list);
+ /* if we do not execute real-time jobs we just move
+ * to the end of the queue
+ */
+ if (entry->linked) {
+ list_for_each(pos, &gsnedf_cpu_queue) {
+ other = list_entry(pos, cpu_entry_t, list);
+ if (edf_higher_prio(entry->linked, other->linked)) {
+ __list_add(&entry->list, pos->prev, pos);
+ return;
+ }
+ }
+ }
+ /* if we get this far we have the lowest priority job */
+ list_add_tail(&entry->list, &gsnedf_cpu_queue);
+}
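+
+/* Illustrative example (hypothetical deadlines, not taken from the code):
+ * with three CPUs whose linked jobs have deadlines 10, 25, and 17, the queue
+ * is kept sorted as
+ *
+ *	head -> CPU(d=10) -> CPU(d=17) -> CPU(d=25) -> tail
+ *
+ * so gsnedf_cpu_queue.prev always refers to the CPU holding the
+ * lowest-priority linked job, i.e., the preemption candidate. A CPU with no
+ * linked job is placed at the tail.
+ */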
+
+/* link_task_to_cpu - Update the link of a CPU.
+ * Handles the case where the to-be-linked task is already
+ * scheduled on a different CPU.
+ */
+static noinline void link_task_to_cpu(struct task_struct* linked,
+ cpu_entry_t *entry)
+
+{
+ cpu_entry_t *sched;
+ struct task_struct* tmp;
+ int on_cpu;
+
+ BUG_ON(linked && !is_realtime(linked));
+
+ /* Currently linked task is set to be unlinked. */
+ if (entry->linked) {
+ entry->linked->rt_param.linked_on = NO_CPU;
+ }
+
+ /* Link new task to CPU. */
+ if (linked) {
+ set_rt_flags(linked, RT_F_RUNNING);
+		/* handle the case where the task is already scheduled somewhere else */
+ on_cpu = linked->rt_param.scheduled_on;
+ if (on_cpu != NO_CPU) {
+ sched = &per_cpu(gsnedf_cpu_entries, on_cpu);
+ /* this should only happen if not linked already */
+ BUG_ON(sched->linked == linked);
+
+ /* If we are already scheduled on the CPU to which we
+ * wanted to link, we don't need to do the swap --
+ * we just link ourselves to the CPU and depend on
+ * the caller to get things right.
+ */
+ if (entry != sched) {
+ tmp = sched->linked;
+ linked->rt_param.linked_on = sched->cpu;
+ sched->linked = linked;
+ update_cpu_position(sched);
+ linked = tmp;
+ }
+ }
+ if (linked) /* might be NULL due to swap */
+ linked->rt_param.linked_on = entry->cpu;
+ }
+ entry->linked = linked;
+ update_cpu_position(entry);
+}
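+
+/* Example of the swap above (hypothetical jobs A and B): suppose A is to be
+ * linked to CPU 0 but is still scheduled on CPU 1, where B is currently
+ * linked:
+ *
+ *	before:	CPU0 wants A	CPU1.linked = B		A scheduled on CPU1
+ *	after:	CPU0.linked = B	CPU1.linked = A
+ *
+ * A ends up linked to the CPU it is still scheduled on, while B is linked
+ * to CPU 0 in its place.
+ */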
+
+/* unlink - Make sure a task is not linked any longer to an entry
+ * where it was linked before. Must hold gsnedf_lock.
+ */
+static noinline void unlink(struct task_struct* t)
+{
+ cpu_entry_t *entry;
+
+ BUG_ON(!t);
+
+ if (t->rt_param.linked_on != NO_CPU) {
+ /* unlink */
+ entry = &per_cpu(gsnedf_cpu_entries, t->rt_param.linked_on);
+ t->rt_param.linked_on = NO_CPU;
+ link_task_to_cpu(NULL, entry);
+ } else if (in_list(&t->rt_list)) {
+ /* This is an interesting situation: t is scheduled,
+ * but was just recently unlinked. It cannot be
+ * linked anywhere else (because then it would have
+ * been relinked to this CPU), thus it must be in some
+ * queue. We must remove it from the list in this
+ * case.
+ */
+ list_del(&t->rt_list);
+ }
+}
+
+
+/* preempt - force a CPU to reschedule
+ */
+static noinline void preempt(cpu_entry_t *entry)
+{
+ if (entry->scheduled && is_np(entry->scheduled))
+ return;
+ if (smp_processor_id() == entry->cpu)
+ set_tsk_need_resched(current);
+ else
+ if (!test_will_schedule(entry->cpu))
+ smp_send_reschedule(entry->cpu);
+}
+
+/* requeue - Put an unlinked task into gsn-edf domain.
+ * Caller must hold gsnedf_lock.
+ */
+static noinline void requeue(struct task_struct* task)
+{
+ BUG_ON(!task);
+ /* sanity check rt_list before insertion */
+ BUG_ON(in_list(&task->rt_list));
+
+ if (get_rt_flags(task) == RT_F_SLEEP ||
+ get_rt_mode() != MODE_RT_RUN) {
+ /* this task has expired
+		 * _schedule has already taken care of updating
+		 * the release and the deadline. We only have to check
+		 * whether it has been released.
+ */
+ if (is_released(task) && get_rt_mode() == MODE_RT_RUN)
+ __add_ready(&gsnedf, task);
+ else {
+ /* it has got to wait */
+ __add_release(&gsnedf, task);
+ }
+
+ } else
+ /* this is a forced preemption
+ * thus the task stays in the ready_queue
+ * we only must make it available to others
+ */
+ __add_ready(&gsnedf, task);
+}
+
+/* gsnedf_job_arrival: task is either resumed or released */
+static noinline void gsnedf_job_arrival(struct task_struct* task)
+{
+ cpu_entry_t* last;
+
+ BUG_ON(list_empty(&gsnedf_cpu_queue));
+ BUG_ON(!task);
+
+ /* first queue arriving job */
+ requeue(task);
+
+ /* then check for any necessary preemptions */
+ last = list_entry(gsnedf_cpu_queue.prev, cpu_entry_t, list);
+ if (preemption_needed(&gsnedf, last->linked)) {
+ /* preemption necessary */
+ task = __take_ready(&gsnedf);
+ TRACE("job_arrival: task %d linked to %d\n", task->pid, last->cpu);
+ if (last->linked)
+ requeue(last->linked);
+
+ link_task_to_cpu(task, last);
+ preempt(last);
+ }
+}
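+
+/* Arrival path in a nutshell (same names as above, nothing new introduced):
+ *
+ *	requeue(task);					queue in ready/release queue
+ *	last = tail of gsnedf_cpu_queue;		lowest-priority linked job
+ *	if (preemption_needed(&gsnedf, last->linked))
+ *		link __take_ready(&gsnedf) to last and preempt(last);
+ *
+ * A newly arrived job can therefore only displace the globally
+ * lowest-priority linked job; all other CPUs are left untouched.
+ */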
+
+/* check for current job releases */
+static noinline void gsnedf_release_jobs(void)
+{
+ struct list_head *pos, *save;
+ struct task_struct *queued;
+
+ list_for_each_safe(pos, save, &gsnedf.release_queue) {
+ queued = list_entry(pos, struct task_struct, rt_list);
+ if (likely(is_released(queued))) {
+ /* this one is ready to go*/
+ list_del(pos);
+ set_rt_flags(queued, RT_F_RUNNING);
+
+ sched_trace_job_release(queued);
+ gsnedf_job_arrival(queued);
+ }
+ else
+ /* the release queue is ordered */
+ break;
+ }
+}
+
+/* gsnedf_scheduler_tick - this function is called for every local timer
+ * interrupt.
+ *
+ * Checks whether the current task has expired and, if it has not,
+ * whether it needs to be preempted.
+ */
+static reschedule_check_t gsnedf_scheduler_tick(void)
+{
+ unsigned long flags;
+ struct task_struct* t = current;
+ reschedule_check_t want_resched = NO_RESCHED;
+ cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries);
+
+ /* debug */
+ in_gsnedf_scheduler_tick[smp_processor_id()] = 1;
+
+ /* expire tasks even if not in real-time mode
+ * this makes sure that at the end of real-time mode
+ * no task "runs away forever".
+ */
+ if (is_realtime(t))
+ TRACE_TASK(t, "scheduler tick\n");
+
+ if (is_realtime(t) && t->time_slice && !--t->time_slice) {
+ if (!is_np(t)) { /* np tasks will be preempted when they become
+ preemptable again */
+ set_rt_flags(t, RT_F_SLEEP);
+ want_resched = FORCE_RESCHED;
+ set_will_schedule();
+ sched_trace_job_completion(t);
+ /* prepare for next period */
+ prepare_for_next_period(t);
+ queue_lock_irqsave(&gsnedf_lock, flags);
+ /* unlink */
+ unlink(t);
+ /* requeue */
+ gsnedf_job_arrival(t);
+ queue_unlock_irqrestore(&gsnedf_lock, flags);
+ } else
+ TRACE("gsnedf_scheduler_tick: "
+ "%d is non-preemptable, "
+ "preemption delayed.\n", t->pid);
+
+ }
+ if (get_rt_mode() == MODE_RT_RUN) {
+ in_gsnedf_scheduler_tick[smp_processor_id()] = 666;
+
+ queue_lock_irqsave(&gsnedf_lock, flags);
+
+ /* (1) try to release pending jobs */
+ gsnedf_release_jobs();
+
+ /* (2) check if we need to reschedule */
+ if (entry->linked != entry->scheduled &&
+ (!entry->scheduled || !is_np(entry->scheduled))) {
+ want_resched = FORCE_RESCHED;
+ set_will_schedule();
+ }
+ queue_unlock_irqrestore(&gsnedf_lock, flags);
+ }
+
+ /* debug */
+ in_gsnedf_scheduler_tick[smp_processor_id()] = 0;
+
+ return want_resched;
+}
+
+/* This is the main GSN-EDF schedule function.
+ *
+ * Assumes the caller holds the lock for rq and that irqs are disabled
+ */
+static int gsnedf_schedule(struct task_struct * prev,
+ struct task_struct ** next,
+ runqueue_t * rq)
+{
+ cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries);
+
+ in_gsnedf_schedule[smp_processor_id()] = 1;
+
+ /* will be released in finish_switch */
+ queue_lock(&gsnedf_lock);
+ clear_will_schedule();
+
+ /* (1) check for blocking jobs */
+ if (prev == entry->linked &&
+ (get_rt_mode() != MODE_RT_RUN || !is_running(prev))) {
+ link_task_to_cpu(NULL, entry);
+ }
+
+ /* (2) if not linked then get rt task */
+ if (get_rt_mode() == MODE_RT_RUN && !entry->linked) {
+ link_task_to_cpu(__take_ready(&gsnedf), entry);
+ }
+
+ /* (3) if linked different from scheduled
+ * select linked as next
+ */
+ BUG_ON(entry->scheduled && entry->scheduled != prev);
+ if (entry->linked != entry->scheduled) {
+ /* do we need to take care of a previously scheduled
+ * job? */
+ if (entry->scheduled) {
+ BUG_ON(!is_realtime(prev));
+ if (prev->array)
+ /* take it out of the run queue */
+ deactivate_task(prev, rq);
+ }
+ /* do we need to schedule a linked job? */
+ if (entry->linked) {
+ *next = entry->linked;
+ /* mark the task as executing on this cpu */
+ set_task_cpu(*next, smp_processor_id());
+ /* stick the task into the runqueue */
+ __activate_task(*next, rq);
+ }
+ } else
+ *next = entry->linked;
+
+ /* unlock in case that we don't affect real-time tasks or
+ * if nothing changed and finish_switch won't be called
+ */
+ if (prev == *next || (!is_realtime(prev) && !*next))
+ queue_unlock(&gsnedf_lock);
+
+ in_gsnedf_schedule[smp_processor_id()] = 0;
+
+ return 0;
+}
+
+
+/* _finish_switch - we just finished the switch away from prev
+ */
+static void gsnedf_finish_switch(struct task_struct *prev)
+{
+ cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries);
+
+ in_gsnedf_finish_switch[smp_processor_id()] = 1;
+
+ if (is_realtime(current))
+ entry->scheduled = current;
+ else
+ entry->scheduled = NULL;
+
+ prev->rt_param.scheduled_on = NO_CPU;
+ current->rt_param.scheduled_on = smp_processor_id();
+
+ /* unlock in case schedule() left it locked */
+ if (is_realtime(current) || is_realtime(prev))
+ queue_unlock(&gsnedf_lock);
+
+
+ in_gsnedf_finish_switch[smp_processor_id()] = 0;
+}
+
+
+/* Prepare a task for running in RT mode
+ * Enqueues the task into master queue data structure
+ * returns
+ * -EPERM if task is not TASK_STOPPED
+ */
+static long gsnedf_prepare_task(struct task_struct * t)
+{
+ unsigned long flags;
+ TRACE("gsn edf: prepare task %d\n", t->pid);
+
+ if (t->state == TASK_STOPPED) {
+ __setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1);
+
+ t->rt_param.scheduled_on = NO_CPU;
+ t->rt_param.linked_on = NO_CPU;
+ t->rt_param.is_non_preemptable = 0;
+ if (get_rt_mode() == MODE_RT_RUN)
+			/* Real-time mode is already active.
+			 * Prepare an immediate release.
+			 */
+ prepare_new_release(t);
+ /* The task should be running in the queue, otherwise signal
+ * code will try to wake it up with fatal consequences.
+ */
+ t->state = TASK_RUNNING;
+
+ queue_lock_irqsave(&gsnedf_lock, flags);
+ requeue(t);
+ queue_unlock_irqrestore(&gsnedf_lock, flags);
+ return 0;
+ }
+ else
+ return -EPERM;
+}
+
+static void gsnedf_wake_up_task(struct task_struct *task)
+{
+ unsigned long flags;
+ /* We must determine whether task should go into the release
+ * queue or into the ready queue. It may enter the ready queue
+ * if it has credit left in its time slice and has not yet reached
+ * its deadline. If it is now passed its deadline we assume this the
+	 * its deadline. If it is now past its deadline, we assume this is the
+	 * arrival of a new sporadic job and thus put it in the ready queue
+	 * anyway. If it has zero budget and the next release is in the future,
+ */
+ TRACE("gsnedf: %d unsuspends with budget=%d\n",
+ task->pid, task->time_slice);
+ task->state = TASK_RUNNING;
+
+ /* We need to take suspensions because of semaphores into
+ * account! If a job resumes after being suspended due to acquiring
+ * a semaphore, it should never be treated as a new job release.
+ */
+ if (get_rt_flags(task) == RT_F_EXIT_SEM) {
+ set_rt_flags(task, RT_F_RUNNING);
+ } else {
+ if (is_tardy(task)) {
+ /* new sporadic release */
+ prepare_new_release(task);
+ sched_trace_job_release(task);
+ }
+ else if (task->time_slice)
+ /* came back in time before deadline
+ */
+ set_rt_flags(task, RT_F_RUNNING);
+ }
+
+ queue_lock_irqsave(&gsnedf_lock, flags);
+ gsnedf_job_arrival(task);
+ queue_unlock_irqrestore(&gsnedf_lock, flags);
+}
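+
+/* Summary of the wake-up decision above (sketch):
+ *
+ *	RT_F_EXIT_SEM	-> resume the current job, just mark it RT_F_RUNNING
+ *	is_tardy()	-> treat the wake-up as a new sporadic release
+ *	time_slice > 0	-> job resumes before its deadline, mark RT_F_RUNNING
+ *	otherwise	-> flags left unchanged; requeue() (via
+ *			   gsnedf_job_arrival()) decides between the ready
+ *			   and the release queue
+ */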
+
+static void gsnedf_task_blocks(struct task_struct *t)
+{
+ unsigned long flags;
+
+ /* unlink if necessary */
+ queue_lock_irqsave(&gsnedf_lock, flags);
+ unlink(t);
+ queue_unlock_irqrestore(&gsnedf_lock, flags);
+
+ BUG_ON(!is_realtime(t));
+ TRACE("task %d suspends with budget=%d\n", t->pid, t->time_slice);
+ BUG_ON(t->rt_list.next != LIST_POISON1);
+ BUG_ON(t->rt_list.prev != LIST_POISON2);
+}
+
+
+/* When _tear_down is called, the task should not be in any queue any more
+ * as it must have blocked first. We don't have any internal state for the task,
+ * it is all in the task_struct.
+ */
+static long gsnedf_tear_down(struct task_struct * t)
+{
+ BUG_ON(!is_realtime(t));
+ TRACE_TASK(t, "tear down called");
+ BUG_ON(t->array);
+ BUG_ON(t->rt_list.next != LIST_POISON1);
+ BUG_ON(t->rt_list.prev != LIST_POISON2);
+ return 0;
+}
+
+
+static long gsnedf_enter_np(struct task_struct * t)
+{
+ unsigned long flags;
+
+ queue_lock_irqsave(&gsnedf_lock, flags);
+ t->rt_param.is_non_preemptable++;
+ queue_unlock_irqrestore(&gsnedf_lock, flags);
+ return 0;
+}
+
+static long gsnedf_exit_np(struct task_struct * t)
+{
+ unsigned long flags;
+ int ret = 0;
+ cpu_entry_t *entry;
+
+ queue_lock_irqsave(&gsnedf_lock, flags);
+ if (is_np(t)) {
+ t->rt_param.is_non_preemptable--;
+ entry = &__get_cpu_var(gsnedf_cpu_entries);
+ if (!is_np(t) && (!t->time_slice || entry->linked != t)) {
+ BUG_ON(t != entry->scheduled);
+ /* t is now preemptable and not linked */
+ set_will_schedule();
+ if (!t->time_slice) {
+ set_rt_flags(t, RT_F_SLEEP);
+ sched_trace_job_completion(t);
+ /* prepare for next period */
+ prepare_for_next_period(t);
+ }
+ /* unlink */
+ unlink(t);
+ /* requeue */
+ gsnedf_job_arrival(t);
+ /* reschedule if necessary */
+ if (entry->linked != entry->scheduled) {
+ TRACE("gsnedf_exit_np: delayed "
+ "preemption of %d\n",
+ t->pid);
+ set_tsk_need_resched(current);
+ } else
+				TRACE("gsnedf_exit_np: no preemption necessary, "
+ " %s/%d got relinked\n",
+ entry->scheduled->comm,
+ entry->scheduled->pid);
+ }
+ } else
+ ret = -EPERM;
+ queue_unlock_irqrestore(&gsnedf_lock, flags);
+ return ret;
+}
+
+static long gsnedf_pi_block(struct pi_semaphore *sem,
+ struct task_struct *new_waiter)
+{
+ /* This callback has to handle the situation where a new waiter is
+ * added to the wait queue of the semaphore.
+ *
+	 * We must check whether it has a higher priority than the currently
+	 * highest-priority task, and then potentially reschedule.
+ */
+
+ BUG_ON(!new_waiter);
+
+ if (edf_higher_prio(new_waiter, sem->hp.task)) {
+ TRACE_TASK(new_waiter, " boosts priority\n");
+ /* called with IRQs disabled */
+ queue_lock(&gsnedf_lock);
+ /* store new highest-priority task */
+ sem->hp.task = new_waiter;
+ if (sem->holder) {
+ /* let holder inherit */
+ sem->holder->rt_param.inh_task = new_waiter;
+ unlink(sem->holder);
+ gsnedf_job_arrival(sem->holder);
+ }
+ queue_unlock(&gsnedf_lock);
+ }
+
+ return 0;
+}
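+
+/* Worked example (hypothetical jobs; assumes edf_higher_prio() takes
+ * rt_param.inh_task into account, which is not shown in this file): a
+ * low-priority job L holds the semaphore when a high-priority job H blocks
+ * on it. pi_block() records H as sem->hp.task, sets
+ * L->rt_param.inh_task = H, and requeues L via unlink() +
+ * gsnedf_job_arrival(), so L now competes with H's priority and cannot be
+ * preempted by jobs of intermediate priority while holding the lock.
+ */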
+
+static long gsnedf_inherit_priority(struct pi_semaphore *sem,
+ struct task_struct *new_owner)
+{
+ /* We don't need to acquire the gsnedf_lock since at the time of this
+ * call new_owner isn't actually scheduled yet (it's still sleeping)
+ * and since the calling function already holds sem->wait.lock, which
+ * prevents concurrent sem->hp.task changes.
+ */
+
+ if (sem->hp.task && sem->hp.task != new_owner) {
+ new_owner->rt_param.inh_task = sem->hp.task;
+ TRACE_TASK(new_owner, "inherited priority from %s/%d\n",
+ sem->hp.task->comm, sem->hp.task->pid);
+ } else
+ TRACE_TASK(new_owner,
+ "cannot inherit priority, "
+ "no higher priority job waits.\n");
+ return 0;
+}
+
+/* This function is called on a semaphore release, and assumes that
+ * the current task is also the semaphore holder.
+ */
+static long gsnedf_return_priority(struct pi_semaphore *sem)
+{
+ struct task_struct* t = current;
+ int ret = 0;
+
+ /* Find new highest-priority semaphore task
+ * if holder task is the current hp.task.
+ *
+ * Calling function holds sem->wait.lock.
+ */
+ if (t == sem->hp.task)
+ edf_set_hp_task(sem);
+
+ TRACE_CUR("gsnedf_return_priority for lock %p\n", sem);
+
+ if (t->rt_param.inh_task) {
+ /* interrupts already disabled by PI code */
+ queue_lock(&gsnedf_lock);
+
+ /* Reset inh_task to NULL. */
+ t->rt_param.inh_task = NULL;
+
+ /* Check if rescheduling is necessary */
+ unlink(t);
+ gsnedf_job_arrival(t);
+ queue_unlock(&gsnedf_lock);
+ }
+
+ return ret;
+}
+
+/*
+ * Deactivate current task until the beginning of the next period.
+ */
+static long gsnedf_sleep_next_period(void)
+{
+ unsigned long flags;
+ struct task_struct* t = current;
+
+ queue_lock_irqsave(&gsnedf_lock, flags);
+
+	/* Mark that we do not execute anymore */
+ set_rt_flags(t, RT_F_SLEEP);
+ sched_trace_job_completion(t);
+ /* prepare for next period */
+ prepare_for_next_period(t);
+
+ /* unlink */
+ unlink(t);
+ /* requeue */
+ gsnedf_job_arrival(t);
+
+ /* will reschedule on return to user mode */
+ set_tsk_need_resched(t);
+
+ queue_unlock_irqrestore(&gsnedf_lock, flags);
+
+ return 0;
+}
+
+
+static int gsnedf_mode_change(int new_mode)
+{
+ unsigned long flags;
+ int cpu;
+ cpu_entry_t *entry;
+
+ if (new_mode == MODE_RT_RUN) {
+ queue_lock_irqsave(&gsnedf_lock, flags);
+
+ __prepare_new_releases(&gsnedf, jiffies + 10);
+
+ /* get old cruft out of the way in case we reenter real-time
+ * mode for a second time
+ */
+ while (!list_empty(&gsnedf_cpu_queue))
+ list_del(gsnedf_cpu_queue.next);
+ /* reinitialize */
+ for_each_online_cpu(cpu) {
+ entry = &per_cpu(gsnedf_cpu_entries, cpu);
+ atomic_set(&entry->will_schedule, 0);
+ entry->linked = NULL;
+ entry->scheduled = NULL;
+ list_add(&entry->list, &gsnedf_cpu_queue);
+ }
+
+ queue_unlock_irqrestore(&gsnedf_lock, flags);
+
+ }
+ return 0;
+}
+
+
+/* Plugin object */
+static sched_plugin_t s_plugin __cacheline_aligned_in_smp = {
+ .ready_to_use = 0
+};
+
+
+/*
+ * Plugin initialization code.
+ */
+#define INIT_SCHED_PLUGIN (struct sched_plugin){ \
+ .plugin_name = "GSN-EDF", \
+ .ready_to_use = 1, \
+ .algo_scheduler_tick = gsnedf_scheduler_tick, \
+ .scheduler_tick = rt_scheduler_tick, \
+ .prepare_task = gsnedf_prepare_task, \
+ .sleep_next_period = gsnedf_sleep_next_period, \
+ .tear_down = gsnedf_tear_down, \
+ .schedule = gsnedf_schedule, \
+ .finish_switch = gsnedf_finish_switch, \
+ .mode_change = gsnedf_mode_change, \
+ .wake_up_task = gsnedf_wake_up_task, \
+ .task_blocks = gsnedf_task_blocks, \
+ .enter_np = gsnedf_enter_np, \
+ .exit_np = gsnedf_exit_np, \
+ .inherit_priority = gsnedf_inherit_priority, \
+ .return_priority = gsnedf_return_priority, \
+ .pi_block = gsnedf_pi_block \
+}
+
+
+sched_plugin_t *__init init_gsn_edf_plugin(void)
+{
+ int cpu;
+ cpu_entry_t *entry;
+
+ if (!s_plugin.ready_to_use)
+ {
+ /* initialize CPU state */
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+ entry = &per_cpu(gsnedf_cpu_entries, cpu);
+ atomic_set(&entry->will_schedule, 0);
+ entry->linked = NULL;
+ entry->scheduled = NULL;
+ entry->cpu = cpu;
+ }
+
+ queue_lock_init(&gsnedf_lock);
+ set_sched_options(SCHED_NONE);
+ edf_domain_init(&gsnedf, NULL);
+ s_plugin = INIT_SCHED_PLUGIN;
+ }
+ return &s_plugin;
+}
+
+
diff --git a/kernel/sched_part_edf.c b/kernel/sched_part_edf.c
new file mode 100644
index 0000000..c382722
--- /dev/null
+++ b/kernel/sched_part_edf.c
@@ -0,0 +1,345 @@
+/*
+ * kernel/sched_part_edf.c
+ *
+ * Implementation of the partitioned EDF scheduler plugin.
+ */
+
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+#include <linux/litmus.h>
+#include <linux/sched_plugin.h>
+#include <linux/edf_common.h>
+
+
+typedef struct {
+ edf_domain_t domain;
+ int cpu;
+ struct task_struct* scheduled; /* only RT tasks */
+ spinlock_t lock;
+} part_edf_domain_t;
+
+
+#define local_edf (&__get_cpu_var(part_edf_domains).domain)
+#define local_pedf (&__get_cpu_var(part_edf_domains))
+#define remote_edf(cpu) (&per_cpu(part_edf_domains, cpu).domain)
+#define remote_pedf(cpu) (&per_cpu(part_edf_domains, cpu))
+#define task_edf(task) remote_edf(get_partition(task))
+
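+/* Usage sketch for the accessor macros above (t is a hypothetical task):
+ *
+ *	edf_domain_t *edf = task_edf(t);	  domain of t's partition
+ *	part_edf_domain_t *pedf = remote_pedf(get_partition(t));
+ *
+ * local_edf/local_pedf refer to the partition of the CPU executing the
+ * code; remote_edf(cpu)/remote_pedf(cpu) address an arbitrary partition.
+ */
+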
+static void part_edf_domain_init(part_edf_domain_t* pedf,
+ edf_check_resched_needed_t check,
+ int cpu)
+{
+ edf_domain_init(&pedf->domain, check);
+ pedf->cpu = cpu;
+ pedf->lock = SPIN_LOCK_UNLOCKED;
+ pedf->scheduled = NULL;
+}
+
+DEFINE_PER_CPU(part_edf_domain_t, part_edf_domains);
+
+/* This check is trivial in partitioned systems as we only have to consider
+ * the CPU of the partition.
+ *
+ */
+static int part_edf_check_resched(edf_domain_t *edf)
+{
+ part_edf_domain_t *pedf = container_of(edf, part_edf_domain_t, domain);
+ int ret = 0;
+
+ spin_lock(&pedf->lock);
+
+ /* because this is a callback from edf_domain_t we already hold
+ * the necessary lock for the ready queue
+ */
+ if (preemption_needed(edf, pedf->scheduled)) {
+ if (pedf->cpu == smp_processor_id())
+ set_tsk_need_resched(current);
+ else
+ smp_send_reschedule(pedf->cpu);
+ ret = 1;
+ }
+ spin_unlock(&pedf->lock);
+ return ret;
+}
+
+
+static reschedule_check_t part_edf_scheduler_tick(void)
+{
+ unsigned long flags;
+ struct task_struct *t = current;
+ reschedule_check_t want_resched = NO_RESCHED;
+ edf_domain_t *edf = local_edf;
+ part_edf_domain_t *pedf = local_pedf;
+
+ /* Check for inconsistency. We don't need the lock for this since
+ * ->scheduled is only changed in schedule, which obviously is not
+ * executing in parallel on this CPU
+ */
+ BUG_ON(is_realtime(t) && t != pedf->scheduled);
+
+ /* expire tasks even if not in real-time mode
+ * this makes sure that at the end of real-time mode
+ * no tasks "run away forever".
+ */
+ if (is_realtime(t) && (!--t->time_slice)) {
+ /* this task has exhausted its budget in this period */
+ set_rt_flags(t, RT_F_SLEEP);
+ want_resched = FORCE_RESCHED;
+ }
+ if (get_rt_mode() == MODE_RT_RUN)
+ {
+ /* check whether anything is waiting to be released
+ * this could probably be moved to the global timer
+ * interrupt handler since the state will only change
+ * once per jiffie
+ */
+ try_release_pending(edf);
+ if (want_resched != FORCE_RESCHED)
+ {
+ read_lock_irqsave(&edf->ready_lock, flags);
+ if (preemption_needed(edf, t))
+ want_resched = FORCE_RESCHED;
+ read_unlock_irqrestore(&edf->ready_lock, flags);
+ }
+ }
+ return want_resched;
+}
+
+static int part_edf_schedule(struct task_struct * prev,
+ struct task_struct ** next,
+ runqueue_t * rq)
+{
+ int need_deactivate = 1;
+ part_edf_domain_t* pedf = local_pedf;
+ edf_domain_t* edf = &pedf->domain;
+
+
+ if (is_realtime(prev) && get_rt_flags(prev) == RT_F_SLEEP)
+ prepare_for_next_period(prev);
+
+ if (get_rt_mode() == MODE_RT_RUN) {
+ write_lock(&edf->ready_lock);
+ if (is_realtime(prev) && is_released(prev) && is_running(prev)
+ && !preemption_needed(edf, prev)) {
+ /* this really should only happen if the task has
+ * 100% utilization...
+ */
+ TRACE("prev will be next, already released\n");
+ *next = prev;
+ need_deactivate = 0;
+ } else {
+ /* either not yet released, preempted, or non-rt */
+ *next = __take_ready(edf);
+ if (*next) {
+ /* stick the task into the runqueue */
+ __activate_task(*next, rq);
+ set_task_cpu(*next, smp_processor_id());
+ }
+ }
+ spin_lock(&pedf->lock);
+ pedf->scheduled = *next;
+ spin_unlock(&pedf->lock);
+ if (*next)
+ set_rt_flags(*next, RT_F_RUNNING);
+
+ write_unlock(&edf->ready_lock);
+ }
+
+ if (is_realtime(prev) && need_deactivate && prev->array) {
+ /* take it out of the run queue */
+ deactivate_task(prev, rq);
+ }
+
+ return 0;
+}
+
+
+static void part_edf_finish_switch(struct task_struct *prev)
+{
+ edf_domain_t* edf = local_edf;
+
+ if (!is_realtime(prev) || !is_running(prev))
+ return;
+
+ if (get_rt_flags(prev) == RT_F_SLEEP ||
+ get_rt_mode() != MODE_RT_RUN) {
+ /* this task has expired
+ * _schedule has already taken care of updating
+ * the release and
+		 * deadline. We only have to check whether it has been released.
+ */
+ if (is_released(prev) && get_rt_mode() == MODE_RT_RUN) {
+ /* already released */
+ add_ready(edf, prev);
+ TRACE("%d goes straight to ready queue\n", prev->pid);
+ } else
+ /* it has got to wait */
+ add_release(edf, prev);
+ } else {
+ /* this is a forced preemption
+ * thus the task stays in the ready_queue
+ * we only must make it available to others
+ */
+ add_ready(edf, prev);
+ }
+}
+
+
+/* Prepare a task for running in RT mode
+ * Enqueues the task into master queue data structure
+ * returns
+ * -EPERM if task is not TASK_STOPPED
+ */
+static long part_edf_prepare_task(struct task_struct * t)
+{
+ edf_domain_t* edf = task_edf(t);
+
+
+ TRACE("[%d] part edf: prepare task %d on CPU %d\n",
+ smp_processor_id(), t->pid, get_partition(t));
+ if (t->state == TASK_STOPPED) {
+ __setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1);
+
+ if (get_rt_mode() == MODE_RT_RUN)
+			/* Real-time mode is already active.
+			 * Prepare an immediate release.
+			 */
+ prepare_new_release(t);
+ /* The task should be running in the queue, otherwise signal
+ * code will try to wake it up with fatal consequences.
+ */
+ t->state = TASK_RUNNING;
+ add_release(edf, t);
+ return 0;
+ } else
+ return -EPERM;
+}
+
+static void part_edf_wake_up_task(struct task_struct *task)
+{
+ edf_domain_t* edf;
+
+ edf = task_edf(task);
+
+ /* We must determine whether task should go into the release
+ * queue or into the ready queue. It may enter the ready queue
+ * if it has credit left in its time slice and has not yet reached
+	 * its deadline. If it is now past its deadline, we assume this is the
+	 * arrival of a new sporadic job and thus put it in the ready queue
+	 * anyway. If it has zero budget and the next release is in the future,
+	 * it has to go to the release queue.
+ */
+ TRACE("part edf: wake up %d with budget=%d for cpu %d\n",
+ task->pid, task->time_slice, get_partition(task));
+ task->state = TASK_RUNNING;
+ if (is_tardy(task)) {
+ /* new sporadic release */
+ prepare_new_release(task);
+ add_ready(edf, task);
+
+ } else if (task->time_slice) {
+ /* came back in time before deadline
+		 * TODO: clip budget to fit into the period; otherwise it could
+		 * cause a deadline overrun in the next period, i.e.,
+		 * over-allocation in the next period.
+ */
+ set_rt_flags(task, RT_F_RUNNING);
+ add_ready(edf, task);
+
+ } else {
+ add_release(edf, task);
+ }
+
+}
+
+static void part_edf_task_blocks(struct task_struct *t)
+{
+ BUG_ON(!is_realtime(t));
+ /* not really anything to do since it can only block if
+ * it is running, and when it is not running it is not in any
+ * queue anyway.
+ *
+ * TODO: Check whether the assumption is correct for SIGKILL and
+ * SIGSTOP.
+ */
+ TRACE("task %d blocks with budget=%d\n", t->pid, t->time_slice);
+ BUG_ON(in_list(&t->rt_list));
+}
+
+
+/* When _tear_down is called, the task should not be in any queue any more
+ * as it must have blocked first. We don't have any internal state for the task,
+ * it is all in the task_struct.
+ */
+static long part_edf_tear_down(struct task_struct * t)
+{
+ BUG_ON(!is_realtime(t));
+ TRACE("part edf: tear down called for %d \n", t->pid);
+ BUG_ON(t->array);
+ BUG_ON(in_list(&t->rt_list));
+ return 0;
+}
+
+
+static int part_edf_mode_change(int new_mode)
+{
+ int cpu;
+
+ if (new_mode == MODE_RT_RUN)
+ for_each_online_cpu(cpu)
+ prepare_new_releases(remote_edf(cpu), jiffies);
+ TRACE("[%d] part edf: mode changed to %d\n",
+ smp_processor_id(), new_mode);
+ return 0;
+}
+
+
+/* Plugin object */
+static sched_plugin_t s_plugin __cacheline_aligned_in_smp = {
+ .ready_to_use = 0
+};
+
+
+/*
+ * Plugin initialization code.
+ */
+#define INIT_SCHED_PLUGIN (struct sched_plugin) {\
+ .plugin_name = "Partitioned EDF",\
+ .ready_to_use = 1,\
+ .algo_scheduler_tick = part_edf_scheduler_tick,\
+ .scheduler_tick = rt_scheduler_tick,\
+ .prepare_task = part_edf_prepare_task,\
+ .sleep_next_period = edf_sleep_next_period,\
+ .tear_down = part_edf_tear_down,\
+ .shutdown_hook = NULL,\
+ .schedule = part_edf_schedule,\
+ .finish_switch = part_edf_finish_switch,\
+ .mode_change = part_edf_mode_change,\
+ .wake_up_task = part_edf_wake_up_task,\
+ .task_blocks = part_edf_task_blocks \
+}
+
+
+sched_plugin_t *__init init_part_edf_plugin(void)
+{
+ int i;
+
+ if (!s_plugin.ready_to_use)
+ {
+ set_sched_options(SCHED_NONE);
+ for (i = 0; i < NR_CPUS; i++)
+ {
+ part_edf_domain_init(remote_pedf(i),
+ part_edf_check_resched, i);
+			printk("CPU partition %d initialized.\n", i);
+ }
+ s_plugin = INIT_SCHED_PLUGIN;
+ }
+ return &s_plugin;
+}
+
+
+
diff --git a/kernel/sched_pfair.c b/kernel/sched_pfair.c
new file mode 100644
index 0000000..4fa6ba2
--- /dev/null
+++ b/kernel/sched_pfair.c
@@ -0,0 +1,507 @@
+/*
+ *
+ * Implementation of the synchronized PFAIR PD^2 scheduler
+ *
+ */
+
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+
+#include <linux/litmus.h>
+#include <linux/sched_plugin.h>
+#include <linux/pfair_common.h>
+#include <linux/sched_trace.h>
+#include <linux/queuelock.h>
+
+struct cpu_state {
+ struct task_struct * t;
+ volatile jiffie_t jiffie_marker;
+};
+/* PFAIR scheduling domain, release and ready queues */
+static pfair_domain_t pfair __cacheline_aligned_in_smp;
+
+/* An indicator that quantum boundary was crossed
+ * and a decision has to be made
+ */
+static int sync_go[NR_CPUS];
+
+
+/* A collection of CPU states protected by pfair lock */
+DEFINE_PER_CPU(struct cpu_state, states);
+
+/*
+ * This function gets called by the timer code, with HZ frequency
+ * with interrupts disabled.
+ *
+ * The function merges the release queue with the ready queue
+ * and indicates that quantum boundary was crossed.
+ *
+ * It also suggests to schedule off currently running
+ * real-time task if the mode is non-real-time.
+ */
+static reschedule_check_t pfair_scheduler_tick(void)
+{
+ int want_resched = NO_RESCHED;
+ sync_go[smp_processor_id()] = 0;
+ if (!cpu_isset(smp_processor_id(), pfair.domain_cpus))
+ goto out;
+ /* Now determine if we want current task to be preempted */
+ if (get_rt_mode() == MODE_RT_RUN) {
+ pfair_try_release_pending(&pfair);
+ want_resched = FORCE_RESCHED;
+ /* indicate that the interrupt fired */
+ sync_go[smp_processor_id()] = 1;
+ barrier();
+ } else if (is_realtime(current) && is_running(current)) {
+ /* In non real-time mode we want to
+ * schedule off real-time tasks */
+ want_resched = FORCE_RESCHED;
+ } else if (is_realtime(current) && !is_running(current)) {
+		TRACE("[%d] %d Timer interrupt on not running %d\n",
+ smp_processor_id(),
+ jiffies-rt_start_time, current->pid);
+ }
+out:
+ return want_resched;
+}
+
+/**
+ * This function is called by the processor
+ * that performs rescheduling. It saves the timing
+ * parameters of currently running jobs that were not rescheduled yet
+ * and releases the next subtask of these jobs, placing them into the
+ * release and ready queues.
+ */
+static void pretend_release(cpumask_t p)
+{
+ int i = 0;
+ struct task_struct * t = NULL;
+ /* for all the tasks increment the number of used quanta
+ * and release next subtask or job depending on the number
+ * of used quanta
+ */
+ for_each_cpu_mask(i, p) {
+ t = per_cpu(states, i).t;
+ if (t != NULL) {
+ backup_times(t);
+ inc_passed_quanta(t);
+ if ( get_passed_quanta(t) == get_exec_cost(t)) {
+ pfair_prepare_next_job(t);
+ } else {
+ pfair_prepare_next_subtask(t);
+ }
+ /*
+ TRACE("[%d] %d pretending release %d with (%d, %d)\n",
+ smp_processor_id(),
+ jiffies-rt_start_time,t->pid,
+ get_release(t)-rt_start_time,
+ get_deadline(t)-rt_start_time);*/
+ /* detect if the job or subtask has to be released now*/
+ if (time_before_eq(get_release(t), jiffies))
+ pfair_add_ready(&pfair, t);
+ else
+ pfair_add_release(&pfair, t);
+ }
+ }
+}
+/*
+ * Roll back the pretended release of tasks.
+ * Timing parameters are restored and tasks are removed
+ * from the queues, as they were before calling the schedule() function.
+ *
+ */
+static void rollback_release(cpumask_t p)
+{
+ int i = -1;
+ struct task_struct * t = NULL;
+ /*
+ * Rollback the pretended changes
+ */
+ for_each_cpu_mask(i, p) {
+ t = per_cpu(states, i).t;
+ if (t != NULL) {
+ restore_times(t);
+ if(t->rt_list.prev != LIST_POISON1 ||
+ t->rt_list.next != LIST_POISON2) {
+ /* Delete the task from a queue */
+ list_del(&t->rt_list);
+ }
+ }
+ }
+}
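+
+/* The pretend/rollback pair implements a speculative release (sketch of how
+ * pfair_schedule() below uses it, no additional state assumed):
+ *
+ *	find_participants(&p);	  CPUs that have not rescheduled this quantum
+ *	pretend_release(p);	  advance their jobs as if the quantum ended
+ *	... pick eligible tasks from the ready queue ...
+ *	rollback_release(p);	  restore times and undo the pretend queuing
+ *
+ * Each CPU can thus evaluate the next quantum's selection without
+ * committing state changes for tasks that other CPUs still have to switch
+ * away from.
+ */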
+
+/*
+ * The procedure creates a list of CPUs whose tasks have not been
+ * rescheduled yet. These are the CPUs whose jiffie marker differs from
+ * the current value of jiffies.
+ */
+static void find_participants(cpumask_t * target)
+{
+	cpumask_t res;
+	int i;
+ cpus_clear(res);
+ for_each_online_cpu(i) {
+ if(per_cpu(states, i).jiffie_marker != jiffies)
+ cpu_set(i, res);
+ }
+ /* Examine only cpus in the domain */
+ cpus_and(res, pfair.domain_cpus, res);
+ (*target) = res;
+}
+
+/*
+ * This is the main PFAIR schedule function:
+ * each processor pretends that some currently running tasks are
+ * released in the next quantum and determines whether it should
+ * keep the task that is currently running (this is usually the case
+ * for heavy tasks).
+*/
+static int pfair_schedule(struct task_struct *prev,
+ struct task_struct **next,
+ runqueue_t * rq)
+{
+	int cpu = -1;
+	int k = -1;
+	int need_deactivate = 1;
+	int keep = 0;
+ unsigned long flags;
+ cpumask_t participants;
+ /* A temporary array */
+ struct task_struct * rs_old_ptr[NR_CPUS];
+
+ *next = NULL;
+ cpu = smp_processor_id();
+ /* CPU's not in the domain just bypass */
+ if (!cpu_isset(cpu, pfair.domain_cpus)) {
+ goto out;
+ }
+ queue_lock_irqsave(&pfair.pfair_lock, flags);
+
+ /* If we happen to run in non-realtime mode
+ * then we have to schedule off currently running tasks
+ * */
+ if (get_rt_mode() != MODE_RT_RUN) {
+ if (is_realtime(prev)) {
+ per_cpu(states, cpu).t = NULL;
+ TRACE("[%d] %d Suspending %d\n",
+ cpu, jiffies - rt_start_time,
+ prev->pid);
+ /* Move the task to the
+ * release queue for future runs
+ * FIXME: Do something smarter.
+ * For example create a set where
+ * prepared or inactive tasks are placed
+ * and then released.
+ * */
+ set_release(prev, get_release(prev) + 1000);
+ pfair_add_release(&pfair, prev);
+ }
+ goto out_deactivate;
+ }
+ /* If the current task stops or dies */
+ if (is_realtime(prev) && !is_running(prev)) {
+ /* remove it from the running set */
+ per_cpu(states, cpu).t = NULL;
+ }
+ /* Make pfair decisions at quantum boundaries only,
+ * but schedule off stopped or dead tasks */
+
+ if ((sync_go[cpu]--) != 1)
+ goto out_deactivate;
+
+ /*TRACE("[%d] %d Scheduler activation", cpu, jiffies-rt_start_time);
+ cpus_and(res, pfair.domain_cpus, cpu_online_map);
+ for_each_cpu_mask(k, res) {
+ TRACE("%d" ,(per_cpu(states, k).jiffie_marker!=jiffies));
+ }
+ TRACE("\n");*/
+
+ /* Find processors that have not rescheduled yet */
+ find_participants(&participants);
+ /* For each task on remote cpu's pretend release */
+ pretend_release(participants);
+ /* Clear temporary array */
+ for_each_possible_cpu(k) { rs_old_ptr[k] = NULL; }
+ /* Select a new subset of eligible tasks */
+ for_each_cpu_mask(k, participants) {
+ rs_old_ptr[k] = __pfair_take_ready (&pfair);
+ /* Check if our current task must be scheduled in the next quantum */
+ if (rs_old_ptr[k] == per_cpu(states, cpu).t) {
+ /* this is our current task, keep it */
+ *next = per_cpu(states, cpu).t;
+ need_deactivate = 0;
+ keep = 1;
+ break;
+ }
+ }
+ /* Put all the extracted tasks back into the ready queue */
+ for_each_cpu_mask(k, participants) {
+ if (rs_old_ptr[k] != NULL){
+ pfair_add_ready(&pfair, rs_old_ptr[k]);
+ rs_old_ptr[k] = NULL;
+ }
+ }
+ /* Rollback the pretended release,
+ * task parameters are restored and running tasks are removed
+ * from queues */
+ rollback_release(participants);
+ /*
+ * If the current task is not scheduled in the next quantum
+ * then select a new pfair task
+ */
+ if(!keep) {
+ *next = per_cpu(states, cpu).t = __pfair_take_ready(&pfair);
+ if (*next != NULL) {
+ /*TRACE("[%d] %d Scheduling %d with (%d, %d)\n",
+ cpu, jiffies-rt_start_time,
+ get_release(*next),
+ get_deadline(*next));
+ */
+ set_task_cpu(*next, cpu);
+ __activate_task(*next, rq);
+ }
+ } else {
+ if (is_realtime(prev)) {
+ /*TRACE("[%d] %d prev==next %d\n",
+ cpu,jiffies-rt_start_time,
+ (prev)->pid);*/
+
+ /* The task will not be switched off but we
+ * need to track the execution time
+ */
+ inc_passed_quanta(prev);
+ }
+ }
+
+	/* Show that our task does not participate in subsequent selections */
+ __get_cpu_var(states).jiffie_marker = jiffies;
+
+out_deactivate:
+ if ( is_realtime(prev) && need_deactivate && prev->array) {
+ /* take prev out of the linux run queue */
+ deactivate_task(prev, rq);
+ }
+ queue_unlock_irqrestore(&pfair.pfair_lock, flags);
+out:
+ return 0;
+}
+
+static void pfair_finish_task_switch(struct task_struct *t)
+{
+ if (!is_realtime(t) || !is_running(t))
+ return;
+
+ queue_lock(&pfair.pfair_lock);
+ /* Release in real-time mode only,
+ * if the mode is non real-time, then
+ * the task is already in the release queue
+ * with the time far in the future
+ */
+ if (get_rt_mode() == MODE_RT_RUN) {
+ inc_passed_quanta(t);
+ if ( get_passed_quanta(t) == get_exec_cost(t)) {
+ sched_trace_job_completion(t);
+ pfair_prepare_next_job(t);
+ } else {
+ pfair_prepare_next_subtask(t);
+ }
+ /*TRACE("[%d] %d releasing %d with (%d, %d)\n",
+ smp_processor_id(),
+ jiffies-rt_start_time,
+ t->pid,
+ get_release(t)-rt_start_time,
+ get_deadline(t)-rt_start_time);*/
+ if (time_before_eq(get_release(t), jiffies))
+ pfair_add_ready(&pfair, t);
+ else
+ pfair_add_release(&pfair, t);
+ }
+ queue_unlock(&pfair.pfair_lock);
+}
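+
+/* Subtask bookkeeping example (hypothetical job with execution cost e = 3):
+ * each completed quantum advances the passed-quanta count 1 -> 2 -> 3. For
+ * counts below e, pfair_prepare_next_subtask() releases the next subtask of
+ * the same job; once the count reaches e, the job is complete and
+ * pfair_prepare_next_job() sets up the release of the next job.
+ */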
+
+/* Prepare a task for running in RT mode
+ * Enqueues the task into master queue data structure
+ * returns
+ * -EPERM if task is not TASK_STOPPED
+ */
+static long pfair_prepare_task(struct task_struct * t)
+{
+ unsigned long flags;
+ TRACE("pfair: prepare task %d\n", t->pid);
+ if (t->state == TASK_STOPPED) {
+ __setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1);
+
+ if (get_rt_mode() == MODE_RT_RUN)
+			/* Real-time mode is already active.
+			 * Prepare an immediate release.
+			 */
+ __pfair_prepare_new_release(t, jiffies);
+ /* The task should be running in the queue, otherwise signal
+ * code will try to wake it up with fatal consequences.
+ */
+ t->state = TASK_RUNNING;
+ queue_lock_irqsave(&pfair.pfair_lock, flags);
+ pfair_add_release(&pfair, t);
+ queue_unlock_irqrestore(&pfair.pfair_lock, flags);
+ return 0;
+ } else
+ return -EPERM;
+}
+
+
+
+static void pfair_wake_up_task(struct task_struct *task)
+{
+
+ unsigned long flags;
+
+ /* We must determine whether task should go into the release
+ * queue or into the ready queue.
+ * The task enters the ready queue if the previous deadline was missed,
+ * so we treat the invoked job as a new sporadic release.
+ *
+	 * The job can also enter the ready queue if it was invoked before its
+	 * global deadline, but its budget must be clipped down to one quantum.
+ */
+ task->state = TASK_RUNNING;
+ if (time_after_eq(jiffies, task->rt_param.times.last_release
+ + get_rt_period(task))) {
+ /* new sporadic release */
+ TRACE("[%d] Sporadic release of %d at %d\n",
+ smp_processor_id(),
+ jiffies-rt_start_time,
+ task->pid);
+ __pfair_prepare_new_release(task, jiffies);
+ queue_lock_irqsave(&pfair.pfair_lock, flags);
+ sched_trace_job_release(task);
+ pfair_add_ready(&pfair, task);
+ queue_unlock_irqrestore(&pfair.pfair_lock, flags);
+ } else if (task->time_slice) {
+ /* came back in time before deadline
+ * clip the budget to be the last subtask of a job or
+ * the new job.
+ */
+ task->rt_param.times.exec_time = get_exec_cost(task) - 1;
+ if (task->rt_param.times.exec_time == 0) {
+ pfair_prepare_next_job(task);
+ } else {
+ pfair_prepare_next_subtask(task);
+ }
+ TRACE("[%d] %d Resume of %d with %d, %d, %d\n",
+ smp_processor_id(), jiffies-rt_start_time,
+ task->pid, get_release(task)-rt_start_time,
+ get_deadline(task)-rt_start_time,
+ get_passed_quanta(task));
+
+ set_rt_flags(task, RT_F_RUNNING);
+ queue_lock_irqsave(&pfair.pfair_lock, flags);
+ sched_trace_job_release(task);
+ if (time_after_eq(jiffies, get_release(task))) {
+ pfair_add_ready(&pfair, task);
+ } else {
+ pfair_add_release(&pfair, task);
+ }
+ queue_unlock_irqrestore(&pfair.pfair_lock, flags);
+
+ } else {
+ TRACE("[%d] %d Strange release of %d with %d, %d, %d\n",
+ smp_processor_id(), jiffies-rt_start_time,
+ task->pid,
+ get_release(task), get_deadline(task),
+ get_passed_quanta(task));
+
+ queue_lock_irqsave(&pfair.pfair_lock, flags);
+ pfair_add_release(&pfair, task);
+ queue_unlock_irqrestore(&pfair.pfair_lock, flags);
+ }
+}
+
+
+static void pfair_task_blocks(struct task_struct *t)
+{
+ unsigned long flags;
+ int i;
+ cpumask_t res;
+ BUG_ON(!is_realtime(t));
+ /* If the task blocks, then it must be removed from the running set */
+ queue_lock_irqsave(&pfair.pfair_lock, flags);
+	cpus_and(res, pfair.domain_cpus, cpu_online_map);
+ for_each_cpu_mask(i, res) {
+ if (per_cpu(states, i).t == t)
+ per_cpu(states, i).t = NULL;
+ }
+ /* If the task is running and in some
+ * list it might have been released by another
+ * processor
+ */
+ if((t->rt_list.next != LIST_POISON1 ||
+ t->rt_list.prev != LIST_POISON2)) {
+ TRACE("[%d] %d task %d is deleted from the list\n",
+ smp_processor_id(),
+ jiffies-rt_start_time, t->pid);
+ list_del(&t->rt_list);
+ }
+ queue_unlock_irqrestore(&pfair.pfair_lock, flags);
+ TRACE("[%d] %d task %d blocks with budget=%d state=%d\n",
+ smp_processor_id(), jiffies-rt_start_time,
+ t->pid, t->time_slice, t->state);
+}
+
+static long pfair_tear_down(struct task_struct * t)
+{
+ BUG_ON(!is_realtime(t));
+ TRACE("pfair: tear down called for %d \n", t->pid);
+ BUG_ON(t->array);
+ BUG_ON(t->rt_list.next != LIST_POISON1);
+ BUG_ON(t->rt_list.prev != LIST_POISON2);
+ return 0;
+}
+
+static int pfair_mode_change(int new_mode)
+{
+ printk(KERN_INFO "[%d] pfair mode change %d\n",
+ smp_processor_id(), new_mode);
+ if (new_mode == MODE_RT_RUN) {
+ pfair_prepare_new_releases(&pfair, jiffies + 10);
+ }
+ printk(KERN_INFO "[%d] pfair: mode change done\n", smp_processor_id());
+ return 0;
+}
+
+/* Plugin object */
+static sched_plugin_t s_plugin __cacheline_aligned_in_smp = {
+ .ready_to_use = 0
+};
+/*
+* PFAIR plugin initialization macro.
+*/
+#define INIT_PFAIR_PLUGIN (struct sched_plugin){\
+ .plugin_name = "PFAIR",\
+ .ready_to_use = 1,\
+ .algo_scheduler_tick = pfair_scheduler_tick,\
+ .scheduler_tick = rt_scheduler_tick,\
+ .prepare_task = pfair_prepare_task,\
+ .sleep_next_period = 0,\
+ .tear_down = pfair_tear_down,\
+ .shutdown_hook = 0,\
+ .schedule = pfair_schedule,\
+ .finish_switch = pfair_finish_task_switch,\
+ .mode_change = pfair_mode_change,\
+ .wake_up_task = pfair_wake_up_task,\
+ .task_blocks = pfair_task_blocks \
+ }
+
+sched_plugin_t* __init init_pfair_plugin(void)
+{
+ int i=0;
+ if (!s_plugin.ready_to_use) {
+ set_sched_options(SCHED_NONE);
+ pfair_domain_init(&pfair);
+ for (i=0; i<NR_CPUS; i++) {
+ sync_go[i] = 0;
+ per_cpu(states, i).t = NULL;
+ }
+ s_plugin = INIT_PFAIR_PLUGIN;
+ }
+ return &s_plugin;
+}
diff --git a/kernel/sched_plugin.c b/kernel/sched_plugin.c
new file mode 100644
index 0000000..5d5fb88
--- /dev/null
+++ b/kernel/sched_plugin.c
@@ -0,0 +1,172 @@
+/* sched_plugin.c -- core infrastructure for the scheduler plugin system
+ *
+ * This file includes the initialization of the plugin system, the no-op Linux
+ * scheduler plugin and some dummy functions.
+ */
+
+
+#include <linux/litmus.h>
+#include <linux/sched_plugin.h>
+
+
+/* These are the original Linux initialization functions.
+ * We replace them here with our initialization code and call them
+ * after setting up LITMUS.
+ */
+void linux_sched_init(void);
+void linux_sched_init_smp(void);
+int linux_migration_init(void);
+
+/*************************************************************
+ * Dummy plugin functions *
+ *************************************************************/
+
+void litmus_dummy_finish_switch(struct task_struct * prev)
+{
+}
+
+int litmus_dummy_schedule(struct task_struct * prev,
+ struct task_struct** next,
+ runqueue_t* q)
+{
+ return 0;
+}
+
+reschedule_check_t litmus_dummy_scheduler_tick(void)
+{
+ return NO_RESCHED;
+}
+
+
+long litmus_dummy_prepare_task(struct task_struct *t)
+{
+ return 0;
+}
+
+void litmus_dummy_wake_up_task(struct task_struct *task)
+{
+ printk(KERN_WARNING "task %d: unhandled real-time wake up!\n",
+ task->pid);
+}
+
+void litmus_dummy_task_blocks(struct task_struct *task)
+{
+}
+
+long litmus_dummy_tear_down(struct task_struct *task)
+{
+ return 0;
+}
+
+long litmus_dummy_enter_np(struct task_struct *task)
+{
+ return -EPERM;
+}
+
+long litmus_dummy_exit_np(struct task_struct *task)
+{
+ return -EPERM;
+}
+
+int litmus_dummy_scheduler_setup(int cmd, void __user *parameter)
+{
+ return -EPERM;
+}
+
+long litmus_dummy_sleep_next_period(void)
+{
+ return -EPERM;
+}
+
+long litmus_dummy_inherit_priority(struct pi_semaphore *sem,
+ struct task_struct *new_owner)
+{
+ return -EPERM;
+}
+
+long litmus_dummy_return_priority(struct pi_semaphore *sem)
+{
+ return -EPERM;
+}
+
+long litmus_dummy_pi_block(struct pi_semaphore *sem,
+ struct task_struct *new_waiter)
+{
+ return -EPERM;
+}
+
+
+/* The default scheduler plugin. It doesn't do anything and lets Linux do its
+ * job.
+ */
+
+sched_plugin_t linux_sched_plugin = {
+ .plugin_name = "Linux",
+ .ready_to_use = 1,
+ .algo_scheduler_tick = 0,
+ .scheduler_tick = litmus_dummy_scheduler_tick,
+ .prepare_task = litmus_dummy_prepare_task,
+ .tear_down = litmus_dummy_tear_down,
+ .wake_up_task = litmus_dummy_wake_up_task,
+ .task_blocks = litmus_dummy_task_blocks,
+ .sleep_next_period = litmus_dummy_sleep_next_period,
+ .shutdown_hook = 0,
+ .schedule = litmus_dummy_schedule,
+ .finish_switch = litmus_dummy_finish_switch,
+ .scheduler_setup = litmus_dummy_scheduler_setup,
+ .inherit_priority = litmus_dummy_inherit_priority,
+ .return_priority = litmus_dummy_return_priority,
+ .pi_block = litmus_dummy_pi_block
+};
+
+/*
+ * The reference to current plugin that is used to schedule tasks within
+ * the system. It stores references to actual function implementations
+ * Should be initialized by calling "init_***_plugin()"
+ */
+sched_plugin_t *curr_sched_plugin = &linux_sched_plugin;
+
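+/* Selection sketch (illustrative only; the boot-time selection code is not
+ * part of this file):
+ *
+ *	curr_sched_plugin = init_gsn_edf_plugin();
+ *
+ * Each init_*_plugin() fills in its plugin structure on first use and
+ * returns a pointer to it; all scheduling hooks are subsequently dispatched
+ * through curr_sched_plugin.
+ */
+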
+
+/* At sched-init */
+void __init sched_init(void)
+{
+ printk("Entering custom sched init, plugin %s\n",
+ curr_sched_plugin->plugin_name);
+ /* Init tracing facility before plugin functions are called */
+
+ /* CLEANUP: reenable this if needed
+ pstats = INIT_PSTATS;
+
+ */
+
+ /* Call linux sched init tasks */
+ linux_sched_init();
+ printk("Sched init complete\n");
+}
+
+void __init sched_init_smp(void)
+{
+ printk("Entering custom SMP init, plugin %s\n",
+ curr_sched_plugin->plugin_name);
+ /* Call linux smp initializer */
+ linux_sched_init_smp();
+ /* Enable tracing facilities here */
+ /*
+ CLEANUP: Reenable if needed.
+ if (smp_processor_id() == 0) {
+ if (init_trace()) {
+ printk("Tracing disabled\n");
+ } else {
+ printk("Default tracing enabled\n");
+ }
+ } */
+ printk("Sched init SMP complete\n");
+}
+
+int __init migration_init(void)
+{
+ printk("Entering migration init\n");
+
+ /* Call linux migration init as it was before */
+ return linux_migration_init();
+}
diff --git a/kernel/sched_psn_edf.c b/kernel/sched_psn_edf.c
new file mode 100644
index 0000000..8c3c2d8
--- /dev/null
+++ b/kernel/sched_psn_edf.c
@@ -0,0 +1,531 @@
+
+/*
+ * kernel/sched_psn_edf.c
+ *
+ * Implementation of the PSN-EDF scheduler plugin.
+ * Based on kern/sched_part_edf.c and kern/sched_gsn_edf.c.
+ *
+ * Suspensions and non-preemptable sections are supported.
+ * Priority inheritance is not supported.
+ */
+
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+
+#include <linux/litmus.h>
+#include <linux/sched_plugin.h>
+#include <linux/edf_common.h>
+
+
+typedef struct {
+ edf_domain_t domain;
+ int cpu;
+ struct task_struct* scheduled; /* only RT tasks */
+ spinlock_t lock; /* protects the domain and
+ * serializes scheduling decisions
+ */
+} psnedf_domain_t;
+
+DEFINE_PER_CPU(psnedf_domain_t, psnedf_domains);
+
+#define local_edf (&__get_cpu_var(psnedf_domains).domain)
+#define local_pedf (&__get_cpu_var(psnedf_domains))
+#define remote_edf(cpu) (&per_cpu(psnedf_domains, cpu).domain)
+#define remote_pedf(cpu) (&per_cpu(psnedf_domains, cpu))
+#define task_edf(task) remote_edf(get_partition(task))
+#define task_pedf(task) remote_pedf(get_partition(task))
+
+
+static void psnedf_domain_init(psnedf_domain_t* pedf,
+ edf_check_resched_needed_t check,
+ int cpu)
+{
+ edf_domain_init(&pedf->domain, check);
+ pedf->cpu = cpu;
+ pedf->lock = SPIN_LOCK_UNLOCKED;
+ pedf->scheduled = NULL;
+}
+
+/* we assume the lock is being held */
+static void preempt(psnedf_domain_t *pedf)
+{
+ /* don't interrupt non-preemptable tasks */
+ if (pedf->scheduled && is_np(pedf->scheduled))
+ return;
+
+ if (pedf->cpu == smp_processor_id())
+ set_tsk_need_resched(current);
+ else
+ smp_send_reschedule(pedf->cpu);
+}
+
+/* This check is trivial in partitioned systems as we only have to consider
+ * the CPU of the partition.
+ */
+static int psnedf_check_resched(edf_domain_t *edf)
+{
+ psnedf_domain_t *pedf = container_of(edf, psnedf_domain_t, domain);
+ int ret = 0;
+
+ /* because this is a callback from edf_domain_t we already hold
+ * the necessary lock for the ready queue
+ */
+ if (preemption_needed(edf, pedf->scheduled)) {
+ preempt(pedf);
+ ret = 1;
+ }
+ return ret;
+}
+
+
+static reschedule_check_t psnedf_scheduler_tick(void)
+{
+ unsigned long flags;
+ struct task_struct *t = current;
+ reschedule_check_t want_resched = NO_RESCHED;
+ edf_domain_t *edf = local_edf;
+ psnedf_domain_t *pedf = local_pedf;
+
+ /* Check for inconsistency. We don't need the lock for this since
+ * ->scheduled is only changed in schedule, which obviously is not
+ * executing in parallel on this CPU
+ */
+ BUG_ON(is_realtime(t) && t != pedf->scheduled);
+
+ if (is_realtime(t))
+ TRACE("%s/%d was hit by scheduler tick\n", t->comm, t->pid);
+
+ /* expire tasks even if not in real-time mode
+ * this makes sure that at the end of real-time mode
+ * no tasks "run away forever".
+ */
+ if (is_realtime(t) && t->time_slice && !--t->time_slice) {
+ if (!is_np(t)) {
+ TRACE("%s/%d was marked as RT_F_SLEEP, "
+ "state=%d\n",
+ t->comm, t->pid, t->state);
+ set_rt_flags(t, RT_F_SLEEP);
+ want_resched = FORCE_RESCHED;
+ } else
+ TRACE("psnedf_scheduler_tick: "
+ "%d is non-preemptable, "
+ "preemption delayed.\n", t->pid);
+ }
+
+ if (get_rt_mode() == MODE_RT_RUN)
+ {
+ /* check whether anything is waiting to be released
+ * this could probably be moved to the global timer
+ * interrupt handler since the state will only change
+ * once per jiffie
+ */
+ spin_lock_irqsave(&pedf->lock, flags);
+ __release_pending(edf);
+ if (want_resched != FORCE_RESCHED &&
+ preemption_needed(edf, t))
+ want_resched = FORCE_RESCHED;
+
+ spin_unlock_irqrestore(&pedf->lock, flags);
+
+ }
+ return want_resched;
+}
+
+static int psnedf_schedule(struct task_struct * prev,
+ struct task_struct ** next,
+ runqueue_t * rq)
+{
+ int need_deactivate = 1;
+ psnedf_domain_t* pedf = local_pedf;
+ edf_domain_t* edf = &pedf->domain;
+
+ /* if a real-time task is non-preemptable, then schedule it again.
+ */
+ if (is_realtime(prev) &&
+ is_running(prev) &&
+ is_np(prev)) {
+ TRACE("psnedf_schedule: is_np(%d) = %d => reschedule prev\n",
+ prev->pid, is_np(prev));
+ *next = prev;
+ return 0;
+ }
+
+ if (is_realtime(prev) && get_rt_flags(prev) == RT_F_SLEEP)
+ prepare_for_next_period(prev);
+
+ if (get_rt_mode() == MODE_RT_RUN) {
+ spin_lock(&pedf->lock);
+ if (is_realtime(prev) && is_released(prev) && is_running(prev)
+ && !preemption_needed(edf, prev)) {
+ /* this really should only happen if the task has
+ * 100% utilization...
+ */
+ TRACE("prev will be next, already released\n");
+ *next = prev;
+ need_deactivate = 0;
+ } else {
+ /* either not yet released, preempted, or non-rt */
+ *next = __take_ready(edf);
+ if (*next) {
+ /* stick the task into the runqueue */
+ __activate_task(*next, rq);
+ set_task_cpu(*next, smp_processor_id());
+ }
+ }
+ pedf->scheduled = *next;
+ if (*next)
+ set_rt_flags(*next, RT_F_RUNNING);
+
+ spin_unlock(&pedf->lock);
+ }
+
+ if (is_realtime(prev) && need_deactivate && prev->array) {
+ /* take it out of the run queue */
+ deactivate_task(prev, rq);
+ }
+
+ return 0;
+}
+
+
+static void psnedf_finish_switch(struct task_struct *prev)
+{
+ edf_domain_t* edf = local_edf;
+ psnedf_domain_t* pedf = local_pedf;
+
+ if (!is_realtime(prev))
+ return;
+
+ if (is_blocked(prev)) {
+		TRACE("psnedf: %s/%d is not requeued "
+		      "(state=%d, prev->preempt_count=%x)\n",
+ prev->comm, prev->pid, prev->state,
+ prev->thread_info->preempt_count);
+ return;
+ }
+
+ if (prev->state != TASK_RUNNING)
+		TRACE("psnedf: %s/%d is requeued because of preempt hack "
+		      "(state=%d, prev->preempt_count=%x)\n",
+ prev->comm, prev->pid, prev->state,
+ prev->thread_info->preempt_count);
+
+	/* IRQs are still disabled by schedule(), which is calling us */
+ spin_lock(&pedf->lock);
+ if ((get_rt_flags(prev) == RT_F_SLEEP && !is_released(prev)) ||
+ get_rt_mode() != MODE_RT_RUN)
+ __add_release(edf, prev); /* it has got to wait */
+ else
+ __add_ready(edf, prev);
+ spin_unlock(&pedf->lock);
+}
+
+
+/* Prepare a task for running in RT mode
+ * Enqueues the task into master queue data structure
+ * returns
+ * -EPERM if task is not TASK_STOPPED
+ */
+static long psnedf_prepare_task(struct task_struct * t)
+{
+ edf_domain_t* edf = task_edf(t);
+ psnedf_domain_t* pedf = task_pedf(t);
+ unsigned long flags;
+
+ TRACE("[%d] psn edf: prepare task %d on CPU %d\n",
+ smp_processor_id(), t->pid, get_partition(t));
+ if (t->state == TASK_STOPPED) {
+ __setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1);
+
+ if (get_rt_mode() == MODE_RT_RUN)
+			/* Real-time mode is already active.
+			 * Prepare an immediate release.
+			 */
+ prepare_new_release(t);
+ /* The task should be running in the queue, otherwise signal
+ * code will try to wake it up with fatal consequences.
+ */
+ t->state = TASK_RUNNING;
+ spin_lock_irqsave(&pedf->lock, flags);
+ __add_release(edf, t);
+ spin_unlock_irqrestore(&pedf->lock, flags);
+ return 0;
+ } else
+ return -EPERM;
+}
+
+static void psnedf_wake_up_task(struct task_struct *task)
+{
+ unsigned long flags;
+ psnedf_domain_t* pedf = task_pedf(task);
+ edf_domain_t* edf = task_edf(task);
+
+ TRACE("psnedf: %d unsuspends with budget=%d\n",
+ task->pid, task->time_slice);
+
+ BUG_ON(in_list(&task->rt_list));
+
+ task->state = TASK_RUNNING;
+
+ /* We need to take suspensions because of semaphores into
+ * account! If a job resumes after being suspended due to acquiring
+ * a semaphore, it should never be treated as a new job release.
+ */
+ if (is_tardy(task) && get_rt_flags(task) != RT_F_EXIT_SEM) {
+ /* new sporadic release */
+ prepare_new_release(task);
+ sched_trace_job_release(task);
+ }
+
+ spin_lock_irqsave(&pedf->lock, flags);
+ if (task->time_slice) {
+ set_rt_flags(task, RT_F_RUNNING);
+ __add_ready(edf, task);
+ } else
+ __add_release(edf, task);
+ spin_unlock_irqrestore(&pedf->lock, flags);
+
+}
+
+static void psnedf_task_blocks(struct task_struct *t)
+{
+ BUG_ON(!is_realtime(t));
+ /* not really anything to do since it can only block if
+ * it is running, and when it is not running it is not in any
+ * queue anyway.
+ */
+ TRACE("task %d blocks with budget=%d\n", t->pid, t->time_slice);
+ BUG_ON(in_list(&t->rt_list));
+}
+
+
+/* When _tear_down is called, the task should not be in any queue any more
+ * as it must have blocked first. We don't have any internal state for the task;
+ * it is all in the task_struct.
+ */
+static long psnedf_tear_down(struct task_struct * t)
+{
+ BUG_ON(!is_realtime(t));
+ TRACE_TASK(t, "tear down called");
+ BUG_ON(t->array);
+ BUG_ON(in_list(&t->rt_list));
+ return 0;
+}
+
+static long psnedf_enter_np(struct task_struct * t)
+{
+ unsigned long flags;
+ psnedf_domain_t* pedf = task_pedf(t);
+
+ spin_lock_irqsave(&pedf->lock, flags);
+ t->rt_param.is_non_preemptable++;
+ spin_unlock_irqrestore(&pedf->lock, flags);
+ return 0;
+}
+
+static long psnedf_exit_np(struct task_struct * t)
+{
+ unsigned long flags;
+ psnedf_domain_t* pedf = task_pedf(t);
+ int ret = 0;
+
+ spin_lock_irqsave(&pedf->lock, flags);
+ if (is_np(t)) {
+ if (!--t->rt_param.is_non_preemptable &&
+ (!t->time_slice ||
+ preemption_needed(task_edf(t), t))) {
+ if (!t->time_slice) {
+ set_rt_flags(t, RT_F_SLEEP);
+ TRACE("psnedf_exit_np: delayed preemption "
+ "of %d\n", t->pid);
+ }
+ BUG_ON(t != local_pedf->scheduled);
+ set_tsk_need_resched(t);
+ }
+ } else
+ ret = -EPERM;
+ spin_unlock_irqrestore(&pedf->lock, flags);
+ return ret;
+}
+
+static long psnedf_pi_block(struct pi_semaphore *sem,
+ struct task_struct *new_waiter)
+{
+ psnedf_domain_t* pedf;
+ edf_domain_t* edf;
+ struct task_struct* t;
+ int cpu = get_partition(new_waiter);
+
+ BUG_ON(!new_waiter);
+
+ if (edf_higher_prio(new_waiter, sem->hp.cpu_task[cpu])) {
+ TRACE_TASK(new_waiter, " boosts priority\n");
+ pedf = task_pedf(new_waiter);
+ edf = task_edf(new_waiter);
+
+ /* interrupts already disabled */
+ spin_lock(&pedf->lock);
+
+ /* store new highest-priority task */
+ sem->hp.cpu_task[cpu] = new_waiter;
+ if (sem->holder &&
+ get_partition(sem->holder) == get_partition(new_waiter)) {
+ /* let holder inherit */
+ sem->holder->rt_param.inh_task = new_waiter;
+ t = sem->holder;
+ if (in_list(&t->rt_list)) {
+ /* queued in domain*/
+ list_del(&t->rt_list);
+ /* readd to make priority change take place */
+ if (is_released(t))
+ __add_ready(edf, t);
+ else
+ __add_release(edf, t);
+ }
+ }
+
+ /* check if we need to reschedule */
+ if (preemption_needed(edf, current))
+ preempt(pedf);
+
+ spin_unlock(&pedf->lock);
+ }
+
+ return 0;
+}
+
+static long psnedf_inherit_priority(struct pi_semaphore *sem,
+ struct task_struct *new_owner)
+{
+ int cpu = get_partition(new_owner);
+
+ new_owner->rt_param.inh_task = sem->hp.cpu_task[cpu];
+ if (sem->hp.cpu_task[cpu] && new_owner != sem->hp.cpu_task[cpu]) {
+ TRACE_TASK(new_owner,
+ "inherited priority from %s/%d\n",
+ sem->hp.cpu_task[cpu]->comm,
+ sem->hp.cpu_task[cpu]->pid);
+ } else
+ TRACE_TASK(new_owner,
+ "cannot inherit priority: "
+ "no higher priority job waits on this CPU!\n");
+ /* make new owner non-preemptable as required by FMLP/FLEX under
+ * PSN-EDF.
+ */
+ psnedf_enter_np(new_owner);
+ return 0;
+}
+
+
+/* This function is called on a semaphore release, and assumes that
+ * the current task is also the semaphore holder.
+ */
+static long psnedf_return_priority(struct pi_semaphore *sem)
+{
+ struct task_struct* t = current;
+ psnedf_domain_t* pedf = task_pedf(t);
+ edf_domain_t* edf = task_edf(t);
+ int ret = 0;
+ int cpu = get_partition(current);
+
+
+ /* Find new highest-priority semaphore task
+ * if holder task is the current hp.cpu_task[cpu].
+ *
+ * Calling function holds sem->wait.lock.
+ */
+ if (t == sem->hp.cpu_task[cpu])
+ edf_set_hp_cpu_task(sem, cpu);
+
+ psnedf_exit_np(t);
+ if (current->rt_param.inh_task) {
+ TRACE_CUR("return priority of %s/%d\n",
+ current->rt_param.inh_task->comm,
+ current->rt_param.inh_task->pid);
+ spin_lock(&pedf->lock);
+
+ /* Reset inh_task to NULL. */
+ current->rt_param.inh_task = NULL;
+
+ /* check if we need to reschedule */
+ if (preemption_needed(edf, current))
+ preempt(pedf);
+
+ spin_unlock(&pedf->lock);
+ } else
+ TRACE_CUR(" no priority to return %p\n", sem);
+
+ return ret;
+}
+
+
+static int psnedf_mode_change(int new_mode)
+{
+ int cpu;
+
+ if (new_mode == MODE_RT_RUN)
+ for_each_online_cpu(cpu) {
+ spin_lock(&remote_pedf(cpu)->lock);
+ __prepare_new_releases(remote_edf(cpu), jiffies);
+ spin_unlock(&remote_pedf(cpu)->lock);
+ }
+
+ TRACE("[%d] psn edf: mode changed to %d\n",
+ smp_processor_id(), new_mode);
+ return 0;
+}
+
+
+/* Plugin object */
+static sched_plugin_t s_plugin __cacheline_aligned_in_smp = {
+ .ready_to_use = 0
+};
+
+
+/*
+ * Plugin initialization code.
+ */
+#define INIT_SCHED_PLUGIN (struct sched_plugin) {\
+ .plugin_name = "PSN-EDF",\
+ .ready_to_use = 1,\
+ .algo_scheduler_tick = psnedf_scheduler_tick,\
+ .scheduler_tick = rt_scheduler_tick,\
+ .prepare_task = psnedf_prepare_task,\
+ .sleep_next_period = edf_sleep_next_period,\
+ .tear_down = psnedf_tear_down,\
+ .shutdown_hook = NULL,\
+ .schedule = psnedf_schedule,\
+ .finish_switch = psnedf_finish_switch,\
+ .mode_change = psnedf_mode_change,\
+ .wake_up_task = psnedf_wake_up_task,\
+ .task_blocks = psnedf_task_blocks, \
+ .enter_np = psnedf_enter_np, \
+ .exit_np = psnedf_exit_np, \
+ .pi_block = psnedf_pi_block, \
+ .inherit_priority = psnedf_inherit_priority, \
+ .return_priority = psnedf_return_priority \
+}
+
+
+sched_plugin_t *__init init_psn_edf_plugin(void)
+{
+ int i;
+
+ if (!s_plugin.ready_to_use)
+ {
+ set_sched_options(SCHED_NONE);
+ for (i = 0; i < NR_CPUS; i++)
+ {
+ psnedf_domain_init(remote_pedf(i),
+ psnedf_check_resched, i);
+ printk("PSN-EDF: CPU partition %d initialized.\n", i);
+ }
+ s_plugin = INIT_SCHED_PLUGIN;
+ }
+ return &s_plugin;
+}
+
+
+
diff --git a/kernel/sched_trace.c b/kernel/sched_trace.c
new file mode 100644
index 0000000..be495f1
--- /dev/null
+++ b/kernel/sched_trace.c
@@ -0,0 +1,725 @@
+/* sched_trace.c -- record scheduling events to a byte stream.
+ *
+ * TODO: Move the ring buffer to a lock-free implementation.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/cdev.h>
+#include <asm/semaphore.h>
+#include <asm/uaccess.h>
+#include <linux/module.h>
+
+#include <linux/queuelock.h>
+#include <linux/sched_trace.h>
+#include <linux/litmus.h>
+
+
+typedef struct {
+ /* guard read and write pointers */
+ spinlock_t lock;
+ /* guard against concurrent freeing of buffer */
+ rwlock_t del_lock;
+
+ /* memory allocated for ring buffer */
+ unsigned long order;
+ char* buf;
+ char* end;
+
+	/* Read and write pointers. They may not cross.
+	 * writep points to the position of the next write;
+	 * readp points to the position of the last read.
+	 */
+ char* writep;
+ char* readp;
+
+} ring_buffer_t;
+
+#define EMPTY_RING_BUFFER { \
+ .lock = SPIN_LOCK_UNLOCKED, \
+ .del_lock = RW_LOCK_UNLOCKED, \
+ .buf = NULL, \
+ .end = NULL, \
+ .writep = NULL, \
+ .readp = NULL \
+}
+
+void rb_init(ring_buffer_t* buf)
+{
+ *buf = (ring_buffer_t) EMPTY_RING_BUFFER;
+}
+
+int rb_alloc_buf(ring_buffer_t* buf, unsigned long order)
+{
+ unsigned long flags;
+ int error = 0;
+ char *mem;
+
+ /* do memory allocation while not atomic */
+ mem = (char *) __get_free_pages(GFP_KERNEL, order);
+ if (!mem)
+ return -ENOMEM;
+ write_lock_irqsave(&buf->del_lock, flags);
+ BUG_ON(buf->buf);
+ buf->buf = mem;
+ buf->end = buf->buf + PAGE_SIZE * (1 << order) - 1;
+ memset(buf->buf, 0xff, buf->end - buf->buf);
+ buf->order = order;
+ buf->writep = buf->buf + 1;
+ buf->readp = buf->buf;
+ write_unlock_irqrestore(&buf->del_lock, flags);
+ return error;
+}
+
+int rb_free_buf(ring_buffer_t* buf)
+{
+ unsigned long flags;
+ int error = 0;
+ write_lock_irqsave(&buf->del_lock, flags);
+ BUG_ON(!buf->buf);
+ free_pages((unsigned long) buf->buf, buf->order);
+ buf->buf = NULL;
+ buf->end = NULL;
+ buf->writep = NULL;
+ buf->readp = NULL;
+ write_unlock_irqrestore(&buf->del_lock, flags);
+ return error;
+}
+
+/* Assumption: concurrent writes are serialized externally
+ *
+ * Will only succeed if there is enough space for all len bytes.
+ */
+int rb_put(ring_buffer_t* buf, char* mem, size_t len)
+{
+ unsigned long flags;
+ char* r , *w;
+ int error = 0;
+ read_lock_irqsave(&buf->del_lock, flags);
+ if (!buf->buf) {
+ error = -ENODEV;
+ goto out;
+ }
+ spin_lock(&buf->lock);
+ r = buf->readp;
+ w = buf->writep;
+ spin_unlock(&buf->lock);
+ if (r < w && buf->end - w >= len - 1) {
+ /* easy case: there is enough space in the buffer
+		 * to write it in one continuous chunk */
+ memcpy(w, mem, len);
+ w += len;
+ if (w > buf->end)
+ /* special case: fit exactly into buffer
+ * w is now buf->end + 1
+ */
+ w = buf->buf;
+ } else if (w < r && r - w >= len) { /* >= len because may not cross */
+		/* we are constrained by the read pointer but there
+ * is enough space
+ */
+ memcpy(w, mem, len);
+ w += len;
+ } else if (r <= w && buf->end - w < len - 1) {
+ /* the wrap around case: there may or may not be space */
+ if ((buf->end - w) + (r - buf->buf) >= len - 1) {
+ /* copy chunk that fits at the end */
+ memcpy(w, mem, buf->end - w + 1);
+ mem += buf->end - w + 1;
+ len -= (buf->end - w + 1);
+ w = buf->buf;
+ /* copy the rest */
+ memcpy(w, mem, len);
+ w += len;
+ }
+ else
+ error = -ENOMEM;
+ } else {
+ error = -ENOMEM;
+ }
+ if (!error) {
+ spin_lock(&buf->lock);
+ buf->writep = w;
+ spin_unlock(&buf->lock);
+ }
+ out:
+ read_unlock_irqrestore(&buf->del_lock, flags);
+ return error;
+}
+
+/* Assumption: concurrent reads are serialized externally */
+int rb_get(ring_buffer_t* buf, char* mem, size_t len)
+{
+ unsigned long flags;
+ char* r , *w;
+ int error = 0;
+ read_lock_irqsave(&buf->del_lock, flags);
+ if (!buf->buf) {
+ error = -ENODEV;
+ goto out;
+ }
+ spin_lock(&buf->lock);
+ r = buf->readp;
+ w = buf->writep;
+ spin_unlock(&buf->lock);
+
+ if (w <= r && buf->end - r >= len) {
+ /* easy case: there is enough data in the buffer
+ * to get it in one chunk*/
+ memcpy(mem, r + 1, len);
+ r += len;
+ error = len;
+
+ } else if (r + 1 < w && w - r - 1 >= len) {
+ /* we are constrained by the write pointer but
+ * there is enough data
+ */
+ memcpy(mem, r + 1, len);
+ r += len;
+ error = len;
+
+ } else if (r + 1 < w && w - r - 1 < len) {
+ /* we are constrained by the write pointer and there
+		 * is not enough data
+ */
+ memcpy(mem, r + 1, w - r - 1);
+ error = w - r - 1;
+ r += w - r - 1;
+
+ } else if (w <= r && buf->end - r < len) {
+ /* the wrap around case: there may or may not be enough data
+ * first let's get what is available
+ */
+ memcpy(mem, r + 1, buf->end - r);
+ error += (buf->end - r);
+ mem += (buf->end - r);
+ len -= (buf->end - r);
+ r += (buf->end - r);
+
+ if (w > buf->buf) {
+ /* there is more to get */
+ r = buf->buf - 1;
+ if (w - r >= len) {
+ /* plenty */
+ memcpy(mem, r + 1, len);
+ error += len;
+ r += len;
+ } else {
+ memcpy(mem, r + 1, w - r - 1);
+ error += w - r - 1;
+ r += w - r - 1;
+ }
+ }
+ } /* nothing available */
+
+ if (error > 0) {
+ spin_lock(&buf->lock);
+ buf->readp = r;
+ spin_unlock(&buf->lock);
+ }
+ out:
+ read_unlock_irqrestore(&buf->del_lock, flags);
+ return error;
+}
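+
+/* Usage sketch (illustrative only, not used by the driver below): writers
+ * must be serialized externally, e.g. by a spinlock with interrupts off,
+ * exactly as the in-kernel users later in this file do; a single reader
+ * may then drain the buffer. my_rb, my_lock, rec, out, and flags are
+ * placeholder names for this example.
+ *
+ *	static ring_buffer_t my_rb;
+ *	static spinlock_t my_lock = SPIN_LOCK_UNLOCKED;
+ *
+ *	rb_init(&my_rb);
+ *	if (rb_alloc_buf(&my_rb, 2))		// 2^2 = 4 pages
+ *		return -ENOMEM;
+ *
+ *	spin_lock_irqsave(&my_lock, flags);	// serialize concurrent writers
+ *	rb_put(&my_rb, (char *) &rec, sizeof(rec));
+ *	spin_unlock_irqrestore(&my_lock, flags);
+ *
+ *	bytes = rb_get(&my_rb, out, sizeof(out));	// bytes read, 0, or -ENODEV
+ */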
+
+
+
+/******************************************************************************/
+/* DEVICE FILE DRIVER */
+/******************************************************************************/
+
+
+
+/* Allocate a buffer of about 1 MB per CPU:
+ * 2^BUFFER_ORDER = 256 pages of 4 KB each (assuming 4 KB pages).
+ */
+#define BUFFER_ORDER 8
+
+typedef struct {
+ ring_buffer_t buf;
+ atomic_t reader_cnt;
+ struct semaphore reader_mutex;
+} trace_buffer_t;
+
+
+/* This does not initialize the semaphore!! */
+
+#define EMPTY_TRACE_BUFFER \
+ { .buf = EMPTY_RING_BUFFER, .reader_cnt = ATOMIC_INIT(0)}
+
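+/* One ring buffer per CPU for binary scheduling-event records; the global
+ * log_buffer below holds the plain-text TRACE() debug log.
+ */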
+static DEFINE_PER_CPU(trace_buffer_t, trace_buffer);
+
+#ifdef CONFIG_SCHED_DEBUG_TRACE
+static spinlock_t log_buffer_lock = SPIN_LOCK_UNLOCKED;
+#endif
+static trace_buffer_t log_buffer = EMPTY_TRACE_BUFFER;
+
+static void init_buffers(void)
+{
+ int i;
+
+ for (i = 0; i < NR_CPUS; i++) {
+ rb_init(&per_cpu(trace_buffer, i).buf);
+ init_MUTEX(&per_cpu(trace_buffer, i).reader_mutex);
+ atomic_set(&per_cpu(trace_buffer, i).reader_cnt, 0);
+ }
+ /* only initialize the mutex, the rest was initialized as part
+ * of the static initialization macro
+ */
+ init_MUTEX(&log_buffer.reader_mutex);
+}
+
+static int trace_release(struct inode *in, struct file *filp)
+{
+ int error = -EINVAL;
+ trace_buffer_t* buf = filp->private_data;
+
+ BUG_ON(!filp->private_data);
+
+ if (down_interruptible(&buf->reader_mutex)) {
+ error = -ERESTARTSYS;
+ goto out;
+ }
+
+ /* last release must deallocate buffers */
+ if (atomic_dec_return(&buf->reader_cnt) == 0) {
+ error = rb_free_buf(&buf->buf);
+ }
+
+ up(&buf->reader_mutex);
+ out:
+ return error;
+}
+
+static ssize_t trace_read(struct file *filp, char __user *to, size_t len,
+ loff_t *f_pos)
+{
+ /* we ignore f_pos, this is strictly sequential */
+
+ ssize_t error = -EINVAL;
+ char* mem;
+ trace_buffer_t *buf = filp->private_data;
+
+ if (down_interruptible(&buf->reader_mutex)) {
+ error = -ERESTARTSYS;
+ goto out;
+ }
+
+ if (len > 64 * 1024)
+ len = 64 * 1024;
+ mem = kmalloc(len, GFP_KERNEL);
+ if (!mem) {
+ error = -ENOMEM;
+ goto out_unlock;
+ }
+
+ error = rb_get(&buf->buf, mem, len);
+ while (!error) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(110);
+ if (signal_pending(current))
+ error = -ERESTARTSYS;
+ else
+ error = rb_get(&buf->buf, mem, len);
+ }
+
+ if (error > 0 && copy_to_user(to, mem, error))
+ error = -EFAULT;
+
+ kfree(mem);
+ out_unlock:
+ up(&buf->reader_mutex);
+ out:
+ return error;
+}
+
+
+/* trace_open - Open one of the per-CPU sched_trace buffers.
+ */
+static int trace_open(struct inode *in, struct file *filp)
+{
+ int error = -EINVAL;
+ int cpu = MINOR(in->i_rdev);
+ trace_buffer_t* buf;
+
+ if (!cpu_online(cpu)) {
+ printk(KERN_WARNING "sched trace: "
+ "CPU #%d is not online. (open failed)\n", cpu);
+ error = -ENODEV;
+ goto out;
+ }
+
+ buf = &per_cpu(trace_buffer, cpu);
+
+ if (down_interruptible(&buf->reader_mutex)) {
+ error = -ERESTARTSYS;
+ goto out;
+ }
+
+ /* first open must allocate buffers */
+ if (atomic_inc_return(&buf->reader_cnt) == 1) {
+ if ((error = rb_alloc_buf(&buf->buf, BUFFER_ORDER)))
+ {
+ atomic_dec(&buf->reader_cnt);
+ goto out_unlock;
+ }
+ }
+
+ error = 0;
+ filp->private_data = buf;
+
+ out_unlock:
+ up(&buf->reader_mutex);
+ out:
+ return error;
+}
+
+/* log_open - open the global log message ring buffer.
+ */
+static int log_open(struct inode *in, struct file *filp)
+{
+ int error = -EINVAL;
+ trace_buffer_t* buf;
+
+ buf = &log_buffer;
+
+ if (down_interruptible(&buf->reader_mutex)) {
+ error = -ERESTARTSYS;
+ goto out;
+ }
+
+ /* first open must allocate buffers */
+ if (atomic_inc_return(&buf->reader_cnt) == 1) {
+ if ((error = rb_alloc_buf(&buf->buf, BUFFER_ORDER)))
+ {
+ atomic_dec(&buf->reader_cnt);
+ goto out_unlock;
+ }
+ }
+
+ error = 0;
+ filp->private_data = buf;
+
+ out_unlock:
+ up(&buf->reader_mutex);
+ out:
+ return error;
+}
+
+/******************************************************************************/
+/* Device Registration */
+/******************************************************************************/
+
+/* the major numbers are from the unassigned/local use block
+ *
+ * This should be converted to dynamic allocation at some point...
+ */
+#define TRACE_MAJOR 250
+#define LOG_MAJOR 251
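+
+/* Sketch of how user space might create and read the corresponding device
+ * nodes. The patch only registers major numbers; the node name
+ * /dev/litmus_log (and minor number 0) is an assumption made for this
+ * example. For the per-CPU sched_trace device, the minor number selects
+ * the CPU.
+ *
+ *	#include <sys/types.h>
+ *	#include <sys/stat.h>
+ *	#include <sys/sysmacros.h>
+ *	#include <fcntl.h>
+ *	#include <unistd.h>
+ *
+ *	int main(void)
+ *	{
+ *		char buf[4096];
+ *		ssize_t n;
+ *		int fd;
+ *
+ *		mknod("/dev/litmus_log", S_IFCHR | 0600, makedev(251, 0));
+ *		fd = open("/dev/litmus_log", O_RDONLY);
+ *		while ((n = read(fd, buf, sizeof(buf))) > 0)
+ *			write(STDOUT_FILENO, buf, n);
+ *		close(fd);
+ *		return 0;
+ *	}
+ */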
+
+/* trace_fops - The file operations for accessing the per-CPU scheduling event
+ * trace buffers.
+ */
+struct file_operations trace_fops = {
+ .owner = THIS_MODULE,
+ .open = trace_open,
+ .release = trace_release,
+ .read = trace_read,
+};
+
+/* log_fops - The file operations for accessing the global LITMUS log message
+ * buffer.
+ *
+ * Except for opening the device file it uses the same operations as trace_fops.
+ */
+struct file_operations log_fops = {
+ .owner = THIS_MODULE,
+ .open = log_open,
+ .release = trace_release,
+ .read = trace_read,
+};
+
+static int __init register_buffer_dev(const char* name,
+ struct file_operations* fops,
+ int major, int count)
+{
+ dev_t trace_dev;
+ struct cdev *cdev;
+ int error = 0;
+
+ trace_dev = MKDEV(major, 0);
+ error = register_chrdev_region(trace_dev, count, name);
+ if (error)
+ {
+ printk(KERN_WARNING "sched trace: "
+ "Could not register major/minor number %d\n", major);
+ return error;
+ }
+ cdev = cdev_alloc();
+ if (!cdev) {
+ printk(KERN_WARNING "sched trace: "
+ "Could not get a cdev for %s.\n", name);
+ return -ENOMEM;
+ }
+ cdev->owner = THIS_MODULE;
+ cdev->ops = fops;
+ error = cdev_add(cdev, trace_dev, count);
+ if (error) {
+ printk(KERN_WARNING "sched trace: "
+ "add_cdev failed for %s.\n", name);
+ return -ENOMEM;
+ }
+ return error;
+
+}
+
+static int __init init_sched_trace(void)
+{
+ int error1 = 0, error2 = 0;
+
+ printk("Initializing scheduler trace device\n");
+ init_buffers();
+
+ error1 = register_buffer_dev("schedtrace", &trace_fops,
+ TRACE_MAJOR, NR_CPUS);
+
+ error2 = register_buffer_dev("litmus_log", &log_fops,
+ LOG_MAJOR, 1);
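+	/* error codes are negative, so min() reports a failure code
+	 * if either registration failed */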
+ if (error1 || error2)
+ return min(error1, error2);
+ else
+ return 0;
+}
+
+module_init(init_sched_trace);
+
+/******************************************************************************/
+/* KERNEL API */
+/******************************************************************************/
+
+/* The per-CPU buffer used to format LITMUS log messages. Don't put it on the
+ * stack; it is too big for that, and the kernel gets very picky about nested
+ * interrupts and small stacks.
+ */
+
+#ifdef CONFIG_SCHED_DEBUG_TRACE
+
+#define MSG_SIZE 255
+static DEFINE_PER_CPU(char[MSG_SIZE], fmt_buffer);
+
+/* sched_trace_log_message - This is the only function that accesses the
+ * log buffer inside the kernel for writing.
+ * Concurrent access to it is serialized via the
+ * log_buffer_lock.
+ *
+ * The maximum length of a formatted message is 255.
+ */
+void sched_trace_log_message(const char* fmt, ...)
+{
+ unsigned long flags;
+ va_list args;
+ size_t len;
+ char* buf;
+
+ va_start(args, fmt);
+ local_irq_save(flags);
+
+ /* format message */
+ buf = __get_cpu_var(fmt_buffer);
+ len = vscnprintf(buf, MSG_SIZE, fmt, args);
+
+ spin_lock(&log_buffer_lock);
+ /* Don't copy the trailing null byte, we don't want null bytes
+ * in a text file.
+ */
+ rb_put(&log_buffer.buf, buf, len);
+ spin_unlock(&log_buffer_lock);
+
+ local_irq_restore(flags);
+ va_end(args);
+}
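+
+/* The TRACE() debug macro used throughout the scheduler plugins is
+ * presumably a thin printf-style wrapper around this function. A minimal
+ * sketch of such a wrapper (the actual macro is defined elsewhere in the
+ * patch and may add further context) could look like:
+ *
+ *	#define TRACE(fmt, args...) \
+ *		sched_trace_log_message("(%d) " fmt, \
+ *					raw_smp_processor_id(), ## args)
+ */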
+
+#endif
+
+#ifdef CONFIG_SCHED_TASK_TRACE
+
+static inline void __put_trace(char* mem, size_t size)
+{
+ trace_buffer_t* buf = &__get_cpu_var(trace_buffer);
+ rb_put(&buf->buf, mem, size);
+}
+
+#define put_trace(obj) \
+ if (get_rt_mode() == MODE_RT_RUN) \
+ __put_trace((char *) &obj, sizeof(obj))
+
+#define header(rec, type) \
+{ \
+ rec.header.trace = type; \
+ rec.header.timestamp = sched_clock(); \
+}
+
+#define tinfo(info, t) \
+{ \
+ info.is_rt = is_realtime(t); \
+ info.is_server = 0; \
+ info.class = get_class(t); \
+ info.budget = (t)->time_slice; \
+ info.pid = (t)->pid; \
+ info.deadline = (t)->rt_param.times.deadline; \
+}
+
+#define rtinfo(info, t) \
+{ \
+ info.wcet = get_exec_cost(t); \
+ info.period = get_rt_period(t); \
+}
+
+void sched_trace_scheduler_invocation(void)
+{
+ invocation_record_t rec;
+ header(rec, ST_INVOCATION);
+ rec.flags = current->flags;
+ put_trace(rec);
+}
+
+void sched_trace_task_arrival(struct task_struct *t)
+{
+ arrival_record_t rec;
+ header(rec, ST_ARRIVAL);
+ tinfo(rec.task, t);
+ put_trace(rec);
+}
+
+
+void sched_trace_task_departure(struct task_struct *t)
+{
+ departure_record_t rec;
+ header(rec, ST_DEPARTURE);
+ tinfo(rec.task, t);
+ put_trace(rec);
+}
+
+void sched_trace_task_preemption(struct task_struct *t, struct task_struct* by)
+{
+ preemption_record_t rec;
+ header(rec, ST_PREEMPTION);
+ tinfo(rec.task, t);
+ tinfo(rec.by, by);
+ put_trace(rec);
+}
+
+
+void sched_trace_task_scheduled(struct task_struct *t)
+{
+ scheduled_record_t rec;
+ header(rec, ST_SCHEDULED);
+ tinfo(rec.task, t);
+ put_trace(rec);
+}
+
+
+void sched_trace_job_release(struct task_struct *t)
+{
+ release_record_t rec;
+ header(rec, ST_JOB_RELEASE);
+ tinfo(rec.task, t);
+ rtinfo(rec, t);
+ put_trace(rec);
+}
+
+void sched_trace_job_completion(struct task_struct *t)
+{
+ completion_record_t rec;
+ header(rec, ST_JOB_COMPLETION);
+ tinfo(rec.task, t);
+ rtinfo(rec, t);
+ rec.tardiness = jiffies - t->rt_param.times.deadline;
+ put_trace(rec);
+}
+
+
+void sched_trace_server_scheduled(int id, task_class_t class,
+ unsigned int budget, jiffie_t deadline)
+{
+ scheduled_record_t rec;
+ header(rec, ST_SCHEDULED);
+ rec.task.pid = id;
+ rec.task.is_rt = 1;
+ rec.task.is_server = 1;
+ rec.task.class = class;
+ rec.task.budget = budget;
+ rec.task.deadline = deadline;
+ put_trace(rec);
+}
+
+void sched_trace_server_release(int id, unsigned int wcet,
+ unsigned int period, task_class_t class)
+{
+ release_record_t rec;
+ header(rec, ST_JOB_RELEASE);
+ rec.task.pid = id;
+ rec.task.is_rt = 1;
+ rec.task.is_server = 1;
+ rec.task.class = class;
+ rec.task.budget = wcet;
+ rec.period = period;
+ rec.wcet = wcet;
+ put_trace(rec);
+}
+
+void sched_trace_server_completion(int id, unsigned int budget,
+ jiffie_t deadline, task_class_t class)
+{
+ completion_record_t rec;
+ header(rec, ST_JOB_COMPLETION);
+ rec.task.pid = id;
+ rec.task.is_rt = 1;
+ rec.task.is_server = 1;
+ rec.task.class = class;
+ rec.task.budget = budget;
+ rec.task.deadline = deadline;
+ rec.period = 0;
+ rec.tardiness = jiffies - deadline;
+ put_trace(rec);
+
+}
+
+void sched_trace_capacity_release(struct task_struct *t)
+{
+ cap_release_record_t rec;
+ header(rec, ST_CAPACITY_RELEASE);
+ tinfo(rec.task, t);
+ put_trace(rec);
+}
+
+void sched_trace_capacity_allocation(struct task_struct *t, u16 budget, u32 deadline,
+ pid_t donor)
+{
+ cap_allocation_record_t rec;
+ header(rec, ST_CAPACITY_ALLOCATION);
+ tinfo(rec.task, t);
+ rec.donor = donor;
+ rec.budget = budget;
+ rec.deadline = deadline;
+ put_trace(rec);
+}
+
+void sched_trace_capacity_alloc_srv(pid_t srv, u32 srv_dl, task_class_t cls,
+ u16 srv_budget,
+ u16 budget, u32 deadline, pid_t donor)
+{
+ cap_allocation_record_t rec;
+ header(rec, ST_CAPACITY_ALLOCATION);
+ rec.task.pid = srv;
+ rec.task.is_rt = 1;
+ rec.task.is_server = 1;
+ rec.task.class = cls;
+ rec.task.budget = srv_budget;
+ rec.task.deadline = srv_dl;
+ rec.donor = donor;
+ rec.budget = budget;
+ rec.deadline = deadline;
+ put_trace(rec);
+}
+
+#endif
diff --git a/kernel/timer.c b/kernel/timer.c
index c2a8ccf..77a1b6b 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -737,6 +737,27 @@ static inline s64 __get_nsec_offset(void)
return ns_offset;
}
+/* Non-static, non-inline, public version of the function above.
+ * It is up to the caller to decide how to use it; no guarantees
+ * are made here.
+ */
+s64 get_nsec_offset(void)
+{
+ cycle_t cycle_now, cycle_delta;
+ s64 ns_offset;
+
+ /* read clocksource: */
+ cycle_now = clocksource_read(clock);
+
+ /* calculate the delta since the last update_wall_time: */
+ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
+
+ /* convert to nanoseconds: */
+ ns_offset = cyc2ns(clock, cycle_delta);
+
+ return ns_offset;
+}
+
/**
* __get_realtime_clock_ts - Returns the time of day in a timespec
* @ts: pointer to the timespec to be set
@@ -789,6 +810,7 @@ void do_gettimeofday(struct timeval *tv)
}
EXPORT_SYMBOL(do_gettimeofday);
+
/**
* do_settimeofday - Sets the time of day
* @tv: pointer to the timespec variable containing the new time
diff --git a/kernel/trace.c b/kernel/trace.c
new file mode 100644
index 0000000..ecebe6c
--- /dev/null
+++ b/kernel/trace.c
@@ -0,0 +1,257 @@
+#include <linux/fs.h>
+#include <linux/cdev.h>
+#include <asm/semaphore.h>
+#include <asm/uaccess.h>
+#include <linux/module.h>
+
+#include <linux/trace.h>
+
+/******************************************************************************/
+/* Allocation */
+/******************************************************************************/
+
+struct ft_buffer* trace_ts_buf = NULL;
+
+static unsigned int ts_seq_no = 0;
+
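+/* Feather-Trace callback: records the event id, a global sequence number,
+ * the TSC value, and the CPU into the timestamp buffer (the sample is
+ * dropped if no free slot is available).
+ */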
+feather_callback void save_timestamp(unsigned long event)
+{
+ unsigned int seq_no = fetch_and_inc((int *) &ts_seq_no);
+ struct timestamp *ts;
+ if (ft_buffer_start_write(trace_ts_buf, (void**) &ts)) {
+ ts->event = event;
+ ts->timestamp = ft_read_tsc();
+ ts->seq_no = seq_no;
+ ts->cpu = raw_smp_processor_id();
+ ft_buffer_finish_write(trace_ts_buf, ts);
+ }
+}
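+
+/* Instrumentation sketch: a static Feather-Trace trigger placed in a code
+ * path of interest calls save_timestamp() with a caller-chosen event id.
+ * The id 100 and the function name below are made up for this example
+ * (this file only uses id 666 as a dummy); events can be switched on and
+ * off through the ft_trace device (see below).
+ *
+ *	void my_measured_path(void)
+ *	{
+ *		ft_event0(100, save_timestamp);	// records id, TSC, seq_no, cpu
+ *		// ... code being timed ...
+ *	}
+ */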
+
+static struct ft_buffer* alloc_ft_buffer(unsigned int count, size_t size)
+{
+ struct ft_buffer* buf;
+ size_t total = (size + 1) * count;
+ char* mem;
+ int order = 0, pages = 1;
+
+ buf = kmalloc(sizeof(struct ft_buffer), GFP_KERNEL);
+ if (!buf)
+ return NULL;
+
+ total = (total / PAGE_SIZE) + (total % PAGE_SIZE != 0);
+ while (pages < total) {
+ order++;
+ pages *= 2;
+ }
+
+ mem = (char*) __get_free_pages(GFP_KERNEL, order);
+ if (!mem) {
+ kfree(buf);
+ return NULL;
+ }
+
+ if (!init_ft_buffer(buf, count, size,
+ mem + (count * size), /* markers at the end */
+ mem)) { /* buffer objects */
+ free_pages((unsigned long) mem, order);
+ kfree(buf);
+ return NULL;
+ }
+ return buf;
+}
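+
+/* Worked example (assuming 4 KB pages): 1000 slots of 24 bytes each need
+ * (24 + 1) * 1000 = 25000 bytes = 7 pages, which is rounded up to the next
+ * power of two, i.e. 8 pages (order 3).
+ */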
+
+static void free_ft_buffer(struct ft_buffer* buf)
+{
+ int order = 0, pages = 1;
+ size_t total;
+
+ if (buf) {
+ total = (buf->slot_size + 1) * buf->slot_count;
+ total = (total / PAGE_SIZE) + (total % PAGE_SIZE != 0);
+ while (pages < total) {
+ order++;
+ pages *= 2;
+ }
+ free_pages((unsigned long) buf->buffer_mem, order);
+ kfree(buf);
+ }
+}
+
+
+/******************************************************************************/
+/* DEVICE FILE DRIVER */
+/******************************************************************************/
+
+#define NO_TIMESTAMPS 262144
+
+static int trace_release(struct inode *in, struct file *filp)
+{
+	int error = 0;
+
+ /* disable events */
+ ft_disable_all_events();
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ /* wait for any pending events to complete */
+ schedule_timeout(HZ);
+ printk(KERN_ALERT "Failed trace writes: %u\n",
+ trace_ts_buf->failed_writes);
+ free_ft_buffer(trace_ts_buf);
+ trace_ts_buf = NULL;
+ return error;
+}
+
+static ssize_t trace_read(struct file *filp, char __user *to, size_t len,
+ loff_t *f_pos)
+{
+ /* we ignore f_pos, this is strictly sequential */
+ ssize_t error = 0;
+ struct timestamp ts;
+
+ while (len >= sizeof(struct timestamp)) {
+ if (ft_buffer_read(trace_ts_buf, &ts)) {
+ if (copy_to_user(to, &ts, sizeof(struct timestamp))) {
+ error = -EFAULT;
+ break;
+ } else {
+ len -= sizeof(struct timestamp);
+ to += sizeof(struct timestamp);
+ error += sizeof(struct timestamp);
+ }
+ } else {
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(50);
+ if (signal_pending(current)) {
+ error = -ERESTARTSYS;
+ break;
+ }
+ }
+ }
+ return error;
+}
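+
+/* User-space sketch for draining the timestamp stream. The node name
+ * /dev/ft_trace is an assumption; the binary record layout is struct
+ * timestamp from <linux/trace.h>.
+ *
+ *	#include <fcntl.h>
+ *	#include <unistd.h>
+ *
+ *	int fd = open("/dev/ft_trace", O_RDONLY);
+ *	char buf[4096];			// room for many records
+ *	ssize_t n;
+ *
+ *	while ((n = read(fd, buf, sizeof(buf))) > 0)
+ *		write(STDOUT_FILENO, buf, n);	// or decode the records
+ */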
+
+#define ENABLE_CMD 0
+#define DISABLE_CMD 1
+
+static ssize_t trace_write(struct file *filp, const char __user *from,
+ size_t len, loff_t *f_pos)
+{
+ ssize_t error = -EINVAL;
+ unsigned long cmd;
+ unsigned long id;
+
+ if (len % sizeof(long) || len < 2 * sizeof(long))
+ goto out;
+
+ if (copy_from_user(&cmd, from, sizeof(long))) {
+ error = -EFAULT;
+ goto out;
+ }
+ len -= sizeof(long);
+ from += sizeof(long);
+
+ if (cmd != ENABLE_CMD && cmd != DISABLE_CMD)
+ goto out;
+
+ error = sizeof(long);
+ while (len) {
+ if (copy_from_user(&id, from, sizeof(long))) {
+ error = -EFAULT;
+ goto out;
+ }
+ len -= sizeof(long);
+ from += sizeof(long);
+ if (cmd) {
+ printk(KERN_INFO
+ "Disabling feather-trace event %lu.\n", id);
+ ft_disable_event(id);
+ } else {
+ printk(KERN_INFO
+ "Enabling feather-trace event %lu.\n", id);
+ ft_enable_event(id);
+ }
+ error += sizeof(long);
+ }
+
+
+ out:
+ return error;
+}
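+
+/* The write() protocol thus consists of native longs: the first long is
+ * the command (ENABLE_CMD or DISABLE_CMD), followed by one or more event
+ * ids. A user-space sketch (the device node name and event id 100 are
+ * assumptions for this example):
+ *
+ *	#include <fcntl.h>
+ *	#include <unistd.h>
+ *
+ *	long cmd[2] = { 0, 100 };	// { ENABLE_CMD, event id }
+ *	int fd = open("/dev/ft_trace", O_RDWR);
+ *
+ *	write(fd, cmd, sizeof(cmd));	// enable event 100
+ *	cmd[0] = 1;			// DISABLE_CMD
+ *	write(fd, cmd, sizeof(cmd));	// disable it again
+ */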
+
+static int trace_open(struct inode *in, struct file *filp)
+{
+ int err = 0;
+ unsigned int count = NO_TIMESTAMPS;
+ while (count && !trace_ts_buf) {
+ printk("trace: trying to allocate %u time stamps.\n", count);
+ trace_ts_buf = alloc_ft_buffer(count, sizeof(struct timestamp));
+ count /= 2;
+ }
+ if (!trace_ts_buf)
+ err = -ENOMEM;
+
+ return err;
+}
+
+/******************************************************************************/
+/* Device Registration */
+/******************************************************************************/
+
+#define FT_TRACE_MAJOR 252
+
+struct file_operations ft_trace_fops = {
+ .owner = THIS_MODULE,
+ .open = trace_open,
+ .release = trace_release,
+ .write = trace_write,
+ .read = trace_read,
+};
+
+
+static int __init register_buffer_dev(const char* name,
+ struct file_operations* fops,
+ int major, int count)
+{
+ dev_t trace_dev;
+ struct cdev *cdev;
+ int error = 0;
+
+ trace_dev = MKDEV(major, 0);
+ error = register_chrdev_region(trace_dev, count, name);
+ if (error)
+ {
+ printk(KERN_WARNING "trace: "
+ "Could not register major/minor number %d\n", major);
+ return error;
+ }
+ cdev = cdev_alloc();
+ if (!cdev) {
+ printk(KERN_WARNING "trace: "
+ "Could not get a cdev for %s.\n", name);
+ return -ENOMEM;
+ }
+ cdev->owner = THIS_MODULE;
+ cdev->ops = fops;
+ error = cdev_add(cdev, trace_dev, count);
+ if (error) {
+ printk(KERN_WARNING "trace: "
+ "add_cdev failed for %s.\n", name);
+ return -ENOMEM;
+ }
+ return error;
+
+}
+
+static int __init init_sched_trace(void)
+{
+ int error = 0;
+
+ printk("Initializing Feather-Trace device\n");
+ /* dummy entry to make linker happy */
+ ft_event0(666, save_timestamp);
+
+ error = register_buffer_dev("ft_trace", &ft_trace_fops,
+ FT_TRACE_MAJOR, 1);
+ return error;
+}
+
+module_init(init_sched_trace);
diff --git a/lib/semaphore-sleepers.c b/lib/semaphore-sleepers.c
index 1281805..3f4d543 100644
--- a/lib/semaphore-sleepers.c
+++ b/lib/semaphore-sleepers.c
@@ -108,7 +108,7 @@ fastcall int __sched __down_interruptible(struct semaphore * sem)
/*
* With signals pending, this turns into
* the trylock failure case - we won't be
- * sleeping, and we* can't get the lock as
+ * sleeping, and we can't get the lock as
* it has contention. Just correct the count
* and exit.
*/