author	Bjoern Brandenburg <bbb@bbb1-cs.cs.unc.edu>	2008-01-28 14:13:24 -0500
committer	Bjoern Brandenburg <bbb@bbb1-cs.cs.unc.edu>	2008-01-28 14:13:24 -0500
commit	d3605639a4e641ae7591734f9e8f836605e58f1c (patch)
tree	d45d12db3f6dc22412e137e835670c9a8779b215
parent	24a3d78b334a52123f168a451fa4a5db4bb157e0 (diff)
release LITMUS 2007.3
-rw-r--r--	download/2007.3/SHA256SUMS		|     3
-rw-r--r--	download/2007.3/liblitmus-2007.3.tgz	|   bin 0 -> 13351 bytes
-rw-r--r--	download/2007.3/libso-2007.3.tgz	|   bin 0 -> 14815 bytes
-rw-r--r--	download/2007.3/litmus-rt-2007.3.patch	| 12859
-rw-r--r--	index.html				|   107
5 files changed, 12935 insertions(+), 34 deletions(-)
diff --git a/download/2007.3/SHA256SUMS b/download/2007.3/SHA256SUMS
new file mode 100644
index 0000000..3adb07b
--- /dev/null
+++ b/download/2007.3/SHA256SUMS
@@ -0,0 +1,3 @@
18f1f5335de7a1aab158adf90aa7010eea066c5dd153b6f98b2d4bb1785682e3b liblitmus-2007.3.tgz
25989f228bdadfd52633344e55e1b6db49f44b72e699c05fdde96832e08bca47c libso-2007.3.tgz
34d51589a5cb92b9c7df11de4b550700493d067c2fa2b0228c464ff4c18436941 litmus-rt-2007.3.patch
diff --git a/download/2007.3/liblitmus-2007.3.tgz b/download/2007.3/liblitmus-2007.3.tgz
new file mode 100644
index 0000000..17ff20a
--- /dev/null
+++ b/download/2007.3/liblitmus-2007.3.tgz
Binary files differ
diff --git a/download/2007.3/libso-2007.3.tgz b/download/2007.3/libso-2007.3.tgz
new file mode 100644
index 0000000..97c6437
--- /dev/null
+++ b/download/2007.3/libso-2007.3.tgz
Binary files differ
diff --git a/download/2007.3/litmus-rt-2007.3.patch b/download/2007.3/litmus-rt-2007.3.patch
new file mode 100644
index 0000000..a81602a
--- /dev/null
+++ b/download/2007.3/litmus-rt-2007.3.patch
@@ -0,0 +1,12859 @@
1 arch/i386/Kconfig | 28 +
2 arch/i386/kernel/apic.c | 92 ++
3 arch/i386/kernel/i386_ksyms.c | 1 +
4 arch/i386/kernel/signal.c | 13 +
5 arch/i386/kernel/syscall_table.S | 27 +
6 fs/Makefile | 2 +-
7 fs/exec.c | 5 +-
8 fs/fdso.c | 281 +++++++
9 fs/inode.c | 2 +
10 include/asm-i386/thread_info.h | 2 +
11 include/asm-i386/unistd.h | 28 +-
12 include/linux/edf_common.h | 36 +
13 include/linux/fdso.h | 70 ++
14 include/linux/feather_buffer.h | 108 +++
15 include/linux/feather_trace.h | 93 ++
16 include/linux/fifo_common.h | 18 +
17 include/linux/fpmath.h | 111 +++
18 include/linux/fs.h | 5 +
19 include/linux/ics.h | 35 +
20 include/linux/list.h | 30 +
21 include/linux/litmus.h | 141 ++++
22 include/linux/pfair_common.h | 40 +
23 include/linux/pfair_math.h | 80 ++
24 include/linux/queuelock.h | 98 +++
25 include/linux/rt_domain.h | 98 +++
26 include/linux/rt_param.h | 277 ++++++
27 include/linux/sched.h | 14 +
28 include/linux/sched_plugin.h | 147 ++++
29 include/linux/sched_trace.h | 182 ++++
30 include/linux/trace.h | 74 ++
31 include/linux/uaccess.h | 16 +
32 include/linux/wait.h | 2 +
33 kernel/Makefile | 8 +-
34 kernel/edf_common.c | 135 +++
35 kernel/exit.c | 4 +
36 kernel/fifo_common.c | 86 ++
37 kernel/fork.c | 5 +
38 kernel/ft_event.c | 104 +++
39 kernel/ics.c | 229 +++++
40 kernel/litmus.c | 1034 +++++++++++++++++++++++
41 kernel/litmus_sem.c | 567 +++++++++++++
42 kernel/pfair_common.c | 237 ++++++
43 kernel/rt_domain.c | 185 ++++
44 kernel/sched.c | 204 ++++-
45 kernel/sched_adaptive.c | 1454 ++++++++++++++++++++++++++++++++
46 kernel/sched_edf_hsb.c | 1724 ++++++++++++++++++++++++++++++++++++++
47 kernel/sched_global_edf.c | 550 ++++++++++++
48 kernel/sched_gsn_edf.c | 816 ++++++++++++++++++
49 kernel/sched_part_edf.c | 340 ++++++++
50 kernel/sched_pfair.c | 503 +++++++++++
51 kernel/sched_plugin.c | 108 +++
52 kernel/sched_psn_edf.c | 523 ++++++++++++
53 kernel/sched_trace.c | 755 +++++++++++++++++
54 kernel/timer.c | 22 +
55 kernel/trace.c | 302 +++++++
56 lib/semaphore-sleepers.c | 2 +-
57 56 files changed, 12028 insertions(+), 25 deletions(-)
58
59diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
60index 0dfee81..da6f1e9 100644
61--- a/arch/i386/Kconfig
62+++ b/arch/i386/Kconfig
63@@ -1210,6 +1210,7 @@ config KPROBES
64 a probepoint and specifies the callback. Kprobes is useful
65 for kernel debugging, non-intrusive instrumentation and testing.
66 If in doubt, say "N".
67+
68 endmenu
69
70 source "arch/i386/Kconfig.debug"
71@@ -1259,3 +1260,30 @@ config X86_TRAMPOLINE
72 config KTIME_SCALAR
73 bool
74 default y
75+
76+
77+menu "LITMUS^RT"
78+
79+
80+config SCHED_TASK_TRACE
81+ bool "Trace real-time tasks"
82+ default y
83+ help
84+ Include support for the sched_trace_XXX() tracing functions. This
85+ allows the collection of real-time task events such as job
86+ completions, job releases, early completions, etc. This results in a
87+ small overhead in the scheduling code. Disable if the overhead is not
88+ acceptable (e.g., benchmarking).
89+
90+config SCHED_DEBUG_TRACE
91+ bool "TRACE() debugging"
92+ default y
93+ help
94+	  Include support for sched_trace_log_message(), which is used to
95+ implement TRACE(). If disabled, no TRACE() messages will be included
96+ in the kernel, and no overheads due to debugging statements will be
97+ incurred by the scheduler. Disable if the overhead is not acceptable
98+ (e.g. benchmarking).
99+
100+
101+endmenu
102diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c
103index 776d9be..2e8909f 100644
104--- a/arch/i386/kernel/apic.c
105+++ b/arch/i386/kernel/apic.c
106@@ -26,6 +26,7 @@
107 #include <linux/sysdev.h>
108 #include <linux/cpu.h>
109 #include <linux/module.h>
110+#include <linux/litmus.h>
111
112 #include <asm/atomic.h>
113 #include <asm/smp.h>
114@@ -43,6 +44,8 @@
115
116 #include "io_ports.h"
117
118+#include <linux/trace.h>
119+
120 /*
121 * cpu_mask that denotes the CPUs that needs timer interrupt coming in as
122 * IPIs in place of local APIC timers
123@@ -54,6 +57,15 @@ static cpumask_t timer_bcast_ipi;
124 */
125 static int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */
126
127+/*
128+ * Definitions and variables related to quantum synchronization.
129+ */
130+#define WAIT_TO_SYNC 30000 /* time after boot until sync */
131+static int stagger = 0; /* are we using staggered quanta? */
132+static atomic_t qsync_time = ATOMIC_INIT(INITIAL_JIFFIES);
133+static atomic_t quantum_sync_barrier = ATOMIC_INIT(0);
134+static atomic_t sync_done = ATOMIC_INIT(0);
135+
136 static inline void lapic_disable(void)
137 {
138 enable_local_apic = -1;
139@@ -786,6 +798,23 @@ static int __init apic_set_verbosity(char *str)
140
141 __setup("apic=", apic_set_verbosity);
142
143+/*
144+ * Determine whether to use aligned or staggered quanta.
145+ */
146+
147+static int __init apic_synch_type(char *str)
148+{
149+ if (strcmp("aligned", str) == 0)
150+ stagger = 0;
151+ else if (strcmp("staggered", str) == 0)
152+ stagger = 1;
153+ else
154+ stagger = 0; /* aligned quanta by default */
155+ return 1;
156+}
157+
158+__setup("quanta=", apic_synch_type);
159+
160 static int __init detect_init_APIC (void)
161 {
162 u32 h, l, features;
163@@ -1198,6 +1227,47 @@ EXPORT_SYMBOL(switch_ipi_to_APIC_timer);
164 #undef APIC_DIVISOR
165
166 /*
167+ * This function is called to align all quanta, and to stagger quanta if
168+ * necessary. It relies on a barrier to synchronize all processors, so
169+ * that they all reset their APIC timers at the same time. If quanta
170+ * should be staggered, the appropriate stagger delay is then added at
171+ * each processor.
172+ */
173+
174+void synchronize_quanta(void)
175+{
176+ int cpu = smp_processor_id();
177+ int total_cpus = num_online_cpus();
178+ int stagger_interval = jiffies_to_usecs(1) / total_cpus;
179+
180+ /*
181+ * Disable APIC timer, wait for all other processors to reach barrier,
182+ * and re-enable all timers concurrently.
183+ */
184+ disable_APIC_timer();
185+ atomic_inc(&quantum_sync_barrier);
186+ while (atomic_read(&quantum_sync_barrier) < total_cpus) {
187+ /* Delay, otherwise atomic_inc's cannot occur. */
188+ udelay(1);
189+ }
190+
191+ /* Add necessary stagger for this CPU, if required. */
192+ if (stagger) {
193+ int stagger_us = cpu * stagger_interval;
194+ udelay(stagger_us);
195+ }
196+
197+ /* Re-enable all timers. */
198+ __setup_APIC_LVTT(calibration_result);
199+ enable_APIC_timer();
200+
201+ /* The first CPU signals that quantum sync is complete. */
202+ if (cpu == 0)
203+ atomic_inc(&sync_done);
204+}
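/*
 * Editorial note (not part of the patch): a quick worked example of the
 * stagger computation in synchronize_quanta() above, assuming HZ=1000
 * (so jiffies_to_usecs(1) == 1000us) and 4 online CPUs:
 *
 *   stagger_interval = 1000us / 4 = 250us
 *   CPU 0 delays 0us, CPU 1 delays 250us, CPU 2 500us, CPU 3 750us
 *
 * That is, with "quanta=staggered" on the kernel command line the per-CPU
 * quantum boundaries are spread evenly across one 1ms quantum after the
 * barrier; with "quanta=aligned" (the default) all CPUs simply re-arm
 * their APIC timers together. The synchronization itself is triggered
 * roughly WAIT_TO_SYNC (30s) after boot from the timer interrupt below.
 */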
205+
206+
207+/*
208 * Local timer interrupt handler. It does both profiling and
209 * process statistics/rescheduling.
210 *
211@@ -1209,11 +1279,32 @@ EXPORT_SYMBOL(switch_ipi_to_APIC_timer);
212
213 inline void smp_local_timer_interrupt(void)
214 {
215+/* s64 offset; */
216+
217+ TS_TICK_START;
218+
219 profile_tick(CPU_PROFILING);
220 #ifdef CONFIG_SMP
221 update_process_times(user_mode_vm(get_irq_regs()));
222 #endif
223
224+ /* Print out timing data - can be commented out if necessary. */
225+/* offset = get_nsec_offset(); */
226+/* TRACE("%d\n", offset); */
227+
228+ /*
229+ * Synchronize quanta if we have reached qsync_time plus wait
230+ * interval. The synchronization code itself is placed in its own
231+ * (non-inline) function, to avoid issues with creating an inline
232+ * function that is too large.
233+ */
234+ if (unlikely(!atomic_read(&sync_done) &&
235+ time_after(jiffies,
236+ (unsigned long)(atomic_read(&qsync_time) +
237+ msecs_to_jiffies(WAIT_TO_SYNC))))) {
238+ synchronize_quanta();
239+ }
240+
241 /*
242 * We take the 'long' return path, and there every subsystem
243 * grabs the apropriate locks (kernel lock/ irq lock).
244@@ -1224,6 +1315,7 @@ inline void smp_local_timer_interrupt(void)
245 * Currently this isn't too much of an issue (performance wise),
246 * we can take more than 100K local irqs per second on a 100 MHz P5.
247 */
248+ TS_TICK_END;
249 }
250
251 /*
252diff --git a/arch/i386/kernel/i386_ksyms.c b/arch/i386/kernel/i386_ksyms.c
253index e3d4b73..9670f77 100644
254--- a/arch/i386/kernel/i386_ksyms.c
255+++ b/arch/i386/kernel/i386_ksyms.c
256@@ -6,6 +6,7 @@ EXPORT_SYMBOL(__down_failed);
257 EXPORT_SYMBOL(__down_failed_interruptible);
258 EXPORT_SYMBOL(__down_failed_trylock);
259 EXPORT_SYMBOL(__up_wakeup);
260+
261 /* Networking helper routines. */
262 EXPORT_SYMBOL(csum_partial_copy_generic);
263
264diff --git a/arch/i386/kernel/signal.c b/arch/i386/kernel/signal.c
265index 65d7620..7415518 100644
266--- a/arch/i386/kernel/signal.c
267+++ b/arch/i386/kernel/signal.c
268@@ -27,6 +27,8 @@
269 #include <asm/i387.h>
270 #include "sigframe.h"
271
272+#include <linux/ics.h>
273+
274 #define DEBUG_SIG 0
275
276 #define _BLOCKABLE (~(sigmask(SIGKILL) | sigmask(SIGSTOP)))
277@@ -653,5 +655,16 @@ void do_notify_resume(struct pt_regs *regs, void *_unused,
278 if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK))
279 do_signal(regs);
280
281+ if (thread_info_flags & _TIF_ROLLBACK_RCS) {
282+ long addr = (long) get_rollback_addr();
283+ if (addr) {
284+ ICS_DBG(KERN_DEBUG "do_notify_resume(): eip 0x%lx -> "
285+ "0x%lx\n", regs->eip, addr);
286+ regs->eip = addr;
287+
288+ }
289+ clear_thread_flag(TIF_ROLLBACK_RCS);
290+ }
291+
292 clear_thread_flag(TIF_IRET);
293 }
294diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
295index 2697e92..32f7d54 100644
296--- a/arch/i386/kernel/syscall_table.S
297+++ b/arch/i386/kernel/syscall_table.S
298@@ -319,3 +319,30 @@ ENTRY(sys_call_table)
299 .long sys_move_pages
300 .long sys_getcpu
301 .long sys_epoll_pwait
302+ /* LITMUS syscalls */
303+ .long sys_sched_setpolicy /* 320 */
304+ .long sys_sched_getpolicy
305+ .long sys_set_rt_mode
306+ .long sys_set_rt_task_param
307+ .long sys_get_rt_task_param
308+ .long sys_ni_syscall /* 325 */
309+ .long sys_sleep_next_period
310+ .long sys_scheduler_setup
311+ .long sys_register_np_flag
312+ .long sys_exit_np
313+ .long sys_od_open /* 330 */
314+ .long sys_od_close
315+ .long sys_pi_down
316+ .long sys_pi_up
317+ .long sys_srp_down
318+ .long sys_srp_up /* 335 */
319+ .long sys_reg_task_srp_sem
320+ .long sys_query_job_no
321+ .long sys_wait_for_job_release
322+ .long sys_set_service_levels
323+ .long sys_get_cur_service_level /* 340 */
324+ .long sys_reg_ics_cb
325+ .long sys_start_wcs
326+ .long sys_task_mode_transition /* 343 */
327+
328+
329diff --git a/fs/Makefile b/fs/Makefile
330index b9ffa63..318c0f7 100644
331--- a/fs/Makefile
332+++ b/fs/Makefile
333@@ -11,7 +11,7 @@ obj-y := open.o read_write.o file_table.o super.o \
334 attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \
335 seq_file.o xattr.o libfs.o fs-writeback.o \
336 pnode.o drop_caches.o splice.o sync.o utimes.o \
337- stack.o
338+ stack.o fdso.o
339
340 ifeq ($(CONFIG_BLOCK),y)
341 obj-y += buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
342diff --git a/fs/exec.c b/fs/exec.c
343index 11fe93f..29498a9 100644
344--- a/fs/exec.c
345+++ b/fs/exec.c
346@@ -54,6 +54,8 @@
347 #include <asm/uaccess.h>
348 #include <asm/mmu_context.h>
349
350+#include <linux/litmus.h>
351+
352 #ifdef CONFIG_KMOD
353 #include <linux/kmod.h>
354 #endif
355@@ -1140,7 +1142,8 @@ int do_execve(char * filename,
356 if (IS_ERR(file))
357 goto out_kfree;
358
359- sched_exec();
360+ sched_exec();
361+ litmus_exec();
362
363 bprm->p = PAGE_SIZE*MAX_ARG_PAGES-sizeof(void *);
364
365diff --git a/fs/fdso.c b/fs/fdso.c
366new file mode 100644
367index 0000000..e639020
368--- /dev/null
369+++ b/fs/fdso.c
370@@ -0,0 +1,281 @@
371+/* fdso.c - file descriptor attached shared objects
372+ *
373+ * (c) 2007 B. Brandenburg, LITMUS^RT project
374+ *
375+ * Notes:
376+ *   - object descriptor (OD) tables are not cloned during a fork.
377+ * - objects are created on-demand, and freed after the last reference
378+ * is dropped.
379+ * - for now, object types are hard coded.
380+ * - As long as we have live objects, we keep a reference to the inode.
381+ */
382+
383+#include <linux/errno.h>
384+#include <linux/sched.h>
385+#include <linux/mutex.h>
386+#include <linux/file.h>
387+#include <asm/uaccess.h>
388+
389+#include <linux/fdso.h>
390+
391+extern struct fdso_ops pi_sem_ops;
392+extern struct fdso_ops srp_sem_ops;
393+extern struct fdso_ops ics_ops;
394+
395+static const struct fdso_ops* fdso_ops[] = {
396+ &pi_sem_ops,
397+ &srp_sem_ops,
398+ &ics_ops
399+};
400+
401+static void* fdso_create(obj_type_t type)
402+{
403+ return fdso_ops[type]->create();
404+}
405+
406+static void fdso_destroy(obj_type_t type, void* obj)
407+{
408+ fdso_ops[type]->destroy(obj);
409+}
410+
411+static int fdso_open(struct od_table_entry* entry, void* __user config)
412+{
413+ if (fdso_ops[entry->obj->type]->open)
414+ return fdso_ops[entry->obj->type]->open(entry, config);
415+ else
416+ return 0;
417+}
418+
419+static int fdso_close(struct od_table_entry* entry)
420+{
421+ if (fdso_ops[entry->obj->type]->close)
422+ return fdso_ops[entry->obj->type]->close(entry);
423+ else
424+ return 0;
425+}
426+
427+/* inode must be locked already */
428+static struct inode_obj_id* alloc_inode_obj(struct inode* inode,
429+ obj_type_t type,
430+ unsigned int id)
431+{
432+ struct inode_obj_id* obj;
433+ void* raw_obj;
434+
435+ raw_obj = fdso_create(type);
436+ if (!raw_obj)
437+ return NULL;
438+
439+ obj = kmalloc(sizeof(struct inode_obj_id), GFP_KERNEL);
440+ if (!obj)
441+ return NULL;
442+ INIT_LIST_HEAD(&obj->list);
443+ atomic_set(&obj->count, 1);
444+ obj->type = type;
445+ obj->id = id;
446+ obj->obj = raw_obj;
447+ obj->inode = inode;
448+
449+ list_add(&obj->list, &inode->i_obj_list);
450+ atomic_inc(&inode->i_count);
451+
452+ printk(KERN_DEBUG "alloc_inode_obj(%p, %d, %d): object created\n", inode, type, id);
453+ return obj;
454+}
455+
456+/* inode must be locked already */
457+static struct inode_obj_id* get_inode_obj(struct inode* inode,
458+ obj_type_t type,
459+ unsigned int id)
460+{
461+ struct list_head* pos;
462+ struct inode_obj_id* obj = NULL;
463+
464+ list_for_each(pos, &inode->i_obj_list) {
465+ obj = list_entry(pos, struct inode_obj_id, list);
466+ if (obj->id == id && obj->type == type) {
467+ atomic_inc(&obj->count);
468+ return obj;
469+ }
470+ }
471+ printk(KERN_DEBUG "get_inode_obj(%p, %d, %d): couldn't find object\n", inode, type, id);
472+ return NULL;
473+}
474+
475+
476+static void put_inode_obj(struct inode_obj_id* obj)
477+{
478+ struct inode* inode;
479+ int let_go = 0;
480+
481+ inode = obj->inode;
482+ if (atomic_dec_and_test(&obj->count)) {
483+
484+ mutex_lock(&inode->i_obj_mutex);
485+ /* no new references can be obtained */
486+ if (!atomic_read(&obj->count)) {
487+ list_del(&obj->list);
488+ fdso_destroy(obj->type, obj->obj);
489+ kfree(obj);
490+ let_go = 1;
491+ }
492+ mutex_unlock(&inode->i_obj_mutex);
493+ if (let_go)
494+ iput(inode);
495+ }
496+}
497+
498+static struct od_table_entry* get_od_entry(struct task_struct* t)
499+{
500+ struct od_table_entry* table;
501+ int i;
502+
503+
504+ table = t->od_table;
505+ if (!table) {
506+ table = (struct od_table_entry*)
507+ kzalloc(sizeof(struct od_table_entry) *
508+ MAX_OBJECT_DESCRIPTORS, GFP_KERNEL);
509+ t->od_table = table;
510+ }
511+
512+ for (i = 0; table && i < MAX_OBJECT_DESCRIPTORS; i++)
513+ if (!table[i].used) {
514+ table[i].used = 1;
515+ return table + i;
516+ }
517+ return NULL;
518+}
519+
520+static int put_od_entry(struct od_table_entry* od)
521+{
522+ put_inode_obj(od->obj);
523+ od->used = 0;
524+ return 0;
525+}
526+
527+void exit_od_table(struct task_struct* t)
528+{
529+ int i;
530+
531+ if (t->od_table) {
532+ for (i = 0; i < MAX_OBJECT_DESCRIPTORS; i++)
533+ if (t->od_table[i].used)
534+ put_od_entry(t->od_table + i);
535+ kfree(t->od_table);
536+ t->od_table = NULL;
537+ }
538+}
539+
540+static int do_sys_od_open(struct file* file, obj_type_t type, int id,
541+ void* __user config)
542+{
543+ int idx = 0, err;
544+ struct inode* inode;
545+ struct inode_obj_id* obj = NULL;
546+ struct od_table_entry* entry;
547+
548+ inode = file->f_dentry->d_inode;
549+
550+ entry = get_od_entry(current);
551+ if (!entry)
552+ return -ENOMEM;
553+
554+ mutex_lock(&inode->i_obj_mutex);
555+ obj = get_inode_obj(inode, type, id);
556+ if (!obj)
557+ obj = alloc_inode_obj(inode, type, id);
558+ if (!obj) {
559+ idx = -ENOMEM;
560+ entry->used = 0;
561+ } else {
562+ entry->obj = obj;
563+ entry->extra = NULL;
564+ idx = entry - current->od_table;
565+ }
566+
567+ mutex_unlock(&inode->i_obj_mutex);
568+
569+ err = fdso_open(entry, config);
570+ if (err < 0) {
571+ /* The class rejected the open call.
572+ * We need to clean up and tell user space.
573+ */
574+ put_od_entry(entry);
575+ idx = err;
576+ }
577+
578+ return idx;
579+}
580+
581+
582+struct od_table_entry* __od_lookup(int od)
583+{
584+ struct task_struct *t = current;
585+
586+ if (!t->od_table)
587+ return NULL;
588+ if (od < 0 || od >= MAX_OBJECT_DESCRIPTORS)
589+ return NULL;
590+ if (!t->od_table[od].used)
591+ return NULL;
592+ return t->od_table + od;
593+}
594+
595+
596+asmlinkage int sys_od_open(int fd, int type, int obj_id, void* __user config)
597+{
598+ int ret = 0;
599+ struct file* file;
600+
601+ /*
602+ 1) get file from fd, get inode from file
603+ 2) lock inode
604+ 3) try to lookup object
605+ 4) if not present create and enqueue object, inc inode refcnt
606+ 5) increment refcnt of object
607+ 6) alloc od_table_entry, setup ptrs
608+ 7) unlock inode
609+ 8) return offset in od_table as OD
610+ */
611+
612+ if (type < MIN_OBJ_TYPE || type > MAX_OBJ_TYPE) {
613+ ret = -EINVAL;
614+ goto out;
615+ }
616+
617+ file = fget(fd);
618+ if (!file) {
619+ ret = -EBADF;
620+ goto out;
621+ }
622+
623+ ret = do_sys_od_open(file, type, obj_id, config);
624+
625+ fput(file);
626+
627+out:
628+ return ret;
629+}
630+
631+
632+asmlinkage int sys_od_close(int od)
633+{
634+ int ret = -EINVAL;
635+ struct task_struct *t = current;
636+
637+ if (od < 0 || od >= MAX_OBJECT_DESCRIPTORS)
638+ return ret;
639+
640+ if (!t->od_table || !t->od_table[od].used)
641+ return ret;
642+
643+
644+ /* give the class a chance to reject the close
645+ */
646+ ret = fdso_close(t->od_table + od);
647+ if (ret == 0)
648+ ret = put_od_entry(t->od_table + od);
649+
650+ return ret;
651+}
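/*
 * Editorial sketch (not part of the patch): how user space is expected to
 * drive sys_od_open()/sys_od_close(). The raw syscall() usage and the file
 * name below are illustrative only; liblitmus (also part of this release)
 * provides the real wrappers.
 */
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>

#define __NR_od_open	330	/* from include/asm-i386/unistd.h below */
#define __NR_od_close	331
#define PI_SEM		0	/* obj_type_t value from include/linux/fdso.h */

int main(void)
{
	/* any shared file serves as the namespace for object ids */
	int fd = open("shared-objects", O_RDONLY | O_CREAT, 0666);

	/* attach (or create) PI semaphore #1 on that inode; the return
	 * value is an object descriptor, i.e., an index into the calling
	 * task's od_table */
	int od = syscall(__NR_od_open, fd, PI_SEM, 1, NULL);

	/* ... pass 'od' to sys_pi_down()/sys_pi_up() ... */

	syscall(__NR_od_close, od);
	close(fd);
	return 0;
}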
652diff --git a/fs/inode.c b/fs/inode.c
653index bf21dc6..fcf8ce3 100644
654--- a/fs/inode.c
655+++ b/fs/inode.c
656@@ -205,6 +205,8 @@ void inode_init_once(struct inode *inode)
657 INIT_LIST_HEAD(&inode->inotify_watches);
658 mutex_init(&inode->inotify_mutex);
659 #endif
660+ INIT_LIST_HEAD(&inode->i_obj_list);
661+ mutex_init(&inode->i_obj_mutex);
662 }
663
664 EXPORT_SYMBOL(inode_init_once);
665diff --git a/include/asm-i386/thread_info.h b/include/asm-i386/thread_info.h
666index 4b187bb..fd9dd60 100644
667--- a/include/asm-i386/thread_info.h
668+++ b/include/asm-i386/thread_info.h
669@@ -131,6 +131,7 @@ static inline struct thread_info *current_thread_info(void)
670 #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
671 #define TIF_SECCOMP 8 /* secure computing */
672 #define TIF_RESTORE_SIGMASK 9 /* restore signal mask in do_signal() */
673+#define TIF_ROLLBACK_RCS 10 /* set EIP to rollback addr */
674 #define TIF_MEMDIE 16
675 #define TIF_DEBUG 17 /* uses debug registers */
676 #define TIF_IO_BITMAP 18 /* uses I/O bitmap */
677@@ -146,6 +147,7 @@ static inline struct thread_info *current_thread_info(void)
678 #define _TIF_SYSCALL_AUDIT (1<<TIF_SYSCALL_AUDIT)
679 #define _TIF_SECCOMP (1<<TIF_SECCOMP)
680 #define _TIF_RESTORE_SIGMASK (1<<TIF_RESTORE_SIGMASK)
681+#define _TIF_ROLLBACK_RCS (1<<TIF_ROLLBACK_RCS)
682 #define _TIF_DEBUG (1<<TIF_DEBUG)
683 #define _TIF_IO_BITMAP (1<<TIF_IO_BITMAP)
684 #define _TIF_FREEZE (1<<TIF_FREEZE)
685diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
686index 833fa17..ecc7490 100644
687--- a/include/asm-i386/unistd.h
688+++ b/include/asm-i386/unistd.h
689@@ -325,10 +325,36 @@
690 #define __NR_move_pages 317
691 #define __NR_getcpu 318
692 #define __NR_epoll_pwait 319
693+/* LITMUS */
694+#define __NR_sched_setpolicy 320
695+#define __NR_sched_getpolicy 321
696+/* Syscall definitions for mode change and task creation-manipulation */
697+#define __NR_set_rt_mode 322
698+#define __NR_set_rt_task_param 323
699+#define __NR_get_rt_task_param 324
700+#define __NR_prepare_rt_task 325
701+#define __NR_sleep_next_period 326
702+#define __NR_scheduler_setup 327
703+#define __NR_register_np_flag 328
704+#define __NR_exit_np 329
705+#define __NR_od_open 330
706+#define __NR_od_close 331
707+#define __NR_pi_down 332
708+#define __NR_pi_up 333
709+#define __NR_srp_down 334
710+#define __NR_srp_up 335
711+#define __NR_reg_task_srp_sem 336
712+#define __NR_query_job_no 337
713+#define __NR_wait_for_job_release 338
714+#define __NR_set_service_levels 339
715+#define __NR_get_cur_service_level 340
716+#define __NR_reg_ics_cb 341
717+#define __NR_start_wcs 342
718+
719
720 #ifdef __KERNEL__
721
722-#define NR_syscalls 320
723+#define NR_syscalls 343
724
725 #define __ARCH_WANT_IPC_PARSE_VERSION
726 #define __ARCH_WANT_OLD_READDIR
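/*
 * Editorial sketch (not part of the patch): the syscall numbers above are
 * normally reached through liblitmus (liblitmus-2007.3.tgz in this
 * release); a raw invocation would look roughly as follows. The
 * single-integer argument to sys_set_rt_mode() is an assumption based on
 * the rt_mode_t values in include/linux/litmus.h; its real prototype is
 * not shown in this excerpt.
 */
#include <unistd.h>
#include <sys/syscall.h>

#define __NR_set_rt_mode	322
#define __NR_sleep_next_period	326

int main(void)
{
	syscall(__NR_set_rt_mode, 1);		/* MODE_RT_RUN */
	for (;;) {
		/* ... one job's worth of work ... */
		syscall(__NR_sleep_next_period);
	}
}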
727diff --git a/include/linux/edf_common.h b/include/linux/edf_common.h
728new file mode 100644
729index 0000000..f940308
730--- /dev/null
731+++ b/include/linux/edf_common.h
732@@ -0,0 +1,36 @@
733+/* EDF common data structures and utility functions shared by all EDF
734+ * based scheduler plugins
735+ */
736+
737+/* CLEANUP: Add comments and make it less messy.
738+ *
739+ */
740+
741+#ifndef __UNC_EDF_COMMON_H__
742+#define __UNC_EDF_COMMON_H__
743+
744+#include <linux/rt_domain.h>
745+
746+
747+void edf_domain_init(rt_domain_t* rt, check_resched_needed_t resched);
748+
749+int edf_higher_prio(struct task_struct* first,
750+ struct task_struct* second);
751+
752+int edf_ready_order(struct list_head* a, struct list_head* b);
753+
754+void edf_release_at(struct task_struct *t, jiffie_t start);
755+#define edf_release_now(t) edf_release_at(t, jiffies)
756+
757+int edf_preemption_needed(rt_domain_t* rt, struct task_struct *t);
758+long edf_sleep_next_period(void);
759+
760+void edf_prepare_for_next_period(struct task_struct *t);
761+
762+#define job_completed(t) (!is_be(t) && \
763+ (t)->rt_param.times.exec_time == (t)->rt_param.basic_params.exec_cost)
764+
765+int edf_set_hp_task(struct pi_semaphore *sem);
766+int edf_set_hp_cpu_task(struct pi_semaphore *sem, int cpu);
767+
768+#endif
769diff --git a/include/linux/fdso.h b/include/linux/fdso.h
770new file mode 100644
771index 0000000..3e962fd
772--- /dev/null
773+++ b/include/linux/fdso.h
774@@ -0,0 +1,70 @@
775+/* fdso.h - file descriptor attached shared objects
776+ *
777+ * (c) 2007 B. Brandenburg, LITMUS^RT project
778+ */
779+
780+#ifndef _LINUX_FDSO_H_
781+#define _LINUX_FDSO_H_
782+
783+#include <linux/list.h>
784+#include <asm/atomic.h>
785+
786+#include <linux/fs.h>
787+
788+#define MAX_OBJECT_DESCRIPTORS 32
789+
790+typedef enum {
791+ MIN_OBJ_TYPE = 0,
792+
793+ PI_SEM = 0,
794+ SRP_SEM = 1,
795+ ICS_ID = 2,
796+
797+ MAX_OBJ_TYPE = 2
798+} obj_type_t;
799+
800+struct inode_obj_id {
801+ struct list_head list;
802+ atomic_t count;
803+ struct inode* inode;
804+
805+ obj_type_t type;
806+ void* obj;
807+ unsigned int id;
808+};
809+
810+
811+struct od_table_entry {
812+ unsigned int used;
813+
814+ struct inode_obj_id* obj;
815+ void* extra;
816+};
817+
818+struct fdso_ops {
819+ void* (*create) (void);
820+ void (*destroy)(void*);
821+ int (*open) (struct od_table_entry*, void* __user);
822+ int (*close) (struct od_table_entry*);
823+};
824+
825+/* translate a userspace supplied od into the raw table entry
826+ * returns NULL if od is invalid
827+ */
828+struct od_table_entry* __od_lookup(int od);
829+
830+/* translate a userspace supplied od into the associated object
831+ * returns NULL if od is invalid
832+ */
833+static inline void* od_lookup(int od, obj_type_t type)
834+{
835+ struct od_table_entry* e = __od_lookup(od);
836+ return e && e->obj->type == type ? e->obj->obj : NULL;
837+}
838+
839+#define lookup_pi_sem(od) ((struct pi_semaphore*) od_lookup(od, PI_SEM))
840+#define lookup_srp_sem(od) ((struct srp_semaphore*) od_lookup(od, SRP_SEM))
841+#define lookup_ics(od) ((struct ics*) od_lookup(od, ICS_ID))
842+
843+
844+#endif
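/*
 * Editorial sketch (not part of the patch): typical kernel-side use of the
 * lookup macros above. The function name is made up; the real semaphore
 * syscalls that do this presumably live in kernel/litmus_sem.c, which is
 * part of this patch but not reproduced in this excerpt.
 */
asmlinkage long example_sys_pi_down(int sem_od)
{
	struct pi_semaphore *sem = lookup_pi_sem(sem_od);
	if (!sem)
		/* bad od, wrong object type, or unused table slot */
		return -EINVAL;
	/* ... suspend on / acquire the semaphore ... */
	return 0;
}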
845diff --git a/include/linux/feather_buffer.h b/include/linux/feather_buffer.h
846new file mode 100644
847index 0000000..c788227
848--- /dev/null
849+++ b/include/linux/feather_buffer.h
850@@ -0,0 +1,108 @@
851+#ifndef _FEATHER_BUFFER_H_
852+#define _FEATHER_BUFFER_H_
853+
854+/* requires UINT_MAX and memcpy */
855+
856+static inline int fetch_and_inc(int *val)
857+{
858+ int ret = 1;
859+ __asm__ __volatile__("lock; xaddl %0, %1" : "+r" (ret), "+m" (*val) : : "memory" );
860+ return ret;
861+}
862+
863+static inline int fetch_and_dec(int *val)
864+{
865+ int ret = -1;
866+ __asm__ __volatile__("lock; xaddl %0, %1" : "+r" (ret), "+m" (*val) : : "memory" );
867+ return ret;
868+}
869+
870+#define SLOT_FREE 0
871+#define SLOT_BUSY 1
872+#define SLOT_READY 2
873+
874+struct ft_buffer {
875+ unsigned int slot_count;
876+ unsigned int slot_size;
877+
878+ int free_count;
879+ unsigned int write_idx;
880+ unsigned int read_idx;
881+
882+ char* slots;
883+ void* buffer_mem;
884+ unsigned int failed_writes;
885+};
886+
887+static inline int init_ft_buffer(struct ft_buffer* buf,
888+ unsigned int slot_count,
889+ unsigned int slot_size,
890+ char* slots,
891+ void* buffer_mem)
892+{
893+ int i = 0;
894+ if (!slot_count || UINT_MAX % slot_count != slot_count - 1) {
895+		/* The slot count must divide UINT_MAX + 1 so that when it
896+		 * wraps around, the index correctly points to 0.
897+ */
898+ return 0;
899+ } else {
900+ buf->slot_count = slot_count;
901+ buf->slot_size = slot_size;
902+ buf->slots = slots;
903+ buf->buffer_mem = buffer_mem;
904+ buf->free_count = slot_count;
905+ buf->write_idx = 0;
906+ buf->read_idx = 0;
907+ buf->failed_writes = 0;
908+ for (i = 0; i < slot_count; i++)
909+ buf->slots[i] = SLOT_FREE;
910+ return 1;
911+ }
912+}
913+
914+static inline int ft_buffer_start_write(struct ft_buffer* buf, void **ptr)
915+{
916+ int free = fetch_and_dec(&buf->free_count);
917+ unsigned int idx;
918+ if (free <= 0) {
919+ fetch_and_inc(&buf->free_count);
920+ *ptr = 0;
921+ fetch_and_inc(&buf->failed_writes);
922+ return 0;
923+ } else {
924+ idx = fetch_and_inc((int*) &buf->write_idx) % buf->slot_count;
925+ buf->slots[idx] = SLOT_BUSY;
926+ *ptr = ((char*) buf->buffer_mem) + idx * buf->slot_size;
927+ return 1;
928+ }
929+}
930+
931+static inline void ft_buffer_finish_write(struct ft_buffer* buf, void *ptr)
932+{
933+ unsigned int idx = ((char*) ptr - (char*) buf->buffer_mem) / buf->slot_size;
934+ buf->slots[idx] = SLOT_READY;
935+}
936+
937+
938+/* exclusive reader access is assumed */
939+static inline int ft_buffer_read(struct ft_buffer* buf, void* dest)
940+{
941+ unsigned int idx;
942+ if (buf->free_count == buf->slot_count)
943+ /* nothing available */
944+ return 0;
945+ idx = buf->read_idx % buf->slot_count;
946+ if (buf->slots[idx] == SLOT_READY) {
947+ memcpy(dest, ((char*) buf->buffer_mem) + idx * buf->slot_size,
948+ buf->slot_size);
949+ buf->slots[idx] = SLOT_FREE;
950+ buf->read_idx++;
951+ fetch_and_inc(&buf->free_count);
952+ return 1;
953+ } else
954+ return 0;
955+}
956+
957+
958+#endif
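/*
 * Editorial sketch (not part of the patch): minimal use of the ft_buffer
 * ring buffer defined above. init_ft_buffer() only accepts slot counts
 * that divide UINT_MAX + 1, i.e., powers of two.
 */
#define EX_SLOTS	256
#define EX_SLOT_SIZE	sizeof(unsigned long long)

static char ex_slot_states[EX_SLOTS];
static unsigned long long ex_slot_data[EX_SLOTS];
static struct ft_buffer ex_buf;

static void ex_buf_init(void)
{
	init_ft_buffer(&ex_buf, EX_SLOTS, EX_SLOT_SIZE,
		       ex_slot_states, ex_slot_data);
}

/* producer side, e.g. called from a Feather-Trace callback */
static void ex_record(unsigned long long tsc)
{
	unsigned long long *slot;
	if (ft_buffer_start_write(&ex_buf, (void **) &slot)) {
		*slot = tsc;
		ft_buffer_finish_write(&ex_buf, slot);
	}
	/* otherwise the sample is dropped and counted in ex_buf.failed_writes */
}

/* single consumer side */
static int ex_drain_one(unsigned long long *dest)
{
	return ft_buffer_read(&ex_buf, dest);
}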
959diff --git a/include/linux/feather_trace.h b/include/linux/feather_trace.h
960new file mode 100644
961index 0000000..5c37ea7
962--- /dev/null
963+++ b/include/linux/feather_trace.h
964@@ -0,0 +1,93 @@
965+#ifndef _FEATHER_TRACE_H_
966+#define _FEATHER_TRACE_H_
967+
968+#define feather_callback __attribute__((regparm(0)))
969+
970+/* make the compiler reload any register that is not saved in
971+ * a cdecl function call
972+ */
973+#define CLOBBER_LIST "memory", "cc", "eax", "ecx", "edx"
974+
975+#define ft_event(id, callback) \
976+ __asm__ __volatile__( \
977+ "1: jmp 2f \n\t" \
978+ " call " #callback " \n\t" \
979+ ".section __event_table, \"aw\" \n\t" \
980+ ".long " #id ", 0, 1b, 2f \n\t" \
981+ ".previous \n\t" \
982+ "2: \n\t" \
983+ : : : CLOBBER_LIST)
984+
985+#define ft_event0(id, callback) \
986+ __asm__ __volatile__( \
987+ "1: jmp 2f \n\t" \
988+ " subl $4, %%esp \n\t" \
989+ " movl $" #id ", (%%esp) \n\t" \
990+ " call " #callback " \n\t" \
991+ " addl $4, %%esp \n\t" \
992+ ".section __event_table, \"aw\" \n\t" \
993+ ".long " #id ", 0, 1b, 2f \n\t" \
994+ ".previous \n\t" \
995+ "2: \n\t" \
996+ : : : CLOBBER_LIST)
997+
998+#define ft_event1(id, callback, param) \
999+ __asm__ __volatile__( \
1000+ "1: jmp 2f \n\t" \
1001+ " subl $8, %%esp \n\t" \
1002+ " movl %0, 4(%%esp) \n\t" \
1003+ " movl $" #id ", (%%esp) \n\t" \
1004+ " call " #callback " \n\t" \
1005+ " addl $8, %%esp \n\t" \
1006+ ".section __event_table, \"aw\" \n\t" \
1007+ ".long " #id ", 0, 1b, 2f \n\t" \
1008+ ".previous \n\t" \
1009+ "2: \n\t" \
1010+ : : "r" (param) : CLOBBER_LIST)
1011+
1012+#define ft_event2(id, callback, param, param2) \
1013+ __asm__ __volatile__( \
1014+ "1: jmp 2f \n\t" \
1015+ " subl $12, %%esp \n\t" \
1016+ " movl %1, 8(%%esp) \n\t" \
1017+ " movl %0, 4(%%esp) \n\t" \
1018+ " movl $" #id ", (%%esp) \n\t" \
1019+ " call " #callback " \n\t" \
1020+ " addl $12, %%esp \n\t" \
1021+ ".section __event_table, \"aw\" \n\t" \
1022+ ".long " #id ", 0, 1b, 2f \n\t" \
1023+ ".previous \n\t" \
1024+ "2: \n\t" \
1025+ : : "r" (param), "r" (param2) : CLOBBER_LIST)
1026+
1027+
1028+#define ft_event3(id, callback, p, p2, p3) \
1029+ __asm__ __volatile__( \
1030+ "1: jmp 2f \n\t" \
1031+ " subl $16, %%esp \n\t" \
1032+        " movl %2, 12(%%esp)                        \n\t" \
1033+ " movl %1, 8(%%esp) \n\t" \
1034+ " movl %0, 4(%%esp) \n\t" \
1035+ " movl $" #id ", (%%esp) \n\t" \
1036+ " call " #callback " \n\t" \
1037+ " addl $16, %%esp \n\t" \
1038+ ".section __event_table, \"aw\" \n\t" \
1039+ ".long " #id ", 0, 1b, 2f \n\t" \
1040+ ".previous \n\t" \
1041+ "2: \n\t" \
1042+ : : "r" (p), "r" (p2), "r" (p3) : CLOBBER_LIST)
1043+
1044+
1045+static inline unsigned long long ft_read_tsc(void)
1046+{
1047+ unsigned long long ret;
1048+ __asm__ __volatile__("rdtsc" : "=A" (ret));
1049+ return ret;
1050+}
1051+
1052+int ft_enable_event(unsigned long id);
1053+int ft_disable_event(unsigned long id);
1054+int ft_is_event_enabled(unsigned long id);
1055+int ft_disable_all_events(void);
1056+
1057+#endif
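/*
 * Editorial sketch (not part of the patch): a trace point and its callback.
 * The event id and all names are made up; the ids actually used by the
 * patch are defined in include/linux/trace.h. While an event is disabled,
 * ft_event0() costs only the "jmp 2f"; ft_enable_event() is expected to
 * patch the jump at runtime so that the callback gets invoked.
 */
#define EX_EVENT	100

feather_callback void ex_save_timestamp(unsigned long id)
{
	unsigned long long now = ft_read_tsc();
	/* ... store (id, now) into an ft_buffer ... */
	(void) now;
}

static void ex_traced_function(void)
{
	ft_event0(EX_EVENT, ex_save_timestamp);
	/* ... code being timed ... */
}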
1058diff --git a/include/linux/fifo_common.h b/include/linux/fifo_common.h
1059new file mode 100644
1060index 0000000..0883226
1061--- /dev/null
1062+++ b/include/linux/fifo_common.h
1063@@ -0,0 +1,18 @@
1064+/* FIFO common definitions and utility functions.
1065+ */
1066+#ifndef __UNC_SCHED_FIFO_H__
1067+#define __UNC_SCHED_FIFO_H__
1068+
1069+#include <linux/rt_domain.h>
1070+
1071+
1072+int fifo_higher_prio(struct task_struct* first,
1073+ struct task_struct* second);
1074+
1075+int fifo_ready_order(struct list_head* a, struct list_head* b);
1076+
1077+
1078+void fifo_domain_init(rt_domain_t* fifo, check_resched_needed_t resched);
1079+
1080+
1081+#endif
1082diff --git a/include/linux/fpmath.h b/include/linux/fpmath.h
1083new file mode 100644
1084index 0000000..a15c239
1085--- /dev/null
1086+++ b/include/linux/fpmath.h
1087@@ -0,0 +1,111 @@
1088+#ifndef __FP_MATH_H__
1089+#define __FP_MATH_H__
1090+
1091+#define FP_SHIFT 10
1092+#define ROUND_BIT (FP_SHIFT - 1)
1093+#define ONE FP(1)
1094+
1095+#define _fp(x) ((fp_t) {x})
1096+
1097+static inline long _point(fp_t x)
1098+{
1099+ return (x.val % (1 << FP_SHIFT));
1100+
1101+}
1102+
1103+#define fp2str(x) x.val
1104+/*(x.val >> FP_SHIFT), (x.val % (1 << FP_SHIFT)) */
1105+#define _FP_ "%ld/1024"
1106+
1107+
1108+static inline fp_t FP(long x)
1109+{
1110+ return _fp(((long) x) << FP_SHIFT);
1111+}
1112+
1113+static inline long _floor(fp_t x)
1114+{
1115+ return x.val >> FP_SHIFT;
1116+}
1117+
1118+/* FIXME: negative rounding */
1119+static inline long _round(fp_t x)
1120+{
1121+ return _floor(x) + ((x.val >> ROUND_BIT) & 1);
1122+}
1123+
1124+/* divide two integers to obtain a fixed point value */
1125+static inline fp_t _frac(long a, long b)
1126+{
1127+ return _fp(FP(a).val / (b));
1128+}
1129+
1130+/* multiply two fixed point values */
1131+static inline fp_t _mul(fp_t a, fp_t b)
1132+{
1133+ return _fp((a.val * b.val) >> FP_SHIFT);
1134+}
1135+
1136+static inline fp_t _div(fp_t a, fp_t b)
1137+{
1138+ /* try not to overflow */
1139+ if (unlikely(a.val > 2 << (BITS_PER_LONG - FP_SHIFT)))
1140+ return _fp((a.val / b.val) << FP_SHIFT);
1141+ else
1142+ return _fp((a.val << FP_SHIFT) / b.val);
1143+}
1144+
1145+static inline fp_t _add(fp_t a, fp_t b)
1146+{
1147+ return _fp(a.val + b.val);
1148+}
1149+
1150+static inline fp_t _sub(fp_t a, fp_t b)
1151+{
1152+ return _fp(a.val - b.val);
1153+}
1154+
1155+static inline fp_t _neg(fp_t x)
1156+{
1157+ return _fp(-x.val);
1158+}
1159+
1160+static inline fp_t _abs(fp_t x)
1161+{
1162+ return _fp(abs(x.val));
1163+}
1164+
1165+static inline int _leq(fp_t a, fp_t b)
1166+{
1167+ return a.val <= b.val;
1168+}
1169+
1170+static inline int _geq(fp_t a, fp_t b)
1171+{
1172+ return a.val >= b.val;
1173+}
1174+
1175+static inline int _lt(fp_t a, fp_t b)
1176+{
1177+ return a.val < b.val;
1178+}
1179+
1180+static inline int _gt(fp_t a, fp_t b)
1181+{
1182+ return a.val > b.val;
1183+}
1184+
1185+static inline int _eq(fp_t a, fp_t b)
1186+{
1187+ return a.val == b.val;
1188+}
1189+
1190+static inline fp_t _max(fp_t a, fp_t b)
1191+{
1192+ if (a.val < b.val)
1193+ return b;
1194+ else
1195+ return a;
1196+}
1197+
1198+#endif
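/*
 * Editorial note (not part of the patch): with FP_SHIFT = 10 an fp_t
 * stores its value scaled by 1024, so for example:
 *
 *   FP(1).val                     == 1024
 *   _frac(1, 4).val               == 256     (0.25)
 *   _mul(_frac(1, 2), FP(6)).val  == 3072    == FP(3).val
 *   _div(FP(3), FP(4)).val        == 768     == _frac(3, 4).val
 *   _floor(_frac(3, 4)) == 0,  _round(_frac(3, 4)) == 1
 *
 * Values of this kind (e.g. the per-service-level weights in
 * include/linux/rt_param.h below) are what the adaptive code computes with.
 */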
1199diff --git a/include/linux/fs.h b/include/linux/fs.h
1200index 1410e53..4e1117c 100644
1201--- a/include/linux/fs.h
1202+++ b/include/linux/fs.h
1203@@ -524,6 +524,8 @@ static inline int mapping_writably_mapped(struct address_space *mapping)
1204 #define i_size_ordered_init(inode) do { } while (0)
1205 #endif
1206
1207+struct inode_obj_id_table;
1208+
1209 struct inode {
1210 struct hlist_node i_hash;
1211 struct list_head i_list;
1212@@ -589,6 +591,9 @@ struct inode {
1213 void *i_security;
1214 #endif
1215 void *i_private; /* fs or device private pointer */
1216+
1217+ struct list_head i_obj_list;
1218+ struct mutex i_obj_mutex;
1219 };
1220
1221 /*
1222diff --git a/include/linux/ics.h b/include/linux/ics.h
1223new file mode 100644
1224index 0000000..f19534f
1225--- /dev/null
1226+++ b/include/linux/ics.h
1227@@ -0,0 +1,35 @@
1228+#ifndef _LINUX_ICS_H_
1229+#define _LINUX_ICS_H_
1230+
1231+#include <asm/atomic.h>
1232+#include <linux/mutex.h>
1233+
1234+#define MAX_ICS_NESTING 16
1235+
1236+struct ics_descriptor {
1237+ /* ICS id, only read by kernel */
1238+ int id;
1239+ /* rollback program counter, only read by kernel */
1240+ void* pc;
1241+ /* rollback stack pointer, not used by kernel */
1242+ void* sp;
1243+ /* retry flag, not used by kernel */
1244+ int* retry;
1245+};
1246+
1247+/* ICS control block */
1248+struct ics_cb {
1249+ /* Points to the top-most valid entry.
1250+ * -1 indicates an empty stack.
1251+ * Read and written by kernel.
1252+ */
1253+ int top;
1254+ struct ics_descriptor ics_stack[MAX_ICS_NESTING];
1255+};
1256+
1257+/* get rollback addr for current task */
1258+void* get_rollback_addr(void);
1259+
1260+#define ICS_DBG(x, args...) printk(x, ## args)
1261+
1262+#endif
1263diff --git a/include/linux/list.h b/include/linux/list.h
1264index 611059d..319c5ed 100644
1265--- a/include/linux/list.h
1266+++ b/include/linux/list.h
1267@@ -898,6 +898,36 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev,
1268 ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
1269 pos = pos->next)
1270
1271+
1272+typedef int (*list_cmp_t)(struct list_head*, struct list_head*);
1273+
1274+static inline unsigned int list_insert(struct list_head* new,
1275+ struct list_head* head,
1276+ list_cmp_t order_before)
1277+{
1278+ struct list_head *pos;
1279+ unsigned int passed = 0;
1280+
1281+ BUG_ON(!new);
1282+
1283+ /* find a spot where the new entry is less than the next */
1284+ list_for_each(pos, head) {
1285+ if (unlikely(order_before(new, pos))) {
1286+ /* pos is not less than new, thus insert here */
1287+ __list_add(new, pos->prev, pos);
1288+ goto out;
1289+ }
1290+ passed++;
1291+ }
1292+	/* if we get to this point either the list is empty or every
1293+	 * queued element is less than new.
1294+ * Let's add new to the end. */
1295+ list_add_tail(new, head);
1296+ out:
1297+ return passed;
1298+}
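/*
 * Editorial sketch (not part of the patch): keeping a list sorted with
 * list_insert(). The element type and its priority field are made up.
 */
struct ex_item {
	int prio;
	struct list_head link;
};

/* "order_before" comparator: nonzero if a should be queued ahead of b */
static int ex_order(struct list_head *a, struct list_head *b)
{
	return list_entry(a, struct ex_item, link)->prio <
	       list_entry(b, struct ex_item, link)->prio;
}

static void ex_enqueue(struct list_head *head, struct ex_item *it)
{
	/* returns the number of queued elements passed before inserting */
	unsigned int passed = list_insert(&it->link, head, ex_order);
	(void) passed;
}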
1299+
1300+
1301 #else
1302 #warning "don't include kernel headers in userspace"
1303 #endif /* __KERNEL__ */
1304diff --git a/include/linux/litmus.h b/include/linux/litmus.h
1305new file mode 100644
1306index 0000000..858b2c3
1307--- /dev/null
1308+++ b/include/linux/litmus.h
1309@@ -0,0 +1,141 @@
1310+/*
1311+ * Constant definitions related to
1312+ * scheduling policy.
1313+ */
1314+
1315+#ifndef _LINUX_LITMUS_H_
1316+#define _LINUX_LITMUS_H_
1317+
1318+#include <linux/jiffies.h>
1319+#include <linux/sched_trace.h>
1320+
1321+typedef enum {
1322+ SCHED_BEG = 0,
1323+ SCHED_LINUX = 0,
1324+ SCHED_PFAIR = 1,
1325+ SCHED_PFAIR_STAGGER = 2,
1326+ SCHED_PART_EDF = 3,
1327+ SCHED_PART_EEVDF = 4,
1328+ SCHED_GLOBAL_EDF = 5,
1329+ SCHED_PFAIR_DESYNC = 6,
1330+ SCHED_GLOBAL_EDF_NP = 7,
1331+ SCHED_CUSTOM = 8,
1332+ SCHED_EDF_HSB = 9,
1333+ SCHED_GSN_EDF = 10,
1334+ SCHED_PSN_EDF = 11,
1335+ SCHED_ADAPTIVE = 12,
1336+ /* Add your scheduling policy here */
1337+
1338+ SCHED_END = 12,
1339+ SCHED_DEFAULT = 0,
1340+ SCHED_INVALID = -1,
1341+} spolicy;
1342+
1343+
1344+typedef enum {
1345+ LITMUS_RESERVED_RANGE = 1024,
1346+
1347+} sched_setup_cmd_t;
1348+
1349+/* System-wide runtime modes */
1350+enum rt_mode_t {
1351+ MODE_NON_RT = 0,
1352+ MODE_RT_RUN = 1
1353+};
1354+
1355+/* per-task modes */
1356+enum rt_task_mode_t {
1357+ BACKGROUND_TASK = 0,
1358+ LITMUS_RT_TASK = 1
1359+};
1360+
1361+
1362+/* Plugin boot options, for convenience */
1363+#define PLUGIN_LINUX "linux"
1364+#define PLUGIN_PFAIR "pfair"
1365+#define PLUGIN_PART_EDF "part_edf"
1366+#define PLUGIN_GLOBAL_EDF "global_edf"
1367+#define PLUGIN_GLOBAL_EDF_NP "global_edf_np"
1368+#define PLUGIN_EDF_HSB "edf_hsb"
1369+#define PLUGIN_GSN_EDF "gsn_edf"
1370+#define PLUGIN_PSN_EDF "psn_edf"
1371+#define PLUGIN_ADAPTIVE "adaptive"
1372+
1373+extern spolicy sched_policy;
1374+
1375+/* RT mode start time */
1376+extern volatile unsigned long rt_start_time;
1377+
1378+/* Here we store the current mode of the system */
1379+extern atomic_t rt_mode;
1380+
1381+#define get_rt_mode() (atomic_read(&rt_mode))
1382+#define set_rt_mode(a) atomic_set(&rt_mode,(a))
1383+
1384+#define TRACE(fmt, args...) \
1385+ sched_trace_log_message("%d: " fmt, raw_smp_processor_id(), ## args)
1386+
1387+#define TRACE_TASK(t, fmt, args...) \
1388+ TRACE("(%s/%d) " fmt, (t)->comm, (t)->pid, ##args)
1389+
1390+#define TRACE_CUR(fmt, args...) \
1391+ TRACE_TASK(current, fmt, ## args)
1392+
1393+#define TRACE_BUG_ON(cond) \
1394+ do { if (cond) TRACE("BUG_ON(%s) at %s:%d " \
1395+ "called from %p current=%s/%d state=%d " \
1396+ "flags=%x mode=%d partition=%d cpu=%d rtflags=%d"\
1397+ " job=%u knp=%d timeslice=%u\n", \
1398+ #cond, __FILE__, __LINE__, __builtin_return_address(0), current->comm, \
1399+ current->pid, current->state, current->flags, get_rt_mode(), \
1400+ get_partition(current), smp_processor_id(), get_rt_flags(current), \
1401+ current->rt_param.times.job_no, current->rt_param.kernel_np, \
1402+ current->time_slice\
1403+ ); } while(0);
1404+
1405+
1406+/* in_list - is a given list_head queued on some list?
1407+ */
1408+static inline int in_list(struct list_head* list)
1409+{
1410+ return !( /* case 1: deleted */
1411+ (list->next == LIST_POISON1 &&
1412+ list->prev == LIST_POISON2)
1413+ ||
1414+ /* case 2: initialized */
1415+ (list->next == list &&
1416+ list->prev == list)
1417+ );
1418+}
1419+
1420+void list_qsort(struct list_head* list, list_cmp_t less_than);
1421+
1422+
1423+#define RT_PREEMPTIVE 0x2050 /* = NP */
1424+#define RT_NON_PREEMPTIVE 0x4e50 /* = P */
1425+#define RT_EXIT_NP_REQUESTED 0x5251 /* = RQ */
1426+
1427+/* returns 1 if task t has registered np flag and set it to RT_NON_PREEMPTIVE
1428+ */
1429+int is_np(struct task_struct *t);
1430+
1431+/* request that the task should call sys_exit_np()
1432+ */
1433+void request_exit_np(struct task_struct *t);
1434+
1435+/* kill naughty tasks
1436+ */
1437+void scheduler_signal(struct task_struct *t, unsigned int signal);
1438+void send_scheduler_signals(void);
1439+void np_mem_kill(struct task_struct *t);
1440+
1441+void litmus_fork(struct task_struct *tsk);
1442+void litmus_exec(void);
1443+/* clean up real-time state of a task */
1444+void exit_litmus(struct task_struct *dead_tsk);
1445+
1446+long transition_to_rt(struct task_struct* tsk);
1447+long transition_to_be(struct task_struct* tsk);
1448+
1449+
1450+#endif
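/*
 * Editorial sketch (not part of the patch): typical use of the TRACE()
 * helpers by a scheduler plugin. Per the Kconfig help earlier in the patch,
 * the messages are compiled in only when CONFIG_SCHED_DEBUG_TRACE is set.
 */
static void ex_on_release(struct task_struct *t)
{
	TRACE_TASK(t, "released job %u\n", t->rt_param.times.job_no);
	if (get_rt_mode() != MODE_RT_RUN)
		TRACE("release while the system is not in RT mode\n");
}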
1451diff --git a/include/linux/pfair_common.h b/include/linux/pfair_common.h
1452new file mode 100644
1453index 0000000..67e18c6
1454--- /dev/null
1455+++ b/include/linux/pfair_common.h
1456@@ -0,0 +1,40 @@
1457+/* PFAIR common data structures and utility functions shared by all PFAIR
1458+ * based scheduler plugins
1459+ */
1460+
1461+#ifndef __UNC_PFAIR_COMMON_H__
1462+#define __UNC_PFAIR_COMMON_H__
1463+
1464+#include <linux/queuelock.h>
1465+#include <linux/cpumask.h>
1466+
1467+typedef struct _pfair_domain {
1468+ /* Global lock to protect the data structures */
1469+ queuelock_t pfair_lock;
1470+ /* runnable rt tasks are in here */
1471+ struct list_head ready_queue;
1472+
1473+ /* real-time tasks waiting for release are in here */
1474+ struct list_head release_queue;
1475+
1476+ /* CPU's in the domain */
1477+ cpumask_t domain_cpus;
1478+
1479+} pfair_domain_t;
1480+
1481+#define next_ready(pfair) \
1482+ (list_entry((pfair)->ready_queue.next, struct task_struct, rt_list))
1483+void pfair_domain_init(pfair_domain_t *pfair);
1484+void pfair_add_ready(pfair_domain_t* pfair, struct task_struct *new);
1485+struct task_struct* __pfair_take_ready(pfair_domain_t* pfair);
1486+void pfair_add_release(pfair_domain_t* pfair, struct task_struct *task);
1487+void pfair_try_release_pending(pfair_domain_t* pfair);
1488+void __pfair_prepare_new_release(struct task_struct *t, jiffie_t start);
1489+
1490+void pfair_prepare_next_job(struct task_struct *t);
1491+void pfair_prepare_next_subtask(struct task_struct *t);
1492+
1493+void pfair_prepare_new_releases(pfair_domain_t *pfair, jiffie_t start);
1494+
1495+#endif
1496+
1497diff --git a/include/linux/pfair_math.h b/include/linux/pfair_math.h
1498new file mode 100644
1499index 0000000..b2a14e4
1500--- /dev/null
1501+++ b/include/linux/pfair_math.h
1502@@ -0,0 +1,80 @@
1503+/* PFAIR Mathematical functions */
1504+#ifndef __UNC_PFAIR_MATH_H__
1505+#define __UNC_PFAIR_MATH_H__
1506+
1507+#include <linux/rt_param.h>
1508+#include <asm/div64.h>
1509+#include <linux/litmus.h>
1510+#include <linux/sched.h>
1511+
1512+/* Type definition for our quantums */
1513+typedef unsigned long long quantum_t;
1514+
1515+/*
1516+* This file defines mathematical functions "ceiling", "floor",
1517+* and PFAIR specific functions for computing the release and
1518+* the deadline of a subtask, as well as tie breakers:
1519+* b-bit and group deadline.
1520+*/
1521+static inline quantum_t FLOOR(quantum_t a, unsigned long b)
1522+{
1523+ BUG_ON( b == 0);
1524+ do_div(a, b);
1525+ return a;
1526+}
1527+static inline quantum_t CEIL(quantum_t a, unsigned long b)
1528+{
1529+ quantum_t t = FLOOR(a, b);
1530+ return (quantum_t)((t * b == a) ? t : (t + 1));
1531+}
1532+
1533+
1534+/*
1535+* invariant - i-1=get_passed_quanta(t)
1536+*
1537+* release time of i-th subtask of j-th job is
1538+* r_{ij}+\lfloor i-1/wt(T) \rfloor
1539+* This operation should be robust to wrap-around
1540+* so we can compare the result with jiffies safely
1541+*/
1542+static inline quantum_t release_time(struct task_struct * t)
1543+{
1544+ quantum_t e = get_exec_cost(t);
1545+ quantum_t p = get_rt_period(t);
1546+ return FLOOR((get_passed_quanta(t)) * p, e);
1547+}
1548+/*
1549+* deadline time of i-th subtask of j-th job is
1550+* r_{ij}+\lceil i/wt(T) \rceil
1551+* This operation should be robust to wrap-around
1552+* so we can compare the result with jiffies safely
1553+*/
1554+static inline quantum_t pfair_deadline(struct task_struct * t)
1555+{
1556+ quantum_t e = get_exec_cost(t);
1557+ quantum_t p = get_rt_period(t);
1558+ return CEIL((get_passed_quanta(t) + 1) * p, e);
1559+}
1560+/* In PFAIR b-bit is defined as
1561+* \lceil i/wt(T) \rceil-\lfloor i/wt(T) \rfloor
1562+*/
1563+static inline int b_bit(struct task_struct *t)
1564+{
1565+ quantum_t e = get_exec_cost(t);
1566+ quantum_t p = get_rt_period(t);
1567+ return CEIL((get_passed_quanta(t) + 1) * p, e)-
1568+ FLOOR((get_passed_quanta(t) + 1) * p, e);
1569+}
1570+/*
1571+* Group deadline
1572+*/
1573+static inline quantum_t group_deadline(struct task_struct * t)
1574+{
1575+ quantum_t p = get_rt_period(t);
1576+ quantum_t e = get_exec_cost(t);
1577+ quantum_t stage1 = CEIL((get_passed_quanta(t) + 1) * p, e);
1578+ quantum_t stage2 = CEIL(stage1 * (p - e), p);
1579+ return CEIL(stage2 * p, p - e);
1580+}
1581+
1582+#endif /* __UNC_PFAIR_MATH_H__ */
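/*
 * Editorial note (not part of the patch): a worked example for a task with
 * exec cost e = 2 and period p = 5 (weight 2/5), using the helpers above.
 * With get_passed_quanta(t) = 0 (first subtask of the job):
 *
 *   release_time(t)   = FLOOR(0 * 5, 2) = 0
 *   pfair_deadline(t) = CEIL(1 * 5, 2)  = 3
 *   b_bit(t)          = CEIL(5, 2) - FLOOR(5, 2) = 3 - 2 = 1
 *
 * and with get_passed_quanta(t) = 1 (second subtask):
 *
 *   release_time(t)   = FLOOR(1 * 5, 2) = 2
 *   pfair_deadline(t) = CEIL(2 * 5, 2)  = 5
 *   b_bit(t)          = 5 - 5 = 0
 *
 * i.e., the two subtask windows are [0, 3) and [2, 5), measured relative
 * to the job's release.
 */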
1583diff --git a/include/linux/queuelock.h b/include/linux/queuelock.h
1584new file mode 100644
1585index 0000000..c289c21
1586--- /dev/null
1587+++ b/include/linux/queuelock.h
1588@@ -0,0 +1,98 @@
1589+#ifndef _UNC_QUEUELOCK_H_
1590+#define _UNC_QUEUELOCK_H_
1591+/**
1592+* Queue lock
1593+*
1594+* This is an implementation of T. Anderson's queue lock.
1595+* It strives to follow the normal Linux locking conventions
1596+* as much as possible. The rules for acquiring a lock are:
1597+*
1598+* 1) The caller must ensure interrupts and preemptions are disabled.
1599+*
1600+* 2) The caller _cannot_ recursively acquire the lock.
1601+*
1602+* 3) The caller may not sleep while holding the lock. This is currently
1603+* not enforced, but it will not work.
1604+*/
1605+
1606+#include <linux/cache.h>
1607+#include <asm/atomic.h>
1608+#include <linux/smp.h>
1609+
1610+typedef struct {
1611+ /* pad the values being spun on to make sure
1612+ that they are cache local
1613+ */
1614+ union {
1615+ volatile enum {
1616+ MUST_WAIT,
1617+ HAS_LOCK
1618+ } val;
1619+ char padding[SMP_CACHE_BYTES];
1620+ } slots[NR_CPUS];
1621+
1622+ /* since spin_slot is not being spun on it can be
1623+ * in a shared cache line. next_slot will be evicted
1624+ * anyway on every attempt to acquire the lock.
1625+ */
1626+ int spin_slot[NR_CPUS];
1627+
1628+ /* The next slot that will be available.
1629+ */
1630+ atomic_t next_slot;
1631+} queuelock_t;
1632+
1633+
1634+static inline void queue_lock_init(queuelock_t *lock)
1635+{
1636+ int i;
1637+ for (i = 0; i < NR_CPUS; i++) {
1638+ lock->slots[i].val = MUST_WAIT;
1639+ lock->spin_slot[i] = i;
1640+ }
1641+ lock->slots[0].val = HAS_LOCK;
1642+ atomic_set(&lock->next_slot, 0);
1643+}
1644+
1645+
1646+static inline void queue_lock(queuelock_t *lock)
1647+{
1648+ int me = smp_processor_id();
1649+ volatile int* spin_var;
1650+ /* Get slot to spin on. atomic_inc_return() returns the incremented
1651+	 * value, so take one off again
1652+ */
1653+ lock->spin_slot[me] = atomic_inc_return(&lock->next_slot) - 1;
1654+ /* check for wrap-around
1655+ * This could probably optimized away if we ensure that NR_CPUS divides
1656+ * INT_MAX...
1657+ */
1658+ if (unlikely(lock->spin_slot[me] == NR_CPUS - 1))
1659+ atomic_add(-NR_CPUS, &lock->next_slot);
1660+ /* range limit*/
1661+ lock->spin_slot[me] %= NR_CPUS;
1662+ /* spin until you acquire the lock */
1663+ spin_var = (int*) &lock->slots[lock->spin_slot[me]].val;
1664+ while (*spin_var == MUST_WAIT)
1665+ cpu_relax();
1666+
1667+ /* reset the lock */
1668+ lock->slots[lock->spin_slot[me]].val = MUST_WAIT;
1669+ barrier();
1670+}
1671+
1672+
1673+static inline void queue_unlock(queuelock_t *lock)
1674+{
1675+ int me = smp_processor_id();
1676+ barrier();
1677+ lock->slots[(lock->spin_slot[me] + 1) % NR_CPUS].val = HAS_LOCK;
1678+}
1679+
1680+#define queue_lock_irqsave(lock, flags) \
1681+ do { local_irq_save(flags); queue_lock(lock); } while (0);
1682+
1683+#define queue_unlock_irqrestore(lock, flags) \
1684+ do { queue_unlock(lock); local_irq_restore(flags); } while (0);
1685+
1686+#endif /* _UNC_QUEUELOCK_H_ */
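/*
 * Editorial sketch (not part of the patch): basic use of the queue lock.
 * queue_lock_irqsave() disables local interrupts, which also keeps the
 * holder from being preempted on this CPU; the caller must still never
 * sleep inside the critical section.
 */
static queuelock_t ex_lock;	/* initialize once with queue_lock_init() */
static int ex_shared_counter;

static void ex_lock_init(void)
{
	queue_lock_init(&ex_lock);
}

static void ex_critical_section(void)
{
	unsigned long flags;

	queue_lock_irqsave(&ex_lock, flags);
	/* FIFO-ordered: each CPU spins on its own cache-line-padded slot */
	ex_shared_counter++;
	queue_unlock_irqrestore(&ex_lock, flags);
}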
1687diff --git a/include/linux/rt_domain.h b/include/linux/rt_domain.h
1688new file mode 100644
1689index 0000000..3305159
1690--- /dev/null
1691+++ b/include/linux/rt_domain.h
1692@@ -0,0 +1,98 @@
1693+/* CLEANUP: Add comments and make it less messy.
1694+ *
1695+ */
1696+
1697+#ifndef __UNC_RT_DOMAIN_H__
1698+#define __UNC_RT_DOMAIN_H__
1699+
1700+struct _rt_domain;
1701+
1702+typedef int (*check_resched_needed_t)(struct _rt_domain *rt);
1703+typedef void (*release_at_t)(struct task_struct *t, jiffie_t start);
1704+
1705+typedef struct _rt_domain {
1706+ /* runnable rt tasks are in here */
1707+ rwlock_t ready_lock;
1708+ struct list_head ready_queue;
1709+
1710+ /* real-time tasks waiting for release are in here */
1711+ spinlock_t release_lock;
1712+ struct list_head release_queue;
1713+
1714+ /* how do we check if we need to kick another CPU? */
1715+ check_resched_needed_t check_resched;
1716+
1717+ /* how are tasks ordered in the ready queue? */
1718+ list_cmp_t order;
1719+} rt_domain_t;
1720+
1721+#define next_ready(rt) \
1722+ (list_entry((rt)->ready_queue.next, struct task_struct, rt_list))
1723+
1724+#define ready_jobs_pending(rt) \
1725+ (!list_empty(&(rt)->ready_queue))
1726+
1727+void rt_domain_init(rt_domain_t *rt, check_resched_needed_t f,
1728+ list_cmp_t order);
1729+
1730+void __add_ready(rt_domain_t* rt, struct task_struct *new);
1731+void __add_release(rt_domain_t* rt, struct task_struct *task);
1732+
1733+struct task_struct* __take_ready_rq(rt_domain_t* rt, runqueue_t* rq, int cpu);
1734+struct task_struct* __take_ready(rt_domain_t* rt);
1735+struct task_struct* __peek_ready(rt_domain_t* rt);
1736+
1737+void try_release_pending(rt_domain_t* rt);
1738+void __release_pending(rt_domain_t* rt);
1739+
1740+void rerelease_all(rt_domain_t *rt, release_at_t release);
1741+void __rerelease_all(rt_domain_t *rt, release_at_t release);
1742+
1743+static inline void add_ready(rt_domain_t* rt, struct task_struct *new)
1744+{
1745+ unsigned long flags;
1746+ /* first we need the write lock for rt_ready_queue */
1747+ write_lock_irqsave(&rt->ready_lock, flags);
1748+ __add_ready(rt, new);
1749+ write_unlock_irqrestore(&rt->ready_lock, flags);
1750+}
1751+
1752+static inline struct task_struct* take_ready(rt_domain_t* rt)
1753+{
1754+ unsigned long flags;
1755+ struct task_struct* ret;
1756+ /* first we need the write lock for rt_ready_queue */
1757+ write_lock_irqsave(&rt->ready_lock, flags);
1758+ ret = __take_ready(rt);
1759+ write_unlock_irqrestore(&rt->ready_lock, flags);
1760+ return ret;
1761+}
1762+
1763+
1764+static inline void add_release(rt_domain_t* rt, struct task_struct *task)
1765+{
1766+ unsigned long flags;
1767+ /* first we need the write lock for rt_ready_queue */
1768+ spin_lock_irqsave(&rt->release_lock, flags);
1769+ __add_release(rt, task);
1770+ spin_unlock_irqrestore(&rt->release_lock, flags);
1771+}
1772+
1773+static inline int __jobs_pending(rt_domain_t* rt)
1774+{
1775+ return !list_empty(&rt->ready_queue);
1776+}
1777+
1778+static inline int jobs_pending(rt_domain_t* rt)
1779+{
1780+ unsigned long flags;
1781+ int ret;
1782+ /* first we need the write lock for rt_ready_queue */
1783+ read_lock_irqsave(&rt->ready_lock, flags);
1784+ ret = __jobs_pending(rt);
1785+ read_unlock_irqrestore(&rt->ready_lock, flags);
1786+ return ret;
1787+}
1788+
1789+
1790+#endif
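/*
 * Editorial sketch (not part of the patch): how a plugin wires up an
 * rt_domain_t with an EDF ready-queue order (edf_ready_order() is declared
 * in include/linux/edf_common.h earlier in the patch); the resched
 * callback below is made up.
 */
static rt_domain_t ex_domain;

static int ex_check_resched(rt_domain_t *rt)
{
	/* kick another CPU if the new ready-queue head should preempt
	 * what is running there; return nonzero if a reschedule was
	 * triggered */
	return 0;
}

static void ex_domain_setup(void)
{
	rt_domain_init(&ex_domain, ex_check_resched, edf_ready_order);
}

static void ex_release_and_pick(struct task_struct *t)
{
	add_ready(&ex_domain, t);	/* locking wrapper around __add_ready()  */
	t = take_ready(&ex_domain);	/* locking wrapper around __take_ready() */
}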
1791diff --git a/include/linux/rt_param.h b/include/linux/rt_param.h
1792new file mode 100644
1793index 0000000..4ebab77
1794--- /dev/null
1795+++ b/include/linux/rt_param.h
1796@@ -0,0 +1,277 @@
1797+/*
1798+ * Definition of the scheduler plugin interface.
1799+ *
1800+ */
1801+#ifndef _LINUX_RT_PARAM_H_
1802+#define _LINUX_RT_PARAM_H_
1803+
1804+#include <linux/wait.h>
1805+
1806+typedef unsigned long jiffie_t;
1807+
1808+/* different types of clients */
1809+typedef enum {
1810+ RT_CLASS_HARD,
1811+ RT_CLASS_SOFT,
1812+ RT_CLASS_BEST_EFFORT
1813+} task_class_t;
1814+
1815+typedef struct rt_param {
1816+ unsigned long exec_cost;
1817+ unsigned long period;
1818+ unsigned int cpu;
1819+ task_class_t class;
1820+} rt_param_t;
1821+
1822+/* fixed point wrapper to force compiler
1823+ * errors in case of misuse of a fixed point value
1824+ */
1825+typedef struct
1826+{
1827+ long val;
1828+} fp_t;
1829+
1830+typedef struct {
1831+ fp_t weight;
1832+ unsigned long period;
1833+ fp_t value;
1834+} service_level_t;
1835+
1836+typedef struct {
1837+ fp_t estimate;
1838+ fp_t accumulated;
1839+} predictor_state_t;
1840+
1841+typedef struct {
1842+	/* when will this task be released the next time? */
1843+ jiffie_t release;
1844+ /* time instant the last job was released */
1845+ jiffie_t last_release;
1846+ /* what is the current deadline? */
1847+ jiffie_t deadline;
1848+ /* b-bit tie breaker for PFAIR, it is ignored in EDF */
1849+ int b_bit;
1850+ /* group deadline tie breaker, it is ignored in EDF */
1851+ jiffie_t group_deadline;
1852+ /* how long has this task executed so far?
1853+ * In case of capacity sharing a job completion cannot be
1854+ * detected by checking time_slice == 0 as the job may have
1855+ * executed while using another capacity. Use this counter
1856+ * to keep track of the time spent on a CPU by a job.
1857+ *
1858+ * In other words: The number of consumed quanta since the
1859+ * last job release.
1860+ */
1861+ unsigned int exec_time;
1862+
1863+ /* Which job is this. This is used to let user space
1864+ * specify which job to wait for, which is important if jobs
1865+ * overrun. If we just call sys_sleep_next_period() then we
1866+ * will unintentionally miss jobs after an overrun.
1867+ *
1868+ * Increase this sequence number when a job is released.
1869+ */
1870+ unsigned int job_no;
1871+} rt_times_t;
1872+
1873+
1874+/* RT task parameters for scheduling extensions
1875+ * These parameters are inherited during clone and therefore must
1876+ * be explicitly set up before the task set is launched.
1877+ */
1878+typedef struct task_rt_param {
1879+ /* is the task sleeping? */
1880+ unsigned int flags:8;
1881+
1882+ /* Real-time marker: 1 iff it is a LITMUS real-time task.
1883+ */
1884+ unsigned int is_realtime:1;
1885+
1886+ /* is a BE->RT or RT->BE transition pending? */
1887+ unsigned int transition_pending:1;
1888+
1889+ /* is this task under control of litmus?
1890+ *
1891+ * this is necessary because otherwise signal delivery code
1892+ * may try to wake up a task that is already queued in plugin
1893+ * data structures.
1894+ */
1895+ unsigned int litmus_controlled:1;
1896+
1897+ /* Did this task register any SRP controlled resource accesses?
1898+ * This, of course, should only ever be true under partitioning.
1899+ * However, this limitation is not currently enforced.
1900+ */
1901+ unsigned int subject_to_srp:1;
1902+
1903+ /* if a BE->RT transition failed, then this field contains the error */
1904+ unsigned long transition_error;
1905+
1906+ /* user controlled parameters */
1907+ rt_param_t basic_params;
1908+
1909+ /* task representing the current "inherited" task
1910+ * priority, assigned by inherit_priority and
1911+	 * return_priority in the scheduler plugins.
1912+	 * Could point to self if PI does not result in
1913+ * an increased task priority.
1914+ */
1915+ struct task_struct* inh_task;
1916+
1917+ /* Don't just dereference this pointer in kernel space!
1918+ * It might very well point to junk or nothing at all.
1919+ * NULL indicates that the task has not requested any non-preemptable
1920+ * section support.
1921+ * Not inherited upon fork.
1922+ */
1923+ __user short* np_flag;
1924+
1925+ /* For the FMLP under PSN-EDF, it is required to make the task
1926+ * non-preemptive from kernel space. In order not to interfere with
1927+ * user space, this counter indicates the kernel space np setting.
1928+ * kernel_np > 0 => task is non-preemptive
1929+ */
1930+ unsigned int kernel_np;
1931+
1932+ /* timing parameters */
1933+ rt_times_t times;
1934+
1935+ /* This is currently only used by the PFAIR code
1936+ * and a prime candidate for cleanup.
1937+ */
1938+ rt_times_t backup;
1939+
1940+ /* This field can be used by plugins to store where the task
1941+ * is currently scheduled. It is the responsibility of the
1942+ * plugin to avoid race conditions.
1943+ *
1944+ * Used by GSN-EDF.
1945+ */
1946+ int scheduled_on;
1947+
1948+ /* This field can be used by plugins to store where the task
1949+ * is currently linked. It is the responsibility of the plugin
1950+ * to avoid race conditions.
1951+ *
1952+ * Used by GSN-EDF.
1953+ */
1954+ int linked_on;
1955+
1956+ /* Adaptive support. Adaptive tasks will store service levels
1957+ * in this (dynamically allocated) structure.
1958+ */
1959+ service_level_t* service_level;
1960+ unsigned int no_service_levels;
1961+ unsigned int cur_service_level;
1962+
1963+ /* Adaptive support. Store state for weight estimation.
1964+ */
1965+ predictor_state_t predictor_state;
1966+
1967+ /* Adaptive support. Optimizer fields.
1968+ */
1969+ struct list_head opt_list;
1970+ fp_t opt_order;
1971+ fp_t opt_dw;
1972+ fp_t opt_nw;
1973+ unsigned int opt_level;
1974+ jiffie_t opt_change;
1975+
1976+ /* Fields saved before BE->RT transition.
1977+ */
1978+ int old_policy;
1979+ int old_prio;
1980+} task_rt_param_t;
1981+
1982+/* Possible RT flags */
1983+#define RT_F_RUNNING 0x00000000
1984+#define RT_F_SLEEP 0x00000001
1985+#define RT_F_EXP_QUANTA 0x00000002
1986+#define RT_F_NON_PREEMTABLE 0x00000004
1987+#define RT_F_EXIT_SEM 0x00000008
1988+
1989+#define is_realtime(t) ((t)->rt_param.is_realtime)
1990+#define rt_transition_pending(t) \
1991+ ((t)->rt_param.transition_pending)
1992+
1993+/* Realtime utility macros */
1994+#define get_passed_quanta(t) ((t)->rt_param.times.exec_time)
1995+#define inc_passed_quanta(t) ((t)->rt_param.times.exec_time += 1)
1996+#define get_rt_flags(t) ((t)->rt_param.flags)
1997+#define set_rt_flags(t,f) (t)->rt_param.flags=(f)
1998+#define get_exec_cost(t) ((t)->rt_param.basic_params.exec_cost)
1999+#define get_rt_period(t) ((t)->rt_param.basic_params.period)
2000+#define set_rt_period(t,p) (t)->rt_param.basic_params.period=(p)
2001+#define set_exec_cost(t,e) (t)->rt_param.basic_params.exec_cost=(e)
2002+#define get_partition(t) (t)->rt_param.basic_params.cpu
2003+#define get_deadline(t) ((t)->rt_param.times.deadline)
2004+#define get_last_release(t) ((t)->rt_param.times.last_release)
2005+#define get_class(t) ((t)->rt_param.basic_params.class)
2006+
2007+#define has_active_job(t) \
2008+ (time_before(get_last_release(t), jiffies) \
2009+ && time_before_eq(jiffies, get_deadline(t)))
2010+
2011+#define get_est_weight(t) ((t)->rt_param.predictor_state.estimate)
2012+#define get_sl(t, l) \
2013+ ((t)->rt_param.service_level[l])
2014+#define get_cur_sl(t) ((t)->rt_param.cur_service_level)
2015+#define get_max_sl(t) ((t)->rt_param.no_service_levels - 1)
2016+#define get_opt_sl(t) ((t)->rt_param.opt_level)
2017+
2018+
2019+#define is_subject_to_srp(t) ((t)->rt_param.subject_to_srp)
2020+#define is_hrt(t) \
2021+ ((t)->rt_param.basic_params.class == RT_CLASS_HARD)
2022+#define is_srt(t) \
2023+ ((t)->rt_param.basic_params.class == RT_CLASS_SOFT)
2024+#define is_be(t) \
2025+ ((t)->rt_param.basic_params.class == RT_CLASS_BEST_EFFORT)
2026+
2027+#define clear_rt_params(t) \
2028+memset(&(t)->rt_param,0, sizeof(struct task_rt_param))
2029+
2030+#define get_release(t) ((t)->rt_param.times.release)
2031+#define set_release(t,r) ((t)->rt_param.times.release=(r))
2032+
2033+/* honor the flag that is set when scheduling is in progress
2034+ * This is some dirty hack in Linux that creates race conditions in our code
2035+ * if we don't pay attention to it.
2036+ */
2037+#define is_running(t) \
2038+ ((t)->state == TASK_RUNNING || \
2039+ (t)->thread_info->preempt_count & PREEMPT_ACTIVE)
2040+
2041+#define is_blocked(t) (!is_running(t))
2042+#define is_released(t) (time_before_eq((t)->rt_param.times.release, jiffies))
2043+#define is_tardy(t) (time_before_eq((t)->rt_param.times.deadline, jiffies))
2044+#define task_slack(t) ( (int) (t)->rt_param.times.deadline - (int) jiffies - \
2045+ (int) ((t)->rt_param.basic_params.exec_cost - \
2046+ (t)->rt_param.times.exec_time))
2047+
2048+
2049+/* real-time comparison macros */
2050+#define earlier_deadline(a, b) (time_before(\
2051+ (a)->rt_param.times.deadline,\
2052+ (b)->rt_param.times.deadline))
2053+#define earlier_release(a, b) (time_before(\
2054+ (a)->rt_param.times.release,\
2055+ (b)->rt_param.times.release))
2056+
2057+#define earlier_last_release(a, b) (time_before(\
2058+ (a)->rt_param.times.last_release,\
2059+ (b)->rt_param.times.last_release))
2060+
2061+
2062+#define make_np(t) do {t->rt_param.kernel_np++;} while(0);
2063+#define take_np(t) do {t->rt_param.kernel_np--;} while(0);
2064+
2065+#define backup_times(t) do { (t)->rt_param.backup=(t)->rt_param.times; \
2066+ } while(0);
2067+#define restore_times(t) do { (t)->rt_param.times=(t)->rt_param.backup; \
2068+ } while(0);
2069+
2070+
2071+#define rt_list2task(p) list_entry(p, struct task_struct, rt_list)
2072+
2073+#endif
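Editorial aside, not part of the patch: a minimal sketch of how the accessor macros above are meant to be combined in plugin code. It assumes the usual <linux/sched.h> context; edf_prefer() and job_in_trouble() are hypothetical helper names, not symbols from this patch.

/* Return the task that should run first under plain EDF rules.
 * a must be a real-time task; b may be NULL or a non-rt task.
 */
static inline struct task_struct* edf_prefer(struct task_struct* a,
					     struct task_struct* b)
{
	if (!b || !is_realtime(b) || earlier_deadline(a, b))
		return a;
	return b;
}

/* A released job needs attention if it is already tardy or has no slack left. */
static inline int job_in_trouble(struct task_struct* t)
{
	return is_released(t) && (is_tardy(t) || task_slack(t) <= 0);
}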
2074diff --git a/include/linux/sched.h b/include/linux/sched.h
2075index 4463735..f590e28 100644
2076--- a/include/linux/sched.h
2077+++ b/include/linux/sched.h
2078@@ -3,6 +3,8 @@
2079
2080 #include <linux/auxvec.h> /* For AT_VECTOR_SIZE */
2081
2082+#include <linux/rt_param.h>
2083+
2084 /*
2085 * cloning flags:
2086 */
2087@@ -796,6 +798,9 @@ enum sleep_type {
2088 SLEEP_INTERRUPTED,
2089 };
2090
2091+struct od_table_entry;
2092+struct ics_cb;
2093+
2094 struct prio_array;
2095
2096 struct task_struct {
2097@@ -1051,6 +1056,15 @@ struct task_struct {
2098 #ifdef CONFIG_FAULT_INJECTION
2099 int make_it_fail;
2100 #endif
2101+ /* litmus parameters and state */
2102+ task_rt_param_t rt_param;
2103+
2104+ /* allow scheduler plugins to queue in release lists, etc. */
2105+ struct list_head rt_list;
2106+
2107+ /* references to PI semaphores, etc. */
2108+ struct od_table_entry* od_table;
2109+ struct ics_cb* ics_cb;
2110 };
2111
2112 static inline pid_t process_group(struct task_struct *tsk)
2113diff --git a/include/linux/sched_plugin.h b/include/linux/sched_plugin.h
2114new file mode 100644
2115index 0000000..e22722c
2116--- /dev/null
2117+++ b/include/linux/sched_plugin.h
2118@@ -0,0 +1,147 @@
2119+/*
2120+ * Definition of the scheduler plugin interface.
2121+ *
2122+ */
2123+#ifndef _LINUX_SCHED_PLUGIN_H_
2124+#define _LINUX_SCHED_PLUGIN_H_
2125+
2126+#include <linux/sched.h>
2127+
2128+/* struct for semaphore with priority inheritance */
2129+struct pi_semaphore {
2130+ atomic_t count;
2131+ int sleepers;
2132+ wait_queue_head_t wait;
2133+ union {
2134+ /* highest-prio holder/waiter */
2135+ struct task_struct *task;
2136+ struct task_struct* cpu_task[NR_CPUS];
2137+ } hp;
2138+ /* current lock holder */
2139+ struct task_struct *holder;
2140+};
2141+
2142+
2143+/* Enforce runqueues to be opaque objects.
2144+ *
2145+ * This allows us to pass around pointers to runqueues,
2146+ * without actually having to rip it out of sched.c. It
2147+ * also discourages plugins from trying to be
2148+ * overly clever.
2149+ */
2150+typedef void runqueue_t;
2151+
2152+
2153+/********************* scheduler invocation ******************/
2154+
2155+typedef enum {
2156+ NO_RESCHED = 0,
2157+ FORCE_RESCHED = 1
2158+} reschedule_check_t;
2159+
2160+
2161+/* Plugin-specific realtime tick handler */
2162+typedef reschedule_check_t (*scheduler_tick_t) (void);
2163+/* Novel make-scheduling-decision function */
2164+typedef int (*schedule_t) (struct task_struct * prev,
2165+ struct task_struct ** next,
2166+ runqueue_t * rq);
2167+/* Clean up after the task switch has occurred.
2168+ * This function is called after every (even non-rt) task switch.
2169+ */
2170+typedef void (*finish_switch_t)(struct task_struct *prev);
2171+
2172+
2173+/********************* task state changes ********************/
2174+
2175+/* called to setup a new real-time task */
2176+typedef long (*prepare_task_t) (struct task_struct *task);
2177+/* called to re-introduce a task after blocking */
2178+typedef void (*wake_up_task_t) (struct task_struct *task);
2179+/* called to notify the plugin of a blocking real-time task
2180+ * it will only be called for real-time tasks and before schedule is called */
2181+typedef void (*task_blocks_t) (struct task_struct *task);
2182+/* called when a real-time task exits. Free any allocated resources */
2183+typedef long (*tear_down_t) (struct task_struct *);
2184+
2185+/* Called when new_owner is released from the wait queue.
2186+ * It should inherit the priority from sem _before_ it is re-added
2187+ * to any queue.
2188+ */
2189+typedef long (*inherit_priority_t) (struct pi_semaphore *sem,
2190+ struct task_struct *new_owner);
2191+
2192+/* Called when the current task releases a semaphore from which it
2193+ * might have inherited a priority
2194+ */
2195+typedef long (*return_priority_t) (struct pi_semaphore *sem);
2196+
2197+/* Called when a task tries to acquire a semaphore and fails. Check if its
2198+ * priority is higher than that of the current holder.
2199+ */
2200+typedef long (*pi_block_t) (struct pi_semaphore *sem, struct task_struct *t);
2201+
2202+
2203+/********************* sys call backends ********************/
2204+/* This function causes the caller to sleep until the next release */
2205+typedef long (*sleep_next_period_t) (void);
2206+
2207+typedef int (*scheduler_setup_t) (int cmd, void __user *parameter);
2208+
2209+typedef int (*mode_change_t) (int);
2210+
2211+struct sched_plugin {
2212+ /* basic info */
2213+ char *plugin_name;
2214+ int ready_to_use;
2215+
2216+ /* management interface */
2217+ mode_change_t mode_change;
2218+
2219+ /* scheduler invocation */
2220+ scheduler_tick_t scheduler_tick;
2221+ schedule_t schedule;
2222+ finish_switch_t finish_switch;
2223+
2224+ /* syscall backend */
2225+ sleep_next_period_t sleep_next_period;
2226+ scheduler_setup_t scheduler_setup;
2227+
2228+ /* task state changes */
2229+ prepare_task_t prepare_task;
2230+ wake_up_task_t wake_up_task;
2231+ task_blocks_t task_blocks;
2232+ tear_down_t tear_down;
2233+
2234+ /* priority inheritance */
2235+ inherit_priority_t inherit_priority;
2236+ return_priority_t return_priority;
2237+ pi_block_t pi_block;
2238+} __attribute__ ((__aligned__(SMP_CACHE_BYTES)));
2239+
2240+typedef struct sched_plugin sched_plugin_t;
2241+
2242+extern sched_plugin_t *curr_sched_plugin;
2243+
2244+
2245+/* common scheduler tick */
2246+reschedule_check_t rt_scheduler_tick(void);
2247+
2248+
2249+/* Don't pull in our definitions on top of the real ones
2250+ * in sched.c!
2251+ */
2252+#ifndef __SCHED_C__
2253+
2254+/* External linux scheduler facilities */
2255+void deactivate_task(struct task_struct *, runqueue_t *);
2256+/* This function is defined in sched.c. We need access to it for
2257+ * indirect switching.
2258+ */
2259+void __activate_task(struct task_struct *, runqueue_t *);
2260+void __setscheduler(struct task_struct *, int, int);
2261+
2262+#endif
2263+
2264+extern int get_sched_options(void);
2265+#endif
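For orientation only (this skeleton is not part of the patch; the real plugins live in kernel/sched_*.c): a minimal plugin instance wired into the interface above could look as follows. The zero return values are placeholders, since the exact return conventions are plugin-specific and not documented in this header.

static reschedule_check_t demo_scheduler_tick(void)
{
	return NO_RESCHED;	/* never request a forced reschedule */
}

static int demo_schedule(struct task_struct *prev,
			 struct task_struct **next,
			 runqueue_t *rq)
{
	*next = NULL;		/* never pick a real-time task */
	return 0;
}

static long demo_prepare_task(struct task_struct *t)
{
	return 0;
}

static long demo_tear_down(struct task_struct *t)
{
	return 0;
}

sched_plugin_t demo_plugin = {
	.plugin_name	= "DEMO",
	.ready_to_use	= 1,
	.scheduler_tick	= demo_scheduler_tick,
	.schedule	= demo_schedule,
	.prepare_task	= demo_prepare_task,
	.tear_down	= demo_tear_down,
	/* callbacks that are not supplied remain NULL */
};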
2266diff --git a/include/linux/sched_trace.h b/include/linux/sched_trace.h
2267new file mode 100644
2268index 0000000..01f21c6
2269--- /dev/null
2270+++ b/include/linux/sched_trace.h
2271@@ -0,0 +1,182 @@
2272+/* sched_trace.h -- record scheduler events to a byte stream for offline analysis.
2273+ */
2274+#ifndef _LINUX_SCHED_TRACE_H_
2275+#define _LINUX_SCHED_TRACE_H_
2276+
2277+#include <linux/sched.h>
2278+
2279+typedef enum {
2280+ ST_INVOCATION = 0,
2281+ ST_ARRIVAL = 1,
2282+ ST_DEPARTURE = 2,
2283+ ST_PREEMPTION = 3,
2284+ ST_SCHEDULED = 4,
2285+ ST_JOB_RELEASE = 5,
2286+ ST_JOB_COMPLETION = 6,
2287+ ST_CAPACITY_RELEASE = 7,
2288+ ST_CAPACITY_ALLOCATION = 8,
2289+ ST_SERVICE_LEVEL_CHANGE = 9,
2290+ ST_WEIGHT_ERROR = 10,
2291+} trace_type_t;
2292+
2293+typedef struct {
2294+ trace_type_t trace:8;
2295+ unsigned int size:24;
2296+ unsigned long long timestamp;
2297+} trace_header_t;
2298+
2299+
2300+typedef struct {
2301+ unsigned int is_rt:1;
2302+ unsigned int is_server:1;
2303+ task_class_t class:4;
2304+ unsigned int budget:24;
2305+ u32 deadline;
2306+
2307+ pid_t pid;
2308+} task_info_t;
2309+
2310+typedef struct {
2311+ trace_header_t header;
2312+ unsigned long flags;
2313+} invocation_record_t;
2314+
2315+typedef struct {
2316+ trace_header_t header;
2317+ task_info_t task;
2318+} arrival_record_t;
2319+
2320+typedef struct {
2321+ trace_header_t header;
2322+ task_info_t task;
2323+} departure_record_t;
2324+
2325+typedef struct {
2326+ trace_header_t header;
2327+ task_info_t task;
2328+ task_info_t by;
2329+} preemption_record_t;
2330+
2331+typedef struct {
2332+ trace_header_t header;
2333+ task_info_t task;
2334+} scheduled_record_t;
2335+
2336+typedef struct {
2337+ trace_header_t header;
2338+ task_info_t task;
2339+ u16 period;
2340+ u16 wcet;
2341+} release_record_t;
2342+
2343+typedef struct {
2344+ trace_header_t header;
2345+ task_info_t task;
2346+ u16 period;
2347+ u16 wcet;
2348+ int tardiness;
2349+ unsigned int job_no;
2350+} completion_record_t;
2351+
2352+typedef struct {
2353+ trace_header_t header;
2354+ task_info_t task;
2355+} cap_release_record_t;
2356+
2357+typedef struct {
2358+ trace_header_t header;
2359+ task_info_t task;
2360+ u16 budget;
2361+ u32 deadline;
2362+ pid_t donor;
2363+} cap_allocation_record_t;
2364+
2365+typedef struct {
2366+ trace_header_t header;
2367+ task_info_t task;
2368+ unsigned int from:16;
2369+ unsigned int to:16;
2370+ service_level_t new_level;
2371+ service_level_t old_level;
2372+} service_level_change_record_t;
2373+
2374+typedef struct {
2375+ trace_header_t header;
2376+ pid_t task;
2377+ fp_t estimate;
2378+ fp_t actual;
2379+} weight_error_record_t;
2380+
2381+#ifdef CONFIG_SCHED_TASK_TRACE
2382+void sched_trace_scheduler_invocation(void);
2383+
2384+void sched_trace_task_arrival(struct task_struct *t);
2385+void sched_trace_task_departure(struct task_struct *t);
2386+void sched_trace_task_preemption(struct task_struct *t,
2387+ struct task_struct* by);
2388+void sched_trace_task_scheduled(struct task_struct *);
2389+
2390+void sched_trace_job_release(struct task_struct *t);
2391+void sched_trace_job_completion(struct task_struct *t);
2392+
2393+void sched_trace_capacity_release(struct task_struct *t);
2394+void sched_trace_capacity_allocation(struct task_struct *t,
2395+ u16 budget, u32 deadline, pid_t donor);
2396+
2397+void sched_trace_capacity_alloc_srv(pid_t srv, u32 srv_dl, task_class_t cls,
2398+ u16 srv_budget,
2399+ u16 budget, u32 deadline, pid_t donor);
2400+
2401+void sched_trace_server_release(int id, unsigned int wcet,
2402+ unsigned int period,
2403+ task_class_t class);
2404+
2405+void sched_trace_server_completion(int id, unsigned int budget,
2406+ jiffie_t deadline,
2407+ task_class_t class);
2408+
2409+void sched_trace_server_scheduled(int id, task_class_t class,
2410+ unsigned int budget, jiffie_t deadline);
2411+
2412+void sched_trace_service_level_change(struct task_struct* t,
2413+ unsigned int from,
2414+ unsigned int to);
2415+
2416+void sched_trace_weight_error(struct task_struct* t, fp_t actual);
2417+
2418+#else
2419+#define sched_trace_scheduler_invocation(x)
2420+
2421+#define sched_trace_task_arrival(t)
2422+#define sched_trace_task_departure(t)
2423+#define sched_trace_task_preemption(t, by)
2424+#define sched_trace_task_scheduled(t)
2425+#define sched_trace_job_release(t)
2426+#define sched_trace_job_completion(t)
2427+#define sched_trace_capacity_release(t)
2428+#define sched_trace_capacity_allocation(t, budget, deadline, donor)
2429+#define sched_trace_capacity_alloc_srv(srv, srv_dl, cls, srv_budget,\
2430+ budget, deadline, donor)
2431+#define sched_trace_server_release(id, wcet, period, class)
2432+#define sched_trace_server_completion(id, budget, deadline, class)
2433+#define sched_trace_server_scheduled(id, class, budget, deadline)
2434+
2435+#define sched_trace_service_level_change(t, a, b)
2436+
2437+#define sched_trace_weight_error(x, y)
2438+
2439+
2440+#endif
2441+
2442+
2443+#ifdef CONFIG_SCHED_DEBUG_TRACE
2444+void sched_trace_log_message(const char* fmt, ...);
2445+
2446+#else
2447+
2448+#define sched_trace_log_message(fmt, ...)
2449+
2450+#endif
2451+
2452+
2453+#endif
2454diff --git a/include/linux/trace.h b/include/linux/trace.h
2455new file mode 100644
2456index 0000000..9e457aa
2457--- /dev/null
2458+++ b/include/linux/trace.h
2459@@ -0,0 +1,74 @@
2460+
2461+#ifndef _SYS_TRACE_H_
2462+#define _SYS_TRACE_H_
2463+
2464+#include <linux/feather_trace.h>
2465+#include <linux/feather_buffer.h>
2466+
2467+
2468+/*********************** TIMESTAMPS ************************/
2469+
2470+struct timestamp {
2471+ unsigned long event;
2472+ unsigned long long timestamp;
2473+ unsigned int seq_no;
2474+ int cpu;
2475+};
2476+
2477+
2478+/* buffer holding time stamps - will be provided by driver */
2479+extern struct ft_buffer* trace_ts_buf;
2480+
2481+/* save_timestamp: stores current time as struct timestamp
2482+ * in trace_ts_buf
2483+ */
2484+asmlinkage void save_timestamp(unsigned long event);
2485+
2486+#define TIMESTAMP(id) ft_event0(id, save_timestamp)
2487+
2488+/* Convention for timestamps
2489+ * =========================
2490+ *
2491+ * In order to process the trace files with a common tool, we use the following
2492+ * convention to measure execution times: The end time id of a code segment is
2493+ * always the next number after the start time event id.
2494+ */
2495+
2496+#define TS_SCHED_START TIMESTAMP(100)
2497+#define TS_SCHED_END TIMESTAMP(101)
2498+#define TS_CXS_START TIMESTAMP(102)
2499+#define TS_CXS_END TIMESTAMP(103)
2500+
2501+#define TS_TICK_START TIMESTAMP(110)
2502+#define TS_TICK_END TIMESTAMP(111)
2503+
2504+#define TS_PLUGIN_SCHED_START TIMESTAMP(120)
2505+#define TS_PLUGIN_SCHED_END TIMESTAMP(121)
2506+
2507+#define TS_PLUGIN_TICK_START TIMESTAMP(130)
2508+#define TS_PLUGIN_TICK_END TIMESTAMP(131)
2509+
2510+#define TS_ENTER_NP_START TIMESTAMP(140)
2511+#define TS_ENTER_NP_END TIMESTAMP(141)
2512+
2513+#define TS_EXIT_NP_START TIMESTAMP(150)
2514+#define TS_EXIT_NP_END TIMESTAMP(151)
2515+
2516+#define TS_SRP_UP_START TIMESTAMP(160)
2517+#define TS_SRP_UP_END TIMESTAMP(161)
2518+#define TS_SRP_DOWN_START TIMESTAMP(162)
2519+#define TS_SRP_DOWN_END TIMESTAMP(163)
2520+
2521+#define TS_PI_UP_START TIMESTAMP(170)
2522+#define TS_PI_UP_END TIMESTAMP(171)
2523+#define TS_PI_DOWN_START TIMESTAMP(172)
2524+#define TS_PI_DOWN_END TIMESTAMP(173)
2525+
2526+#define TS_FIFO_UP_START TIMESTAMP(180)
2527+#define TS_FIFO_UP_END TIMESTAMP(181)
2528+#define TS_FIFO_DOWN_START TIMESTAMP(182)
2529+#define TS_FIFO_DOWN_END TIMESTAMP(183)
2530+
2531+
2532+
2533+#endif /* !_SYS_TRACE_H_ */
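To illustrate the convention above (editorial sketch; the event IDs 190/191 are invented here and do not occur in the patch), a new measurement pair is simply two consecutive IDs wrapped around the code segment of interest:

#define TS_DEMO_START	TIMESTAMP(190)
#define TS_DEMO_END	TIMESTAMP(191)	/* end id = start id + 1 */

static void demo_measured_section(void)
{
	TS_DEMO_START;
	/* ... the code segment whose execution cost is being measured ... */
	TS_DEMO_END;
}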
2534diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
2535index 975c963..6ae0ff9 100644
2536--- a/include/linux/uaccess.h
2537+++ b/include/linux/uaccess.h
2538@@ -84,4 +84,20 @@ static inline unsigned long __copy_from_user_nocache(void *to,
2539 ret; \
2540 })
2541
2542+/* This is a naive attempt at a write version of the above native Linux macro.
2543+ */
2544+#define poke_kernel_address(val, addr) \
2545+ ({ \
2546+ long ret; \
2547+ mm_segment_t old_fs = get_fs(); \
2548+ \
2549+ set_fs(KERNEL_DS); \
2550+ pagefault_disable(); \
2551+ ret = __put_user(val, (__force typeof(val) __user *)(addr)); \
2552+ pagefault_enable(); \
2553+ set_fs(old_fs); \
2554+ ret; \
2555+ })
2556+
2557+
2558 #endif /* __LINUX_UACCESS_H__ */
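A sketch of the intended use of poke_kernel_address(), mirroring the probe performed by sys_register_np_flag() later in this patch (the helper name is made up; RT_PREEMPTIVE is defined elsewhere in the patch):

/* Check whether a user-supplied np-flag address is writable without
 * risking an unhandled page fault.
 */
static int np_flag_writable(short __user *flag)
{
	short test_val = RT_PREEMPTIVE;

	return poke_kernel_address(test_val, flag) == 0;
}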
2559diff --git a/include/linux/wait.h b/include/linux/wait.h
2560index e820d00..c7e96b6 100644
2561--- a/include/linux/wait.h
2562+++ b/include/linux/wait.h
2563@@ -161,6 +161,8 @@ wait_queue_head_t *FASTCALL(bit_waitqueue(void *, int));
2564 #define wake_up_locked(x) __wake_up_locked((x), TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE)
2565 #define wake_up_interruptible_sync(x) __wake_up_sync((x),TASK_INTERRUPTIBLE, 1)
2566
2567+#define pi_wake_up(x) __pi_wake_up(x, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1, NULL)
2568+
2569 #define __wait_event(wq, condition) \
2570 do { \
2571 DEFINE_WAIT(__wait); \
2572diff --git a/kernel/Makefile b/kernel/Makefile
2573index 14f4d45..c3d8b0d 100644
2574--- a/kernel/Makefile
2575+++ b/kernel/Makefile
2576@@ -8,7 +8,13 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
2577 signal.o sys.o kmod.o workqueue.o pid.o \
2578 rcupdate.o extable.o params.o posix-timers.o \
2579 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
2580- hrtimer.o rwsem.o latency.o nsproxy.o srcu.o
2581+ hrtimer.o rwsem.o latency.o nsproxy.o srcu.o \
2582+ sched_plugin.o litmus.o sched_trace.o \
2583+ edf_common.o fifo_common.o pfair_common.o\
2584+ sched_global_edf.o sched_part_edf.o sched_edf_hsb.o sched_pfair.o \
2585+ sched_gsn_edf.o sched_psn_edf.o litmus_sem.o \
2586+ trace.o ft_event.o rt_domain.o sched_adaptive.o \
2587+ ics.o
2588
2589 obj-$(CONFIG_STACKTRACE) += stacktrace.o
2590 obj-y += time/
2591diff --git a/kernel/edf_common.c b/kernel/edf_common.c
2592new file mode 100644
2593index 0000000..4746c66
2594--- /dev/null
2595+++ b/kernel/edf_common.c
2596@@ -0,0 +1,135 @@
2597+/*
2598+ * kernel/edf_common.c
2599+ *
2600+ * Common functions for EDF based scheduler.
2601+ */
2602+
2603+#include <linux/percpu.h>
2604+#include <linux/sched.h>
2605+#include <linux/list.h>
2606+
2607+#include <linux/litmus.h>
2608+#include <linux/sched_plugin.h>
2609+#include <linux/sched_trace.h>
2610+
2611+
2612+#include <linux/edf_common.h>
2613+
2614+/* edf_higher_prio - returns true if first has a higher EDF priority
2615+ * than second. Deadline ties are broken by PID.
2616+ *
2617+ * first must not be NULL and must be a real-time task.
2618+ * second may be NULL or a non-rt task.
2619+ */
2620+int edf_higher_prio(struct task_struct* first,
2621+ struct task_struct* second)
2622+{
2623+ struct task_struct *first_task = first;
2624+ struct task_struct *second_task = second;
2625+
2626+ /* Check for inherited priorities. Change task
2627+ * used for comparison in such a case.
2628+ */
2629+ if (first && first->rt_param.inh_task)
2630+ first_task = first->rt_param.inh_task;
2631+ if (second && second->rt_param.inh_task)
2632+ second_task = second->rt_param.inh_task;
2633+
2634+ return
2635+ /* does the second task exist and is it a real-time task? If
2636+ * not, the first task (which is a RT task) has higher
2637+ * priority.
2638+ */
2639+ !second_task || !is_realtime(second_task) ||
2640+
2641+ /* is the deadline of the first task earlier?
2642+ * Then it has higher priority.
2643+ */
2644+ earlier_deadline(first_task, second_task) ||
2645+
2646+ /* Do we have a deadline tie?
2647+ * Then break by PID.
2648+ */
2649+ (get_deadline(first_task) == get_deadline(second_task) &&
2650+ (first_task->pid < second_task->pid ||
2651+
2652+ /* If the PIDs are the same then the task with the inherited
2653+ * priority wins.
2654+ */
2655+ (first_task->pid == second_task->pid &&
2656+ !second->rt_param.inh_task)));
2657+}
2658+
2659+int edf_ready_order(struct list_head* a, struct list_head* b)
2660+{
2661+ return edf_higher_prio(
2662+ list_entry(a, struct task_struct, rt_list),
2663+ list_entry(b, struct task_struct, rt_list));
2664+}
2665+
2666+void edf_release_at(struct task_struct *t, jiffie_t start)
2667+{
2668+ t->rt_param.times.deadline = start;
2669+ edf_prepare_for_next_period(t);
2670+ t->rt_param.times.last_release = start;
2671+ set_rt_flags(t, RT_F_RUNNING);
2672+}
2673+
2674+void edf_domain_init(rt_domain_t* rt, check_resched_needed_t resched)
2675+{
2676+ rt_domain_init(rt, resched, edf_ready_order);
2677+}
2678+
2679+void edf_prepare_for_next_period(struct task_struct *t)
2680+{
2681+ BUG_ON(!t);
2682+ /* prepare next release */
2683+ t->rt_param.times.release = t->rt_param.times.deadline;
2684+ t->rt_param.times.deadline += get_rt_period(t);
2685+ t->rt_param.times.exec_time = 0;
2686+ /* update job sequence number */
2687+ t->rt_param.times.job_no++;
2688+
2689+ t->time_slice = get_exec_cost(t);
2690+
2691+ /* who uses this? statistics? */
2692+ t->first_time_slice = 0;
2693+}
2694+
2695+/* edf_preemption_needed - check whether the task t needs to be preempted
2696+ * call only with irqs disabled and with ready_lock acquired
2697+ * THIS DOES NOT TAKE NON-PREEMPTIVE SECTIONS INTO ACCOUNT!
2698+ */
2699+int edf_preemption_needed(rt_domain_t* rt, struct task_struct *t)
2700+{
2701+ /* we need the read lock for edf_ready_queue */
2702+ /* no need to preempt if there is nothing pending */
2703+ if (!ready_jobs_pending(rt))
2704+ return 0;
2705+ /* we need to reschedule if t doesn't exist */
2706+ if (!t)
2707+ return 1;
2708+
2709+ /* NOTE: We cannot check for non-preemptibility since we
2710+ * don't know what address space we're currently in.
2711+ */
2712+
2713+ /* make sure to get non-rt stuff out of the way */
2714+ return !is_realtime(t) || edf_higher_prio(next_ready(rt), t);
2715+}
2716+
2717+
2718+/*
2719+ * Deactivate current task until the beginning of the next period.
2720+ */
2721+long edf_sleep_next_period(void)
2722+{
2723+	/* Mark that we do not execute anymore */
2724+ set_rt_flags(current, RT_F_SLEEP);
2725+	/* call schedule; this will return when a new job arrives.
2726+	 * It also takes care of preparing for the next release.
2727+ */
2728+ schedule();
2729+ return 0;
2730+}
2731+
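A sketch of how a plugin is expected to drive these helpers from its tick handler (illustrative only; demo_domain and demo_scheduled are placeholders, and the locking requirement stated above edf_preemption_needed() is assumed to be satisfied by the caller):

static rt_domain_t demo_domain;
static struct task_struct* demo_scheduled;	/* task currently on this CPU */

static reschedule_check_t demo_edf_tick(void)
{
	/* assumes irqs are off and the domain's ready lock is held */
	if (edf_preemption_needed(&demo_domain, demo_scheduled))
		return FORCE_RESCHED;
	return NO_RESCHED;
}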
2732diff --git a/kernel/exit.c b/kernel/exit.c
2733index fec12eb..8a0eb79 100644
2734--- a/kernel/exit.c
2735+++ b/kernel/exit.c
2736@@ -50,6 +50,8 @@
2737
2738 extern void sem_exit (void);
2739
2740+extern void exit_od_table(struct task_struct* t);
2741+
2742 static void exit_mm(struct task_struct * tsk);
2743
2744 static void __unhash_process(struct task_struct *p)
2745@@ -916,6 +918,8 @@ fastcall NORET_TYPE void do_exit(long code)
2746 if (unlikely(tsk->audit_context))
2747 audit_free(tsk);
2748
2749+ exit_od_table(tsk);
2750+
2751 taskstats_exit(tsk, group_dead);
2752
2753 exit_mm(tsk);
2754diff --git a/kernel/fifo_common.c b/kernel/fifo_common.c
2755new file mode 100644
2756index 0000000..c1641a1
2757--- /dev/null
2758+++ b/kernel/fifo_common.c
2759@@ -0,0 +1,86 @@
2760+/*
2761+ * kernel/fifo_common.c
2762+ *
2763+ * Fifo helper functions. Could one day be a FIFO plugin if someone
2764+ * is interested.
2765+ *
2766+ * The current FIFO implementation automatically chops Linux tasks into
2767+ * smaller jobs by assigning a fixed time slice. Once that time slice expires,
2768+ * it is treated as a new job release (that is queued in the back).
2769+ *
2770+ * The result is that it provides FIFO properties on a job level and round-robin
2771+ * on a task level if the tasks execute continuously.
2772+ */
2773+
2774+#include <asm/uaccess.h>
2775+#include <linux/percpu.h>
2776+#include <linux/sched.h>
2777+#include <linux/list.h>
2778+
2779+#include <linux/litmus.h>
2780+#include <linux/sched_plugin.h>
2781+#include <linux/sched_trace.h>
2782+#include <linux/fifo_common.h>
2783+
2784+/* This function is defined in sched.c. We need access to it for
2785+ * indirect switching.
2786+ */
2787+void __activate_task(struct task_struct *p, runqueue_t *rq);
2788+
2789+/* fifo_higher_prio - returns true if first has a higher FIFO priority
2790+ * than second. Release time ties are broken by PID.
2791+ *
2792+ * first must not be NULL and must be a real-time task.
2793+ * second may be NULL or a non-rt task.
2794+ */
2795+int fifo_higher_prio(struct task_struct* first,
2796+ struct task_struct* second)
2797+{
2798+ struct task_struct *first_task = first;
2799+ struct task_struct *second_task = second;
2800+
2801+ /* Check for inherited priorities. Change task
2802+ * used for comparison in such a case.
2803+ */
2804+ if (first && first->rt_param.inh_task)
2805+ first_task = first->rt_param.inh_task;
2806+ if (second && second->rt_param.inh_task)
2807+ second_task = second->rt_param.inh_task;
2808+
2809+ return
2810+ /* does the second task exist and is it a real-time task? If
2811+ * not, the first task (which is a RT task) has higher
2812+ * priority.
2813+ */
2814+ !second_task || !is_realtime(second_task) ||
2815+
2816+ /* is the release of the first task earlier?
2817+ * Then it has higher priority.
2818+ */
2819+ earlier_last_release(first_task, second_task) ||
2820+
2821+ /* Do we have a release time tie?
2822+ * Then break by PID.
2823+ */
2824+ (get_last_release(first_task) ==
2825+ get_last_release(second_task) &&
2826+ (first_task->pid < second_task->pid ||
2827+
2828+ /* If the PIDs are the same then the task with the inherited
2829+ * priority wins.
2830+ */
2831+ (first_task->pid == second_task->pid &&
2832+ !second->rt_param.inh_task)));
2833+}
2834+
2835+int fifo_ready_order(struct list_head* a, struct list_head* b)
2836+{
2837+ return fifo_higher_prio(
2838+ list_entry(a, struct task_struct, rt_list),
2839+ list_entry(b, struct task_struct, rt_list));
2840+}
2841+
2842+void fifo_domain_init(rt_domain_t* rt, check_resched_needed_t resched)
2843+{
2844+ rt_domain_init(rt, resched, fifo_ready_order);
2845+}
2846diff --git a/kernel/fork.c b/kernel/fork.c
2847index d57118d..be824d4 100644
2848--- a/kernel/fork.c
2849+++ b/kernel/fork.c
2850@@ -57,6 +57,9 @@
2851 #include <asm/cacheflush.h>
2852 #include <asm/tlbflush.h>
2853
2854+#include <linux/litmus.h>
2855+#include <linux/sched_plugin.h>
2856+
2857 /*
2858 * Protected counters by write_lock_irq(&tasklist_lock)
2859 */
2860@@ -118,6 +121,8 @@ void __put_task_struct(struct task_struct *tsk)
2861 WARN_ON(atomic_read(&tsk->usage));
2862 WARN_ON(tsk == current);
2863
2864+ exit_litmus(tsk);
2865+
2866 security_task_free(tsk);
2867 free_uid(tsk->user);
2868 put_group_info(tsk->group_info);
2869diff --git a/kernel/ft_event.c b/kernel/ft_event.c
2870new file mode 100644
2871index 0000000..10318ee
2872--- /dev/null
2873+++ b/kernel/ft_event.c
2874@@ -0,0 +1,104 @@
2875+#include <linux/types.h>
2876+
2877+#include <linux/feather_trace.h>
2878+
2879+/* the feather trace management functions assume
2880+ * exclusive access to the event table
2881+ */
2882+
2883+
2884+#define BYTE_JUMP 0xeb
2885+#define BYTE_JUMP_LEN 0x02
2886+
2887+/* for each event, there is an entry in the event table */
2888+struct trace_event {
2889+ long id;
2890+ long count;
2891+ long start_addr;
2892+ long end_addr;
2893+};
2894+
2895+extern struct trace_event __start___event_table[];
2896+extern struct trace_event __stop___event_table[];
2897+
2898+int ft_enable_event(unsigned long id)
2899+{
2900+ struct trace_event* te = __start___event_table;
2901+ int count = 0;
2902+ char* delta;
2903+ unsigned char* instr;
2904+
2905+ while (te < __stop___event_table) {
2906+ if (te->id == id && ++te->count == 1) {
2907+ instr = (unsigned char*) te->start_addr;
2908+			/* make sure we don't clobber something we shouldn't */
2909+ if (*instr == BYTE_JUMP) {
2910+ delta = (((unsigned char*) te->start_addr) + 1);
2911+ *delta = 0;
2912+ }
2913+ }
2914+ if (te->id == id)
2915+ count++;
2916+ te++;
2917+ }
2918+ return count;
2919+}
2920+
2921+int ft_disable_event(unsigned long id)
2922+{
2923+ struct trace_event* te = __start___event_table;
2924+ int count = 0;
2925+ char* delta;
2926+ unsigned char* instr;
2927+
2928+ while (te < __stop___event_table) {
2929+ if (te->id == id && --te->count == 0) {
2930+ instr = (unsigned char*) te->start_addr;
2931+ if (*instr == BYTE_JUMP) {
2932+ delta = (((unsigned char*) te->start_addr) + 1);
2933+ *delta = te->end_addr - te->start_addr -
2934+ BYTE_JUMP_LEN;
2935+ }
2936+ }
2937+ if (te->id == id)
2938+ count++;
2939+ te++;
2940+ }
2941+ return count;
2942+}
2943+
2944+int ft_disable_all_events(void)
2945+{
2946+ struct trace_event* te = __start___event_table;
2947+ int count = 0;
2948+ char* delta;
2949+ unsigned char* instr;
2950+
2951+ while (te < __stop___event_table) {
2952+ if (te->count) {
2953+ instr = (unsigned char*) te->start_addr;
2954+ if (*instr == BYTE_JUMP) {
2955+ delta = (((unsigned char*) te->start_addr)
2956+ + 1);
2957+ *delta = te->end_addr - te->start_addr -
2958+ BYTE_JUMP_LEN;
2959+ te->count = 0;
2960+ count++;
2961+ }
2962+ }
2963+ te++;
2964+ }
2965+ return count;
2966+}
2967+
2968+int ft_is_event_enabled(unsigned long id)
2969+{
2970+ struct trace_event* te = __start___event_table;
2971+
2972+ while (te < __stop___event_table) {
2973+ if (te->id == id)
2974+ return te->count;
2975+ te++;
2976+ }
2977+ return 0;
2978+}
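The counters implement nested enable/disable semantics: an event is patched in on its first ft_enable_event() and patched back out once the count drops to zero (ft_disable_all_events() force-resets everything). A usage sketch, using the scheduler timestamp IDs from include/linux/trace.h:

static void demo_trace_scheduling_cost(void)
{
	ft_enable_event(100);	/* TS_SCHED_START */
	ft_enable_event(101);	/* TS_SCHED_END   */

	/* ... run the workload; timestamps accumulate in trace_ts_buf
	 * and are drained by the trace device driver ...
	 */

	ft_disable_event(101);
	ft_disable_event(100);
}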
2979diff --git a/kernel/ics.c b/kernel/ics.c
2980new file mode 100644
2981index 0000000..a016033
2982--- /dev/null
2983+++ b/kernel/ics.c
2984@@ -0,0 +1,229 @@
2985+/* ics.c - interruptible critical sections
2986+ *
2987+ * (c) 2007 Bjoern Brandenburg, LITMUS^RT project
2988+ *
2989+ * This file contains the platform-independent parts to support ICSs on top of
2990+ * the FDSO layer.
2991+ */
2992+
2993+#include <linux/slab.h>
2994+#include <linux/sched.h>
2995+#include <linux/smp.h>
2996+#include <linux/errno.h>
2997+#include <asm/uaccess.h>
2998+
2999+
3000+#include <linux/fdso.h>
3001+#include <linux/ics.h>
3002+
3003+#define ics_stack_empty(top) (top == -1)
3004+
3005+struct ics {
3006+ struct mutex writer_mutex;
3007+};
3008+
3009+static void* create_ics(void)
3010+{
3011+ struct ics* ics;
3012+
3013+ ics = kmalloc(sizeof(struct ics), GFP_KERNEL);
3014+ if (!ics)
3015+ return NULL;
3016+ mutex_init(&ics->writer_mutex);
3017+ ICS_DBG("allocated ics/%p\n", ics);
3018+ return ics;
3019+}
3020+
3021+static void destroy_ics(void* ics)
3022+{
3023+ ICS_DBG("freeing ics/%p\n", ics);
3024+ kfree(ics);
3025+}
3026+
3027+static int open_ics(struct od_table_entry* entry, void* __user mapping)
3028+{
3029+ if (!access_ok(VERIFY_WRITE, mapping, sizeof(int *)))
3030+ return -EFAULT;
3031+
3032+ entry->extra = (void*) mapping;
3033+ return 0;
3034+}
3035+
3036+static int close_ics(struct od_table_entry* entry)
3037+{
3038+ return 0;
3039+}
3040+
3041+struct fdso_ops ics_ops = {
3042+ .create = create_ics,
3043+ .destroy = destroy_ics,
3044+ .open = open_ics,
3045+ .close = close_ics
3046+};
3047+
3048+
3049+static int get_ics_stack_top(void)
3050+{
3051+ int err = 0;
3052+ int top = -1;
3053+ struct task_struct* t = current;
3054+
3055+ if (t->ics_cb) {
3056+ err = get_user(top, &t->ics_cb->top);
3057+ ICS_DBG("%d stack_top() -> err=%d top=%d\n", t->pid, err, top);
3058+ }
3059+ if (err != 0 || top < -1 || top >= MAX_ICS_NESTING)
3060+ return -1;
3061+ else
3062+ return top;
3063+}
3064+
3065+void* get_rollback_addr(void)
3066+{
3067+ int err = 0;
3068+ int top;
3069+ void* addr = NULL;
3070+ struct task_struct* t = current;
3071+
3072+ /* we implicitly roll back to the top address */
3073+
3074+ top = get_ics_stack_top();
3075+ if (!ics_stack_empty(top))
3076+ err = get_user(addr, &t->ics_cb->ics_stack[top].pc);
3077+
3078+ if (err != 0)
3079+ addr = NULL;
3080+ return addr;
3081+}
3082+
3083+static int get_ics_stack(int idx)
3084+{
3085+ int err;
3086+ struct task_struct* t = current;
3087+ int od;
3088+
3089+ err = get_user(od, &t->ics_cb->ics_stack[idx].id);
3090+
3091+ if (!err)
3092+ return od;
3093+ else
3094+ return -1;
3095+}
3096+
3097+
3098+static void abort_local_ics_reader(void* _ics)
3099+{
3100+ struct task_struct* t = current;
3101+ int i, od, top, err1, err2;
3102+ int retry = 1;
3103+ struct ics *stacked, *ics;
3104+ struct od_table_entry* entry;
3105+
3106+ ICS_DBG(KERN_DEBUG "abort_local_ics_reader() on %d, examining %s/%d\n",
3107+ raw_smp_processor_id(), t->comm, t->pid);
3108+
3109+ ics = (struct ics*) _ics;
3110+
3111+ /* things to check
3112+ *
3113+ * 1) if local task has no ics_cb then return
3114+ * 2) if local task has no ics on stack then return
3115+ * 3) if local task has <ics> not in ics stack then return
3116+ * 4) otherwise rollback local task and set retry flag
3117+ */
3118+
3119+ if (!t->ics_cb) {
3120+ ICS_DBG("%d no ics_cb\n", t->pid);
3121+ return;
3122+ }
3123+
3124+ top = get_ics_stack_top();
3125+ if (ics_stack_empty(top)) {
3126+ ICS_DBG("%d stack empty\n", t->pid);
3127+ return;
3128+ }
3129+
3130+ for (i = 0; i <= top; i++) {
3131+ od = get_ics_stack(i);
3132+ if (od < 0) {
3133+ ICS_DBG("%d garbage od=%d\n", t->pid, od);
3134+ /* end of stack or garbage */
3135+ return;
3136+ }
3137+ stacked = lookup_ics(od);
3138+ entry = __od_lookup(od);
3139+ if (!stacked) {
3140+ ICS_DBG("%d garbage lookup od=%d\n", t->pid, od);
3141+ /* garbage on stack */
3142+ return;
3143+ }
3144+ if (ics == stacked) {
3145+ ICS_DBG(KERN_DEBUG "ICS: aborting %s/%d\n",
3146+ t->comm, t->pid);
3147+ /* set retry flag */
3148+ err1 = put_user(retry, (int*)entry->extra);
3149+ /* set ics stack pointer */
3150+ err2 = put_user(i, &t->ics_cb->top);
3151+ if (likely(err1 == 0 && err2 == 0)) {
3152+ set_tsk_thread_flag(t, TIF_ROLLBACK_RCS);
3153+ ICS_DBG(KERN_DEBUG "%s/%d aborted.\n",
3154+ t->comm, t->pid);
3155+ } else
3156+ printk(KERN_INFO "ICS: could not roll back "
3157+ "%s/%d state=%d err1=%d err2=%d i=%d extra=%p &top=%p\n",
3158+ t->comm, t->pid, t->state, err1, err2, i, entry->extra, &t->ics_cb->top);
3159+ return;
3160+ }
3161+ }
3162+}
3163+
3164+static void abort_ics_readers(struct ics* ics)
3165+{
3166+ ICS_DBG(KERN_DEBUG "abort_ics_readers() on %d\n",
3167+ raw_smp_processor_id());
3168+
3169+ smp_call_function(abort_local_ics_reader, ics, 0, 1);
3170+}
3171+
3172+static int do_start_wcs(struct ics* ics)
3173+{
3174+ mutex_lock(&ics->writer_mutex);
3175+
3176+ abort_ics_readers(ics);
3177+
3178+ mutex_unlock(&ics->writer_mutex);
3179+
3180+ return 0;
3181+}
3182+
3183+
3184+asmlinkage long sys_start_wcs(int ics_od)
3185+{
3186+ long ret = 0;
3187+ struct ics * ics;
3188+
3189+ ics = lookup_ics(ics_od);
3190+ if (ics)
3191+ ret = do_start_wcs(ics);
3192+ else
3193+ ret = -EINVAL;
3194+
3195+ ICS_DBG(KERN_DEBUG "%s/%d sys_start_wcs(%d) -> %ld\n",
3196+ current->comm, current->pid, ics_od, ret);
3197+
3198+ return ret;
3199+}
3200+
3201+
3202+asmlinkage long sys_reg_ics_cb(struct ics_cb* __user ics_cb)
3203+{
3204+ long ret = -EFAULT;
3205+ struct task_struct *t = current;
3206+
3207+ if (access_ok(VERIFY_WRITE, ics_cb, sizeof(*ics_cb))) {
3208+ t->ics_cb = ics_cb;
3209+ ret = 0;
3210+ }
3211+
3212+ return ret;
3213+}
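The intended division of labour, as far as it can be read off the code above (editorial sketch; the user-space syscall wrappers are hypothetical, and struct ics_cb is the control block declared in include/linux/ics.h with the fields the kernel accesses above): a reader registers its control block once and maintains the ICS stack in user space, while a writer calls sys_start_wcs(), which takes the per-ICS writer mutex and rolls back any reader currently inside that ICS via the retry flag.

struct ics_cb cb;

static void reader_setup(void)
{
	cb.top = -1;			/* empty ICS stack, see ics_stack_empty() */
	sys_reg_ics_cb(&cb);		/* hypothetical wrapper for the syscall */
}

static void writer_update(int ics_od)
{
	sys_start_wcs(ics_od);		/* aborts readers inside this ICS */
	/* ... perform the update; aborted readers will retry ... */
}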
3214diff --git a/kernel/litmus.c b/kernel/litmus.c
3215new file mode 100644
3216index 0000000..8ebb9c9
3217--- /dev/null
3218+++ b/kernel/litmus.c
3219@@ -0,0 +1,1034 @@
3220+/* litmus.c -- Implementation of the LITMUS syscalls, the LITMUS initialization,
3221+ * and the common tick function.
3222+ */
3223+#include <asm/uaccess.h>
3224+#include <linux/uaccess.h>
3225+#include <linux/sysrq.h>
3226+
3227+#include <linux/queuelock.h>
3228+#include <linux/litmus.h>
3229+#include <linux/sched.h>
3230+#include <linux/sched_plugin.h>
3231+#include <linux/fpmath.h>
3232+
3233+#include <linux/trace.h>
3234+
3235+#define MAX_SERVICE_LEVELS 10
3236+
3237+/* Variables that govern the scheduling process */
3238+spolicy sched_policy = SCHED_DEFAULT;
3239+int sched_options = 0;
3240+
3241+
3242+/* This is a flag for switching the system into RT mode when it is booted up.
3243+ * In RT mode, non-real-time tasks are scheduled as background tasks.
3244+ */
3245+
3246+/* The system is booting in non-realtime mode */
3247+atomic_t rt_mode = ATOMIC_INIT(MODE_NON_RT);
3248+/* Here we specify a mode change to be made */
3249+atomic_t new_mode = ATOMIC_INIT(MODE_NON_RT);
3250+/* Number of RT tasks that exist in the system */
3251+atomic_t n_rt_tasks = ATOMIC_INIT(0);
3252+
3253+/* Only one CPU may perform a mode change. */
3254+static queuelock_t mode_change_lock;
3255+
3256+/* The time instant when we switched to RT mode */
3257+volatile jiffie_t rt_start_time = 0;
3258+
3259+/* To send signals from the scheduler
3260+ * Must drop locks first.
3261+ */
3262+static LIST_HEAD(sched_sig_list);
3263+static DEFINE_SPINLOCK(sched_sig_list_lock);
3264+
3265+/**
3266+ * sys_set_rt_mode
3267+ * @newmode: new mode the scheduler must be switched to
3268+ * External syscall for setting the RT mode flag
3269+ * Returns EINVAL if mode is not recognized or mode transition is
3270+ * not permitted
3271+ * On success 0 is returned
3272+ *
3273+ * FIXME: In a "real" OS we cannot just let any user switch the mode...
3274+ */
3275+asmlinkage long sys_set_rt_mode(int newmode)
3276+{
3277+ if ((newmode == MODE_NON_RT) || (newmode == MODE_RT_RUN)) {
3278+ printk(KERN_INFO "real-time mode switch to %s\n",
3279+ (newmode == MODE_RT_RUN ? "rt" : "non-rt"));
3280+ atomic_set(&new_mode, newmode);
3281+ return 0;
3282+ }
3283+ return -EINVAL;
3284+}
3285+
3286+/*
3287+ * sys_set_task_rt_param
3288+ * @pid: Pid of the task which scheduling parameters must be changed
3289+ * @param: New real-time extension parameters such as the execution cost and
3290+ * period
3291+ * Syscall for manipulating a task's RT extension params
3292+ * Returns EFAULT if param is NULL.
3293+ *	   ESRCH if pid does not correspond
3294+ * to a valid task.
3295+ * EINVAL if either period or execution cost is <=0
3296+ *	   EBUSY if pid is already a real-time task
3297+ * 0 if success
3298+ *
3299+ * Only non-real-time tasks may be configured with this system call
3300+ * to avoid races with the scheduler. In practice, this means that a
3301+ * task's parameters must be set _before_ calling sys_prepare_rt_task()
3302+ */
3303+asmlinkage long sys_set_rt_task_param(pid_t pid, rt_param_t __user * param)
3304+{
3305+ rt_param_t tp;
3306+ struct task_struct *target;
3307+ int retval = -EINVAL;
3308+
3309+ printk("Setting up rt task parameters for process %d.\n", pid);
3310+
3311+ if (pid < 0 || param == 0) {
3312+ goto out;
3313+ }
3314+ if (copy_from_user(&tp, param, sizeof(tp))) {
3315+ retval = -EFAULT;
3316+ goto out;
3317+ }
3318+
3319+ /* Task search and manipulation must be protected */
3320+ read_lock_irq(&tasklist_lock);
3321+ if (!(target = find_task_by_pid(pid))) {
3322+ retval = -ESRCH;
3323+ goto out_unlock;
3324+ }
3325+
3326+ if (is_realtime(target)) {
3327+ /* The task is already a real-time task.
3328+		 * We cannot allow parameter changes at this point.
3329+ */
3330+ retval = -EBUSY;
3331+ goto out_unlock;
3332+ }
3333+
3334+ if (tp.exec_cost <= 0)
3335+ goto out_unlock;
3336+ if (tp.period <= 0)
3337+ goto out_unlock;
3338+ if (!cpu_online(tp.cpu))
3339+ goto out_unlock;
3340+ if (tp.period < tp.exec_cost)
3341+ {
3342+ printk(KERN_INFO "litmus: real-time task %d rejected "
3343+ "because wcet > period\n", pid);
3344+ goto out_unlock;
3345+ }
3346+
3347+ /* Assign params */
3348+ target->rt_param.basic_params = tp;
3349+
3350+ retval = 0;
3351+ out_unlock:
3352+ read_unlock_irq(&tasklist_lock);
3353+ out:
3354+ return retval;
3355+}
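From user space, the comment above implies a fixed order: configure the parameters while the task is still best-effort, then switch it to real-time. A hedged sketch (the wrapper functions are hypothetical stand-ins for whatever invokes these syscalls; LITMUS_RT_TASK and RT_CLASS_SOFT come from the LITMUS headers):

#include <unistd.h>

/* hypothetical wrappers around the corresponding syscalls */
int set_rt_task_param(pid_t pid, rt_param_t *param);
int task_mode_transition(int target_mode);

static int become_rt_task(void)
{
	rt_param_t params = {
		.exec_cost = 10,	/* worst-case cost, in the time unit the plugin uses */
		.period    = 100,
		.cpu       = 0,		/* only relevant for partitioned plugins */
		.class     = RT_CLASS_SOFT,
	};

	if (set_rt_task_param(getpid(), &params) < 0)
		return -1;
	/* only now may the task become real-time */
	return task_mode_transition(LITMUS_RT_TASK);
}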
3356+
3357+/* Getter of task's RT params
3358+ * returns EINVAL if param or pid is NULL
3359+ * returns ESRCH if pid does not correspond to a valid task
3360+ * returns EFAULT if copying of parameters has failed.
3361+ */
3362+asmlinkage long sys_get_rt_task_param(pid_t pid, rt_param_t __user * param)
3363+{
3364+ int retval = -EINVAL;
3365+ struct task_struct *source;
3366+ rt_param_t lp;
3367+ if (param == 0 || pid < 0)
3368+ goto out;
3369+ read_lock(&tasklist_lock);
3370+ if (!(source = find_task_by_pid(pid))) {
3371+ retval = -ESRCH;
3372+ goto out_unlock;
3373+ }
3374+ lp = source->rt_param.basic_params;
3375+ read_unlock(&tasklist_lock);
3376+ /* Do copying outside the lock */
3377+ retval =
3378+ copy_to_user(param, &lp, sizeof(lp)) ? -EFAULT : 0;
3379+ return retval;
3380+ out_unlock:
3381+ read_unlock(&tasklist_lock);
3382+ out:
3383+ return retval;
3384+
3385+}
3386+
3387+/*
3388+ * sys_set_service_levels
3389+ * @pid: Pid of the task that is to be configured
3390+ * @count: The number of service levels
3391+ * @levels: The new service levels.
3392+ *
3393+ * Returns EFAULT if levels is not a valid address.
3394+ *	   ESRCH if pid does not correspond
3395+ * to a valid task.
3396+ * EINVAL if either period or execution cost is <=0 for any level,
3397+ *	   or if utility is not increasing.
3398+ *	   EBUSY if pid is already a real-time task
3399+ * ENOMEM if there is insufficient memory available
3400+ * 0 if success
3401+ *
3402+ * May not be used on RT tasks to avoid races.
3403+ */
3404+asmlinkage long sys_set_service_levels(pid_t pid,
3405+ unsigned int count,
3406+ service_level_t __user *levels)
3407+{
3408+ struct task_struct *target;
3409+ service_level_t level, *klevels;
3410+ int retval = -EINVAL, i;
3411+ fp_t last_value = FP(0);
3412+ fp_t last_weight = FP(0);
3413+
3414+ TRACE("Setting up service levels for process %d.\n", pid);
3415+
3416+ if (pid < 0 || count > MAX_SERVICE_LEVELS) {
3417+ goto out;
3418+ }
3419+
3420+ /* Task search and manipulation must be protected */
3421+ read_lock_irq(&tasklist_lock);
3422+ if (!(target = find_task_by_pid(pid))) {
3423+ retval = -ESRCH;
3424+ read_unlock_irq(&tasklist_lock);
3425+ goto out;
3426+ }
3427+ read_unlock_irq(&tasklist_lock);
3428+
3429+ if (is_realtime(target)) {
3430+ /* The task is already a real-time task.
3431+		 * We cannot allow parameter changes at this point.
3432+ */
3433+ retval = -EBUSY;
3434+ goto out;
3435+ }
3436+
3437+ /* get rid of old service levels, if any */
3438+ kfree(target->rt_param.service_level);
3439+ target->rt_param.service_level = NULL;
3440+ target->rt_param.no_service_levels = 0;
3441+
3442+ /* count == 0 means tear down service levels*/
3443+ if (count == 0) {
3444+ retval = 0;
3445+ goto out;
3446+ }
3447+
3448+ klevels = kmalloc(sizeof(service_level_t) * count, GFP_KERNEL);
3449+ if (!klevels) {
3450+ retval = -ENOMEM;
3451+ goto out;
3452+ }
3453+
3454+ for (i = 0; i < count; i++) {
3455+ if (copy_from_user(&level, levels + i, sizeof(level))) {
3456+ retval = -EFAULT;
3457+ kfree(klevels);
3458+ goto out;
3459+ }
3460+ if (level.period <= 0) {
3461+ TRACE("service level %d period <= 0\n", i);
3462+			kfree(klevels); goto out;
3463+ }
3464+ if (_leq(level.weight, last_weight)) {
3465+ TRACE("service level %d weight non-increase\n", i);
3466+			kfree(klevels); goto out;
3467+ }
3468+ if (_leq(level.value, last_value)) {
3469+ TRACE("service level %d value non-increase\n", i);
3470+			kfree(klevels); goto out;
3471+ }
3472+ last_value = level.value;
3473+ last_weight = level.weight;
3474+ klevels[i] = level;
3475+ }
3476+ target->rt_param.basic_params.exec_cost =
3477+ _round(_mul(klevels[0].weight,
3478+ FP(klevels[0].period)));
3479+ target->rt_param.basic_params.period = klevels[0].period;
3480+ target->rt_param.service_level = klevels;
3481+ target->rt_param.no_service_levels = count;
3482+ retval = 0;
3483+
3484+ out:
3485+ return retval;
3486+}
3487+
3488+asmlinkage long sys_get_cur_service_level(void)
3489+{
3490+ long level;
3491+
3492+ if (!is_realtime(current))
3493+ return -EINVAL;
3494+
3495+ /* block scheduler that might cause reweighting to happen */
3496+ local_irq_disable();
3497+ level = current->rt_param.cur_service_level;
3498+ local_irq_enable();
3499+ return level;
3500+}
3501+
3502+
3503+/* sys_task_mode_transition
3504+ * @target_mode: The desired execution mode after the system call completes.
3505+ * Either BACKGROUND_TASK or LITMUS_RT_TASK.
3506+ * Allow a normal task to become a real-time task, and vice versa.
3507+ * Returns EINVAL if illegal transition requested.
3508+ * 0 if task mode was changed successfully
3509+ * other if plugin failed.
3510+ */
3511+asmlinkage long sys_task_mode_transition(int target_mode)
3512+{
3513+ int retval = -EINVAL;
3514+ struct task_struct *t = current;
3515+
3516+ if (( is_realtime(t) && target_mode == BACKGROUND_TASK) ||
3517+ (!is_realtime(t) && target_mode == LITMUS_RT_TASK)) {
3518+ TRACE_TASK(t, "attempts mode transition to %s\n",
3519+ is_realtime(t) ? "best-effort" : "real-time");
3520+ preempt_disable();
3521+ t->rt_param.transition_pending = 1;
3522+ t->state = TASK_STOPPED;
3523+ preempt_enable_no_resched();
3524+
3525+ schedule();
3526+
3527+ retval = t->rt_param.transition_error;
3528+ }
3529+ return retval;
3530+}
3531+
3532+/* implemented in kernel/litmus_sem.c */
3533+void srp_ceiling_block(void);
3534+
3535+/*
3536+ * This is the crucial function for the periodic task implementation.
3537+ * It checks whether the task is periodic, whether such a sleep
3538+ * is permitted, and calls the plugin-specific sleep, which puts the
3539+ * task into a wait array.
3540+ * returns 0 on successful wakeup
3541+ * returns EPERM if current conditions do not permit such sleep
3542+ * returns EINVAL if current task is not able to go to sleep
3543+ */
3544+asmlinkage long sys_sleep_next_period(void)
3545+{
3546+ int retval = -EPERM;
3547+ if (!is_realtime(current)) {
3548+ retval = -EINVAL;
3549+ goto out;
3550+ }
3551+ /* Task with negative or zero period cannot sleep */
3552+ if (get_rt_period(current) <= 0) {
3553+ retval = -EINVAL;
3554+ goto out;
3555+ }
3556+ /* The plugin has to put the task into an
3557+ * appropriate queue and call schedule
3558+ */
3559+ retval = curr_sched_plugin->sleep_next_period();
3560+ if (!retval && is_subject_to_srp(current))
3561+ srp_ceiling_block();
3562+ out:
3563+ return retval;
3564+}
3565+
3566+/* This is an "improved" version of sys_sleep_next_period() that
3567+ * addresses the problem of unintentionally missing a job after
3568+ * an overrun.
3569+ *
3570+ * returns 0 on successful wakeup
3571+ * returns EPERM if current conditions do not permit such sleep
3572+ * returns EINVAL if current task is not able to go to sleep
3573+ */
3574+asmlinkage long sys_wait_for_job_release(unsigned int job)
3575+{
3576+ int retval = -EPERM;
3577+ if (!is_realtime(current)) {
3578+ retval = -EINVAL;
3579+ goto out;
3580+ }
3581+
3582+ /* Task with negative or zero period cannot sleep */
3583+ if (get_rt_period(current) <= 0) {
3584+ retval = -EINVAL;
3585+ goto out;
3586+ }
3587+
3588+ retval = 0;
3589+
3590+ /* first wait until we have "reached" the desired job
3591+ *
3592+ * This implementation has at least two problems:
3593+ *
3594+ * 1) It doesn't gracefully handle the wrap around of
3595+ * job_no. Since LITMUS is a prototype, this is not much
3596+ * of a problem right now.
3597+ *
3598+ * 2) It is theoretically racy if a job release occurs
3599+ * between checking job_no and calling sleep_next_period().
3600+	 * A proper solution would require adding another callback
3601+ * in the plugin structure and testing the condition with
3602+ * interrupts disabled.
3603+ *
3604+ * FIXME: At least problem 2 should be taken care of eventually.
3605+ */
3606+ while (!retval && job > current->rt_param.times.job_no)
3607+ /* If the last job overran then job <= job_no and we
3608+ * don't send the task to sleep.
3609+ */
3610+ retval = curr_sched_plugin->sleep_next_period();
3611+
3612+ /* We still have to honor the SRP after the actual release.
3613+ */
3614+ if (!retval && is_subject_to_srp(current))
3615+ srp_ceiling_block();
3616+ out:
3617+ return retval;
3618+}
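Putting the two calls together, a periodic user-space main loop that tolerates overruns could be structured as follows (sketch; the wrappers are hypothetical and job refers to the job_no sequence described in rt_param.h):

/* hypothetical wrappers for sys_query_job_no() and sys_wait_for_job_release() */
int query_job_no(unsigned int *job);
int wait_for_job_release(unsigned int job);

static void periodic_main_loop(void)
{
	unsigned int job;

	query_job_no(&job);
	for (;;) {
		/* ... do the work of the current job ... */

		/* if this job overran, the next call returns immediately
		 * instead of skipping a job
		 */
		job++;
		wait_for_job_release(job);
	}
}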
3619+
3620+/* This is a helper syscall to query the current job sequence number.
3621+ *
3622+ * returns 0 on successful query
3623+ * returns EPERM if task is not a real-time task.
3624+ * returns EFAULT if &job is not a valid pointer.
3625+ */
3626+asmlinkage long sys_query_job_no(unsigned int __user *job)
3627+{
3628+ int retval = -EPERM;
3629+ if (is_realtime(current))
3630+ retval = put_user(current->rt_param.times.job_no, job);
3631+
3632+ return retval;
3633+}
3634+
3635+
3636+/* The LITMUS tick function. It manages the change to and from real-time mode
3637+ * and then calls the plugin's tick function.
3638+ */
3639+reschedule_check_t __sched rt_scheduler_tick(void)
3640+{
3641+ /* Check for mode change */
3642+ if ((get_rt_mode() != atomic_read(&new_mode))) {
3643+ queue_lock(&mode_change_lock);
3644+ // If the mode is already changed, proceed
3645+ if (get_rt_mode() == atomic_read(&new_mode)) {
3646+ queue_unlock(&mode_change_lock);
3647+ goto proceed;
3648+ }
3649+ // change the mode
3650+ if ((atomic_read(&new_mode) == MODE_RT_RUN)) {
3651+ /* The deferral of entering real-time mode should be
3652+ * handled by deferring task releases in the plugin.
3653+ * The plugin interface does not really need to know
3654+ * about quanta, that is the plugin's job.
3655+ */
3656+
3657+ /* update rt start time */
3658+ rt_start_time = jiffies;
3659+ printk(KERN_INFO "Real-Time mode enabled at %ld "
3660+ "on %d\n",
3661+ jiffies, smp_processor_id());
3662+ } else
3663+ printk(KERN_INFO "Real-Time mode disabled at %ld "
3664+ "on %d\n",
3665+ jiffies, smp_processor_id());
3666+ if (curr_sched_plugin->mode_change)
3667+ curr_sched_plugin->
3668+ mode_change(atomic_read(&new_mode));
3669+ printk(KERN_INFO "Plugin mode change done at %ld\n",
3670+ jiffies);
3671+ set_rt_mode(atomic_read(&new_mode));
3672+ queue_unlock(&mode_change_lock);
3673+ }
3674+
3675+ proceed:
3676+ /* Call plugin-defined tick handler
3677+ *
3678+	 * It is the plugin's tick handler's job to detect quantum
3679+ * boundaries in pfair.
3680+ */
3681+ return curr_sched_plugin->scheduler_tick();
3682+}
3683+
3684+asmlinkage spolicy sys_sched_setpolicy(spolicy newpolicy)
3685+{
3686+ /* Dynamic policy change is disabled at the moment */
3687+ return SCHED_INVALID;
3688+}
3689+
3690+asmlinkage spolicy sys_sched_getpolicy(void)
3691+{
3692+ return sched_policy;
3693+}
3694+
3695+
3696+asmlinkage int sys_scheduler_setup(int cmd, void __user *parameter)
3697+{
3698+ int ret = -EINVAL;
3699+
3700+ ret = curr_sched_plugin->scheduler_setup(cmd, parameter);
3701+ return ret;
3702+}
3703+
3704+struct sched_sig {
3705+ struct list_head list;
3706+ struct task_struct* task;
3707+ unsigned int signal:31;
3708+ int force:1;
3709+};
3710+
3711+static void __scheduler_signal(struct task_struct *t, unsigned int signo,
3712+ int force)
3713+{
3714+ struct sched_sig* sig;
3715+
3716+	sig = kmalloc(sizeof(struct sched_sig), GFP_ATOMIC);
3717+ if (!sig) {
3718+		TRACE_TASK(t, "dropping signal: %u\n", signo);
3719+ return;
3720+ }
3721+
3722+ spin_lock(&sched_sig_list_lock);
3723+
3724+ sig->signal = signo;
3725+ sig->force = force;
3726+ sig->task = t;
3727+ get_task_struct(t);
3728+ list_add(&sig->list, &sched_sig_list);
3729+
3730+ spin_unlock(&sched_sig_list_lock);
3731+}
3732+
3733+void scheduler_signal(struct task_struct *t, unsigned int signo)
3734+{
3735+ __scheduler_signal(t, signo, 0);
3736+}
3737+
3738+void force_scheduler_signal(struct task_struct *t, unsigned int signo)
3739+{
3740+ __scheduler_signal(t, signo, 1);
3741+}
3742+
3743+/* FIXME: get rid of the locking and do this on a per-processor basis */
3744+void send_scheduler_signals(void)
3745+{
3746+ unsigned long flags;
3747+ struct list_head *p, *extra;
3748+ struct siginfo info;
3749+ struct sched_sig* sig;
3750+ struct task_struct* t;
3751+ struct list_head claimed;
3752+
3753+ if (spin_trylock_irqsave(&sched_sig_list_lock, flags)) {
3754+ if (list_empty(&sched_sig_list))
3755+ p = NULL;
3756+ else {
3757+ p = sched_sig_list.next;
3758+ list_del(&sched_sig_list);
3759+ INIT_LIST_HEAD(&sched_sig_list);
3760+ }
3761+ spin_unlock_irqrestore(&sched_sig_list_lock, flags);
3762+
3763+ /* abort if there are no signals */
3764+ if (!p)
3765+ return;
3766+
3767+ /* take signal list we just obtained */
3768+ list_add(&claimed, p);
3769+
3770+ list_for_each_safe(p, extra, &claimed) {
3771+ list_del(p);
3772+ sig = list_entry(p, struct sched_sig, list);
3773+ t = sig->task;
3774+ info.si_signo = sig->signal;
3775+ info.si_errno = 0;
3776+ info.si_code = SI_KERNEL;
3777+ info.si_pid = 1;
3778+ info.si_uid = 0;
3779+ TRACE("sending signal %d to %d\n", info.si_signo,
3780+ t->pid);
3781+ if (sig->force)
3782+ force_sig_info(sig->signal, &info, t);
3783+ else
3784+ send_sig_info(sig->signal, &info, t);
3785+ put_task_struct(t);
3786+ kfree(sig);
3787+ }
3788+ }
3789+
3790+}
3791+
3792+static inline void np_mem_error(struct task_struct* t, const char* reason)
3793+{
3794+ if (t->state != TASK_DEAD && !(t->flags & PF_EXITING)) {
3795+ TRACE("np section: %s => %s/%d killed\n",
3796+ reason, t->comm, t->pid);
3797+ force_scheduler_signal(t, SIGKILL);
3798+ }
3799+}
3800+
3801+/* sys_register_np_flag() allows real-time tasks to register an
3802+ * np section indicator.
3803+ * returns 0 if the flag was successfully registered
3804+ * returns EINVAL if current task is not a real-time task
3805+ * returns EFAULT if *flag couldn't be written
3806+ */
3807+asmlinkage long sys_register_np_flag(short __user *flag)
3808+{
3809+ int retval = -EINVAL;
3810+ short test_val = RT_PREEMPTIVE;
3811+
3812+ /* avoid races with the scheduler */
3813+ preempt_disable();
3814+ TRACE("reg_np_flag(%p) for %s/%d\n", flag,
3815+ current->comm, current->pid);
3816+
3817+ /* Let's first try to write to the address.
3818+ * That way it is initialized and any bugs
3819+	 * involving dangling pointers will be caught
3820+ * early.
3821+ * NULL indicates disabling np section support
3822+ * and should not be tested.
3823+ */
3824+ if (flag)
3825+ retval = poke_kernel_address(test_val, flag);
3826+ else
3827+ retval = 0;
3828+ TRACE("reg_np_flag: retval=%d\n", retval);
3829+ if (unlikely(0 != retval))
3830+ np_mem_error(current, "np flag: not writable");
3831+ else
3832+ /* the pointer is ok */
3833+ current->rt_param.np_flag = flag;
3834+
3835+ preempt_enable();
3836+ return retval;
3837+}
3838+
3839+
3840+void request_exit_np(struct task_struct *t)
3841+{
3842+ int ret;
3843+ short flag;
3844+
3845+ /* We can only do this if t is actually currently scheduled on this CPU
3846+ * because otherwise we are in the wrong address space. Thus make sure
3847+ * to check.
3848+ */
3849+ BUG_ON(t != current);
3850+
3851+ if (unlikely(!is_realtime(t) || !t->rt_param.np_flag)) {
3852+ TRACE_TASK(t, "request_exit_np(): BAD TASK!\n");
3853+ return;
3854+ }
3855+
3856+ flag = RT_EXIT_NP_REQUESTED;
3857+ ret = poke_kernel_address(flag, t->rt_param.np_flag + 1);
3858+ TRACE("request_exit_np(%s/%d)\n", t->comm, t->pid);
3859+ if (unlikely(0 != ret))
3860+ np_mem_error(current, "request_exit_np(): flag not writable");
3861+
3862+}
3863+
3864+
3865+int is_np(struct task_struct* t)
3866+{
3867+ int ret;
3868+ unsigned short flag = 0x5858; /* = XX, looks nicer in debug*/
3869+
3870+ BUG_ON(t != current);
3871+
3872+ if (unlikely(t->rt_param.kernel_np))
3873+ return 1;
3874+ else if (unlikely(t->rt_param.np_flag == NULL) ||
3875+ t->flags & PF_EXITING ||
3876+ t->state == TASK_DEAD)
3877+ return 0;
3878+ else {
3879+ /* This is the tricky part. The process has registered a
3880+ * non-preemptive section marker. We now need to check whether
3881+	 * it is set to RT_NON_PREEMPTIVE. Along the way we could
3882+ * discover that the pointer points to an unmapped region (=>
3883+ * kill the task) or that the location contains some garbage
3884+ * value (=> also kill the task). Killing the task in any case
3885+ * forces userspace to play nicely. Any bugs will be discovered
3886+ * immediately.
3887+ */
3888+ ret = probe_kernel_address(t->rt_param.np_flag, flag);
3889+ if (0 == ret && (flag == RT_NON_PREEMPTIVE ||
3890+ flag == RT_PREEMPTIVE))
3891+ return flag != RT_PREEMPTIVE;
3892+ else {
3893+ /* either we could not read from the address or
3894+ * it contained garbage => kill the process
3895+ * FIXME: Should we cause a SEGFAULT instead?
3896+ */
3897+ TRACE("is_np: ret=%d flag=%c%c (%x)\n", ret,
3898+ flag & 0xff, (flag >> 8) & 0xff, flag);
3899+ np_mem_error(t, "is_np() could not read");
3900+ return 0;
3901+ }
3902+ }
3903+}
3904+
3905+/*
3906+ * sys_exit_np() allows a real-time task to signal that it has left a
3907+ * non-preemptable section. It will be called after the kernel requested a
3908+ * callback in the preemption indicator flag.
3909+ * returns 0 if the signal was valid and processed.
3910+ * returns EINVAL if current task is not a real-time task
3911+ */
3912+asmlinkage long sys_exit_np(void)
3913+{
3914+ int retval = -EINVAL;
3915+
3916+ TS_EXIT_NP_START;
3917+
3918+ if (!is_realtime(current))
3919+ goto out;
3920+
3921+ TRACE("sys_exit_np(%s/%d)\n", current->comm, current->pid);
3922+ /* force rescheduling so that we can be preempted */
3923+ set_tsk_need_resched(current);
3924+ retval = 0;
3925+ out:
3926+
3927+ TS_EXIT_NP_END;
3928+ return retval;
3929+}
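+
+/* Illustrative sketch (not part of this patch): how a user-space real-time
+ * task might drive the np-flag protocol implemented above. The wrapper names
+ * register_np_flag() and exit_np() are assumptions standing in for user-space
+ * syscall stubs; the RT_* values and the two-short layout (section state at
+ * flag[0], exit request at flag[1]) follow the kernel code above.
+ */
+#if 0
+static short np_flag[2];	/* should be accessed atomically in real code */
+
+static void example_np_section(void)
+{
+	register_np_flag(np_flag);	/* assumed stub for sys_register_np_flag() */
+	np_flag[0] = RT_NON_PREEMPTIVE;	/* enter non-preemptive section */
+	/* ... short critical work ... */
+	np_flag[0] = RT_PREEMPTIVE;	/* leave non-preemptive section */
+	if (np_flag[1] == RT_EXIT_NP_REQUESTED)
+		exit_np();		/* assumed stub for sys_exit_np() */
+}
+#endif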
3930+
3931+long transition_to_rt(struct task_struct* tsk)
3932+{
3933+ long retval;
3934+
3935+ BUG_ON(is_realtime(tsk));
3936+
3937+ if (get_rt_period(tsk) == 0 ||
3938+ get_exec_cost(tsk) > get_rt_period(tsk)) {
3939+ TRACE_TASK(tsk, "litmus prepare: invalid task parameters "
3940+ "(%lu, %lu)\n",
3941+ get_exec_cost(tsk), get_rt_period(tsk));
3942+ return -EINVAL;
3943+ }
3944+
3945+ if (!cpu_online(get_partition(tsk)))
3946+ {
3947+ TRACE_TASK(tsk, "litmus prepare: cpu %d is not online\n",
3948+ get_partition(tsk));
3949+ return -EINVAL;
3950+ }
3951+
3952+ tsk->rt_param.old_prio = tsk->rt_priority;
3953+ tsk->rt_param.old_policy = tsk->policy;
3954+ INIT_LIST_HEAD(&tsk->rt_list);
3955+
3956+ retval = curr_sched_plugin->prepare_task(tsk);
3957+
3958+ if (!retval) {
3959+ atomic_inc(&n_rt_tasks);
3960+ tsk->rt_param.is_realtime = 1;
3961+ tsk->rt_param.litmus_controlled = 1;
3962+ }
3963+
3964+ return retval;
3965+}
3966+
3967+/* p is a real-time task. Re-init its state as a best-effort task. */
3968+static void reinit_litmus_state(struct task_struct* p, int restore)
3969+{
3970+ rt_param_t user_config;
3971+ __user short *np_flag;
3972+
3973+ if (restore) {
3974+		/* Save the user-space provided configuration data.
3975+ * FIXME: This is missing service levels for adaptive tasks.
3976+ */
3977+ user_config = p->rt_param.basic_params;
3978+ np_flag = p->rt_param.np_flag;
3979+ }
3980+
3981+ /* We probably should not be inheriting any task's priority
3982+ * at this point in time.
3983+ */
3984+ WARN_ON(p->rt_param.inh_task);
3985+
3986+ /* We need to restore the priority of the task. */
3987+ __setscheduler(p, p->rt_param.old_policy, p->rt_param.old_prio);
3988+
3989+ /* Cleanup everything else. */
3990+ memset(&p->rt_param, 0, sizeof(task_rt_param_t));
3991+
3992+ /* Restore preserved fields. */
3993+ if (restore) {
3994+ p->rt_param.basic_params = user_config;
3995+ p->rt_param.np_flag = np_flag;
3996+ }
3997+}
3998+
3999+long transition_to_be(struct task_struct* tsk)
4000+{
4001+ BUG_ON(!is_realtime(tsk));
4002+
4003+ curr_sched_plugin->tear_down(tsk);
4004+ atomic_dec(&n_rt_tasks);
4005+ reinit_litmus_state(tsk, 1);
4006+ return 0;
4007+}
4008+
4009+/* Called upon fork.
4010+ * p is the newly forked task.
4011+ */
4012+void litmus_fork(struct task_struct* p)
4013+{
4014+ if (is_realtime(p))
4015+ /* clean out any litmus related state, don't preserve anything*/
4016+ reinit_litmus_state(p, 0);
4017+}
4018+
4019+/* Called upon execve().
4020+ * current is doing the exec.
4021+ * Don't let address space specific stuff leak.
4022+ */
4023+void litmus_exec(void)
4024+{
4025+ struct task_struct* p = current;
4026+
4027+ if (is_realtime(p)) {
4028+ WARN_ON(p->rt_param.inh_task);
4029+ p->rt_param.np_flag = NULL;
4030+ }
4031+}
4032+
4033+void exit_litmus(struct task_struct *dead_tsk)
4034+{
4035+ if (is_realtime(dead_tsk))
4036+ transition_to_be(dead_tsk);
4037+ kfree(dead_tsk->rt_param.service_level);
4038+}
4039+
4040+
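+/* list_qsort() - recursive quicksort over a list_head list.
+ * The first element is used as the pivot, the remaining elements are
+ * partitioned into "less than" and "greater or equal" sublists, both
+ * sublists are sorted recursively, and the result is spliced back together
+ * as lt ++ pivot ++ geq, i.e., in ascending less_than() order.
+ */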
4041+void list_qsort(struct list_head* list, list_cmp_t less_than)
4042+{
4043+ struct list_head lt;
4044+ struct list_head geq;
4045+ struct list_head *pos, *extra, *pivot;
4046+ int n_lt = 0, n_geq = 0;
4047+ BUG_ON(!list);
4048+
4049+ if (list->next == list)
4050+ return;
4051+
4052+ INIT_LIST_HEAD(&lt);
4053+ INIT_LIST_HEAD(&geq);
4054+
4055+ pivot = list->next;
4056+ list_del(pivot);
4057+ list_for_each_safe(pos, extra, list) {
4058+ list_del(pos);
4059+ if (less_than(pos, pivot)) {
4060+ list_add(pos, &lt);
4061+ n_lt++;
4062+ } else {
4063+ list_add(pos, &geq);
4064+ n_geq++;
4065+ }
4066+ }
4067+ if (n_lt < n_geq) {
4068+ list_qsort(&lt, less_than);
4069+ list_qsort(&geq, less_than);
4070+ } else {
4071+ list_qsort(&geq, less_than);
4072+ list_qsort(&lt, less_than);
4073+ }
4074+ list_splice(&geq, list);
4075+ list_add(pivot, list);
4076+ list_splice(&lt, list);
4077+}
4078+
4079+#ifdef CONFIG_MAGIC_SYSRQ
4080+/* We offer the possibility to change the real-time mode of the system
4081+ * with a magic SysRq request. This helps with debugging in case the system
4082+ * fails to perform its planned switch back to normal mode. This may happen if
4083+ * total system utilization is reached and the task that is supposed to trigger
4084+ * the switch is always preempted (if it is not a real-time task).
4085+ */
4086+int sys_kill(int pid, int sig);
4087+
4088+
4089+static void sysrq_handle_toGgle_rt_mode(int key, struct tty_struct *tty)
4090+{
4091+ sys_set_rt_mode(get_rt_mode() == MODE_NON_RT);
4092+}
4093+
4094+static struct sysrq_key_op sysrq_toGgle_rt_mode_op = {
4095+ .handler = sysrq_handle_toGgle_rt_mode,
4096+ .help_msg = "toGgle-rt-mode",
4097+ .action_msg = "real-time mode changed",
4098+};
4099+
4100+static void sysrq_handle_kill_rt_tasks(int key, struct tty_struct *tty)
4101+{
4102+ struct task_struct *t;
4103+ read_lock(&tasklist_lock);
4104+ for_each_process(t) {
4105+ if (is_realtime(t)) {
4106+ sys_kill(t->pid, SIGKILL);
4107+ }
4108+ }
4109+ read_unlock(&tasklist_lock);
4110+}
4111+
4112+static struct sysrq_key_op sysrq_kill_rt_tasks_op = {
4113+ .handler = sysrq_handle_kill_rt_tasks,
4114+ .help_msg = "Quit-rt-tasks",
4115+ .action_msg = "sent SIGKILL to all real-time tasks",
4116+};
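+
+/* The handlers above are registered in boot_sched_setup() below on the
+ * 'g' (toggle real-time mode) and 'q' (kill all real-time tasks) SysRq keys,
+ * so that e.g. Alt-SysRq-g on a console toggles the system-wide real-time
+ * mode.
+ */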
4117+#endif
4118+
4119+/*
4120+ * Scheduler initialization so that a customized scheduler is enabled
4121+ * at boot time by setting the boot option "rtsched=plugin_name",
4122+ * e.g. "rtsched=pfair".
4123+ */
4124+
4125+/* All we need to know about other plugins is their initialization
4126+ * functions. These functions initialize internal data structures of a
4127+ * scheduler and return a pointer to initialized sched_plugin data
4128+ * structure with pointers to scheduling function implementations.
4129+ * If called repeatedly these init functions just return an existing
4130+ * plugin pointer.
4131+ */
4132+sched_plugin_t *init_global_edf_plugin(void);
4133+sched_plugin_t *init_global_edf_np_plugin(void);
4134+sched_plugin_t *init_part_edf_plugin(void);
4135+sched_plugin_t *init_edf_hsb_plugin(void);
4136+sched_plugin_t *init_pfair_plugin(void);
4137+sched_plugin_t *init_gsn_edf_plugin(void);
4138+sched_plugin_t *init_psn_edf_plugin(void);
4139+sched_plugin_t *init_adaptive_plugin(void);
4140+
4141+/* keep everything needed to setup plugins in one place */
4142+
4143+/* we are lazy, so we use a convention for function naming to fill
4144+ * a table
4145+ */
4146+#define PLUGIN(caps, small) \
4147+ {PLUGIN_ ## caps, SCHED_ ## caps, init_ ## small ## _plugin}
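+/* For example, PLUGIN(GSN_EDF, gsn_edf) expands to
+ *	{PLUGIN_GSN_EDF, SCHED_GSN_EDF, init_gsn_edf_plugin},
+ * so the table below relies on the naming convention described above.
+ */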
4148+
4149+#define init_nosetup_plugin 0
4150+
4151+static struct {
4152+ const char *name;
4153+ const spolicy policy_id;
4154+ sched_plugin_t *(*init) (void);
4155+} available_plugins[] = {
4156+ PLUGIN(LINUX, nosetup),
4157+ PLUGIN(GLOBAL_EDF_NP, global_edf_np),
4158+ PLUGIN(GLOBAL_EDF, global_edf),
4159+ PLUGIN(PART_EDF, part_edf),
4160+ PLUGIN(EDF_HSB, edf_hsb),
4161+ PLUGIN(PFAIR, pfair),
4162+ PLUGIN(GSN_EDF, gsn_edf),
4163+ PLUGIN(PSN_EDF, psn_edf),
4164+ PLUGIN(ADAPTIVE, adaptive),
4165+ /*********************************************
4166+ * Add your custom plugin here
4167+ **********************************************/
4168+};
4169+
4170+/* Some plugins may leave important functions unused. We define dummies
4171+ * so that we don't have to check for null pointers all over the place.
4172+ */
4173+void litmus_dummy_finish_switch(struct task_struct * prev);
4174+int litmus_dummy_schedule(struct task_struct * prev, struct task_struct** next,
4175+ runqueue_t* q);
4176+reschedule_check_t litmus_dummy_scheduler_tick(void);
4177+long litmus_dummy_prepare_task(struct task_struct *t);
4178+void litmus_dummy_wake_up_task(struct task_struct *task);
4179+void litmus_dummy_task_blocks(struct task_struct *task);
4180+long litmus_dummy_tear_down(struct task_struct *task);
4181+int litmus_dummy_scheduler_setup(int cmd, void __user *parameter);
4182+long litmus_dummy_sleep_next_period(void);
4183+long litmus_dummy_inherit_priority(struct pi_semaphore *sem,
4184+ struct task_struct *new_owner);
4185+long litmus_dummy_return_priority(struct pi_semaphore *sem);
4186+long litmus_dummy_pi_block(struct pi_semaphore *sem,
4187+ struct task_struct *t);
4188+
4189+#define CHECK(func) {\
4190+ if (!curr_sched_plugin->func) \
4191+ curr_sched_plugin->func = litmus_dummy_ ## func;}
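+/* For example, CHECK(schedule) substitutes litmus_dummy_schedule for the
+ * plugin's schedule() callback if the selected plugin left it NULL.
+ */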
4192+
4193+static int boot_sched_setup(char *plugin_name)
4194+{
4195+ int i = 0;
4196+
4197+	/* Common initializers.
4198+	 * The mode change lock is used to serialize mode change
4199+	 * operations.
4200+ */
4201+ queue_lock_init(&mode_change_lock);
4202+
4203+ printk("Starting LITMUS^RT kernel\n");
4204+
4205+ /* Look for a matching plugin.
4206+ */
4207+ for (i = 0; i < ARRAY_SIZE(available_plugins); i++) {
4208+ if (!strcmp(plugin_name, available_plugins[i].name)) {
4209+ printk("Using %s scheduler plugin\n", plugin_name);
4210+ sched_policy = available_plugins[i].policy_id;
4211+ if (available_plugins[i].init)
4212+ curr_sched_plugin = available_plugins[i].init();
4213+ goto out;
4214+ }
4215+ }
4216+
4217+
4218+	/* Otherwise we fall back to the default Linux scheduler. */
4219+ printk("Plugin name %s is unknown, using default %s\n", plugin_name,
4220+ curr_sched_plugin->plugin_name);
4221+
4222+out:
4223+ /* make sure we don't trip over null pointers later */
4224+ CHECK(finish_switch);
4225+ CHECK(schedule);
4226+ CHECK(scheduler_tick);
4227+ CHECK(wake_up_task);
4228+ CHECK(tear_down);
4229+ CHECK(task_blocks);
4230+ CHECK(prepare_task);
4231+ CHECK(scheduler_setup);
4232+ CHECK(sleep_next_period);
4233+ CHECK(inherit_priority);
4234+ CHECK(return_priority);
4235+ CHECK(pi_block);
4236+
4237+#ifdef CONFIG_MAGIC_SYSRQ
4238+ /* offer some debugging help */
4239+ if (!register_sysrq_key('g', &sysrq_toGgle_rt_mode_op))
4240+		printk("Registered toggle real-time mode magic sysrq.\n");
4241+	else
4242+		printk("Could not register toggle real-time mode magic sysrq.\n");
4243+ if (!register_sysrq_key('q', &sysrq_kill_rt_tasks_op))
4244+ printk("Registered kill rt tasks magic sysrq.\n");
4245+ else
4246+ printk("Could not register kill rt tasks magic sysrq.\n");
4247+#endif
4248+	printk("Litmus setup complete.\n");
4249+ return 1;
4250+}
4251+
4252+/* Register for boot option */
4253+__setup("rtsched=", boot_sched_setup);
4254diff --git a/kernel/litmus_sem.c b/kernel/litmus_sem.c
4255new file mode 100644
4256index 0000000..53da534
4257--- /dev/null
4258+++ b/kernel/litmus_sem.c
4259@@ -0,0 +1,567 @@
4260+/*
4261+ * PI semaphores and SRP implementations.
4262+ * Much of the code here is borrowed from include/asm-i386/semaphore.h.
4263+ *
4264+ * NOTE: This implementation is very much a prototype and horribly insecure. It
4265+ * is intended to be a proof of concept, not a feature-complete solution.
4266+ */
4267+
4268+#include <asm/atomic.h>
4269+#include <asm/semaphore.h>
4270+#include <linux/sched.h>
4271+#include <linux/wait.h>
4272+#include <linux/spinlock.h>
4273+#include <linux/queuelock.h>
4274+#include <linux/litmus.h>
4275+#include <linux/sched_plugin.h>
4276+#include <linux/edf_common.h>
4277+
4278+#include <linux/fdso.h>
4279+
4280+#include <linux/trace.h>
4281+
4282+/* ************************************************************************** */
4283+/* PRIORITY INHERITANCE */
4284+/* ************************************************************************** */
4285+
4286+static void* create_pi_semaphore(void)
4287+{
4288+ struct pi_semaphore* sem;
4289+ int i;
4290+
4291+ sem = kmalloc(sizeof(struct pi_semaphore), GFP_KERNEL);
4292+ if (!sem)
4293+ return NULL;
4294+ atomic_set(&sem->count, 1);
4295+ sem->sleepers = 0;
4296+ init_waitqueue_head(&sem->wait);
4297+ sem->hp.task = NULL;
4298+ sem->holder = NULL;
4299+ for (i = 0; i < NR_CPUS; i++)
4300+ sem->hp.cpu_task[i] = NULL;
4301+ return sem;
4302+}
4303+
4304+static void destroy_pi_semaphore(void* sem)
4305+{
4306+ /* XXX assert invariants */
4307+ kfree(sem);
4308+}
4309+
4310+struct fdso_ops pi_sem_ops = {
4311+ .create = create_pi_semaphore,
4312+ .destroy = destroy_pi_semaphore
4313+};
4314+
4315+struct wq_pair {
4316+ struct task_struct* tsk;
4317+ struct pi_semaphore* sem;
4318+};
4319+
4320+static int rt_pi_wake_up(wait_queue_t *wait, unsigned mode, int sync,
4321+ void *key)
4322+{
4323+ struct wq_pair* wqp = (struct wq_pair*) wait->private;
4324+ set_rt_flags(wqp->tsk, RT_F_EXIT_SEM);
4325+ curr_sched_plugin->inherit_priority(wqp->sem, wqp->tsk);
4326+ TRACE_TASK(wqp->tsk,
4327+ "woken up by rt_pi_wake_up() (RT_F_SEM_EXIT, PI)\n");
4328+ /* point to task for default_wake_function() */
4329+ wait->private = wqp->tsk;
4330+ default_wake_function(wait, mode, sync, key);
4331+
4332+ /* Always return true since we know that if we encountered a task
4333+ * that was already running the wake_up raced with the schedule in
4334+ * rt_pi_down(). In that case the task in rt_pi_down() will be scheduled
4335+ * immediately and own the lock. We must not wake up another task in
4336+ * any case.
4337+ */
4338+ return 1;
4339+}
4340+
4341+/* caller is responsible for locking */
4342+int edf_set_hp_task(struct pi_semaphore *sem)
4343+{
4344+ struct list_head *tmp, *next;
4345+ struct task_struct *queued;
4346+ int ret = 0;
4347+
4348+ sem->hp.task = NULL;
4349+ list_for_each_safe(tmp, next, &sem->wait.task_list) {
4350+ queued = ((struct wq_pair*)
4351+ list_entry(tmp, wait_queue_t,
4352+ task_list)->private)->tsk;
4353+
4354+ /* Compare task prios, find high prio task. */
4355+ if (edf_higher_prio(queued, sem->hp.task)) {
4356+ sem->hp.task = queued;
4357+ ret = 1;
4358+ }
4359+ }
4360+ return ret;
4361+}
4362+
4363+/* caller is responsible for locking */
4364+int edf_set_hp_cpu_task(struct pi_semaphore *sem, int cpu)
4365+{
4366+ struct list_head *tmp, *next;
4367+ struct task_struct *queued;
4368+ int ret = 0;
4369+
4370+ sem->hp.cpu_task[cpu] = NULL;
4371+ list_for_each_safe(tmp, next, &sem->wait.task_list) {
4372+ queued = ((struct wq_pair*)
4373+ list_entry(tmp, wait_queue_t,
4374+ task_list)->private)->tsk;
4375+
4376+ /* Compare task prios, find high prio task. */
4377+ if (get_partition(queued) == cpu &&
4378+ edf_higher_prio(queued, sem->hp.cpu_task[cpu])) {
4379+ sem->hp.cpu_task[cpu] = queued;
4380+ ret = 1;
4381+ }
4382+ }
4383+ return ret;
4384+}
4385+
4386+int do_pi_down(struct pi_semaphore* sem)
4387+{
4388+ unsigned long flags;
4389+ struct task_struct *tsk = current;
4390+ struct wq_pair pair;
4391+ int suspended = 1;
4392+ wait_queue_t wait = {
4393+ .private = &pair,
4394+ .func = rt_pi_wake_up,
4395+ .task_list = {NULL, NULL}
4396+ };
4397+
4398+ pair.tsk = tsk;
4399+ pair.sem = sem;
4400+ spin_lock_irqsave(&sem->wait.lock, flags);
4401+
4402+ if (atomic_dec_return(&sem->count) < 0 ||
4403+ waitqueue_active(&sem->wait)) {
4404+ /* we need to suspend */
4405+ tsk->state = TASK_UNINTERRUPTIBLE;
4406+ add_wait_queue_exclusive_locked(&sem->wait, &wait);
4407+
4408+ TRACE_CUR("suspends on PI lock %p\n", sem);
4409+ curr_sched_plugin->pi_block(sem, tsk);
4410+
4411+ /* release lock before sleeping */
4412+ spin_unlock_irqrestore(&sem->wait.lock, flags);
4413+
4414+ TS_PI_DOWN_END;
4415+ preempt_enable_no_resched();
4416+
4417+
4418+		/* We depend on the FIFO wake-up order.
4419+		 * Thus, we don't need to re-check when we wake up; we
4420+		 * are guaranteed to hold the lock since there is only one
4421+		 * wake-up per release.
4422+ */
4423+ schedule();
4424+
4425+ TRACE_CUR("woke up, now owns PI lock %p\n", sem);
4426+
4427+ /* try_to_wake_up() set our state to TASK_RUNNING,
4428+ * all we need to do is to remove our wait queue entry
4429+ */
4430+ remove_wait_queue(&sem->wait, &wait);
4431+ } else {
4432+ /* no priority inheritance necessary, since there are no queued
4433+ * tasks.
4434+ */
4435+ suspended = 0;
4436+ TRACE_CUR("acquired PI lock %p, no contention\n", sem);
4437+ sem->holder = tsk;
4438+ sem->hp.task = tsk;
4439+ curr_sched_plugin->inherit_priority(sem, tsk);
4440+ spin_unlock_irqrestore(&sem->wait.lock, flags);
4441+ }
4442+ return suspended;
4443+}
4444+
4445+void do_pi_up(struct pi_semaphore* sem)
4446+{
4447+ unsigned long flags;
4448+
4449+ spin_lock_irqsave(&sem->wait.lock, flags);
4450+
4451+ TRACE_CUR("releases PI lock %p\n", sem);
4452+ curr_sched_plugin->return_priority(sem);
4453+ sem->holder = NULL;
4454+ if (atomic_inc_return(&sem->count) < 1)
4455+ /* there is a task queued */
4456+ wake_up_locked(&sem->wait);
4457+
4458+ spin_unlock_irqrestore(&sem->wait.lock, flags);
4459+}
4460+
4461+asmlinkage long sys_pi_down(int sem_od)
4462+{
4463+ long ret = 0;
4464+ struct pi_semaphore * sem;
4465+ int suspended = 0;
4466+
4467+ preempt_disable();
4468+ TS_PI_DOWN_START;
4469+
4470+ sem = lookup_pi_sem(sem_od);
4471+ if (sem)
4472+ suspended = do_pi_down(sem);
4473+ else
4474+ ret = -EINVAL;
4475+
4476+ if (!suspended) {
4477+ TS_PI_DOWN_END;
4478+ preempt_enable();
4479+ }
4480+
4481+ return ret;
4482+}
4483+
4484+asmlinkage long sys_pi_up(int sem_od)
4485+{
4486+ long ret = 0;
4487+ struct pi_semaphore * sem;
4488+
4489+ preempt_disable();
4490+ TS_PI_UP_START;
4491+
4492+ sem = lookup_pi_sem(sem_od);
4493+ if (sem)
4494+ do_pi_up(sem);
4495+ else
4496+ ret = -EINVAL;
4497+
4498+
4499+ TS_PI_UP_END;
4500+ preempt_enable();
4501+
4502+ return ret;
4503+}
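+
+/* Illustrative sketch (not part of this patch): user-space use of the PI
+ * semaphore system calls. The pi_down()/pi_up() stubs and the way a
+ * semaphore object descriptor (sem_od) is obtained via the FDSO layer are
+ * assumptions; only the suspend-on-contention / FIFO wake-up semantics are
+ * taken from do_pi_down() and do_pi_up() above.
+ */
+#if 0
+void example_pi_critical_section(int sem_od)
+{
+	pi_down(sem_od);	/* assumed stub for sys_pi_down(); may suspend */
+	/* ... access the shared resource ... */
+	pi_up(sem_od);		/* assumed stub for sys_pi_up(); wakes one waiter */
+}
+#endif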
4504+
4505+/* Clear wait queue and wakeup waiting tasks, and free semaphore. */
4506+/*
4507+asmlinkage long sys_pi_sema_free(int sem_id)
4508+{
4509+ struct list_head *tmp, *next;
4510+ unsigned long flags;
4511+
4512+ if (sem_id < 0 || sem_id >= MAX_PI_SEMAPHORES)
4513+ return -EINVAL;
4514+
4515+ if (!pi_sems[sem_id].used)
4516+ return -EINVAL;
4517+
4518+ spin_lock_irqsave(&pi_sems[sem_id].wait.lock, flags);
4519+ if (waitqueue_active(&pi_sems[sem_id].wait)) {
4520+ list_for_each_safe(tmp, next,
4521+ &pi_sems[sem_id].wait.task_list) {
4522+ wait_queue_t *curr = list_entry(tmp, wait_queue_t,
4523+ task_list);
4524+ list_del(tmp);
4525+ set_rt_flags((struct task_struct*)curr->private,
4526+ RT_F_EXIT_SEM);
4527+ curr->func(curr,
4528+ TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
4529+ 0, NULL);
4530+ }
4531+ }
4532+
4533+ spin_unlock_irqrestore(&pi_sems[sem_id].wait.lock, flags);
4534+ pi_sems[sem_id].used = 0;
4535+
4536+ return 0;
4537+}
4538+*/
4539+
4540+
4541+
4542+/* ************************************************************************** */
4543+/* STACK RESOURCE POLICY */
4544+/* ************************************************************************** */
4545+
4546+
4547+struct srp_priority {
4548+ struct list_head list;
4549+ unsigned int period;
4550+ pid_t pid;
4551+};
4552+
4553+#define list2prio(l) list_entry(l, struct srp_priority, list)
4554+
4555+/* SRP task priority comparison function. Smaller periods have higher
4556+ * priority; ties are broken by PID. Special case: period == 0 <=> no priority
4557+ */
4558+static int srp_higher_prio(struct srp_priority* first,
4559+ struct srp_priority* second)
4560+{
4561+ if (!first->period)
4562+ return 0;
4563+ else
4564+ return !second->period ||
4565+ first->period < second->period || (
4566+ first->period == second->period &&
4567+ first->pid < second->pid);
4568+}
4569+
4570+struct srp {
4571+ struct list_head ceiling;
4572+ wait_queue_head_t ceiling_blocked;
4573+};
4574+
4575+
4576+DEFINE_PER_CPU(struct srp, srp);
4577+
4578+#define system_ceiling(srp) list2prio(srp->ceiling.next)
4579+
4580+static int srp_exceeds_ceiling(struct task_struct* first,
4581+ struct srp* srp)
4582+{
4583+ return list_empty(&srp->ceiling) ||
4584+ get_rt_period(first) < system_ceiling(srp)->period ||
4585+ (get_rt_period(first) == system_ceiling(srp)->period &&
4586+ first->pid < system_ceiling(srp)->pid);
4587+}
4588+
4589+static void srp_add_prio(struct srp* srp, struct srp_priority* prio)
4590+{
4591+ struct list_head *pos;
4592+ if (in_list(&prio->list)) {
4593+ TRACE_CUR("WARNING: SRP violation detected, prio is already in "
4594+ "ceiling list!\n");
4595+ return;
4596+ }
4597+ list_for_each(pos, &srp->ceiling)
4598+ if (unlikely(srp_higher_prio(prio, list2prio(pos)))) {
4599+ __list_add(&prio->list, pos->prev, pos);
4600+ return;
4601+ }
4602+
4603+ list_add_tail(&prio->list, &srp->ceiling);
4604+}
4605+
4606+/* struct for uniprocessor SRP "semaphore" */
4607+struct srp_semaphore {
4608+ struct srp_priority ceiling;
4609+ int cpu; /* cpu associated with this "semaphore" and resource */
4610+ int claimed; /* is the resource claimed (ceiling should be used)? */
4611+};
4612+
4613+
4614+static void* create_srp_semaphore(void)
4615+{
4616+ struct srp_semaphore* sem;
4617+
4618+ if (!is_realtime(current))
4619+ /* XXX log error */
4620+ return NULL;
4621+
4622+ sem = kmalloc(sizeof(*sem), GFP_KERNEL);
4623+ if (!sem)
4624+ return NULL;
4625+
4626+ INIT_LIST_HEAD(&sem->ceiling.list);
4627+ sem->ceiling.period = 0;
4628+ sem->claimed = 0;
4629+ sem->cpu = get_partition(current);
4630+ return sem;
4631+}
4632+
4633+static void destroy_srp_semaphore(void* sem)
4634+{
4635+ /* XXX invariants */
4636+ kfree(sem);
4637+}
4638+
4639+struct fdso_ops srp_sem_ops = {
4640+ .create = create_srp_semaphore,
4641+ .destroy = destroy_srp_semaphore
4642+};
4643+
4644+/* Initialize SRP semaphores at boot time. */
4645+static int __init srp_sema_boot_init(void)
4646+{
4647+ int i;
4648+
4649+ printk("Initializing SRP per-CPU ceilings...");
4650+ for (i = 0; i < NR_CPUS; i++) {
4651+ init_waitqueue_head(&per_cpu(srp, i).ceiling_blocked);
4652+ INIT_LIST_HEAD(&per_cpu(srp, i).ceiling);
4653+ }
4654+ printk(" done!\n");
4655+
4656+ return 0;
4657+}
4658+__initcall(srp_sema_boot_init);
4659+
4660+
4661+void do_srp_down(struct srp_semaphore* sem)
4662+{
4663+ /* claim... */
4664+ sem->claimed = 1;
4665+ /* ...and update ceiling */
4666+ srp_add_prio(&__get_cpu_var(srp), &sem->ceiling);
4667+}
4668+
4669+void do_srp_up(struct srp_semaphore* sem)
4670+{
4671+ sem->claimed = 0;
4672+
4673+ /* Determine new system priority ceiling for this CPU. */
4674+ if (in_list(&sem->ceiling.list))
4675+ list_del(&sem->ceiling.list);
4676+ else
4677+ TRACE_CUR("WARNING: SRP violation detected, prio not in ceiling"
4678+ " list!\n");
4679+
4680+ /* Wake tasks on this CPU, if they exceed current ceiling. */
4681+ wake_up_all(&__get_cpu_var(srp).ceiling_blocked);
4682+}
4683+
4684+/* Adjust the system-wide priority ceiling if resource is claimed. */
4685+asmlinkage long sys_srp_down(int sem_od)
4686+{
4687+ int cpu;
4688+ int ret = -EINVAL;
4689+ struct srp_semaphore* sem;
4690+
4691+ /* disabling preemptions is sufficient protection since
4692+ * SRP is strictly per CPU and we don't interfere with any
4693+ * interrupt handlers
4694+ */
4695+ preempt_disable();
4696+ TS_SRP_DOWN_START;
4697+
4698+ cpu = smp_processor_id();
4699+ sem = lookup_srp_sem(sem_od);
4700+ if (sem && sem->cpu == cpu) {
4701+ do_srp_down(sem);
4702+ ret = 0;
4703+ }
4704+
4705+ TS_SRP_DOWN_END;
4706+ preempt_enable();
4707+ return ret;
4708+}
4709+
4710+/* Adjust the system-wide priority ceiling if resource is freed. */
4711+asmlinkage long sys_srp_up(int sem_od)
4712+{
4713+ int cpu;
4714+ int ret = -EINVAL;
4715+ struct srp_semaphore* sem;
4716+
4717+ preempt_disable();
4718+ TS_SRP_UP_START;
4719+
4720+ cpu = smp_processor_id();
4721+ sem = lookup_srp_sem(sem_od);
4722+
4723+ if (sem && sem->cpu == cpu) {
4724+ do_srp_up(sem);
4725+ ret = 0;
4726+ }
4727+
4728+ TS_SRP_UP_END;
4729+ preempt_enable();
4730+ return ret;
4731+}
4732+
4733+/* Indicate that task will use a resource associated with a given
4734+ * semaphore. Should be done *a priori* before RT task system is
4735+ * executed, so this does *not* update the system priority
4736+ * ceiling! (The ceiling would be meaningless anyway, as the SRP
4737+ * breaks without this a priori knowledge.)
4738+ */
4739+asmlinkage long sys_reg_task_srp_sem(int sem_od)
4740+{
4741+ /*
4742+ * FIXME: This whole concept is rather brittle!
4743+ * There must be a better solution. Maybe register on
4744+ * first reference?
4745+ */
4746+
4747+ struct task_struct *t = current;
4748+ struct srp_priority t_prio;
4749+ struct srp_semaphore* sem;
4750+
4751+ sem = lookup_srp_sem(sem_od);
4752+
4753+ if (!sem)
4754+ return -EINVAL;
4755+
4756+ if (!is_realtime(t))
4757+ return -EPERM;
4758+
4759+ if (sem->cpu != get_partition(t))
4760+ return -EINVAL;
4761+
4762+ preempt_disable();
4763+ t->rt_param.subject_to_srp = 1;
4764+ t_prio.period = get_rt_period(t);
4765+ t_prio.pid = t->pid;
4766+ if (srp_higher_prio(&t_prio, &sem->ceiling)) {
4767+ sem->ceiling.period = t_prio.period;
4768+ sem->ceiling.pid = t_prio.pid;
4769+ }
4770+
4771+ preempt_enable();
4772+
4773+ return 0;
4774+}
4775+
4776+static int srp_wake_up(wait_queue_t *wait, unsigned mode, int sync,
4777+ void *key)
4778+{
4779+ int cpu = smp_processor_id();
4780+ struct task_struct *tsk = wait->private;
4781+ if (cpu != get_partition(tsk))
4782+		TRACE_TASK(tsk, "srp_wake_up on wrong cpu, partition is %d\n",
4783+ get_partition(tsk));
4784+ else if (srp_exceeds_ceiling(tsk, &__get_cpu_var(srp)))
4785+ return default_wake_function(wait, mode, sync, key);
4786+ return 0;
4787+}
4788+
4789+
4790+/* Wait for current task priority to exceed system-wide priority ceiling.
4791+ * Can be used to determine when it is safe to run a job after its release.
4792+ */
4793+void srp_ceiling_block(void)
4794+{
4795+ struct task_struct *tsk = current;
4796+ wait_queue_t wait = {
4797+ .private = tsk,
4798+ .func = srp_wake_up,
4799+ .task_list = {NULL, NULL}
4800+ };
4801+
4802+ preempt_disable();
4803+ if (!srp_exceeds_ceiling(tsk, &__get_cpu_var(srp))) {
4804+ tsk->state = TASK_UNINTERRUPTIBLE;
4805+ add_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait);
4806+ TRACE_CUR("is priority ceiling blocked.\n");
4807+ preempt_enable_no_resched();
4808+ schedule();
4809+ /* Access to CPU var must occur with preemptions disabled,
4810+ * otherwise Linux debug code complains loudly, even if it is
4811+ * ok here.
4812+ */
4813+ preempt_disable();
4814+ TRACE_CUR("finally exceeds system ceiling.\n");
4815+ remove_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait);
4816+ preempt_enable();
4817+ } else {
4818+ TRACE_CUR("is not priority ceiling blocked\n");
4819+ preempt_enable();
4820+ }
4821+}
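+
+/* Illustrative sketch (not part of this patch): the intended per-job SRP
+ * usage, pieced together from the comments above. The srp_down(), srp_up()
+ * and reg_task_srp_sem() wrappers are assumed user-space stubs for the
+ * corresponding system calls; the requirement to register the semaphore
+ * before the real-time task system starts comes from sys_reg_task_srp_sem().
+ */
+#if 0
+void example_srp_usage(int sem_od)
+{
+	reg_task_srp_sem(sem_od);	/* a priori, before the task set runs */
+
+	/* per job: */
+	srp_down(sem_od);		/* claim resource, raise the ceiling */
+	/* ... use the resource ... */
+	srp_up(sem_od);			/* release resource, lower the ceiling */
+}
+#endif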
4822+
4823+/* ************************************************************************** */
4824+
4825+
4826+
4827diff --git a/kernel/pfair_common.c b/kernel/pfair_common.c
4828new file mode 100644
4829index 0000000..c50fdab
4830--- /dev/null
4831+++ b/kernel/pfair_common.c
4832@@ -0,0 +1,237 @@
4833+/*
4834+ * Common functions for PFAIR based scheduler.
4835+ */
4836+
4837+#include <linux/percpu.h>
4838+#include <linux/sched.h>
4839+#include <linux/list.h>
4840+
4841+#include <linux/litmus.h>
4842+#include <linux/sched_plugin.h>
4843+#include <linux/sched_trace.h>
4844+
4845+#include <linux/pfair_common.h>
4846+#include <linux/pfair_math.h>
4847+/* Compare two tasks: returns 1 if lhs has higher PFAIR priority
4848+ * than rhs, 0 otherwise. */
4849+int is_pfair_hp(struct task_struct *lhs, struct task_struct *rhs)
4850+{
4851+ /* Favor subtasks with earlier deadlines */
4852+ if(time_before(get_deadline(lhs), get_deadline(rhs)))
4853+ return 1;
4854+ if(get_deadline(lhs) == get_deadline(rhs)) {
4855+ /* If deadlines are equal,
4856+ * favor non-zero b-bit (a heavy task) */
4857+ if(lhs->rt_param.times.b_bit > rhs->rt_param.times.b_bit)
4858+ return 1;
4859+
4860+ if(lhs->rt_param.times.b_bit == rhs->rt_param.times.b_bit &&
4861+ lhs->rt_param.times.b_bit == 1)
4862+ /* If b-bit is 1, favor tasks with later
4863+ * group deadline */
4864+ return time_after(lhs->rt_param.times.group_deadline,
4865+ rhs->rt_param.times.group_deadline);
4866+
4867+ }
4868+ return 0;
4869+}
4870+
4871+void pfair_domain_init(pfair_domain_t *pfair)
4872+{
4873+ BUG_ON(!pfair);
4874+ INIT_LIST_HEAD(&pfair->ready_queue);
4875+ INIT_LIST_HEAD(&pfair->release_queue);
4876+ queue_lock_init(&pfair->pfair_lock);
4877+ cpus_setall(pfair->domain_cpus);
4878+ /* Use cpu 0 to keep the system alive
4879+ * TODO: Remove later or make it configurable
4880+	 */
4881+ cpu_clear(0, pfair->domain_cpus);
4882+}
4883+
4884+
4885+/* add_ready - add a real-time task to the PFAIR ready queue.
4886+ * It must be runnable. Global domain lock must be held before
4887+ * calling this function.
4888+ *
4889+ * @new: the newly released task
4890+ */
4891+void pfair_add_ready(pfair_domain_t* pfair, struct task_struct *new)
4892+{
4893+ struct list_head *pos;
4894+ struct task_struct *queued;
4895+
4896+ BUG_ON(!new);
4897+ /* find a spot where our deadline is earlier than the next */
4898+ list_for_each(pos, &pfair->ready_queue) {
4899+ queued = list_entry(pos, struct task_struct, rt_list);
4900+ if (unlikely(is_pfair_hp(new, queued))) {
4901+ /* the task at pos has a later deadline */
4902+ /* insert the new task in front of it */
4903+ __list_add(&new->rt_list, pos->prev, pos);
4904+ return;
4905+ }
4906+ }
4907+ /* if we get to this point either the list is empty or new has the
4908+ * lowest priority. Let's add it to the end. */
4909+ list_add_tail(&new->rt_list, &pfair->ready_queue);
4910+}
4911+/**
4912+ * Extraction function: take the highest-priority ready task off the queue, or NULL if it is empty.
4913+ */
4914+struct task_struct* __pfair_take_ready(pfair_domain_t* pfair)
4915+{
4916+ struct task_struct *t = NULL;
4917+ /* either not yet released, preempted, or non-rt */
4918+ if (!list_empty(&pfair->ready_queue)) {
4919+
4920+ /* take next rt task */
4921+ t = list_entry(pfair->ready_queue.next, struct task_struct,
4922+ rt_list);
4923+
4924+ /* kick it out of the ready list */
4925+ list_del(&t->rt_list);
4926+ }
4927+ return t;
4928+}
4929+
4930+
4931+/* add_release - add a real-time task to the PFAIR release queue.
4932+ * Domain lock must be acquired before the function is called.
4933+ *
4934+ * @task: the sleeping task
4935+ */
4936+void pfair_add_release(pfair_domain_t* pfair, struct task_struct *task)
4937+{
4938+ struct list_head *pos;
4939+ struct task_struct *queued;
4940+
4941+ BUG_ON(!task);
4942+ /* find a spot where our deadline is earlier than the next */
4943+ list_for_each_prev(pos, &pfair->release_queue) {
4944+ queued = list_entry(pos, struct task_struct, rt_list);
4945+ if ((unlikely(time_before(queued->rt_param.times.release,
4946+ task->rt_param.times.release)))) {
4947+ /* the task at pos has an earlier release */
4948+			/* insert the new task behind it */
4949+ __list_add(&task->rt_list, pos, pos->next);
4950+ return;
4951+ }
4952+ }
4953+ /* if we get to this point either the list is empty or task has the
4954+ * earliest release. Let's add it to the front. */
4955+ list_add(&task->rt_list, &pfair->release_queue);
4956+}
4957+/**
4958+ * This function is called from the tick handler; it acquires the lock
4959+ * automatically. Only one processor effectively merges the queues.
4960+ */
4961+void pfair_try_release_pending(pfair_domain_t* pfair)
4962+{
4963+ unsigned long flags;
4964+ struct list_head *pos, *save;
4965+ struct task_struct *queued;
4966+ queue_lock_irqsave(&pfair->pfair_lock, flags);
4967+
4968+ list_for_each_safe(pos, save, &pfair->release_queue) {
4969+ queued = list_entry(pos, struct task_struct, rt_list);
4970+ if (likely(time_before_eq(
4971+ queued->rt_param.times.release, jiffies))) {
4972+ /* this one is ready to go*/
4973+ list_del(pos);
4974+ set_rt_flags(queued, RT_F_RUNNING);
4975+
4976+ sched_trace_job_release(queued);
4977+ /* now it can be picked up */
4978+ barrier();
4979+ pfair_add_ready(pfair, queued);
4980+ }
4981+ else
4982+ /* the release queue is ordered */
4983+ break;
4984+ }
4985+ queue_unlock_irqrestore(&pfair->pfair_lock, flags);
4986+}
4987+/*
4988+ * Subtask preparation. Assuming that last_release
4989+ * denotes the time when the job was released.
4990+ */
4991+void pfair_prepare_next_subtask(struct task_struct *t)
4992+{
4993+ BUG_ON(!t);
4994+ /* assign subtask release time, deadline, b-bit,
4995+ * and group deadline
4996+ */
4997+ t->rt_param.times.release = t->rt_param.times.last_release
4998+ +release_time(t);
4999+ t->rt_param.times.deadline = t->rt_param.times.last_release
5000+ +pfair_deadline(t);
5001+ t->rt_param.times.b_bit = b_bit(t);
5002+ t->rt_param.times.group_deadline = t->rt_param.times.last_release
5003+ +group_deadline(t);
5004+}
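+
+/* For reference, the helpers used above presumably implement the standard
+ * PFAIR subtask windows (Baruah et al.): for a task with weight
+ * wt = exec_cost/period and subtask index i (starting at 1, relative to the
+ * job release),
+ *	pseudo-release  r(i) = floor((i - 1) / wt)
+ *	pseudo-deadline d(i) = ceil(i / wt)
+ *	b-bit           b(i) = ceil(i / wt) - floor(i / wt)
+ * (b(i) = 1 iff the windows of subtasks i and i+1 overlap), and the group
+ * deadline extends the deadline of a heavy task (wt >= 1/2) across
+ * consecutive overlapping windows. This summarizes the theory and is not a
+ * specification of the exact encoding in pfair_math.h.
+ */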
5005+
5006+void pfair_prepare_next_job(struct task_struct *t)
5007+{
5008+ BUG_ON(!t);
5009+
5010+ /* prepare next job release */
5011+	/* reset the consumed quanta so that we can compute new release times
5012+	 * and deadlines for subtasks correctly
5013+ */
5014+ t->rt_param.times.exec_time = 0;
5015+ /* assign job-wide release time,
5016+ * this is the starting point to
5017+ * compute subtask releases, deadlines and group deadlines
5018+ */
5019+ t->rt_param.times.last_release = t->rt_param.times.last_release
5020+ +get_rt_period(t);
5021+ /* Release the first subtask. */
5022+ pfair_prepare_next_subtask(t);
5023+ t->first_time_slice = 0;
5024+ /* Increase job sequence number */
5025+ t->rt_param.times.job_no++;
5026+}
5027+
5028+void __pfair_prepare_new_release(struct task_struct *t, jiffie_t start)
5029+{
5030+ t->rt_param.times.release = start;
5031+ t->rt_param.times.last_release = start;
5032+ t->rt_param.times.exec_time = 0;
5033+ t->first_time_slice = 0;
5034+ pfair_prepare_next_subtask(t);
5035+ set_rt_flags(t, RT_F_RUNNING);
5036+}
5037+
5038+void pfair_prepare_new_releases(pfair_domain_t *pfair, jiffie_t start)
5039+{
5040+ unsigned long flags;
5041+ struct list_head tmp_list;
5042+ struct list_head *pos, *n;
5043+ struct task_struct *t;
5044+
5045+ INIT_LIST_HEAD(&tmp_list);
5046+
5047+ queue_lock_irqsave(&pfair->pfair_lock, flags);
5048+
5049+
5050+ while (!list_empty(&pfair->release_queue)) {
5051+ pos = pfair->release_queue.next;
5052+ list_del(pos);
5053+ list_add(pos, &tmp_list);
5054+ }
5055+ while (!list_empty(&pfair->ready_queue)) {
5056+ pos = pfair->ready_queue.next;
5057+ list_del(pos);
5058+ list_add(pos, &tmp_list);
5059+ }
5060+
5061+ list_for_each_safe(pos, n, &tmp_list) {
5062+ t = list_entry(pos, struct task_struct, rt_list);
5063+ list_del(pos);
5064+ __pfair_prepare_new_release(t, start);
5065+ pfair_add_release(pfair, t);
5066+ }
5067+ queue_unlock_irqrestore(&pfair->pfair_lock, flags);
5068+}
5069+
5070diff --git a/kernel/rt_domain.c b/kernel/rt_domain.c
5071new file mode 100644
5072index 0000000..4875c53
5073--- /dev/null
5074+++ b/kernel/rt_domain.c
5075@@ -0,0 +1,185 @@
5076+/*
5077+ * kernel/rt_domain.c
5078+ *
5079+ * LITMUS real-time infrastructure. This file contains the
5080+ * functions that manipulate RT domains. RT domains are an abstraction
5081+ * of a ready queue and a release queue.
5082+ */
5083+
5084+#include <linux/percpu.h>
5085+#include <linux/sched.h>
5086+#include <linux/list.h>
5087+
5088+#include <linux/litmus.h>
5089+#include <linux/sched_plugin.h>
5090+#include <linux/sched_trace.h>
5091+
5092+#include <linux/rt_domain.h>
5093+
5094+
5095+static int dummy_resched(rt_domain_t *rt)
5096+{
5097+ return 0;
5098+}
5099+
5100+static int dummy_order(struct list_head* a, struct list_head* b)
5101+{
5102+ return 0;
5103+}
5104+
5105+int release_order(struct list_head* a, struct list_head* b)
5106+{
5107+ return earlier_release(
5108+ list_entry(a, struct task_struct, rt_list),
5109+ list_entry(b, struct task_struct, rt_list));
5110+}
5111+
5112+
5113+void rt_domain_init(rt_domain_t *rt,
5114+ check_resched_needed_t f,
5115+ list_cmp_t order)
5116+{
5117+ BUG_ON(!rt);
5118+ if (!f)
5119+ f = dummy_resched;
5120+ if (!order)
5121+ order = dummy_order;
5122+ INIT_LIST_HEAD(&rt->ready_queue);
5123+ INIT_LIST_HEAD(&rt->release_queue);
5124+ rt->ready_lock = RW_LOCK_UNLOCKED;
5125+ rt->release_lock = SPIN_LOCK_UNLOCKED;
5126+ rt->check_resched = f;
5127+ rt->order = order;
5128+}
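+
+/* Illustrative sketch (not part of this patch): how a scheduler plugin might
+ * set up and use an rt_domain_t. The my_check_resched() and my_ready_order()
+ * callbacks are hypothetical; the locking convention for the
+ * double-underscore helpers (caller holds the respective ready/release lock)
+ * is assumed from the naming convention.
+ */
+#if 0
+static rt_domain_t my_domain;
+
+static void my_plugin_init(void)
+{
+	rt_domain_init(&my_domain, my_check_resched, my_ready_order);
+}
+
+static struct task_struct* my_pick_next(void)
+{
+	struct task_struct *t;
+	unsigned long flags;
+
+	try_release_pending(&my_domain);	/* move released jobs to the ready queue */
+	write_lock_irqsave(&my_domain.ready_lock, flags);
+	t = __take_ready(&my_domain);
+	write_unlock_irqrestore(&my_domain.ready_lock, flags);
+	return t;
+}
+#endif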
5129+
5130+/* add_ready - add a real-time task to the rt ready queue. It must be runnable.
5131+ * @new: the newly released task
5132+ */
5133+void __add_ready(rt_domain_t* rt, struct task_struct *new)
5134+{
5135+ TRACE("rt: adding %s/%d (%u, %u) to ready queue\n",
5136+ new->comm, new->pid, get_exec_cost(new), get_rt_period(new));
5137+
5138+ if (!list_insert(&new->rt_list, &rt->ready_queue, rt->order))
5139+ rt->check_resched(rt);
5140+}
5141+
5142+struct task_struct* __take_ready(rt_domain_t* rt)
5143+{
5144+ struct task_struct *t = __peek_ready(rt);
5145+
5146+ /* kick it out of the ready list */
5147+ if (t)
5148+ list_del(&t->rt_list);
5149+ return t;
5150+}
5151+
5152+struct task_struct* __peek_ready(rt_domain_t* rt)
5153+{
5154+ if (!list_empty(&rt->ready_queue))
5155+ return next_ready(rt);
5156+ else
5157+ return NULL;
5158+}
5159+
5160+struct task_struct* __take_ready_rq(rt_domain_t* rt, runqueue_t* rq, int cpu)
5161+{
5162+ struct task_struct *task = __take_ready(rt);
5163+
5164+ if (task) {
5165+ set_task_cpu(task, cpu);
5166+ __activate_task(task, rq);
5167+ }
5168+ return task;
5169+}
5170+
5171+/* add_release - add a real-time task to the rt release queue.
5172+ * @task: the sleeping task
5173+ */
5174+void __add_release(rt_domain_t* rt, struct task_struct *task)
5175+{
5176+ TRACE("rt: adding %s/%d (%u, %u) rel=%d to release queue\n",
5177+ task->comm, task->pid, get_exec_cost(task), get_rt_period(task),
5178+ get_release(task));
5179+
5180+ list_insert(&task->rt_list, &rt->release_queue, release_order);
5181+}
5182+
5183+void __release_pending(rt_domain_t* rt)
5184+{
5185+ struct list_head *pos, *save;
5186+ struct task_struct *queued;
5187+ list_for_each_safe(pos, save, &rt->release_queue) {
5188+ queued = list_entry(pos, struct task_struct, rt_list);
5189+ if (likely(is_released(queued))) {
5190+ /* this one is ready to go*/
5191+ list_del(pos);
5192+ set_rt_flags(queued, RT_F_RUNNING);
5193+
5194+ sched_trace_job_release(queued);
5195+
5196+ /* now it can be picked up */
5197+ barrier();
5198+ add_ready(rt, queued);
5199+ }
5200+ else
5201+ /* the release queue is ordered */
5202+ break;
5203+ }
5204+}
5205+
5206+void try_release_pending(rt_domain_t* rt)
5207+{
5208+ unsigned long flags;
5209+
5210+ if (spin_trylock_irqsave(&rt->release_lock, flags)) {
5211+ __release_pending(rt);
5212+ spin_unlock_irqrestore(&rt->release_lock, flags);
5213+ }
5214+}
5215+
5216+void rerelease_all(rt_domain_t *rt,
5217+ release_at_t release)
5218+{
5219+ unsigned long flags;
5220+
5221+ spin_lock_irqsave(&rt->release_lock, flags);
5222+ write_lock(&rt->ready_lock);
5223+
5224+ __rerelease_all(rt, release);
5225+
5226+ write_unlock(&rt->ready_lock);
5227+ spin_unlock_irqrestore(&rt->release_lock, flags);
5228+}
5229+
5230+void __rerelease_all(rt_domain_t *rt,
5231+ release_at_t release)
5232+{
5233+ jiffie_t start = jiffies + 10;
5234+ struct list_head tmp_list;
5235+ struct list_head *pos, *n;
5236+ struct task_struct *t;
5237+
5238+ INIT_LIST_HEAD(&tmp_list);
5239+
5240+ while (!list_empty(&rt->release_queue)) {
5241+ pos = rt->release_queue.next;
5242+ list_del(pos);
5243+ list_add(pos, &tmp_list);
5244+ }
5245+ while (!list_empty(&rt->ready_queue)) {
5246+ pos = rt->ready_queue.next;
5247+ list_del(pos);
5248+ list_add(pos, &tmp_list);
5249+ }
5250+
5251+ list_for_each_safe(pos, n, &tmp_list) {
5252+ t = list_entry(pos, struct task_struct, rt_list);
5253+ list_del(pos);
5254+ release(t, start);
5255+ __add_release(rt, t);
5256+ }
5257+
5258+}
5259+
5260+
5261diff --git a/kernel/sched.c b/kernel/sched.c
5262index cca93cc..47f16cc 100644
5263--- a/kernel/sched.c
5264+++ b/kernel/sched.c
5265@@ -56,6 +56,16 @@
5266
5267 #include <asm/unistd.h>
5268
5269+#include <linux/litmus.h>
5270+#define __SCHED_C__
5271+#include <linux/sched_plugin.h>
5272+#include <linux/sched_trace.h>
5273+#include <linux/rt_param.h>
5274+#include <linux/trace.h>
5275+
5276+/* LITMUS: avoid races with multiple task wake-ups */
5277+DEFINE_SPINLOCK(litmus_task_set_lock);
5278+
5279 /*
5280 * Convert user-nice values [ -20 ... 0 ... 19 ]
5281 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
5282@@ -836,7 +846,7 @@ static int effective_prio(struct task_struct *p)
5283 * keep the priority unchanged. Otherwise, update priority
5284 * to the normal priority:
5285 */
5286- if (!rt_prio(p->prio))
5287+ if (!rt_prio(p->prio) && !is_realtime(p))
5288 return p->normal_prio;
5289 return p->prio;
5290 }
5291@@ -844,7 +854,7 @@ static int effective_prio(struct task_struct *p)
5292 /*
5293 * __activate_task - move a task to the runqueue.
5294 */
5295-static void __activate_task(struct task_struct *p, struct rq *rq)
5296+void __activate_task(struct task_struct *p, struct rq *rq)
5297 {
5298 struct prio_array *target = rq->active;
5299
5300@@ -999,7 +1009,7 @@ out:
5301 /*
5302 * deactivate_task - remove a task from the runqueue.
5303 */
5304-static void deactivate_task(struct task_struct *p, struct rq *rq)
5305+void deactivate_task(struct task_struct *p, struct rq *rq)
5306 {
5307 dec_nr_running(p, rq);
5308 dequeue_task(p, p->array);
5309@@ -1408,13 +1418,44 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
5310 #endif
5311
5312 rq = task_rq_lock(p, &flags);
5313+
5314+ if (is_realtime(p))
5315+ TRACE("try_to_wake_up(%s/%d)\n", p->comm, p->pid);
5316+
5317 old_state = p->state;
5318 if (!(old_state & state))
5319- goto out;
5320+ goto out;
5321
5322 if (p->array)
5323 goto out_running;
5324
5325+
5326+ spin_lock(&litmus_task_set_lock);
5327+ if (p->rt_param.litmus_controlled) {
5328+ /* Already included. This can happen
5329+ * if the task dropped all locks to call
5330+ * schedule() but a wake up raced and came in
5331+ * early.
5332+ */
5333+
5334+ spin_unlock(&litmus_task_set_lock);
5335+ goto out_running;
5336+ }
5337+
5338+ sched_trace_task_arrival(p);
5339+ if (is_realtime(p)) {
5340+ p->rt_param.litmus_controlled = 1;
5341+ curr_sched_plugin->wake_up_task(p);
5342+
5343+ spin_unlock(&litmus_task_set_lock);
5344+ goto out_running;
5345+ }
5346+
5347+ p->rt_param.litmus_controlled = 0;
5348+ spin_unlock(&litmus_task_set_lock);
5349+
5350+
5351+
5352 cpu = task_cpu(p);
5353 this_cpu = smp_processor_id();
5354
5355@@ -1575,11 +1616,14 @@ static void task_running_tick(struct rq *rq, struct task_struct *p);
5356 void fastcall sched_fork(struct task_struct *p, int clone_flags)
5357 {
5358 int cpu = get_cpu();
5359+
5360+ litmus_fork(p);
5361
5362 #ifdef CONFIG_SMP
5363 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
5364 #endif
5365 set_task_cpu(p, cpu);
5366+ clear_rt_params(p);
5367
5368 /*
5369 * We mark the process as running here, but have not actually
5370@@ -1730,6 +1774,9 @@ void fastcall sched_exit(struct task_struct *p)
5371 unsigned long flags;
5372 struct rq *rq;
5373
5374+ if (is_realtime(p))
5375+ return;
5376+
5377 /*
5378 * If the child was a (relative-) CPU hog then decrease
5379 * the sleep_avg of the parent as well.
5380@@ -1765,6 +1812,31 @@ static inline void prepare_task_switch(struct rq *rq, struct task_struct *next)
5381 prepare_arch_switch(next);
5382 }
5383
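+/* litmus_transition() - complete a pending RT <-> BE transition for a stopped
+ * task while the runqueue lock is held. A successful RT -> BE transition and
+ * a rejected BE -> RT request both leave the task runnable as a best-effort
+ * task, so it is reactivated here; a successful BE -> RT transition does not
+ * reactivate the task at this point.
+ */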
5384+static void litmus_transition(struct task_struct *tsk, struct rq *rq)
5385+{
5386+ int wakeup = 0;
5387+ WARN_ON(tsk->state != TASK_STOPPED);
5388+
5389+ tsk->rt_param.transition_pending = 0;
5390+ if (is_realtime(tsk)) {
5391+ /* RT -> BE transition */
5392+ tsk->rt_param.transition_error = transition_to_be(tsk);
5393+ wakeup = tsk->rt_param.transition_error == 0;
5394+ } else {
5395+ /* BE -> RT transition */
5396+ tsk->rt_param.transition_error = transition_to_rt(tsk);
5397+ /* If it was rejected as a real-time task, then
5398+ * keep it running as a best-effort task.
5399+ */
5400+ wakeup = tsk->rt_param.transition_error != 0;
5401+ }
5402+ if (wakeup) {
5403+ /* we still hold the runqueue lock */
5404+ tsk->state = TASK_RUNNING;
5405+ __activate_task(tsk, rq);
5406+ }
5407+}
5408+
5409 /**
5410 * finish_task_switch - clean up after a task-switch
5411 * @rq: runqueue associated with task-switch
5412@@ -1801,6 +1873,15 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
5413 */
5414 prev_state = prev->state;
5415 finish_arch_switch(prev);
5416+	/* Requeue the previous real-time task before we drop the rq lock, because
5417+	 * dropping it may lead to a preemption.
5418+ */
5419+ curr_sched_plugin->finish_switch(prev);
5420+ sched_trace_task_scheduled(current);
5421+ if (rt_transition_pending(prev))
5422+ litmus_transition(prev, rq);
5423+ /* trace before IRQs are enabled */
5424+ TS_CXS_END;
5425 finish_lock_switch(rq, prev);
5426 if (mm)
5427 mmdrop(mm);
5428@@ -1811,7 +1892,7 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
5429 */
5430 kprobe_flush_task(prev);
5431 put_task_struct(prev);
5432- }
5433+ }
5434 }
5435
5436 /**
5437@@ -2990,7 +3071,7 @@ static inline void idle_balance(int cpu, struct rq *rq)
5438 static inline void wake_priority_sleeper(struct rq *rq)
5439 {
5440 #ifdef CONFIG_SCHED_SMT
5441- if (!rq->nr_running)
5442+ if (!rq->nr_running || get_rt_mode() == MODE_RT_RUN)
5443 return;
5444
5445 spin_lock(&rq->lock);
5446@@ -3220,14 +3301,30 @@ void scheduler_tick(void)
5447
5448 update_cpu_clock(p, rq, now);
5449
5450- if (p == rq->idle)
5451- /* Task on the idle queue */
5452- wake_priority_sleeper(rq);
5453- else
5454- task_running_tick(rq, p);
5455+ /* check whether the RT scheduler plugin requires a call to
5456+ * schedule
5457+ */
5458+ TS_PLUGIN_TICK_START;
5459+ if (rt_scheduler_tick() == FORCE_RESCHED)
5460+ set_tsk_need_resched(p);
5461+ TS_PLUGIN_TICK_END;
5462+
5463+	/* real-time accounting is done by the plugin;
5464+	 * call the Linux functions only for background tasks
5465+ */
5466+ if (!is_realtime(p)) {
5467+ if (p == rq->idle)
5468+ /* Task on the idle queue */
5469+ wake_priority_sleeper(rq);
5470+ else
5471+ task_running_tick(rq, p);
5472+ }
5473+ send_scheduler_signals();
5474+
5475 #ifdef CONFIG_SMP
5476 update_load(rq);
5477- if (time_after_eq(jiffies, rq->next_balance))
5478+ if (time_after_eq(jiffies, rq->next_balance) &&
5479+ get_rt_mode() == MODE_NON_RT)
5480 raise_softirq(SCHED_SOFTIRQ);
5481 #endif
5482 }
5483@@ -3406,6 +3503,7 @@ static inline int interactive_sleep(enum sleep_type sleep_type)
5484 sleep_type == SLEEP_INTERRUPTED);
5485 }
5486
5487+
5488 /*
5489 * schedule() is the main scheduler function.
5490 */
5491@@ -3420,6 +3518,7 @@ asmlinkage void __sched schedule(void)
5492 long *switch_count;
5493 struct rq *rq;
5494
5495+
5496 /*
5497 * Test if we are atomic. Since do_exit() needs to call into
5498 * schedule() atomically, we ignore that path for now.
5499@@ -3427,8 +3526,9 @@ asmlinkage void __sched schedule(void)
5500 */
5501 if (unlikely(in_atomic() && !current->exit_state)) {
5502 printk(KERN_ERR "BUG: scheduling while atomic: "
5503- "%s/0x%08x/%d\n",
5504- current->comm, preempt_count(), current->pid);
5505+ "%s/0x%08x/%d %s\n",
5506+ current->comm, preempt_count(), current->pid,
5507+ is_realtime(current) ? "rt" : "non-rt");
5508 debug_show_held_locks(current);
5509 if (irqs_disabled())
5510 print_irqtrace_events(current);
5511@@ -3438,6 +3538,7 @@ asmlinkage void __sched schedule(void)
5512
5513 need_resched:
5514 preempt_disable();
5515+ TS_SCHED_START;
5516 prev = current;
5517 release_kernel_lock(prev);
5518 need_resched_nonpreemptible:
5519@@ -3470,6 +3571,7 @@ need_resched_nonpreemptible:
5520 spin_lock_irq(&rq->lock);
5521
5522 switch_count = &prev->nivcsw;
5523+ /* check for blocking tasks */
5524 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
5525 switch_count = &prev->nvcsw;
5526 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
5527@@ -3478,13 +3580,65 @@ need_resched_nonpreemptible:
5528 else {
5529 if (prev->state == TASK_UNINTERRUPTIBLE)
5530 rq->nr_uninterruptible++;
5531+			/* we need to remove real-time tasks from the runqueue */
5532+
5533+ /* protect against races with signal delivery and IO
5534+ * interrupts on other CPUs
5535+ *
5536+ * FIXME: This is probably not sufficient,
5537+ * as (in theory) after
5538+ * unlocking the task_set_lock this task could
5539+			 * be scheduled elsewhere before we switched away
5540+ * from it. This has not been observed
5541+ * yet. To get this locking right is tricky.
5542+ */
5543+ spin_lock(&litmus_task_set_lock);
5544+ if (prev->rt_param.litmus_controlled)
5545+ prev->rt_param.litmus_controlled = 0;
5546+ spin_unlock(&litmus_task_set_lock);
5547+
5548+ if (is_realtime(prev)) {
5549+ TRACE("schedule: %s/%d blocks. state = %d\n",
5550+ prev->comm, prev->pid, prev->state);
5551+ curr_sched_plugin->task_blocks(prev);
5552+ /* Enable this for all tasks to get _a lot_ of
5553+ * data. Can be helpful for debugging.
5554+ */
5555+ sched_trace_task_departure(prev);
5556+ }
5557+ /* only indirect switching is supported in the current
5558+ * version of LITMUS
5559+ */
5560 deactivate_task(prev, rq);
5561 }
5562 }
5563
5564+ next = NULL;
5565+
5566+ /* consult the real-time plugin */
5567+ TS_PLUGIN_SCHED_START;
5568+ curr_sched_plugin->schedule(prev, &next, rq);
5569+ TS_PLUGIN_SCHED_END;
5570+ /* If the real-time plugin wants to switch to a specific task
5571+ * it'll be on the rq and have the highest priority. There will
5572+ * be exaclty one such task, thus the selection of the next task
5573+	 * be exactly one such task, thus the selection of the next task
5574+ * triggered if there are no RT tasks pending (on this CPU). Thus,
5575+ * we may as well skip it.
5576+ */
5577+ if (next)
5578+ goto switch_tasks;
5579+
5580 cpu = smp_processor_id();
5581 if (unlikely(!rq->nr_running)) {
5582- idle_balance(cpu, rq);
5583+ /* only load-balance if we are not in RT mode
5584+ *
5585+		 * TODO: Maybe this can be relaxed by modifying the
5586+ * load-balancing routines in such a way that they never touch
5587+ * real-time tasks.
5588+ */
5589+ if (get_rt_mode() == MODE_NON_RT)
5590+ idle_balance(cpu, rq);
5591 if (!rq->nr_running) {
5592 next = rq->idle;
5593 rq->expired_timestamp = 0;
5594@@ -3528,7 +3682,7 @@ need_resched_nonpreemptible:
5595 }
5596 }
5597 next->sleep_type = SLEEP_NORMAL;
5598- if (dependent_sleeper(cpu, rq, next))
5599+ if (get_rt_mode() == MODE_NON_RT && dependent_sleeper(cpu, rq, next))
5600 next = rq->idle;
5601 switch_tasks:
5602 if (next == rq->idle)
5603@@ -3546,7 +3700,11 @@ switch_tasks:
5604 prev->timestamp = prev->last_ran = now;
5605
5606 sched_info_switch(prev, next);
5607+ TS_SCHED_END;
5608 if (likely(prev != next)) {
5609+ TS_CXS_START;
5610+ if (is_running(prev))
5611+ sched_trace_task_preemption(prev, next);
5612 next->timestamp = now;
5613 rq->nr_switches++;
5614 rq->curr = next;
5615@@ -3560,9 +3718,12 @@ switch_tasks:
5616 * CPUs since it called schedule(), thus the 'rq' on its stack
5617 * frame will be invalid.
5618 */
5619- finish_task_switch(this_rq(), prev);
5620- } else
5621+ finish_task_switch(this_rq(), prev);
5622+ } else {
5623 spin_unlock_irq(&rq->lock);
5624+ }
5625+
5626+ send_scheduler_signals();
5627
5628 prev = current;
5629 if (unlikely(reacquire_kernel_lock(prev) < 0))
5630@@ -3691,6 +3852,7 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
5631 }
5632 }
5633
5634+
5635 /**
5636 * __wake_up - wake up threads blocked on a waitqueue.
5637 * @q: the waitqueue
5638@@ -3709,6 +3871,7 @@ void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
5639 }
5640 EXPORT_SYMBOL(__wake_up);
5641
5642+
5643 /*
5644 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
5645 */
5646@@ -3717,6 +3880,7 @@ void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
5647 __wake_up_common(q, mode, 1, 0, NULL);
5648 }
5649
5650+
5651 /**
5652 * __wake_up_sync - wake up threads blocked on a waitqueue.
5653 * @q: the waitqueue
5654@@ -4175,7 +4339,7 @@ static inline struct task_struct *find_process_by_pid(pid_t pid)
5655 }
5656
5657 /* Actually do priority change: must hold rq lock. */
5658-static void __setscheduler(struct task_struct *p, int policy, int prio)
5659+void __setscheduler(struct task_struct *p, int policy, int prio)
5660 {
5661 BUG_ON(p->array);
5662
5663@@ -6877,7 +7041,7 @@ void __init sched_init_smp(void)
5664 BUG();
5665 }
5666 #else
5667-void __init sched_init_smp(void)
5668+void __init linux_sched_init_smp(void)
5669 {
5670 }
5671 #endif /* CONFIG_SMP */
5672diff --git a/kernel/sched_adaptive.c b/kernel/sched_adaptive.c
5673new file mode 100644
5674index 0000000..44ce924
5675--- /dev/null
5676+++ b/kernel/sched_adaptive.c
5677@@ -0,0 +1,1454 @@
5678+
5679+
5680+/*
5681+ * kernel/sched_adaptive.c
5682+ *
5683+ * Implementation of Aaron's adaptive global EDF scheduling algorithm. It is
5684+ * based on the GSN-EDF scheduler. However, it does not support synchronization
5685+ * primitives.
5686+ *
5687+ * It implements a version of FC-GEDF with a bunch of linearity assumptions for
5688+ * the optimizer and the the weight-transfer function. The code is meant to be
5689+ * the optimizer and the weight-transfer function. The code is meant to be
5690+ * clear; however, you really need to read the paper if you want to understand
5691+ *
5692+ * Block et al., "Feedback-Controlled Adaptive Multiprocessor Real-Time
5693+ * Systems", submitted to RTAS 2008.
5694+ */
5695+
5696+#include <linux/percpu.h>
5697+#include <linux/sched.h>
5698+#include <linux/list.h>
5699+
5700+#include <linux/queuelock.h>
5701+#include <linux/litmus.h>
5702+#include <linux/sched_plugin.h>
5703+#include <linux/edf_common.h>
5704+#include <linux/sched_trace.h>
5705+#include <asm/uaccess.h>
5706+
5707+#include <linux/fpmath.h>
5708+
5709+/* Overview of GSN-EDF operations.
5710+ *
5711+ * For a detailed explanation of GSN-EDF have a look at the FMLP paper. This
5712+ * description only covers how the individual operations are implemented in
5713+ * LITMUS.
5714+ *
5715+ * link_task_to_cpu(T, cpu) - Low-level operation to update the linkage
5716+ * structure (NOT the actually scheduled
5717+ * task). If there is another linked task To
5718+ * already it will set To->linked_on = NO_CPU
5719+ * (thereby removing its association with this
5720+ * CPU). However, it will not requeue the
5721+ * previously linked task (if any). It will set
5722+ * T's state to RT_F_RUNNING and check whether
5723+ * it is already running somewhere else. If T
5724+ * is scheduled somewhere else it will link
5725+ * it to that CPU instead (and pull the linked
5726+ * task to cpu). T may be NULL.
5727+ *
5728+ * unlink(T) - Unlink removes T from all scheduler data
5729+ * structures. If it is linked to some CPU it
5730+ * will link NULL to that CPU. If it is
5731+ * currently queued in the gsnedf queue it will
5732+ * be removed from the T->rt_list. It is safe to
5733+ * call unlink(T) if T is not linked. T may not
5734+ * be NULL.
5735+ *
5736+ * requeue(T) - Requeue will insert T into the appropriate
5737+ * queue. If the system is in real-time mode and
5738+ * the T is released already, it will go into the
5739+ * ready queue. If the system is not in
5740+ *                        real-time mode, then T will go into the
5741+ * release queue. If T's release time is in the
5742+ * future, it will go into the release
5743+ * queue. That means that T's release time/job
5744+ *                        no/etc. has to be updated before requeue(T) is
5745+ * called. It is not safe to call requeue(T)
5746+ * when T is already queued. T may not be NULL.
5747+ *
5748+ * gsnedf_job_arrival(T) - This is the catch all function when T enters
5749+ * the system after either a suspension or at a
5750+ * job release. It will queue T (which means it
5751+ * is not safe to call gsnedf_job_arrival(T) if
5752+ * T is already queued) and then check whether a
5753+ * preemption is necessary. If a preemption is
5754+ * necessary it will update the linkage
5755+ * accordingly and cause scheduled to be called
5756+ * (either with an IPI or need_resched). It is
5757+ * safe to call gsnedf_job_arrival(T) if T's
5758+ * next job has not been actually released yet
5759+ *                        (release time in the future). T will be put
5760+ * on the release queue in that case.
5761+ *
5762+ * job_completion(T) - Take care of everything that needs to be done
5763+ * to prepare T for its next release and place
5764+ * it in the right queue with
5765+ * gsnedf_job_arrival().
5766+ *
5767+ *
5768+ * When we know that T is linked to a CPU then link_task_to_cpu(NULL, CPU) is
5769+ * equivalent to unlink(T). Note that if you unlink a task from a CPU none of
5770+ * the functions will automatically propagate a pending task from the ready queue
5771+ * to a linked task. This is the job of the calling function (by means of
5772+ * __take_ready).
5773+ */
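/* Illustrative sketch in plain C, not part of the patch: the difference
 * between "linked" and "scheduled" described above, using a made-up toy
 * per-CPU record instead of the real cpu_entry_t.
 */
#include <stdio.h>
#include <stddef.h>

struct toy_cpu {
	const char *linked;    /* task the plugin wants on this CPU   */
	const char *scheduled; /* task actually running on this CPU   */
};

static void toy_link(struct toy_cpu *e, const char *task)
{
	e->linked = task;      /* like link_task_to_cpu(task, e)      */
}

static void toy_unlink(struct toy_cpu *e)
{
	e->linked = NULL;      /* like link_task_to_cpu(NULL, e)      */
}

int main(void)
{
	struct toy_cpu cpu0 = { NULL, NULL };

	toy_link(&cpu0, "T1");   /* scheduler decision: T1 should run */
	cpu0.scheduled = "T1";   /* schedule() later enacts the link  */
	toy_unlink(&cpu0);       /* T1 is preempted or blocks...      */
	printf("linked=%s scheduled=%s\n",
	       cpu0.linked ? cpu0.linked : "(none)",
	       cpu0.scheduled);  /* ...but stays "scheduled" until the
	                          * next context switch takes place   */
	return 0;
}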
5774+
5775+static void unlink(struct task_struct* t);
5776+static void adaptive_job_arrival(struct task_struct* task);
5777+
5778+/* cpu_entry_t - maintain the linked and scheduled state
5779+ */
5780+typedef struct {
5781+ int cpu;
5782+ struct task_struct* linked; /* only RT tasks */
5783+ struct task_struct* scheduled; /* only RT tasks */
5784+ struct list_head list;
5785+ atomic_t will_schedule; /* prevent unneeded IPIs */
5786+} cpu_entry_t;
5787+DEFINE_PER_CPU(cpu_entry_t, adaptive_cpu_entries);
5788+
5789+#define set_will_schedule() \
5790+ (atomic_set(&__get_cpu_var(adaptive_cpu_entries).will_schedule, 1))
5791+#define clear_will_schedule() \
5792+ (atomic_set(&__get_cpu_var(adaptive_cpu_entries).will_schedule, 0))
5793+#define test_will_schedule(cpu) \
5794+ (atomic_read(&per_cpu(adaptive_cpu_entries, cpu).will_schedule))
5795+
5796+
5797+#define NO_CPU 0xffffffff
5798+
5799+/* The adaptive_lock is used to serialize all scheduling events.
5800+ * It protects all of the plugin's scheduling state.
5801+ */
5802+static queuelock_t adaptive_lock;
5803+/* the cpus queue themselves according to priority in here */
5804+static LIST_HEAD(adaptive_cpu_queue);
5805+
5806+static rt_domain_t adaptive;
5807+
5808+/* feedback control parameters */
5809+static fp_t fc_a, fc_b;
5810+
5811+/* optimizer trigger */
5812+static jiffie_t last_optimizer_run;
5813+static jiffie_t optimizer_min_invocation_sep;
5814+static jiffie_t optimizer_period;
5815+static fp_t task_error_threshold;
5816+
5817+static fp_t system_capacity;
5818+/* total actual weight of the task system */
5819+static fp_t total_weight;
5820+
5821+/* optimizer time snapshot */
5822+jiffie_t opt_time;
5823+
5824+/* Delayed weight increase notification list.
5825+ * This list gets clobbered on each optimizer run.
5826+ */
5827+static LIST_HEAD(adaptive_inc_list);
5828+
5829+/* comment out to disable optimizer debugging */
5830+#define ENABLE_OPTIMIZER_DEBUGGING
5831+
5832+#ifdef ENABLE_OPTIMIZER_DEBUGGING
5833+#define OPT_DBG TRACE
5834+#define OPT_DBG_T TRACE_TASK
5835+#else
5836+#define OPT_DBG
5837+#define OPT_DBG_T
5838+#endif
5839+
5840+/******************************************************************************/
5841+/* OPTIMIZER MATH */
5842+/******************************************************************************/
5843+
5844+/* All time-dependent functions
5845+ * rely on opt_time.
5846+ * Update it in the optimizer before use!
5847+ */
5848+
5849+static inline fp_t ideal(fp_t weight, jiffie_t delta_t)
5850+{
5851+ return _mul(weight, FP(delta_t));
5852+}
5853+
5854+static noinline long ideal_exec_time(struct task_struct* t)
5855+{
5856+ jiffie_t delta = opt_time - get_last_release(t);
5857+ return _round(ideal(get_est_weight(t), delta));
5858+}
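/* Illustrative sketch in plain C, not part of the patch: the ideal
 * allocation above with floating point instead of fpmath.  A task of
 * estimated weight w released delta_t jiffies ago should ideally have
 * received w * delta_t units of service; a task that received less is
 * under-allocated (cf. is_under_allocated() further down).
 */
#include <stdio.h>

static double ideal_alloc(double weight, long delta_t)
{
	return weight * delta_t;
}

int main(void)
{
	double w  = 0.25;  /* estimated weight               */
	long dt   = 40;    /* jiffies since the last release */
	long exec = 8;     /* execution time received so far */

	printf("ideal=%.1f actual=%ld under-allocated=%d\n",
	       ideal_alloc(w, dt), exec, ideal_alloc(w, dt) >= exec);
	return 0;
}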
5859+
5860+/* this makes a whole bunch of linearity assumptions */
5861+static noinline fp_t weight_transfer(struct task_struct* t,
5862+ unsigned int from, unsigned int to,
5863+ fp_t act_weight)
5864+{
5865+ fp_t rel_from, rel_to, ret;
5866+ rel_from = get_sl(t, from).weight;
5867+ rel_to = get_sl(t, to).weight;
5868+ ret.val = (act_weight.val * rel_to.val) / rel_from.val;
5869+ OPT_DBG("weight_transfer(%ld, %ld, %ld) => %ld to=%u from=%u\n",
5870+ rel_from.val, rel_to.val, act_weight.val, ret.val, from, to);
5871+
5872+ return ret;
5873+}
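/* Illustrative sketch in plain C, not part of the patch: the linear
 * weight-transfer rule above with doubles.  An actual weight is scaled
 * by the ratio of the target level's nominal weight to the current
 * level's nominal weight.
 */
#include <stdio.h>

static double transfer(double rel_from, double rel_to, double act_weight)
{
	return act_weight * rel_to / rel_from;
}

int main(void)
{
	/* nominal weight 0.2 at the current level, 0.4 at the target
	 * level: an actual weight of 0.25 scales by 2x to 0.5
	 */
	printf("%.2f\n", transfer(0.2, 0.4, 0.25));
	return 0;
}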
5874+
5875+static noinline fp_t est_weight_at(struct task_struct* t, unsigned int level)
5876+{
5877+ if (t->rt_param.no_service_levels)
5878+ return weight_transfer(t, get_cur_sl(t), level,
5879+ get_est_weight(t));
5880+ else
5881+ return get_est_weight(t);
5882+
5883+}
5884+
5885+static noinline void update_estimate(predictor_state_t *state, fp_t actual_weight,
5886+ fp_t a, fp_t b)
5887+{
5888+ fp_t err, new;
5889+
5890+ OPT_DBG("OLD ESTIMATE Weight" _FP_ " ActWt " _FP_ " A:" _FP_ ", B:" _FP_
5891+ "\n", fp2str(state->estimate), fp2str(actual_weight), fp2str(a),
5892+ fp2str(b));
5893+ err = _sub(actual_weight, state->estimate);
5894+ new = _add(_mul(a, err),
5895+ _mul(b, state->accumulated));
5896+
5897+ total_weight = _sub(total_weight, state->estimate);
5898+ state->estimate = new;
5899+ total_weight = _add(total_weight, state->estimate);
5900+
5901+ state->accumulated = _add(state->accumulated, err);
5902+ OPT_DBG("ERROR " _FP_ ", NEW " _FP_ ", ACC" _FP_ "\n", fp2str(err),
5903+ fp2str(new), fp2str(state->accumulated));
5904+
5905+}
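/* Illustrative sketch in plain C, not part of the patch: the feedback
 * update above with doubles.  err is the estimation error of the job
 * that just completed, accumulated is the running error sum; the new
 * estimate is a * err + b * accumulated (the a and b used by this
 * plugin are set in init_adaptive_plugin() below).
 */
#include <stdio.h>

struct predictor { double estimate, accumulated; };

static void update(struct predictor *p, double actual, double a, double b)
{
	double err = actual - p->estimate;
	p->estimate = a * err + b * p->accumulated;
	p->accumulated += err;
}

int main(void)
{
	struct predictor p = { 0.20, 0.0 };

	update(&p, 0.30, 0.102, 0.303);
	printf("estimate=%.4f accumulated=%.4f\n",
	       p.estimate, p.accumulated);   /* 0.0102 and 0.1000 */
	return 0;
}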
5906+
5907+static noinline fp_t linear_metric(struct task_struct* t)
5908+{
5909+ fp_t v1, vmax, g1, gmax;
5910+ fp_t est_w;
5911+ unsigned int l = t->rt_param.no_service_levels;
5912+ unsigned int lcur;
5913+
5914+ if (l <= 1)
5915+ return FP(0);
5916+
5917+	lcur = get_cur_sl(t);
5918+ est_w = get_est_weight(t);
5919+
5920+ OPT_DBG_T(t, " linear_metric: lcur=%u l=%u est_w=" _FP_ "\n",
5921+ lcur, l, est_w);
5922+ OPT_DBG_T(t, " linear_metric: est_w.val=%ld\n", est_w.val);
5923+
5924+
5925+ v1 = t->rt_param.service_level[0].value;
5926+ vmax = t->rt_param.service_level[l - 1].value;
5927+
5928+ OPT_DBG_T(t, " linear_metric: v1=" _FP_ " vmax=" _FP_ "\n", v1, vmax);
5929+ OPT_DBG_T(t, " linear_metric: v1=%ld vmax=%ld\n", v1.val, vmax.val);
5930+
5931+
5932+ g1 = weight_transfer(t, lcur, 0, est_w);
5933+ gmax = weight_transfer(t, lcur, l - 1, est_w);
5934+
5935+ OPT_DBG_T(t, " linear_metric: g1=" _FP_ " gmax=" _FP_ "\n", g1, gmax);
5936+	OPT_DBG_T(t, " linear_metric: g1=%ld gmax=%ld\n", g1.val, gmax.val);
5937+
5938+
5939+ TRACE_BUG_ON(_eq(_sub(gmax, g1), FP(0)));
5940+ if (_eq(_sub(gmax, g1), FP(0)))
5941+ return FP(0);
5942+ return _div(_sub(vmax, v1),
5943+ _sub(gmax, g1));
5944+}
5945+
5946+static noinline unsigned long reweighted_period(fp_t ow, fp_t nw,
5947+ unsigned long alloc,
5948+ jiffie_t deadline,
5949+ jiffie_t release)
5950+{
5951+ fp_t dl;
5952+ dl = _mul(FP(deadline - release), ow);
5953+ dl = _sub(dl, FP(alloc));
5954+ if(_eq(nw, FP(0)))
5955+ return 0;
5956+ dl = _div(dl, nw);
5957+ return _round(dl);
5958+}
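/* Illustrative sketch in plain C, not part of the patch: the reweighted
 * period above with doubles.  The allocation still owed under the old
 * weight, (deadline - release) * ow - alloc, is stretched or compressed
 * by the new weight nw.
 */
#include <stdio.h>

static double reweighted(double ow, double nw, long alloc,
			 long deadline, long release)
{
	if (nw == 0.0)
		return 0.0;
	return ((deadline - release) * ow - alloc) / nw;
}

int main(void)
{
	/* old weight 0.5 over a 100-jiffy window with 20 jiffies already
	 * consumed leaves 30 units owed; at new weight 0.25 that takes
	 * 120 jiffies
	 */
	printf("%.0f\n", reweighted(0.5, 0.25, 20, 100, 0));
	return 0;
}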
5959+
5960+static noinline int is_under_allocated(struct task_struct* t)
5961+{
5962+ return ideal_exec_time(t) >= t->rt_param.times.exec_time;
5963+}
5964+
5965+static noinline jiffie_t dec_equal_point_delay(struct task_struct* t)
5966+{
5967+ if (_lt(FP(0), get_est_weight(t)))
5968+ /* when t was released plus time needed to equalize
5969+ * minus now
5970+ */
5971+ return get_last_release(t) +
5972+ _round(_div( FP(t->rt_param.times.exec_time),
5973+ get_est_weight(t))) -
5974+ opt_time;
5975+ else
5976+ /* if the weight is zero we just take the
5977+ * deadline
5978+ */
5979+ return t->rt_param.times.deadline;
5980+}
5981+
5982+static noinline jiffie_t inc_equal_point_delay(struct task_struct* t)
5983+{
5984+ if (_lt(FP(0), t->rt_param.opt_nw))
5985+ /* when t was released plus time needed to equalize
5986+ * minus now
5987+ */
5988+ return get_last_release(t) +
5989+ _round(_div( FP(t->rt_param.times.exec_time),
5990+ t->rt_param.opt_nw)) -
5991+ opt_time;
5992+ else
5993+ /* if the weight is zero we just take the
5994+ * deadline
5995+ */
5996+ return t->rt_param.times.deadline;
5997+}
5998+
5999+static noinline jiffie_t decrease_delay(struct task_struct* t)
6000+{
6001+ if (has_active_job(t) && !is_under_allocated(t))
6002+ return dec_equal_point_delay(t);
6003+ return 0;
6004+}
6005+
6006+
6007+
6008+/******************************************************************************/
6009+/* SORT ORDERS */
6010+/******************************************************************************/
6011+
6012+static int by_linear_metric(struct list_head* a, struct list_head* b)
6013+{
6014+ struct task_struct *ta, *tb;
6015+ ta = list_entry(a, struct task_struct, rt_param.opt_list);
6016+ tb = list_entry(b, struct task_struct, rt_param.opt_list);
6017+ return _gt(ta->rt_param.opt_order, tb->rt_param.opt_order);
6018+}
6019+
6020+static int by_delta_weight(struct list_head* a, struct list_head* b)
6021+{
6022+ struct task_struct *ta, *tb;
6023+ ta = list_entry(a, struct task_struct, rt_param.opt_list);
6024+ tb = list_entry(b, struct task_struct, rt_param.opt_list);
6025+ return _lt(ta->rt_param.opt_dw, tb->rt_param.opt_dw);
6026+}
6027+
6028+static int by_enactment_time(struct list_head* a, struct list_head* b)
6029+{
6030+ struct task_struct *ta, *tb;
6031+ ta = list_entry(a, struct task_struct, rt_param.opt_list);
6032+ tb = list_entry(b, struct task_struct, rt_param.opt_list);
6033+ return ta->rt_param.opt_change < tb->rt_param.opt_change;
6034+}
6035+
6036+/******************************************************************************/
6037+/* WEIGHT CHANGE MECHANICS */
6038+/******************************************************************************/
6039+
6040+static void set_service_level(struct task_struct* t, unsigned int level)
6041+{
6042+ service_level_t *new;
6043+ unsigned int old;
6044+ BUG_ON(!t);
6045+ BUG_ON(t->rt_param.no_service_levels <= level);
6046+
6047+ old = t->rt_param.cur_service_level;
6048+ t->rt_param.cur_service_level = level;
6049+ new = t->rt_param.service_level + level;
6050+ t->rt_param.basic_params.period = new->period;
6051+ t->rt_param.basic_params.exec_cost = _round(_mul(new->weight,
6052+ FP(new->period)));
6053+
6054+ scheduler_signal(t, SIGUSR1);
6055+
6056+ sched_trace_service_level_change(t, old, level);
6057+ OPT_DBG_T(t, "service level %u activated\n", level);
6058+}
6059+
6060+/* call this _before_ updating deadline and release of t */
6061+static void update_weight_estimate(struct task_struct* t)
6062+{
6063+ fp_t nw, ow;
6064+ jiffie_t sl_period, exec_time;
6065+
6066+ ow = get_est_weight(t);
6067+ nw = t->rt_param.opt_nw;
6068+ exec_time = t->rt_param.times.exec_time;
6069+ sl_period = get_sl(t, get_opt_sl(t)).period;
6070+
6071+ OPT_DBG("ow=" _FP_ " nw=" _FP_ ", r-d " _FP_
6072+ ", deadline %d, release %d, exec_time=%ld sl_period=%lu\n",
6073+ fp2str(ow), fp2str(nw),
6074+ fp2str(FP(get_deadline(t) - get_last_release(t))),
6075+ get_deadline(t), get_last_release(t), exec_time, sl_period);
6076+
6077+ total_weight = _sub(total_weight, get_est_weight(t));
6078+ t->rt_param.predictor_state.estimate = nw;
6079+ OPT_DBG_T(t, "update_weight_estimate from " _FP_ " to "_FP_"\n",
6080+ fp2str(ow), fp2str(nw));
6081+ total_weight = _add(total_weight, get_est_weight(t));
6082+
6083+ OPT_DBG_T(t, " update_weight_estimate: " _FP_ " => " _FP_ "\n",
6084+ fp2str(ow), fp2str(get_est_weight(t)));
6085+}
6086+
6087+
6088+static void decrease_weight(struct task_struct* t)
6089+{
6090+ fp_t ow, nw;
6091+ jiffie_t last, period, delay;
6092+
6093+ ow = get_sl(t, get_cur_sl(t)).weight;
6094+ nw = get_sl(t, get_opt_sl(t)).weight;
6095+ last = t->rt_param.times.last_release;
6096+ period = reweighted_period(ow, nw, t->rt_param.times.exec_time,
6097+ t->rt_param.times.deadline, last);
6098+
6099+ /* necessary delay has already been computed by optimizer */
6100+ delay = t->rt_param.opt_change;
6101+
6102+ update_weight_estimate(t);
6103+
6104+ if (!delay)
6105+ t->rt_param.times.last_release = opt_time;
6106+ t->rt_param.times.release = opt_time + delay;
6107+ t->rt_param.times.deadline = opt_time + delay + period;
6108+
6109+ set_service_level(t, get_opt_sl(t));
6110+
6111+ /* take out of queue/link structure */
6112+ unlink(t);
6113+ /* present as a new job */
6114+ adaptive_job_arrival(t);
6115+}
6116+
6117+
6118+static void increase_weight(struct task_struct* t)
6119+{
6120+ fp_t ow, nw;
6121+ jiffie_t last, period, delay;
6122+
6123+ ow = get_sl(t, get_cur_sl(t)).weight;
6124+ nw = get_sl(t, get_opt_sl(t)).weight;
6125+ last = t->rt_param.times.last_release;
6126+ period = reweighted_period(ow, nw, t->rt_param.times.exec_time,
6127+ t->rt_param.times.deadline, last);
6128+
6129+ if (t->rt_param.opt_change == 0) {
6130+ /* can be enacted now */
6131+ if (is_under_allocated(t) ||
6132+ time_before(opt_time + period, get_deadline(t)))
6133+ /* do it now */
6134+ delay = 0;
6135+ else {
6136+ if (is_under_allocated(t)) {
6137+ t->rt_param.opt_change += opt_time;
6138+ /* The next job release will notice that opt !=
6139+ * sl and initiate a weight change.
6140+ */
6141+ return;
6142+ } else
6143+ /* nope, wait for equal point */
6144+ delay = inc_equal_point_delay(t);
6145+ }
6146+
6147+ update_weight_estimate(t);
6148+
6149+ if (!delay)
6150+ t->rt_param.times.last_release = opt_time;
6151+ t->rt_param.times.release = opt_time + delay;
6152+ t->rt_param.times.deadline = opt_time + delay + period;
6153+
6154+ set_service_level(t, get_opt_sl(t));
6155+
6156+ /* take out of queue/link structure */
6157+ unlink(t);
6158+ /* present as a new job */
6159+ adaptive_job_arrival(t);
6160+
6161+ } else {
6162+ /* must wait until capacity is released */
6163+ t->rt_param.opt_change += opt_time;
6164+ list_insert(&t->rt_param.opt_list, &adaptive_inc_list,
6165+ by_enactment_time);
6166+ }
6167+}
6168+
6169+static void delayed_increase_weight(void)
6170+{
6171+ struct list_head *p, *extra;
6172+ struct task_struct* t;
6173+
6174+ opt_time = jiffies;
6175+ list_for_each_safe(p, extra, &adaptive_inc_list) {
6176+ t = list_entry(p, struct task_struct, rt_param.opt_list);
6177+ if (time_before_eq(t->rt_param.opt_change, opt_time)) {
6178+ list_del(p);
6179+ /* prevent recursion */
6180+ t->rt_param.opt_change = 0;
6181+ /* this takes care of everything */
6182+ increase_weight(t);
6183+ } else
6184+ /* list is sorted */
6185+ break;
6186+ }
6187+}
6188+
6189+static void change_weight(struct task_struct* t)
6190+{
6191+ if (get_cur_sl(t) < get_opt_sl(t))
6192+ increase_weight(t);
6193+ else
6194+ decrease_weight(t);
6195+ OPT_DBG_T(t, "after change_weight: last_rel:%d rel:%d dl:%d\n",
6196+ get_last_release(t),
6197+ get_release(t),
6198+ get_deadline(t));
6199+}
6200+
6201+/******************************************************************************/
6202+/* OPTIMIZER */
6203+/******************************************************************************/
6204+
6205+/* only invoke with adaptive_lock held */
6206+void adaptive_optimize(void)
6207+{
6208+ struct list_head list;
6209+ struct list_head inc, dec;
6210+ struct list_head *p, *extra;
6211+ cpu_entry_t *cpu;
6212+ struct task_struct* t;
6213+ fp_t M = FP(0), w0, wl, tmp, estU = FP(0);
6214+ unsigned int l;
6215+ jiffie_t enactment_time;
6216+
6217+ if (time_before(jiffies,
6218+ last_optimizer_run + optimizer_min_invocation_sep))
6219+ return;
6220+
6221+ OPT_DBG(":::::: running adaptive optimizer\n");
6222+ opt_time = jiffies;
6223+
6224+ INIT_LIST_HEAD(&list);
6225+
6226+ /* 1) gather all tasks */
6227+ list_for_each(p, &adaptive.ready_queue)
6228+ list_add(&(rt_list2task(p)->rt_param.opt_list), &list);
6229+ list_for_each(p, &adaptive.release_queue)
6230+ list_add(&(rt_list2task(p)->rt_param.opt_list), &list);
6231+ list_for_each(p, &adaptive_cpu_queue) {
6232+ cpu = list_entry(p, cpu_entry_t, list);
6233+ if (cpu->linked)
6234+ list_add(&cpu->linked->rt_param.opt_list, &list);
6235+ }
6236+
6237+ /* 2) determine current system capacity */
6238+ M = system_capacity;
6239+ OPT_DBG("opt: system capacity: " _FP_ "\n", fp2str(M));
6240+
6241+ /* 3) Compute L value for all tasks,
6242+ * and set tasks to service level 0,
6243+ * also account for weight.
6244+ * Also establish current estimated utilization
6245+ */
6246+ list_for_each_safe(p, extra, &list) {
6247+ t = list_entry(p, struct task_struct, rt_param.opt_list);
6248+ if (time_before(opt_time, get_last_release(t))) {
6249+ list_del(p);
6250+ continue;
6251+ }
6252+ t->rt_param.opt_order = linear_metric(t);
6253+ OPT_DBG_T(t, "est_w = " _FP_ " L = " _FP_ "\n",
6254+ get_est_weight(t),
6255+ fp2str(t->rt_param.opt_order));
6256+ t->rt_param.opt_level = 0;
6257+ M = _sub(M, est_weight_at(t, 0));
6258+ estU = _add(estU, get_est_weight(t));
6259+ }
6260+ OPT_DBG("opt: estimated utilization: " _FP_ "\n", fp2str(estU));
6261+ OPT_DBG("opt: estimated capacity at all sl=0: " _FP_ "\n", fp2str(M));
6262+
6263+
6264+ /* 4) sort list by decreasing linear metric */
6265+ list_qsort(&list, by_linear_metric);
6266+
6267+ /* 5) assign each task a service level */
6268+ list_for_each(p, &list) {
6269+ t = list_entry(p, struct task_struct, rt_param.opt_list);
6270+ l = t->rt_param.no_service_levels;
6271+ w0 = est_weight_at(t, 0);
6272+ while (l > 1) {
6273+ l--;
6274+ wl = est_weight_at(t, l);
6275+ tmp = _sub(M, _sub(wl, w0));
6276+ if (_leq(FP(0), tmp)) {
6277+ /* this level fits in */
6278+ M = tmp;
6279+ t->rt_param.opt_level = l;
6280+ t->rt_param.opt_dw = _sub(wl,
6281+ get_est_weight(t));
6282+ t->rt_param.opt_nw = wl;
6283+ break; /* proceed to next task */
6284+ }
6285+ }
6286+ OPT_DBG_T(t, " will run at sl=%u, prior=%u dw=" _FP_ "\n",
6287+ l, get_cur_sl(t), fp2str(t->rt_param.opt_dw));
6288+
6289+ }
6290+
6291+ /* 6) filter tasks that reweight */
6292+ INIT_LIST_HEAD(&inc);
6293+ INIT_LIST_HEAD(&dec);
6294+ list_for_each_safe(p, extra, &list) {
6295+ t = list_entry(p, struct task_struct, rt_param.opt_list);
6296+ list_del(p);
6297+ if (t->rt_param.opt_level < get_cur_sl(t)) {
6298+ list_add(p, &dec);
6299+ t->rt_param.opt_change = decrease_delay(t);
6300+ } else if (t->rt_param.opt_level > get_cur_sl(t)) {
6301+ list_add(p, &inc);
6302+ t->rt_param.opt_change = 0;
6303+ }
6304+ /* if t doesn't change we can ignore it from now on */
6305+ }
6306+
6307+ /* 7) sort dec and inc list */
6308+ list_qsort(&dec, by_enactment_time);
6309+ list_qsort(&inc, by_delta_weight);
6310+
6311+ /* 8) now figure out when we can enact weight increases
6312+ * It works like this: We know the current system utilization.
6313+ * Thus, we know the remaining capacity. We also know when
6314+ * decreases are going to be enacted (=> capacity increases).
6315+ * Now we only need to find a spot where the weight increase will
6316+ * not drive the system into overload.
6317+ */
6318+
6319+ /* Very ugly jump, but we need to force enactment_time = 0
6320+ * during the first iteration.
6321+ */
6322+ M = system_capacity;
6323+ enactment_time = 0;
6324+ goto first_iteration;
6325+
6326+ while (!list_empty(&inc)) {
6327+ enactment_time = list_entry(dec.next, struct task_struct,
6328+ rt_param.opt_list)
6329+ ->rt_param.opt_change;
6330+ first_iteration:
6331+ /* Start by collapsing the next decrease.
6332+ * Except for in the first iteration, it will always
6333+ * pick off at least one task.
6334+ */
6335+ list_for_each_safe(p, extra, &dec) {
6336+ t = list_entry(p, struct task_struct,
6337+ rt_param.opt_list);
6338+ if (t->rt_param.opt_change == enactment_time) {
6339+ list_del(p);
6340+ /* opt_dw is negative */
6341+ estU = _add(estU, t->rt_param.opt_dw);
6342+ list_add(p, &list);
6343+
6344+ OPT_DBG_T(t, " weight decrease at %ld => estU="
6345+ _FP_ "\n", enactment_time,
6346+ fp2str(estU));
6347+
6348+ } else
6349+ /* stop decrease loop */
6350+ break;
6351+ }
6352+
6353+ /* now start setting enactment times for increases */
6354+ while (!list_empty(&inc)) {
6355+ p = inc.next;
6356+ t = list_entry(p, struct task_struct,
6357+ rt_param.opt_list);
6358+ tmp = _add(estU, t->rt_param.opt_dw);
6359+ if (_leq(tmp, M)) {
6360+ /* it fits */
6361+ estU = tmp;
6362+ t->rt_param.opt_change = enactment_time;
6363+ list_del(p);
6364+ list_add(p, &list);
6365+
6366+ OPT_DBG_T(t, " weight increase at %ld => estU="
6367+ _FP_ "\n", enactment_time,
6368+ fp2str(estU));
6369+
6370+ } else
6371+ /* stop increase loop */
6372+ break;
6373+ }
6374+
6375+ TRACE_BUG_ON(list_empty(&dec) && !list_empty(&inc));
6376+ if (list_empty(&dec) && !list_empty(&inc))
6377+ /* break out in case of bug */
6378+ break;
6379+ }
6380+
6381+	/* 9) Wow. We made it. Every task now has a new service level
6382+	 *    assigned, together with a correct (earliest) enactment time.
6383+	 *    All we have left to do now is to enact changes that did not get
6384+	 *    delayed. Also convert change fields to actual timestamps to be
6385+	 *    nice to scheduler_tick().
6386+	 */
6387+ INIT_LIST_HEAD(&adaptive_inc_list);
6388+ list_for_each_safe(p, extra, &list) {
6389+ t = list_entry(p, struct task_struct, rt_param.opt_list);
6390+ list_del(p);
6391+ change_weight(t);
6392+ }
6393+
6394+ last_optimizer_run = jiffies;
6395+ OPT_DBG(":::::: optimizer run complete\n");
6396+}
6397+
6398+/* update_cpu_position - Move the cpu entry to the correct place to maintain
6399+ * order in the cpu queue. Caller must hold adaptive lock.
6400+ */
6401+static void update_cpu_position(cpu_entry_t *entry)
6402+{
6403+ cpu_entry_t *other;
6404+ struct list_head *pos;
6405+ list_del(&entry->list);
6406+ /* if we do not execute real-time jobs we just move
6407+ * to the end of the queue
6408+ */
6409+ if (entry->linked) {
6410+ list_for_each(pos, &adaptive_cpu_queue) {
6411+ other = list_entry(pos, cpu_entry_t, list);
6412+ if (edf_higher_prio(entry->linked, other->linked)) {
6413+ __list_add(&entry->list, pos->prev, pos);
6414+ return;
6415+ }
6416+ }
6417+ }
6418+ /* if we get this far we have the lowest priority job */
6419+ list_add_tail(&entry->list, &adaptive_cpu_queue);
6420+}
6421+
6422+/* link_task_to_cpu - Update the link of a CPU.
6423+ * Handles the case where the to-be-linked task is already
6424+ * scheduled on a different CPU.
6425+ */
6426+static noinline void link_task_to_cpu(struct task_struct* linked,
6427+ cpu_entry_t *entry)
6428+
6429+{
6430+ cpu_entry_t *sched;
6431+ struct task_struct* tmp;
6432+ int on_cpu;
6433+
6434+ BUG_ON(linked && !is_realtime(linked));
6435+
6436+ /* Currently linked task is set to be unlinked. */
6437+ if (entry->linked)
6438+ entry->linked->rt_param.linked_on = NO_CPU;
6439+
6440+ /* Link new task to CPU. */
6441+ if (linked) {
6442+ set_rt_flags(linked, RT_F_RUNNING);
6443+ /* handle task is already scheduled somewhere! */
6444+ on_cpu = linked->rt_param.scheduled_on;
6445+ if (on_cpu != NO_CPU) {
6446+ sched = &per_cpu(adaptive_cpu_entries, on_cpu);
6447+ /* this should only happen if not linked already */
6448+ BUG_ON(sched->linked == linked);
6449+
6450+ /* If we are already scheduled on the CPU to which we
6451+ * wanted to link, we don't need to do the swap --
6452+ * we just link ourselves to the CPU and depend on
6453+ * the caller to get things right.
6454+ */
6455+ if (entry != sched) {
6456+ tmp = sched->linked;
6457+ linked->rt_param.linked_on = sched->cpu;
6458+ sched->linked = linked;
6459+ update_cpu_position(sched);
6460+ linked = tmp;
6461+ }
6462+ }
6463+ if (linked) /* might be NULL due to swap */
6464+ linked->rt_param.linked_on = entry->cpu;
6465+ }
6466+ entry->linked = linked;
6467+ update_cpu_position(entry);
6468+}
6469+
6470+/* unlink - Make sure a task is not linked any longer to an entry
6471+ * where it was linked before. Must hold adaptive_lock.
6472+ */
6473+static void unlink(struct task_struct* t)
6474+{
6475+ cpu_entry_t *entry;
6476+
6477+ if (unlikely(!t)) {
6478+ TRACE_BUG_ON(!t);
6479+ return;
6480+ }
6481+
6482+ if (t->rt_param.linked_on != NO_CPU) {
6483+ /* unlink */
6484+ entry = &per_cpu(adaptive_cpu_entries, t->rt_param.linked_on);
6485+ t->rt_param.linked_on = NO_CPU;
6486+ link_task_to_cpu(NULL, entry);
6487+ } else if (in_list(&t->rt_list)) {
6488+ /* This is an interesting situation: t is scheduled,
6489+ * but was just recently unlinked. It cannot be
6490+ * linked anywhere else (because then it would have
6491+ * been relinked to this CPU), thus it must be in some
6492+ * queue. We must remove it from the list in this
6493+ * case.
6494+ */
6495+ list_del(&t->rt_list);
6496+ }
6497+}
6498+
6499+
6500+/* preempt - force a CPU to reschedule
6501+ */
6502+static noinline void preempt(cpu_entry_t *entry)
6503+{
6504+ /* We cannot make the is_np() decision here if it is a remote CPU
6505+ * because requesting exit_np() requires that we currently use the
6506+ * address space of the task. Thus, in the remote case we just send
6507+ * the IPI and let schedule() handle the problem.
6508+ */
6509+
6510+ if (smp_processor_id() == entry->cpu) {
6511+ if (entry->scheduled && is_np(entry->scheduled))
6512+ request_exit_np(entry->scheduled);
6513+ else
6514+ set_tsk_need_resched(current);
6515+ } else
6516+		/* in case it is a remote CPU we have to defer
6517+		 * the decision to the remote CPU
6518+ */
6519+ if (!test_will_schedule(entry->cpu))
6520+ smp_send_reschedule(entry->cpu);
6521+}
6522+
6523+/* requeue - Put an unlinked task into the adaptive (GSN-EDF-based) domain.
6524+ * Caller must hold adaptive_lock.
6525+ */
6526+static noinline void requeue(struct task_struct* task)
6527+{
6528+ BUG_ON(!task);
6529+ /* sanity check rt_list before insertion */
6530+ BUG_ON(in_list(&task->rt_list));
6531+
6532+ if (get_rt_flags(task) == RT_F_SLEEP ||
6533+ get_rt_mode() != MODE_RT_RUN) {
6534+		/* this task has expired
6535+		 * _schedule has already taken care of updating
6536+		 * the release and deadline. We just need to
6537+		 * check if it has been released.
6538+		 */
6539+ if (is_released(task) && get_rt_mode() == MODE_RT_RUN)
6540+ __add_ready(&adaptive, task);
6541+ else {
6542+ /* it has got to wait */
6543+ __add_release(&adaptive, task);
6544+ }
6545+
6546+ } else
6547+	} else
6548+		/* this is a forced preemption
6549+		 * thus the task stays in the ready_queue
6550+		 * we only need to make it available to others
6551+ __add_ready(&adaptive, task);
6552+}
6553+
6554+/* adaptive_job_arrival: task is either resumed or released */
6555+static void adaptive_job_arrival(struct task_struct* task)
6556+{
6557+ cpu_entry_t* last;
6558+
6559+ BUG_ON(list_empty(&adaptive_cpu_queue));
6560+ BUG_ON(!task);
6561+
6562+ TRACE_TASK(task, "job_arrival: last_rel=%d rel=%d dl=%d now=%d\n",
6563+ get_last_release(task), get_release(task),
6564+ get_deadline(task),
6565+ jiffies);
6566+
6567+
6568+ /* first queue arriving job */
6569+ requeue(task);
6570+
6571+ /* then check for any necessary preemptions */
6572+ last = list_entry(adaptive_cpu_queue.prev, cpu_entry_t, list);
6573+ if (edf_preemption_needed(&adaptive, last->linked)) {
6574+ /* preemption necessary */
6575+ task = __take_ready(&adaptive);
6576+
6577+ TRACE("job_arrival: task %d linked to %d\n",
6578+ task->pid, last->cpu);
6579+
6580+ if (last->linked)
6581+ requeue(last->linked);
6582+
6583+ link_task_to_cpu(task, last);
6584+ preempt(last);
6585+ }
6586+}
6587+
6588+/* check for current job releases */
6589+static noinline void adaptive_release_jobs(void)
6590+{
6591+ struct list_head *pos, *save;
6592+ struct task_struct *queued;
6593+
6594+ list_for_each_safe(pos, save, &adaptive.release_queue) {
6595+ queued = list_entry(pos, struct task_struct, rt_list);
6596+ if (likely(is_released(queued))) {
6597+ TRACE_TASK(queued, "released rel=%d now=%d\n",
6598+ get_release(queued), jiffies);
6599+ /* this one is ready to go*/
6600+ list_del(pos);
6601+ set_rt_flags(queued, RT_F_RUNNING);
6602+ queued->rt_param.times.last_release =
6603+ queued->rt_param.times.release;
6604+
6605+ /* check for delayed weight increase */
6606+ if (get_opt_sl(queued) != get_cur_sl(queued) &&
6607+ time_before_eq(queued->rt_param.opt_change, jiffies)) {
6608+ opt_time = jiffies;
6609+ set_service_level(queued, get_opt_sl(queued));
6610+ queued->rt_param.times.deadline =
6611+ get_last_release(queued) +
6612+ get_rt_period(queued);
6613+ total_weight = _sub(total_weight, get_est_weight(queued));
6614+ queued->rt_param.predictor_state.estimate =
6615+ queued->rt_param.opt_nw;
6616+ total_weight = _add(total_weight, get_est_weight(queued));
6617+ }
6618+
6619+ sched_trace_job_release(queued);
6620+ adaptive_job_arrival(queued);
6621+ }
6622+ else
6623+ /* the release queue is ordered */
6624+ break;
6625+ }
6626+}
6627+
6628+/* adaptive_scheduler_tick - this function is called for every local timer
6629+ * interrupt.
6630+ *
6631+ * checks whether the current task has expired and checks
6632+ * whether we need to preempt it if it has not expired
6633+ */
6634+static reschedule_check_t adaptive_scheduler_tick(void)
6635+{
6636+ unsigned long flags;
6637+ struct task_struct* t = current;
6638+ reschedule_check_t want_resched = NO_RESCHED;
6639+
6640+ /* Account for exec time.
6641+ * Since we don't preempt forcefully, nothing else needs to be done.
6642+ */
6643+ if (is_realtime(t))
6644+ t->rt_param.times.exec_time++;
6645+
6646+	/* job releases and optimizer runs only happen in real-time mode */
6647+ if (get_rt_mode() == MODE_RT_RUN) {
6648+ queue_lock_irqsave(&adaptive_lock, flags);
6649+
6650+ /* (1) run the optimizer if it did not trigger often enough */
6651+ if (time_before_eq(last_optimizer_run + optimizer_period, jiffies)) {
6652+
6653+ OPT_DBG("adaptive: optimizing due to period threshold\n");
6654+
6655+ adaptive_optimize();
6656+ }
6657+
6658+ /* (2) enact delayed weight increases */
6659+ delayed_increase_weight();
6660+
6661+ /* (3) try to release pending jobs */
6662+ adaptive_release_jobs();
6663+
6664+		/* we don't need to check linked != scheduled since
6665+		 * preempt() has already called set_tsk_need_resched() if necessary
6666+ */
6667+
6668+ queue_unlock_irqrestore(&adaptive_lock, flags);
6669+ }
6670+
6671+ return want_resched;
6672+}
6673+
6674+/* caller holds adaptive_lock */
6675+static noinline void job_completion(struct task_struct *t)
6676+{
6677+ long delta;
6678+ fp_t actual_weight, old_estimate;
6679+ unsigned int lcurr = get_cur_sl(t);
6680+ fp_t v = t->rt_param.service_level[lcurr].value;
6681+
6682+ int non_zero_weight;
6683+ fp_t error_percentage;
6684+ int exceeds_threshold;
6685+
6686+ BUG_ON(!t);
6687+
6688+ TRACE_TASK(t, " completion, last_rel=%d rel=%d dl=%d now=%d "
6689+ "period=%d\n",
6690+ get_last_release(t), get_release(t), get_deadline(t),
6691+ jiffies, get_rt_period(t));
6692+
6693+ sched_trace_job_completion(t);
6694+ delta = t->rt_param.times.exec_time -
6695+ t->rt_param.basic_params.exec_cost;
6696+
6697+ OPT_DBG_T(t, "job %d completes, delta WCET = %d\n",
6698+ t->rt_param.times.job_no, delta);
6699+
6700+ actual_weight = _frac(t->rt_param.times.exec_time,
6701+ t->rt_param.basic_params.period);
6702+ sched_trace_weight_error(t, actual_weight);
6703+ old_estimate = get_est_weight(t);
6704+ update_estimate(&t->rt_param.predictor_state, actual_weight,
6705+ fc_a, fc_b);
6706+
6707+ OPT_DBG_T(t, "Job %d completes. Current value " _FP_
6708+ ", Weight estimation: error=" _FP_ " weight="
6709+ _FP_ " => " _FP_ "\n",t->rt_param.times.job_no, v,
6710+ _sub(get_est_weight(t), old_estimate),
6711+ old_estimate, get_est_weight(t));
6712+
6713+ /* Now we have determined the task error.
6714+ * Next we release the next job.
6715+ * Then we optimize. It's easier for the optimizer to deal
6716+ * with just-released jobs.
6717+ */
6718+
6719+ /* prepare for next period */
6720+ edf_prepare_for_next_period(t);
6721+
6722+ TRACE_TASK(t, " prepped, last_rel=%d rel=%d dl=%d now=%d\n",
6723+ get_last_release(t), get_release(t), get_deadline(t),
6724+ jiffies);
6725+
6726+ if (is_released(t)) {
6727+ /* set flags */
6728+ /* prevent fake completions */
6729+ set_rt_flags(t, RT_F_RUNNING);
6730+ t->rt_param.times.last_release =
6731+ t->rt_param.times.release;
6732+ }
6733+
6734+
6735+ non_zero_weight = !_eq(get_est_weight(t),FP(0));
6736+ if (non_zero_weight)
6737+ error_percentage = _div(_abs(_sub(get_est_weight(t),
6738+ old_estimate)),
6739+ get_est_weight(t));
6740+ else
6741+ error_percentage = FP(0);
6742+ exceeds_threshold = _gt(error_percentage, task_error_threshold);
6743+
6744+
6745+ if (exceeds_threshold) {
6746+ OPT_DBG("adaptive: optimizing due to task error threshold\n");
6747+ adaptive_optimize();
6748+ } else if (_gt(total_weight, system_capacity)) {
6749+ OPT_DBG("adaptive: optimizing due to system capacity exceeded\n");
6750+ adaptive_optimize();
6751+ }
6752+
6753+
6754+ /* unlink */
6755+ unlink(t);
6756+ /* requeue
6757+ * But don't requeue a blocking task. */
6758+ if (is_running(t))
6759+ adaptive_job_arrival(t);
6760+}
6761+
6762+
6763+/* Getting schedule() right is a bit tricky. schedule() may not make any
6764+ * assumptions on the state of the current task since it may be called for a
6765+ * number of reasons. The reasons include a scheduler_tick() determined that it
6766+ * was necessary, because sys_exit_np() was called, because some Linux
6767+ * subsystem determined so, or even (in the worst case) because there is a bug
6768+ * hidden somewhere. Thus, we must take extreme care to determine what the
6769+ * current state is.
6770+ *
6771+ * The CPU could currently be scheduling a task (or not), be linked (or not).
6772+ *
6773+ * The following assertions for the scheduled task could hold:
6774+ *
6775+ * - !is_running(scheduled) // the job blocks
6776+ * - get_rt_flag() == RT_F_SLEEP // the job completed (by syscall)
6777+ * - linked != scheduled // we need to reschedule (for any reason)
6778+ *
6779+ * Any of these can occur together.
6780+ */
6781+static int adaptive_schedule(struct task_struct * prev,
6782+ struct task_struct ** next,
6783+ runqueue_t * rq)
6784+{
6785+ cpu_entry_t* entry = &__get_cpu_var(adaptive_cpu_entries);
6786+ int sleep, preempt, exists,
6787+ rt, blocks;
6788+ struct task_struct* linked;
6789+
6790+ /* Will be released in finish_switch. */
6791+ queue_lock(&adaptive_lock);
6792+ clear_will_schedule();
6793+
6794+ /* sanity checking */
6795+ BUG_ON(entry->scheduled && entry->scheduled != prev);
6796+ BUG_ON(entry->scheduled && !is_realtime(prev));
6797+
6798+ /* (0) Determine state */
6799+ exists = entry->scheduled != NULL;
6800+ blocks = exists && !is_running(entry->scheduled);
6801+ sleep = exists && get_rt_flags(entry->scheduled) == RT_F_SLEEP;
6802+ preempt = entry->scheduled != entry->linked;
6803+ rt = get_rt_mode() == MODE_RT_RUN;
6804+
6805+ /* If a task blocks we have no choice but to reschedule.
6806+ */
6807+ if (blocks)
6808+ unlink(entry->scheduled);
6809+
6810+ /* Task wants to sleep -> job is done.
6811+ */
6812+ if (sleep)
6813+ job_completion(entry->scheduled);
6814+
6815+ /* Stop real-time tasks when we leave real-time mode
6816+ */
6817+ if (!rt && entry->linked) {
6818+ /* task will be preempted once it is preemptable
6819+ * (which it may be already)
6820+ */
6821+ linked = entry->linked;
6822+ unlink(linked);
6823+ requeue(linked);
6824+ }
6825+
6826+ /* Link pending task if we became unlinked.
6827+ */
6828+ if (rt && !entry->linked)
6829+ link_task_to_cpu(__take_ready(&adaptive), entry);
6830+
6831+ /* The final scheduling decision. Do we need to switch for some reason?
6832+	 * If linked is different from scheduled, select linked as next.
6833+ */
6834+ if (entry->linked != entry->scheduled) {
6835+ /* Take care of a previously scheduled
6836+ * job by taking it out of the Linux runqueue.
6837+ */
6838+ if (entry->scheduled)
6839+ if (prev->array)
6840+ /* take it out of the run queue */
6841+ deactivate_task(prev, rq);
6842+
6843+ /* Schedule a linked job? */
6844+ if (entry->linked) {
6845+ *next = entry->linked;
6846+ /* mark the task as executing on this cpu */
6847+ set_task_cpu(*next, smp_processor_id());
6848+ /* stick the task into the runqueue */
6849+ __activate_task(*next, rq);
6850+ }
6851+ } else
6852+ /* Only override Linux scheduler if we have real-time task
6853+ * scheduled that needs to continue.
6854+ */
6855+ if (exists)
6856+ *next = prev;
6857+
6858+ /* Unlock in case that we don't affect real-time tasks or
6859+ * if nothing changed and finish_switch won't be called.
6860+ */
6861+ if (prev == *next || (!is_realtime(prev) && !*next))
6862+ queue_unlock(&adaptive_lock);
6863+
6864+ return 0;
6865+}
6866+
6867+
6868+/* _finish_switch - we just finished the switch away from prev
6869+ */
6870+static void adaptive_finish_switch(struct task_struct *prev)
6871+{
6872+ cpu_entry_t* entry = &__get_cpu_var(adaptive_cpu_entries);
6873+
6874+ if (is_realtime(current))
6875+ entry->scheduled = current;
6876+ else
6877+ entry->scheduled = NULL;
6878+
6879+ prev->rt_param.scheduled_on = NO_CPU;
6880+ current->rt_param.scheduled_on = smp_processor_id();
6881+
6882+ /* unlock in case schedule() left it locked */
6883+ if (is_realtime(current) || is_realtime(prev))
6884+ queue_unlock(&adaptive_lock);
6885+}
6886+
6887+
6888+/* Prepare a task for running in RT mode.
6889+ * Enqueues the task into the master queue data structure.
6890+ * Returns
6891+ * -EPERM if the task is not TASK_STOPPED.
6892+ */
6893+static long adaptive_prepare_task(struct task_struct * t)
6894+{
6895+ unsigned long flags;
6896+
6897+ TRACE("adaptive: prepare task %d\n", t->pid);
6898+
6899+ if (t->state == TASK_STOPPED) {
6900+ __setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1);
6901+
6902+ t->rt_param.scheduled_on = NO_CPU;
6903+ t->rt_param.linked_on = NO_CPU;
6904+ if (t->rt_param.no_service_levels) {
6905+ t->rt_param.predictor_state.estimate =
6906+ get_sl(t, 0).weight;
6907+ } else
6908+ t->rt_param.predictor_state.estimate =
6909+ _frac(get_exec_cost(t), get_rt_period(t));
6910+
6911+ TRACE_TASK(t, "est_weight=" _FP_ "\n", get_est_weight(t));
6912+
6913+ if (get_rt_mode() == MODE_RT_RUN)
6914+			/* Real-time mode is already active.
6915+			 * Prepare an immediate release.
6916+ */
6917+ edf_release_now(t);
6918+ /* The task should be running in the queue, otherwise signal
6919+ * code will try to wake it up with fatal consequences.
6920+ */
6921+ t->state = TASK_RUNNING;
6922+
6923+ queue_lock_irqsave(&adaptive_lock, flags);
6924+ total_weight = _add(total_weight, get_est_weight(t));
6925+ requeue(t);
6926+ queue_unlock_irqrestore(&adaptive_lock, flags);
6927+ return 0;
6928+ }
6929+ else
6930+ return -EPERM;
6931+}
6932+
6933+static void adaptive_wake_up_task(struct task_struct *task)
6934+{
6935+ unsigned long flags;
6936+ /* We must determine whether task should go into the release
6937+ * queue or into the ready queue. It may enter the ready queue
6938+ * if it has credit left in its time slice and has not yet reached
6939+	 * its deadline. If it is now past its deadline we assume this is the
6940+	 * arrival of a new sporadic job and thus put it in the ready queue
6941+	 * anyway. If it has zero budget and the next release is in the future
6942+ * it has to go to the release queue.
6943+ */
6944+
6945+ TRACE("adaptive: %d unsuspends\n", task->pid);
6946+
6947+ task->state = TASK_RUNNING;
6948+
6949+ if (is_tardy(task)) {
6950+ /* new sporadic release */
6951+ edf_release_now(task);
6952+ sched_trace_job_release(task);
6953+ }
6954+ else if (task->time_slice)
6955+ /* came back in time before deadline */
6956+ set_rt_flags(task, RT_F_RUNNING);
6957+
6958+ queue_lock_irqsave(&adaptive_lock, flags);
6959+ total_weight = _add(total_weight, get_est_weight(task));
6960+ adaptive_job_arrival(task);
6961+ queue_unlock_irqrestore(&adaptive_lock, flags);
6962+}
6963+
6964+static void adaptive_task_blocks(struct task_struct *t)
6965+{
6966+ unsigned long flags;
6967+
6968+ /* unlink if necessary */
6969+ queue_lock_irqsave(&adaptive_lock, flags);
6970+ total_weight = _sub(total_weight, get_est_weight(t));
6971+ unlink(t);
6972+ queue_unlock_irqrestore(&adaptive_lock, flags);
6973+
6974+ BUG_ON(!is_realtime(t));
6975+
6976+ TRACE("task %d suspends\n", t->pid);
6977+
6978+ BUG_ON(t->rt_list.next != LIST_POISON1);
6979+ BUG_ON(t->rt_list.prev != LIST_POISON2);
6980+}
6981+
6982+
6983+/* When _tear_down is called, the task should not be in any queue any more
6984+ * as it must have blocked first. We don't have any internal state for the task;
6985+ * it is all in the task_struct.
6986+ */
6987+static long adaptive_tear_down(struct task_struct * t)
6988+{
6989+ BUG_ON(!is_realtime(t));
6990+ TRACE_TASK(t, "RIP\n");
6991+ BUG_ON(t->array);
6992+ BUG_ON(t->rt_list.next != LIST_POISON1);
6993+ BUG_ON(t->rt_list.prev != LIST_POISON2);
6994+ return 0;
6995+}
6996+
6997+static int adaptive_mode_change(int new_mode)
6998+{
6999+ unsigned long flags;
7000+ int cpu;
7001+ cpu_entry_t *entry;
7002+ struct task_struct* t;
7003+ struct list_head* pos;
7004+
7005+ if (new_mode == MODE_RT_RUN) {
7006+ queue_lock_irqsave(&adaptive_lock, flags);
7007+
7008+ system_capacity = FP(0);
7009+ for_each_online_cpu(cpu)
7010+ system_capacity = _add(system_capacity, FP(1));
7011+
7012+ __rerelease_all(&adaptive, edf_release_at);
7013+
7014+ total_weight = FP(0);
7015+ list_for_each(pos, &adaptive.release_queue) {
7016+ t = list_entry(pos, struct task_struct, rt_list);
7017+ total_weight = _add(total_weight, get_est_weight(t));
7018+ }
7019+ TRACE("adaptive: total weight: " _FP_
7020+ " (at mode change)\n", total_weight);
7021+
7022+
7023+ /* get old cruft out of the way in case we reenter real-time
7024+ * mode for a second time
7025+ */
7026+ while (!list_empty(&adaptive_cpu_queue))
7027+ list_del(adaptive_cpu_queue.next);
7028+ /* reinitialize */
7029+ for_each_online_cpu(cpu) {
7030+ entry = &per_cpu(adaptive_cpu_entries, cpu);
7031+ atomic_set(&entry->will_schedule, 0);
7032+ entry->linked = NULL;
7033+ entry->scheduled = NULL;
7034+ list_add(&entry->list, &adaptive_cpu_queue);
7035+ }
7036+
7037+ adaptive_optimize();
7038+
7039+ queue_unlock_irqrestore(&adaptive_lock, flags);
7040+
7041+ }
7042+ return 0;
7043+}
7044+
7045+
7046+typedef enum {
7047+ ADAPTIVE_SET_MIN_OPT_SEP = 1
7048+} adaptive_cmds_t;
7049+
7050+
7051+static int adaptive_setup(int cmd, void __user *up)
7052+{
7053+ unsigned int error = -EINVAL;
7054+ unsigned int val;
7055+
7056+ if (copy_from_user(&val, up, sizeof(unsigned int))) {
7057+ error = -EFAULT;
7058+ goto out;
7059+ }
7060+
7061+ switch (cmd) {
7062+ case ADAPTIVE_SET_MIN_OPT_SEP:
7063+ optimizer_min_invocation_sep = val;
7064+ TRACE("adaptive: min opt sep set to %d\n",
7065+ optimizer_min_invocation_sep);
7066+ return 0;
7067+ break;
7068+ }
7069+
7070+out:
7071+ return error;
7072+}
7073+
7074+
7075+/* Plugin object */
7076+static sched_plugin_t s_plugin __cacheline_aligned_in_smp = {
7077+ .ready_to_use = 0
7078+};
7079+
7080+
7081+/*
7082+ * Plugin initialization code.
7083+ */
7084+#define INIT_SCHED_PLUGIN (struct sched_plugin){ \
7085+ .plugin_name = "ADAPTIVE", \
7086+ .ready_to_use = 1, \
7087+ .scheduler_tick = adaptive_scheduler_tick, \
7088+ .prepare_task = adaptive_prepare_task, \
7089+ .sleep_next_period = edf_sleep_next_period, \
7090+ .tear_down = adaptive_tear_down, \
7091+ .schedule = adaptive_schedule, \
7092+ .finish_switch = adaptive_finish_switch, \
7093+ .mode_change = adaptive_mode_change, \
7094+ .wake_up_task = adaptive_wake_up_task, \
7095+ .task_blocks = adaptive_task_blocks, \
7096+ .scheduler_setup = adaptive_setup \
7097+}
7098+
7099+
7100+sched_plugin_t *__init init_adaptive_plugin(void)
7101+{
7102+ int cpu;
7103+ cpu_entry_t *entry;
7104+
7105+ /* magic values given in the paper */
7106+ fc_a = _frac( 102, 1000);
7107+ fc_b = _frac( 303, 1000);
7108+
7109+ optimizer_period = 1000;
7110+ optimizer_min_invocation_sep = 200;
7111+ task_error_threshold = _frac(1, 2);
7112+
7113+ if (!s_plugin.ready_to_use)
7114+ {
7115+ /* initialize CPU state */
7116+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
7117+ entry = &per_cpu(adaptive_cpu_entries, cpu);
7118+ atomic_set(&entry->will_schedule, 0);
7119+ entry->linked = NULL;
7120+ entry->scheduled = NULL;
7121+ entry->cpu = cpu;
7122+ }
7123+
7124+ queue_lock_init(&adaptive_lock);
7125+ edf_domain_init(&adaptive, NULL);
7126+ s_plugin = INIT_SCHED_PLUGIN;
7127+ }
7128+ return &s_plugin;
7129+}
7130+
7131+
7132diff --git a/kernel/sched_edf_hsb.c b/kernel/sched_edf_hsb.c
7133new file mode 100644
7134index 0000000..b888e17
7135--- /dev/null
7136+++ b/kernel/sched_edf_hsb.c
7137@@ -0,0 +1,1724 @@
7138+/*
7139+ * kernel/sched_edf_hsb.c
7140+ *
7141+ * Implementation of the EDF-HSB scheduler plugin.
7142+ *
7143+ */
7144+
7145+#include <asm/uaccess.h>
7146+#include <linux/percpu.h>
7147+#include <linux/sched.h>
7148+#include <linux/list.h>
7149+
7150+#include <linux/litmus.h>
7151+#include <linux/sched_plugin.h>
7152+#include <linux/edf_common.h>
7153+#include <linux/fifo_common.h>
7154+#include <linux/sched_trace.h>
7155+
7156+/* undefine to remove capacity sharing */
7157+#define HSB_CAP_SHARE_ENABLED
7158+
7159+/* fake server PIDs */
7160+#define HRT_BASE_PID 50000
7161+#define SRT_BASE_PID 60000
7162+
7163+
7164+/******************************************************************************/
7165+/* Capacity queue */
7166+/******************************************************************************/
7167+
7168+int cap_check_resched(jiffie_t deadline);
7169+
7170+typedef struct {
7171+ int budget;
7172+ jiffie_t deadline;
7173+ pid_t donor;
7174+
7175+ struct list_head list;
7176+} capacity_t;
7177+
7178+typedef struct {
7179+ spinlock_t lock;
7180+ struct list_head queue;
7181+} capacity_queue_t;
7182+
7183+#define next_cap(q) list_entry((q)->queue.next, capacity_t, list)
7184+
7185+void capacity_queue_init(capacity_queue_t* queue)
7186+{
7187+ queue->lock = SPIN_LOCK_UNLOCKED;
7188+ INIT_LIST_HEAD(&queue->queue);
7189+}
7190+
7191+void __add_capacity(capacity_queue_t* queue, capacity_t *cap)
7192+{
7193+ struct list_head* pos;
7194+ capacity_t* queued;
7195+
7196+ list_for_each_prev(pos, &queue->queue) {
7197+ queued = list_entry(pos, capacity_t, list);
7198+ if ( time_before_eq(queued->deadline, cap->deadline)) {
7199+ __list_add(&cap->list, pos, pos->next);
7200+ return;
7201+ }
7202+ }
7203+ list_add(&cap->list, &queue->queue);
7204+}
7205+
7206+int __capacity_available(capacity_queue_t* queue)
7207+{
7208+ capacity_t *cap;
7209+
7210+ while (!list_empty(&queue->queue)) {
7211+ cap = list_entry(queue->queue.next, capacity_t, list);
7212+
7213+
7214+ if (time_before_eq(cap->deadline, jiffies)) {
7215+ list_del(queue->queue.next);
7216+ kfree(cap);
7217+ cap = NULL;
7218+ } else
7219+ break;
7220+ }
7221+
7222+ return !list_empty(&queue->queue);
7223+}
7224+
7225+void __return_capacity(capacity_queue_t* queue, capacity_t *cap)
7226+{
7227+ if (!cap->budget || time_before_eq(cap->deadline, jiffies))
7228+ kfree(cap);
7229+ else
7230+ __add_capacity(queue, cap);
7231+}
7232+
7233+
7234+void return_capacity(capacity_queue_t* queue, capacity_t *cap)
7235+
7236+{
7237+ unsigned long flags;
7238+
7239+ if (!cap->budget || time_before_eq(cap->deadline, jiffies))
7240+ kfree(cap);
7241+ else {
7242+ spin_lock_irqsave(&queue->lock, flags);
7243+ __add_capacity(queue, cap);
7244+ spin_unlock_irqrestore(&queue->lock, flags);
7245+ }
7246+}
7247+
7248+
7249+#define MIN_TIME_DELTA 1
7250+#define MIN_BUDGET 1
7251+
7252+#ifdef HSB_CAP_SHARE_ENABLED
7253+void release_capacity(capacity_queue_t* queue, unsigned int budget,
7254+ jiffie_t deadline, struct task_struct* t)
7255+{
7256+ capacity_t* cap;
7257+ unsigned long flags;
7258+
7259+ if (deadline >= jiffies + MIN_TIME_DELTA && budget >= MIN_BUDGET) {
7260+ cap = kmalloc(sizeof(capacity_t), GFP_ATOMIC);
7261+ if (cap) {
7262+ cap->budget = budget;
7263+ cap->deadline = deadline;
7264+ if (t)
7265+ cap->donor = t->pid;
7266+ else
7267+ cap->donor = 0;
7268+ spin_lock_irqsave(&queue->lock, flags);
7269+ __add_capacity(queue, cap);
7270+ cap_check_resched(next_cap(queue)->deadline);
7271+ spin_unlock_irqrestore(&queue->lock, flags);
7272+ if (t)
7273+ sched_trace_capacity_release(t);
7274+ }
7275+ }
7276+}
7277+
7278+void __release_capacity(capacity_queue_t* queue, unsigned int budget,
7279+ jiffie_t deadline, struct task_struct* t)
7280+{
7281+ capacity_t* cap;
7282+
7283+ if (deadline >= jiffies + MIN_TIME_DELTA && budget >= MIN_BUDGET) {
7284+ cap = kmalloc(sizeof(capacity_t), GFP_ATOMIC);
7285+ if (cap) {
7286+ cap->budget = budget;
7287+ cap->deadline = deadline;
7288+ if (t)
7289+ cap->donor = t->pid;
7290+ else
7291+ cap->donor = 0;
7292+ /* no locking, no resched check -- called from schedule */
7293+ __add_capacity(queue, cap);
7294+ if (t)
7295+ sched_trace_capacity_release(t);
7296+ }
7297+ }
7298+}
7299+
7300+
7301+capacity_t* __take_capacity(capacity_queue_t* queue, jiffie_t deadline, int deadline_matters)
7302+{
7303+ capacity_t* cap = NULL;
7304+
7305+ while (!list_empty(&queue->queue)) {
7306+ cap = list_entry(queue->queue.next, capacity_t, list);
7307+
7308+ if (deadline_matters && time_before(deadline, cap->deadline)) {
7309+ cap = NULL;
7310+ break;
7311+ }
7312+
7313+ list_del(queue->queue.next);
7314+ if (cap->deadline > jiffies) {
7315+ if (cap->deadline - jiffies < cap->budget)
7316+ cap->budget = cap->deadline - jiffies;
7317+ break;
7318+ }
7319+ kfree(cap);
7320+ cap = NULL;
7321+ }
7322+
7323+ return cap;
7324+}
7325+#else
7326+
7327+/* no capacity sharing */
7328+void release_capacity(capacity_queue_t* queue, unsigned int budget,
7329+ jiffie_t deadline, struct task_struct* t)
7330+{
7331+}
7332+
7333+capacity_t* __take_capacity(capacity_queue_t* queue, jiffie_t deadline, int deadline_matters)
7334+{
7335+ return NULL;
7336+}
7337+#endif
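/* Illustrative sketch in plain C, not part of the patch: the budget
 * clamp applied by __take_capacity() above.  A donated capacity can
 * never be consumed past its deadline, so the usable budget is capped
 * by the time remaining until that deadline (and is zero once expired).
 */
#include <stdio.h>

static int usable_budget(long deadline, long now, int budget)
{
	long remaining = deadline - now;

	if (remaining <= 0)
		return 0;                /* expired: would be freed */
	return remaining < budget ? (int)remaining : budget;
}

int main(void)
{
	printf("%d\n", usable_budget(108, 100, 5));  /* 5: fits    */
	printf("%d\n", usable_budget(103, 100, 5));  /* 3: clamped */
	printf("%d\n", usable_budget( 99, 100, 5));  /* 0: expired */
	return 0;
}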
7338+
7339+
7340+/******************************************************************************/
7341+/* server abstractions */
7342+/******************************************************************************/
7343+
7344+
7345+/* hrt_server_t - Abstraction of a hard real-time server.
7346+ *
7347+ * One HRT server per CPU. If it is unused, period and wcet may be zero.
7348+ * HRT servers are strictly periodic and retain their budget.
7349+ */
7350+typedef struct {
7351+ rt_domain_t domain;
7352+
7353+ unsigned int period;
7354+ unsigned int wcet;
7355+
7356+ jiffie_t deadline;
7357+ int budget;
7358+} hrt_server_t;
7359+
7360+/* be_server_t - Abstraction of best-effort server.
7361+ *
7362+ * This is pretty much only an accounting abstraction.
7363+ */
7364+typedef struct {
7365+ unsigned int period;
7366+ unsigned int wcet;
7367+
7368+ jiffie_t deadline;
7369+ jiffie_t release;
7370+ int budget;
7371+
7372+ struct list_head list;
7373+ pid_t pid;
7374+} be_server_t;
7375+
7376+/* cast to int to allow for negative slack, i.e. tardiness */
7377+#define server_slack(srv) \
7378+ ( ((int) (srv)->deadline - (int) jiffies) - (int) (srv)->budget )
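/* Illustrative sketch in plain C, not part of the patch: the slack
 * computation above.  A server whose deadline is 10 jiffies away with
 * 4 jiffies of budget left has 6 jiffies of slack; negative slack means
 * the remaining budget no longer fits before the deadline.
 */
#include <stdio.h>

static int slack(long deadline, long now, int budget)
{
	return (int)(deadline - now) - budget;
}

int main(void)
{
	printf("%d\n", slack(110, 100, 4));   /* prints 6  */
	printf("%d\n", slack(103, 100, 4));   /* prints -1 */
	return 0;
}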
7379+
7380+typedef struct {
7381+ int cpu;
7382+
7383+ hrt_server_t hrt;
7384+ be_server_t* be;
7385+ capacity_t* cap;
7386+
7387+ task_class_t exec_class;
7388+ jiffie_t cur_deadline;
7389+ atomic_t will_schedule;
7390+
7391+ struct list_head list;
7392+ spinlock_t lock;
7393+} cpu_state_t;
7394+
7395+
7396+DEFINE_PER_CPU(cpu_state_t, hsb_cpu_state);
7397+
7398+#define hrt_dom(cpu) (&per_cpu(hsb_cpu_state, cpu).hrt.domain)
7399+
7400+#define set_will_schedule() \
7401+ (atomic_set(&__get_cpu_var(hsb_cpu_state).will_schedule, 1))
7402+#define clear_will_schedule() \
7403+ (atomic_set(&__get_cpu_var(hsb_cpu_state).will_schedule, 0))
7404+#define test_will_schedule(cpu) \
7405+ (atomic_read(&per_cpu(hsb_cpu_state, cpu).will_schedule))
7406+
7407+
7408+static void prepare_hrt_release(hrt_server_t *srv, jiffie_t start)
7409+{
7410+ if (srv->period && srv->wcet) {
7411+ srv->deadline = start;
7412+ srv->budget = 0;
7413+ }
7414+}
7415+
7416+static void check_for_hrt_release(hrt_server_t *srv) {
7417+ if (srv->wcet && srv->period &&
7418+ time_before_eq(srv->deadline, jiffies)) {
7419+ srv->deadline += srv->period;
7420+ srv->budget = srv->wcet;
7421+ sched_trace_server_release(HRT_BASE_PID + smp_processor_id(),
7422+ srv->budget, srv->period, RT_CLASS_HARD);
7423+ }
7424+}
7425+
7426+/* A HRT client is eligible if either its deadline is before the
7427+ * server deadline or if the server has no slack left. The server
7428+ * must have budget left.
7429+ */
7430+static inline int hrt_client_eligible(hrt_server_t *srv)
7431+{
7432+ if (!list_empty(&srv->domain.ready_queue))
7433+ return srv->budget && (
7434+ time_before(get_deadline(next_ready(&srv->domain)),
7435+ srv->deadline)
7436+ || server_slack(srv) <= 0);
7437+ else
7438+ return 0;
7439+}
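/* Illustrative sketch in plain C, not part of the patch: the HRT
 * eligibility rule above with plain types.  The client may run only if
 * the server has budget and either the client's deadline precedes the
 * server's or the server has no slack left.
 */
#include <stdio.h>

static int eligible(int budget, long client_dl, long server_dl, int slack)
{
	return budget && (client_dl < server_dl || slack <= 0);
}

int main(void)
{
	printf("%d\n", eligible(3, 105, 110, 2));  /* earlier deadline: 1 */
	printf("%d\n", eligible(3, 115, 110, 0));  /* no slack left:    1 */
	printf("%d\n", eligible(0, 105, 110, 0));  /* no budget:        0 */
	return 0;
}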
7440+
7441+static void hsb_cpu_state_init(cpu_state_t* cpu_state,
7442+ check_resched_needed_t check,
7443+ int cpu)
7444+{
7445+ edf_domain_init(&cpu_state->hrt.domain, check);
7446+ cpu_state->hrt.budget = 0;
7447+ cpu_state->hrt.deadline = 0;
7448+ cpu_state->hrt.period = 0;
7449+ cpu_state->hrt.wcet = 0;
7450+
7451+ cpu_state->be = NULL;
7452+ cpu_state->cap = NULL;
7453+
7454+ cpu_state->cur_deadline = 0;
7455+ cpu_state->cpu = cpu;
7456+ cpu_state->lock = SPIN_LOCK_UNLOCKED;
7457+ cpu_state->exec_class = RT_CLASS_BEST_EFFORT;
7458+
7459+ atomic_set(&cpu_state->will_schedule, 0);
7460+ INIT_LIST_HEAD(&cpu_state->list);
7461+}
7462+
7463+/******************************************************************************/
7464+/* BE queue functions - mostly like edf_common.c */
7465+/******************************************************************************/
7466+
7467+#define be_earlier_deadline(a, b) (time_before(\
7468+ (a)->deadline, (b)->deadline))
7469+#define be_earlier_release(a, b) (time_before(\
7470+ (a)->release, (b)->release))
7471+
7472+
7473+static void be_add_ready(rt_domain_t* edf, be_server_t *new)
7474+{
7475+ unsigned long flags;
7476+ struct list_head *pos;
7477+ be_server_t *queued;
7478+ unsigned int passed = 0;
7479+
7480+ BUG_ON(!new);
7481+ /* first we need the write lock for rt_ready_queue */
7482+ write_lock_irqsave(&edf->ready_lock, flags);
7483+ /* find a spot where our deadline is earlier than the next */
7484+ list_for_each(pos, &edf->ready_queue) {
7485+ queued = list_entry(pos, be_server_t, list);
7486+ if (unlikely(be_earlier_deadline(new, queued))) {
7487+ __list_add(&new->list, pos->prev, pos);
7488+ goto out;
7489+ }
7490+ passed++;
7491+ }
7492+ /* if we get to this point either the list is empty or new has the
7493+ * lowest priority. Let's add it to the end. */
7494+ list_add_tail(&new->list, &edf->ready_queue);
7495+ out:
7496+ if (!passed)
7497+ edf->check_resched(edf);
7498+ write_unlock_irqrestore(&edf->ready_lock, flags);
7499+}
7500+
7501+static be_server_t* be_take_ready(rt_domain_t* edf)
7502+{
7503+ be_server_t *t = NULL;
7504+
7505+ if (!list_empty(&edf->ready_queue)) {
7506+ t = list_entry(edf->ready_queue.next, be_server_t, list);
7507+ /* kick it out of the ready list */
7508+ list_del(&t->list);
7509+ }
7510+ return t;
7511+}
7512+
7513+/*static be_server_t* get_be_server(rt_domain_t* edf)
7514+{
7515+ be_server_t *t = NULL;
7516+
7517+ spin_lock(&edf->release_lock);
7518+ write_lock(&edf->ready_lock);
7519+ t = be_take_ready(edf);
7520+
7521+ if (!t && !list_empty(&edf->release_queue)) {
7522+ t = list_entry(edf->release_queue.next, be_server_t, list);
7523+
7524+ list_del(&t->list);
7525+ }
7526+
7527+ write_unlock(&edf->ready_lock);
7528+ spin_unlock(&edf->release_lock);
7529+ return t;
7530+}*/
7531+
7532+static void be_add_release(rt_domain_t* edf, be_server_t *srv)
7533+{
7534+ unsigned long flags;
7535+ struct list_head *pos;
7536+ be_server_t *queued;
7537+
7538+ spin_lock_irqsave(&edf->release_lock, flags);
7539+ list_for_each_prev(pos, &edf->release_queue) {
7540+ queued = list_entry(pos, be_server_t, list);
7541+ if ((unlikely(be_earlier_release(queued, srv)))) {
7542+ /* the server at pos has an earlier release */
7543+ /* insert the new server behind it */
7544+ __list_add(&srv->list, pos, pos->next);
7545+ goto out;
7546+ }
7547+ }
7548+
7549+ list_add(&srv->list, &edf->release_queue);
7550+ out:
7551+ spin_unlock_irqrestore(&edf->release_lock, flags);
7552+}
7553+
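+/* be_try_release_pending - move all BE servers whose release time has been
+ * reached from the (release-ordered) release queue to the ready queue. Uses
+ * a trylock so that the caller never spins on the release lock.
+ */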
7554+static void be_try_release_pending(rt_domain_t* edf)
7555+{
7556+ unsigned long flags;
7557+ struct list_head *pos, *save;
7558+ be_server_t *queued;
7559+
7560+ if (spin_trylock_irqsave(&edf->release_lock, flags)) {
7561+ list_for_each_safe(pos, save, &edf->release_queue) {
7562+ queued = list_entry(pos, be_server_t, list);
7563+ if (likely(time_before_eq(
7564+ queued->release,
7565+ jiffies))) {
7566+ list_del(pos);
7567+ be_add_ready(edf, queued);
7568+ sched_trace_server_release(
7569+ queued->pid, queued->budget,
7570+ queued->period, RT_CLASS_BEST_EFFORT);
7571+ } else
7572+ /* the release queue is ordered */
7573+ break;
7574+ }
7575+ spin_unlock_irqrestore(&edf->release_lock, flags);
7576+ }
7577+}
7578+
7579+static void be_prepare_new_release(be_server_t *t, jiffie_t start) {
7580+ t->release = start;
7581+ t->deadline = t->release + t->period;
7582+ t->budget = t->wcet;
7583+}
7584+
7585+static void be_prepare_new_releases(rt_domain_t *edf, jiffie_t start)
7586+{
7587+ unsigned long flags;
7588+ struct list_head tmp_list;
7589+ struct list_head *pos, *n;
7590+ be_server_t *t;
7591+
7592+ INIT_LIST_HEAD(&tmp_list);
7593+
7594+ spin_lock_irqsave(&edf->release_lock, flags);
7595+ write_lock(&edf->ready_lock);
7596+
7597+
7598+ while (!list_empty(&edf->release_queue)) {
7599+ pos = edf->release_queue.next;
7600+ list_del(pos);
7601+ list_add(pos, &tmp_list);
7602+ }
7603+
7604+ while (!list_empty(&edf->ready_queue)) {
7605+ pos = edf->ready_queue.next;
7606+ list_del(pos);
7607+ list_add(pos, &tmp_list);
7608+
7609+ }
7610+
7611+ write_unlock(&edf->ready_lock);
7612+ spin_unlock_irqrestore(&edf->release_lock, flags);
7613+
7614+ list_for_each_safe(pos, n, &tmp_list) {
7615+ t = list_entry(pos, be_server_t, list);
7616+ list_del(pos);
7617+ be_prepare_new_release(t, start);
7618+ be_add_release(edf, t);
7619+ }
7620+
7621+}
7622+
7623+static void be_prepare_for_next_period(be_server_t *t)
7624+{
7625+ BUG_ON(!t);
7626+ /* prepare next release */
7627+ t->release = t->deadline;
7628+ t->deadline += t->period;
7629+ t->budget = t->wcet;
7630+}
7631+
7632+#define be_next_ready(edf) \
7633+ list_entry((edf)->ready_queue.next, be_server_t, list)
7634+
7635+
7636+/* be_preemption_needed - check whether the work currently executing on this
7637+ * CPU needs to be preempted by a pending best-effort server.
7638+ */
7639+static inline int be_preemption_needed(rt_domain_t* edf, cpu_state_t* state)
7640+{
7641+ /* we need the read lock for rt_ready_queue */
7642+ if (!list_empty(&edf->ready_queue))
7643+ {
7644+
7645+ if (state->exec_class == RT_CLASS_SOFT) {
7646+ if (state->cap)
7647+ return time_before(
7648+ be_next_ready(edf)->deadline,
7649+ state->cap->deadline);
7650+ else
7651+ return time_before(
7652+ be_next_ready(edf)->deadline,
7653+ state->cur_deadline);
7654+ } else
7655+ return 1;
7656+ }
7657+ return 0;
7658+}
7659+
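+/* be_enqueue - (re)insert a BE server into the proper queue: if its budget is
+ * exhausted it is first advanced to its next period; a server whose release
+ * time has been reached goes to the ready queue (while in real-time mode),
+ * otherwise it goes to the release queue.
+ */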
7660+static void be_enqueue(rt_domain_t* edf, be_server_t* srv)
7661+{
7662+ int new_release = 0;
7663+ if (!srv->budget) {
7664+ be_prepare_for_next_period(srv);
7665+ new_release = 1;
7666+ }
7667+
7668+ if (time_before_eq(srv->release, jiffies) &&
7669+ get_rt_mode() == MODE_RT_RUN) {
7670+ be_add_ready(edf, srv);
7671+ if (new_release)
7672+ sched_trace_server_release(
7673+ srv->pid, srv->budget,
7674+ srv->period, RT_CLASS_BEST_EFFORT);
7675+ } else
7676+ be_add_release(edf, srv);
7677+}
7678+
7679+static void be_preempt(rt_domain_t *be, cpu_state_t *state)
7680+{
7681+ be_server_t *srv;
7682+
7683+ spin_lock(&state->lock);
7684+ srv = state->be;
7685+ state->be = NULL;
7686+ spin_unlock(&state->lock);
7687+
7688+ /* add outside of lock to avoid deadlock */
7689+ if (srv)
7690+ be_enqueue(be, srv);
7691+}
7692+
7693+
7694+/******************************************************************************/
7695+/* Actual HSB implementation */
7696+/******************************************************************************/
7697+
7698+/* always acquire the cpu lock as the last lock to avoid deadlocks */
7699+static spinlock_t hsb_cpu_lock = SPIN_LOCK_UNLOCKED;
7700+/* the cpus queue themselves according to priority in here */
7701+static LIST_HEAD(hsb_cpu_queue);
7702+
7703+
7704+/* the global soft real-time domain */
7705+static rt_domain_t srt;
7706+/* the global best-effort server domain
7707+ * belongs conceptually to the srt domain, but has
7708+ * be_server_t* queued instead of task_t*
7709+ */
7710+static rt_domain_t be;
7711+
7712+static rt_domain_t hsb_fifo;
7713+
7714+static capacity_queue_t cap_queue;
7715+
7716+
7717+
7718+
7719+/* adjust_cpu_queue - Move the cpu entry to the correct place to maintain
7720+ * order in the cpu queue.
7721+ *
7722+ */
7723+static void adjust_cpu_queue(task_class_t class, jiffie_t deadline,
7724+ be_server_t *be)
7725+{
7726+ struct list_head *pos;
7727+ cpu_state_t *other;
7728+ cpu_state_t *entry;
7729+
7730+ spin_lock(&hsb_cpu_lock);
7731+
7732+ entry = &__get_cpu_var(hsb_cpu_state);
7733+
7734+ spin_lock(&entry->lock);
7735+ entry->exec_class = class;
7736+ entry->cur_deadline = deadline;
7737+ entry->be = be;
7738+
7739+ spin_unlock(&entry->lock);
7740+
7741+
7742+
7743+ if (be)
7744+ sched_trace_server_scheduled(
7745+ be->pid, RT_CLASS_BEST_EFFORT, be->budget,
7746+ be->deadline);
7747+ else if (class == RT_CLASS_HARD)
7748+ sched_trace_server_scheduled(
7749+ HRT_BASE_PID + smp_processor_id(), RT_CLASS_HARD,
7750+ entry->hrt.budget, entry->hrt.deadline);
7751+
7752+ list_del(&entry->list);
7753+ /* If we do not execute real-time jobs we just move
7754+ * to the end of the queue.
7755+ * If we execute hard real-time jobs we move to the start
7756+ * of the queue.
7757+ */
7758+
7759+ switch (entry->exec_class) {
7760+ case RT_CLASS_HARD:
7761+ list_add(&entry->list, &hsb_cpu_queue);
7762+ break;
7763+
7764+ case RT_CLASS_SOFT:
7765+ list_for_each(pos, &hsb_cpu_queue) {
7766+ other = list_entry(pos, cpu_state_t, list);
7767+ if (other->exec_class > RT_CLASS_SOFT ||
7768+ time_before_eq(entry->cur_deadline,
7769+ other->cur_deadline))
7770+ {
7771+ __list_add(&entry->list, pos->prev, pos);
7772+ goto out;
7773+ }
7774+ }
7775+ /* possible fall through if lowest SRT priority */
7776+
7777+ case RT_CLASS_BEST_EFFORT:
7778+ list_add_tail(&entry->list, &hsb_cpu_queue);
7779+ break;
7780+
7781+ default:
7782+ /* invalid exec_class value */
7783+ BUG();
7784+ }
7785+ out:
7786+ spin_unlock(&hsb_cpu_lock);
7787+}
7788+
7789+
7790+/* hrt_check_resched - check whether the HRT server on the given CPU needs to
7791+ * preempt the running task.
7792+ */
7793+static int hrt_check_resched(rt_domain_t *edf)
7794+{
7795+ hrt_server_t *srv = container_of(edf, hrt_server_t, domain);
7796+ cpu_state_t *state = container_of(srv, cpu_state_t, hrt);
7797+ int ret = 0;
7798+
7799+ spin_lock(&state->lock);
7800+
7801+ if (hrt_client_eligible(srv)) {
7802+ if (state->exec_class > RT_CLASS_HARD ||
7803+ time_before(
7804+ get_deadline(next_ready(edf)),
7805+ state->cur_deadline)
7806+ ) {
7807+ if (state->cpu == smp_processor_id())
7808+ set_tsk_need_resched(current);
7809+ else
7810+ smp_send_reschedule(state->cpu);
7811+ }
7812+ }
7813+
7814+ spin_unlock(&state->lock);
7815+ return ret;
7816+}
7817+
7818+
7819+/* srt_check_resched - Check whether another CPU needs to switch to an SRT task.
7820+ *
7821+ * The function only checks and kicks the last CPU in the queue; that CPU will,
7822+ * after rescheduling, check and kick the next one if necessary, and so on. The
7823+ * caller must ensure that it is not itself the last entry or that no reschedule is necessary.
7824+ *
7825+ * Caller must hold edf->ready_lock!
7826+ */
7827+static int srt_check_resched(rt_domain_t *edf)
7828+{
7829+ cpu_state_t *last;
7830+ int ret = 0;
7831+
7832+ spin_lock(&hsb_cpu_lock);
7833+
7834+ if (!list_empty(&srt.ready_queue)) {
7835+ last = list_entry(hsb_cpu_queue.prev, cpu_state_t, list);
7836+ /* guard against concurrent updates */
7837+ spin_lock(&last->lock);
7838+ if (last->exec_class == RT_CLASS_BEST_EFFORT || (
7839+ last->exec_class == RT_CLASS_SOFT &&
7840+ time_before(get_deadline(next_ready(&srt)),
7841+ last->cur_deadline)))
7842+ {
7843+ if (smp_processor_id() == last->cpu)
7844+ set_tsk_need_resched(current);
7845+ else
7846+ if (!test_will_schedule(last->cpu))
7847+ smp_send_reschedule(last->cpu);
7848+ ret = 1;
7849+ }
7850+ spin_unlock(&last->lock);
7851+ }
7852+
7853+ spin_unlock(&hsb_cpu_lock);
7854+ return ret;
7855+}
7856+
7857+
7858+/* be_check_resched - Check whether another CPU needs to switch to a BE server.
7859+ *
7860+ * Caller must hold edf->ready_lock!
7861+ */
7862+static int be_check_resched(rt_domain_t *edf)
7863+{
7864+ cpu_state_t *last;
7865+ int soft, bg;
7866+ int ret = 0;
7867+
7868+ spin_lock(&hsb_cpu_lock);
7869+
7870+ if (!list_empty(&be.ready_queue)) {
7871+ last = list_entry(hsb_cpu_queue.prev, cpu_state_t, list);
7872+ /* guard against concurrent updates */
7873+ spin_lock(&last->lock);
7874+
7875+ bg = last->exec_class == RT_CLASS_BEST_EFFORT;
7876+ soft = last->exec_class == RT_CLASS_SOFT;
7877+
7878+ if (bg || (soft && time_before(be_next_ready(&be)->deadline,
7879+ last->cur_deadline)))
7880+ {
7881+ if (smp_processor_id() == last->cpu)
7882+ set_tsk_need_resched(current);
7883+ else
7884+ if (!test_will_schedule(last->cpu))
7885+ smp_send_reschedule(last->cpu);
7886+ ret = 1;
7887+ }
7888+
7889+ spin_unlock(&last->lock);
7890+ }
7891+
7892+ spin_unlock(&hsb_cpu_lock);
7893+ return ret;
7894+}
7895+
7896+
7897+int cap_check_resched(jiffie_t deadline)
7898+{
7899+ unsigned long flags;
7900+ cpu_state_t *last;
7901+ int soft, bg;
7902+ int ret = 0;
7903+
7904+
7905+
7906+ if (get_rt_mode() == MODE_RT_RUN) {
7907+ spin_lock_irqsave(&hsb_cpu_lock, flags);
7908+
7909+ last = list_entry(hsb_cpu_queue.prev, cpu_state_t, list);
7910+ /* guard against concurrent updates */
7911+ spin_lock(&last->lock);
7912+
7913+ bg = last->exec_class == RT_CLASS_BEST_EFFORT;
7914+ soft = last->exec_class == RT_CLASS_SOFT;
7915+
7916+ if (bg || (soft && time_before(deadline,
7917+ last->cur_deadline)))
7918+ {
7919+ if (smp_processor_id() == last->cpu)
7920+ set_tsk_need_resched(current);
7921+ else
7922+ if (!test_will_schedule(last->cpu))
7923+ smp_send_reschedule(last->cpu);
7924+ ret = 1;
7925+ }
7926+
7927+ spin_unlock(&last->lock);
7928+
7929+ spin_unlock_irqrestore(&hsb_cpu_lock, flags);
7930+ }
7931+ return ret;
7932+}
7933+
7934+int fifo_check_resched(void)
7935+{
7936+ unsigned long flags;
7937+ cpu_state_t *last;
7938+ int ret = 0;
7939+
7940+ if (get_rt_mode() == MODE_RT_RUN) {
7941+ spin_lock_irqsave(&hsb_cpu_lock, flags);
7942+
7943+
7944+ last = list_entry(hsb_cpu_queue.prev, cpu_state_t, list);
7945+ /* guard against concurrent updates */
7946+
7947+ spin_lock(&last->lock);
7948+
7949+ if (last->exec_class == RT_CLASS_BEST_EFFORT)
7950+ {
7951+ if (smp_processor_id() == last->cpu)
7952+ set_tsk_need_resched(current);
7953+ else
7954+ if (!test_will_schedule(last->cpu))
7955+ smp_send_reschedule(last->cpu);
7956+ ret = 1;
7957+ }
7958+
7959+ spin_unlock(&last->lock);
7960+
7961+ spin_unlock_irqrestore(&hsb_cpu_lock, flags);
7962+ }
7963+ return ret;
7964+}
7965+
7966+
7967+
7968+static inline int hsb_preemption_needed(rt_domain_t* edf, cpu_state_t* state)
7969+{
7970+ /* we need the read lock for rt_ready_queue */
7971+ if (!list_empty(&edf->ready_queue))
7972+ {
7973+ if (state->exec_class == RT_CLASS_SOFT) {
7974+ if (state->cap)
7975+ return time_before(get_deadline(next_ready(edf))
7976+ , state->cap->deadline);
7977+ else
7978+ return time_before(get_deadline(next_ready(edf))
7979+ , state->cur_deadline);
7980+ } else
7981+ return 1;
7982+ }
7983+ return 0;
7984+}
7985+
7986+static inline int cap_preemption_needed(capacity_queue_t* q, cpu_state_t* state)
7987+{
7988+ /* we need the read lock for rt_ready_queue */
7989+ if (!list_empty(&q->queue))
7990+ {
7991+ if (state->exec_class == RT_CLASS_SOFT) {
7992+ if (state->cap)
7993+ return time_before(next_cap(q)->deadline
7994+ , state->cap->deadline);
7995+ else
7996+ return time_before(next_cap(q)->deadline
7997+ , state->cur_deadline);
7998+ } else
7999+ return 1;
8000+ }
8001+ return 0;
8002+}
8003+
8004+/* hsb_scheduler_tick - this function is called for every local timer
8005+ * interrupt.
8006+ *
8007+ * checks whether the current task has expired and checks
8008+ * whether we need to preempt it if it has not expired
8009+ */
8010+static reschedule_check_t hsb_scheduler_tick(void)
8011+{
8012+ unsigned long flags;
8013+ struct task_struct *t = current;
8014+ int resched = 0;
8015+
8016+ cpu_state_t *state = &__get_cpu_var(hsb_cpu_state);
8017+
8018+ /* expire tasks even if not in real-time mode
8019+ * this makes sure that at the end of real-time mode
8020+ * no tasks "run away forever".
8021+ */
8022+
8023+ /* charge BE server only if we are not running on a spare capacity */
8024+ if (state->be && !state->cap && --state->be->budget <= 0) {
8025+ sched_trace_server_completion(state->be->pid, 0,
8026+ state->be->deadline,
8027+ RT_CLASS_BEST_EFFORT);
8028+ be_preempt(&be, state);
8029+ resched = 1;
8030+ }
8031+
8032+ if (state->cap)
8033+ if (--state->cap->budget <= 0 ||
8034+ time_before_eq(state->cap->deadline, jiffies)) {
8035+ kfree(state->cap);
8036+ state->cap = NULL;
8037+ resched = 1;
8038+ }
8039+
8040+ if (is_realtime(t)) {
8041+ if (is_hrt(t) && (--state->hrt.budget <= 0)) {
8042+ sched_trace_server_completion(
8043+ HRT_BASE_PID + smp_processor_id(), 0,
8044+ state->hrt.deadline, RT_CLASS_HARD);
8045+ resched = 1;
8046+ }
8047+
8048+ /* account for received service... */
8049+ t->rt_param.times.exec_time++;
8050+
8051+ /* ...and charge current budget */
8052+ if (!state->cap) {
8053+ --t->time_slice;
8054+ /* a task should always be able to finish its job */
8055+ BUG_ON(!is_be(t) && !t->time_slice && !job_completed(t));
8056+ }
8057+
8058+ if (job_completed(t) || (is_be(t) && !t->time_slice)) {
8059+ sched_trace_job_completion(t);
8060+ set_rt_flags(t, RT_F_SLEEP);
8061+ resched = 1;
8062+ }
8063+ }
8064+
8065+
8066+ if (get_rt_mode() == MODE_RT_RUN)
8067+ {
8068+ try_release_pending(&state->hrt.domain);
8069+ check_for_hrt_release(&state->hrt);
8070+ try_release_pending(&srt);
8071+ be_try_release_pending(&be);
8072+
8073+ if (!resched)
8074+ switch (state->exec_class) {
8075+ case RT_CLASS_HARD:
8076+ read_lock_irqsave(&state->hrt.domain.ready_lock,
8077+ flags);
8078+ resched = edf_preemption_needed(
8079+ &state->hrt.domain,
8080+ t);
8081+ read_unlock_irqrestore(
8082+ &state->hrt.domain.ready_lock, flags);
8083+ break;
8084+
8085+ case RT_CLASS_SOFT:
8086+ case RT_CLASS_BEST_EFFORT:
8087+ local_irq_save(flags);
8088+
8089+ /* check for HRT jobs */
8090+ read_lock(&state->hrt.domain.ready_lock);
8091+ resched = hrt_client_eligible(&state->hrt);
8092+ read_unlock(&state->hrt.domain.ready_lock);
8093+
8094+ /* check for spare capacities */
8095+ if (!resched) {
8096+ spin_lock(&cap_queue.lock);
8097+ resched =
8098+ cap_preemption_needed(&cap_queue,
8099+ state);
8100+ spin_unlock(&cap_queue.lock);
8101+ }
8102+
8103+ /* check for SRT jobs */
8104+ if (!resched) {
8105+ read_lock(&srt.ready_lock);
8106+ resched = hsb_preemption_needed(
8107+ &srt, state);
8108+ read_unlock(&srt.ready_lock);
8109+ }
8110+
8111+ /* check for BE jobs */
8112+ if (!resched) {
8113+ read_lock(&be.ready_lock);
8114+ resched = be_preemption_needed(
8115+ &be, state);
8116+ read_unlock(&be.ready_lock);
8117+ }
8118+
8119+ /* check for background jobs */
8120+ if (!resched && !is_realtime(current))
8121+ resched = jobs_pending(&hsb_fifo);
8122+ local_irq_restore(flags);
8123+ break;
8124+
8125+ default:
8126+ /* invalid exec_class value */
8127+ BUG();
8128+ }
8129+ }
8130+
8131+ if (resched) {
8132+ set_will_schedule();
8133+ return FORCE_RESCHED;
8134+ } else
8135+ return NO_RESCHED;
8136+}
8137+
8138+static int schedule_hrt(struct task_struct * prev,
8139+ struct task_struct ** next, runqueue_t * rq)
8140+{
8141+ unsigned long flags;
8142+ int deactivate = 1;
8143+ cpu_state_t *state;
8144+
8145+
8146+ state = &__get_cpu_var(hsb_cpu_state);
8147+
8148+ write_lock_irqsave(&state->hrt.domain.ready_lock, flags);
8149+
8150+
8151+ if (state->cap) {
8152+ /* schedule_hrt does not hold the cap_queue lock */
8153+ return_capacity(&cap_queue, state->cap);
8154+ state->cap = NULL;
8155+ }
8156+
8157+ if (is_hrt(prev) && is_released(prev) && is_running(prev)
8158+ && !edf_preemption_needed(&state->hrt.domain, prev)) {
8159+ /* This really should only happen if the task has
8160+ * 100% utilization or when we got a bogus/delayed
8161+ * resched IPI.
8162+ */
8163+ TRACE("HRT: prev will be next, already released\n");
8164+ *next = prev;
8165+ deactivate = 0;
8166+ } else {
8167+ /* either not yet released, preempted, or non-rt */
8168+ *next = __take_ready(&state->hrt.domain);
8169+ /* the logic in hsb_schedule makes sure *next must exist
8170+ * if we get here */
8171+ BUG_ON(!*next);
8172+ /* stick the task into the runqueue */
8173+ __activate_task(*next, rq);
8174+ set_task_cpu(*next, smp_processor_id());
8175+ }
8176+
8177+ set_rt_flags(*next, RT_F_RUNNING);
8178+ adjust_cpu_queue(RT_CLASS_HARD, get_deadline(*next), NULL);
8179+ clear_will_schedule();
8180+
8181+ write_unlock_irqrestore(&state->hrt.domain.ready_lock, flags);
8182+ return deactivate;
8183+}
8184+
8185+
8186+static struct task_struct* find_min_slack_task(struct task_struct *prev,
8187+ rt_domain_t* edf)
8188+{
8189+ struct list_head *pos;
8190+ struct task_struct* tsk = NULL;
8191+ struct task_struct* cur;
8192+
8193+ if (is_realtime(prev) && is_running(prev) &&
8194+ get_rt_flags(prev) != RT_F_SLEEP)
8195+ tsk = prev;
8196+ list_for_each(pos, &edf->ready_queue) {
8197+ cur = list_entry(pos, struct task_struct, rt_list);
8198+ if (!tsk || task_slack(tsk) > task_slack(cur))
8199+ tsk = cur;
8200+ }
8201+ return tsk;
8202+}
8203+
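+/* null_heuristic - decide who receives a spare capacity when no SRT task is in
+ * danger of becoming tardy: if best-effort jobs are pending in the FIFO domain
+ * the capacity is left to them (return NULL); otherwise it is handed to the
+ * earliest-deadline SRT job, if any.
+ */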
8204+static struct task_struct* null_heuristic(struct task_struct *prev,
8205+ rt_domain_t* edf,
8206+ rt_domain_t* fifo)
8207+{
8208+ if (jobs_pending(fifo))
8209+ return NULL;
8210+ else if (!list_empty(&edf->ready_queue))
8211+ return list_entry(edf->ready_queue.next,
8212+ struct task_struct, rt_list);
8213+ else
8214+ return NULL;
8215+}
8216+
8217+/* caller holds all locks
8218+ */
8219+
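+/* schedule_capacity - consume a spare capacity on this CPU: pick up a capacity
+ * from the capacity queue (possibly exchanging the one already held), then hand
+ * it to the SRT task with the least slack, i.e. the one most likely to become
+ * tardy. If no task is in danger of being tardy the capacity goes to the
+ * earliest-deadline SRT job or, failing that, to a queued best-effort job.
+ * Returns nonzero when prev is not the task picked to run next.
+ */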
8220+static int schedule_capacity(struct task_struct *prev,
8221+ struct task_struct **next, runqueue_t *rq)
8222+{
8223+ cpu_state_t *state = &__get_cpu_var(hsb_cpu_state);
8224+ capacity_t* old;
8225+
8226+ if (state->cap) {
8227+ old = state->cap;
8228+ state->cap = __take_capacity(&cap_queue, old->deadline, 1);
8229+ if (!state->cap)
8230+ state->cap = old;
8231+ else
8232+ __return_capacity(&cap_queue, old);
8233+ } else
8234+ state->cap = __take_capacity(&cap_queue, 0, 0);
8235+
8236+
8237+ /* pick a task likely to be tardy */
8238+ *next = find_min_slack_task(prev, &srt);
8239+
8240+ /* only give away spare capacities if there is no task that
8241+ * is going to be tardy
8242+ */
8243+ if (*next && task_slack(*next) >= 0)
8244+ *next = null_heuristic(prev, &srt, &hsb_fifo);
8245+ if (*next && *next != prev)
8246+ list_del(&(*next)->rt_list);
8247+
8248+
8249+ /* if there is none pick a BE job */
8250+ if (!*next) {
8251+ if (is_realtime(prev) && is_be(prev) && is_running(prev) &&
8252+ get_rt_flags(prev) != RT_F_SLEEP)
8253+ *next = prev;
8254+ else
8255+ *next = take_ready(&hsb_fifo);
8256+ }
8257+
8258+ if (state->be)
8259+ be_preempt(&be, state);
8260+ BUG_ON(!state->cap);
8261+ if (*next && state->cap->donor) {
8262+ sched_trace_capacity_allocation(
8263+ *next, state->cap->budget, state->cap->deadline,
8264+ state->cap->donor);
8265+ }
8266+
8267+ return *next != prev;
8268+}
8269+
8270+
8271+
8272+#define BG 0
8273+#define SRT 1
8274+#define BE 2
8275+#define CAP 3
8276+
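+/* what_first - determine which source holds the earliest deadline: a spare
+ * capacity (CAP), the SRT ready queue (SRT), or a best-effort server (BE).
+ * Because only a strictly earlier deadline overrides an earlier choice, ties
+ * are resolved in favor of capacities, then SRT, then BE. Returns BG if none
+ * of the three has anything ready.
+ */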
8277+static inline int what_first(rt_domain_t *be, rt_domain_t *srt, capacity_queue_t* q)
8278+{
8279+ jiffie_t sdl = 0, bdl= 0, cdl = 0, cur;
8280+ int _srt = !list_empty(&srt->ready_queue);
8281+ int _be = !list_empty(&be->ready_queue);
8282+ int _cap = __capacity_available(q);
8283+
8284+
8285+ int ret = BG; /* nothing ready => background mode*/
8286+ cur = 0;
8287+
8288+ if (_srt)
8289+ sdl = get_deadline(next_ready(srt));
8290+ if (_be)
8291+ bdl = be_next_ready(be)->deadline;
8292+ if (_cap)
8293+ cdl = next_cap(q)->deadline;
8294+
8295+
8296+
8297+ if (_cap) {
8298+ ret = CAP;
8299+ cur = cdl;
8300+ }
8301+ if (_srt && (time_before(sdl, cur) || !ret)) {
8302+ ret = SRT;
8303+ cur = sdl;
8304+ }
8305+ if (_be && (time_before(bdl, cur) || !ret)) {
8306+ ret = BE;
8307+ cur = bdl;
8308+ }
8309+ return ret;
8310+}
8311+
8312+
8313+
8314+static int schedule_srt_be_cap(struct task_struct *prev,
8315+ struct task_struct **next, runqueue_t *rq)
8316+{
8317+ task_class_t class = RT_CLASS_BEST_EFFORT;
8318+ jiffie_t deadline = 0;
8319+ unsigned long flags;
8320+ int deactivate = 1;
8321+ be_server_t* bes;
8322+ cpu_state_t* state;
8323+ int type = BG;
8324+
8325+reschedule:
8326+ write_lock_irqsave(&srt.ready_lock, flags);
8327+ write_lock(&be.ready_lock);
8328+ spin_lock(&cap_queue.lock);
8329+
8330+
8331+ state = &__get_cpu_var(hsb_cpu_state);
8332+ bes = NULL;
8333+
8334+ clear_will_schedule();
8335+
8336+ if (is_realtime(prev) && (is_released(prev) || is_be(prev)) &&
8337+ is_running(prev) && !hsb_preemption_needed(&srt, state) &&
8338+ !be_preemption_needed(&be, state)
8339+ ) {
8340+ /* Our current task's next job has already been
8341+ * released and has higher priority than the highest
8342+ * priority waiting task; in other words: it is tardy.
8343+ * We just keep it.
8344+ */
8345+ TRACE("prev will be next, already released\n");
8346+ *next = prev;
8347+ class = prev->rt_param.basic_params.class;
8348+ deadline = get_deadline(*next);
8349+ deactivate = 0;
8350+ } else {
8351+ /* either not yet released, preempted, or non-rt */
8352+ type = what_first(&be, &srt, &cap_queue);
8353+ switch (type) {
8354+ case CAP:
8355+ /* capacity */
8356+ deactivate = schedule_capacity(prev, next, rq);
8357+ deadline = state->cap->deadline;
8358+ if (*next)
8359+ class = RT_CLASS_SOFT;
8360+ else
8361+ class = RT_CLASS_BEST_EFFORT;
8362+ break;
8363+ case BE:
8364+ /* be */
8365+ *next = NULL;
8366+ bes = be_take_ready(&be);
8367+ if (bes) {
8368+ class = RT_CLASS_SOFT;
8369+ deadline = bes->deadline;
8370+ *next = take_ready(&hsb_fifo);
8371+ if (!*next) {
8372+ /* deactivate */
8373+ __release_capacity(&cap_queue,
8374+ bes->budget,
8375+ bes->deadline, NULL);
8376+ bes->budget = 0;
8377+ barrier();
8378+ spin_unlock(&cap_queue.lock);
8379+ write_unlock(&be.ready_lock);
8380+ write_unlock_irqrestore(&srt.ready_lock,
8381+ flags);
8382+ be_enqueue(&be, bes);
8383+ goto reschedule;
8384+ }
8385+ }
8386+ break;
8387+ case SRT:
8388+ /* srt */
8389+ *next = __take_ready(&srt);
8390+ if (*next) {
8391+ class = RT_CLASS_SOFT;
8392+ deadline = get_deadline(*next);
8393+ }
8394+ break;
8395+ case BG:
8396+ /* background server mode */
8397+ class = RT_CLASS_BEST_EFFORT;
8398+ deadline = 0;
8399+ *next = take_ready(&hsb_fifo);
8400+ break;
8401+ }
8402+
8403+
8404+ /* give back capacities */
8405+ if (type != CAP && state->cap) {
8406+ __return_capacity(&cap_queue, state->cap);
8407+ state->cap = NULL;
8408+ }
8409+ if (*next && deactivate) {
8410+ /* mark the task as executing on this cpu */
8411+ set_task_cpu(*next, smp_processor_id());
8412+ /* stick the task into the runqueue */
8413+ __activate_task(*next, rq);
8414+ }
8415+ }
8416+
8417+ adjust_cpu_queue(class, deadline, bes);
8418+
8419+ switch (type) {
8420+ case BG:
8421+ break;
8422+ case BE:
8423+ be.check_resched(&be);
8424+ break;
8425+ case SRT:
8426+ srt.check_resched(&srt);
8427+ break;
8428+ case CAP:
8429+ if (!list_empty(&cap_queue.queue))
8430+ cap_check_resched(list_entry(cap_queue.queue.next,
8431+ capacity_t, list)->deadline);
8432+ break;
8433+ }
8434+
8435+
8436+ if(*next)
8437+ set_rt_flags(*next, RT_F_RUNNING);
8438+
8439+ spin_unlock(&cap_queue.lock);
8440+ write_unlock(&be.ready_lock);
8441+ write_unlock_irqrestore(&srt.ready_lock, flags);
8442+ return deactivate;
8443+}
8444+
8445+
8446+static int hsb_schedule(struct task_struct * prev, struct task_struct ** next,
8447+ runqueue_t * rq)
8448+{
8449+ int need_deactivate = 1;
8450+ cpu_state_t *state = NULL;
8451+
8452+ preempt_disable();
8453+
8454+ state = &__get_cpu_var(hsb_cpu_state);
8455+
8456+ be_preempt(&be, state);
8457+
8458+
8459+ if (is_realtime(prev) && !is_be(prev) &&
8460+ get_rt_flags(prev) == RT_F_SLEEP)
8461+ {
8462+ TRACE("preparing %d for next period\n", prev->pid);
8463+ release_capacity(&cap_queue, prev->time_slice,
8464+ prev->rt_param.times.deadline, prev);
8465+ edf_prepare_for_next_period(prev);
8466+ }
8467+
8468+ if (get_rt_mode() == MODE_RT_RUN) {
8469+ /* we need to schedule HRT if an HRT job is pending or when
8470+ * we have a non-expired HRT job on the cpu
8471+ */
8472+
8473+ if (hrt_client_eligible(&state->hrt) ||
8474+ unlikely((is_hrt(prev) && is_running(prev) &&
8475+ get_rt_flags(prev) != RT_F_SLEEP))) {
8476+ if (state->cap) {
8477+ return_capacity(&cap_queue, state->cap);
8478+ state->cap = NULL;
8479+ }
8480+ need_deactivate = schedule_hrt(prev, next, rq);
8481+ } else
8482+ need_deactivate = schedule_srt_be_cap(prev, next, rq);
8483+
8484+ }
8485+
8486+ if (is_realtime(prev) && need_deactivate && prev->array) {
8487+ /* take it out of the run queue */
8488+ deactivate_task(prev, rq);
8489+ }
8490+
8491+ preempt_enable();
8492+
8493+ return 0;
8494+}
8495+
8496+/* put task into correct queue */
8497+static inline void hsb_add_release(struct task_struct *t)
8498+{
8499+ if (is_hrt(t))
8500+ add_release(hrt_dom(get_partition(t)), t);
8501+ else if (is_srt(t))
8502+ add_release(&srt, t);
8503+ else if (is_be(t)) {
8504+ t->time_slice = 0;
8505+ add_ready(&hsb_fifo, t);
8506+ fifo_check_resched();
8507+ } else
8508+ BUG();
8509+
8510+}
8511+
8512+/* put task into correct queue */
8513+static inline void hsb_add_ready(struct task_struct *t)
8514+{
8515+ if (is_hrt(t))
8516+ add_ready(hrt_dom(get_partition(t)), t);
8517+ else if (is_srt(t))
8518+ add_ready(&srt, t);
8519+ else if (is_be(t)) {
8520+ add_ready(&hsb_fifo, t);
8521+ fifo_check_resched();
8522+ }
8523+ else
8524+ BUG();
8525+}
8526+
8527+
8528+/* _finish_switch - we just finished the switch away from prev
8529+ * it is now safe to requeue the task
8530+ */
8531+static void hsb_finish_switch(struct task_struct *prev)
8532+{
8533+ if (!is_realtime(prev) || !is_running(prev))
8534+ return;
8535+
8536+ TRACE("finish switch for %d\n", prev->pid);
8537+
8538+ if (is_be(prev)) {
8539+ add_ready(&hsb_fifo, prev);
8540+ return;
8541+ }
8542+
8543+ if (get_rt_flags(prev) == RT_F_SLEEP ||
8544+ get_rt_mode() != MODE_RT_RUN) {
8545+ /* This task has expired.
8546+ * _schedule has already taken care of updating the
8547+ * release time and the deadline. We just have to check
8548+ * whether it has been released.
8549+ */
8550+ if (is_released(prev) && get_rt_mode() == MODE_RT_RUN) {
8551+ sched_trace_job_release(prev);
8552+ hsb_add_ready(prev);
8553+ TRACE("%d goes straight to ready queue\n", prev->pid);
8554+ }
8555+ else
8556+ /* it has got to wait */
8557+ hsb_add_release(prev);
8558+ }
8559+ else {
8560+ /* this is a forced preemption
8561+ * thus the task stays in the ready_queue
8562+ * we only have to make it available to other cpus
8563+ */
8564+ hsb_add_ready(prev);
8565+ }
8566+}
8567+
8568+
8569+/* Prepare a task for running in RT mode.
8570+ * Enqueues the task into the master queue data structure.
8571+ * Returns
8572+ * -EPERM if the task is not TASK_STOPPED.
8573+ */
8574+static long hsb_prepare_task(struct task_struct * t)
8575+{
8576+ TRACE("edf-hsb: prepare task %d\n", t->pid);
8577+
8578+ if (t->state == TASK_STOPPED) {
8579+ __setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1);
8580+
8581+ if (get_rt_mode() == MODE_RT_RUN && !is_be(t))
8582+ /* Real-time mode is already active;
8583+ * prepare an immediate release.
8584+ */
8585+ edf_release_now(t);
8586+ /* The task must be marked TASK_RUNNING while it sits in the queue;
8587+ * otherwise the signal code will try to wake it up, with fatal consequences.
8588+ */
8589+ t->state = TASK_RUNNING;
8590+ if (is_be(t))
8591+ t->rt_param.times.deadline = 0;
8592+ hsb_add_release(t);
8593+ return 0;
8594+ }
8595+ else
8596+ return -EPERM;
8597+}
8598+
8599+static void hsb_wake_up_task(struct task_struct *task)
8600+{
8601+ /* We must determine whether the task should go into the release
8602+ * queue or into the ready queue. It may enter the ready queue
8603+ * if it has credit left in its time slice and has not yet reached
8604+ * its deadline. If it is now past its deadline we assume this is the
8605+ * arrival of a new sporadic job and thus put it in the ready queue
8606+ * anyway. If it has zero budget and the next release is in the future
8607+ * it has to go to the release queue.
8608+ */
8609+ TRACE("edf-hsb: wake up %d with budget=%d\n",
8610+ task->pid, task->time_slice);
8611+ task->state = TASK_RUNNING;
8612+
8613+ if (is_be(task)) {
8614+ task->rt_param.times.last_release = jiffies;
8615+ hsb_add_release(task);
8616+ }
8617+ else if (is_tardy(task)) {
8618+ /* new sporadic release */
8619+ edf_release_now(task);
8620+ sched_trace_job_release(task);
8621+ hsb_add_ready(task);
8622+ }
8623+ else if (task->time_slice) {
8624+ /* came back in time before deadline
8625+ */
8626+ set_rt_flags(task, RT_F_RUNNING);
8627+ hsb_add_ready(task);
8628+ }
8629+ else {
8630+ hsb_add_release(task);
8631+ }
8632+
8633+}
8634+
8635+static void hsb_task_blocks(struct task_struct *t)
8636+{
8637+ /* not really anything to do since it can only block if
8638+ * it is running, and when it is not running it is not in any
8639+ * queue anyway.
8640+ */
8641+ TRACE("task %d blocks with budget=%d\n", t->pid, t->time_slice);
8642+ if (is_be(t))
8643+ sched_trace_job_completion(t);
8644+}
8645+
8646+
8647+static int hsb_mode_change(int new_mode)
8648+{
8649+ int cpu;
8650+ cpu_state_t *entry;
8651+ jiffie_t start;
8652+
8653+ TRACE("[%d] edf-hsb: mode changed to %d\n", smp_processor_id(),
8654+ new_mode);
8655+ if (new_mode == MODE_RT_RUN) {
8656+ start = jiffies + 20;
8657+ rerelease_all(&srt, edf_release_at);
8658+ be_prepare_new_releases(&be, start);
8659+
8660+ /* initialize per CPU state
8661+ * we can't do this at boot time because we don't know
8662+ * which CPUs will be online and we can't put non-existing
8663+ * cpus into the queue
8664+ */
8665+ spin_lock(&hsb_cpu_lock);
8666+ /* get old cruft out of the way in case we reenter real-time
8667+ * mode for a second time
8668+ */
8669+ while (!list_empty(&hsb_cpu_queue))
8670+ list_del(hsb_cpu_queue.next);
8671+ /* reinitialize */
8672+ for_each_online_cpu(cpu) {
8673+ entry = &per_cpu(hsb_cpu_state, cpu);
8674+ atomic_set(&entry->will_schedule, 0);
8675+ entry->exec_class = RT_CLASS_BEST_EFFORT;
8676+ entry->cur_deadline = 0;
8677+ list_add(&entry->list, &hsb_cpu_queue);
8678+
8679+ rerelease_all(&entry->hrt.domain, edf_release_at);
8680+ prepare_hrt_release(&entry->hrt, start);
8681+ }
8682+ spin_unlock(&hsb_cpu_lock);
8683+
8684+ }
8685+ TRACE("[%d] edf-hsb: mode change done\n", smp_processor_id());
8686+ return 0;
8687+}
8688+
8689+
8690+typedef enum {
8691+ EDF_HSB_SET_HRT,
8692+ EDF_HSB_GET_HRT,
8693+ EDF_HSB_CREATE_BE
8694+} edf_hsb_setup_cmds_t;
8695+
8696+typedef struct {
8697+ int cpu;
8698+ unsigned int wcet;
8699+ unsigned int period;
8700+} setup_hrt_param_t;
8701+
8702+typedef struct {
8703+ unsigned int wcet;
8704+ unsigned int period;
8705+} create_be_param_t;
8706+
8707+typedef struct {
8708+ union {
8709+ setup_hrt_param_t setup_hrt;
8710+ create_be_param_t create_be;
8711+ };
8712+} param_t;
8713+
8714+static pid_t next_be_server_pid = SRT_BASE_PID;
8715+
8716+static int hsb_scheduler_setup(int cmd, void __user* up)
8717+{
8718+ unsigned long flags;
8719+ int error = -EINVAL;
8720+ cpu_state_t* state;
8721+ be_server_t* srv;
8722+ param_t param;
8723+
8724+ switch (cmd) {
8725+ case EDF_HSB_SET_HRT:
8726+ if (copy_from_user(&param, up, sizeof(setup_hrt_param_t))) {
8727+ error = -EFAULT;
8728+ goto out;
8729+ }
8730+ if (!cpu_online(param.setup_hrt.cpu)) {
8731+ printk(KERN_WARNING "scheduler setup: "
8732+ "CPU %d is not online!\n", param.setup_hrt.cpu);
8733+ error = -EINVAL;
8734+ goto out;
8735+ }
8736+ if (param.setup_hrt.period < param.setup_hrt.wcet) {
8737+ printk(KERN_WARNING "period < wcet!\n");
8738+ error = -EINVAL;
8739+ goto out;
8740+ }
8741+
8742+ state = &per_cpu(hsb_cpu_state, param.setup_hrt.cpu);
8743+ spin_lock_irqsave(&state->lock, flags);
8744+
8745+ state->hrt.wcet = param.setup_hrt.wcet;
8746+ state->hrt.period = param.setup_hrt.period;
8747+
8748+ spin_unlock_irqrestore(&state->lock, flags);
8749+
8750+ printk(KERN_WARNING "edf-hsb: set HRT #%d to (%d, %d)\n",
8751+ param.setup_hrt.cpu, param.setup_hrt.wcet,
8752+ param.setup_hrt.period);
8753+
8754+ error = 0;
8755+
8756+ break;
8757+
8758+ case EDF_HSB_GET_HRT:
8759+ if (copy_from_user(&param, up, sizeof(setup_hrt_param_t))) {
8760+ error = -EFAULT;
8761+ goto out;
8762+ }
8763+ if (!cpu_online(param.setup_hrt.cpu)) {
8764+ error = -EINVAL;
8765+ goto out;
8766+ }
8767+ state = &per_cpu(hsb_cpu_state, param.setup_hrt.cpu);
8768+ spin_lock_irqsave(&state->lock, flags);
8769+
8770+ param.setup_hrt.wcet = state->hrt.wcet;
8771+ param.setup_hrt.period = state->hrt.period;
8772+
8773+ spin_unlock_irqrestore(&state->lock, flags);
8774+
8775+ if (copy_to_user(up, &param, sizeof(setup_hrt_param_t))) {
8776+ error = -EFAULT;
8777+ goto out;
8778+ }
8779+ error = 0;
8780+ break;
8781+
8782+ case EDF_HSB_CREATE_BE:
8783+ if (copy_from_user(&param, up, sizeof(create_be_param_t))) {
8784+ error = -EFAULT;
8785+ goto out;
8786+ }
8787+ if (param.create_be.period < param.create_be.wcet ||
8788+ !param.create_be.period || !param.create_be.wcet) {
8789+ error = -EINVAL;
8790+ goto out;
8791+ }
8792+ srv = (be_server_t*) kmalloc(sizeof(be_server_t), GFP_KERNEL);
8793+ if (!srv) {
8794+ error = -ENOMEM;
8795+ goto out;
8796+ }
8797+ srv->wcet = param.create_be.wcet;
8798+ srv->period = param.create_be.period;
8799+ srv->pid = next_be_server_pid++;
8800+ INIT_LIST_HEAD(&srv->list);
8801+ be_prepare_new_release(srv, jiffies);
8802+ be_enqueue(&be, srv);
8803+
8804+ printk(KERN_WARNING "edf-hsb: created a BE with (%d, %d)\n",
8805+ param.create_be.wcet, param.create_be.period);
8806+
8807+ error = 0;
8808+ break;
8809+
8810+ default:
8811+ printk(KERN_WARNING "edf-hsb: unknown command %d\n", cmd);
8812+ }
8813+
8814+out:
8815+ return error;
8816+}
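+/* A hedged usage sketch for the setup commands above: assuming the plugin's
+ * scheduler_setup(int cmd, void __user *param) hook is reachable from user
+ * space through a system call wrapper of the same name (the wrapper name and
+ * its exact prototype are assumptions, not shown in this hunk), a tool could
+ * configure the servers roughly as follows:
+ *
+ *	setup_hrt_param_t hrt = { .cpu = 0, .wcet = 2, .period = 10 };
+ *	create_be_param_t be = { .wcet = 5, .period = 100 };
+ *	scheduler_setup(EDF_HSB_SET_HRT, &hrt);   // a (2, 10) HRT server on CPU 0
+ *	scheduler_setup(EDF_HSB_CREATE_BE, &be);  // one (5, 100) BE server
+ */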
8817+
8818+/* Plugin object */
8819+static sched_plugin_t s_plugin __cacheline_aligned_in_smp = {
8820+ .ready_to_use = 0
8821+};
8822+
8823+
8824+/*
8825+ * Plugin initialization code.
8826+ */
8827+#define INIT_SCHED_PLUGIN (struct sched_plugin){\
8828+ .plugin_name = "EDF-HSB",\
8829+ .ready_to_use = 1,\
8830+ .scheduler_tick = hsb_scheduler_tick,\
8831+ .prepare_task = hsb_prepare_task,\
8832+ .sleep_next_period = edf_sleep_next_period,\
8833+ .schedule = hsb_schedule,\
8834+ .finish_switch = hsb_finish_switch,\
8835+ .mode_change = hsb_mode_change,\
8836+ .wake_up_task = hsb_wake_up_task,\
8837+ .task_blocks = hsb_task_blocks, \
8838+ .scheduler_setup = hsb_scheduler_setup \
8839+}
8840+
8841+
8842+sched_plugin_t *__init init_edf_hsb_plugin(void)
8843+{
8844+ int i;
8845+
8846+ if (!s_plugin.ready_to_use)
8847+ {
8848+ capacity_queue_init(&cap_queue);
8849+ edf_domain_init(&srt, srt_check_resched);
8850+ edf_domain_init(&be, be_check_resched);
8851+ fifo_domain_init(&hsb_fifo, NULL);
8852+ for (i = 0; i < NR_CPUS; i++)
8853+ {
8854+ hsb_cpu_state_init(&per_cpu(hsb_cpu_state, i),
8855+ hrt_check_resched, i);
8856+ printk("HRT server %d initialized.\n", i);
8857+ }
8858+ s_plugin = INIT_SCHED_PLUGIN;
8859+ }
8860+ return &s_plugin;
8861+}
8862diff --git a/kernel/sched_global_edf.c b/kernel/sched_global_edf.c
8863new file mode 100644
8864index 0000000..bc32373
8865--- /dev/null
8866+++ b/kernel/sched_global_edf.c
8867@@ -0,0 +1,550 @@
8868+/*
8869+ * kernel/sched_global_edf.c
8870+ *
8871+ * Re-Implementation of the Global EDF scheduler.
8872+ *
8873+ * This version works without using the struct queue. It uses the
8874+ * builtin kernel lists.
8875+ */
8876+
8877+#include <linux/percpu.h>
8878+#include <linux/sched.h>
8879+#include <linux/list.h>
8880+
8881+#include <linux/litmus.h>
8882+#include <linux/sched_plugin.h>
8883+
8884+#include <linux/edf_common.h>
8885+#include <linux/sched_trace.h>
8886+
8887+
8888+/* cpu_entry_t - maintain state of the priority of cpu's current task
8889+ * this is needed to check for priority inversions.
8890+ */
8891+typedef struct {
8892+ int cpu;
8893+ int executes_realtime;
8894+ jiffie_t cur_deadline;
8895+ struct list_head list;
8896+ atomic_t will_schedule;
8897+} cpu_entry_t;
8898+DEFINE_PER_CPU(cpu_entry_t, gedf_cpu_entries);
8899+
8900+#define set_will_schedule() \
8901+ (atomic_set(&__get_cpu_var(gedf_cpu_entries).will_schedule, 1))
8902+#define clear_will_schedule() \
8903+ (atomic_set(&__get_cpu_var(gedf_cpu_entries).will_schedule, 0))
8904+#define test_will_schedule(cpu) \
8905+ (atomic_read(&per_cpu(gedf_cpu_entries, cpu).will_schedule))
8906+
8907+
8908+/* always acquire the cpu lock as the last lock to avoid deadlocks */
8909+static spinlock_t gedf_cpu_lock = SPIN_LOCK_UNLOCKED;
8910+/* the cpus queue themselves according to priority in here */
8911+static LIST_HEAD(gedf_cpu_queue);
8912+
8913+
8914+static rt_domain_t gedf;
8915+
8916+#define DUMP(args...) TRACE(args)
8917+
8918+/* adjust_cpu_queue - Move the cpu entry to the correct place to maintain
8919+ * order in the cpu queue. Caller must hold ready write lock.
8920+ *
8921+ */
8922+static void adjust_cpu_queue(int exec_rt, jiffie_t deadline)
8923+{
8924+ struct list_head *pos;
8925+ cpu_entry_t *other;
8926+ cpu_entry_t *entry;
8927+
8928+ spin_lock(&gedf_cpu_lock);
8929+
8930+ entry = &__get_cpu_var(gedf_cpu_entries);
8931+ entry->executes_realtime = exec_rt;
8932+ entry->cur_deadline = deadline;
8933+
8934+ list_del(&entry->list);
8935+ /* if we do not execute real-time jobs we just move
8936+ * to the end of the queue
8937+ */
8938+ if (entry->executes_realtime)
8939+ list_for_each(pos, &gedf_cpu_queue) {
8940+ other = list_entry(pos, cpu_entry_t, list);
8941+ if (!other->executes_realtime ||
8942+ time_before_eq(entry->cur_deadline,
8943+ other->cur_deadline))
8944+ {
8945+ __list_add(&entry->list, pos->prev, pos);
8946+ goto out;
8947+ }
8948+ }
8949+ /* if we get this far we have the lowest priority task */
8950+ list_add_tail(&entry->list, &gedf_cpu_queue);
8951+
8952+ out:
8953+ spin_unlock(&gedf_cpu_lock);
8954+}
8955+
8956+
8957+/* gedf_check_resched - Check whether another CPU needs to reschedule.
8958+ *
8959+ * The function only checks and kicks the last CPU in the queue; that CPU will,
8960+ * after rescheduling, check and kick the next one if necessary, and so on. The
8961+ * caller must ensure that it is not itself the last entry or that no reschedule is necessary.
8962+ *
8963+ */
8964+static int gedf_check_resched(rt_domain_t *edf)
8965+{
8966+ cpu_entry_t *last;
8967+ int ret = 0;
8968+
8969+ spin_lock(&gedf_cpu_lock);
8970+
8971+ if (!list_empty(&edf->ready_queue)) {
8972+ last = list_entry(gedf_cpu_queue.prev, cpu_entry_t, list);
8973+ if (!last->executes_realtime ||
8974+ time_before(next_ready(edf)->rt_param.times.deadline,
8975+ last->cur_deadline))
8976+ {
8977+ if (smp_processor_id() == last->cpu)
8978+ set_tsk_need_resched(current);
8979+ else
8980+ if (!test_will_schedule(last->cpu))
8981+ smp_send_reschedule(last->cpu);
8982+ ret = 1;
8983+ }
8984+ }
8985+
8986+ spin_unlock(&gedf_cpu_lock);
8987+ return ret;
8988+}
8989+
8990+
8991+
8992+/* gedf_scheduler_tick - this function is called for every local timer
8993+ * interrupt.
8994+ *
8995+ * checks whether the current task has expired and checks
8996+ * whether we need to preempt it if it has not expired
8997+ */
8998+static reschedule_check_t gedf_scheduler_tick(void)
8999+{
9000+ unsigned long flags;
9001+ struct task_struct *t = current;
9002+ reschedule_check_t want_resched = NO_RESCHED;
9003+
9004+ /* expire tasks even if not in real-time mode
9005+ * this makes sure that at the end of real-time mode
9006+ * no tasks "run away forever".
9007+ */
9008+ BUG_ON(is_realtime(t) && t->time_slice > 100000);
9009+ if (is_realtime(t) && (!--t->time_slice)) {
9010+ /* this task has exhausted its budget in this period */
9011+ set_rt_flags(t, RT_F_SLEEP);
9012+ want_resched = FORCE_RESCHED;
9013+ set_will_schedule();
9014+ sched_trace_job_completion(t);
9015+ }
9016+ if (get_rt_mode() == MODE_RT_RUN)
9017+ {
9018+ /* check whether anything is waiting to be released
9019+ * this could probably be moved to the global timer
9020+ * interrupt handler since the state will only change
9021+ * once per jiffie
9022+ */
9023+ try_release_pending(&gedf);
9024+ if (want_resched != FORCE_RESCHED)
9025+ {
9026+ read_lock_irqsave(&gedf.ready_lock, flags);
9027+ if (edf_preemption_needed(&gedf, t))
9028+ {
9029+ want_resched = FORCE_RESCHED;
9030+ set_will_schedule();
9031+ }
9032+ read_unlock_irqrestore(&gedf.ready_lock, flags);
9033+ }
9034+ }
9035+ return want_resched;
9036+}
9037+
9038+/* This is the main Global EDF schedule function.
9039+ *
9040+ * Assumes the caller holds the lock for rq and that irqs are disabled.
9041+ * This function only works for indirect switching.
9042+ */
9043+static int gedf_schedule(struct task_struct * prev,
9044+ struct task_struct ** next,
9045+ runqueue_t * rq)
9046+{
9047+ int need_deactivate = 1;
9048+ int rt;
9049+ jiffie_t deadline;
9050+ unsigned long flags;
9051+
9052+
9053+ if (is_realtime(prev) && get_rt_flags(prev) == RT_F_SLEEP)
9054+ {
9055+ DUMP("preparing %d for next period\n", prev->pid);
9056+ edf_prepare_for_next_period(prev);
9057+ }
9058+
9059+ if (get_rt_mode() == MODE_RT_RUN) {
9060+ write_lock_irqsave(&gedf.ready_lock, flags);
9061+
9062+ clear_will_schedule();
9063+
9064+ if (is_realtime(prev) && is_released(prev) && is_running(prev)
9065+ && !edf_preemption_needed(&gedf, prev)) {
9066+ /* Our current task's next job has already been
9067+ * released and has higher priority than the highest
9068+ * priority waiting task; in other words: it is tardy.
9069+ * We just keep it.
9070+ */
9071+ DUMP("prev will be next, already released\n");
9072+ *next = prev;
9073+ rt = 1;
9074+ deadline = prev->rt_param.times.deadline;
9075+ need_deactivate = 0;
9076+ } else {
9077+ /* either not yet released, preempted, or non-rt */
9078+ *next = __take_ready(&gedf);
9079+ if (*next) {
9080+ /* mark the task as executing on this cpu */
9081+ set_task_cpu(*next, smp_processor_id());
9082+
9083+ /* stick the task into the runqueue */
9084+ __activate_task(*next, rq);
9085+ rt = 1;
9086+ deadline = (*next)->rt_param.times.deadline;
9087+ }
9088+ else
9089+ rt = deadline = 0;
9090+ }
9091+
9092+ adjust_cpu_queue(rt, deadline);
9093+
9094+ if (rt) {
9095+ set_rt_flags(*next, RT_F_RUNNING);
9096+ gedf.check_resched(&gedf);
9097+ }
9098+ write_unlock_irqrestore(&gedf.ready_lock, flags);
9099+ }
9100+
9101+ if (is_realtime(prev) && need_deactivate && prev->array) {
9102+ /* take it out of the run queue */
9103+ deactivate_task(prev, rq);
9104+ }
9105+
9106+ /* don't put prev back into the release queue yet.
9107+ * We first need to actually switch
9108+ * stacks before we can execute it
9109+ * on a different CPU */
9110+
9111+ /* in the current implementation nobody cares about the return value */
9112+ return 0;
9113+}
9114+
9115+
9116+/* _finish_switch - we just finished the switch away from prev
9117+ * it is now safe to requeue the task
9118+ */
9119+static void gedf_finish_switch(struct task_struct *prev)
9120+{
9121+ if (!is_realtime(prev) || !is_running(prev))
9122+ return;
9123+
9124+ /*printk(KERN_INFO "gedf finish switch for %d\n", prev->pid);*/
9125+ if (get_rt_flags(prev) == RT_F_SLEEP ||
9126+ get_rt_mode() != MODE_RT_RUN) {
9127+ /* This task has expired.
9128+ * _schedule has already taken care of updating the
9129+ * release time and the deadline. We just have to check
9130+ * whether it has been released.
9131+ */
9132+ if (time_before_eq(prev->rt_param.times.release, jiffies)
9133+ && get_rt_mode() == MODE_RT_RUN) {
9134+ /* already released */
9135+ add_ready(&gedf, prev);
9136+ DUMP("%d goes straight to ready queue\n", prev->pid);
9137+ }
9138+ else
9139+ /* it has got to wait */
9140+ add_release(&gedf, prev);
9141+ }
9142+ else {
9143+ /* this is a forced preemption
9144+ * thus the task stays in the ready_queue
9145+ * we only have to make it available to others
9146+ */
9147+ add_ready(&gedf, prev);
9148+ }
9149+}
9150+
9151+
9152+/* Prepare a task for running in RT mode.
9153+ * Enqueues the task into the master queue data structure.
9154+ * Returns
9155+ * -EPERM if the task is not TASK_STOPPED.
9156+ */
9157+static long gedf_prepare_task(struct task_struct * t)
9158+{
9159+ TRACE("global edf: prepare task %d\n", t->pid);
9160+
9161+ if (t->state == TASK_STOPPED) {
9162+ __setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1);
9163+
9164+ if (get_rt_mode() == MODE_RT_RUN)
9165+ /* Real-time mode is already active;
9166+ * prepare an immediate release.
9167+ */
9168+ edf_release_now(t);
9169+ /* The task must be marked TASK_RUNNING while it sits in the queue;
9170+ * otherwise the signal code will try to wake it up, with fatal consequences.
9171+ */
9172+ t->state = TASK_RUNNING;
9173+ add_release(&gedf, t);
9174+ return 0;
9175+ }
9176+ else
9177+ return -EPERM;
9178+}
9179+
9180+static void gedf_wake_up_task(struct task_struct *task)
9181+{
9182+ /* We must determine whether the task should go into the release
9183+ * queue or into the ready queue. It may enter the ready queue
9184+ * if it has credit left in its time slice and has not yet reached
9185+ * its deadline. If it is now past its deadline we assume this is the
9186+ * arrival of a new sporadic job and thus put it in the ready queue
9187+ * anyway. If it has zero budget and the next release is in the future
9188+ * it has to go to the release queue.
9189+ */
9190+ TRACE("global edf: wake up %d with budget=%d\n",
9191+ task->pid, task->time_slice);
9192+ task->state = TASK_RUNNING;
9193+ if (is_tardy(task)) {
9194+ /* new sporadic release */
9195+ edf_release_now(task);
9196+ sched_trace_job_release(task);
9197+ add_ready(&gedf, task);
9198+ }
9199+ else if (task->time_slice) {
9200+ /* came back in time before deadline
9201+ */
9202+ set_rt_flags(task, RT_F_RUNNING);
9203+ add_ready(&gedf, task);
9204+ }
9205+ else {
9206+ add_release(&gedf, task);
9207+ }
9208+
9209+}
9210+
9211+static void gedf_task_blocks(struct task_struct *t)
9212+{
9213+ BUG_ON(!is_realtime(t));
9214+ /* not really anything to do since it can only block if
9215+ * it is running, and when it is not running it is not in any
9216+ * queue anyway.
9217+ *
9218+ */
9219+ TRACE("task %d blocks with budget=%d\n", t->pid, t->time_slice);
9220+ BUG_ON(t->rt_list.next != LIST_POISON1);
9221+ BUG_ON(t->rt_list.prev != LIST_POISON2);
9222+}
9223+
9224+
9225+/* When _tear_down is called, the task should not be in any queue any more
9226+ * as it must have blocked first. We don't have any internal state for the task,
9227+ * it is all in the task_struct.
9228+ */
9229+static long gedf_tear_down(struct task_struct * t)
9230+{
9231+ BUG_ON(!is_realtime(t));
9232+ TRACE("global edf: tear down called for %d \n", t->pid);
9233+ BUG_ON(t->array);
9234+ BUG_ON(t->rt_list.next != LIST_POISON1);
9235+ BUG_ON(t->rt_list.prev != LIST_POISON2);
9236+ return 0;
9237+}
9238+
9239+
9240+static int gedf_mode_change(int new_mode)
9241+{
9242+ int cpu;
9243+ cpu_entry_t *entry;
9244+
9245+/* printk(KERN_INFO "[%d] global edf: mode changed to %d\n", smp_processor_id(),
9246+ new_mode);*/
9247+ if (new_mode == MODE_RT_RUN) {
9248+ rerelease_all(&gedf, edf_release_at);
9249+
9250+ /* initialize per CPU state
9251+ * we can't do this at boot time because we don't know
9252+ * which CPUs will be online and we can't put non-existing
9253+ * cpus into the queue
9254+ */
9255+ spin_lock(&gedf_cpu_lock);
9256+ /* get old cruft out of the way in case we reenter real-time
9257+ * mode for a second time
9258+ */
9259+ while (!list_empty(&gedf_cpu_queue))
9260+ list_del(gedf_cpu_queue.next);
9261+ /* reinitialize */
9262+ for_each_online_cpu(cpu) {
9263+ entry = &per_cpu(gedf_cpu_entries, cpu);
9264+ atomic_set(&entry->will_schedule, 0);
9265+ entry->executes_realtime = 0;
9266+ entry->cur_deadline = 0;
9267+ entry->cpu = cpu;
9268+ list_add(&entry->list, &gedf_cpu_queue);
9269+ }
9270+ spin_unlock(&gedf_cpu_lock);
9271+ }
9272+ /*printk(KERN_INFO "[%d] global edf: mode change done\n", smp_processor_id()); */
9273+ return 0;
9274+}
9275+
9276+
9277+/* Plugin object */
9278+static sched_plugin_t s_plugin __cacheline_aligned_in_smp = {
9279+ .ready_to_use = 0
9280+};
9281+
9282+
9283+/*
9284+ * Plugin initialization code.
9285+ */
9286+#define INIT_SCHED_PLUGIN (struct sched_plugin){\
9287+ .plugin_name = "Global EDF",\
9288+ .ready_to_use = 1,\
9289+ .scheduler_tick = gedf_scheduler_tick,\
9290+ .prepare_task = gedf_prepare_task,\
9291+ .sleep_next_period = edf_sleep_next_period,\
9292+ .tear_down = gedf_tear_down,\
9293+ .schedule = gedf_schedule,\
9294+ .finish_switch = gedf_finish_switch,\
9295+ .mode_change = gedf_mode_change,\
9296+ .wake_up_task = gedf_wake_up_task,\
9297+ .task_blocks = gedf_task_blocks \
9298+ }
9299+
9300+
9301+sched_plugin_t *__init init_global_edf_plugin(void)
9302+{
9303+ if (!s_plugin.ready_to_use)
9304+ {
9305+ edf_domain_init(&gedf, gedf_check_resched);
9306+ s_plugin = INIT_SCHED_PLUGIN;
9307+ }
9308+ return &s_plugin;
9309+}
9310+
9311+
9312+
9313+/*****************************************************************************/
9314+/*****************************************************************************/
9315+/*****************************************************************************/
9316+/* NON-PREEMPTIVE GLOBAL EDF */
9317+
9318+
9319+/* gedf_np_scheduler_tick - this function is called for every local timer
9320+ * interrupt.
9321+ *
9322+ * checks whether the current task has expired and checks
9323+ * whether we need to preempt it if it has not expired
9324+ */
9325+static reschedule_check_t gedf_np_scheduler_tick(void)
9326+{
9327+ if (get_rt_mode() == MODE_RT_RUN)
9328+ {
9329+ /* check whether anything is waiting to be released
9330+ * this could probably be moved to the global timer
9331+ * interrupt handler since the state will only change
9332+ * once per jiffie
9333+ */
9334+ try_release_pending(&gedf);
9335+ }
9336+
9337+ /* expire tasks even if not in real-time mode
9338+ * this makes sure that at the end of real-time mode
9339+ * no tasks "run away forever".
9340+ */
9341+ BUG_ON(current->time_slice > 1000);
9342+ if (is_realtime(current) && (!--current->time_slice)) {
9343+ /* this task has exhausted its budget in this period */
9344+ set_rt_flags(current, RT_F_SLEEP);
9345+ return FORCE_RESCHED;
9346+ }
9347+ else
9348+ return NO_RESCHED;
9349+}
9350+
9351+/* gedf_np_check_resched - Check whether another CPU needs to reschedule.
9352+ *
9353+ * The function only checks and kicks the last CPU in the queue; that CPU will,
9354+ * after rescheduling, check and kick the next one if necessary, and so on. The
9355+ * caller must ensure that it is not itself the last entry or that no reschedule is necessary.
9356+ *
9357+ */
9358+static int gedf_np_check_resched(rt_domain_t *edf)
9359+{
9360+ cpu_entry_t *last;
9361+ int ret = 0;
9362+
9363+ spin_lock(&gedf_cpu_lock);
9364+
9365+ if (!list_empty(&edf->ready_queue)) {
9366+ last = list_entry(gedf_cpu_queue.prev, cpu_entry_t, list);
9367+ /* preemption happens only for non-realtime tasks */
9368+ if (!last->executes_realtime)
9369+ {
9370+ if (smp_processor_id() == last->cpu)
9371+ set_tsk_need_resched(current);
9372+ else
9373+ smp_send_reschedule(last->cpu);
9374+ ret = 1;
9375+ goto out;
9376+ }
9377+ }
9378+
9379+ out:
9380+ spin_unlock(&gedf_cpu_lock);
9381+ return ret;
9382+}
9383+
9384+
9385+/* non-preemptive global EDF
9386+ *
9387+ * Non-preemptive EDF is almost the same as normal EDF. We only have to
9388+ * adjust the scheduler tick and the resched function.
9389+ */
9390+#define INIT_SCHED_PLUGIN_NP (struct sched_plugin){\
9391+ .plugin_name = "Non-Preemptive Global EDF",\
9392+ .ready_to_use = 1,\
9393+ .scheduler_tick = gedf_np_scheduler_tick,\
9394+ .prepare_task = gedf_prepare_task,\
9395+ .sleep_next_period = edf_sleep_next_period,\
9396+ .tear_down = gedf_tear_down,\
9397+ .schedule = gedf_schedule,\
9398+ .finish_switch = gedf_finish_switch,\
9399+ .mode_change = gedf_mode_change,\
9400+ .wake_up_task = gedf_wake_up_task,\
9401+ .task_blocks = gedf_task_blocks \
9402+ }
9403+
9404+
9405+/* as we only set the plugin at boot time,
9406+ * we use the same structure as preemptive EDF. This simplifies a lot
9407+ * of the functions.
9408+ */
9409+sched_plugin_t* __init init_global_edf_np_plugin(void)
9410+{
9411+ if (!s_plugin.ready_to_use)
9412+ {
9413+ edf_domain_init(&gedf, gedf_np_check_resched);
9414+ s_plugin = INIT_SCHED_PLUGIN_NP;
9415+ }
9416+ return &s_plugin;
9417+}
9418diff --git a/kernel/sched_gsn_edf.c b/kernel/sched_gsn_edf.c
9419new file mode 100644
9420index 0000000..f6ba521
9421--- /dev/null
9422+++ b/kernel/sched_gsn_edf.c
9423@@ -0,0 +1,816 @@
9424+/*
9425+ * kernel/sched_gsn_edf.c
9426+ *
9427+ * Implementation of the GSN-EDF scheduling algorithm.
9428+ *
9429+ * This version uses the simple approach and serializes all scheduling
9430+ * decisions by the use of a queue lock. This is probably not the
9431+ * best way to do it, but it should suffice for now. It should not
9432+ * affect the benchmarks since all synchronization primitives will
9433+ * take the same performance hit, if any.
9434+ */
9435+
9436+#include <linux/percpu.h>
9437+#include <linux/sched.h>
9438+#include <linux/list.h>
9439+
9440+#include <linux/queuelock.h>
9441+#include <linux/litmus.h>
9442+#include <linux/sched_plugin.h>
9443+#include <linux/edf_common.h>
9444+#include <linux/sched_trace.h>
9445+
9446+/* Overview of GSN-EDF operations.
9447+ *
9448+ * For a detailed explanation of GSN-EDF have a look at the FMLP paper. This
9449+ * description only covers how the individual operations are implemented in
9450+ * LITMUS.
9451+ *
9452+ * link_task_to_cpu(T, cpu) - Low-level operation to update the linkage
9453+ * structure (NOT the actually scheduled
9454+ * task). If there is another linked task To
9455+ * already it will set To->linked_on = NO_CPU
9456+ * (thereby removing its association with this
9457+ * CPU). However, it will not requeue the
9458+ * previously linked task (if any). It will set
9459+ * T's state to RT_F_RUNNING and check whether
9460+ * it is already running somewhere else. If T
9461+ * is scheduled somewhere else it will link
9462+ * it to that CPU instead (and pull the linked
9463+ * task to cpu). T may be NULL.
9464+ *
9465+ * unlink(T) - Unlink removes T from all scheduler data
9466+ * structures. If it is linked to some CPU it
9467+ * will link NULL to that CPU. If it is
9468+ * currently queued in the gsnedf queue it will
9469+ * be removed from the T->rt_list. It is safe to
9470+ * call unlink(T) if T is not linked. T may not
9471+ * be NULL.
9472+ *
9473+ * requeue(T) - Requeue will insert T into the appropriate
9474+ * queue. If the system is in real-time mode and
9475+ * T is released already, it will go into the
9476+ * ready queue. If the system is not in
9477+ * real-time mode, then T will go into the
9478+ * release queue. If T's release time is in the
9479+ * future, it will go into the release
9480+ * queue. That means that T's release time/job
9481+ * no/etc. has to be updated before requeue(T) is
9482+ * called. It is not safe to call requeue(T)
9483+ * when T is already queued. T may not be NULL.
9484+ *
9485+ * gsnedf_job_arrival(T) - This is the catch all function when T enters
9486+ * the system after either a suspension or at a
9487+ * job release. It will queue T (which means it
9488+ * is not safe to call gsnedf_job_arrival(T) if
9489+ * T is already queued) and then check whether a
9490+ * preemption is necessary. If a preemption is
9491+ * necessary it will update the linkage
9492+ * accordingly and cause scheduled to be called
9493+ * (either with an IPI or need_resched). It is
9494+ * safe to call gsnedf_job_arrival(T) if T's
9495+ * next job has not been actually released yet
9496+ *                                (release time in the future). T will be put
9497+ * on the release queue in that case.
9498+ *
9499+ * job_completion(T) - Take care of everything that needs to be done
9500+ * to prepare T for its next release and place
9501+ * it in the right queue with
9502+ * gsnedf_job_arrival().
9503+ *
9504+ *
9505+ * When we know that T is linked to a CPU, then link_task_to_cpu(NULL, CPU) is
9506+ * equivalent to unlink(T). Note that if you unlink a task from a CPU, none of
9507+ * these functions will automatically pull a pending task from the ready queue
9508+ * to fill the now-empty link. That is the job of the calling function (by
9509+ * means of __take_ready).
9510+ */
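+/* Illustrative example (two CPUs): suppose T1 is linked to and scheduled on
+ * CPU0, T1 currently being the lowest-priority linked task, when a
+ * higher-priority job T2 arrives. gsnedf_job_arrival(T2) requeues T1, links
+ * T2 to CPU0 and calls preempt(). Until gsnedf_schedule() and
+ * gsnedf_finish_switch() have run on CPU0, linked (T2) and scheduled (T1)
+ * differ -- exactly the condition that gsnedf_schedule() checks for.
+ */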
9511+
9512+
9513+/* cpu_entry_t - maintain the linked and scheduled state
9514+ */
9515+typedef struct {
9516+ int cpu;
9517+ struct task_struct* linked; /* only RT tasks */
9518+ struct task_struct* scheduled; /* only RT tasks */
9519+ struct list_head list;
9520+ atomic_t will_schedule; /* prevent unneeded IPIs */
9521+} cpu_entry_t;
9522+DEFINE_PER_CPU(cpu_entry_t, gsnedf_cpu_entries);
9523+
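+/* The will_schedule flag is set by the local CPU once it has decided to
+ * reschedule (see gsnedf_scheduler_tick()) and cleared at the start of
+ * gsnedf_schedule(). preempt() tests it before sending an IPI so that a CPU
+ * that is about to reschedule anyway is not interrupted again.
+ */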
9524+#define set_will_schedule() \
9525+ (atomic_set(&__get_cpu_var(gsnedf_cpu_entries).will_schedule, 1))
9526+#define clear_will_schedule() \
9527+ (atomic_set(&__get_cpu_var(gsnedf_cpu_entries).will_schedule, 0))
9528+#define test_will_schedule(cpu) \
9529+ (atomic_read(&per_cpu(gsnedf_cpu_entries, cpu).will_schedule))
9530+
9531+
9532+#define NO_CPU 0xffffffff
9533+
9534+/* The gsnedf_lock is used to serialize all scheduling events.
9535+ * It protects the gsnedf rt_domain, the per-CPU linkage state, and the CPU queue.
9536+ */
9537+static queuelock_t gsnedf_lock;
9538+/* the cpus queue themselves according to priority in here */
9539+static LIST_HEAD(gsnedf_cpu_queue);
9540+
9541+static rt_domain_t gsnedf;
9542+
9543+
9544+/* update_cpu_position - Move the cpu entry to the correct place to maintain
9545+ * order in the cpu queue. Caller must hold gsnedf lock.
9546+ */
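+/* The queue is kept sorted by the priority of each CPU's linked task, so the
+ * tail of gsnedf_cpu_queue always holds the CPU with the lowest-priority
+ * linked task (or an idle CPU). gsnedf_job_arrival() only needs to inspect
+ * that tail entry to decide whether a newly arrived job preempts anything.
+ */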
9547+static void update_cpu_position(cpu_entry_t *entry)
9548+{
9549+ cpu_entry_t *other;
9550+ struct list_head *pos;
9551+ list_del(&entry->list);
9552+	/* if this CPU has no linked real-time job, it simply moves
9553+	 * to the end of the queue
9554+ */
9555+ if (entry->linked) {
9556+ list_for_each(pos, &gsnedf_cpu_queue) {
9557+ other = list_entry(pos, cpu_entry_t, list);
9558+ if (edf_higher_prio(entry->linked, other->linked)) {
9559+ __list_add(&entry->list, pos->prev, pos);
9560+ return;
9561+ }
9562+ }
9563+ }
9564+ /* if we get this far we have the lowest priority job */
9565+ list_add_tail(&entry->list, &gsnedf_cpu_queue);
9566+}
9567+
9568+/* link_task_to_cpu - Update the link of a CPU.
9569+ * Handles the case where the to-be-linked task is already
9570+ * scheduled on a different CPU.
9571+ */
9572+static noinline void link_task_to_cpu(struct task_struct* linked,
9573+ cpu_entry_t *entry)
9574+
9575+{
9576+ cpu_entry_t *sched;
9577+ struct task_struct* tmp;
9578+ int on_cpu;
9579+
9580+ BUG_ON(linked && !is_realtime(linked));
9581+
9582+ /* Currently linked task is set to be unlinked. */
9583+ if (entry->linked) {
9584+ entry->linked->rt_param.linked_on = NO_CPU;
9585+ }
9586+
9587+ /* Link new task to CPU. */
9588+ if (linked) {
9589+ set_rt_flags(linked, RT_F_RUNNING);
9590+		/* handle the case that the task is already scheduled somewhere else! */
9591+ on_cpu = linked->rt_param.scheduled_on;
9592+ if (on_cpu != NO_CPU) {
9593+ sched = &per_cpu(gsnedf_cpu_entries, on_cpu);
9594+ /* this should only happen if not linked already */
9595+ BUG_ON(sched->linked == linked);
9596+
9597+ /* If we are already scheduled on the CPU to which we
9598+ * wanted to link, we don't need to do the swap --
9599+ * we just link ourselves to the CPU and depend on
9600+ * the caller to get things right.
9601+ */
9602+ if (entry != sched) {
9603+ tmp = sched->linked;
9604+ linked->rt_param.linked_on = sched->cpu;
9605+ sched->linked = linked;
9606+ update_cpu_position(sched);
9607+ linked = tmp;
9608+ }
9609+ }
9610+ if (linked) /* might be NULL due to swap */
9611+ linked->rt_param.linked_on = entry->cpu;
9612+ }
9613+ entry->linked = linked;
9614+ update_cpu_position(entry);
9615+}
9616+
9617+/* unlink - Make sure a task is not linked any longer to an entry
9618+ * where it was linked before. Must hold gsnedf_lock.
9619+ */
9620+static noinline void unlink(struct task_struct* t)
9621+{
9622+ cpu_entry_t *entry;
9623+
9624+ if (unlikely(!t)) {
9625+ TRACE_BUG_ON(!t);
9626+ return;
9627+ }
9628+
9629+ if (t->rt_param.linked_on != NO_CPU) {
9630+ /* unlink */
9631+ entry = &per_cpu(gsnedf_cpu_entries, t->rt_param.linked_on);
9632+ t->rt_param.linked_on = NO_CPU;
9633+ link_task_to_cpu(NULL, entry);
9634+ } else if (in_list(&t->rt_list)) {
9635+ /* This is an interesting situation: t is scheduled,
9636+ * but was just recently unlinked. It cannot be
9637+ * linked anywhere else (because then it would have
9638+ * been relinked to this CPU), thus it must be in some
9639+ * queue. We must remove it from the list in this
9640+ * case.
9641+ */
9642+ list_del(&t->rt_list);
9643+ }
9644+}
9645+
9646+
9647+/* preempt - force a CPU to reschedule
9648+ */
9649+static noinline void preempt(cpu_entry_t *entry)
9650+{
9651+ /* We cannot make the is_np() decision here if it is a remote CPU
9652+ * because requesting exit_np() requires that we currently use the
9653+ * address space of the task. Thus, in the remote case we just send
9654+ * the IPI and let schedule() handle the problem.
9655+ */
9656+
9657+ if (smp_processor_id() == entry->cpu) {
9658+ if (entry->scheduled && is_np(entry->scheduled))
9659+ request_exit_np(entry->scheduled);
9660+ else
9661+ set_tsk_need_resched(current);
9662+ } else
9663+		/* in case that it is a remote CPU we have to defer
9664+		 * the decision to the remote CPU
9665+ * FIXME: We could save a few IPI's here if we leave the flag
9666+ * set when we are waiting for a np_exit().
9667+ */
9668+ if (!test_will_schedule(entry->cpu))
9669+ smp_send_reschedule(entry->cpu);
9670+}
9671+
9672+/* requeue - Put an unlinked task into gsn-edf domain.
9673+ * Caller must hold gsnedf_lock.
9674+ */
9675+static noinline void requeue(struct task_struct* task)
9676+{
9677+ BUG_ON(!task);
9678+ /* sanity check rt_list before insertion */
9679+ BUG_ON(in_list(&task->rt_list));
9680+
9681+ if (get_rt_flags(task) == RT_F_SLEEP ||
9682+ get_rt_mode() != MODE_RT_RUN) {
9683+		/* this task has expired
9684+		 * _schedule has already taken care of updating
9685+		 * the release time and the deadline.
9686+		 * We only need to check whether it has been released.
9687+ */
9688+ if (is_released(task) && get_rt_mode() == MODE_RT_RUN)
9689+ __add_ready(&gsnedf, task);
9690+ else {
9691+ /* it has got to wait */
9692+ __add_release(&gsnedf, task);
9693+ }
9694+
9695+ } else
9696+ /* this is a forced preemption
9697+ * thus the task stays in the ready_queue
9698+		 * we only need to make it available to others
9699+ */
9700+ __add_ready(&gsnedf, task);
9701+}
9702+
9703+/* gsnedf_job_arrival: task is either resumed or released */
9704+static noinline void gsnedf_job_arrival(struct task_struct* task)
9705+{
9706+ cpu_entry_t* last;
9707+
9708+ BUG_ON(list_empty(&gsnedf_cpu_queue));
9709+ BUG_ON(!task);
9710+
9711+ /* first queue arriving job */
9712+ requeue(task);
9713+
9714+ /* then check for any necessary preemptions */
9715+ last = list_entry(gsnedf_cpu_queue.prev, cpu_entry_t, list);
9716+ if (edf_preemption_needed(&gsnedf, last->linked)) {
9717+ /* preemption necessary */
9718+ task = __take_ready(&gsnedf);
9719+ TRACE("job_arrival: task %d linked to %d\n",
9720+ task->pid, last->cpu);
9721+ if (last->linked)
9722+ requeue(last->linked);
9723+
9724+ link_task_to_cpu(task, last);
9725+ preempt(last);
9726+ }
9727+}
9728+
9729+/* check for current job releases */
9730+static noinline void gsnedf_release_jobs(void)
9731+{
9732+ struct list_head *pos, *save;
9733+ struct task_struct *queued;
9734+
9735+ list_for_each_safe(pos, save, &gsnedf.release_queue) {
9736+ queued = list_entry(pos, struct task_struct, rt_list);
9737+ if (likely(is_released(queued))) {
9738+ /* this one is ready to go*/
9739+ list_del(pos);
9740+ set_rt_flags(queued, RT_F_RUNNING);
9741+
9742+ sched_trace_job_release(queued);
9743+ gsnedf_job_arrival(queued);
9744+ }
9745+ else
9746+ /* the release queue is ordered */
9747+ break;
9748+ }
9749+}
9750+
9751+/* gsnedf_scheduler_tick - this function is called for every local timer
9752+ * interrupt.
9753+ *
9754+ * checks whether the current task has expired and checks
9755+ * It charges the current real-time task for the elapsed quantum and forces a
9756+ * reschedule once its budget is exhausted; CPU 0 additionally releases pending
9757+ * jobs (which may in turn preempt lower-priority tasks).
9757+static reschedule_check_t gsnedf_scheduler_tick(void)
9758+{
9759+ unsigned long flags;
9760+ struct task_struct* t = current;
9761+ reschedule_check_t want_resched = NO_RESCHED;
9762+
9763+ /* expire tasks even if not in real-time mode
9764+ * this makes sure that at the end of real-time mode
9765+ * no task "runs away forever".
9766+ */
9767+ if (is_realtime(t)) {
9768+ TRACE_CUR("before dec: time_slice == %u\n", t->time_slice);
9769+ }
9770+
9771+ if (is_realtime(t) && t->time_slice && !--t->time_slice) {
9772+ if (!is_np(t)) { /* np tasks will be preempted when they become
9773+ preemptable again */
9774+ want_resched = FORCE_RESCHED;
9775+ set_will_schedule();
9776+ TRACE("gsnedf_scheduler_tick: "
9777+ "%d is preemptable "
9778+ " => FORCE_RESCHED\n", t->pid);
9779+ } else {
9780+ TRACE("gsnedf_scheduler_tick: "
9781+ "%d is non-preemptable, "
9782+ "preemption delayed.\n", t->pid);
9783+ request_exit_np(t);
9784+ }
9785+ }
9786+
9787+ /* only the first CPU needs to release jobs */
9788+ if (get_rt_mode() == MODE_RT_RUN && smp_processor_id() == 0) {
9789+ queue_lock_irqsave(&gsnedf_lock, flags);
9790+
9791+ /* (1) try to release pending jobs */
9792+ gsnedf_release_jobs();
9793+
9794+ /* we don't need to check linked != scheduled since
9795+		 * set_tsk_need_resched() has already been called by preempt() if necessary
9796+ */
9797+
9798+ queue_unlock_irqrestore(&gsnedf_lock, flags);
9799+ }
9800+
9801+ return want_resched;
9802+}
9803+
9804+/* caller holds gsnedf_lock */
9805+static noinline void job_completion(struct task_struct *t)
9806+{
9807+ BUG_ON(!t);
9808+
9809+ sched_trace_job_completion(t);
9810+
9811+ TRACE_TASK(t, "job_completion().\n");
9812+
9813+ /* set flags */
9814+ set_rt_flags(t, RT_F_SLEEP);
9815+ /* prepare for next period */
9816+ edf_prepare_for_next_period(t);
9817+ /* unlink */
9818+ unlink(t);
9819+ /* requeue
9820+ * But don't requeue a blocking task. */
9821+ if (is_running(t))
9822+ gsnedf_job_arrival(t);
9823+}
9824+
9825+
9826+/* Getting schedule() right is a bit tricky. schedule() may not make any
9827+ * assumptions on the state of the current task since it may be called for a
9828+ * number of reasons. The reasons include a scheduler_tick() determined that it
9829+ * number of reasons: because scheduler_tick() determined that it
9830+ * was necessary, because sys_exit_np() was called, because some Linux
9831+ * subsystem determined so, or even (in the worst case) because there is a bug
9832+ * current state is.
9833+ *
9834+ * The CPU could currently be scheduling a task (or not), be linked (or not).
9835+ *
9836+ * The following assertions for the scheduled task could hold:
9837+ *
9838+ * - !is_running(scheduled) // the job blocks
9839+ * - scheduled->time_slice == 0 // the job completed (forcefully)
9840+ * - get_rt_flag() == RT_F_SLEEP // the job completed (by syscall)
9841+ * - linked != scheduled // we need to reschedule (for any reason)
9842+ * - is_np(scheduled) // rescheduling must be delayed,
9843+ * sys_exit_np must be requested
9844+ *
9845+ * Any of these can occur together.
9846+ */
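+/* Locking protocol: gsnedf_schedule() acquires gsnedf_lock. If no real-time
+ * task is affected, or if nothing changes and finish_switch() will therefore
+ * not be called, the lock is dropped again at the end of gsnedf_schedule();
+ * otherwise it is held across the context switch and released in
+ * gsnedf_finish_switch().
+ */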
9847+static int gsnedf_schedule(struct task_struct * prev,
9848+ struct task_struct ** next,
9849+ runqueue_t * rq)
9850+{
9851+ cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries);
9852+ int out_of_time, sleep, preempt, np, exists,
9853+ rt, blocks;
9854+ struct task_struct* linked;
9855+
9856+ /* Will be released in finish_switch. */
9857+ queue_lock(&gsnedf_lock);
9858+ clear_will_schedule();
9859+
9860+ /* sanity checking */
9861+ BUG_ON(entry->scheduled && entry->scheduled != prev);
9862+ BUG_ON(entry->scheduled && !is_realtime(prev));
9863+ BUG_ON(is_realtime(prev) && !entry->scheduled);
9864+
9865+ /* (0) Determine state */
9866+ exists = entry->scheduled != NULL;
9867+ blocks = exists && !is_running(entry->scheduled);
9868+ out_of_time = exists && !entry->scheduled->time_slice;
9869+ np = exists && is_np(entry->scheduled);
9870+ sleep = exists && get_rt_flags(entry->scheduled) == RT_F_SLEEP;
9871+ preempt = entry->scheduled != entry->linked;
9872+ rt = get_rt_mode() == MODE_RT_RUN;
9873+
9874+ /* If a task blocks we have no choice but to reschedule.
9875+ */
9876+ if (blocks)
9877+ unlink(entry->scheduled);
9878+
9879+ /* Request a sys_exit_np() call if we would like to preempt but cannot.
9880+ * We need to make sure to update the link structure anyway in case
9881+ * that we are still linked. Multiple calls to request_exit_np() don't
9882+ * hurt.
9883+ */
9884+ if (np && (out_of_time || preempt || sleep)) {
9885+ unlink(entry->scheduled);
9886+ request_exit_np(entry->scheduled);
9887+ }
9888+
9889+ /* Any task that is preemptable and either exhausts its execution
9890+ * budget or wants to sleep completes. We may have to reschedule after
9891+ * this.
9892+ */
9893+ if (!np && (out_of_time || sleep))
9894+ job_completion(entry->scheduled);
9895+
9896+ /* Stop real-time tasks when we leave real-time mode
9897+ */
9898+ if (!rt && entry->linked) {
9899+ /* task will be preempted once it is preemptable
9900+ * (which it may be already)
9901+ */
9902+ linked = entry->linked;
9903+ unlink(linked);
9904+ requeue(linked);
9905+ }
9906+
9907+ /* Link pending task if we became unlinked.
9908+ */
9909+ if (rt && !entry->linked)
9910+ link_task_to_cpu(__take_ready(&gsnedf), entry);
9911+
9912+ /* The final scheduling decision. Do we need to switch for some reason?
9913+	 * If linked is different from scheduled, select linked as next.
9914+ */
9915+ if ((!np || blocks) &&
9916+ entry->linked != entry->scheduled) {
9917+ /* Take care of a previously scheduled
9918+ * job by taking it out of the Linux runqueue.
9919+ */
9920+ if (entry->scheduled) {
9921+ if (prev->array)
9922+ /* take it out of the run queue */
9923+ deactivate_task(prev, rq);
9924+ }
9925+
9926+ /* Schedule a linked job? */
9927+ if (entry->linked) {
9928+ *next = entry->linked;
9929+ /* mark the task as executing on this cpu */
9930+ set_task_cpu(*next, smp_processor_id());
9931+ /* stick the task into the runqueue */
9932+ __activate_task(*next, rq);
9933+ }
9934+ } else
9935+		/* Only override the Linux scheduler if a real-time task
9936+		 * is scheduled that needs to continue.
9937+ */
9938+ if (exists)
9939+ *next = prev;
9940+
9941+ /* Unlock in case that we don't affect real-time tasks or
9942+ * if nothing changed and finish_switch won't be called.
9943+ */
9944+ if (prev == *next || (!is_realtime(prev) && !*next))
9945+ queue_unlock(&gsnedf_lock);
9946+
9947+ return 0;
9948+}
9949+
9950+
9951+/* _finish_switch - we just finished the switch away from prev
9952+ */
9953+static void gsnedf_finish_switch(struct task_struct *prev)
9954+{
9955+ cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries);
9956+
9957+ if (is_realtime(current))
9958+ entry->scheduled = current;
9959+ else
9960+ entry->scheduled = NULL;
9961+
9962+ prev->rt_param.scheduled_on = NO_CPU;
9963+ current->rt_param.scheduled_on = smp_processor_id();
9964+
9965+ /* unlock in case schedule() left it locked */
9966+ if (is_realtime(current) || is_realtime(prev))
9967+ queue_unlock(&gsnedf_lock);
9968+}
9969+
9970+
9971+/* Prepare a task for running in RT mode
9972+ * Enqueues the task into master queue data structure
9973+ * returns
9974+ * -EPERM if task is not TASK_STOPPED
9975+ */
9976+static long gsnedf_prepare_task(struct task_struct * t)
9977+{
9978+ unsigned long flags;
9979+ TRACE("gsn edf: prepare task %d\n", t->pid);
9980+
9981+ if (t->state == TASK_STOPPED) {
9982+ __setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1);
9983+
9984+ t->rt_param.scheduled_on = NO_CPU;
9985+ t->rt_param.linked_on = NO_CPU;
9986+ if (get_rt_mode() == MODE_RT_RUN)
9987+			/* Real-time mode is already active.
9988+			 * Prepare an immediate release.
9989+ */
9990+ edf_release_now(t);
9991+		/* The task must be TASK_RUNNING while it is in the queue; otherwise
9992+		 * the signal code will try to wake it up, with fatal consequences.
9993+ */
9994+ t->state = TASK_RUNNING;
9995+
9996+ queue_lock_irqsave(&gsnedf_lock, flags);
9997+ requeue(t);
9998+ queue_unlock_irqrestore(&gsnedf_lock, flags);
9999+ return 0;
10000+ }
10001+ else
10002+ return -EPERM;
10003+}
10004+
10005+static void gsnedf_wake_up_task(struct task_struct *task)
10006+{
10007+ unsigned long flags;
10008+ /* We must determine whether task should go into the release
10009+ * queue or into the ready queue. It may enter the ready queue
10010+ * if it has credit left in its time slice and has not yet reached
10011+	 * its deadline. If it is now past its deadline, we assume this is the
10012+	 * arrival of a new sporadic job and thus put it in the ready queue
10013+	 * anyway. If it has zero budget and the next release is in the future,
10014+	 * it has to go to the release queue.
10015+ */
10016+ TRACE("gsnedf: %d unsuspends with budget=%d\n",
10017+ task->pid, task->time_slice);
10018+ task->state = TASK_RUNNING;
10019+
10020+ /* We need to take suspensions because of semaphores into
10021+ * account! If a job resumes after being suspended due to acquiring
10022+ * a semaphore, it should never be treated as a new job release.
10023+ */
10024+ if (get_rt_flags(task) == RT_F_EXIT_SEM) {
10025+ set_rt_flags(task, RT_F_RUNNING);
10026+ } else {
10027+ if (is_tardy(task)) {
10028+ /* new sporadic release */
10029+ edf_release_now(task);
10030+ sched_trace_job_release(task);
10031+ }
10032+ else if (task->time_slice)
10033+ /* came back in time before deadline
10034+ */
10035+ set_rt_flags(task, RT_F_RUNNING);
10036+ }
10037+
10038+ queue_lock_irqsave(&gsnedf_lock, flags);
10039+ gsnedf_job_arrival(task);
10040+ queue_unlock_irqrestore(&gsnedf_lock, flags);
10041+}
10042+
10043+static void gsnedf_task_blocks(struct task_struct *t)
10044+{
10045+ unsigned long flags;
10046+
10047+ /* unlink if necessary */
10048+ queue_lock_irqsave(&gsnedf_lock, flags);
10049+ unlink(t);
10050+ queue_unlock_irqrestore(&gsnedf_lock, flags);
10051+
10052+ BUG_ON(!is_realtime(t));
10053+ TRACE("task %d suspends with budget=%d\n", t->pid, t->time_slice);
10054+ BUG_ON(t->rt_list.next != LIST_POISON1);
10055+ BUG_ON(t->rt_list.prev != LIST_POISON2);
10056+}
10057+
10058+
10059+/* When _tear_down is called, the task should not be in any queue any more
10060+ * as it must have blocked first. We don't have any internal state for the task,
10061+ * it is all in the task_struct.
10062+ */
10063+static long gsnedf_tear_down(struct task_struct * t)
10064+{
10065+ BUG_ON(!is_realtime(t));
10066+ TRACE_TASK(t, "RIP\n");
10067+ BUG_ON(t->array);
10068+ BUG_ON(t->rt_list.next != LIST_POISON1);
10069+ BUG_ON(t->rt_list.prev != LIST_POISON2);
10070+ return 0;
10071+}
10072+
10073+static long gsnedf_pi_block(struct pi_semaphore *sem,
10074+ struct task_struct *new_waiter)
10075+{
10076+ /* This callback has to handle the situation where a new waiter is
10077+ * added to the wait queue of the semaphore.
10078+ *
10079+	 * We must check if it has a higher priority than the currently
10080+ * highest-priority task, and then potentially reschedule.
10081+ */
10082+
10083+ BUG_ON(!new_waiter);
10084+
10085+ if (edf_higher_prio(new_waiter, sem->hp.task)) {
10086+ TRACE_TASK(new_waiter, " boosts priority\n");
10087+ /* called with IRQs disabled */
10088+ queue_lock(&gsnedf_lock);
10089+ /* store new highest-priority task */
10090+ sem->hp.task = new_waiter;
10091+ if (sem->holder) {
10092+ /* let holder inherit */
10093+ sem->holder->rt_param.inh_task = new_waiter;
10094+ unlink(sem->holder);
10095+ gsnedf_job_arrival(sem->holder);
10096+ }
10097+ queue_unlock(&gsnedf_lock);
10098+ }
10099+
10100+ return 0;
10101+}
10102+
10103+static long gsnedf_inherit_priority(struct pi_semaphore *sem,
10104+ struct task_struct *new_owner)
10105+{
10106+ /* We don't need to acquire the gsnedf_lock since at the time of this
10107+ * call new_owner isn't actually scheduled yet (it's still sleeping)
10108+ * and since the calling function already holds sem->wait.lock, which
10109+ * prevents concurrent sem->hp.task changes.
10110+ */
10111+
10112+ if (sem->hp.task && sem->hp.task != new_owner) {
10113+ new_owner->rt_param.inh_task = sem->hp.task;
10114+ TRACE_TASK(new_owner, "inherited priority from %s/%d\n",
10115+ sem->hp.task->comm, sem->hp.task->pid);
10116+ } else
10117+ TRACE_TASK(new_owner,
10118+ "cannot inherit priority, "
10119+ "no higher priority job waits.\n");
10120+ return 0;
10121+}
10122+
10123+/* This function is called on a semaphore release, and assumes that
10124+ * the current task is also the semaphore holder.
10125+ */
10126+static long gsnedf_return_priority(struct pi_semaphore *sem)
10127+{
10128+ struct task_struct* t = current;
10129+ int ret = 0;
10130+
10131+ /* Find new highest-priority semaphore task
10132+ * if holder task is the current hp.task.
10133+ *
10134+ * Calling function holds sem->wait.lock.
10135+ */
10136+ if (t == sem->hp.task)
10137+ edf_set_hp_task(sem);
10138+
10139+ TRACE_CUR("gsnedf_return_priority for lock %p\n", sem);
10140+
10141+ if (t->rt_param.inh_task) {
10142+ /* interrupts already disabled by PI code */
10143+ queue_lock(&gsnedf_lock);
10144+
10145+ /* Reset inh_task to NULL. */
10146+ t->rt_param.inh_task = NULL;
10147+
10148+ /* Check if rescheduling is necessary */
10149+ unlink(t);
10150+ gsnedf_job_arrival(t);
10151+ queue_unlock(&gsnedf_lock);
10152+ }
10153+
10154+ return ret;
10155+}
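+
+/* Taken together, gsnedf_pi_block(), gsnedf_inherit_priority() and
+ * gsnedf_return_priority() implement priority inheritance for the PI
+ * semaphores: the highest-priority waiter is recorded in sem->hp.task and
+ * boosts the holder via ->rt_param.inh_task; on release the boost is removed
+ * and, if necessary, a reschedule is triggered by re-running the job-arrival
+ * logic for the affected task.
+ */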
10156+
10157+static int gsnedf_mode_change(int new_mode)
10158+{
10159+ unsigned long flags;
10160+ int cpu;
10161+ cpu_entry_t *entry;
10162+
10163+ if (new_mode == MODE_RT_RUN) {
10164+ queue_lock_irqsave(&gsnedf_lock, flags);
10165+
10166+ __rerelease_all(&gsnedf, edf_release_at);
10167+
10168+ /* get old cruft out of the way in case we reenter real-time
10169+ * mode for a second time
10170+ */
10171+ while (!list_empty(&gsnedf_cpu_queue))
10172+ list_del(gsnedf_cpu_queue.next);
10173+ /* reinitialize */
10174+ for_each_online_cpu(cpu) {
10175+ entry = &per_cpu(gsnedf_cpu_entries, cpu);
10176+ atomic_set(&entry->will_schedule, 0);
10177+ entry->linked = NULL;
10178+ entry->scheduled = NULL;
10179+ list_add(&entry->list, &gsnedf_cpu_queue);
10180+ }
10181+
10182+ queue_unlock_irqrestore(&gsnedf_lock, flags);
10183+
10184+ }
10185+ return 0;
10186+}
10187+
10188+
10189+/* Plugin object */
10190+static sched_plugin_t s_plugin __cacheline_aligned_in_smp = {
10191+ .ready_to_use = 0
10192+};
10193+
10194+
10195+/*
10196+ * Plugin initialization code.
10197+ */
10198+#define INIT_SCHED_PLUGIN (struct sched_plugin){ \
10199+ .plugin_name = "GSN-EDF", \
10200+ .ready_to_use = 1, \
10201+ .scheduler_tick = gsnedf_scheduler_tick, \
10202+ .prepare_task = gsnedf_prepare_task, \
10203+ .sleep_next_period = edf_sleep_next_period, \
10204+ .tear_down = gsnedf_tear_down, \
10205+ .schedule = gsnedf_schedule, \
10206+ .finish_switch = gsnedf_finish_switch, \
10207+ .mode_change = gsnedf_mode_change, \
10208+ .wake_up_task = gsnedf_wake_up_task, \
10209+ .task_blocks = gsnedf_task_blocks, \
10210+ .inherit_priority = gsnedf_inherit_priority, \
10211+ .return_priority = gsnedf_return_priority, \
10212+ .pi_block = gsnedf_pi_block \
10213+}
10214+
10215+
10216+sched_plugin_t *__init init_gsn_edf_plugin(void)
10217+{
10218+ int cpu;
10219+ cpu_entry_t *entry;
10220+
10221+ if (!s_plugin.ready_to_use)
10222+ {
10223+ /* initialize CPU state */
10224+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
10225+ entry = &per_cpu(gsnedf_cpu_entries, cpu);
10226+ atomic_set(&entry->will_schedule, 0);
10227+ entry->linked = NULL;
10228+ entry->scheduled = NULL;
10229+ entry->cpu = cpu;
10230+ }
10231+
10232+ queue_lock_init(&gsnedf_lock);
10233+ edf_domain_init(&gsnedf, NULL);
10234+ s_plugin = INIT_SCHED_PLUGIN;
10235+ }
10236+ return &s_plugin;
10237+}
10238+
10239+
10240diff --git a/kernel/sched_part_edf.c b/kernel/sched_part_edf.c
10241new file mode 100644
10242index 0000000..df741f5
10243--- /dev/null
10244+++ b/kernel/sched_part_edf.c
10245@@ -0,0 +1,340 @@
10246+/*
10247+ * kernel/sched_part_edf.c
10248+ *
10249+ * Implementation of the partitioned EDF scheduler plugin.
10250+ */
10251+
10252+#include <linux/percpu.h>
10253+#include <linux/sched.h>
10254+#include <linux/list.h>
10255+#include <linux/spinlock.h>
10256+
10257+#include <linux/litmus.h>
10258+#include <linux/sched_plugin.h>
10259+#include <linux/edf_common.h>
10260+
10261+
10262+typedef struct {
10263+ rt_domain_t domain;
10264+ int cpu;
10265+ struct task_struct* scheduled; /* only RT tasks */
10266+ spinlock_t lock;
10267+} part_edf_domain_t;
10268+
10269+
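+/* Convenience accessors: local_edf/local_pedf refer to the partition of the
+ * CPU we are currently executing on, remote_edf()/remote_pedf() to an
+ * explicitly given CPU, and task_edf() to the partition that the task has
+ * been assigned to via get_partition().
+ */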
10270+#define local_edf (&__get_cpu_var(part_edf_domains).domain)
10271+#define local_pedf (&__get_cpu_var(part_edf_domains))
10272+#define remote_edf(cpu) (&per_cpu(part_edf_domains, cpu).domain)
10273+#define remote_pedf(cpu) (&per_cpu(part_edf_domains, cpu))
10274+#define task_edf(task) remote_edf(get_partition(task))
10275+
10276+static void part_edf_domain_init(part_edf_domain_t* pedf,
10277+ check_resched_needed_t check,
10278+ int cpu)
10279+{
10280+ edf_domain_init(&pedf->domain, check);
10281+ pedf->cpu = cpu;
10282+ pedf->lock = SPIN_LOCK_UNLOCKED;
10283+ pedf->scheduled = NULL;
10284+}
10285+
10286+DEFINE_PER_CPU(part_edf_domain_t, part_edf_domains);
10287+
10288+/* This check is trivial in partitioned systems as we only have to consider
10289+ * the CPU of the partition.
10290+ *
10291+ */
10292+static int part_edf_check_resched(rt_domain_t *edf)
10293+{
10294+ part_edf_domain_t *pedf = container_of(edf, part_edf_domain_t, domain);
10295+ int ret = 0;
10296+
10297+ spin_lock(&pedf->lock);
10298+
10299+ /* because this is a callback from rt_domain_t we already hold
10300+ * the necessary lock for the ready queue
10301+ */
10302+ if (edf_preemption_needed(edf, pedf->scheduled)) {
10303+ if (pedf->cpu == smp_processor_id())
10304+ set_tsk_need_resched(current);
10305+ else
10306+ smp_send_reschedule(pedf->cpu);
10307+ ret = 1;
10308+ }
10309+ spin_unlock(&pedf->lock);
10310+ return ret;
10311+}
10312+
10313+
10314+static reschedule_check_t part_edf_scheduler_tick(void)
10315+{
10316+ unsigned long flags;
10317+ struct task_struct *t = current;
10318+ reschedule_check_t want_resched = NO_RESCHED;
10319+ rt_domain_t *edf = local_edf;
10320+ part_edf_domain_t *pedf = local_pedf;
10321+
10322+ /* Check for inconsistency. We don't need the lock for this since
10323+ * ->scheduled is only changed in schedule, which obviously is not
10324+ * executing in parallel on this CPU
10325+ */
10326+ BUG_ON(is_realtime(t) && t != pedf->scheduled);
10327+
10328+ /* expire tasks even if not in real-time mode
10329+ * this makes sure that at the end of real-time mode
10330+ * no tasks "run away forever".
10331+ */
10332+ if (is_realtime(t) && (!--t->time_slice)) {
10333+ /* this task has exhausted its budget in this period */
10334+ set_rt_flags(t, RT_F_SLEEP);
10335+ want_resched = FORCE_RESCHED;
10336+ }
10337+ if (get_rt_mode() == MODE_RT_RUN)
10338+ {
10339+		/* Check whether anything is waiting to be released.
10340+		 * This could probably be moved to the global timer
10341+		 * interrupt handler since the state only changes
10342+		 * once per jiffy.
10343+ */
10344+ try_release_pending(edf);
10345+ if (want_resched != FORCE_RESCHED)
10346+ {
10347+ read_lock_irqsave(&edf->ready_lock, flags);
10348+ if (edf_preemption_needed(edf, t))
10349+ want_resched = FORCE_RESCHED;
10350+ read_unlock_irqrestore(&edf->ready_lock, flags);
10351+ }
10352+ }
10353+ return want_resched;
10354+}
10355+
10356+static int part_edf_schedule(struct task_struct * prev,
10357+ struct task_struct ** next,
10358+ runqueue_t * rq)
10359+{
10360+ int need_deactivate = 1;
10361+ part_edf_domain_t* pedf = local_pedf;
10362+ rt_domain_t* edf = &pedf->domain;
10363+
10364+
10365+ if (is_realtime(prev) && get_rt_flags(prev) == RT_F_SLEEP)
10366+ edf_prepare_for_next_period(prev);
10367+
10368+ if (get_rt_mode() == MODE_RT_RUN) {
10369+ write_lock(&edf->ready_lock);
10370+ if (is_realtime(prev) && is_released(prev) && is_running(prev)
10371+ && !edf_preemption_needed(edf, prev)) {
10372+ /* this really should only happen if the task has
10373+ * 100% utilization...
10374+ */
10375+ TRACE("prev will be next, already released\n");
10376+ *next = prev;
10377+ need_deactivate = 0;
10378+ } else {
10379+ /* either not yet released, preempted, or non-rt */
10380+ *next = __take_ready(edf);
10381+ if (*next) {
10382+ /* stick the task into the runqueue */
10383+ __activate_task(*next, rq);
10384+ set_task_cpu(*next, smp_processor_id());
10385+ }
10386+ }
10387+ spin_lock(&pedf->lock);
10388+ pedf->scheduled = *next;
10389+ spin_unlock(&pedf->lock);
10390+ if (*next)
10391+ set_rt_flags(*next, RT_F_RUNNING);
10392+
10393+ write_unlock(&edf->ready_lock);
10394+ }
10395+
10396+ if (is_realtime(prev) && need_deactivate && prev->array) {
10397+ /* take it out of the run queue */
10398+ deactivate_task(prev, rq);
10399+ }
10400+
10401+ return 0;
10402+}
10403+
10404+
10405+static void part_edf_finish_switch(struct task_struct *prev)
10406+{
10407+ rt_domain_t* edf = local_edf;
10408+
10409+ if (!is_realtime(prev) || !is_running(prev))
10410+ return;
10411+
10412+ if (get_rt_flags(prev) == RT_F_SLEEP ||
10413+ get_rt_mode() != MODE_RT_RUN) {
10414+		/* this task has expired
10415+		 * _schedule has already taken care of updating
10416+		 * the release time and the deadline.
10417+		 * We only need to check whether it has been released.
10418+ */
10419+ if (is_released(prev) && get_rt_mode() == MODE_RT_RUN) {
10420+ /* already released */
10421+ add_ready(edf, prev);
10422+ TRACE("%d goes straight to ready queue\n", prev->pid);
10423+ } else
10424+ /* it has got to wait */
10425+ add_release(edf, prev);
10426+ } else {
10427+ /* this is a forced preemption
10428+ * thus the task stays in the ready_queue
10429+		 * we only need to make it available to others
10430+ */
10431+ add_ready(edf, prev);
10432+ }
10433+}
10434+
10435+
10436+/* Prepare a task for running in RT mode
10437+ * Enqueues the task into master queue data structure
10438+ * returns
10439+ * -EPERM if task is not TASK_STOPPED
10440+ */
10441+static long part_edf_prepare_task(struct task_struct * t)
10442+{
10443+ rt_domain_t* edf = task_edf(t);
10444+
10445+
10446+ TRACE("[%d] part edf: prepare task %d on CPU %d\n",
10447+ smp_processor_id(), t->pid, get_partition(t));
10448+ if (t->state == TASK_STOPPED) {
10449+ __setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1);
10450+
10451+ if (get_rt_mode() == MODE_RT_RUN)
10452+			/* Real-time mode is already active.
10453+			 * Prepare an immediate release.
10454+ */
10455+ edf_release_now(t);
10456+		/* The task must be TASK_RUNNING while it is in the queue; otherwise
10457+		 * the signal code will try to wake it up, with fatal consequences.
10458+ */
10459+ t->state = TASK_RUNNING;
10460+ add_release(edf, t);
10461+ return 0;
10462+ } else
10463+ return -EPERM;
10464+}
10465+
10466+static void part_edf_wake_up_task(struct task_struct *task)
10467+{
10468+ rt_domain_t* edf;
10469+
10470+ edf = task_edf(task);
10471+
10472+ /* We must determine whether task should go into the release
10473+ * queue or into the ready queue. It may enter the ready queue
10474+ * if it has credit left in its time slice and has not yet reached
10475+	 * its deadline. If it is now past its deadline, we assume this is the
10476+	 * arrival of a new sporadic job and thus put it in the ready queue
10477+	 * anyway. If it has zero budget and the next release is in the future,
10478+	 * it has to go to the release queue.
10479+ */
10480+ TRACE("part edf: wake up %d with budget=%d for cpu %d\n",
10481+ task->pid, task->time_slice, get_partition(task));
10482+ task->state = TASK_RUNNING;
10483+ if (is_tardy(task)) {
10484+ /* new sporadic release */
10485+ edf_release_now(task);
10486+ add_ready(edf, task);
10487+
10488+ } else if (task->time_slice) {
10489+ /* Came back in time before deadline. This may cause
10490+ * deadline overruns, but since we don't handle suspensions
10491+ * in the analytical model, we don't care since we can't
10492+ * guarantee anything at all if tasks block.
10493+ */
10494+ set_rt_flags(task, RT_F_RUNNING);
10495+ add_ready(edf, task);
10496+
10497+ } else {
10498+ add_release(edf, task);
10499+ }
10500+
10501+}
10502+
10503+static void part_edf_task_blocks(struct task_struct *t)
10504+{
10505+ BUG_ON(!is_realtime(t));
10506+ /* not really anything to do since it can only block if
10507+ * it is running, and when it is not running it is not in any
10508+ * queue anyway.
10509+ *
10510+ */
10511+ TRACE("task %d blocks with budget=%d\n", t->pid, t->time_slice);
10512+ BUG_ON(in_list(&t->rt_list));
10513+}
10514+
10515+
10516+/* When _tear_down is called, the task should not be in any queue any more
10517+ * as it must have blocked first. We don't have any internal state for the task,
10518+ * it is all in the task_struct.
10519+ */
10520+static long part_edf_tear_down(struct task_struct * t)
10521+{
10522+ BUG_ON(!is_realtime(t));
10523+ TRACE("part edf: tear down called for %d \n", t->pid);
10524+ BUG_ON(t->array);
10525+ BUG_ON(in_list(&t->rt_list));
10526+ return 0;
10527+}
10528+
10529+
10530+static int part_edf_mode_change(int new_mode)
10531+{
10532+ int cpu;
10533+
10534+ if (new_mode == MODE_RT_RUN)
10535+ for_each_online_cpu(cpu)
10536+ rerelease_all(remote_edf(cpu), edf_release_at);
10537+ TRACE("[%d] part edf: mode changed to %d\n",
10538+ smp_processor_id(), new_mode);
10539+ return 0;
10540+}
10541+
10542+
10543+/* Plugin object */
10544+static sched_plugin_t s_plugin __cacheline_aligned_in_smp = {
10545+ .ready_to_use = 0
10546+};
10547+
10548+
10549+/*
10550+ * Plugin initialization code.
10551+ */
10552+#define INIT_SCHED_PLUGIN (struct sched_plugin) {\
10553+ .plugin_name = "Partitioned EDF",\
10554+ .ready_to_use = 1,\
10555+ .scheduler_tick = part_edf_scheduler_tick,\
10556+ .prepare_task = part_edf_prepare_task,\
10557+ .sleep_next_period = edf_sleep_next_period,\
10558+ .tear_down = part_edf_tear_down,\
10559+ .schedule = part_edf_schedule,\
10560+ .finish_switch = part_edf_finish_switch,\
10561+ .mode_change = part_edf_mode_change,\
10562+ .wake_up_task = part_edf_wake_up_task,\
10563+ .task_blocks = part_edf_task_blocks \
10564+}
10565+
10566+
10567+sched_plugin_t *__init init_part_edf_plugin(void)
10568+{
10569+ int i;
10570+
10571+ if (!s_plugin.ready_to_use)
10572+ {
10573+ for (i = 0; i < NR_CPUS; i++)
10574+ {
10575+ part_edf_domain_init(remote_pedf(i),
10576+ part_edf_check_resched, i);
10577+			printk(KERN_INFO "CPU partition %d initialized.\n", i);
10578+ }
10579+ s_plugin = INIT_SCHED_PLUGIN;
10580+ }
10581+ return &s_plugin;
10582+}
10583+
10584+
10585+
10586diff --git a/kernel/sched_pfair.c b/kernel/sched_pfair.c
10587new file mode 100644
10588index 0000000..dbb7e5c
10589--- /dev/null
10590+++ b/kernel/sched_pfair.c
10591@@ -0,0 +1,503 @@
10592+/*
10593+ *
10594+ * Implementation of synchronized PFAIR PD2 scheduler
10595+ *
10596+ */
10597+
10598+#include <linux/percpu.h>
10599+#include <linux/sched.h>
10600+#include <linux/list.h>
10601+
10602+#include <linux/litmus.h>
10603+#include <linux/sched_plugin.h>
10604+#include <linux/pfair_common.h>
10605+#include <linux/sched_trace.h>
10606+#include <linux/queuelock.h>
10607+
10608+struct cpu_state {
10609+ struct task_struct * t;
10610+ volatile jiffie_t jiffie_marker;
10611+};
10612+/* PFAIR scheduling domain, release and ready queues */
10613+static pfair_domain_t pfair __cacheline_aligned_in_smp;
10614+
10615+/* An indicator that a quantum boundary was crossed
10616+ * and a scheduling decision has to be made
10617+ */
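+/* sync_go[cpu] is set to 1 by pfair_scheduler_tick() when the tick fires in
+ * real-time mode and is consumed (post-decremented) by pfair_schedule(), so
+ * that each CPU makes at most one PFAIR decision per quantum boundary.
+ */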
10618+static int sync_go[NR_CPUS];
10619+
10620+
10621+/* A collection of CPU states protected by pfair lock */
10622+DEFINE_PER_CPU(struct cpu_state, states);
10623+
10624+/*
10625+ * This function gets called by the timer code, at HZ frequency and
10626+ * with interrupts disabled.
10627+ *
10628+ * The function merges the release queue with the ready queue
10629+ * and indicates that a quantum boundary was crossed.
10630+ *
10631+ * It also suggests to schedule off currently running
10632+ * real-time task if the mode is non-real-time.
10633+ */
10634+static reschedule_check_t pfair_scheduler_tick(void)
10635+{
10636+ int want_resched = NO_RESCHED;
10637+ sync_go[smp_processor_id()] = 0;
10638+ if (!cpu_isset(smp_processor_id(), pfair.domain_cpus))
10639+ goto out;
10640+ /* Now determine if we want current task to be preempted */
10641+ if (get_rt_mode() == MODE_RT_RUN) {
10642+ pfair_try_release_pending(&pfair);
10643+ want_resched = FORCE_RESCHED;
10644+ /* indicate that the interrupt fired */
10645+ sync_go[smp_processor_id()] = 1;
10646+ barrier();
10647+ } else if (is_realtime(current) && is_running(current)) {
10648+ /* In non real-time mode we want to
10649+ * schedule off real-time tasks */
10650+ want_resched = FORCE_RESCHED;
10651+ } else if (is_realtime(current) && !is_running(current)) {
10652+		TRACE("[%d] %d Timer interrupt on not running %d\n",
10653+ smp_processor_id(),
10654+ jiffies-rt_start_time, current->pid);
10655+ }
10656+out:
10657+ return want_resched;
10658+}
10659+
10660+/**
10661+ * This function is called by the processor
10662+ * that performs rescheduling. It saves the timing
10663+ * parameters of currently running jobs that were not rescheduled yet
10664+ * and releases the next subtask for these jobs, placing them into
10665+ * the release or ready queue.
10666+ */
10667+static void pretend_release(cpumask_t p)
10668+{
10669+ int i = 0;
10670+ struct task_struct * t = NULL;
10671+ /* for all the tasks increment the number of used quanta
10672+ * and release next subtask or job depending on the number
10673+ * of used quanta
10674+ */
10675+ for_each_cpu_mask(i, p) {
10676+ t = per_cpu(states, i).t;
10677+ if (t != NULL) {
10678+ backup_times(t);
10679+ inc_passed_quanta(t);
10680+ if ( get_passed_quanta(t) == get_exec_cost(t)) {
10681+ pfair_prepare_next_job(t);
10682+ } else {
10683+ pfair_prepare_next_subtask(t);
10684+ }
10685+ /*
10686+ TRACE("[%d] %d pretending release %d with (%d, %d)\n",
10687+ smp_processor_id(),
10688+ jiffies-rt_start_time,t->pid,
10689+ get_release(t)-rt_start_time,
10690+ get_deadline(t)-rt_start_time);*/
10691+ /* detect if the job or subtask has to be released now*/
10692+ if (time_before_eq(get_release(t), jiffies))
10693+ pfair_add_ready(&pfair, t);
10694+ else
10695+ pfair_add_release(&pfair, t);
10696+ }
10697+ }
10698+}
10699+/*
10700+ * Roll back the pretended release of tasks.
10701+ * Timing parameters are restored and tasks are removed
10702+ * from the queues, as they were before schedule() was called.
10703+ *
10704+ */
10705+static void rollback_release(cpumask_t p)
10706+{
10707+ int i = -1;
10708+ struct task_struct * t = NULL;
10709+ /*
10710+ * Rollback the pretended changes
10711+ */
10712+ for_each_cpu_mask(i, p) {
10713+ t = per_cpu(states, i).t;
10714+ if (t != NULL) {
10715+ restore_times(t);
10716+ if(t->rt_list.prev != LIST_POISON1 ||
10717+ t->rt_list.next != LIST_POISON2) {
10718+ /* Delete the task from a queue */
10719+ list_del(&t->rt_list);
10720+ }
10721+ }
10722+ }
10723+}
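+
+/* pretend_release() and rollback_release() above form a speculation pair:
+ * the scheduler temporarily advances the timing parameters of jobs still
+ * running on CPUs that have not rescheduled in this quantum, makes its
+ * selection from the ready queue as if those subtasks had already been
+ * released, and then undoes all changes. The real release happens later, in
+ * pfair_finish_task_switch(), for tasks that are actually switched away from.
+ */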
10724+
10725+/*
10726+ * The procedure creates a list of CPUs whose tasks have not been
10727+ * rescheduled yet. These are CPUs whose jiffie_marker differs from
10728+ * the current value of jiffies.
10729+ */
10730+static void find_participants(cpumask_t * target)
10731+{
10732+	cpumask_t res; int i;
10733+ cpus_clear(res);
10734+ for_each_online_cpu(i) {
10735+ if(per_cpu(states, i).jiffie_marker != jiffies)
10736+ cpu_set(i, res);
10737+ }
10738+ /* Examine only cpus in the domain */
10739+ cpus_and(res, pfair.domain_cpus, res);
10740+ (*target) = res;
10741+}
10742+
10743+/*
10744+ * This is the main PFAIR schedule function:
10745+ * each processor pretends that some currently running tasks are
10746+ * released in the next quantum and determines whether it should
10747+ * keep the task that is currently running (this is usually the case
10748+ * for heavy tasks).
10749+*/
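+/* Outline of the decision made below at a quantum boundary:
+ *   1. skip CPUs outside the PFAIR domain and invocations for which the
+ *      tick has not fired yet (sync_go),
+ *   2. find the CPUs that have not rescheduled yet and pretend-release the
+ *      jobs they are currently running,
+ *   3. take up to one ready task per such CPU; if our own current task is
+ *      among them, keep it,
+ *   4. put the taken tasks back and roll the pretended release back,
+ *   5. if the current task was not kept, pick the next ready task as *next.
+ */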
10750+static int pfair_schedule(struct task_struct *prev,
10751+ struct task_struct **next,
10752+ runqueue_t * rq)
10753+{
10754+	int cpu = -1;
10755+	int k = -1;
10756+	int need_deactivate = 1;
10757+	int keep = 0;
10758+ unsigned long flags;
10759+ cpumask_t participants;
10760+ /* A temporary array */
10761+ struct task_struct * rs_old_ptr[NR_CPUS];
10762+
10763+ *next = NULL;
10764+ cpu = smp_processor_id();
10765+ /* CPU's not in the domain just bypass */
10766+ if (!cpu_isset(cpu, pfair.domain_cpus)) {
10767+ goto out;
10768+ }
10769+ queue_lock_irqsave(&pfair.pfair_lock, flags);
10770+
10771+	/* If we happen to run in non-real-time mode,
10772+	 * then we have to schedule off currently running tasks.
10773+	 */
10774+ if (get_rt_mode() != MODE_RT_RUN) {
10775+ if (is_realtime(prev)) {
10776+ per_cpu(states, cpu).t = NULL;
10777+ TRACE("[%d] %d Suspending %d\n",
10778+ cpu, jiffies - rt_start_time,
10779+ prev->pid);
10780+ /* Move the task to the
10781+ * release queue for future runs
10782+ * FIXME: Do something smarter.
10783+ * For example create a set where
10784+ * prepared or inactive tasks are placed
10785+ * and then released.
10786+ * */
10787+ set_release(prev, get_release(prev) + 1000);
10788+ pfair_add_release(&pfair, prev);
10789+ }
10790+ goto out_deactivate;
10791+ }
10792+ /* If the current task stops or dies */
10793+ if (is_realtime(prev) && !is_running(prev)) {
10794+ /* remove it from the running set */
10795+ per_cpu(states, cpu).t = NULL;
10796+ }
10797+ /* Make pfair decisions at quantum boundaries only,
10798+ * but schedule off stopped or dead tasks */
10799+
10800+ if ((sync_go[cpu]--) != 1)
10801+ goto out_deactivate;
10802+
10803+ /*TRACE("[%d] %d Scheduler activation", cpu, jiffies-rt_start_time);
10804+ cpus_and(res, pfair.domain_cpus, cpu_online_map);
10805+ for_each_cpu_mask(k, res) {
10806+ TRACE("%d" ,(per_cpu(states, k).jiffie_marker!=jiffies));
10807+ }
10808+ TRACE("\n");*/
10809+
10810+ /* Find processors that have not rescheduled yet */
10811+ find_participants(&participants);
10812+ /* For each task on remote cpu's pretend release */
10813+ pretend_release(participants);
10814+ /* Clear temporary array */
10815+ for_each_possible_cpu(k) { rs_old_ptr[k] = NULL; }
10816+ /* Select a new subset of eligible tasks */
10817+ for_each_cpu_mask(k, participants) {
10818+ rs_old_ptr[k] = __pfair_take_ready (&pfair);
10819+ /* Check if our current task must be scheduled in the next quantum */
10820+ if (rs_old_ptr[k] == per_cpu(states, cpu).t) {
10821+ /* this is our current task, keep it */
10822+ *next = per_cpu(states, cpu).t;
10823+ need_deactivate = 0;
10824+ keep = 1;
10825+ break;
10826+ }
10827+ }
10828+ /* Put all the extracted tasks back into the ready queue */
10829+ for_each_cpu_mask(k, participants) {
10830+ if (rs_old_ptr[k] != NULL){
10831+ pfair_add_ready(&pfair, rs_old_ptr[k]);
10832+ rs_old_ptr[k] = NULL;
10833+ }
10834+ }
10835+ /* Rollback the pretended release,
10836+ * task parameters are restored and running tasks are removed
10837+ * from queues */
10838+ rollback_release(participants);
10839+ /*
10840+ * If the current task is not scheduled in the next quantum
10841+ * then select a new pfair task
10842+ */
10843+ if(!keep) {
10844+ *next = per_cpu(states, cpu).t = __pfair_take_ready(&pfair);
10845+ if (*next != NULL) {
10846+ /*TRACE("[%d] %d Scheduling %d with (%d, %d)\n",
10847+ cpu, jiffies-rt_start_time,
10848+ get_release(*next),
10849+ get_deadline(*next));
10850+ */
10851+ set_task_cpu(*next, cpu);
10852+ __activate_task(*next, rq);
10853+ }
10854+ } else {
10855+ if (is_realtime(prev)) {
10856+ /*TRACE("[%d] %d prev==next %d\n",
10857+ cpu,jiffies-rt_start_time,
10858+ (prev)->pid);*/
10859+
10860+ /* The task will not be switched off but we
10861+ * need to track the execution time
10862+ */
10863+ inc_passed_quanta(prev);
10864+ }
10865+ }
10866+
10867+	/* Show that our task does not participate in subsequent selections */
10868+ __get_cpu_var(states).jiffie_marker = jiffies;
10869+
10870+out_deactivate:
10871+ if ( is_realtime(prev) && need_deactivate && prev->array) {
10872+ /* take prev out of the linux run queue */
10873+ deactivate_task(prev, rq);
10874+ }
10875+ queue_unlock_irqrestore(&pfair.pfair_lock, flags);
10876+out:
10877+ return 0;
10878+}
10879+
10880+static void pfair_finish_task_switch(struct task_struct *t)
10881+{
10882+ if (!is_realtime(t) || !is_running(t))
10883+ return;
10884+
10885+ queue_lock(&pfair.pfair_lock);
10886+	/* Release in real-time mode only;
10887+	 * if the mode is non-real-time, then
10888+	 * the task is already in the release queue
10889+	 * with a release time far in the future.
10890+ */
10891+ if (get_rt_mode() == MODE_RT_RUN) {
10892+ inc_passed_quanta(t);
10893+ if ( get_passed_quanta(t) == get_exec_cost(t)) {
10894+ sched_trace_job_completion(t);
10895+ pfair_prepare_next_job(t);
10896+ } else {
10897+ pfair_prepare_next_subtask(t);
10898+ }
10899+ /*TRACE("[%d] %d releasing %d with (%d, %d)\n",
10900+ smp_processor_id(),
10901+ jiffies-rt_start_time,
10902+ t->pid,
10903+ get_release(t)-rt_start_time,
10904+ get_deadline(t)-rt_start_time);*/
10905+ if (time_before_eq(get_release(t), jiffies))
10906+ pfair_add_ready(&pfair, t);
10907+ else
10908+ pfair_add_release(&pfair, t);
10909+ }
10910+ queue_unlock(&pfair.pfair_lock);
10911+}
10912+
10913+/* Prepare a task for running in RT mode
10914+ * Enqueues the task into master queue data structure
10915+ * returns
10916+ * -EPERM if task is not TASK_STOPPED
10917+ */
10918+static long pfair_prepare_task(struct task_struct * t)
10919+{
10920+ unsigned long flags;
10921+ TRACE("pfair: prepare task %d\n", t->pid);
10922+ if (t->state == TASK_STOPPED) {
10923+ __setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1);
10924+
10925+ if (get_rt_mode() == MODE_RT_RUN)
10926+			/* Real-time mode is already active.
10927+			 * Prepare an immediate release.
10928+ */
10929+ __pfair_prepare_new_release(t, jiffies);
10930+		/* The task must be TASK_RUNNING while it is in the queue; otherwise
10931+		 * the signal code will try to wake it up, with fatal consequences.
10932+ */
10933+ t->state = TASK_RUNNING;
10934+ queue_lock_irqsave(&pfair.pfair_lock, flags);
10935+ pfair_add_release(&pfair, t);
10936+ queue_unlock_irqrestore(&pfair.pfair_lock, flags);
10937+ return 0;
10938+ } else
10939+ return -EPERM;
10940+}
10941+
10942+
10943+
10944+static void pfair_wake_up_task(struct task_struct *task)
10945+{
10946+
10947+ unsigned long flags;
10948+
10949+ /* We must determine whether task should go into the release
10950+ * queue or into the ready queue.
10951+ * The task enters the ready queue if the previous deadline was missed,
10952+ * so we treat the invoked job as a new sporadic release.
10953+ *
10954+ * The job can also enter the ready queue if it was invoked before its
10955+	 * global deadline, but its budget must be clipped down to one quantum.
10956+ */
10957+ task->state = TASK_RUNNING;
10958+ if (time_after_eq(jiffies, task->rt_param.times.last_release
10959+ + get_rt_period(task))) {
10960+ /* new sporadic release */
10961+ TRACE("[%d] Sporadic release of %d at %d\n",
10962+ smp_processor_id(),
10963+ jiffies-rt_start_time,
10964+ task->pid);
10965+ __pfair_prepare_new_release(task, jiffies);
10966+ queue_lock_irqsave(&pfair.pfair_lock, flags);
10967+ sched_trace_job_release(task);
10968+ pfair_add_ready(&pfair, task);
10969+ queue_unlock_irqrestore(&pfair.pfair_lock, flags);
10970+ } else if (task->time_slice) {
10971+ /* came back in time before deadline
10972+ * clip the budget to be the last subtask of a job or
10973+ * the new job.
10974+ */
10975+ task->rt_param.times.exec_time = get_exec_cost(task) - 1;
10976+ if (task->rt_param.times.exec_time == 0) {
10977+ pfair_prepare_next_job(task);
10978+ } else {
10979+ pfair_prepare_next_subtask(task);
10980+ }
10981+ TRACE("[%d] %d Resume of %d with %d, %d, %d\n",
10982+ smp_processor_id(), jiffies-rt_start_time,
10983+ task->pid, get_release(task)-rt_start_time,
10984+ get_deadline(task)-rt_start_time,
10985+ get_passed_quanta(task));
10986+
10987+ set_rt_flags(task, RT_F_RUNNING);
10988+ queue_lock_irqsave(&pfair.pfair_lock, flags);
10989+ sched_trace_job_release(task);
10990+ if (time_after_eq(jiffies, get_release(task))) {
10991+ pfair_add_ready(&pfair, task);
10992+ } else {
10993+ pfair_add_release(&pfair, task);
10994+ }
10995+ queue_unlock_irqrestore(&pfair.pfair_lock, flags);
10996+
10997+ } else {
10998+ TRACE("[%d] %d Strange release of %d with %d, %d, %d\n",
10999+ smp_processor_id(), jiffies-rt_start_time,
11000+ task->pid,
11001+ get_release(task), get_deadline(task),
11002+ get_passed_quanta(task));
11003+
11004+ queue_lock_irqsave(&pfair.pfair_lock, flags);
11005+ pfair_add_release(&pfair, task);
11006+ queue_unlock_irqrestore(&pfair.pfair_lock, flags);
11007+ }
11008+}
11009+
11010+
11011+static void pfair_task_blocks(struct task_struct *t)
11012+{
11013+ unsigned long flags;
11014+ int i;
11015+ cpumask_t res;
11016+ BUG_ON(!is_realtime(t));
11017+ /* If the task blocks, then it must be removed from the running set */
11018+ queue_lock_irqsave(&pfair.pfair_lock, flags);
11019+ cpus_and(res,pfair.domain_cpus, cpu_online_map);
11020+ for_each_cpu_mask(i, res) {
11021+ if (per_cpu(states, i).t == t)
11022+ per_cpu(states, i).t = NULL;
11023+ }
11024+ /* If the task is running and in some
11025+ * list it might have been released by another
11026+ * processor
11027+ */
11028+ if((t->rt_list.next != LIST_POISON1 ||
11029+ t->rt_list.prev != LIST_POISON2)) {
11030+ TRACE("[%d] %d task %d is deleted from the list\n",
11031+ smp_processor_id(),
11032+ jiffies-rt_start_time, t->pid);
11033+ list_del(&t->rt_list);
11034+ }
11035+ queue_unlock_irqrestore(&pfair.pfair_lock, flags);
11036+ TRACE("[%d] %d task %d blocks with budget=%d state=%d\n",
11037+ smp_processor_id(), jiffies-rt_start_time,
11038+ t->pid, t->time_slice, t->state);
11039+}
11040+
11041+static long pfair_tear_down(struct task_struct * t)
11042+{
11043+ BUG_ON(!is_realtime(t));
11044+ TRACE("pfair: tear down called for %d \n", t->pid);
11045+ BUG_ON(t->array);
11046+ BUG_ON(t->rt_list.next != LIST_POISON1);
11047+ BUG_ON(t->rt_list.prev != LIST_POISON2);
11048+ return 0;
11049+}
11050+
11051+static int pfair_mode_change(int new_mode)
11052+{
11053+ printk(KERN_INFO "[%d] pfair mode change %d\n",
11054+ smp_processor_id(), new_mode);
11055+ if (new_mode == MODE_RT_RUN) {
11056+ pfair_prepare_new_releases(&pfair, jiffies + 10);
11057+ }
11058+ printk(KERN_INFO "[%d] pfair: mode change done\n", smp_processor_id());
11059+ return 0;
11060+}
11061+
11062+/* Plugin object */
11063+static sched_plugin_t s_plugin __cacheline_aligned_in_smp = {
11064+ .ready_to_use = 0
11065+};
11066+/*
11067+ * PFAIR plugin initialization macro.
11068+ */
11069+#define INIT_PFAIR_PLUGIN (struct sched_plugin){\
11070+ .plugin_name = "PFAIR",\
11071+ .ready_to_use = 1,\
11072+ .scheduler_tick = pfair_scheduler_tick,\
11073+ .prepare_task = pfair_prepare_task,\
11074+ .tear_down = pfair_tear_down,\
11075+ .schedule = pfair_schedule,\
11076+ .finish_switch = pfair_finish_task_switch,\
11077+ .mode_change = pfair_mode_change,\
11078+ .wake_up_task = pfair_wake_up_task,\
11079+ .task_blocks = pfair_task_blocks \
11080+ }
11081+
11082+sched_plugin_t* __init init_pfair_plugin(void)
11083+{
11084+ int i=0;
11085+ if (!s_plugin.ready_to_use) {
11086+ pfair_domain_init(&pfair);
11087+ for (i=0; i<NR_CPUS; i++) {
11088+ sync_go[i] = 0;
11089+ per_cpu(states, i).t = NULL;
11090+ }
11091+ s_plugin = INIT_PFAIR_PLUGIN;
11092+ }
11093+ return &s_plugin;
11094+}
11095diff --git a/kernel/sched_plugin.c b/kernel/sched_plugin.c
11096new file mode 100644
11097index 0000000..9fb10af
11098--- /dev/null
11099+++ b/kernel/sched_plugin.c
11100@@ -0,0 +1,108 @@
11101+/* sched_plugin.c -- core infrastructure for the scheduler plugin system
11102+ *
11103+ * This file includes the initialization of the plugin system, the no-op Linux
11104+ * scheduler plugin and some dummy functions.
11105+ */
11106+
11107+
11108+#include <linux/litmus.h>
11109+#include <linux/sched_plugin.h>
11110+
11111+
11112+/*************************************************************
11113+ * Dummy plugin functions *
11114+ *************************************************************/
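+
+/* These dummies serve as safe defaults: operations that make no sense
+ * without a real-time scheduling policy either do nothing, return 0, or
+ * return -EPERM. The no-op Linux plugin below is built entirely from them.
+ */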
11115+
11116+void litmus_dummy_finish_switch(struct task_struct * prev)
11117+{
11118+}
11119+
11120+int litmus_dummy_schedule(struct task_struct * prev,
11121+ struct task_struct** next,
11122+ runqueue_t* q)
11123+{
11124+ return 0;
11125+}
11126+
11127+reschedule_check_t litmus_dummy_scheduler_tick(void)
11128+{
11129+ return NO_RESCHED;
11130+}
11131+
11132+
11133+long litmus_dummy_prepare_task(struct task_struct *t)
11134+{
11135+ return 0;
11136+}
11137+
11138+void litmus_dummy_wake_up_task(struct task_struct *task)
11139+{
11140+ printk(KERN_WARNING "task %d: unhandled real-time wake up!\n",
11141+ task->pid);
11142+}
11143+
11144+void litmus_dummy_task_blocks(struct task_struct *task)
11145+{
11146+}
11147+
11148+long litmus_dummy_tear_down(struct task_struct *task)
11149+{
11150+ return 0;
11151+}
11152+
11153+int litmus_dummy_scheduler_setup(int cmd, void __user *parameter)
11154+{
11155+ return -EPERM;
11156+}
11157+
11158+long litmus_dummy_sleep_next_period(void)
11159+{
11160+ return -EPERM;
11161+}
11162+
11163+long litmus_dummy_inherit_priority(struct pi_semaphore *sem,
11164+ struct task_struct *new_owner)
11165+{
11166+ return -EPERM;
11167+}
11168+
11169+long litmus_dummy_return_priority(struct pi_semaphore *sem)
11170+{
11171+ return -EPERM;
11172+}
11173+
11174+long litmus_dummy_pi_block(struct pi_semaphore *sem,
11175+ struct task_struct *new_waiter)
11176+{
11177+ return -EPERM;
11178+}
11179+
11180+
11181+/* The default scheduler plugin. It doesn't do anything and lets Linux do its
11182+ * job.
11183+ */
11184+
11185+sched_plugin_t linux_sched_plugin = {
11186+ .plugin_name = "Linux",
11187+ .ready_to_use = 1,
11188+ .scheduler_tick = litmus_dummy_scheduler_tick,
11189+ .prepare_task = litmus_dummy_prepare_task,
11190+ .tear_down = litmus_dummy_tear_down,
11191+ .wake_up_task = litmus_dummy_wake_up_task,
11192+ .task_blocks = litmus_dummy_task_blocks,
11193+ .sleep_next_period = litmus_dummy_sleep_next_period,
11194+ .schedule = litmus_dummy_schedule,
11195+ .finish_switch = litmus_dummy_finish_switch,
11196+ .scheduler_setup = litmus_dummy_scheduler_setup,
11197+ .inherit_priority = litmus_dummy_inherit_priority,
11198+ .return_priority = litmus_dummy_return_priority,
11199+ .pi_block = litmus_dummy_pi_block
11200+};
11201+
11202+/*
11203+ * The reference to the current plugin that is used to schedule tasks within
11204+ * the system. It stores references to the actual function implementations.
11205+ * It should be initialized by calling one of the init_*_plugin() functions.
11206+ */
11207+sched_plugin_t *curr_sched_plugin = &linux_sched_plugin;
11208+
11209diff --git a/kernel/sched_psn_edf.c b/kernel/sched_psn_edf.c
11210new file mode 100644
11211index 0000000..a1e12e0
11212--- /dev/null
11213+++ b/kernel/sched_psn_edf.c
11214@@ -0,0 +1,523 @@
11215+
11216+/*
11217+ * kernel/sched_psn_edf.c
11218+ *
11219+ * Implementation of the PSN-EDF scheduler plugin.
11220+ * Based on kernel/sched_part_edf.c and kernel/sched_gsn_edf.c.
11221+ *
11222+ * Suspensions and non-preemptable sections are supported.
11223+ * Priority inheritance is not supported.
11224+ */
11225+
11226+#include <linux/percpu.h>
11227+#include <linux/sched.h>
11228+#include <linux/list.h>
11229+#include <linux/spinlock.h>
11230+
11231+#include <linux/litmus.h>
11232+#include <linux/sched_plugin.h>
11233+#include <linux/edf_common.h>
11234+
11235+
11236+typedef struct {
11237+ rt_domain_t domain;
11238+ int cpu;
11239+ struct task_struct* scheduled; /* only RT tasks */
11240+ spinlock_t lock; /* protects the domain and
11241+ * serializes scheduling decisions
11242+ */
11243+} psnedf_domain_t;
11244+
11245+DEFINE_PER_CPU(psnedf_domain_t, psnedf_domains);
11246+
11247+#define local_edf (&__get_cpu_var(psnedf_domains).domain)
11248+#define local_pedf (&__get_cpu_var(psnedf_domains))
11249+#define remote_edf(cpu) (&per_cpu(psnedf_domains, cpu).domain)
11250+#define remote_pedf(cpu) (&per_cpu(psnedf_domains, cpu))
11251+#define task_edf(task) remote_edf(get_partition(task))
11252+#define task_pedf(task) remote_pedf(get_partition(task))
11253+
11254+
11255+static void psnedf_domain_init(psnedf_domain_t* pedf,
11256+ check_resched_needed_t check,
11257+ int cpu)
11258+{
11259+ edf_domain_init(&pedf->domain, check);
11260+ pedf->cpu = cpu;
11261+ pedf->lock = SPIN_LOCK_UNLOCKED;
11262+ pedf->scheduled = NULL;
11263+}
11264+
11265+static void requeue(struct task_struct* t, rt_domain_t *edf)
11266+{
11267+ /* only requeue if t is actually running */
11268+ BUG_ON(!is_running(t));
11269+
11270+ if (t->state != TASK_RUNNING)
11271+ TRACE_TASK(t, "requeue: !TASK_RUNNING");
11272+
11273+ set_rt_flags(t, RT_F_RUNNING);
11274+ if (!is_released(t) ||
11275+ get_rt_mode() != MODE_RT_RUN)
11276+ __add_release(edf, t); /* it has got to wait */
11277+ else
11278+ __add_ready(edf, t);
11279+}
11280+
11281+/* we assume the lock is being held */
11282+static void preempt(psnedf_domain_t *pedf)
11283+{
11284+ if (smp_processor_id() == pedf->cpu) {
11285+ if (pedf->scheduled && is_np(pedf->scheduled))
11286+ request_exit_np(pedf->scheduled);
11287+ else
11288+ set_tsk_need_resched(current);
11289+ } else
11290+		/* in case it is a remote CPU we have to defer the
11291+		 * decision to the remote CPU
11292+		 */
11293+ smp_send_reschedule(pedf->cpu);
11294+}
11295+
11296+/* This check is trivial in partitioned systems as we only have to consider
11297+ * the CPU of the partition.
11298+ */
11299+static int psnedf_check_resched(rt_domain_t *edf)
11300+{
11301+ psnedf_domain_t *pedf = container_of(edf, psnedf_domain_t, domain);
11302+ int ret = 0;
11303+
11304+ /* because this is a callback from rt_domain_t we already hold
11305+ * the necessary lock for the ready queue
11306+ */
11307+ if (edf_preemption_needed(edf, pedf->scheduled)) {
11308+ preempt(pedf);
11309+ ret = 1;
11310+ }
11311+ return ret;
11312+}
11313+
11314+
11315+static reschedule_check_t psnedf_scheduler_tick(void)
11316+{
11317+ unsigned long flags;
11318+ struct task_struct *t = current;
11319+ reschedule_check_t want_resched = NO_RESCHED;
11320+ rt_domain_t *edf = local_edf;
11321+ psnedf_domain_t *pedf = local_pedf;
11322+
11323+ /* Check for inconsistency. We don't need the lock for this since
11324+ * ->scheduled is only changed in schedule, which obviously is not
11325+ * executing in parallel on this CPU
11326+ */
11327+ BUG_ON(is_realtime(t) && t != pedf->scheduled);
11328+
11329+ if (is_realtime(t))
11330+ TRACE("%s/%d was hit by scheduler tick\n", t->comm, t->pid);
11331+
11332+ /* expire tasks even if not in real-time mode
11333+ * this makes sure that at the end of real-time mode
11334+ * no tasks "run away forever".
11335+ */
11336+ if (is_realtime(t) && t->time_slice && !--t->time_slice) {
11337+ if (!is_np(t)) {
11338+ want_resched = FORCE_RESCHED;
11339+ } else {
11340+ TRACE("psnedf_scheduler_tick: "
11341+ "%d is non-preemptable, "
11342+ "preemption delayed.\n", t->pid);
11343+ request_exit_np(t);
11344+ }
11345+ }
11346+
11347+ if (get_rt_mode() == MODE_RT_RUN)
11348+ {
11349+ /* check whether anything is waiting to be released
11350+ * this could probably be moved to the global timer
11351+ * interrupt handler since the state will only change
11352+	 * once per jiffy
11353+ */
11354+ spin_lock_irqsave(&pedf->lock, flags);
11355+ __release_pending(edf);
11356+ if (want_resched != FORCE_RESCHED &&
11357+ edf_preemption_needed(edf, t))
11358+ want_resched = FORCE_RESCHED;
11359+
11360+ spin_unlock_irqrestore(&pedf->lock, flags);
11361+
11362+ }
11363+ return want_resched;
11364+}
11365+
11366+static void job_completion(struct task_struct* t)
11367+{
11368+ TRACE_TASK(t, "job_completion().\n");
11369+ set_rt_flags(t, RT_F_SLEEP);
11370+ edf_prepare_for_next_period(t);
11371+}
11372+
11373+static int psnedf_schedule(struct task_struct * prev,
11374+ struct task_struct ** next,
11375+ runqueue_t * rq)
11376+{
11377+ psnedf_domain_t* pedf = local_pedf;
11378+ rt_domain_t* edf = &pedf->domain;
11379+
11380+ int out_of_time, sleep, preempt,
11381+ np, exists, rt, blocks, resched;
11382+
11383+ spin_lock(&pedf->lock);
11384+
11385+ /* sanity checking */
11386+ BUG_ON(pedf->scheduled && pedf->scheduled != prev);
11387+ BUG_ON(pedf->scheduled && !is_realtime(prev));
11388+
11389+ /* (0) Determine state */
11390+ exists = pedf->scheduled != NULL;
11391+ blocks = exists && !is_running(pedf->scheduled);
11392+ out_of_time = exists && !pedf->scheduled->time_slice;
11393+ np = exists && is_np(pedf->scheduled);
11394+ sleep = exists && get_rt_flags(pedf->scheduled) == RT_F_SLEEP;
11395+ preempt = edf_preemption_needed(edf, prev);
11396+ rt = get_rt_mode() == MODE_RT_RUN;
11397+
11398+
11399+	/* If we need to preempt, do so.
11400+ * The following checks set resched to 1 in case of special
11401+ * circumstances.
11402+ */
11403+ resched = preempt;
11404+
11405+ /* If a task blocks we have no choice but to reschedule.
11406+ */
11407+ if (blocks)
11408+ resched = 1;
11409+
11410+ /* Request a sys_exit_np() call if we would like to preempt but cannot.
11411+ * Multiple calls to request_exit_np() don't hurt.
11412+ */
11413+ if (np && (out_of_time || preempt || sleep))
11414+ request_exit_np(pedf->scheduled);
11415+
11416+ /* Any task that is preemptable and either exhausts its execution
11417+ * budget or wants to sleep completes. We may have to reschedule after
11418+ * this.
11419+ */
11420+ if (!np && (out_of_time || sleep)) {
11421+ job_completion(pedf->scheduled);
11422+ resched = 1;
11423+ }
11424+
11425+ /* Stop real-time tasks when we leave real-time mode
11426+ */
11427+ if (!rt && exists)
11428+ resched = 1;
11429+
11430+ /* The final scheduling decision. Do we need to switch for some reason?
11431+ * Switch if we are in RT mode and have no task or if we need to
11432+ * resched.
11433+ */
11434+ *next = NULL;
11435+ if ((!np || blocks) && (resched || (!exists && rt))) {
11436+ /* Take care of a previously scheduled
11437+ * job by taking it out of the Linux runqueue.
11438+ */
11439+ if (pedf->scheduled) {
11440+			/* Unlike global schedulers, which switch without
11441+			 * holding a lock, we can requeue right here since
11442+			 * no other CPU will schedule from this domain.
11443+			 */
11444+ if (!blocks)
11445+ requeue(pedf->scheduled, edf);
11446+ if (prev->array)
11447+ /* take it out of the run queue */
11448+ deactivate_task(prev, rq);
11449+ }
11450+
11451+ /* only pick tasks if we are actually in RT mode */
11452+ if (rt)
11453+ *next = __take_ready(edf);
11454+ if (*next) {
11455+ /* stick the task into the runqueue */
11456+ __activate_task(*next, rq);
11457+ set_task_cpu(*next, smp_processor_id());
11458+ }
11459+
11460+ } else
11461+ /* Only override Linux scheduler if we have a real-time task
11462+ * scheduled that needs to continue.
11463+ */
11464+ if (exists)
11465+ *next = prev;
11466+
11467+ if (*next)
11468+ set_rt_flags(*next, RT_F_RUNNING);
11469+
11470+ pedf->scheduled = *next;
11471+ spin_unlock(&pedf->lock);
11472+ return 0;
11473+}
11474+
11475+
11476+/* Prepare a task for running in RT mode.
11477+ * Enqueues the task into the master queue data structure.
11478+ * Returns:
11479+ *   -EPERM if the task is not TASK_STOPPED.
11480+ */
11481+static long psnedf_prepare_task(struct task_struct * t)
11482+{
11483+ rt_domain_t* edf = task_edf(t);
11484+ psnedf_domain_t* pedf = task_pedf(t);
11485+ unsigned long flags;
11486+
11487+ TRACE("[%d] psn edf: prepare task %d on CPU %d\n",
11488+ smp_processor_id(), t->pid, get_partition(t));
11489+ if (t->state == TASK_STOPPED) {
11490+ __setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1);
11491+
11492+ if (get_rt_mode() == MODE_RT_RUN)
11493+ /* The action is already on.
11494+ * Prepare immediate release.
11495+ */
11496+ edf_release_now(t);
11497+ /* The task should be running in the queue, otherwise signal
11498+ * code will try to wake it up with fatal consequences.
11499+ */
11500+ t->state = TASK_RUNNING;
11501+ spin_lock_irqsave(&pedf->lock, flags);
11502+ __add_release(edf, t);
11503+ spin_unlock_irqrestore(&pedf->lock, flags);
11504+ return 0;
11505+ } else
11506+ return -EPERM;
11507+}
11508+
11509+static void psnedf_wake_up_task(struct task_struct *task)
11510+{
11511+ unsigned long flags;
11512+ psnedf_domain_t* pedf = task_pedf(task);
11513+ rt_domain_t* edf = task_edf(task);
11514+
11515+ TRACE("psnedf: %d unsuspends with budget=%d\n",
11516+ task->pid, task->time_slice);
11517+
11518+ /* After fixing the litmus_controlled bug,
11519+ * this should hold again.
11520+ */
11521+ BUG_ON(in_list(&task->rt_list));
11522+
11523+ task->state = TASK_RUNNING;
11524+
11525+ /* We need to take suspensions because of semaphores into
11526+ * account! If a job resumes after being suspended due to acquiring
11527+ * a semaphore, it should never be treated as a new job release.
11528+ */
11529+ if (is_tardy(task) && get_rt_flags(task) != RT_F_EXIT_SEM) {
11530+ /* new sporadic release */
11531+ edf_release_now(task);
11532+ sched_trace_job_release(task);
11533+ }
11534+
11535+ spin_lock_irqsave(&pedf->lock, flags);
11536+ requeue(task, edf);
11537+ spin_unlock_irqrestore(&pedf->lock, flags);
11538+}
11539+
11540+static void psnedf_task_blocks(struct task_struct *t)
11541+{
11542+ BUG_ON(!is_realtime(t));
11543+ /* not really anything to do since it can only block if
11544+ * it is running, and when it is not running it is not in any
11545+ * queue anyway.
11546+ */
11547+ TRACE("task %d blocks with budget=%d\n", t->pid, t->time_slice);
11548+ BUG_ON(in_list(&t->rt_list));
11549+}
11550+
11551+
11552+/* When _tear_down is called, the task should not be in any queue any more
11553+ * as it must have blocked first. We don't have any internal state for the task,
11554+ * it is all in the task_struct.
11555+ */
11556+static long psnedf_tear_down(struct task_struct * t)
11557+{
11558+ BUG_ON(!is_realtime(t));
11559+ TRACE_TASK(t, "tear down called");
11560+ BUG_ON(t->array);
11561+ BUG_ON(in_list(&t->rt_list));
11562+ return 0;
11563+}
11564+
11565+static long psnedf_pi_block(struct pi_semaphore *sem,
11566+ struct task_struct *new_waiter)
11567+{
11568+ psnedf_domain_t* pedf;
11569+ rt_domain_t* edf;
11570+ struct task_struct* t;
11571+ int cpu = get_partition(new_waiter);
11572+
11573+ BUG_ON(!new_waiter);
11574+
11575+ if (edf_higher_prio(new_waiter, sem->hp.cpu_task[cpu])) {
11576+ TRACE_TASK(new_waiter, " boosts priority\n");
11577+ pedf = task_pedf(new_waiter);
11578+ edf = task_edf(new_waiter);
11579+
11580+ /* interrupts already disabled */
11581+ spin_lock(&pedf->lock);
11582+
11583+ /* store new highest-priority task */
11584+ sem->hp.cpu_task[cpu] = new_waiter;
11585+ if (sem->holder &&
11586+ get_partition(sem->holder) == get_partition(new_waiter)) {
11587+ /* let holder inherit */
11588+ sem->holder->rt_param.inh_task = new_waiter;
11589+ t = sem->holder;
11590+ if (in_list(&t->rt_list)) {
11591+ /* queued in domain*/
11592+ list_del(&t->rt_list);
11593+ /* readd to make priority change take place */
11594+ if (is_released(t))
11595+ __add_ready(edf, t);
11596+ else
11597+ __add_release(edf, t);
11598+ }
11599+ }
11600+
11601+ /* check if we need to reschedule */
11602+ if (edf_preemption_needed(edf, current))
11603+ preempt(pedf);
11604+
11605+ spin_unlock(&pedf->lock);
11606+ }
11607+
11608+ return 0;
11609+}
11610+
11611+static long psnedf_inherit_priority(struct pi_semaphore *sem,
11612+ struct task_struct *new_owner)
11613+{
11614+ int cpu = get_partition(new_owner);
11615+
11616+ new_owner->rt_param.inh_task = sem->hp.cpu_task[cpu];
11617+ if (sem->hp.cpu_task[cpu] && new_owner != sem->hp.cpu_task[cpu]) {
11618+ TRACE_TASK(new_owner,
11619+ "inherited priority from %s/%d\n",
11620+ sem->hp.cpu_task[cpu]->comm,
11621+ sem->hp.cpu_task[cpu]->pid);
11622+ } else
11623+ TRACE_TASK(new_owner,
11624+ "cannot inherit priority: "
11625+ "no higher priority job waits on this CPU!\n");
11626+ /* make new owner non-preemptable as required by FMLP under
11627+ * PSN-EDF.
11628+ */
11629+ make_np(new_owner);
11630+ return 0;
11631+}
11632+
11633+
11634+/* This function is called on a semaphore release, and assumes that
11635+ * the current task is also the semaphore holder.
11636+ */
11637+static long psnedf_return_priority(struct pi_semaphore *sem)
11638+{
11639+ struct task_struct* t = current;
11640+ psnedf_domain_t* pedf = task_pedf(t);
11641+ rt_domain_t* edf = task_edf(t);
11642+ int ret = 0;
11643+ int cpu = get_partition(current);
11644+
11645+
11646+ /* Find new highest-priority semaphore task
11647+ * if holder task is the current hp.cpu_task[cpu].
11648+ *
11649+ * Calling function holds sem->wait.lock.
11650+ */
11651+ if (t == sem->hp.cpu_task[cpu])
11652+ edf_set_hp_cpu_task(sem, cpu);
11653+
11654+ take_np(t);
11655+ if (current->rt_param.inh_task) {
11656+ TRACE_CUR("return priority of %s/%d\n",
11657+ current->rt_param.inh_task->comm,
11658+ current->rt_param.inh_task->pid);
11659+ spin_lock(&pedf->lock);
11660+
11661+ /* Reset inh_task to NULL. */
11662+ current->rt_param.inh_task = NULL;
11663+
11664+ /* check if we need to reschedule */
11665+ if (edf_preemption_needed(edf, current))
11666+ preempt(pedf);
11667+
11668+ spin_unlock(&pedf->lock);
11669+ } else
11670+ TRACE_CUR(" no priority to return %p\n", sem);
11671+
11672+ return ret;
11673+}
11674+
11675+
11676+static int psnedf_mode_change(int new_mode)
11677+{
11678+ int cpu;
11679+
11680+ if (new_mode == MODE_RT_RUN)
11681+ for_each_online_cpu(cpu) {
11682+ spin_lock(&remote_pedf(cpu)->lock);
11683+ __rerelease_all(remote_edf(cpu), edf_release_at);
11684+ spin_unlock(&remote_pedf(cpu)->lock);
11685+ }
11686+
11687+ TRACE("[%d] psn edf: mode changed to %d\n",
11688+ smp_processor_id(), new_mode);
11689+ return 0;
11690+}
11691+
11692+
11693+/* Plugin object */
11694+static sched_plugin_t s_plugin __cacheline_aligned_in_smp = {
11695+ .ready_to_use = 0
11696+};
11697+
11698+
11699+/*
11700+ * Plugin initialization code.
11701+ */
11702+#define INIT_SCHED_PLUGIN (struct sched_plugin) {\
11703+ .plugin_name = "PSN-EDF",\
11704+ .ready_to_use = 1,\
11705+ .scheduler_tick = psnedf_scheduler_tick,\
11706+ .prepare_task = psnedf_prepare_task,\
11707+ .sleep_next_period = edf_sleep_next_period,\
11708+ .tear_down = psnedf_tear_down,\
11709+ .schedule = psnedf_schedule,\
11710+ .mode_change = psnedf_mode_change,\
11711+ .wake_up_task = psnedf_wake_up_task,\
11712+ .task_blocks = psnedf_task_blocks, \
11713+ .pi_block = psnedf_pi_block, \
11714+ .inherit_priority = psnedf_inherit_priority, \
11715+ .return_priority = psnedf_return_priority \
11716+}
11717+
11718+
11719+sched_plugin_t *__init init_psn_edf_plugin(void)
11720+{
11721+ int i;
11722+
11723+ if (!s_plugin.ready_to_use)
11724+ {
11725+ for (i = 0; i < NR_CPUS; i++)
11726+ {
11727+ psnedf_domain_init(remote_pedf(i),
11728+ psnedf_check_resched, i);
11729+ printk("PSN-EDF: CPU partition %d initialized.\n", i);
11730+ }
11731+ s_plugin = INIT_SCHED_PLUGIN;
11732+ }
11733+ return &s_plugin;
11734+}
11735+
11736+
11737+
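The preemption decisions in this plugin ultimately reduce to the EDF priority test supplied by edf_common.c. A stripped-down sketch of that rule, for illustration only -- the real edf_higher_prio() also handles priority inheritance, tie-breaking, and wrap-safe time comparisons:

/* Illustrative sketch: earlier absolute deadline wins, and a CPU with no
 * scheduled real-time task can always be "preempted".
 */
static int sketch_edf_higher_prio(struct task_struct *a,
				  struct task_struct *b)
{
	if (!a)
		return 0;
	if (!b)
		return 1;
	return a->rt_param.times.deadline < b->rt_param.times.deadline;
}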
11738diff --git a/kernel/sched_trace.c b/kernel/sched_trace.c
11739new file mode 100644
11740index 0000000..0213ca7
11741--- /dev/null
11742+++ b/kernel/sched_trace.c
11743@@ -0,0 +1,755 @@
11744+/* sched_trace.c -- record scheduling events to a byte stream.
11745+ *
11746+ * TODO: Move ring buffer to a lockfree implementation.
11747+ */
11748+
11749+#include <linux/spinlock.h>
11750+#include <linux/fs.h>
11751+#include <linux/cdev.h>
11752+#include <asm/semaphore.h>
11753+#include <asm/uaccess.h>
11754+#include <linux/module.h>
11755+
11756+#include <linux/queuelock.h>
11757+#include <linux/sched_trace.h>
11758+#include <linux/litmus.h>
11759+
11760+
11761+typedef struct {
11762+ /* guard read and write pointers */
11763+ spinlock_t lock;
11764+ /* guard against concurrent freeing of buffer */
11765+ rwlock_t del_lock;
11766+
11767+ /* memory allocated for ring buffer */
11768+ unsigned long order;
11769+ char* buf;
11770+ char* end;
11771+
11772+	/* Read/write pointers. They may not cross.
11773+	 * They point to the position of the next write and
11774+	 * the last read.
11775+	 */
11776+ char* writep;
11777+ char* readp;
11778+
11779+} ring_buffer_t;
11780+
11781+#define EMPTY_RING_BUFFER { \
11782+ .lock = SPIN_LOCK_UNLOCKED, \
11783+ .del_lock = RW_LOCK_UNLOCKED, \
11784+ .buf = NULL, \
11785+ .end = NULL, \
11786+ .writep = NULL, \
11787+ .readp = NULL \
11788+}
11789+
11790+void rb_init(ring_buffer_t* buf)
11791+{
11792+ *buf = (ring_buffer_t) EMPTY_RING_BUFFER;
11793+}
11794+
11795+int rb_alloc_buf(ring_buffer_t* buf, unsigned long order)
11796+{
11797+ unsigned long flags;
11798+ int error = 0;
11799+ char *mem;
11800+
11801+ /* do memory allocation while not atomic */
11802+ mem = (char *) __get_free_pages(GFP_KERNEL, order);
11803+ if (!mem)
11804+ return -ENOMEM;
11805+ write_lock_irqsave(&buf->del_lock, flags);
11806+ BUG_ON(buf->buf);
11807+ buf->buf = mem;
11808+ buf->end = buf->buf + PAGE_SIZE * (1 << order) - 1;
11809+ memset(buf->buf, 0xff, buf->end - buf->buf);
11810+ buf->order = order;
11811+ buf->writep = buf->buf + 1;
11812+ buf->readp = buf->buf;
11813+ write_unlock_irqrestore(&buf->del_lock, flags);
11814+ return error;
11815+}
11816+
11817+int rb_free_buf(ring_buffer_t* buf)
11818+{
11819+ unsigned long flags;
11820+ int error = 0;
11821+ write_lock_irqsave(&buf->del_lock, flags);
11822+ BUG_ON(!buf->buf);
11823+ free_pages((unsigned long) buf->buf, buf->order);
11824+ buf->buf = NULL;
11825+ buf->end = NULL;
11826+ buf->writep = NULL;
11827+ buf->readp = NULL;
11828+ write_unlock_irqrestore(&buf->del_lock, flags);
11829+ return error;
11830+}
11831+
11832+/* Assumption: concurrent writes are serialized externally
11833+ *
11834+ * Will only succeed if there is enough space for all len bytes.
11835+ */
11836+int rb_put(ring_buffer_t* buf, char* mem, size_t len)
11837+{
11838+ unsigned long flags;
11839+ char* r , *w;
11840+ int error = 0;
11841+ read_lock_irqsave(&buf->del_lock, flags);
11842+ if (!buf->buf) {
11843+ error = -ENODEV;
11844+ goto out;
11845+ }
11846+ spin_lock(&buf->lock);
11847+ r = buf->readp;
11848+ w = buf->writep;
11849+ spin_unlock(&buf->lock);
11850+ if (r < w && buf->end - w >= len - 1) {
11851+ /* easy case: there is enough space in the buffer
11852+		 * to write it in one continuous chunk */
11853+ memcpy(w, mem, len);
11854+ w += len;
11855+ if (w > buf->end)
11856+ /* special case: fit exactly into buffer
11857+ * w is now buf->end + 1
11858+ */
11859+ w = buf->buf;
11860+ } else if (w < r && r - w >= len) { /* >= len because may not cross */
11861+		/* we are constrained by the read pointer but there
11862+		 * is enough space
11863+ */
11864+ memcpy(w, mem, len);
11865+ w += len;
11866+ } else if (r <= w && buf->end - w < len - 1) {
11867+ /* the wrap around case: there may or may not be space */
11868+ if ((buf->end - w) + (r - buf->buf) >= len - 1) {
11869+ /* copy chunk that fits at the end */
11870+ memcpy(w, mem, buf->end - w + 1);
11871+ mem += buf->end - w + 1;
11872+ len -= (buf->end - w + 1);
11873+ w = buf->buf;
11874+ /* copy the rest */
11875+ memcpy(w, mem, len);
11876+ w += len;
11877+ }
11878+ else
11879+ error = -ENOMEM;
11880+ } else {
11881+ error = -ENOMEM;
11882+ }
11883+ if (!error) {
11884+ spin_lock(&buf->lock);
11885+ buf->writep = w;
11886+ spin_unlock(&buf->lock);
11887+ }
11888+ out:
11889+ read_unlock_irqrestore(&buf->del_lock, flags);
11890+ return error;
11891+}
11892+
11893+/* Assumption: concurrent reads are serialized externally */
11894+int rb_get(ring_buffer_t* buf, char* mem, size_t len)
11895+{
11896+ unsigned long flags;
11897+ char* r , *w;
11898+ int error = 0;
11899+ read_lock_irqsave(&buf->del_lock, flags);
11900+ if (!buf->buf) {
11901+ error = -ENODEV;
11902+ goto out;
11903+ }
11904+ spin_lock(&buf->lock);
11905+ r = buf->readp;
11906+ w = buf->writep;
11907+ spin_unlock(&buf->lock);
11908+
11909+ if (w <= r && buf->end - r >= len) {
11910+ /* easy case: there is enough data in the buffer
11911+ * to get it in one chunk*/
11912+ memcpy(mem, r + 1, len);
11913+ r += len;
11914+ error = len;
11915+
11916+ } else if (r + 1 < w && w - r - 1 >= len) {
11917+ /* we are constrained by the write pointer but
11918+ * there is enough data
11919+ */
11920+ memcpy(mem, r + 1, len);
11921+ r += len;
11922+ error = len;
11923+
11924+ } else if (r + 1 < w && w - r - 1 < len) {
11925+		/* we are constrained by the write pointer and
11926+		 * there is not enough data
11927+ */
11928+ memcpy(mem, r + 1, w - r - 1);
11929+ error = w - r - 1;
11930+ r += w - r - 1;
11931+
11932+ } else if (w <= r && buf->end - r < len) {
11933+ /* the wrap around case: there may or may not be enough data
11934+ * first let's get what is available
11935+ */
11936+ memcpy(mem, r + 1, buf->end - r);
11937+ error += (buf->end - r);
11938+ mem += (buf->end - r);
11939+ len -= (buf->end - r);
11940+ r += (buf->end - r);
11941+
11942+ if (w > buf->buf) {
11943+ /* there is more to get */
11944+ r = buf->buf - 1;
11945+ if (w - r >= len) {
11946+ /* plenty */
11947+ memcpy(mem, r + 1, len);
11948+ error += len;
11949+ r += len;
11950+ } else {
11951+ memcpy(mem, r + 1, w - r - 1);
11952+ error += w - r - 1;
11953+ r += w - r - 1;
11954+ }
11955+ }
11956+ } /* nothing available */
11957+
11958+ if (error > 0) {
11959+ spin_lock(&buf->lock);
11960+ buf->readp = r;
11961+ spin_unlock(&buf->lock);
11962+ }
11963+ out:
11964+ read_unlock_irqrestore(&buf->del_lock, flags);
11965+ return error;
11966+}
11967+
11968+
11969+
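To make the pointer arithmetic above concrete, a small worked example with a hypothetical 8-byte buffer (not code from this patch):

/* Assume an 8-byte buffer buf[0..7], so end = &buf[7].
 *
 *   rb_alloc_buf():    readp = &buf[0], writep = &buf[1]
 *   rb_put(4 bytes):   data lands in buf[1..4],             writep -> &buf[5]
 *   rb_get(3 bytes):   copies buf[1..3] (readp + 1 onward),  readp -> &buf[3]
 *   rb_put(5 bytes):   buf[5..7] take 3 bytes, the write wraps, and
 *                      buf[0..1] take the remaining 2,      writep -> &buf[2]
 *
 * At every step the two pointers stay on the same side of each other,
 * as the "may not cross" rule requires.
 */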
11970+/******************************************************************************/
11971+/* DEVICE FILE DRIVER */
11972+/******************************************************************************/
11973+
11974+
11975+
11976+/* Allocate a buffer of about 1 MB per CPU
11977+ * (BUFFER_ORDER 8 = 256 pages, i.e. 1 MB with 4 KB pages).
11978+ */
11979+#define BUFFER_ORDER 8
11980+
11981+typedef struct {
11982+ ring_buffer_t buf;
11983+ atomic_t reader_cnt;
11984+ struct semaphore reader_mutex;
11985+} trace_buffer_t;
11986+
11987+
11988+/* This does not initialize the semaphore!! */
11989+
11990+#define EMPTY_TRACE_BUFFER \
11991+ { .buf = EMPTY_RING_BUFFER, .reader_cnt = ATOMIC_INIT(0)}
11992+
11993+static DEFINE_PER_CPU(trace_buffer_t, trace_buffer);
11994+
11995+#ifdef CONFIG_SCHED_DEBUG_TRACE
11996+static spinlock_t log_buffer_lock = SPIN_LOCK_UNLOCKED;
11997+#endif
11998+static trace_buffer_t log_buffer = EMPTY_TRACE_BUFFER;
11999+
12000+static void init_buffers(void)
12001+{
12002+ int i;
12003+
12004+ for (i = 0; i < NR_CPUS; i++) {
12005+ rb_init(&per_cpu(trace_buffer, i).buf);
12006+ init_MUTEX(&per_cpu(trace_buffer, i).reader_mutex);
12007+ atomic_set(&per_cpu(trace_buffer, i).reader_cnt, 0);
12008+ }
12009+ /* only initialize the mutex, the rest was initialized as part
12010+ * of the static initialization macro
12011+ */
12012+ init_MUTEX(&log_buffer.reader_mutex);
12013+}
12014+
12015+static int trace_release(struct inode *in, struct file *filp)
12016+{
12017+ int error = -EINVAL;
12018+ trace_buffer_t* buf = filp->private_data;
12019+
12020+ BUG_ON(!filp->private_data);
12021+
12022+ if (down_interruptible(&buf->reader_mutex)) {
12023+ error = -ERESTARTSYS;
12024+ goto out;
12025+ }
12026+
12027+ /* last release must deallocate buffers */
12028+ if (atomic_dec_return(&buf->reader_cnt) == 0) {
12029+ error = rb_free_buf(&buf->buf);
12030+ }
12031+
12032+ up(&buf->reader_mutex);
12033+ out:
12034+ return error;
12035+}
12036+
12037+static ssize_t trace_read(struct file *filp, char __user *to, size_t len,
12038+ loff_t *f_pos)
12039+{
12040+ /* we ignore f_pos, this is strictly sequential */
12041+
12042+ ssize_t error = -EINVAL;
12043+ char* mem;
12044+ trace_buffer_t *buf = filp->private_data;
12045+
12046+ if (down_interruptible(&buf->reader_mutex)) {
12047+ error = -ERESTARTSYS;
12048+ goto out;
12049+ }
12050+
12051+ if (len > 64 * 1024)
12052+ len = 64 * 1024;
12053+ mem = kmalloc(len, GFP_KERNEL);
12054+ if (!mem) {
12055+ error = -ENOMEM;
12056+ goto out_unlock;
12057+ }
12058+
12059+ error = rb_get(&buf->buf, mem, len);
12060+ while (!error) {
12061+ set_current_state(TASK_INTERRUPTIBLE);
12062+ schedule_timeout(110);
12063+ if (signal_pending(current))
12064+ error = -ERESTARTSYS;
12065+ else
12066+ error = rb_get(&buf->buf, mem, len);
12067+ }
12068+
12069+ if (error > 0 && copy_to_user(to, mem, error))
12070+ error = -EFAULT;
12071+
12072+ kfree(mem);
12073+ out_unlock:
12074+ up(&buf->reader_mutex);
12075+ out:
12076+ return error;
12077+}
12078+
12079+
12080+/* trace_open - Open one of the per-CPU sched_trace buffers.
12081+ */
12082+static int trace_open(struct inode *in, struct file *filp)
12083+{
12084+ int error = -EINVAL;
12085+ int cpu = MINOR(in->i_rdev);
12086+ trace_buffer_t* buf;
12087+
12088+ if (!cpu_online(cpu)) {
12089+ printk(KERN_WARNING "sched trace: "
12090+ "CPU #%d is not online. (open failed)\n", cpu);
12091+ error = -ENODEV;
12092+ goto out;
12093+ }
12094+
12095+ buf = &per_cpu(trace_buffer, cpu);
12096+
12097+ if (down_interruptible(&buf->reader_mutex)) {
12098+ error = -ERESTARTSYS;
12099+ goto out;
12100+ }
12101+
12102+ /* first open must allocate buffers */
12103+ if (atomic_inc_return(&buf->reader_cnt) == 1) {
12104+ if ((error = rb_alloc_buf(&buf->buf, BUFFER_ORDER)))
12105+ {
12106+ atomic_dec(&buf->reader_cnt);
12107+ goto out_unlock;
12108+ }
12109+ }
12110+
12111+ error = 0;
12112+ filp->private_data = buf;
12113+
12114+ out_unlock:
12115+ up(&buf->reader_mutex);
12116+ out:
12117+ return error;
12118+}
12119+
12120+/* log_open - open the global log message ring buffer.
12121+ */
12122+static int log_open(struct inode *in, struct file *filp)
12123+{
12124+ int error = -EINVAL;
12125+ trace_buffer_t* buf;
12126+
12127+ buf = &log_buffer;
12128+
12129+ if (down_interruptible(&buf->reader_mutex)) {
12130+ error = -ERESTARTSYS;
12131+ goto out;
12132+ }
12133+
12134+ /* first open must allocate buffers */
12135+ if (atomic_inc_return(&buf->reader_cnt) == 1) {
12136+ if ((error = rb_alloc_buf(&buf->buf, BUFFER_ORDER)))
12137+ {
12138+ atomic_dec(&buf->reader_cnt);
12139+ goto out_unlock;
12140+ }
12141+ }
12142+
12143+ error = 0;
12144+ filp->private_data = buf;
12145+
12146+ out_unlock:
12147+ up(&buf->reader_mutex);
12148+ out:
12149+ return error;
12150+}
12151+
12152+/******************************************************************************/
12153+/* Device Registration */
12154+/******************************************************************************/
12155+
12156+/* the major numbers are from the unassigned/local use block
12157+ *
12158+ * This should be converted to dynamic allocation at some point...
12159+ */
12160+#define TRACE_MAJOR 250
12161+#define LOG_MAJOR 251
12162+
12163+/* trace_fops - The file operations for accessing the per-CPU scheduling event
12164+ * trace buffers.
12165+ */
12166+struct file_operations trace_fops = {
12167+ .owner = THIS_MODULE,
12168+ .open = trace_open,
12169+ .release = trace_release,
12170+ .read = trace_read,
12171+};
12172+
12173+/* log_fops - The file operations for accessing the global LITMUS log message
12174+ * buffer.
12175+ *
12176+ * Except for opening the device file it uses the same operations as trace_fops.
12177+ */
12178+struct file_operations log_fops = {
12179+ .owner = THIS_MODULE,
12180+ .open = log_open,
12181+ .release = trace_release,
12182+ .read = trace_read,
12183+};
12184+
12185+static int __init register_buffer_dev(const char* name,
12186+ struct file_operations* fops,
12187+ int major, int count)
12188+{
12189+ dev_t trace_dev;
12190+ struct cdev *cdev;
12191+ int error = 0;
12192+
12193+ trace_dev = MKDEV(major, 0);
12194+ error = register_chrdev_region(trace_dev, count, name);
12195+ if (error)
12196+ {
12197+ printk(KERN_WARNING "sched trace: "
12198+ "Could not register major/minor number %d\n", major);
12199+ return error;
12200+ }
12201+ cdev = cdev_alloc();
12202+ if (!cdev) {
12203+ printk(KERN_WARNING "sched trace: "
12204+ "Could not get a cdev for %s.\n", name);
12205+ return -ENOMEM;
12206+ }
12207+ cdev->owner = THIS_MODULE;
12208+ cdev->ops = fops;
12209+ error = cdev_add(cdev, trace_dev, count);
12210+ if (error) {
12211+ printk(KERN_WARNING "sched trace: "
12212+ "add_cdev failed for %s.\n", name);
12213+ return -ENOMEM;
12214+ }
12215+ return error;
12216+
12217+}
12218+
12219+static int __init init_sched_trace(void)
12220+{
12221+ int error1 = 0, error2 = 0;
12222+
12223+ printk("Initializing scheduler trace device\n");
12224+ init_buffers();
12225+
12226+ error1 = register_buffer_dev("schedtrace", &trace_fops,
12227+ TRACE_MAJOR, NR_CPUS);
12228+
12229+ error2 = register_buffer_dev("litmus_log", &log_fops,
12230+ LOG_MAJOR, 1);
12231+ if (error1 || error2)
12232+ return min(error1, error2);
12233+ else
12234+ return 0;
12235+}
12236+
12237+module_init(init_sched_trace);
12238+
12239+/******************************************************************************/
12240+/* KERNEL API */
12241+/******************************************************************************/
12242+
12243+/* The per-CPU LITMUS log buffer. Don't put it on the stack, it is too big for
12244+ * that and the kernel gets very picky with nested interrupts and small stacks.
12245+ */
12246+
12247+#ifdef CONFIG_SCHED_DEBUG_TRACE
12248+
12249+#define MSG_SIZE 255
12250+static DEFINE_PER_CPU(char[MSG_SIZE], fmt_buffer);
12251+
12252+/* sched_trace_log_message - This is the only function that accesses the
12253+ * log buffer inside the kernel for writing.
12254+ * Concurrent access to it is serialized via the
12255+ * log_buffer_lock.
12256+ *
12257+ * The maximum length of a formatted message is 255.
12258+ */
12259+void sched_trace_log_message(const char* fmt, ...)
12260+{
12261+ unsigned long flags;
12262+ va_list args;
12263+ size_t len;
12264+ char* buf;
12265+
12266+ va_start(args, fmt);
12267+ local_irq_save(flags);
12268+
12269+ /* format message */
12270+ buf = __get_cpu_var(fmt_buffer);
12271+ len = vscnprintf(buf, MSG_SIZE, fmt, args);
12272+
12273+ spin_lock(&log_buffer_lock);
12274+ /* Don't copy the trailing null byte, we don't want null bytes
12275+ * in a text file.
12276+ */
12277+ rb_put(&log_buffer.buf, buf, len);
12278+ spin_unlock(&log_buffer_lock);
12279+
12280+ local_irq_restore(flags);
12281+ va_end(args);
12282+}
12283+
12284+#endif
12285+
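In-kernel callers would use this function directly, or more likely via the TRACE()/TRACE_TASK() macros seen throughout this patch, which presumably expand to calls of it when CONFIG_SCHED_DEBUG_TRACE is set. An illustrative call (format string and arguments are arbitrary):

	/* Illustrative only; any printf-style message works. */
	sched_trace_log_message("cpu %d: rt mode is now %d\n",
	                        smp_processor_id(), get_rt_mode());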
12286+#ifdef CONFIG_SCHED_TASK_TRACE
12287+
12288+static inline void __put_trace(char* mem, size_t size)
12289+{
12290+ trace_buffer_t* buf = &__get_cpu_var(trace_buffer);
12291+ rb_put(&buf->buf, mem, size);
12292+}
12293+
12294+#define put_trace(obj) \
12295+ if (get_rt_mode() == MODE_RT_RUN) \
12296+ __put_trace((char *) &obj, sizeof(obj))
12297+
12298+#define header(rec, type) \
12299+{ \
12300+ rec.header.trace = type; \
12301+ rec.header.timestamp = sched_clock(); \
12302+ rec.header.size = sizeof(rec); \
12303+}
12304+
12305+#define tinfo(info, t) \
12306+{ \
12307+ info.is_rt = is_realtime(t); \
12308+ info.is_server = 0; \
12309+ info.class = get_class(t); \
12310+ info.budget = (t)->time_slice; \
12311+ info.pid = (t)->pid; \
12312+ info.deadline = (t)->rt_param.times.deadline; \
12313+}
12314+
12315+#define rtinfo(info, t) \
12316+{ \
12317+ info.wcet = get_exec_cost(t); \
12318+ info.period = get_rt_period(t); \
12319+}
12320+
12321+void sched_trace_scheduler_invocation(void)
12322+{
12323+ invocation_record_t rec;
12324+ header(rec, ST_INVOCATION);
12325+ rec.flags = current->flags;
12326+ put_trace(rec);
12327+}
12328+
12329+void sched_trace_task_arrival(struct task_struct *t)
12330+{
12331+ arrival_record_t rec;
12332+ header(rec, ST_ARRIVAL);
12333+ tinfo(rec.task, t);
12334+ put_trace(rec);
12335+}
12336+
12337+
12338+void sched_trace_task_departure(struct task_struct *t)
12339+{
12340+ departure_record_t rec;
12341+ header(rec, ST_DEPARTURE);
12342+ tinfo(rec.task, t);
12343+ put_trace(rec);
12344+}
12345+
12346+void sched_trace_task_preemption(struct task_struct *t, struct task_struct* by)
12347+{
12348+ preemption_record_t rec;
12349+ header(rec, ST_PREEMPTION);
12350+ tinfo(rec.task, t);
12351+ tinfo(rec.by, by);
12352+ put_trace(rec);
12353+}
12354+
12355+
12356+void sched_trace_task_scheduled(struct task_struct *t)
12357+{
12358+ scheduled_record_t rec;
12359+ header(rec, ST_SCHEDULED);
12360+ tinfo(rec.task, t);
12361+ put_trace(rec);
12362+}
12363+
12364+
12365+void sched_trace_job_release(struct task_struct *t)
12366+{
12367+ release_record_t rec;
12368+ header(rec, ST_JOB_RELEASE);
12369+ tinfo(rec.task, t);
12370+ rtinfo(rec, t);
12371+ put_trace(rec);
12372+}
12373+
12374+void sched_trace_job_completion(struct task_struct *t)
12375+{
12376+ completion_record_t rec;
12377+ header(rec, ST_JOB_COMPLETION);
12378+ tinfo(rec.task, t);
12379+ rtinfo(rec, t);
12380+ rec.tardiness = jiffies - t->rt_param.times.deadline;
12381+ rec.job_no = t->rt_param.times.job_no;
12382+ TRACE_TASK(t, "AAATardiness : %d\n", rec.tardiness);
12383+ put_trace(rec);
12384+}
12385+
12386+
12387+void sched_trace_server_scheduled(int id, task_class_t class,
12388+ unsigned int budget, jiffie_t deadline)
12389+{
12390+ scheduled_record_t rec;
12391+ header(rec, ST_SCHEDULED);
12392+ rec.task.pid = id;
12393+ rec.task.is_rt = 1;
12394+ rec.task.is_server = 1;
12395+ rec.task.class = class;
12396+ rec.task.budget = budget;
12397+ rec.task.deadline = deadline;
12398+ put_trace(rec);
12399+}
12400+
12401+void sched_trace_server_release(int id, unsigned int wcet,
12402+ unsigned int period, task_class_t class)
12403+{
12404+ release_record_t rec;
12405+ header(rec, ST_JOB_RELEASE);
12406+ rec.task.pid = id;
12407+ rec.task.is_rt = 1;
12408+ rec.task.is_server = 1;
12409+ rec.task.class = class;
12410+ rec.task.budget = wcet;
12411+ rec.period = period;
12412+ rec.wcet = wcet;
12413+ put_trace(rec);
12414+}
12415+
12416+void sched_trace_server_completion(int id, unsigned int budget,
12417+ jiffie_t deadline, task_class_t class)
12418+{
12419+ completion_record_t rec;
12420+ header(rec, ST_JOB_COMPLETION);
12421+ rec.task.pid = id;
12422+ rec.task.is_rt = 1;
12423+ rec.task.is_server = 1;
12424+ rec.task.class = class;
12425+ rec.task.budget = budget;
12426+ rec.task.deadline = deadline;
12427+ rec.period = 0;
12428+ rec.tardiness = jiffies - deadline;
12429+ put_trace(rec);
12430+
12431+}
12432+
12433+void sched_trace_capacity_release(struct task_struct *t)
12434+{
12435+ cap_release_record_t rec;
12436+ header(rec, ST_CAPACITY_RELEASE);
12437+ tinfo(rec.task, t);
12438+ put_trace(rec);
12439+}
12440+
12441+void sched_trace_capacity_allocation(struct task_struct *t, u16 budget, u32 deadline,
12442+ pid_t donor)
12443+{
12444+ cap_allocation_record_t rec;
12445+ header(rec, ST_CAPACITY_ALLOCATION);
12446+ tinfo(rec.task, t);
12447+ rec.donor = donor;
12448+ rec.budget = budget;
12449+ rec.deadline = deadline;
12450+ put_trace(rec);
12451+}
12452+
12453+void sched_trace_capacity_alloc_srv(pid_t srv, u32 srv_dl, task_class_t cls,
12454+ u16 srv_budget,
12455+ u16 budget, u32 deadline, pid_t donor)
12456+{
12457+ cap_allocation_record_t rec;
12458+ header(rec, ST_CAPACITY_ALLOCATION);
12459+ rec.task.pid = srv;
12460+ rec.task.is_rt = 1;
12461+ rec.task.is_server = 1;
12462+ rec.task.class = cls;
12463+ rec.task.budget = srv_budget;
12464+ rec.task.deadline = srv_dl;
12465+ rec.donor = donor;
12466+ rec.budget = budget;
12467+ rec.deadline = deadline;
12468+ put_trace(rec);
12469+}
12470+
12471+void sched_trace_service_level_change(struct task_struct *t,
12472+ unsigned int from,
12473+ unsigned int to)
12474+{
12475+ service_level_change_record_t rec;
12476+ header(rec, ST_SERVICE_LEVEL_CHANGE);
12477+ tinfo(rec.task, t);
12478+ rec.to = to;
12479+ rec.from = from;
12480+ rec.new_level =
12481+ t->rt_param.service_level[to];
12482+ rec.old_level =
12483+ t->rt_param.service_level[from];
12484+ put_trace(rec);
12485+}
12486+
12487+void sched_trace_weight_error(struct task_struct* t, fp_t actual)
12488+{
12489+ weight_error_record_t rec;
12490+ header(rec, ST_WEIGHT_ERROR);
12491+ rec.task = t->pid;
12492+ rec.actual = actual;
12493+ rec.estimate = get_est_weight(t);
12494+ put_trace(rec);
12495+}
12496+
12497+
12498+#endif
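A plausible user-space consumer of the global log device registered above (LOG_MAJOR 251, minor 0). The /dev/litmus_log path and the mknod step are assumptions; only the major and minor numbers come from the code:

/* Sketch: drain the LITMUS log to stdout.
 * Assumes a node created beforehand with:  mknod /dev/litmus_log c 251 0
 */
#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd = open("/dev/litmus_log", O_RDONLY);

	if (fd < 0) {
		perror("open /dev/litmus_log");
		return 1;
	}
	/* trace_read() above blocks (re-polling every 110 jiffies) until
	 * data is available, so this loop simply streams messages. */
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stdout);
	close(fd);
	return 0;
}

The per-CPU schedtrace devices (major 250, one minor per CPU) could be read the same way.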
12499diff --git a/kernel/timer.c b/kernel/timer.c
12500index c2a8ccf..77a1b6b 100644
12501--- a/kernel/timer.c
12502+++ b/kernel/timer.c
12503@@ -737,6 +737,27 @@ static inline s64 __get_nsec_offset(void)
12504 return ns_offset;
12505 }
12506
12507+/* Non-static, non-inline, public version of the function above.
12508+ * It's up to the caller to decide how to use it; no locking or
12509+ * consistency guarantees are made here.
12510+ */
12511+s64 get_nsec_offset(void)
12512+{
12513+ cycle_t cycle_now, cycle_delta;
12514+ s64 ns_offset;
12515+
12516+ /* read clocksource: */
12517+ cycle_now = clocksource_read(clock);
12518+
12519+ /* calculate the delta since the last update_wall_time: */
12520+ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
12521+
12522+ /* convert to nanoseconds: */
12523+ ns_offset = cyc2ns(clock, cycle_delta);
12524+
12525+ return ns_offset;
12526+}
12527+
12528 /**
12529 * __get_realtime_clock_ts - Returns the time of day in a timespec
12530 * @ts: pointer to the timespec to be set
12531@@ -789,6 +810,7 @@ void do_gettimeofday(struct timeval *tv)
12532 }
12533
12534 EXPORT_SYMBOL(do_gettimeofday);
12535+
12536 /**
12537 * do_settimeofday - Sets the time of day
12538 * @tv: pointer to the timespec variable containing the new time
12539diff --git a/kernel/trace.c b/kernel/trace.c
12540new file mode 100644
12541index 0000000..6119574
12542--- /dev/null
12543+++ b/kernel/trace.c
12544@@ -0,0 +1,302 @@
12545+#include <linux/fs.h>
12546+#include <linux/cdev.h>
12547+#include <asm/semaphore.h>
12548+#include <asm/uaccess.h>
12549+#include <linux/module.h>
12550+
12551+#include <linux/trace.h>
12552+
12553+/******************************************************************************/
12554+/* Allocation */
12555+/******************************************************************************/
12556+
12557+struct ft_buffer* trace_ts_buf = NULL;
12558+
12559+static unsigned int ts_seq_no = 0;
12560+
12561+feather_callback void save_timestamp(unsigned long event)
12562+{
12563+ unsigned int seq_no = fetch_and_inc((int *) &ts_seq_no);
12564+ struct timestamp *ts;
12565+ if (ft_buffer_start_write(trace_ts_buf, (void**) &ts)) {
12566+ ts->event = event;
12567+ ts->timestamp = ft_read_tsc();
12568+ ts->seq_no = seq_no;
12569+ ts->cpu = raw_smp_processor_id();
12570+ ft_buffer_finish_write(trace_ts_buf, ts);
12571+ }
12572+}
12573+
12574+static struct ft_buffer* alloc_ft_buffer(unsigned int count, size_t size)
12575+{
12576+ struct ft_buffer* buf;
12577+ size_t total = (size + 1) * count;
12578+ char* mem;
12579+ int order = 0, pages = 1;
12580+
12581+ buf = kmalloc(sizeof(struct ft_buffer), GFP_KERNEL);
12582+ if (!buf)
12583+ return NULL;
12584+
12585+ total = (total / PAGE_SIZE) + (total % PAGE_SIZE != 0);
12586+ while (pages < total) {
12587+ order++;
12588+ pages *= 2;
12589+ }
12590+
12591+ mem = (char*) __get_free_pages(GFP_KERNEL, order);
12592+ if (!mem) {
12593+ kfree(buf);
12594+ return NULL;
12595+ }
12596+
12597+ if (!init_ft_buffer(buf, count, size,
12598+ mem + (count * size), /* markers at the end */
12599+ mem)) { /* buffer objects */
12600+ free_pages((unsigned long) mem, order);
12601+ kfree(buf);
12602+ return NULL;
12603+ }
12604+ return buf;
12605+}
12606+
12607+static void free_ft_buffer(struct ft_buffer* buf)
12608+{
12609+ int order = 0, pages = 1;
12610+ size_t total;
12611+
12612+ if (buf) {
12613+ total = (buf->slot_size + 1) * buf->slot_count;
12614+ total = (total / PAGE_SIZE) + (total % PAGE_SIZE != 0);
12615+ while (pages < total) {
12616+ order++;
12617+ pages *= 2;
12618+ }
12619+ free_pages((unsigned long) buf->buffer_mem, order);
12620+ kfree(buf);
12621+ }
12622+}
12623+
12624+
12625+/******************************************************************************/
12626+/* DEVICE FILE DRIVER */
12627+/******************************************************************************/
12628+
12629+#define NO_TIMESTAMPS 262144
12630+
12631+static DECLARE_MUTEX(feather_lock);
12632+static int use_count = 0;
12633+
12634+static int trace_release(struct inode *in, struct file *filp)
12635+{
12636+ int err = -EINVAL;
12637+
12638+ if (down_interruptible(&feather_lock)) {
12639+ err = -ERESTARTSYS;
12640+ goto out;
12641+ }
12642+
12643+ printk(KERN_ALERT "%s/%d disconnects from feather trace device. "
12644+ "use_count=%d\n",
12645+ current->comm, current->pid, use_count);
12646+
12647+ if (use_count == 1) {
12648+ /* disable events */
12649+ ft_disable_all_events();
12650+
12651+ /* wait for any pending events to complete */
12652+ set_current_state(TASK_UNINTERRUPTIBLE);
12653+ schedule_timeout(HZ);
12654+
12655+ printk(KERN_ALERT "Failed trace writes: %u\n",
12656+ trace_ts_buf->failed_writes);
12657+
12658+ free_ft_buffer(trace_ts_buf);
12659+ trace_ts_buf = NULL;
12660+ }
12661+
12662+ use_count--;
12663+ up(&feather_lock);
12664+out:
12665+ return err;
12666+}
12667+
12668+
12669+static ssize_t trace_read(struct file *filp, char __user *to, size_t len,
12670+ loff_t *f_pos)
12671+{
12672+ /* we ignore f_pos, this is strictly sequential */
12673+ ssize_t error = 0;
12674+ struct timestamp ts;
12675+
12676+ if (down_interruptible(&feather_lock)) {
12677+ error = -ERESTARTSYS;
12678+ goto out;
12679+ }
12680+
12681+
12682+ while (len >= sizeof(struct timestamp)) {
12683+ if (ft_buffer_read(trace_ts_buf, &ts)) {
12684+ if (copy_to_user(to, &ts, sizeof(struct timestamp))) {
12685+ error = -EFAULT;
12686+ break;
12687+ } else {
12688+ len -= sizeof(struct timestamp);
12689+ to += sizeof(struct timestamp);
12690+ error += sizeof(struct timestamp);
12691+ }
12692+ } else {
12693+ set_current_state(TASK_INTERRUPTIBLE);
12694+ schedule_timeout(50);
12695+ if (signal_pending(current)) {
12696+ error = -ERESTARTSYS;
12697+ break;
12698+ }
12699+ }
12700+ }
12701+ up(&feather_lock);
12702+out:
12703+ return error;
12704+}
12705+
12706+#define ENABLE_CMD 0
12707+#define DISABLE_CMD 1
12708+
12709+static ssize_t trace_write(struct file *filp, const char __user *from,
12710+ size_t len, loff_t *f_pos)
12711+{
12712+ ssize_t error = -EINVAL;
12713+ unsigned long cmd;
12714+ unsigned long id;
12715+
12716+ if (len % sizeof(long) || len < 2 * sizeof(long))
12717+ goto out;
12718+
12719+ if (copy_from_user(&cmd, from, sizeof(long))) {
12720+ error = -EFAULT;
12721+ goto out;
12722+ }
12723+ len -= sizeof(long);
12724+ from += sizeof(long);
12725+
12726+ if (cmd != ENABLE_CMD && cmd != DISABLE_CMD)
12727+ goto out;
12728+
12729+ if (down_interruptible(&feather_lock)) {
12730+ error = -ERESTARTSYS;
12731+ goto out;
12732+ }
12733+
12734+ error = sizeof(long);
12735+ while (len) {
12736+ if (copy_from_user(&id, from, sizeof(long))) {
12737+ error = -EFAULT;
12738+ goto out;
12739+ }
12740+ len -= sizeof(long);
12741+ from += sizeof(long);
12742+ if (cmd) {
12743+ printk(KERN_INFO
12744+ "Disabling feather-trace event %lu.\n", id);
12745+ ft_disable_event(id);
12746+ } else {
12747+ printk(KERN_INFO
12748+ "Enabling feather-trace event %lu.\n", id);
12749+ ft_enable_event(id);
12750+ }
12751+ error += sizeof(long);
12752+ }
12753+
12754+ up(&feather_lock);
12755+ out:
12756+ return error;
12757+}
12758+
12759+static int trace_open(struct inode *in, struct file *filp)
12760+{
12761+ int err = 0;
12762+ unsigned int count = NO_TIMESTAMPS;
12763+
12764+ if (down_interruptible(&feather_lock)) {
12765+ err = -ERESTARTSYS;
12766+ goto out;
12767+ }
12768+
12769+ while (count && !trace_ts_buf) {
12770+ printk("trace: trying to allocate %u time stamps.\n", count);
12771+ trace_ts_buf = alloc_ft_buffer(count, sizeof(struct timestamp));
12772+ count /= 2;
12773+ }
12774+ if (!trace_ts_buf)
12775+ err = -ENOMEM;
12776+ else
12777+ use_count++;
12778+
12779+ up(&feather_lock);
12780+out:
12781+ return err;
12782+}
12783+
12784+/******************************************************************************/
12785+/* Device Registration */
12786+/******************************************************************************/
12787+
12788+#define FT_TRACE_MAJOR 252
12789+
12790+struct file_operations ft_trace_fops = {
12791+ .owner = THIS_MODULE,
12792+ .open = trace_open,
12793+ .release = trace_release,
12794+ .write = trace_write,
12795+ .read = trace_read,
12796+};
12797+
12798+
12799+static int __init register_buffer_dev(const char* name,
12800+ struct file_operations* fops,
12801+ int major, int count)
12802+{
12803+ dev_t trace_dev;
12804+ struct cdev *cdev;
12805+ int error = 0;
12806+
12807+ trace_dev = MKDEV(major, 0);
12808+ error = register_chrdev_region(trace_dev, count, name);
12809+ if (error)
12810+ {
12811+ printk(KERN_WARNING "trace: "
12812+ "Could not register major/minor number %d\n", major);
12813+ return error;
12814+ }
12815+ cdev = cdev_alloc();
12816+ if (!cdev) {
12817+ printk(KERN_WARNING "trace: "
12818+ "Could not get a cdev for %s.\n", name);
12819+ return -ENOMEM;
12820+ }
12821+ cdev->owner = THIS_MODULE;
12822+ cdev->ops = fops;
12823+ error = cdev_add(cdev, trace_dev, count);
12824+ if (error) {
12825+ printk(KERN_WARNING "trace: "
12826+ "add_cdev failed for %s.\n", name);
12827+ return -ENOMEM;
12828+ }
12829+ return error;
12830+
12831+}
12832+
12833+static int __init init_sched_trace(void)
12834+{
12835+ int error = 0;
12836+
12837+ printk("Initializing Feather-Trace device\n");
12838+ /* dummy entry to make linker happy */
12839+ ft_event0(666, save_timestamp);
12840+
12841+ error = register_buffer_dev("ft_trace", &ft_trace_fops,
12842+ FT_TRACE_MAJOR, 1);
12843+ return error;
12844+}
12845+
12846+module_init(init_sched_trace);
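Given the command format parsed by trace_write() above -- one command word followed by event IDs, all as native longs -- enabling an event from user space might look like the sketch below. The /dev/ft_trace node name is an assumption (only major 252 is fixed by the code), and a real tool would keep the descriptor open to read() timestamp records, since the last release disables all events and frees the buffer:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	/* ENABLE_CMD is 0 in kernel/trace.c; 666 is the dummy event
	 * registered by init_sched_trace() above. */
	long cmd[2] = { 0, 666 };
	int fd = open("/dev/ft_trace", O_RDWR);

	if (fd < 0) {
		perror("open /dev/ft_trace");
		return 1;
	}
	if (write(fd, cmd, sizeof(cmd)) != sizeof(cmd))
		perror("write");
	/* ... read struct timestamp records from fd here ... */
	close(fd);
	return 0;
}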
12847diff --git a/lib/semaphore-sleepers.c b/lib/semaphore-sleepers.c
12848index 1281805..3f4d543 100644
12849--- a/lib/semaphore-sleepers.c
12850+++ b/lib/semaphore-sleepers.c
12851@@ -108,7 +108,7 @@ fastcall int __sched __down_interruptible(struct semaphore * sem)
12852 /*
12853 * With signals pending, this turns into
12854 * the trylock failure case - we won't be
12855- * sleeping, and we* can't get the lock as
12856+ * sleeping, and we can't get the lock as
12857 * it has contention. Just correct the count
12858 * and exit.
12859 */
diff --git a/index.html b/index.html
index 3070daf..de7c876 100644
--- a/index.html
+++ b/index.html
@@ -40,7 +40,7 @@
40 kernel with focus on multiprocessor real-time scheduling and 40 kernel with focus on multiprocessor real-time scheduling and
41 synchronization. The Linux kernel is modified to support the sporadic task 41 synchronization. The Linux kernel is modified to support the sporadic task
42 model and modular scheduler plugins. Both partitioned and global scheduling 42 model and modular scheduler plugins. Both partitioned and global scheduling
43 is supported. In the current version (2007.2), plugins for the following 43 is supported. In the current version (2007.3), plugins for the following
44 scheduling policies are included: 44 scheduling policies are included:
45 </p> 45 </p>
46 <ul> 46 <ul>
@@ -54,7 +54,7 @@
54 <li> PFAIR (both staggered and aligned quanta are supported)</li> 54 <li> PFAIR (both staggered and aligned quanta are supported)</li>
55 </ul> 55 </ul>
56 <p> 56 <p>
57 The latest public release of LITMUS<sup>RT</sup> occurred on 10/29/2007. 57 The latest public release of LITMUS<sup>RT</sup> occurred on 01/28/2008.
58 </p> 58 </p>
59 </div> 59 </div>
60 60
@@ -175,7 +175,7 @@
175 General Public License (GPL)</a>. 175 General Public License (GPL)</a>.
176 </p> 176 </p>
177 <p> 177 <p>
178 The latest version of LITMUS<sup>RT</sup> is 2007.2 and was released on 10/29/2007. 178 The latest version of LITMUS<sup>RT</sup> is 2007.3 and was released on 01/28/2008.
179 It consists of 179 It consists of
180 our Linux kernel modifications in the form of 180 our Linux kernel modifications in the form of
181 a patch against Linux 2.6.20, 181 a patch against Linux 2.6.20,
@@ -184,21 +184,48 @@
184 provides synchronization primitives suitable for real-time tasks. 184 provides synchronization primitives suitable for real-time tasks.
185 </p> 185 </p>
186 186
187 187 <ul>
188 <ul > 188 <li>
189 <li><a href="download/litmus-rt-2007.2.patch">litmus-rt-2007.2.patch</a> 189 2007.3 (January 2008)<br/>
190 (328 KB)<br/> 190 Based on Linux 2.6.20. (see <a href="#install">Section Install</a>
191 Applies 191 below) <br/>
192 against Linux 2.6.20 (see <a href="#install">Section Install</a> below).</li> 192 Files:
193 193 <ul>
194 <li><a href="download/liblitmus-2007.2.tgz">liblitmus-2007.2.tgz</a> 194 <li><a href="download/2007.3/litmus-rt-2007.3.patch">litmus-rt-2007.3.patch</a> (344 KB)</li>
195 (11 KB) 195 <li><a href="download/2007.3/liblitmus-2007.3.tgz">liblitmus-2007.3.tgz</a>
196 </li> 196 (14 KB)</li>
197 197 <li><a href="download/2007.3/libso-2007.3.tgz">libso-2007.3.tgz</a>
198 <li><a href="download/libso-2007.2.tgz">libso-2007.2.tgz</a> 198 (15 KB)</li>
199 (16 KB) 199 <li><a href="download/2007.3/SHA256SUMS">SHA256 check sums.</a>
200 </li> 200 </li>
201 </ul>
202 </li>
203 <li>
204 Major changes:
205 <ul>
206 <li>
207 Support for multi-threaded real-time applications added. The
208 use of <span class="src">libso</span> is no longer required.
209 </li>
210 <li>
211 All allocations (semaphores, etc.) are now dynamic. No more
212 running out of resources.
213 </li>
214 <li>
215 Real-Time tasks do not have to be launched with
216 <span class="src">rt_launch</span> anymore. Instead, a new
217 <span class="src">task_mode()</span> API was introduced that
218 allows tasks to transition between background mode
219 (i.e., a standard Linux task) and LITMUS<sup>RT</sup>
220 real-time mode.
221 </li>
222 <li>
223 Many bug fixes.
224 </li>
225 </ul>
226 </li>
201 </ul> 227 </ul>
228
202 <p class="nobottommargin"> 229 <p class="nobottommargin">
203 Please note that the current implementation is a <em>prototype</em> with 230 Please note that the current implementation is a <em>prototype</em> with
204 certain limitations. Most notably, it is not secure in a multiuser context, 231 certain limitations. Most notably, it is not secure in a multiuser context,
@@ -210,7 +237,18 @@
210 <p class="nobottommargin"> 237 <p class="nobottommargin">
211 Old releases: 238 Old releases:
212 </p> 239 </p>
240
213 <ul> 241 <ul>
242 <li> 2007.2 (November 2007)<br/>
243 Based on Linux 2.6.20. <br/>
244 <a href="download/litmus-rt-2007.2.patch">litmus-rt-2007.2.patch</a>
245 (328 KB)<br/>
246 <a href="download/liblitmus-2007.2.tgz">liblitmus-2007.2.tgz</a>
247 (11 KB) <br/>
248 <a href="download/libso-2007.2.tgz">libso-2007.2.tgz</a>
249 (16 KB) <br/><br/>
250 </li>
251
214 <li> 2007.1 (May 2007)<br/> 252 <li> 2007.1 (May 2007)<br/>
215 Based on Linux 2.6.20. <br/> 253 Based on Linux 2.6.20. <br/>
216 <a href="download/litmus-rt-2007.1.patch">litmus-rt-2007.1.patch</a> 254 <a href="download/litmus-rt-2007.1.patch">litmus-rt-2007.1.patch</a>
@@ -253,11 +291,11 @@ cd $DIR
253# get Linux 2.6.20 291# get Linux 2.6.20
254wget http://www.kernel.org/pub/linux/kernel/v2.6/linux-2.6.20.tar.bz2 292wget http://www.kernel.org/pub/linux/kernel/v2.6/linux-2.6.20.tar.bz2
255tar xjf linux-2.6.20.tar.bz2 293tar xjf linux-2.6.20.tar.bz2
256wget http://www.cs.unc.edu/~anderson/litmus-rt/download/litmus-rt-2007.2.patch 294wget http://www.cs.unc.edu/~anderson/litmus-rt/download/2007.3/litmus-rt-2007.3.patch
257mv linux-2.6.20 litmus-rt 295mv linux-2.6.20 litmus-rt
258# apply the LITMUS RT patch 296# apply the LITMUS RT patch
259cd litmus-rt 297cd litmus-rt
260patch -p1 &lt; ../litmus-rt-2007.2.patch 298patch -p1 &lt; ../litmus-rt-2007.3.patch
261# create a working kernel configuration with HZ=1000 299# create a working kernel configuration with HZ=1000
262make gconfig 300make gconfig
263# compile the kernel 301# compile the kernel
@@ -298,8 +336,8 @@ initrd /boot/kernel-2.6.20-LITMUSRT.img
298 </p> 336 </p>
299<pre class="shell"> 337<pre class="shell">
300cd $DIR 338cd $DIR
301wget http://www.cs.unc.edu/~anderson/litmus-rt/download/liblitmus-2007.2.tgz 339wget http://www.cs.unc.edu/~anderson/litmus-rt/download/2007.3/liblitmus-2007.3.tgz
302tar xzf liblitmus-2007.2.tgz 340tar xzf liblitmus-2007.3.tgz
303cd liblitmus 341cd liblitmus
304make 342make
305</pre> 343</pre>
@@ -312,8 +350,8 @@ make
312 </p> 350 </p>
313<pre class="shell"> 351<pre class="shell">
314cd $DIR 352cd $DIR
315wget http://www.cs.unc.edu/~anderson/litmus-rt/download/libso-2007.2.tgz 353wget http://www.cs.unc.edu/~anderson/litmus-rt/download/2007.3/libso-2007.3.tgz
316tar xzf libso-2007.2.tgz 354tar xzf libso-2007.3.tgz
317cd libso 355cd libso
318make 356make
319make tests 357make tests
@@ -330,26 +368,27 @@ make tests
330 <h2 id="doc">Documentation</h2> 368 <h2 id="doc">Documentation</h2>
331 <div class="box"> 369 <div class="box">
332 370
333 <p class="nomargin"> 371 <p class="notopmargin">
334 Most of the documentation has yet to be written. To get an overview of 372 Most of the documentation has yet to be written. To get an overview of
335 the architecture of the kernel extension, we recommend to read the paper 373 the architecture of the kernel extension, we recommend to read the paper
336 <a href="http://www.cs.unc.edu/~anderson/papers/rtlws07.pdf">&ldquo;LITMUS<sup>RT</sup>: 374 <a href="http://www.cs.unc.edu/~anderson/papers/rtlws07.pdf">&ldquo;LITMUS<sup>RT</sup>:
337 A Status Report&rdquo;</a>. 375 A Status Report&rdquo;</a>.
338 <br/> 376 </p>
339 <br/> 377 <p>
378 The user space library that provides the LITMUS<sup>RT</sup> API,
379 <span class="src">liblitmus</span>, contains two example real-time tasks
380 (<span class="src">base_task.c</span> and
381 <span class="src">base_mt_task.c</span>)
382 that both illustrate how to use the API and provide a skeleton for real-time
383 task development. To get started, take a look at these example
384 programs.
385 </p>
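To give a flavour of what these examples contain, a minimal job-loop sketch follows. Only task_mode() and sleep_next_period() are named on this page; the litmus.h header name, the LITMUS_RT_TASK/BACKGROUND_TASK constants, and the omitted parameter-setup step are assumptions, so treat base_task.c from the liblitmus tarball as the authoritative version.

/* Hedged sketch of a periodic LITMUS^RT task using liblitmus. */
#include <litmus.h>	/* header name assumed */

int main(void)
{
	int i;

	/* ... set WCET/period parameters here (see base_task.c) ... */

	task_mode(LITMUS_RT_TASK);	/* become a real-time task */
	for (i = 0; i < 1000; i++) {
		/* one job's worth of work goes here */
		sleep_next_period();	/* wait for the next release */
	}
	task_mode(BACKGROUND_TASK);	/* back to a normal Linux task */
	return 0;
}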
386 <p class="nobottommargin">
340 Please contact <span class="src">bbb[AT]cs.unc.edu</span> if you have any 387 Please contact <span class="src">bbb[AT]cs.unc.edu</span> if you have any
341 questions. 388 questions.
342 </p> 389 </p>
343 390
344<!-- <p class="nomargin">
345 <em>To be written...</em>
346 <ul class="nomargin">
347 <li>How to use LITMUS<sup>RT</sup></li>
348 <li>A real-time &quot;Hello World!&quot;</li>
349 </ul>
350 </p>
351 391
352-->
353 </div> 392 </div>
354 393
355 <h2 id="credits">Credits</h2> 394 <h2 id="credits">Credits</h2>