author    Bjoern B. Brandenburg <bbb@cs.unc.edu>  2007-10-29 04:30:48 -0400
committer Bjoern B. Brandenburg <bbb@cs.unc.edu>  2007-10-29 04:30:48 -0400
commit    9aaa23e28a41fb579ef33ecb845e22cd717195c9 (patch)
tree      ca5a0939f68dcfe10103ffd3ad1d3d9652d8790d
parent    972022fc01d151980cd2994283b4aec837ed419b (diff)
Added LITMUS release 2007.2.
Also some text changes.
-rw-r--r--  download/MD5SUM                 |     3
-rw-r--r--  download/liblitmus-2007.2.tgz   |   bin 0 -> 10825 bytes
-rw-r--r--  download/libso-2007.2.tgz       |   bin 0 -> 15836 bytes
-rw-r--r--  download/litmus-rt-2007.2.patch | 12100
-rw-r--r--  index.html                      |    83
5 files changed, 12166 insertions(+), 20 deletions(-)
diff --git a/download/MD5SUM b/download/MD5SUM
index 4d34aa9..4876c6d 100644
--- a/download/MD5SUM
+++ b/download/MD5SUM
@@ -1,3 +1,6 @@
 991469b3a8c9b6a0caa4cedfb663e9be  liblitmus-2007.1.tgz
+eddf0c80b0942f792ad8323cb62c9234  liblitmus-2007.2.tgz
 6a80c8bb52af8f38dc1bbd874fa2e44f  libso-2007.1.tgz
+f3cb1e78f38dd22c4cca84a03fab3bbd  libso-2007.2.tgz
 c6ef29d2e198c2fbc08e47d6f2f404bb  litmus-rt-2007.1.patch
+f4a1888b942a82ccce9daa55fce98202  litmus-rt-2007.2.patch
diff --git a/download/liblitmus-2007.2.tgz b/download/liblitmus-2007.2.tgz
new file mode 100644
index 0000000..616f345
--- /dev/null
+++ b/download/liblitmus-2007.2.tgz
Binary files differ
diff --git a/download/libso-2007.2.tgz b/download/libso-2007.2.tgz
new file mode 100644
index 0000000..394665f
--- /dev/null
+++ b/download/libso-2007.2.tgz
Binary files differ
diff --git a/download/litmus-rt-2007.2.patch b/download/litmus-rt-2007.2.patch
new file mode 100644
index 0000000..deea27d
--- /dev/null
+++ b/download/litmus-rt-2007.2.patch
@@ -0,0 +1,12100 @@
1diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
2index 0dfee81..da6f1e9 100644
3--- a/arch/i386/Kconfig
4+++ b/arch/i386/Kconfig
5@@ -1210,6 +1210,7 @@ config KPROBES
6 a probepoint and specifies the callback. Kprobes is useful
7 for kernel debugging, non-intrusive instrumentation and testing.
8 If in doubt, say "N".
9+
10 endmenu
11
12 source "arch/i386/Kconfig.debug"
13@@ -1259,3 +1260,30 @@ config X86_TRAMPOLINE
14 config KTIME_SCALAR
15 bool
16 default y
17+
18+
19+menu "LITMUS^RT"
20+
21+
22+config SCHED_TASK_TRACE
23+ bool "Trace real-time tasks"
24+ default y
25+ help
26+ Include support for the sched_trace_XXX() tracing functions. This
27+ allows the collection of real-time task events such as job
28+ completions, job releases, early completions, etc. This results in a
29+ small overhead in the scheduling code. Disable if the overhead is not
30+ acceptable (e.g., benchmarking).
31+
32+config SCHED_DEBUG_TRACE
33+ bool "TRACE() debugging"
34+ default y
35+ help
36+ Include support for sched_trace_log_message(), which is used to
37+ implement TRACE(). If disabled, no TRACE() messages will be included
38+ in the kernel, and no overheads due to debugging statements will be
39+ incurred by the scheduler. Disable if the overhead is not acceptable
40+ (e.g. benchmarking).
41+
42+
43+endmenu
44diff --git a/arch/i386/kernel/apic.c b/arch/i386/kernel/apic.c
45index 776d9be..2e8909f 100644
46--- a/arch/i386/kernel/apic.c
47+++ b/arch/i386/kernel/apic.c
48@@ -26,6 +26,7 @@
49 #include <linux/sysdev.h>
50 #include <linux/cpu.h>
51 #include <linux/module.h>
52+#include <linux/litmus.h>
53
54 #include <asm/atomic.h>
55 #include <asm/smp.h>
56@@ -43,6 +44,8 @@
57
58 #include "io_ports.h"
59
60+#include <linux/trace.h>
61+
62 /*
63 * cpu_mask that denotes the CPUs that needs timer interrupt coming in as
64 * IPIs in place of local APIC timers
65@@ -54,6 +57,15 @@ static cpumask_t timer_bcast_ipi;
66 */
67 static int enable_local_apic __initdata = 0; /* -1=force-disable, +1=force-enable */
68
69+/*
70+ * Definitions and variables related to quantum synchronization.
71+ */
72+#define WAIT_TO_SYNC 30000 /* time after boot until sync */
73+static int stagger = 0; /* are we using staggered quanta? */
74+static atomic_t qsync_time = ATOMIC_INIT(INITIAL_JIFFIES);
75+static atomic_t quantum_sync_barrier = ATOMIC_INIT(0);
76+static atomic_t sync_done = ATOMIC_INIT(0);
77+
78 static inline void lapic_disable(void)
79 {
80 enable_local_apic = -1;
81@@ -786,6 +798,23 @@ static int __init apic_set_verbosity(char *str)
82
83 __setup("apic=", apic_set_verbosity);
84
85+/*
86+ * Determine whether to use aligned or staggered quanta.
87+ */
88+
89+static int __init apic_synch_type(char *str)
90+{
91+ if (strcmp("aligned", str) == 0)
92+ stagger = 0;
93+ else if (strcmp("staggered", str) == 0)
94+ stagger = 1;
95+ else
96+ stagger = 0; /* aligned quanta by default */
97+ return 1;
98+}
99+
100+__setup("quanta=", apic_synch_type);
101+
102 static int __init detect_init_APIC (void)
103 {
104 u32 h, l, features;
105@@ -1198,6 +1227,47 @@ EXPORT_SYMBOL(switch_ipi_to_APIC_timer);
106 #undef APIC_DIVISOR
107
108 /*
109+ * This function is called to align all quanta, and to stagger quanta if
110+ * necessary. It relies on a barrier to synchronize all processors, so
111+ * that they all reset their APIC timers at the same time. If quanta
112+ * should be staggered, the appropriate stagger delay is then added at
113+ * each processor.
114+ */
115+
116+void synchronize_quanta(void)
117+{
118+ int cpu = smp_processor_id();
119+ int total_cpus = num_online_cpus();
120+ int stagger_interval = jiffies_to_usecs(1) / total_cpus;
121+
122+ /*
123+ * Disable APIC timer, wait for all other processors to reach barrier,
124+ * and re-enable all timers concurrently.
125+ */
126+ disable_APIC_timer();
127+ atomic_inc(&quantum_sync_barrier);
128+ while (atomic_read(&quantum_sync_barrier) < total_cpus) {
129+ /* Delay, otherwise atomic_inc's cannot occur. */
130+ udelay(1);
131+ }
132+
133+ /* Add necessary stagger for this CPU, if required. */
134+ if (stagger) {
135+ int stagger_us = cpu * stagger_interval;
136+ udelay(stagger_us);
137+ }
138+
139+ /* Re-enable all timers. */
140+ __setup_APIC_LVTT(calibration_result);
141+ enable_APIC_timer();
142+
143+ /* The first CPU signals that quantum sync is complete. */
144+ if (cpu == 0)
145+ atomic_inc(&sync_done);
146+}
147+
148+
149+/*
150 * Local timer interrupt handler. It does both profiling and
151 * process statistics/rescheduling.
152 *
153@@ -1209,11 +1279,32 @@ EXPORT_SYMBOL(switch_ipi_to_APIC_timer);
154
155 inline void smp_local_timer_interrupt(void)
156 {
157+/* s64 offset; */
158+
159+ TS_TICK_START;
160+
161 profile_tick(CPU_PROFILING);
162 #ifdef CONFIG_SMP
163 update_process_times(user_mode_vm(get_irq_regs()));
164 #endif
165
166+ /* Print out timing data - can be commented out if necessary. */
167+/* offset = get_nsec_offset(); */
168+/* TRACE("%d\n", offset); */
169+
170+ /*
171+ * Synchronize quanta if we have reached qsync_time plus wait
172+ * interval. The synchronization code itself is placed in its own
173+ * (non-inline) function, to avoid issues with creating an inline
174+ * function that is too large.
175+ */
176+ if (unlikely(!atomic_read(&sync_done) &&
177+ time_after(jiffies,
178+ (unsigned long)(atomic_read(&qsync_time) +
179+ msecs_to_jiffies(WAIT_TO_SYNC))))) {
180+ synchronize_quanta();
181+ }
182+
183 /*
184 * We take the 'long' return path, and there every subsystem
185 * grabs the apropriate locks (kernel lock/ irq lock).
186@@ -1224,6 +1315,7 @@ inline void smp_local_timer_interrupt(void)
187 * Currently this isn't too much of an issue (performance wise),
188 * we can take more than 100K local irqs per second on a 100 MHz P5.
189 */
190+ TS_TICK_END;
191 }
192
193 /*
194diff --git a/arch/i386/kernel/i386_ksyms.c b/arch/i386/kernel/i386_ksyms.c
195index e3d4b73..9670f77 100644
196--- a/arch/i386/kernel/i386_ksyms.c
197+++ b/arch/i386/kernel/i386_ksyms.c
198@@ -6,6 +6,7 @@ EXPORT_SYMBOL(__down_failed);
199 EXPORT_SYMBOL(__down_failed_interruptible);
200 EXPORT_SYMBOL(__down_failed_trylock);
201 EXPORT_SYMBOL(__up_wakeup);
202+
203 /* Networking helper routines. */
204 EXPORT_SYMBOL(csum_partial_copy_generic);
205
206diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
207index 2697e92..9a633ea 100644
208--- a/arch/i386/kernel/syscall_table.S
209+++ b/arch/i386/kernel/syscall_table.S
210@@ -319,3 +319,32 @@ ENTRY(sys_call_table)
211 .long sys_move_pages
212 .long sys_getcpu
213 .long sys_epoll_pwait
214+ /* LITMUS syscalls */
215+ .long sys_sched_setpolicy /* 320 */
216+ .long sys_sched_getpolicy
217+ .long sys_set_rt_mode
218+ .long sys_set_rt_task_param
219+ .long sys_get_rt_task_param
220+ .long sys_prepare_rt_task /* 325 */
221+ .long sys_ni_syscall /* CLEANUP: sys_reset_stat */
222+ .long sys_sleep_next_period
223+ .long sys_scheduler_setup
224+ .long sys_register_np_flag
225+ .long sys_exit_np /* 330 */
226+ .long sys_pi_sema_init
227+ .long sys_pi_down
228+ .long sys_pi_up
229+ .long sys_pi_sema_free
230+ .long sys_sema_init /* 335 */
231+ .long sys_down
232+ .long sys_up
233+ .long sys_sema_free
234+ .long sys_srp_sema_init
235+ .long sys_srp_down /* 340 */
236+ .long sys_srp_up
237+ .long sys_reg_task_srp_sem
238+ .long sys_srp_sema_free
239+ .long sys_query_job_no
240+ .long sys_wait_for_job_release /* 345 */
241+ .long sys_set_service_levels
242+ .long sys_get_cur_service_level
243\ No newline at end of file
244diff --git a/include/asm-i386/semaphore.h b/include/asm-i386/semaphore.h
245index 4e34a46..7212f4b 100644
246--- a/include/asm-i386/semaphore.h
247+++ b/include/asm-i386/semaphore.h
248@@ -45,6 +45,7 @@ struct semaphore {
249 atomic_t count;
250 int sleepers;
251 wait_queue_head_t wait;
252+ int used; /* allows semaphores to be allocated to user space processes */
253 };
254
255
256diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
257index 833fa17..8a5d47c 100644
258--- a/include/asm-i386/unistd.h
259+++ b/include/asm-i386/unistd.h
260@@ -325,10 +325,40 @@
261 #define __NR_move_pages 317
262 #define __NR_getcpu 318
263 #define __NR_epoll_pwait 319
264+/* LITMUS */
265+#define __NR_sched_setpolicy 320
266+#define __NR_sched_getpolicy 321
267+/* Syscall definitions for mode change and task creation-manipulation */
268+#define __NR_set_rt_mode 322
269+#define __NR_set_rt_task_param 323
270+#define __NR_get_rt_task_param 324
271+#define __NR_prepare_rt_task 325
272+#define __NR_reset_stat 326
273+#define __NR_sleep_next_period 327
274+#define __NR_scheduler_setup 328
275+#define __NR_enter_np 329
276+#define __NR_exit_np 330
277+#define __NR_pi_sema_init 331
278+#define __NR_pi_down 332
279+#define __NR_pi_up 333
280+#define __NR_pi_sema_free 334
281+#define __NR_sema_init 335
282+#define __NR_down 336
283+#define __NR_up 337
284+#define __NR_sema_free 338
285+#define __NR_srp_sema_init 339
286+#define __NR_srp_down 340
287+#define __NR_srp_up 341
288+#define __NR_reg_task_srp_sem 342
289+#define __NR_srp_sema_free 343
290+#define __NR_query_job_no 344
291+#define __NR_wait_for_job_release 345
292+#define __NR_set_service_levels 346
293+#define __NR_get_cur_service_level 347
294
295 #ifdef __KERNEL__
296
297-#define NR_syscalls 320
298+#define NR_syscalls 343
299
300 #define __ARCH_WANT_IPC_PARSE_VERSION
301 #define __ARCH_WANT_OLD_READDIR
302diff --git a/include/linux/edf_common.h b/include/linux/edf_common.h
303new file mode 100644
304index 0000000..f940308
305--- /dev/null
306+++ b/include/linux/edf_common.h
307@@ -0,0 +1,36 @@
308+/* EDF common data structures and utility functions shared by all EDF
309+ * based scheduler plugins
310+ */
311+
312+/* CLEANUP: Add comments and make it less messy.
313+ *
314+ */
315+
316+#ifndef __UNC_EDF_COMMON_H__
317+#define __UNC_EDF_COMMON_H__
318+
319+#include <linux/rt_domain.h>
320+
321+
322+void edf_domain_init(rt_domain_t* rt, check_resched_needed_t resched);
323+
324+int edf_higher_prio(struct task_struct* first,
325+ struct task_struct* second);
326+
327+int edf_ready_order(struct list_head* a, struct list_head* b);
328+
329+void edf_release_at(struct task_struct *t, jiffie_t start);
330+#define edf_release_now(t) edf_release_at(t, jiffies)
331+
332+int edf_preemption_needed(rt_domain_t* rt, struct task_struct *t);
333+long edf_sleep_next_period(void);
334+
335+void edf_prepare_for_next_period(struct task_struct *t);
336+
337+#define job_completed(t) (!is_be(t) && \
338+ (t)->rt_param.times.exec_time == (t)->rt_param.basic_params.exec_cost)
339+
340+int edf_set_hp_task(struct pi_semaphore *sem);
341+int edf_set_hp_cpu_task(struct pi_semaphore *sem, int cpu);
342+
343+#endif
344diff --git a/include/linux/feather_buffer.h b/include/linux/feather_buffer.h
345new file mode 100644
346index 0000000..c477772
347--- /dev/null
348+++ b/include/linux/feather_buffer.h
349@@ -0,0 +1,108 @@
350+#ifndef _FEATHER_BUFFER_H_
351+#define _FEATHER_BUFFER_H_
352+
353+/* requires UINT_MAX and memcpy */
354+
355+static inline int fetch_and_inc(int *val)
356+{
357+ int ret = 1;
358+ __asm__ __volatile__("lock; xaddl %0, %1" : "+r" (ret), "+m" (*val) : : "memory" );
359+ return ret;
360+}
361+
362+static inline int fetch_and_dec(int *val)
363+{
364+ int ret = -1;
365+ __asm__ __volatile__("lock; xaddl %0, %1" : "+r" (ret), "+m" (*val) : : "memory" );
366+ return ret;
367+}
368+
369+#define SLOT_FREE 0
370+#define SLOT_BUSY 1
371+#define SLOT_READY 2
372+
373+struct ft_buffer {
374+ unsigned int slot_count;
375+ unsigned int slot_size;
376+
377+ int free_count;
378+ unsigned int write_idx;
379+ unsigned int read_idx;
380+
381+ char* slots;
382+ void* buffer_mem;
383+ unsigned int failed_writes;
384+};
385+
386+static inline int init_ft_buffer(struct ft_buffer* buf,
387+ unsigned int slot_count,
388+ unsigned int slot_size,
389+ char* slots,
390+ void* buffer_mem)
391+{
392+ int i = 0;
393+ if (!slot_count || UINT_MAX % slot_count != slot_count - 1) {
394+ /* The slot count must divide UINT_MAX + 1 so that when it
395+ * wraps around, the index correctly points to 0.
396+ */
397+ return 0;
398+ } else {
399+ buf->slot_count = slot_count;
400+ buf->slot_size = slot_size;
401+ buf->slots = slots;
402+ buf->buffer_mem = buffer_mem;
403+ buf->free_count = slot_count;
404+ buf->write_idx = 0;
405+ buf->read_idx = 0;
406+ buf->failed_writes = 0;
407+ for (i = 0; i < slot_count; i++)
408+ buf->slots[i] = SLOT_FREE;
409+ return 1;
410+ }
411+}
412+
413+static inline int ft_buffer_start_write(struct ft_buffer* buf, void **ptr)
414+{
415+ int free = fetch_and_dec(&buf->free_count);
416+ unsigned int idx;
417+ if (free <= 0) {
418+ fetch_and_inc(&buf->free_count);
419+ *ptr = 0;
420+ fetch_and_inc(&buf->failed_writes);
421+ return 0;
422+ } else {
423+ idx = fetch_and_inc((int*) &buf->write_idx) % buf->slot_count;
424+ buf->slots[idx] = SLOT_BUSY;
425+ *ptr = ((char*) buf->buffer_mem) + idx * buf->slot_size;
426+ return 1;
427+ }
428+}
429+
430+static inline void ft_buffer_finish_write(struct ft_buffer* buf, void *ptr)
431+{
432+ unsigned int idx = ((char*) ptr - (char*) buf->buffer_mem) / buf->slot_size;
433+ buf->slots[idx] = SLOT_READY;
434+}
435+
436+
437+/* exclusive reader access is assumed */
438+static inline int ft_buffer_read(struct ft_buffer* buf, void* dest)
439+{
440+ unsigned int idx;
441+ if (buf->free_count == buf->slot_count)
442+ /* nothing available */
443+ return 0;
444+ idx = buf->read_idx % buf->slot_count;
445+ if (buf->slots[idx] == SLOT_READY) {
446+ memcpy(dest, ((char*) buf->buffer_mem) + idx * buf->slot_size,
447+ buf->slot_size);
448+ buf->slots[idx] = SLOT_FREE;
449+ buf->read_idx++;
450+ fetch_and_inc(&buf->free_count);
451+ return 1;
452+ } else
453+ return 0;
454+}
455+
456+
457+#endif
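As an illustrative aside (not part of the patch), a minimal round trip through the ft_buffer API above could look as follows; the names N_SLOTS, slot_state, slot_mem, and ft_buffer_example are hypothetical, and a single exclusive reader is assumed:

	#define N_SLOTS 256	/* must divide UINT_MAX + 1, e.g. a power of two */

	static char slot_state[N_SLOTS];
	static unsigned long slot_mem[N_SLOTS];	/* one unsigned long per record */
	static struct ft_buffer buf;

	static void ft_buffer_example(void)
	{
		unsigned long *rec;
		unsigned long copy;

		init_ft_buffer(&buf, N_SLOTS, sizeof(unsigned long),
			       slot_state, slot_mem);

		/* writer: claim a free slot, fill it, then publish it */
		if (ft_buffer_start_write(&buf, (void **) &rec)) {
			*rec = 42;
			ft_buffer_finish_write(&buf, rec);
		}

		/* exclusive reader: copy out one ready record, if any */
		if (ft_buffer_read(&buf, &copy))
			/* copy now holds the record */;
	}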
458diff --git a/include/linux/feather_trace.h b/include/linux/feather_trace.h
459new file mode 100644
460index 0000000..57a21a5
461--- /dev/null
462+++ b/include/linux/feather_trace.h
463@@ -0,0 +1,93 @@
464+#ifndef _FEATHER_TRACE_H_
465+#define _FEATHER_TRACE_H_
466+
467+#define feather_callback __attribute__((regparm(0)))
468+
469+/* make the compiler reload any register that is not saved in
470+ * a cdecl function call
471+ */
472+#define CLOBBER_LIST "memory", "cc", "eax", "ecx", "edx"
473+
474+#define ft_event(id, callback) \
475+ __asm__ __volatile__( \
476+ "1: jmp 2f \n\t" \
477+ " call " #callback " \n\t" \
478+ ".section __event_table, \"aw\" \n\t" \
479+ ".long " #id ", 0, 1b, 2f \n\t" \
480+ ".previous \n\t" \
481+ "2: \n\t" \
482+ : : : CLOBBER_LIST)
483+
484+#define ft_event0(id, callback) \
485+ __asm__ __volatile__( \
486+ "1: jmp 2f \n\t" \
487+ " subl $4, %%esp \n\t" \
488+ " movl $" #id ", (%%esp) \n\t" \
489+ " call " #callback " \n\t" \
490+ " addl $4, %%esp \n\t" \
491+ ".section __event_table, \"aw\" \n\t" \
492+ ".long " #id ", 0, 1b, 2f \n\t" \
493+ ".previous \n\t" \
494+ "2: \n\t" \
495+ : : : CLOBBER_LIST)
496+
497+#define ft_event1(id, callback, param) \
498+ __asm__ __volatile__( \
499+ "1: jmp 2f \n\t" \
500+ " subl $8, %%esp \n\t" \
501+ " movl %0, 4(%%esp) \n\t" \
502+ " movl $" #id ", (%%esp) \n\t" \
503+ " call " #callback " \n\t" \
504+ " addl $8, %%esp \n\t" \
505+ ".section __event_table, \"aw\" \n\t" \
506+ ".long " #id ", 0, 1b, 2f \n\t" \
507+ ".previous \n\t" \
508+ "2: \n\t" \
509+ : : "r" (param) : CLOBBER_LIST)
510+
511+#define ft_event2(id, callback, param, param2) \
512+ __asm__ __volatile__( \
513+ "1: jmp 2f \n\t" \
514+ " subl $12, %%esp \n\t" \
515+ " movl %1, 8(%%esp) \n\t" \
516+ " movl %0, 4(%%esp) \n\t" \
517+ " movl $" #id ", (%%esp) \n\t" \
518+ " call " #callback " \n\t" \
519+ " addl $12, %%esp \n\t" \
520+ ".section __event_table, \"aw\" \n\t" \
521+ ".long " #id ", 0, 1b, 2f \n\t" \
522+ ".previous \n\t" \
523+ "2: \n\t" \
524+ : : "r" (param), "r" (param2) : CLOBBER_LIST)
525+
526+
527+#define ft_event3(id, callback, p, p2, p3) \
528+ __asm__ __volatile__( \
529+ "1: jmp 2f \n\t" \
530+ " subl $16, %%esp \n\t" \
531+ " movl %1, 12(%%esp) \n\t" \
532+ " movl %1, 8(%%esp) \n\t" \
533+ " movl %0, 4(%%esp) \n\t" \
534+ " movl $" #id ", (%%esp) \n\t" \
535+ " call " #callback " \n\t" \
536+ " addl $16, %%esp \n\t" \
537+ ".section __event_table, \"aw\" \n\t" \
538+ ".long " #id ", 0, 1b, 2f \n\t" \
539+ ".previous \n\t" \
540+ "2: \n\t" \
541+ : : "r" (p), "r" (p2), "r" (p3) : CLOBBER_LIST)
542+
543+
544+static inline unsigned long long ft_read_tsc(void)
545+{
546+ unsigned long long ret;
547+ __asm__ __volatile__("rdtsc" : "=A" (ret));
548+ return ret;
549+}
550+
551+int ft_enable_event(unsigned long id);
552+int ft_disable_event(unsigned long id);
553+int ft_is_event_enabled(unsigned long id);
554+int ft_disable_all_events(void);
555+
556+#endif
557diff --git a/include/linux/fifo_common.h b/include/linux/fifo_common.h
558new file mode 100644
559index 0000000..0883226
560--- /dev/null
561+++ b/include/linux/fifo_common.h
562@@ -0,0 +1,18 @@
563+/* FIFO common definitions and utility functions.
564+ */
565+#ifndef __UNC_SCHED_FIFO_H__
566+#define __UNC_SCHED_FIFO_H__
567+
568+#include <linux/rt_domain.h>
569+
570+
571+int fifo_higher_prio(struct task_struct* first,
572+ struct task_struct* second);
573+
574+int fifo_ready_order(struct list_head* a, struct list_head* b);
575+
576+
577+void fifo_domain_init(rt_domain_t* fifo, check_resched_needed_t resched);
578+
579+
580+#endif
581diff --git a/include/linux/fpmath.h b/include/linux/fpmath.h
582new file mode 100644
583index 0000000..a15c239
584--- /dev/null
585+++ b/include/linux/fpmath.h
586@@ -0,0 +1,111 @@
587+#ifndef __FP_MATH_H__
588+#define __FP_MATH_H__
589+
590+#define FP_SHIFT 10
591+#define ROUND_BIT (FP_SHIFT - 1)
592+#define ONE FP(1)
593+
594+#define _fp(x) ((fp_t) {x})
595+
596+static inline long _point(fp_t x)
597+{
598+ return (x.val % (1 << FP_SHIFT));
599+
600+}
601+
602+#define fp2str(x) x.val
603+/*(x.val >> FP_SHIFT), (x.val % (1 << FP_SHIFT)) */
604+#define _FP_ "%ld/1024"
605+
606+
607+static inline fp_t FP(long x)
608+{
609+ return _fp(((long) x) << FP_SHIFT);
610+}
611+
612+static inline long _floor(fp_t x)
613+{
614+ return x.val >> FP_SHIFT;
615+}
616+
617+/* FIXME: negative rounding */
618+static inline long _round(fp_t x)
619+{
620+ return _floor(x) + ((x.val >> ROUND_BIT) & 1);
621+}
622+
623+/* divide two integers to obtain a fixed point value */
624+static inline fp_t _frac(long a, long b)
625+{
626+ return _fp(FP(a).val / (b));
627+}
628+
629+/* multiply two fixed point values */
630+static inline fp_t _mul(fp_t a, fp_t b)
631+{
632+ return _fp((a.val * b.val) >> FP_SHIFT);
633+}
634+
635+static inline fp_t _div(fp_t a, fp_t b)
636+{
637+ /* try not to overflow */
638+ if (unlikely(a.val > 2 << (BITS_PER_LONG - FP_SHIFT)))
639+ return _fp((a.val / b.val) << FP_SHIFT);
640+ else
641+ return _fp((a.val << FP_SHIFT) / b.val);
642+}
643+
644+static inline fp_t _add(fp_t a, fp_t b)
645+{
646+ return _fp(a.val + b.val);
647+}
648+
649+static inline fp_t _sub(fp_t a, fp_t b)
650+{
651+ return _fp(a.val - b.val);
652+}
653+
654+static inline fp_t _neg(fp_t x)
655+{
656+ return _fp(-x.val);
657+}
658+
659+static inline fp_t _abs(fp_t x)
660+{
661+ return _fp(abs(x.val));
662+}
663+
664+static inline int _leq(fp_t a, fp_t b)
665+{
666+ return a.val <= b.val;
667+}
668+
669+static inline int _geq(fp_t a, fp_t b)
670+{
671+ return a.val >= b.val;
672+}
673+
674+static inline int _lt(fp_t a, fp_t b)
675+{
676+ return a.val < b.val;
677+}
678+
679+static inline int _gt(fp_t a, fp_t b)
680+{
681+ return a.val > b.val;
682+}
683+
684+static inline int _eq(fp_t a, fp_t b)
685+{
686+ return a.val == b.val;
687+}
688+
689+static inline fp_t _max(fp_t a, fp_t b)
690+{
691+ if (a.val < b.val)
692+ return b;
693+ else
694+ return a;
695+}
696+
697+#endif
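An illustrative aside (not part of the patch): with FP_SHIFT = 10 the helpers above operate on values scaled by 1024. A small worked sketch (fp_t itself is defined in rt_param.h below; the fp_example name is hypothetical):

	static void fp_example(void)
	{
		fp_t three_quarters = _frac(3, 4);		/* 768/1024 = 0.75  */
		fp_t half = _frac(1, 2);			/* 512/1024 = 0.5   */
		fp_t product = _mul(three_quarters, half);	/* 384/1024 = 0.375 */
		long nearest = _round(product);			/* 0: 0.375 rounds down */
		(void) nearest;
	}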
698diff --git a/include/linux/list.h b/include/linux/list.h
699index 611059d..319c5ed 100644
700--- a/include/linux/list.h
701+++ b/include/linux/list.h
702@@ -898,6 +898,36 @@ static inline void hlist_add_after_rcu(struct hlist_node *prev,
703 ({ tpos = hlist_entry(pos, typeof(*tpos), member); 1;}); \
704 pos = pos->next)
705
706+
707+typedef int (*list_cmp_t)(struct list_head*, struct list_head*);
708+
709+static inline unsigned int list_insert(struct list_head* new,
710+ struct list_head* head,
711+ list_cmp_t order_before)
712+{
713+ struct list_head *pos;
714+ unsigned int passed = 0;
715+
716+ BUG_ON(!new);
717+
718+ /* find a spot where the new entry is less than the next */
719+ list_for_each(pos, head) {
720+ if (unlikely(order_before(new, pos))) {
721+ /* pos is not less than new, thus insert here */
722+ __list_add(new, pos->prev, pos);
723+ goto out;
724+ }
725+ passed++;
726+ }
727+ /* if we get to this point either the list is empty or every entry
728+ * queued element is less than new.
729+ * Let's add new to the end. */
730+ list_add_tail(new, head);
731+ out:
732+ return passed;
733+}
734+
735+
736 #else
737 #warning "don't include kernel headers in userspace"
738 #endif /* __KERNEL__ */
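As an illustrative aside (not part of the patch), list_insert() keeps a list ordered by an arbitrary comparison function; a sketch with a hypothetical struct item keyed by an integer:

	struct item {
		int key;
		struct list_head link;
	};

	static LIST_HEAD(item_list);

	/* true iff a should be queued before b (ascending key order) */
	static int item_order(struct list_head *a, struct list_head *b)
	{
		return list_entry(a, struct item, link)->key <
		       list_entry(b, struct item, link)->key;
	}

	static void queue_item(struct item *it)
	{
		/* return value = number of entries passed before the insertion point */
		list_insert(&it->link, &item_list, item_order);
	}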
739diff --git a/include/linux/litmus.h b/include/linux/litmus.h
740new file mode 100644
741index 0000000..259594e
742--- /dev/null
743+++ b/include/linux/litmus.h
744@@ -0,0 +1,128 @@
745+/*
746+ * Constant definitions related to
747+ * scheduling policy.
748+ */
749+
750+#ifndef _LINUX_LITMUS_H_
751+#define _LINUX_LITMUS_H_
752+
753+#include <linux/jiffies.h>
754+#include <linux/sched_trace.h>
755+
756+typedef enum {
757+ SCHED_BEG = 0,
758+ SCHED_LINUX = 0,
759+ SCHED_PFAIR = 1,
760+ SCHED_PFAIR_STAGGER = 2,
761+ SCHED_PART_EDF = 3,
762+ SCHED_PART_EEVDF = 4,
763+ SCHED_GLOBAL_EDF = 5,
764+ SCHED_PFAIR_DESYNC = 6,
765+ SCHED_GLOBAL_EDF_NP = 7,
766+ SCHED_CUSTOM = 8,
767+ SCHED_EDF_HSB = 9,
768+ SCHED_GSN_EDF = 10,
769+ SCHED_PSN_EDF = 11,
770+ SCHED_ADAPTIVE = 12,
771+ /* Add your scheduling policy here */
772+
773+ SCHED_END = 12,
774+ SCHED_DEFAULT = 0,
775+ SCHED_INVALID = -1,
776+} spolicy;
777+
778+
779+typedef enum {
780+ LITMUS_RESERVED_RANGE = 1024,
781+
782+} sched_setup_cmd_t;
783+
784+/* Runtime modes */
785+enum rt_mode_t {
786+ MODE_NON_RT = 0,
787+ MODE_RT_RUN = 1
788+};
789+
790+/* Plugin boot options, for convenience */
791+#define PLUGIN_LINUX "linux"
792+#define PLUGIN_PFAIR "pfair"
793+#define PLUGIN_PART_EDF "part_edf"
794+#define PLUGIN_GLOBAL_EDF "global_edf"
795+#define PLUGIN_GLOBAL_EDF_NP "global_edf_np"
796+#define PLUGIN_EDF_HSB "edf_hsb"
797+#define PLUGIN_GSN_EDF "gsn_edf"
798+#define PLUGIN_PSN_EDF "psn_edf"
799+#define PLUGIN_ADAPTIVE "adaptive"
800+
801+extern spolicy sched_policy;
802+
803+/* RT mode start time */
804+extern volatile unsigned long rt_start_time;
805+
806+/* Here we store the current mode of the system */
807+extern atomic_t rt_mode;
808+
809+#define get_rt_mode() (atomic_read(&rt_mode))
810+#define set_rt_mode(a) atomic_set(&rt_mode,(a))
811+
812+#define TRACE(fmt, args...) \
813+ sched_trace_log_message("%d: " fmt, raw_smp_processor_id(), ## args)
814+
815+#define TRACE_TASK(t, fmt, args...) \
816+ TRACE("(%s/%d) " fmt, (t)->comm, (t)->pid, ##args)
817+
818+#define TRACE_CUR(fmt, args...) \
819+ TRACE_TASK(current, fmt, ## args)
820+
821+#define TRACE_BUG_ON(cond) \
822+ do { if (cond) TRACE("BUG_ON(%s) at %s:%d " \
823+ "called from %p current=%s/%d state=%d " \
824+ "flags=%x mode=%d partition=%d cpu=%d rtflags=%d"\
825+ " job=%u knp=%d timeslice=%u\n", \
826+ #cond, __FILE__, __LINE__, __builtin_return_address(0), current->comm, \
827+ current->pid, current->state, current->flags, get_rt_mode(), \
828+ get_partition(current), smp_processor_id(), get_rt_flags(current), \
829+ current->rt_param.times.job_no, current->rt_param.kernel_np, \
830+ current->time_slice\
831+ ); } while(0);
832+
833+
834+/* in_list - is a given list_head queued on some list?
835+ */
836+static inline int in_list(struct list_head* list)
837+{
838+ return !( /* case 1: deleted */
839+ (list->next == LIST_POISON1 &&
840+ list->prev == LIST_POISON2)
841+ ||
842+ /* case 2: initialized */
843+ (list->next == list &&
844+ list->prev == list)
845+ );
846+}
847+
848+void list_qsort(struct list_head* list, list_cmp_t less_than);
849+
850+
851+#define RT_PREEMPTIVE 0x2050 /* = NP */
852+#define RT_NON_PREEMPTIVE 0x4e50 /* = P */
853+#define RT_EXIT_NP_REQUESTED 0x5251 /* = RQ */
854+
855+/* returns 1 if task t has registered an np flag and set it to RT_NON_PREEMPTIVE
856+ */
857+int is_np(struct task_struct *t);
858+
859+/* request that the task should call sys_exit_np()
860+ */
861+void request_exit_np(struct task_struct *t);
862+
863+/* kill naughty tasks
864+ */
865+void scheduler_signal(struct task_struct *t, unsigned int signal);
866+void send_scheduler_signals(void);
867+void np_mem_kill(struct task_struct *t);
868+
869+/* clean up real-time state of a task */
870+void exit_litmus(struct task_struct *dead_tsk);
871+
872+#endif
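An illustrative aside (not part of the patch): typical use of the tracing macros above, which expand to sched_trace_log_message() calls when CONFIG_SCHED_DEBUG_TRACE is enabled (trace_example is a hypothetical name):

	static void trace_example(void)
	{
		TRACE("rt mode is now %d\n", get_rt_mode());
		TRACE_TASK(current, "job %u released\n",
			   current->rt_param.times.job_no);
		TRACE_CUR("running on cpu %d\n", smp_processor_id());
	}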
873diff --git a/include/linux/pfair_common.h b/include/linux/pfair_common.h
874new file mode 100644
875index 0000000..67e18c6
876--- /dev/null
877+++ b/include/linux/pfair_common.h
878@@ -0,0 +1,40 @@
879+/* PFAIR common data structures and utility functions shared by all PFAIR
880+ * based scheduler plugins
881+ */
882+
883+#ifndef __UNC_PFAIR_COMMON_H__
884+#define __UNC_PFAIR_COMMON_H__
885+
886+#include <linux/queuelock.h>
887+#include <linux/cpumask.h>
888+
889+typedef struct _pfair_domain {
890+ /* Global lock to protect the data structures */
891+ queuelock_t pfair_lock;
892+ /* runnable rt tasks are in here */
893+ struct list_head ready_queue;
894+
895+ /* real-time tasks waiting for release are in here */
896+ struct list_head release_queue;
897+
898+ /* CPU's in the domain */
899+ cpumask_t domain_cpus;
900+
901+} pfair_domain_t;
902+
903+#define next_ready(pfair) \
904+ (list_entry((pfair)->ready_queue.next, struct task_struct, rt_list))
905+void pfair_domain_init(pfair_domain_t *pfair);
906+void pfair_add_ready(pfair_domain_t* pfair, struct task_struct *new);
907+struct task_struct* __pfair_take_ready(pfair_domain_t* pfair);
908+void pfair_add_release(pfair_domain_t* pfair, struct task_struct *task);
909+void pfair_try_release_pending(pfair_domain_t* pfair);
910+void __pfair_prepare_new_release(struct task_struct *t, jiffie_t start);
911+
912+void pfair_prepare_next_job(struct task_struct *t);
913+void pfair_prepare_next_subtask(struct task_struct *t);
914+
915+void pfair_prepare_new_releases(pfair_domain_t *pfair, jiffie_t start);
916+
917+#endif
918+
919diff --git a/include/linux/pfair_math.h b/include/linux/pfair_math.h
920new file mode 100644
921index 0000000..b2a14e4
922--- /dev/null
923+++ b/include/linux/pfair_math.h
924@@ -0,0 +1,80 @@
925+/* PFAIR Mathematical functions */
926+#ifndef __UNC_PFAIR_MATH_H__
927+#define __UNC_PFAIR_MATH_H__
928+
929+#include <linux/rt_param.h>
930+#include <asm/div64.h>
931+#include <linux/litmus.h>
932+#include <linux/sched.h>
933+
934+/* Type definition for our quantums */
935+typedef unsigned long long quantum_t;
936+
937+/*
938+* This file defines mathematical functions "ceiling", "floor",
939+* and PFAIR specific functions for computing the release and
940+* the deadline of a subtask, as well as tie breakers:
941+* b-bit and group deadline.
942+*/
943+static inline quantum_t FLOOR(quantum_t a, unsigned long b)
944+{
945+ BUG_ON( b == 0);
946+ do_div(a, b);
947+ return a;
948+}
949+static inline quantum_t CEIL(quantum_t a, unsigned long b)
950+{
951+ quantum_t t = FLOOR(a, b);
952+ return (quantum_t)((t * b == a) ? t : (t + 1));
953+}
954+
955+
956+/*
957+* invariant - i-1=get_passed_quanta(t)
958+*
959+* release time of i-th subtask of j-th job is
960+* r_{ij}+\lfloor i-1/wt(T) \rfloor
961+* This operation should be robust to wrap-around
962+* so we can compare the result with jiffies safely
963+*/
964+static inline quantum_t release_time(struct task_struct * t)
965+{
966+ quantum_t e = get_exec_cost(t);
967+ quantum_t p = get_rt_period(t);
968+ return FLOOR((get_passed_quanta(t)) * p, e);
969+}
970+/*
971+* deadline time of i-th subtask of j-th job is
972+* r_{ij}+\lceil i/wt(T) \rceil
973+* This operation should be robust to wrap-around
974+* so we can compare the result with jiffies safely
975+*/
976+static inline quantum_t pfair_deadline(struct task_struct * t)
977+{
978+ quantum_t e = get_exec_cost(t);
979+ quantum_t p = get_rt_period(t);
980+ return CEIL((get_passed_quanta(t) + 1) * p, e);
981+}
982+/* In PFAIR b-bit is defined as
983+* \lceil i/wt(T) \rceil-\lfloor i/wt(T) \rfloor
984+*/
985+static inline int b_bit(struct task_struct *t)
986+{
987+ quantum_t e = get_exec_cost(t);
988+ quantum_t p = get_rt_period(t);
989+ return CEIL((get_passed_quanta(t) + 1) * p, e)-
990+ FLOOR((get_passed_quanta(t) + 1) * p, e);
991+}
992+/*
993+* Group deadline
994+*/
995+static inline quantum_t group_deadline(struct task_struct * t)
996+{
997+ quantum_t p = get_rt_period(t);
998+ quantum_t e = get_exec_cost(t);
999+ quantum_t stage1 = CEIL((get_passed_quanta(t) + 1) * p, e);
1000+ quantum_t stage2 = CEIL(stage1 * (p - e), p);
1001+ return CEIL(stage2 * p, p - e);
1002+}
1003+
1004+#endif /* __UNC_PFAIR_MATH_H__ */
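A worked example (illustrative, not part of the patch) of the subtask formulas above, for a task with execution cost e = 3, period p = 5, and get_passed_quanta(t) == 0:

	/*
	 * release_time(t)   = FLOOR(0 * 5, 3)                   = 0
	 * pfair_deadline(t) = CEIL (1 * 5, 3)                   = 2
	 * b_bit(t)          = CEIL(1 * 5, 3) - FLOOR(1 * 5, 3)  = 2 - 1 = 1
	 */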
1005diff --git a/include/linux/queuelock.h b/include/linux/queuelock.h
1006new file mode 100644
1007index 0000000..454ff81
1008--- /dev/null
1009+++ b/include/linux/queuelock.h
1010@@ -0,0 +1,98 @@
1011+#ifndef _UNC_QUEUELOCK_H_
1012+#define _UNC_QUEUELOCK_H_
1013+/**
1014+* Queue lock
1015+*
1016+* This is an implementation of T. Anderson's queue lock.
1017+* It strives to follow the normal Linux locking conventions
1018+* as much as possible. The rules for acquiring a lock are:
1019+*
1020+* 1) The caller must ensure interrupts and preemptions are disabled.
1021+*
1022+* 2) The caller _cannot_ recursively acquire the lock.
1023+*
1024+* 3) The caller may not sleep while holding the lock. This is currently
1025+* not enforced, but it will not work.
1026+*/
1027+
1028+#include <linux/cache.h>
1029+#include <asm/atomic.h>
1030+#include <linux/smp.h>
1031+
1032+typedef struct {
1033+ /* pad the values being spun on to make sure
1034+ that they are cache local
1035+ */
1036+ union {
1037+ volatile enum {
1038+ MUST_WAIT,
1039+ HAS_LOCK
1040+ } val;
1041+ char padding[SMP_CACHE_BYTES];
1042+ } slots[NR_CPUS];
1043+
1044+ /* since spin_slot is not being spun on it can be
1045+ * in a shared cache line. next_slot will be evicted
1046+ * anyway on every attempt to acquire the lock.
1047+ */
1048+ int spin_slot[NR_CPUS];
1049+
1050+ /* The next slot that will be available.
1051+ */
1052+ atomic_t next_slot;
1053+} queuelock_t;
1054+
1055+
1056+static inline void queue_lock_init(queuelock_t *lock)
1057+{
1058+ int i;
1059+ for (i = 0; i < NR_CPUS; i++) {
1060+ lock->slots[i].val = MUST_WAIT;
1061+ lock->spin_slot[i] = i;
1062+ }
1063+ lock->slots[0].val = HAS_LOCK;
1064+ atomic_set(&lock->next_slot, 0);
1065+}
1066+
1067+
1068+static inline void queue_lock(queuelock_t *lock)
1069+{
1070+ int me = smp_processor_id();
1071+ volatile int* spin_var;
1072+ /* Get slot to spin on. atomic_inc_return() returns the incremented
1073+ * value, so take one off again
1074+ */
1075+ lock->spin_slot[me] = atomic_inc_return(&lock->next_slot) - 1;
1076+ /* check for wrap-around
1077+ * This could probably be optimized away if we ensure that NR_CPUS divides
1078+ * INT_MAX...
1079+ */
1080+ if (unlikely(lock->spin_slot[me] == NR_CPUS - 1))
1081+ atomic_add(-NR_CPUS, &lock->next_slot);
1082+ /* range limit*/
1083+ lock->spin_slot[me] %= NR_CPUS;
1084+ /* spin until you acquire the lock */
1085+ spin_var = (int*) &lock->slots[lock->spin_slot[me]].val;
1086+ while (*spin_var == MUST_WAIT)
1087+ cpu_relax();
1088+
1089+ /* reset the lock */
1090+ lock->slots[lock->spin_slot[me]].val = MUST_WAIT;
1091+ barrier();
1092+}
1093+
1094+
1095+static inline void queue_unlock(queuelock_t *lock)
1096+{
1097+ int me = smp_processor_id();
1098+ barrier();
1099+ lock->slots[(lock->spin_slot[me] + 1) % NR_CPUS].val = HAS_LOCK;
1100+}
1101+
1102+#define queue_lock_irqsave(lock, flags) \
1103+ do { local_irq_save(flags); queue_lock(lock); } while (0);
1104+
1105+#define queue_unlock_irqrestore(lock, flags) \
1106+ do { queue_unlock(lock); local_irq_restore(flags); } while (0);
1107+
1108+#endif /* _UNC_QUEUELOCK_H_ */
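An illustrative aside (not part of the patch): guarding a shared structure with the queue lock above, using the irqsave variant so that interrupts are disabled across the critical section (state_lock and update_shared_state are hypothetical names):

	static queuelock_t state_lock;	/* initialize once with queue_lock_init() */

	static void update_shared_state(void)
	{
		unsigned long flags;

		queue_lock_irqsave(&state_lock, flags);
		/* critical section: must not sleep, must not re-acquire the lock */
		queue_unlock_irqrestore(&state_lock, flags);
	}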
1109diff --git a/include/linux/rt_domain.h b/include/linux/rt_domain.h
1110new file mode 100644
1111index 0000000..237eac7
1112--- /dev/null
1113+++ b/include/linux/rt_domain.h
1114@@ -0,0 +1,98 @@
1115+/* CLEANUP: Add comments and make it less messy.
1116+ *
1117+ */
1118+
1119+#ifndef __UNC_RT_DOMAIN_H__
1120+#define __UNC_RT_DOMAIN_H__
1121+
1122+struct _rt_domain;
1123+
1124+typedef int (*check_resched_needed_t)(struct _rt_domain *rt);
1125+typedef void (*release_at_t)(struct task_struct *t, jiffie_t start);
1126+
1127+typedef struct _rt_domain {
1128+ /* runnable rt tasks are in here */
1129+ rwlock_t ready_lock;
1130+ struct list_head ready_queue;
1131+
1132+ /* real-time tasks waiting for release are in here */
1133+ spinlock_t release_lock;
1134+ struct list_head release_queue;
1135+
1136+ /* how do we check if we need to kick another CPU? */
1137+ check_resched_needed_t check_resched;
1138+
1139+ /* how are tasks ordered in the ready queue? */
1140+ list_cmp_t order;
1141+} rt_domain_t;
1142+
1143+#define next_ready(rt) \
1144+ (list_entry((rt)->ready_queue.next, struct task_struct, rt_list))
1145+
1146+#define ready_jobs_pending(rt) \
1147+ (!list_empty(&(rt)->ready_queue))
1148+
1149+void rt_domain_init(rt_domain_t *rt, check_resched_needed_t f,
1150+ list_cmp_t order);
1151+
1152+void __add_ready(rt_domain_t* rt, struct task_struct *new);
1153+void __add_release(rt_domain_t* rt, struct task_struct *task);
1154+
1155+struct task_struct* __take_ready_rq(rt_domain_t* rt, runqueue_t* rq, int cpu);
1156+struct task_struct* __take_ready(rt_domain_t* rt);
1157+struct task_struct* __peek_ready(rt_domain_t* rt);
1158+
1159+void try_release_pending(rt_domain_t* rt);
1160+void __release_pending(rt_domain_t* rt);
1161+
1162+void rerelease_all(rt_domain_t *rt, release_at_t release);
1163+void __rerelease_all(rt_domain_t *rt, release_at_t release);
1164+
1165+static inline void add_ready(rt_domain_t* rt, struct task_struct *new)
1166+{
1167+ unsigned long flags;
1168+ /* first we need the write lock for rt_ready_queue */
1169+ write_lock_irqsave(&rt->ready_lock, flags);
1170+ __add_ready(rt, new);
1171+ write_unlock_irqrestore(&rt->ready_lock, flags);
1172+}
1173+
1174+static inline struct task_struct* take_ready(rt_domain_t* rt)
1175+{
1176+ unsigned long flags;
1177+ struct task_struct* ret;
1178+ /* first we need the write lock for rt_ready_queue */
1179+ write_lock_irqsave(&rt->ready_lock, flags);
1180+ ret = __take_ready(rt);
1181+ write_unlock_irqrestore(&rt->ready_lock, flags);
1182+ return ret;
1183+}
1184+
1185+
1186+static inline void add_release(rt_domain_t* rt, struct task_struct *task)
1187+{
1188+ unsigned long flags;
1189+ /* first we need the write lock for rt_ready_queue */
1190+ spin_lock_irqsave(&rt->release_lock, flags);
1191+ __add_release(rt, task);
1192+ spin_unlock_irqrestore(&rt->release_lock, flags);
1193+}
1194+
1195+static inline int __jobs_pending(rt_domain_t* rt)
1196+{
1197+ return !list_empty(&rt->ready_queue);
1198+}
1199+
1200+static inline int jobs_pending(rt_domain_t* rt)
1201+{
1202+ unsigned long flags;
1203+ int ret;
1204+ /* first we need the write lock for rt_ready_queue */
1205+ read_lock_irqsave(&rt->ready_lock, flags);
1206+ ret = __jobs_pending(rt);
1207+ read_unlock_irqrestore(&rt->ready_lock, flags);
1208+ return ret;
1209+}
1210+
1211+
1212+#endif
1213diff --git a/include/linux/rt_param.h b/include/linux/rt_param.h
1214new file mode 100644
1215index 0000000..426a929
1216--- /dev/null
1217+++ b/include/linux/rt_param.h
1218@@ -0,0 +1,264 @@
1219+/*
1220+ * Definition of the scheduler plugin interface.
1221+ *
1222+ */
1223+#ifndef _LINUX_RT_PARAM_H_
1224+#define _LINUX_RT_PARAM_H_
1225+
1226+#include <linux/wait.h>
1227+
1228+typedef unsigned long jiffie_t;
1229+
1230+/* different types of clients */
1231+typedef enum {
1232+ RT_CLASS_HARD,
1233+ RT_CLASS_SOFT,
1234+ RT_CLASS_BEST_EFFORT
1235+} task_class_t;
1236+
1237+typedef struct rt_param {
1238+ unsigned long exec_cost;
1239+ unsigned long period;
1240+ unsigned int cpu;
1241+ task_class_t class;
1242+} rt_param_t;
1243+
1244+/* fixed point wrapper to force compiler
1245+ * errors in case of misuse of a fixed point value
1246+ */
1247+typedef struct
1248+{
1249+ long val;
1250+} fp_t;
1251+
1252+typedef struct {
1253+ fp_t weight;
1254+ unsigned long period;
1255+ fp_t value;
1256+} service_level_t;
1257+
1258+typedef struct {
1259+ fp_t estimate;
1260+ fp_t accumulated;
1261+} predictor_state_t;
1262+
1263+typedef struct {
1264+ /* when will this task be released the next time? */
1265+ jiffie_t release;
1266+ /* time instant the last job was released */
1267+ jiffie_t last_release;
1268+ /* what is the current deadline? */
1269+ jiffie_t deadline;
1270+ /* b-bit tie breaker for PFAIR, it is ignored in EDF */
1271+ int b_bit;
1272+ /* group deadline tie breaker, it is ignored in EDF */
1273+ jiffie_t group_deadline;
1274+ /* how long has this task executed so far?
1275+ * In case of capacity sharing a job completion cannot be
1276+ * detected by checking time_slice == 0 as the job may have
1277+ * executed while using another capacity. Use this counter
1278+ * to keep track of the time spent on a CPU by a job.
1279+ *
1280+ * In other words: The number of consumed quanta since the
1281+ * last job release.
1282+ */
1283+ unsigned int exec_time;
1284+
1285+ /* Which job is this. This is used to let user space
1286+ * specify which job to wait for, which is important if jobs
1287+ * overrun. If we just call sys_sleep_next_period() then we
1288+ * will unintentionally miss jobs after an overrun.
1289+ *
1290+ * Increase this sequence number when a job is released.
1291+ */
1292+ unsigned int job_no;
1293+} rt_times_t;
1294+
1295+
1296+/* RT task parameters for scheduling extensions
1297+ * These parameters are inherited during clone and therefore must
1298+ * be explicitly set up before the task set is launched.
1299+ */
1300+typedef struct task_rt_param {
1301+ /* is the task sleeping? */
1302+ unsigned int flags:8;
1303+
1304+ /* Real-time marker: 1 iff it is a LITMUS real-time task.
1305+ */
1306+ unsigned int is_realtime:1;
1307+
1308+ /* is this task under control of litmus?
1309+ *
1310+ * this is necessary because otherwise signal delivery code
1311+ * may try to wake up a task that is already queued in plugin
1312+ * data structures.
1313+ */
1314+ unsigned int litmus_controlled:1;
1315+
1316+ /* Did this task register any SRP controlled resource accesses?
1317+ * This, of course, should only ever be true under partitioning.
1318+ * However, this limitation is not currently enforced.
1319+ */
1320+ unsigned int subject_to_srp:1;
1321+
1322+ /* user controlled parameters */
1323+ rt_param_t basic_params;
1324+
1325+ /* task representing the current "inherited" task
1326+ * priority, assigned by inherit_priority and
1327+ * return priority in the scheduler plugins.
1328+ * could point to self if PI does not result in
1329+ * an increased task priority.
1330+ */
1331+ struct task_struct* inh_task;
1332+
1333+ /* Don't just dereference this pointer in kernel space!
1334+ * It might very well point to junk or nothing at all.
1335+ * NULL indicates that the task has not requested any non-preemptable
1336+ * section support.
1337+ * TODO: What happens on fork?
1338+ */
1339+ __user short* np_flag;
1340+
1341+ /* For the FMLP under PSN-EDF, it is required to make the task
1342+ * non-preemptive from kernel space. In order not to interfere with
1343+ * user space, this counter indicates the kernel space np setting.
1344+ * kernel_np > 0 => task is non-preemptive
1345+ */
1346+ unsigned int kernel_np;
1347+
1348+ /* timing parameters */
1349+ rt_times_t times;
1350+
1351+ /* This is currently only used by the PFAIR code
1352+ * and a prime candidate for cleanup.
1353+ */
1354+ rt_times_t backup;
1355+
1356+ /* This field can be used by plugins to store where the task
1357+ * is currently scheduled. It is the responsibility of the
1358+ * plugin to avoid race conditions.
1359+ *
1360+ * Used by GSN-EDF.
1361+ */
1362+ int scheduled_on;
1363+
1364+ /* This field can be used by plugins to store where the task
1365+ * is currently linked. It is the responsibility of the plugin
1366+ * to avoid race conditions.
1367+ *
1368+ * Used by GSN-EDF.
1369+ */
1370+ int linked_on;
1371+
1372+ /* Adaptive support. Adaptive tasks will store service levels
1373+ * in this (dynamically allocated) structure.
1374+ */
1375+ service_level_t* service_level;
1376+ unsigned int no_service_levels;
1377+ unsigned int cur_service_level;
1378+
1379+ /* Adaptive support. Store state for weight estimation.
1380+ */
1381+ predictor_state_t predictor_state;
1382+
1383+ /* Adaptive support. Optimizer fields.
1384+ */
1385+ struct list_head opt_list;
1386+ fp_t opt_order;
1387+ fp_t opt_dw;
1388+ fp_t opt_nw;
1389+ unsigned int opt_level;
1390+ jiffie_t opt_change;
1391+} task_rt_param_t;
1392+
1393+/* Possible RT flags */
1394+#define RT_F_RUNNING 0x00000000
1395+#define RT_F_SLEEP 0x00000001
1396+#define RT_F_EXP_QUANTA 0x00000002
1397+#define RT_F_NON_PREEMTABLE 0x00000004
1398+#define RT_F_EXIT_SEM 0x00000008
1399+
1400+#define is_realtime(t) ((t)->rt_param.is_realtime)
1401+
1402+/* Realtime utility macros */
1403+#define get_passed_quanta(t) ((t)->rt_param.times.exec_time)
1404+#define inc_passed_quanta(t) ((t)->rt_param.times.exec_time += 1)
1405+#define get_rt_flags(t) ((t)->rt_param.flags)
1406+#define set_rt_flags(t,f) (t)->rt_param.flags=(f)
1407+#define get_exec_cost(t) ((t)->rt_param.basic_params.exec_cost)
1408+#define get_rt_period(t) ((t)->rt_param.basic_params.period)
1409+#define set_rt_period(t,p) (t)->rt_param.basic_params.period=(p)
1410+#define set_exec_cost(t,e) (t)->rt_param.basic_params.exec_cost=(e)
1411+#define get_partition(t) (t)->rt_param.basic_params.cpu
1412+#define get_deadline(t) ((t)->rt_param.times.deadline)
1413+#define get_last_release(t) ((t)->rt_param.times.last_release)
1414+#define get_class(t) ((t)->rt_param.basic_params.class)
1415+
1416+#define has_active_job(t) \
1417+ (time_before(get_last_release(t), jiffies) \
1418+ && time_before_eq(jiffies, get_deadline(t)))
1419+
1420+#define get_est_weight(t) ((t)->rt_param.predictor_state.estimate)
1421+#define get_sl(t, l) \
1422+ ((t)->rt_param.service_level[l])
1423+#define get_cur_sl(t) ((t)->rt_param.cur_service_level)
1424+#define get_max_sl(t) ((t)->rt_param.no_service_levels - 1)
1425+#define get_opt_sl(t) ((t)->rt_param.opt_level)
1426+
1427+
1428+#define is_subject_to_srp(t) ((t)->rt_param.subject_to_srp)
1429+#define is_hrt(t) \
1430+ ((t)->rt_param.basic_params.class == RT_CLASS_HARD)
1431+#define is_srt(t) \
1432+ ((t)->rt_param.basic_params.class == RT_CLASS_SOFT)
1433+#define is_be(t) \
1434+ ((t)->rt_param.basic_params.class == RT_CLASS_BEST_EFFORT)
1435+
1436+#define clear_rt_params(t) \
1437+memset(&(t)->rt_param,0, sizeof(struct task_rt_param))
1438+
1439+#define get_release(t) ((t)->rt_param.times.release)
1440+#define set_release(t,r) ((t)->rt_param.times.release=(r))
1441+
1442+/* honor the flag that is set when scheduling is in progress
1443+ * This is some dirty hack in Linux that creates race conditions in our code
1444+ * if we don't pay attention to it.
1445+ */
1446+#define is_running(t) \
1447+ ((t)->state == TASK_RUNNING || \
1448+ (t)->thread_info->preempt_count & PREEMPT_ACTIVE)
1449+
1450+#define is_blocked(t) (!is_running(t))
1451+#define is_released(t) (time_before_eq((t)->rt_param.times.release, jiffies))
1452+#define is_tardy(t) (time_before_eq((t)->rt_param.times.deadline, jiffies))
1453+#define task_slack(t) ( (int) (t)->rt_param.times.deadline - (int) jiffies - \
1454+ (int) ((t)->rt_param.basic_params.exec_cost - \
1455+ (t)->rt_param.times.exec_time))
1456+
1457+
1458+/* real-time comparison macros */
1459+#define earlier_deadline(a, b) (time_before(\
1460+ (a)->rt_param.times.deadline,\
1461+ (b)->rt_param.times.deadline))
1462+#define earlier_release(a, b) (time_before(\
1463+ (a)->rt_param.times.release,\
1464+ (b)->rt_param.times.release))
1465+
1466+#define earlier_last_release(a, b) (time_before(\
1467+ (a)->rt_param.times.last_release,\
1468+ (b)->rt_param.times.last_release))
1469+
1470+
1471+#define make_np(t) do {t->rt_param.kernel_np++;} while(0);
1472+#define take_np(t) do {t->rt_param.kernel_np--;} while(0);
1473+
1474+#define backup_times(t) do { (t)->rt_param.backup=(t)->rt_param.times; \
1475+ } while(0);
1476+#define restore_times(t) do { (t)->rt_param.times=(t)->rt_param.backup; \
1477+ } while(0);
1478+
1479+
1480+#define rt_list2task(p) list_entry(p, struct task_struct, rt_list)
1481+
1482+#endif
1483diff --git a/include/linux/sched.h b/include/linux/sched.h
1484index 4463735..f533ae3 100644
1485--- a/include/linux/sched.h
1486+++ b/include/linux/sched.h
1487@@ -3,6 +3,8 @@
1488
1489 #include <linux/auxvec.h> /* For AT_VECTOR_SIZE */
1490
1491+#include <linux/rt_param.h>
1492+
1493 /*
1494 * cloning flags:
1495 */
1496@@ -26,6 +28,8 @@
1497 #define CLONE_STOPPED 0x02000000 /* Start in stopped state */
1498 #define CLONE_NEWUTS 0x04000000 /* New utsname group? */
1499 #define CLONE_NEWIPC 0x08000000 /* New ipcs */
1500+#define CLONE_REALTIME 0x10000000 /* LITMUS real-time task creation */
1501+
1502
1503 /*
1504 * Scheduling policies
1505@@ -1051,6 +1055,12 @@ struct task_struct {
1506 #ifdef CONFIG_FAULT_INJECTION
1507 int make_it_fail;
1508 #endif
1509+ /* litmus parameters and state */
1510+ task_rt_param_t rt_param;
1511+
1512+ /* allow scheduler plugins to queue in release lists, etc. */
1513+ struct list_head rt_list;
1514+
1515 };
1516
1517 static inline pid_t process_group(struct task_struct *tsk)
1518diff --git a/include/linux/sched_plugin.h b/include/linux/sched_plugin.h
1519new file mode 100644
1520index 0000000..1ea8178
1521--- /dev/null
1522+++ b/include/linux/sched_plugin.h
1523@@ -0,0 +1,149 @@
1524+/*
1525+ * Definition of the scheduler plugin interface.
1526+ *
1527+ */
1528+#ifndef _LINUX_SCHED_PLUGIN_H_
1529+#define _LINUX_SCHED_PLUGIN_H_
1530+
1531+#include <linux/sched.h>
1532+
1533+/* struct for semaphore with priority inheritance */
1534+struct pi_semaphore {
1535+ atomic_t count;
1536+ int sleepers;
1537+ wait_queue_head_t wait;
1538+ union {
1539+ /* highest-prio holder/waiter */
1540+ struct task_struct *task;
1541+ struct task_struct* cpu_task[NR_CPUS];
1542+ } hp;
1543+ /* current lock holder */
1544+ struct task_struct *holder;
1545+ /* is the semaphore being used? */
1546+ int used;
1547+};
1548+
1549+
1550+/* Enforce runqueues to be opaque objects.
1551+ *
1552+ * This allows us to pass around pointers to runqueues,
1553+ * without actually having to rip it out of sched.c. It
1554+ * also discourages plugins from trying to be
1555+ * overly clever.
1556+ */
1557+typedef void runqueue_t;
1558+
1559+
1560+/********************* scheduler invocation ******************/
1561+
1562+typedef enum {
1563+ NO_RESCHED = 0,
1564+ FORCE_RESCHED = 1
1565+} reschedule_check_t;
1566+
1567+
1568+/* Plugin-specific realtime tick handler */
1569+typedef reschedule_check_t (*scheduler_tick_t) (void);
1570+/* Novell make sched decision function */
1571+typedef int (*schedule_t) (struct task_struct * prev,
1572+ struct task_struct ** next,
1573+ runqueue_t * rq);
1574+/* Clean up after the task switch has occurred.
1575+ * This function is called after every (even non-rt) task switch.
1576+ */
1577+typedef void (*finish_switch_t)(struct task_struct *prev);
1578+
1579+
1580+/********************* task state changes ********************/
1581+
1582+/* called to setup a new real-time task */
1583+typedef long (*prepare_task_t) (struct task_struct *task);
1584+/* called to re-introduce a task after blocking */
1585+typedef void (*wake_up_task_t) (struct task_struct *task);
1586+/* called to notify the plugin of a blocking real-time task
1587+ * it will only be called for real-time tasks and before schedule is called */
1588+typedef void (*task_blocks_t) (struct task_struct *task);
1589+/* called when a real-time task exits. Free any allocated resources */
1590+typedef long (*tear_down_t) (struct task_struct *);
1591+
1592+/* Called when the new_owner is released from the wait queue
1593+ * it should now inherit the priority from sem, _before_ it gets readded
1594+ * to any queue
1595+ */
1596+typedef long (*inherit_priority_t) (struct pi_semaphore *sem,
1597+ struct task_struct *new_owner);
1598+
1599+/* Called when the current task releases a semaphore where it might have
1600+ * inherited a priority from
1601+ */
1602+typedef long (*return_priority_t) (struct pi_semaphore *sem);
1603+
1604+/* Called when a task tries to acquire a semaphore and fails. Check if its
1605+ * priority is higher than that of the current holder.
1606+ */
1607+typedef long (*pi_block_t) (struct pi_semaphore *sem, struct task_struct *t);
1608+
1609+
1610+/********************* sys call backends ********************/
1611+/* This function causes the caller to sleep until the next release */
1612+typedef long (*sleep_next_period_t) (void);
1613+
1614+typedef int (*scheduler_setup_t) (int cmd, void __user *parameter);
1615+
1616+typedef int (*mode_change_t) (int);
1617+
1618+struct sched_plugin {
1619+ /* basic info */
1620+ char *plugin_name;
1621+ int ready_to_use;
1622+
1623+ /* management interface */
1624+ mode_change_t mode_change;
1625+
1626+ /* scheduler invocation */
1627+ scheduler_tick_t scheduler_tick;
1628+ schedule_t schedule;
1629+ finish_switch_t finish_switch;
1630+
1631+ /* syscall backend */
1632+ sleep_next_period_t sleep_next_period;
1633+ scheduler_setup_t scheduler_setup;
1634+
1635+ /* task state changes */
1636+ prepare_task_t prepare_task;
1637+ wake_up_task_t wake_up_task;
1638+ task_blocks_t task_blocks;
1639+ tear_down_t tear_down;
1640+
1641+ /* priority inheritance */
1642+ inherit_priority_t inherit_priority;
1643+ return_priority_t return_priority;
1644+ pi_block_t pi_block;
1645+} __attribute__ ((__aligned__(SMP_CACHE_BYTES)));
1646+
1647+typedef struct sched_plugin sched_plugin_t;
1648+
1649+extern sched_plugin_t *curr_sched_plugin;
1650+
1651+
1652+/* common scheduler tick */
1653+reschedule_check_t rt_scheduler_tick(void);
1654+
1655+
1656+/* Don't pull in our definitions on top of the real ones
1657+ * in sched.c!
1658+ */
1659+#ifndef __SCHED_C__
1660+
1661+/* External linux scheduler facilities */
1662+void deactivate_task(struct task_struct *, runqueue_t *);
1663+/* This function is defined in sched.c. We need access to it for
1664+ * indirect switching.
1665+ */
1666+void __activate_task(struct task_struct *, runqueue_t *);
1667+void __setscheduler(struct task_struct *, int, int);
1668+
1669+#endif
1670+
1671+extern int get_sched_options(void);
1672+#endif
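As an illustrative aside (not part of the patch), a scheduler plugin fills in the callback table defined above; all demo_* callbacks in this sketch are hypothetical placeholders:

	static sched_plugin_t demo_plugin = {
		.plugin_name    = "demo",
		.ready_to_use   = 1,
		.mode_change    = demo_mode_change,
		.scheduler_tick = demo_scheduler_tick,
		.schedule       = demo_schedule,
		.finish_switch  = demo_finish_switch,
		.prepare_task   = demo_prepare_task,
		.wake_up_task   = demo_wake_up_task,
		.task_blocks    = demo_task_blocks,
		.tear_down      = demo_tear_down,
	};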
1673diff --git a/include/linux/sched_trace.h b/include/linux/sched_trace.h
1674new file mode 100644
1675index 0000000..308cc7d
1676--- /dev/null
1677+++ b/include/linux/sched_trace.h
1678@@ -0,0 +1,182 @@
1679+/* sched_trace.h -- record scheduler events to a byte stream for offline analysis.
1680+ */
1681+#ifndef _LINUX_SCHED_TRACE_H_
1682+#define _LINUX_SCHED_TRACE_H_
1683+
1684+#include <linux/sched.h>
1685+
1686+typedef enum {
1687+ ST_INVOCATION = 0,
1688+ ST_ARRIVAL = 1,
1689+ ST_DEPARTURE = 2,
1690+ ST_PREEMPTION = 3,
1691+ ST_SCHEDULED = 4,
1692+ ST_JOB_RELEASE = 5,
1693+ ST_JOB_COMPLETION = 6,
1694+ ST_CAPACITY_RELEASE = 7,
1695+ ST_CAPACITY_ALLOCATION = 8,
1696+ ST_SERVICE_LEVEL_CHANGE = 9,
1697+ ST_WEIGHT_ERROR = 10,
1698+} trace_type_t;
1699+
1700+typedef struct {
1701+ trace_type_t trace:8;
1702+ unsigned int size:24;
1703+ unsigned long long timestamp;
1704+} trace_header_t;
1705+
1706+
1707+typedef struct {
1708+ unsigned int is_rt:1;
1709+ unsigned int is_server:1;
1710+ task_class_t class:4;
1711+ unsigned int budget:24;
1712+ u32 deadline;
1713+
1714+ pid_t pid;
1715+} task_info_t;
1716+
1717+typedef struct {
1718+ trace_header_t header;
1719+ unsigned long flags;
1720+} invocation_record_t;
1721+
1722+typedef struct {
1723+ trace_header_t header;
1724+ task_info_t task;
1725+} arrival_record_t;
1726+
1727+typedef struct {
1728+ trace_header_t header;
1729+ task_info_t task;
1730+} departure_record_t;
1731+
1732+typedef struct {
1733+ trace_header_t header;
1734+ task_info_t task;
1735+ task_info_t by;
1736+} preemption_record_t;
1737+
1738+typedef struct {
1739+ trace_header_t header;
1740+ task_info_t task;
1741+} scheduled_record_t;
1742+
1743+typedef struct {
1744+ trace_header_t header;
1745+ task_info_t task;
1746+ u16 period;
1747+ u16 wcet;
1748+} release_record_t;
1749+
1750+typedef struct {
1751+ trace_header_t header;
1752+ task_info_t task;
1753+ u16 period;
1754+ u16 wcet;
1755+ int tardiness;
1756+ unsigned int job_no;
1757+} completion_record_t;
1758+
1759+typedef struct {
1760+ trace_header_t header;
1761+ task_info_t task;
1762+} cap_release_record_t;
1763+
1764+typedef struct {
1765+ trace_header_t header;
1766+ task_info_t task;
1767+ u16 budget;
1768+ u32 deadline;
1769+ pid_t donor;
1770+} cap_allocation_record_t;
1771+
1772+typedef struct {
1773+ trace_header_t header;
1774+ task_info_t task;
1775+ unsigned int from:16;
1776+ unsigned int to:16;
1777+ service_level_t new_level;
1778+ service_level_t old_level;
1779+} service_level_change_record_t;
1780+
1781+typedef struct {
1782+ trace_header_t header;
1783+ pid_t task;
1784+ fp_t estimate;
1785+ fp_t actual;
1786+} weight_error_record_t;
1787+
1788+#ifdef CONFIG_SCHED_TASK_TRACE
1789+void sched_trace_scheduler_invocation(void);
1790+
1791+void sched_trace_task_arrival(struct task_struct *t);
1792+void sched_trace_task_departure(struct task_struct *t);
1793+void sched_trace_task_preemption(struct task_struct *t,
1794+ struct task_struct* by);
1795+void sched_trace_task_scheduled(struct task_struct *);
1796+
1797+void sched_trace_job_release(struct task_struct *t);
1798+void sched_trace_job_completion(struct task_struct *t);
1799+
1800+void sched_trace_capacity_release(struct task_struct *t);
1801+void sched_trace_capacity_allocation(struct task_struct *t,
1802+ u16 budget, u32 deadline, pid_t donor);
1803+
1804+void sched_trace_capacity_alloc_srv(pid_t srv, u32 srv_dl, task_class_t cls,
1805+ u16 srv_budget,
1806+ u16 budget, u32 deadline, pid_t donor);
1807+
1808+void sched_trace_server_release(int id, unsigned int wcet,
1809+ unsigned int period,
1810+ task_class_t class);
1811+
1812+void sched_trace_server_completion(int id, unsigned int budget,
1813+ jiffie_t deadline,
1814+ task_class_t class);
1815+
1816+void sched_trace_server_scheduled(int id, task_class_t class,
1817+ unsigned int budget, jiffie_t deadline);
1818+
1819+void sched_trace_service_level_change(struct task_struct* t,
1820+ unsigned int from,
1821+ unsigned int to);
1822+
1823+void sched_trace_weight_error(struct task_struct* t, fp_t actual);
1824+
1825+#else
1826+#define sched_trace_scheduler_invocation(x)
1827+
1828+#define sched_trace_task_arrival(t)
1829+#define sched_trace_task_departure(t)
1830+#define sched_trace_task_preemption(t, by)
1831+#define sched_trace_task_scheduled(t)
1832+#define sched_trace_job_release(t)
1833+#define sched_trace_job_completion(t)
1834+#define sched_trace_capacity_release(t)
1835+#define sched_trace_capacity_allocation(t, budget, deadline, donor)
1836+#define sched_trace_capacity_alloc_srv(srv, srv_dl, cls, srv_budget,\
1837+ budget, deadline, donor)
1838+#define sched_trace_server_release(id, wcet, period, class)
1839+#define sched_trace_server_completion(id, budget, deadline, class)
1840+#define sched_trace_server_scheduled(id, class, budget, deadline)
1841+
1842+#define sched_trace_service_level_change(t, a, b)
1843+
1844+#define sched_trace_weight_error(x, y)
1845+
1846+
1847+#endif
1848+
1849+
1850+#ifdef CONFIG_SCHED_DEBUG_TRACE
1851+void sched_trace_log_message(const char* fmt, ...);
1852+
1853+#else
1854+
1855+#define sched_trace_log_message(fmt, ...)
1856+
1857+#endif
1858+
1859+
1860+#endif
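
A quick aside on usage (not part of the patch): a scheduler plugin calls the hooks declared above at the corresponding events; when CONFIG_SCHED_TASK_TRACE is disabled, the macro variants compile the calls away entirely. A minimal, hypothetical call site might look as follows, assuming the declarations above live in <linux/sched_trace.h>, as the includes later in this patch suggest:

	#include <linux/sched.h>
	#include <linux/sched_trace.h>

	static void example_release_hook(struct task_struct *t)
	{
		/* records a release event when CONFIG_SCHED_TASK_TRACE is set;
		 * expands to nothing otherwise (see the macros above) */
		sched_trace_job_release(t);
	}
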
1861diff --git a/include/linux/trace.h b/include/linux/trace.h
1862new file mode 100644
1863index 0000000..9e457aa
1864--- /dev/null
1865+++ b/include/linux/trace.h
1866@@ -0,0 +1,74 @@
1867+
1868+#ifndef _SYS_TRACE_H_
1869+#define _SYS_TRACE_H_
1870+
1871+#include <linux/feather_trace.h>
1872+#include <linux/feather_buffer.h>
1873+
1874+
1875+/*********************** TIMESTAMPS ************************/
1876+
1877+struct timestamp {
1878+ unsigned long event;
1879+ unsigned long long timestamp;
1880+ unsigned int seq_no;
1881+ int cpu;
1882+};
1883+
1884+
1885+/* buffer holding time stamps - will be provided by driver */
1886+extern struct ft_buffer* trace_ts_buf;
1887+
1888+/* save_timestamp: stores current time as struct timestamp
1889+ * in trace_ts_buf
1890+ */
1891+asmlinkage void save_timestamp(unsigned long event);
1892+
1893+#define TIMESTAMP(id) ft_event0(id, save_timestamp)
1894+
1895+/* Convention for timestamps
1896+ * =========================
1897+ *
1898+ * convention to measure execution times: the event id of a code segment's
1899+ * end timestamp is always the start event id plus one.
1900+ * always the next number after the start time event id.
1901+ */
1902+
1903+#define TS_SCHED_START TIMESTAMP(100)
1904+#define TS_SCHED_END TIMESTAMP(101)
1905+#define TS_CXS_START TIMESTAMP(102)
1906+#define TS_CXS_END TIMESTAMP(103)
1907+
1908+#define TS_TICK_START TIMESTAMP(110)
1909+#define TS_TICK_END TIMESTAMP(111)
1910+
1911+#define TS_PLUGIN_SCHED_START TIMESTAMP(120)
1912+#define TS_PLUGIN_SCHED_END TIMESTAMP(121)
1913+
1914+#define TS_PLUGIN_TICK_START TIMESTAMP(130)
1915+#define TS_PLUGIN_TICK_END TIMESTAMP(131)
1916+
1917+#define TS_ENTER_NP_START TIMESTAMP(140)
1918+#define TS_ENTER_NP_END TIMESTAMP(141)
1919+
1920+#define TS_EXIT_NP_START TIMESTAMP(150)
1921+#define TS_EXIT_NP_END TIMESTAMP(151)
1922+
1923+#define TS_SRP_UP_START TIMESTAMP(160)
1924+#define TS_SRP_UP_END TIMESTAMP(161)
1925+#define TS_SRP_DOWN_START TIMESTAMP(162)
1926+#define TS_SRP_DOWN_END TIMESTAMP(163)
1927+
1928+#define TS_PI_UP_START TIMESTAMP(170)
1929+#define TS_PI_UP_END TIMESTAMP(171)
1930+#define TS_PI_DOWN_START TIMESTAMP(172)
1931+#define TS_PI_DOWN_END TIMESTAMP(173)
1932+
1933+#define TS_FIFO_UP_START TIMESTAMP(180)
1934+#define TS_FIFO_UP_END TIMESTAMP(181)
1935+#define TS_FIFO_DOWN_START TIMESTAMP(182)
1936+#define TS_FIFO_DOWN_END TIMESTAMP(183)
1937+
1938+
1939+
1940+#endif /* !_SYS_TRACE_H_ */
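
For illustration only (not part of the patch), the pairing convention above means that a measured code path is bracketed by matching _START/_END macros, and an offline tool pairs event id N with id N+1 to compute the elapsed time:

	#include <linux/trace.h>

	static void example_measured_path(void)
	{
		TS_SCHED_START;		/* records event id 100 */
		/* ... code being timed ... */
		TS_SCHED_END;		/* records event id 101, paired with 100 offline */
	}
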
1941diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h
1942index 975c963..6ae0ff9 100644
1943--- a/include/linux/uaccess.h
1944+++ b/include/linux/uaccess.h
1945@@ -84,4 +84,20 @@ static inline unsigned long __copy_from_user_nocache(void *to,
1946 ret; \
1947 })
1948
1949+/* This is a naive attempt at a write version of the above native Linux macro.
1950+ */
1951+#define poke_kernel_address(val, addr) \
1952+ ({ \
1953+ long ret; \
1954+ mm_segment_t old_fs = get_fs(); \
1955+ \
1956+ set_fs(KERNEL_DS); \
1957+ pagefault_disable(); \
1958+ ret = __put_user(val, (__force typeof(val) __user *)(addr)); \
1959+ pagefault_enable(); \
1960+ set_fs(old_fs); \
1961+ ret; \
1962+ })
1963+
1964+
1965 #endif /* __LINUX_UACCESS_H__ */
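
As an illustration (not part of the patch), poke_kernel_address() is used later in kernel/litmus.c to test whether a userspace flag is writable; the pattern looks roughly like the following sketch, where example_probe_writable() and its argument are hypothetical:

	#include <linux/uaccess.h>

	/* write a test value into user space; report -EFAULT if the address
	 * is not mapped/writable */
	static long example_probe_writable(short __user *user_flag)
	{
		short test_val = 0;
		return poke_kernel_address(test_val, user_flag) ? -EFAULT : 0;
	}
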
1966diff --git a/include/linux/wait.h b/include/linux/wait.h
1967index e820d00..c7e96b6 100644
1968--- a/include/linux/wait.h
1969+++ b/include/linux/wait.h
1970@@ -161,6 +161,8 @@ wait_queue_head_t *FASTCALL(bit_waitqueue(void *, int));
1971 #define wake_up_locked(x) __wake_up_locked((x), TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE)
1972 #define wake_up_interruptible_sync(x) __wake_up_sync((x),TASK_INTERRUPTIBLE, 1)
1973
1974+#define pi_wake_up(x) __pi_wake_up(x, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1, NULL)
1975+
1976 #define __wait_event(wq, condition) \
1977 do { \
1978 DEFINE_WAIT(__wait); \
1979diff --git a/kernel/Makefile b/kernel/Makefile
1980index 14f4d45..55acc93 100644
1981--- a/kernel/Makefile
1982+++ b/kernel/Makefile
1983@@ -8,7 +8,12 @@ obj-y = sched.o fork.o exec_domain.o panic.o printk.o profile.o \
1984 signal.o sys.o kmod.o workqueue.o pid.o \
1985 rcupdate.o extable.o params.o posix-timers.o \
1986 kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o mutex.o \
1987- hrtimer.o rwsem.o latency.o nsproxy.o srcu.o
1988+ hrtimer.o rwsem.o latency.o nsproxy.o srcu.o \
1989+ sched_plugin.o litmus.o sched_trace.o \
1990+ edf_common.o fifo_common.o pfair_common.o\
1991+ sched_global_edf.o sched_part_edf.o sched_edf_hsb.o sched_pfair.o \
1992+ sched_gsn_edf.o sched_psn_edf.o litmus_sem.o \
1993+ trace.o ft_event.o rt_domain.o sched_adaptive.o
1994
1995 obj-$(CONFIG_STACKTRACE) += stacktrace.o
1996 obj-y += time/
1997diff --git a/kernel/edf_common.c b/kernel/edf_common.c
1998new file mode 100644
1999index 0000000..4746c66
2000--- /dev/null
2001+++ b/kernel/edf_common.c
2002@@ -0,0 +1,135 @@
2003+/*
2004+ * kernel/edf_common.c
2005+ *
2006+ * Common functions for EDF-based schedulers.
2007+ */
2008+
2009+#include <linux/percpu.h>
2010+#include <linux/sched.h>
2011+#include <linux/list.h>
2012+
2013+#include <linux/litmus.h>
2014+#include <linux/sched_plugin.h>
2015+#include <linux/sched_trace.h>
2016+
2017+
2018+#include <linux/edf_common.h>
2019+
2020+/* edf_higher_prio - returns true if first has a higher EDF priority
2021+ * than second. Deadline ties are broken by PID.
2022+ *
2023+ * first must not be NULL and must be a real-time task.
2024+ * second may be NULL or a non-rt task.
2025+ */
2026+int edf_higher_prio(struct task_struct* first,
2027+ struct task_struct* second)
2028+{
2029+ struct task_struct *first_task = first;
2030+ struct task_struct *second_task = second;
2031+
2032+ /* Check for inherited priorities. Change task
2033+ * used for comparison in such a case.
2034+ */
2035+ if (first && first->rt_param.inh_task)
2036+ first_task = first->rt_param.inh_task;
2037+ if (second && second->rt_param.inh_task)
2038+ second_task = second->rt_param.inh_task;
2039+
2040+ return
2041+ /* does the second task exist and is it a real-time task? If
2042+ * not, the first task (which is a RT task) has higher
2043+ * priority.
2044+ */
2045+ !second_task || !is_realtime(second_task) ||
2046+
2047+ /* is the deadline of the first task earlier?
2048+ * Then it has higher priority.
2049+ */
2050+ earlier_deadline(first_task, second_task) ||
2051+
2052+ /* Do we have a deadline tie?
2053+ * Then break by PID.
2054+ */
2055+ (get_deadline(first_task) == get_deadline(second_task) &&
2056+ (first_task->pid < second_task->pid ||
2057+
2058+ /* If the PIDs are the same then the task with the inherited
2059+ * priority wins.
2060+ */
2061+ (first_task->pid == second_task->pid &&
2062+ !second->rt_param.inh_task)));
2063+}
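
For readability, the compound expression above is equivalent to the following step-by-step sketch (illustration only, not part of the patch; it omits the inheritance substitution performed at the top of edf_higher_prio() and reuses the helpers from <linux/litmus.h> and <linux/edf_common.h>):

	static int edf_higher_prio_sketch(struct task_struct *a, struct task_struct *b)
	{
		if (!b || !is_realtime(b))
			return 1;		/* a real-time task beats non-RT or nothing */
		if (earlier_deadline(a, b))
			return 1;		/* earlier deadline wins */
		if (get_deadline(a) != get_deadline(b))
			return 0;
		if (a->pid != b->pid)
			return a->pid < b->pid;	/* deadline tie: lower PID wins */
		return !b->rt_param.inh_task;	/* same PID: the task holding the inherited priority wins */
	}
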
2064+
2065+int edf_ready_order(struct list_head* a, struct list_head* b)
2066+{
2067+ return edf_higher_prio(
2068+ list_entry(a, struct task_struct, rt_list),
2069+ list_entry(b, struct task_struct, rt_list));
2070+}
2071+
2072+void edf_release_at(struct task_struct *t, jiffie_t start)
2073+{
2074+ t->rt_param.times.deadline = start;
2075+ edf_prepare_for_next_period(t);
2076+ t->rt_param.times.last_release = start;
2077+ set_rt_flags(t, RT_F_RUNNING);
2078+}
2079+
2080+void edf_domain_init(rt_domain_t* rt, check_resched_needed_t resched)
2081+{
2082+ rt_domain_init(rt, resched, edf_ready_order);
2083+}
2084+
2085+void edf_prepare_for_next_period(struct task_struct *t)
2086+{
2087+ BUG_ON(!t);
2088+ /* prepare next release */
2089+ t->rt_param.times.release = t->rt_param.times.deadline;
2090+ t->rt_param.times.deadline += get_rt_period(t);
2091+ t->rt_param.times.exec_time = 0;
2092+ /* update job sequence number */
2093+ t->rt_param.times.job_no++;
2094+
2095+ t->time_slice = get_exec_cost(t);
2096+
2097+ /* who uses this? statistics? */
2098+ t->first_time_slice = 0;
2099+}
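
As a concrete illustration of the bookkeeping above (not part of the patch): for an implicit-deadline task with period 10 whose completed job had its deadline at time 40, the call leaves the following state behind.

	/* after edf_prepare_for_next_period(t), with get_rt_period(t) == 10
	 * and the old deadline at 40:
	 *   t->rt_param.times.release   == 40  (next job released at the old deadline)
	 *   t->rt_param.times.deadline  == 50  (release + period)
	 *   t->rt_param.times.exec_time == 0   (budget accounting restarts)
	 *   t->rt_param.times.job_no    is incremented
	 *   t->time_slice               == get_exec_cost(t)
	 */
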
2100+
2101+/* edf_preemption_needed - check whether the task t needs to be preempted
2102+ * call only with irqs disabled and with ready_lock acquired
2103+ * THIS DOES NOT TAKE NON-PREEMPTIVE SECTIONS INTO ACCOUNT!
2104+ */
2105+int edf_preemption_needed(rt_domain_t* rt, struct task_struct *t)
2106+{
2107+ /* we need the read lock for edf_ready_queue */
2108+ /* no need to preempt if there is nothing pending */
2109+ if (!ready_jobs_pending(rt))
2110+ return 0;
2111+ /* we need to reschedule if t doesn't exist */
2112+ if (!t)
2113+ return 1;
2114+
2115+ /* NOTE: We cannot check for non-preemptibility since we
2116+ * don't know what address space we're currently in.
2117+ */
2118+
2119+ /* make sure to get non-rt stuff out of the way */
2120+ return !is_realtime(t) || edf_higher_prio(next_ready(rt), t);
2121+}
2122+
2123+
2124+/*
2125+ * Deactivate current task until the beginning of the next period.
2126+ */
2127+long edf_sleep_next_period(void)
2128+{
2129+	/* Mark that we do not execute anymore */
2130+ set_rt_flags(current, RT_F_SLEEP);
2131+ /* call schedule, this will return when a new job arrives
2132+ * it also takes care of preparing for the next release
2133+ */
2134+ schedule();
2135+ return 0;
2136+}
2137+
2138diff --git a/kernel/fifo_common.c b/kernel/fifo_common.c
2139new file mode 100644
2140index 0000000..c1641a1
2141--- /dev/null
2142+++ b/kernel/fifo_common.c
2143@@ -0,0 +1,86 @@
2144+/*
2145+ * kernel/fifo_common.c
2146+ *
2147+ * Fifo helper functions. Could one day be a FIFO plugin if someone
2148+ * is interested.
2149+ *
2150+ * The current FIFO implementation automatically chops Linux tasks into
2151+ * smaller jobs by assigning a fixed time slice. Once that time slice expires,
2152+ * it is treated as a new job release (that is queued in the back).
2153+ *
2154+ * The result is that it provides FIFO properties on a job level and round-robin
2155+ * on a task level if the tasks execute continuously.
2156+ */
2157+
2158+#include <asm/uaccess.h>
2159+#include <linux/percpu.h>
2160+#include <linux/sched.h>
2161+#include <linux/list.h>
2162+
2163+#include <linux/litmus.h>
2164+#include <linux/sched_plugin.h>
2165+#include <linux/sched_trace.h>
2166+#include <linux/fifo_common.h>
2167+
2168+/* This function is defined in sched.c. We need access to it for
2169+ * indirect switching.
2170+ */
2171+void __activate_task(struct task_struct *p, runqueue_t *rq);
2172+
2173+/* fifo_higher_prio - returns true if first has a higher FIFO priority
2174+ * than second. Release time ties are broken by PID.
2175+ *
2176+ * first must not be NULL and must be a real-time task.
2177+ * second may be NULL or a non-rt task.
2178+ */
2179+int fifo_higher_prio(struct task_struct* first,
2180+ struct task_struct* second)
2181+{
2182+ struct task_struct *first_task = first;
2183+ struct task_struct *second_task = second;
2184+
2185+ /* Check for inherited priorities. Change task
2186+ * used for comparison in such a case.
2187+ */
2188+ if (first && first->rt_param.inh_task)
2189+ first_task = first->rt_param.inh_task;
2190+ if (second && second->rt_param.inh_task)
2191+ second_task = second->rt_param.inh_task;
2192+
2193+ return
2194+ /* does the second task exist and is it a real-time task? If
2195+ * not, the first task (which is a RT task) has higher
2196+ * priority.
2197+ */
2198+ !second_task || !is_realtime(second_task) ||
2199+
2200+ /* is the release of the first task earlier?
2201+ * Then it has higher priority.
2202+ */
2203+ earlier_last_release(first_task, second_task) ||
2204+
2205+ /* Do we have a release time tie?
2206+ * Then break by PID.
2207+ */
2208+ (get_last_release(first_task) ==
2209+ get_last_release(second_task) &&
2210+ (first_task->pid < second_task->pid ||
2211+
2212+ /* If the PIDs are the same then the task with the inherited
2213+ * priority wins.
2214+ */
2215+ (first_task->pid == second_task->pid &&
2216+ !second->rt_param.inh_task)));
2217+}
2218+
2219+int fifo_ready_order(struct list_head* a, struct list_head* b)
2220+{
2221+ return fifo_higher_prio(
2222+ list_entry(a, struct task_struct, rt_list),
2223+ list_entry(b, struct task_struct, rt_list));
2224+}
2225+
2226+void fifo_domain_init(rt_domain_t* rt, check_resched_needed_t resched)
2227+{
2228+ rt_domain_init(rt, resched, fifo_ready_order);
2229+}
2230diff --git a/kernel/fork.c b/kernel/fork.c
2231index d57118d..d786dcf 100644
2232--- a/kernel/fork.c
2233+++ b/kernel/fork.c
2234@@ -57,6 +57,9 @@
2235 #include <asm/cacheflush.h>
2236 #include <asm/tlbflush.h>
2237
2238+#include <linux/litmus.h>
2239+#include <linux/sched_plugin.h>
2240+
2241 /*
2242 * Protected counters by write_lock_irq(&tasklist_lock)
2243 */
2244@@ -118,6 +121,9 @@ void __put_task_struct(struct task_struct *tsk)
2245 WARN_ON(atomic_read(&tsk->usage));
2246 WARN_ON(tsk == current);
2247
2248+ if (is_realtime(tsk))
2249+ exit_litmus(tsk);
2250+
2251 security_task_free(tsk);
2252 free_uid(tsk->user);
2253 put_group_info(tsk->group_info);
2254diff --git a/kernel/ft_event.c b/kernel/ft_event.c
2255new file mode 100644
2256index 0000000..10318ee
2257--- /dev/null
2258+++ b/kernel/ft_event.c
2259@@ -0,0 +1,104 @@
2260+#include <linux/types.h>
2261+
2262+#include <linux/feather_trace.h>
2263+
2264+/* the feather trace management functions assume
2265+ * exclusive access to the event table
2266+ */
2267+
2268+
2269+#define BYTE_JUMP 0xeb
2270+#define BYTE_JUMP_LEN 0x02
2271+
2272+/* for each event, there is an entry in the event table */
2273+struct trace_event {
2274+ long id;
2275+ long count;
2276+ long start_addr;
2277+ long end_addr;
2278+};
2279+
2280+extern struct trace_event __start___event_table[];
2281+extern struct trace_event __stop___event_table[];
2282+
2283+int ft_enable_event(unsigned long id)
2284+{
2285+ struct trace_event* te = __start___event_table;
2286+ int count = 0;
2287+ char* delta;
2288+ unsigned char* instr;
2289+
2290+ while (te < __stop___event_table) {
2291+ if (te->id == id && ++te->count == 1) {
2292+ instr = (unsigned char*) te->start_addr;
2293+			/* make sure we don't clobber the wrong thing */
2294+ if (*instr == BYTE_JUMP) {
2295+ delta = (((unsigned char*) te->start_addr) + 1);
2296+ *delta = 0;
2297+ }
2298+ }
2299+ if (te->id == id)
2300+ count++;
2301+ te++;
2302+ }
2303+ return count;
2304+}
2305+
2306+int ft_disable_event(unsigned long id)
2307+{
2308+ struct trace_event* te = __start___event_table;
2309+ int count = 0;
2310+ char* delta;
2311+ unsigned char* instr;
2312+
2313+ while (te < __stop___event_table) {
2314+ if (te->id == id && --te->count == 0) {
2315+ instr = (unsigned char*) te->start_addr;
2316+ if (*instr == BYTE_JUMP) {
2317+ delta = (((unsigned char*) te->start_addr) + 1);
2318+ *delta = te->end_addr - te->start_addr -
2319+ BYTE_JUMP_LEN;
2320+ }
2321+ }
2322+ if (te->id == id)
2323+ count++;
2324+ te++;
2325+ }
2326+ return count;
2327+}
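
A short note on the mechanism (inferred from the code above, not an authoritative description): each Feather-Trace event site appears to begin with a two-byte short jump (opcode 0xeb followed by an offset byte).

	/* How enabling/disabling appears to work, based on ft_enable_event()
	 * and ft_disable_event():
	 *   enabled:  the offset byte at start_addr + 1 is set to 0, so the
	 *             jump falls through and the instrumentation code runs;
	 *   disabled: the offset is set to end_addr - start_addr - BYTE_JUMP_LEN,
	 *             so the jump skips over the instrumentation code entirely.
	 */
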
2328+
2329+int ft_disable_all_events(void)
2330+{
2331+ struct trace_event* te = __start___event_table;
2332+ int count = 0;
2333+ char* delta;
2334+ unsigned char* instr;
2335+
2336+ while (te < __stop___event_table) {
2337+ if (te->count) {
2338+ instr = (unsigned char*) te->start_addr;
2339+ if (*instr == BYTE_JUMP) {
2340+ delta = (((unsigned char*) te->start_addr)
2341+ + 1);
2342+ *delta = te->end_addr - te->start_addr -
2343+ BYTE_JUMP_LEN;
2344+ te->count = 0;
2345+ count++;
2346+ }
2347+ }
2348+ te++;
2349+ }
2350+ return count;
2351+}
2352+
2353+int ft_is_event_enabled(unsigned long id)
2354+{
2355+ struct trace_event* te = __start___event_table;
2356+
2357+ while (te < __stop___event_table) {
2358+ if (te->id == id)
2359+ return te->count;
2360+ te++;
2361+ }
2362+ return 0;
2363+}
2364diff --git a/kernel/litmus.c b/kernel/litmus.c
2365new file mode 100644
2366index 0000000..8f238ba
2367--- /dev/null
2368+++ b/kernel/litmus.c
2369@@ -0,0 +1,953 @@
2370+/* litmus.c -- Implementation of the LITMUS syscalls, the LITMUS initialization,
2371+ * and the common tick function.
2372+ */
2373+#include <asm/uaccess.h>
2374+#include <linux/uaccess.h>
2375+#include <linux/sysrq.h>
2376+
2377+#include <linux/queuelock.h>
2378+#include <linux/litmus.h>
2379+#include <linux/sched.h>
2380+#include <linux/sched_plugin.h>
2381+#include <linux/fpmath.h>
2382+
2383+#include <linux/trace.h>
2384+
2385+#define MAX_SERVICE_LEVELS 10
2386+
2387+/* Variables that govern the scheduling process */
2388+spolicy sched_policy = SCHED_DEFAULT;
2389+int sched_options = 0;
2390+
2391+
2392+/* This is a flag for switching the system into RT mode when it is booted up.
2393+ * In RT-mode non-realtime tasks are scheduled as background tasks.
2394+ */
2395+
2396+/* The system is booting in non-realtime mode */
2397+atomic_t rt_mode = ATOMIC_INIT(MODE_NON_RT);
2398+/* Here we specify a mode change to be made */
2399+atomic_t new_mode = ATOMIC_INIT(MODE_NON_RT);
2400+/* Number of RT tasks that exist in the system */
2401+atomic_t n_rt_tasks = ATOMIC_INIT(0);
2402+
2403+/* Only one CPU may perform a mode change. */
2404+static queuelock_t mode_change_lock;
2405+
2406+/* The time instant when we switched to RT mode */
2407+volatile jiffie_t rt_start_time = 0;
2408+
2409+/* To send signals from the scheduler
2410+ * Must drop locks first.
2411+ */
2412+static LIST_HEAD(sched_sig_list);
2413+static DEFINE_SPINLOCK(sched_sig_list_lock);
2414+
2415+/**
2416+ * sys_set_rt_mode
2417+ * @newmode: new mode the scheduler must be switched to
2418+ * External syscall for setting the RT mode flag
2419+ * Returns EINVAL if mode is not recognized or mode transition is
2420+ * not permitted
2421+ * On success 0 is returned
2422+ *
2423+ * FIXME: In a "real" OS we cannot just let any user switch the mode...
2424+ */
2425+asmlinkage long sys_set_rt_mode(int newmode)
2426+{
2427+ if ((newmode == MODE_NON_RT) || (newmode == MODE_RT_RUN)) {
2428+ printk(KERN_INFO "real-time mode switch to %s\n",
2429+ (newmode == MODE_RT_RUN ? "rt" : "non-rt"));
2430+ atomic_set(&new_mode, newmode);
2431+ return 0;
2432+ }
2433+ return -EINVAL;
2434+}
2435+
2436+/*
2437+ * sys_set_task_rt_param
2438+ * sys_set_rt_task_param
2439+ * @pid: Pid of the task whose scheduling parameters must be changed
2440+ * @param: New real-time extension parameters such as the execution cost and
2441+ * period
2442+ * Syscall for manipulating task rt extension params
2443+ * Returns EFAULT if param is NULL.
2444+ * ESRCH if pid does not correspond
2445+ * to a valid task.
2446+ * EPERM if pid is a real-time task
2447+ * 0 if success
2448+ *
2449+ * Only non-real-time tasks may be configured with this system call
2450+ * to avoid races with the scheduler. In practice, this means that a
2451+ * task's parameters must be set _before_ calling sys_prepare_rt_task()
2452+ */
2453+asmlinkage long sys_set_rt_task_param(pid_t pid, rt_param_t __user * param)
2454+{
2455+ rt_param_t tp;
2456+ struct task_struct *target;
2457+ int retval = -EINVAL;
2458+
2459+ printk("Setting up rt task parameters for process %d.\n", pid);
2460+
2461+ if (pid < 0 || param == 0) {
2462+ goto out;
2463+ }
2464+ if (copy_from_user(&tp, param, sizeof(tp))) {
2465+ retval = -EFAULT;
2466+ goto out;
2467+ }
2468+
2469+ /* Task search and manipulation must be protected */
2470+ read_lock_irq(&tasklist_lock);
2471+ if (!(target = find_task_by_pid(pid))) {
2472+ retval = -ESRCH;
2473+ goto out_unlock;
2474+ }
2475+
2476+ if (is_realtime(target)) {
2477+ /* The task is already a real-time task.
2478+		 * We cannot allow parameter changes at this point.
2479+ */
2480+ retval = -EPERM;
2481+ goto out_unlock;
2482+ }
2483+
2484+ if (tp.exec_cost <= 0)
2485+ goto out_unlock;
2486+ if (tp.period <= 0)
2487+ goto out_unlock;
2488+ if (!cpu_online(tp.cpu))
2489+ goto out_unlock;
2490+ if (tp.period < tp.exec_cost)
2491+ {
2492+ printk(KERN_INFO "litmus: real-time task %d rejected "
2493+ "because wcet > period\n", pid);
2494+ goto out_unlock;
2495+ }
2496+
2497+ /* Assign params */
2498+ target->rt_param.basic_params = tp;
2499+
2500+ retval = 0;
2501+ out_unlock:
2502+ read_unlock_irq(&tasklist_lock);
2503+ out:
2504+ return retval;
2505+}
2506+
2507+/* Getter of task's RT params
2508+ * returns EINVAL if param is NULL or pid is negative
2509+ * returns ESRCH if pid does not correspond to a valid task
2510+ * returns EFAULT if copying of parameters has failed.
2511+ */
2512+asmlinkage long sys_get_rt_task_param(pid_t pid, rt_param_t __user * param)
2513+{
2514+ int retval = -EINVAL;
2515+ struct task_struct *source;
2516+ rt_param_t lp;
2517+ if (param == 0 || pid < 0)
2518+ goto out;
2519+ read_lock(&tasklist_lock);
2520+ if (!(source = find_task_by_pid(pid))) {
2521+ retval = -ESRCH;
2522+ goto out_unlock;
2523+ }
2524+ lp = source->rt_param.basic_params;
2525+ read_unlock(&tasklist_lock);
2526+ /* Do copying outside the lock */
2527+ retval =
2528+ copy_to_user(param, &lp, sizeof(lp)) ? -EFAULT : 0;
2529+ return retval;
2530+ out_unlock:
2531+ read_unlock(&tasklist_lock);
2532+ out:
2533+ return retval;
2534+
2535+}
2536+
2537+/*
2538+ * sys_set_service_levels
2539+ * @pid: Pid of the task that is to be configured
2540+ * @count: The number of service levels
2541+ * @levels: The new service levels.
2542+ *
2543+ * Returns EFAULT if levels is not a valid address.
2544+ * ESRCH if pid does not correspond
2545+ * to a valid task.
2546+ * EINVAL if either period or execution cost is <=0 for any level,
2547+ * or if utility is not increasing.
2548+ * EPERM if pid is a real-time task
2549+ * ENOMEM if there is insufficient memory available
2550+ * 0 if success
2551+ *
2552+ * May not be used on RT tasks to avoid races.
2553+ */
2554+asmlinkage long sys_set_service_levels(pid_t pid,
2555+ unsigned int count,
2556+ service_level_t __user *levels)
2557+{
2558+ struct task_struct *target;
2559+ service_level_t level, *klevels;
2560+ int retval = -EINVAL, i;
2561+ fp_t last_value = FP(0);
2562+ fp_t last_weight = FP(0);
2563+
2564+ TRACE("Setting up service levels for process %d.\n", pid);
2565+
2566+ if (pid < 0 || count > MAX_SERVICE_LEVELS) {
2567+ goto out;
2568+ }
2569+
2570+ /* Task search and manipulation must be protected */
2571+ read_lock_irq(&tasklist_lock);
2572+ if (!(target = find_task_by_pid(pid))) {
2573+ retval = -ESRCH;
2574+ read_unlock_irq(&tasklist_lock);
2575+ goto out;
2576+ }
2577+ read_unlock_irq(&tasklist_lock);
2578+
2579+ if (is_realtime(target)) {
2580+ /* The task is already a real-time task.
2581+		 * We cannot allow parameter changes at this point.
2582+ */
2583+ retval = -EPERM;
2584+ goto out;
2585+ }
2586+
2587+ /* get rid of old service levels, if any */
2588+ kfree(target->rt_param.service_level);
2589+ target->rt_param.service_level = NULL;
2590+ target->rt_param.no_service_levels = 0;
2591+
2592+ /* count == 0 means tear down service levels*/
2593+ if (count == 0) {
2594+ retval = 0;
2595+ goto out;
2596+ }
2597+
2598+ klevels = kmalloc(sizeof(service_level_t) * count, GFP_KERNEL);
2599+ if (!klevels) {
2600+ retval = -ENOMEM;
2601+ goto out;
2602+ }
2603+
2604+ for (i = 0; i < count; i++) {
2605+ if (copy_from_user(&level, levels + i, sizeof(level))) {
2606+ retval = -EFAULT;
2607+ kfree(klevels);
2608+ goto out;
2609+ }
2610+ if (level.period <= 0) {
2611+ TRACE("service level %d period <= 0\n", i);
2612+ goto out;
2613+ }
2614+ if (_leq(level.weight, last_weight)) {
2615+ TRACE("service level %d weight non-increase\n", i);
2616+ goto out;
2617+ }
2618+ if (_leq(level.value, last_value)) {
2619+ TRACE("service level %d value non-increase\n", i);
2620+ goto out;
2621+ }
2622+ last_value = level.value;
2623+ last_weight = level.weight;
2624+ klevels[i] = level;
2625+ }
2626+ target->rt_param.basic_params.exec_cost = _round(_mul(klevels[0].weight,
2627+ FP(klevels[0].period)));
2628+ target->rt_param.basic_params.period = klevels[0].period;
2629+ target->rt_param.service_level = klevels;
2630+ target->rt_param.no_service_levels = count;
2631+ retval = 0;
2632+
2633+ out:
2634+ return retval;
2635+}
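
To make the validation rules above concrete, here is a hypothetical table that would be accepted (illustration only; the numbers are made up): all periods are positive, and both weight and value increase strictly from one level to the next. The task's initial parameters are then derived from level 0, i.e. exec_cost = round(0.10 * 100) = 10 and period = 100.

	/* hypothetical, valid input for sys_set_service_levels(pid, 3, levels):
	 *   level 0: weight 0.10, period 100, value 1.0
	 *   level 1: weight 0.25, period 100, value 2.5
	 *   level 2: weight 0.50, period  50, value 4.0
	 */
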
2636+
2637+asmlinkage long sys_get_cur_service_level(void)
2638+{
2639+ long level;
2640+
2641+ if (!is_realtime(current))
2642+ return -EINVAL;
2643+
2644+ /* block scheduler that might cause reweighting to happen */
2645+ local_irq_disable();
2646+ level = current->rt_param.cur_service_level;
2647+ local_irq_enable();
2648+ return level;
2649+}
2650+
2651+
2652+/*
2653+ * sys_prepare_rt_task
2654+ * @pid: Pid of the task we want to prepare for RT mode
2655+ * Syscall for adding a task to RT queue, plugin dependent.
2656+ * Must be called before RT tasks are going to start up.
2657+ * Returns EPERM if current plugin does not define prepare operation
2658+ * or scheduling policy does not allow the operation.
2659+ * ESRCH if pid does not correspond to a valid task.
2660+ * EINVAL if a task is non-realtime or in invalid state
2661+ * from underlying plugin function
2662+ * EAGAIN if a task is not in the right state
2663+ * ENOMEM if there is no memory space to handle this task
2664+ * 0 if success
2665+ */
2666+asmlinkage long sys_prepare_rt_task(pid_t pid)
2667+{
2668+ int retval = -EINVAL;
2669+ struct task_struct *target = 0;
2670+ /* If a plugin does not define preparation mode then nothing to do */
2671+ if (curr_sched_plugin->prepare_task == 0
2672+ || sched_policy == SCHED_DEFAULT) {
2673+ retval = -EPERM;
2674+ goto out_prepare;
2675+ }
2676+ read_lock_irq(&tasklist_lock);
2677+ if (!(target = find_task_by_pid(pid))) {
2678+ retval = -ESRCH;
2679+ goto out_prepare_unlock;
2680+ }
2681+ if (!cpu_online(get_partition(target)))
2682+ {
2683+ printk(KERN_WARNING "litmus prepare: cpu %d is not online\n",
2684+ get_partition(target));
2685+ goto out_prepare_unlock;
2686+ }
2687+ retval = curr_sched_plugin->prepare_task(target);
2688+ if (!retval) {
2689+ atomic_inc(&n_rt_tasks);
2690+ target->rt_param.is_realtime = 1;
2691+ target->rt_param.litmus_controlled = 1;
2692+ }
2693+ out_prepare_unlock:
2694+ read_unlock_irq(&tasklist_lock);
2695+ out_prepare:
2696+ return retval;
2697+}
2698+
2699+
2700+/* implemented in kernel/litmus_sem.c */
2701+void srp_ceiling_block(void);
2702+
2703+/*
2704+ * This is the crucial function for periodic task implementation,
2705+ * It checks if a task is periodic, checks whether such a sleep
2706+ * is permitted, and calls the plugin-specific sleep, which puts the
2707+ * task into a wait array.
2708+ * returns 0 on successful wakeup
2709+ * returns EPERM if current conditions do not permit such sleep
2710+ * returns EINVAL if current task is not able to go to sleep
2711+ */
2712+asmlinkage long sys_sleep_next_period(void)
2713+{
2714+ int retval = -EPERM;
2715+ if (!is_realtime(current)) {
2716+ retval = -EINVAL;
2717+ goto out;
2718+ }
2719+ /* Task with negative or zero period cannot sleep */
2720+ if (get_rt_period(current) <= 0) {
2721+ retval = -EINVAL;
2722+ goto out;
2723+ }
2724+ /* The plugin has to put the task into an
2725+ * appropriate queue and call schedule
2726+ */
2727+ retval = curr_sched_plugin->sleep_next_period();
2728+ if (!retval && is_subject_to_srp(current))
2729+ srp_ceiling_block();
2730+ out:
2731+ return retval;
2732+}
2733+
2734+/* This is an "improved" version of sys_sleep_next_period() that
2735+ * addresses the problem of unintentionally missing a job after
2736+ * an overrun.
2737+ *
2738+ * returns 0 on successful wakeup
2739+ * returns EPERM if current conditions do not permit such sleep
2740+ * returns EINVAL if current task is not able to go to sleep
2741+ */
2742+asmlinkage long sys_wait_for_job_release(unsigned int job)
2743+{
2744+ int retval = -EPERM;
2745+ if (!is_realtime(current)) {
2746+ retval = -EINVAL;
2747+ goto out;
2748+ }
2749+
2750+ /* Task with negative or zero period cannot sleep */
2751+ if (get_rt_period(current) <= 0) {
2752+ retval = -EINVAL;
2753+ goto out;
2754+ }
2755+
2756+ retval = 0;
2757+
2758+ /* first wait until we have "reached" the desired job
2759+ *
2760+ * This implementation has at least two problems:
2761+ *
2762+ * 1) It doesn't gracefully handle the wrap around of
2763+ * job_no. Since LITMUS is a prototype, this is not much
2764+ * of a problem right now.
2765+ *
2766+ * 2) It is theoretically racy if a job release occurs
2767+ * between checking job_no and calling sleep_next_period().
2768+	 * A proper solution would require adding another callback
2769+ * in the plugin structure and testing the condition with
2770+ * interrupts disabled.
2771+ *
2772+ * FIXME: At least problem 2 should be taken care of eventually.
2773+ */
2774+ while (!retval && job > current->rt_param.times.job_no)
2775+ /* If the last job overran then job <= job_no and we
2776+ * don't send the task to sleep.
2777+ */
2778+ retval = curr_sched_plugin->sleep_next_period();
2779+
2780+ /* We still have to honor the SRP after the actual release.
2781+ */
2782+ if (!retval && is_subject_to_srp(current))
2783+ srp_ceiling_block();
2784+ out:
2785+ return retval;
2786+}
2787+
2788+/* This is a helper syscall to query the current job sequence number.
2789+ *
2790+ * returns 0 on successful query
2791+ * returns EPERM if task is not a real-time task.
2792+ * returns EFAULT if &job is not a valid pointer.
2793+ */
2794+asmlinkage long sys_query_job_no(unsigned int __user *job)
2795+{
2796+ int retval = -EPERM;
2797+ if (is_realtime(current))
2798+ retval = put_user(current->rt_param.times.job_no, job);
2799+
2800+ return retval;
2801+}
2802+
2803+
2804+/* The LITMUS tick function. It manages the change to and from real-time mode
2805+ * and then calls the plugin's tick function.
2806+ */
2807+reschedule_check_t __sched rt_scheduler_tick(void)
2808+{
2809+ /* Check for mode change */
2810+ if ((get_rt_mode() != atomic_read(&new_mode))) {
2811+ queue_lock(&mode_change_lock);
2812+ // If the mode is already changed, proceed
2813+ if (get_rt_mode() == atomic_read(&new_mode)) {
2814+ queue_unlock(&mode_change_lock);
2815+ goto proceed;
2816+ }
2817+ // change the mode
2818+ if ((atomic_read(&new_mode) == MODE_RT_RUN)) {
2819+ /* The deferral of entering real-time mode should be
2820+ * handled by deferring task releases in the plugin.
2821+ * The plugin interface does not really need to know
2822+ * about quanta, that is the plugin's job.
2823+ */
2824+
2825+ /* update rt start time */
2826+ rt_start_time = jiffies;
2827+ printk(KERN_INFO "Real-Time mode enabled at %ld "
2828+ "on %d\n",
2829+ jiffies, smp_processor_id());
2830+ } else
2831+ printk(KERN_INFO "Real-Time mode disabled at %ld "
2832+ "on %d\n",
2833+ jiffies, smp_processor_id());
2834+ if (curr_sched_plugin->mode_change)
2835+ curr_sched_plugin->
2836+ mode_change(atomic_read(&new_mode));
2837+ printk(KERN_INFO "Plugin mode change done at %ld\n",
2838+ jiffies);
2839+ set_rt_mode(atomic_read(&new_mode));
2840+ queue_unlock(&mode_change_lock);
2841+ }
2842+
2843+ proceed:
2844+ /* Call plugin-defined tick handler
2845+ *
2846+	 * It is the plugin's tick handler's job to detect quantum
2847+ * boundaries in pfair.
2848+ */
2849+ return curr_sched_plugin->scheduler_tick();
2850+}
2851+
2852+asmlinkage spolicy sys_sched_setpolicy(spolicy newpolicy)
2853+{
2854+ /* Dynamic policy change is disabled at the moment */
2855+ return SCHED_INVALID;
2856+}
2857+
2858+asmlinkage spolicy sys_sched_getpolicy(void)
2859+{
2860+ return sched_policy;
2861+}
2862+
2863+
2864+asmlinkage int sys_scheduler_setup(int cmd, void __user *parameter)
2865+{
2866+ int ret = -EINVAL;
2867+
2868+ ret = curr_sched_plugin->scheduler_setup(cmd, parameter);
2869+
2870+ return ret;
2871+}
2872+
2873+struct sched_sig {
2874+ struct list_head list;
2875+ struct task_struct* task;
2876+ unsigned int signal:31;
2877+ int force:1;
2878+};
2879+
2880+static void __scheduler_signal(struct task_struct *t, unsigned int signo,
2881+ int force)
2882+{
2883+ struct sched_sig* sig;
2884+
2885+	sig = kmalloc(sizeof(struct sched_sig), GFP_ATOMIC);
2886+ if (!sig) {
2887+		TRACE_TASK(t, "dropping signal: %u\n", signo);
2888+ return;
2889+ }
2890+
2891+ spin_lock(&sched_sig_list_lock);
2892+
2893+ sig->signal = signo;
2894+ sig->force = force;
2895+ sig->task = t;
2896+ get_task_struct(t);
2897+ list_add(&sig->list, &sched_sig_list);
2898+
2899+ spin_unlock(&sched_sig_list_lock);
2900+}
2901+
2902+void scheduler_signal(struct task_struct *t, unsigned int signo)
2903+{
2904+ __scheduler_signal(t, signo, 0);
2905+}
2906+
2907+void force_scheduler_signal(struct task_struct *t, unsigned int signo)
2908+{
2909+ __scheduler_signal(t, signo, 1);
2910+}
2911+
2912+void send_scheduler_signals(void)
2913+{
2914+ unsigned long flags;
2915+ struct list_head *p, *extra;
2916+ struct siginfo info;
2917+ struct sched_sig* sig;
2918+ struct task_struct* t;
2919+ struct list_head claimed;
2920+
2921+ if (spin_trylock_irqsave(&sched_sig_list_lock, flags)) {
2922+ if (list_empty(&sched_sig_list))
2923+ p = NULL;
2924+ else {
2925+ p = sched_sig_list.next;
2926+ list_del(&sched_sig_list);
2927+ INIT_LIST_HEAD(&sched_sig_list);
2928+ }
2929+ spin_unlock_irqrestore(&sched_sig_list_lock, flags);
2930+
2931+ /* abort if there are no signals */
2932+ if (!p)
2933+ return;
2934+
2935+ /* take signal list we just obtained */
2936+ list_add(&claimed, p);
2937+
2938+ list_for_each_safe(p, extra, &claimed) {
2939+ list_del(p);
2940+ sig = list_entry(p, struct sched_sig, list);
2941+ t = sig->task;
2942+ info.si_signo = sig->signal;
2943+ info.si_errno = 0;
2944+ info.si_code = SI_KERNEL;
2945+ info.si_pid = 1;
2946+ info.si_uid = 0;
2947+ TRACE("sending signal %d to %d\n", info.si_signo,
2948+ t->pid);
2949+ if (sig->force)
2950+ force_sig_info(sig->signal, &info, t);
2951+ else
2952+ send_sig_info(sig->signal, &info, t);
2953+ put_task_struct(t);
2954+ kfree(sig);
2955+ }
2956+ }
2957+
2958+}
2959+
2960+static inline void np_mem_error(struct task_struct* t, const char* reason)
2961+{
2962+ if (t->state != TASK_DEAD && !(t->flags & PF_EXITING)) {
2963+ TRACE("np section: %s => %s/%d killed\n",
2964+ reason, t->comm, t->pid);
2965+ force_scheduler_signal(t, SIGKILL);
2966+ }
2967+}
2968+
2969+/* sys_register_np_flag() allows real-time tasks to register an
2970+ * np section indicator.
2971+ * returns 0 if the flag was successfully registered
2972+ * returns EINVAL if current task is not a real-time task
2973+ * returns EFAULT if *flag couldn't be written
2974+ */
2975+asmlinkage long sys_register_np_flag(short __user *flag)
2976+{
2977+ int retval = -EINVAL;
2978+ short test_val = RT_PREEMPTIVE;
2979+
2980+ /* avoid races with the scheduler */
2981+ preempt_disable();
2982+ TRACE("reg_np_flag(%p) for %s/%d\n", flag,
2983+ current->comm, current->pid);
2984+ if (!is_realtime(current))
2985+ goto out;
2986+
2987+ /* Let's first try to write to the address.
2988+ * That way it is initialized and any bugs
2989+	 * involving dangling pointers will be caught
2990+ * early.
2991+ * NULL indicates disabling np section support
2992+ * and should not be tested.
2993+ */
2994+ if (flag)
2995+ retval = poke_kernel_address(test_val, flag);
2996+ else
2997+ retval = 0;
2998+ TRACE("reg_np_flag: retval=%d\n", retval);
2999+ if (unlikely(0 != retval))
3000+ np_mem_error(current, "np flag: not writable");
3001+ else
3002+ /* the pointer is ok */
3003+ current->rt_param.np_flag = flag;
3004+
3005+ out:
3006+ preempt_enable();
3007+ /* force rescheduling so that we can be preempted */
3008+ return retval;
3009+}
3010+
3011+
3012+void request_exit_np(struct task_struct *t)
3013+{
3014+ int ret;
3015+ short flag;
3016+
3017+ /* We can only do this if t is actually currently scheduled on this CPU
3018+ * because otherwise we are in the wrong address space. Thus make sure
3019+ * to check.
3020+ */
3021+ BUG_ON(t != current);
3022+
3023+ if (unlikely(!is_realtime(t) || !t->rt_param.np_flag)) {
3024+ TRACE_TASK(t, "request_exit_np(): BAD TASK!\n");
3025+ return;
3026+ }
3027+
3028+ flag = RT_EXIT_NP_REQUESTED;
3029+ ret = poke_kernel_address(flag, t->rt_param.np_flag + 1);
3030+ TRACE("request_exit_np(%s/%d)\n", t->comm, t->pid);
3031+ if (unlikely(0 != ret))
3032+ np_mem_error(current, "request_exit_np(): flag not writable");
3033+
3034+}
3035+
3036+
3037+int is_np(struct task_struct* t)
3038+{
3039+ int ret;
3040+ unsigned short flag = 0x5858; /* = XX, looks nicer in debug*/
3041+
3042+ BUG_ON(t != current);
3043+
3044+ if (unlikely(t->rt_param.kernel_np))
3045+ return 1;
3046+ else if (unlikely(t->rt_param.np_flag == NULL) ||
3047+ t->flags & PF_EXITING ||
3048+ t->state == TASK_DEAD)
3049+ return 0;
3050+ else {
3051+ /* This is the tricky part. The process has registered a
3052+ * non-preemptive section marker. We now need to check whether
3053+		 * it is set to NON_PREEMPTIVE. Along the way we could
3054+ * discover that the pointer points to an unmapped region (=>
3055+ * kill the task) or that the location contains some garbage
3056+ * value (=> also kill the task). Killing the task in any case
3057+ * forces userspace to play nicely. Any bugs will be discovered
3058+ * immediately.
3059+ */
3060+ ret = probe_kernel_address(t->rt_param.np_flag, flag);
3061+ if (0 == ret && (flag == RT_NON_PREEMPTIVE ||
3062+ flag == RT_PREEMPTIVE))
3063+ return flag != RT_PREEMPTIVE;
3064+ else {
3065+ /* either we could not read from the address or
3066+ * it contained garbage => kill the process
3067+ * FIXME: Should we cause a SEGFAULT instead?
3068+ */
3069+ TRACE("is_np: ret=%d flag=%c%c (%x)\n", ret,
3070+ flag & 0xff, (flag >> 8) & 0xff, flag);
3071+ np_mem_error(t, "is_np() could not read");
3072+ return 0;
3073+ }
3074+ }
3075+}
3076+
3077+/*
3078+ * sys_exit_np() allows a real-time task to signal that it left a
3079+ * non-preemptable section. It will be called after the kernel requested a
3080+ * callback in the preemption indicator flag.
3081+ * returns 0 if the signal was valid and processed.
3082+ * returns EINVAL if current task is not a real-time task
3083+ */
3084+asmlinkage long sys_exit_np(void)
3085+{
3086+ int retval = -EINVAL;
3087+
3088+ TS_EXIT_NP_START;
3089+
3090+ if (!is_realtime(current))
3091+ goto out;
3092+
3093+ TRACE("sys_exit_np(%s/%d)\n", current->comm, current->pid);
3094+ /* force rescheduling so that we can be preempted */
3095+ set_tsk_need_resched(current);
3096+ retval = 0;
3097+ out:
3098+
3099+ TS_EXIT_NP_END;
3100+ return retval;
3101+}
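
Putting sys_register_np_flag(), is_np(), request_exit_np(), and sys_exit_np() together, the userspace side of the protocol appears to work as sketched below (hypothetical pseudo-code; the actual library wrappers are not part of this patch):

	/*   short np_flag[2] = { RT_PREEMPTIVE, 0 };
	 *   register_np_flag(np_flag);           // wrapper for sys_register_np_flag()
	 *   np_flag[0] = RT_NON_PREEMPTIVE;      // enter non-preemptive section
	 *   ... critical section ...
	 *   np_flag[0] = RT_PREEMPTIVE;          // leave non-preemptive section
	 *   if (np_flag[1] == RT_EXIT_NP_REQUESTED)
	 *           exit_np();                   // wrapper for sys_exit_np()
	 */
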
3102+
3103+void exit_litmus(struct task_struct *dead_tsk)
3104+{
3105+ kfree(dead_tsk->rt_param.service_level);
3106+ curr_sched_plugin->tear_down(dead_tsk);
3107+}
3108+
3109+
3110+void list_qsort(struct list_head* list, list_cmp_t less_than)
3111+{
3112+ struct list_head lt;
3113+ struct list_head geq;
3114+ struct list_head *pos, *extra, *pivot;
3115+ int n_lt = 0, n_geq = 0;
3116+ BUG_ON(!list);
3117+
3118+ if (list->next == list)
3119+ return;
3120+
3121+ INIT_LIST_HEAD(&lt);
3122+ INIT_LIST_HEAD(&geq);
3123+
3124+ pivot = list->next;
3125+ list_del(pivot);
3126+ list_for_each_safe(pos, extra, list) {
3127+ list_del(pos);
3128+ if (less_than(pos, pivot)) {
3129+ list_add(pos, &lt);
3130+ n_lt++;
3131+ } else {
3132+ list_add(pos, &geq);
3133+ n_geq++;
3134+ }
3135+ }
3136+ if (n_lt < n_geq) {
3137+ list_qsort(&lt, less_than);
3138+ list_qsort(&geq, less_than);
3139+ } else {
3140+ list_qsort(&geq, less_than);
3141+ list_qsort(&lt, less_than);
3142+ }
3143+ list_splice(&geq, list);
3144+ list_add(pivot, list);
3145+ list_splice(&lt, list);
3146+}
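
A minimal usage sketch (not part of the patch), assuming the list holds real-time tasks linked through their rt_list fields and that edf_ready_order from kernel/edf_common.c matches the list_cmp_t comparator type:

	static void example_sort_ready_list(struct list_head *ready)
	{
		/* afterwards, the highest-priority (earliest-deadline) task
		 * is at the head of the list */
		list_qsort(ready, edf_ready_order);
	}
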
3147+
3148+#ifdef CONFIG_MAGIC_SYSRQ
3149+/* We offer the possibility to change the real-time mode of the system
3150+ * with a magic sys request. This helps in debugging in case the system fails
3151+ * to perform its planned switch back to normal mode. This may happen if the
3152+ * system is fully utilized and the task that is supposed to do the switch is
3153+ * not a real-time task and is therefore always preempted.
3154+ */
3155+int sys_kill(int pid, int sig);
3156+
3157+
3158+static void sysrq_handle_toGgle_rt_mode(int key, struct tty_struct *tty)
3159+{
3160+ sys_set_rt_mode(get_rt_mode() == MODE_NON_RT);
3161+}
3162+
3163+static struct sysrq_key_op sysrq_toGgle_rt_mode_op = {
3164+ .handler = sysrq_handle_toGgle_rt_mode,
3165+ .help_msg = "toGgle-rt-mode",
3166+ .action_msg = "real-time mode changed",
3167+};
3168+
3169+static void sysrq_handle_kill_rt_tasks(int key, struct tty_struct *tty)
3170+{
3171+ struct task_struct *t;
3172+ read_lock(&tasklist_lock);
3173+ for_each_process(t) {
3174+ if (is_realtime(t)) {
3175+ sys_kill(t->pid, SIGKILL);
3176+ }
3177+ }
3178+ read_unlock(&tasklist_lock);
3179+}
3180+
3181+static struct sysrq_key_op sysrq_kill_rt_tasks_op = {
3182+ .handler = sysrq_handle_kill_rt_tasks,
3183+ .help_msg = "Quit-rt-tasks",
3184+ .action_msg = "sent SIGKILL to all real-time tasks",
3185+};
3186+#endif
3187+
3188+/*
3189+ * Scheduler initialization so that customized scheduler is
3190+ * enabled at boot time
3191+ * by setting boot option "rtsched=plugin_name", e.g. "rtsched=pfair"
3192+ */
3193+
3194+/* All we need to know about other plugins is their initialization
3195+ * functions. These functions initialize internal data structures of a
3196+ * scheduler and return a pointer to initialized sched_plugin data
3197+ * structure with pointers to scheduling function implementations.
3198+ * If called repeatedly these init functions just return an existing
3199+ * plugin pointer.
3200+ */
3201+sched_plugin_t *init_global_edf_plugin(void);
3202+sched_plugin_t *init_global_edf_np_plugin(void);
3203+sched_plugin_t *init_part_edf_plugin(void);
3204+sched_plugin_t *init_edf_hsb_plugin(void);
3205+sched_plugin_t *init_pfair_plugin(void);
3206+sched_plugin_t *init_gsn_edf_plugin(void);
3207+sched_plugin_t *init_psn_edf_plugin(void);
3208+sched_plugin_t *init_adaptive_plugin(void);
3209+
3210+/* keep everything needed to setup plugins in one place */
3211+
3212+/* we are lazy, so we use a convention for function naming to fill
3213+ * a table
3214+ */
3215+#define PLUGIN(caps, small) \
3216+ {PLUGIN_ ## caps, SCHED_ ## caps, init_ ## small ## _plugin}
3217+
3218+#define init_nosetup_plugin 0
3219+
3220+static struct {
3221+ const char *name;
3222+ const spolicy policy_id;
3223+ sched_plugin_t *(*init) (void);
3224+} available_plugins[] = {
3225+ PLUGIN(LINUX, nosetup),
3226+ PLUGIN(GLOBAL_EDF_NP, global_edf_np),
3227+ PLUGIN(GLOBAL_EDF, global_edf),
3228+ PLUGIN(PART_EDF, part_edf),
3229+ PLUGIN(EDF_HSB, edf_hsb),
3230+ PLUGIN(PFAIR, pfair),
3231+ PLUGIN(GSN_EDF, gsn_edf),
3232+ PLUGIN(PSN_EDF, psn_edf),
3233+ PLUGIN(ADAPTIVE, adaptive),
3234+ /*********************************************
3235+ * Add your custom plugin here
3236+ **********************************************/
3237+};
3238+
3239+/* Some plugins may leave important functions unused. We define dummies
3240+ * so that we don't have to check for null pointers all over the place.
3241+ */
3242+void litmus_dummy_finish_switch(struct task_struct * prev);
3243+int litmus_dummy_schedule(struct task_struct * prev, struct task_struct** next,
3244+ runqueue_t* q);
3245+reschedule_check_t litmus_dummy_scheduler_tick(void);
3246+long litmus_dummy_prepare_task(struct task_struct *t);
3247+void litmus_dummy_wake_up_task(struct task_struct *task);
3248+void litmus_dummy_task_blocks(struct task_struct *task);
3249+long litmus_dummy_tear_down(struct task_struct *task);
3250+int litmus_dummy_scheduler_setup(int cmd, void __user *parameter);
3251+long litmus_dummy_sleep_next_period(void);
3252+long litmus_dummy_inherit_priority(struct pi_semaphore *sem,
3253+ struct task_struct *new_owner);
3254+long litmus_dummy_return_priority(struct pi_semaphore *sem);
3255+long litmus_dummy_pi_block(struct pi_semaphore *sem,
3256+ struct task_struct *t);
3257+
3258+#define CHECK(func) {\
3259+ if (!curr_sched_plugin->func) \
3260+ curr_sched_plugin->func = litmus_dummy_ ## func;}
3261+
3262+static int boot_sched_setup(char *plugin_name)
3263+{
3264+ int i = 0;
3265+
3266+	/* Common initializers:
3267+	 * the mode change lock is used to enforce a single mode change
3268+	 * operation.
3269+ */
3270+ queue_lock_init(&mode_change_lock);
3271+
3272+ printk("Starting LITMUS^RT kernel\n");
3273+
3274+ /* Look for a matching plugin.
3275+ */
3276+ for (i = 0; i < ARRAY_SIZE(available_plugins); i++) {
3277+ if (!strcmp(plugin_name, available_plugins[i].name)) {
3278+ printk("Using %s scheduler plugin\n", plugin_name);
3279+ sched_policy = available_plugins[i].policy_id;
3280+ if (available_plugins[i].init)
3281+ curr_sched_plugin = available_plugins[i].init();
3282+ goto out;
3283+ }
3284+ }
3285+
3286+
3287+ /* Otherwise we have default linux scheduler */
3288+ printk("Plugin name %s is unknown, using default %s\n", plugin_name,
3289+ curr_sched_plugin->plugin_name);
3290+
3291+out:
3292+ /* make sure we don't trip over null pointers later */
3293+ CHECK(finish_switch);
3294+ CHECK(schedule);
3295+ CHECK(scheduler_tick);
3296+ CHECK(wake_up_task);
3297+ CHECK(tear_down);
3298+ CHECK(task_blocks);
3299+ CHECK(prepare_task);
3300+ CHECK(scheduler_setup);
3301+ CHECK(sleep_next_period);
3302+ CHECK(inherit_priority);
3303+ CHECK(return_priority);
3304+ CHECK(pi_block);
3305+
3306+#ifdef CONFIG_MAGIC_SYSRQ
3307+ /* offer some debugging help */
3308+ if (!register_sysrq_key('g', &sysrq_toGgle_rt_mode_op))
3309+		printk("Registered toGgle real-time mode magic sysrq.\n");
3310+	else
3311+		printk("Could not register toGgle real-time mode magic sysrq.\n");
3312+ if (!register_sysrq_key('q', &sysrq_kill_rt_tasks_op))
3313+ printk("Registered kill rt tasks magic sysrq.\n");
3314+ else
3315+ printk("Could not register kill rt tasks magic sysrq.\n");
3316+#endif
3317+	printk("Litmus setup complete.\n");
3318+ return 1;
3319+}
3320+
3321+/* Register for boot option */
3322+__setup("rtsched=", boot_sched_setup);
3323diff --git a/kernel/litmus_sem.c b/kernel/litmus_sem.c
3324new file mode 100644
3325index 0000000..12a6ab1
3326--- /dev/null
3327+++ b/kernel/litmus_sem.c
3328@@ -0,0 +1,765 @@
3329+
3330+/*
3331+ * SMP- and interrupt-safe semaphores. Also PI and SRP implementations.
3332+ * Much of the code here is borrowed from include/asm-i386/semaphore.h.
3333+ *
3334+ * NOTE: This implementation is very much a prototype and horribly insecure. It
3335+ * is intended to be a proof of concept, not a feature-complete solution.
3336+ */
3337+
3338+#include <asm/atomic.h>
3339+#include <asm/semaphore.h>
3340+#include <linux/sched.h>
3341+#include <linux/wait.h>
3342+#include <linux/spinlock.h>
3343+#include <linux/queuelock.h>
3344+#include <linux/litmus.h>
3345+#include <linux/sched_plugin.h>
3346+#include <linux/edf_common.h>
3347+
3348+
3349+#include <linux/trace.h>
3350+/* ************************************************************************** */
3351+/* STANDARD FIFO SEMAPHORES */
3352+/* ************************************************************************** */
3353+
3354+#define MAX_SEMAPHORES 16000
3355+#define MAX_PI_SEMAPHORES 16000
3356+#define MAX_SRP_SEMAPHORES 16000
3357+
3358+
3359+struct semaphore sems[MAX_SEMAPHORES]; /* all sems */
3360+typedef int sema_id; /* Userspace ID of a semaphore */
3361+
3362+static int rt_fifo_wake_up(wait_queue_t *wait, unsigned mode, int sync,
3363+ void *key)
3364+{
3365+ struct task_struct* t = (struct task_struct*) wait->private;
3366+ set_rt_flags(t, RT_F_EXIT_SEM);
3367+ TRACE_TASK(t, "woken up by rt_fifo_wake_up(), set RT_F_EXIT_SEM\n");
3368+ default_wake_function(wait, mode, sync, key);
3369+	/* for the reason why we always return 1, see rt_pi_wake_up() below */
3370+ return 1;
3371+}
3372+
3373+static fastcall void rt_fifo_up(struct semaphore * sem)
3374+{
3375+	TRACE_CUR("releases lock %p\n", sem);
3376+ preempt_disable();
3377+ TS_FIFO_UP_START;
3378+ if (atomic_inc_return(&sem->count) < 1)
3379+ /* there is a task queued */
3380+ wake_up(&sem->wait);
3381+ TS_FIFO_UP_END;
3382+ preempt_enable();
3383+}
3384+
3385+/* not optimized like the Linux down() implementation, but then
3386+ * again we incur the cost of a syscall anyway, so this hardly matters
3387+ */
3388+static fastcall void rt_fifo_down(struct semaphore * sem)
3389+{
3390+ struct task_struct *tsk = current;
3391+ wait_queue_t wait = {
3392+ .private = tsk,
3393+ .func = rt_fifo_wake_up,
3394+ .task_list = {NULL, NULL}
3395+ };
3396+
3397+ preempt_disable();
3398+ TS_FIFO_DOWN_START;
3399+
3400+ spin_lock(&sem->wait.lock);
3401+ if (atomic_dec_return(&sem->count) < 0 ||
3402+ waitqueue_active(&sem->wait)) {
3403+ /* we need to suspend */
3404+ tsk->state = TASK_UNINTERRUPTIBLE;
3405+ add_wait_queue_exclusive_locked(&sem->wait, &wait);
3406+
3407+ TRACE_CUR("suspends on lock %p\n", sem);
3408+
3409+ /* release lock before sleeping */
3410+ spin_unlock(&sem->wait.lock);
3411+
3412+ TS_FIFO_DOWN_END;
3413+ preempt_enable_no_resched();
3414+
3415+		/* we depend on the FIFO order.
3416+		 * Thus, we don't need to recheck when we wake up; we
3417+		 * are guaranteed to have the lock since there is only one
3418+		 * wake up per release.
3419+ */
3420+ schedule();
3421+
3422+ TRACE_CUR("woke up, now owns lock %p\n", sem);
3423+
3424+ /* try_to_wake_up() set our state to TASK_RUNNING,
3425+ * all we need to do is to remove our wait queue entry
3426+ */
3427+ spin_lock(&sem->wait.lock);
3428+ remove_wait_queue_locked(&sem->wait, &wait);
3429+ spin_unlock(&sem->wait.lock);
3430+ } else {
3431+ TRACE_CUR("acquired lock %p, no contention\n", sem);
3432+ spin_unlock(&sem->wait.lock);
3433+ TS_FIFO_DOWN_END;
3434+ preempt_enable();
3435+ }
3436+}
3437+
3438+
3439+
3440+/* Initialize semaphores at boot time. */
3441+static int __init sema_boot_init(void)
3442+{
3443+ sema_id sem_id;
3444+
3445+ printk("Initializing semaphores...");
3446+ for (sem_id = 0; sem_id < MAX_SEMAPHORES; sem_id++)
3447+ sems[sem_id].used = 0;
3448+ printk(" done!\n");
3449+
3450+ return 0;
3451+}
3452+__initcall(sema_boot_init);
3453+
3454+/* Find a free semaphore and return. */
3455+asmlinkage long sys_sema_init (void)
3456+{
3457+ sema_id sem_id;
3458+
3459+ for (sem_id = 0; sem_id < MAX_SEMAPHORES; sem_id++) {
3460+ if (!cmpxchg(&sems[sem_id].used, 0, 1)) {
3461+ sema_init(&sems[sem_id], 1);
3462+ return sem_id;
3463+ }
3464+ }
3465+ return -ENOMEM;
3466+}
3467+
3468+asmlinkage long sys_down(sema_id sem_id)
3469+{
3470+ if (sem_id < 0 || sem_id >= MAX_SEMAPHORES)
3471+ return -EINVAL;
3472+
3473+ if (!sems[sem_id].used)
3474+ return -EINVAL;
3475+ /* This allows for FIFO sems and gives others a chance... */
3476+ rt_fifo_down(sems + sem_id);
3477+ return 0;
3478+}
3479+
3480+asmlinkage long sys_up(sema_id sem_id)
3481+{
3482+ if (sem_id < 0 || sem_id >= MAX_SEMAPHORES)
3483+ return -EINVAL;
3484+
3485+ if (!sems[sem_id].used)
3486+ return -EINVAL;
3487+ rt_fifo_up(sems + sem_id);
3488+ return 0;
3489+}
3490+
3491+asmlinkage long sys_sema_free(sema_id sem_id)
3492+{
3493+ struct list_head *tmp, *next;
3494+ unsigned long flags;
3495+
3496+ if (sem_id < 0 || sem_id >= MAX_SEMAPHORES)
3497+ return -EINVAL;
3498+
3499+ if (!sems[sem_id].used)
3500+ return -EINVAL;
3501+
3502+ spin_lock_irqsave(&sems[sem_id].wait.lock, flags);
3503+ if (waitqueue_active(&sems[sem_id].wait)) {
3504+ list_for_each_safe(tmp, next, &sems[sem_id].wait.task_list) {
3505+ wait_queue_t *curr = list_entry(tmp, wait_queue_t,
3506+ task_list);
3507+ list_del(tmp);
3508+ set_rt_flags((struct task_struct*)curr->private,
3509+ RT_F_EXIT_SEM);
3510+ curr->func(curr,
3511+ TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3512+ 0, NULL);
3513+ }
3514+ }
3515+
3516+ spin_unlock_irqrestore(&sems[sem_id].wait.lock, flags);
3517+ sems[sem_id].used = 0;
3518+
3519+ return 0;
3520+}
3521+
3522+
3523+
3524+
3525+/* ************************************************************************** */
3526+/* PRIORITY INHERITANCE */
3527+/* ************************************************************************** */
3528+
3529+
3530+
3531+struct pi_semaphore pi_sems[MAX_PI_SEMAPHORES]; /* all PI sems */
3532+typedef int pi_sema_id; /* Userspace ID of a pi_semaphore */
3533+
3534+struct wq_pair {
3535+ struct task_struct* tsk;
3536+ struct pi_semaphore* sem;
3537+};
3538+
3539+static int rt_pi_wake_up(wait_queue_t *wait, unsigned mode, int sync,
3540+ void *key)
3541+{
3542+ struct wq_pair* wqp = (struct wq_pair*) wait->private;
3543+ set_rt_flags(wqp->tsk, RT_F_EXIT_SEM);
3544+ curr_sched_plugin->inherit_priority(wqp->sem, wqp->tsk);
3545+ TRACE_TASK(wqp->tsk,
3546+ "woken up by rt_pi_wake_up() (RT_F_SEM_EXIT, PI)\n");
3547+ /* point to task for default_wake_function() */
3548+ wait->private = wqp->tsk;
3549+ default_wake_function(wait, mode, sync, key);
3550+
3551+ /* Always return true since we know that if we encountered a task
3552+ * that was already running the wake_up raced with the schedule in
3553+ * rt_pi_down(). In that case the task in rt_pi_down() will be scheduled
3554+ * immediately and own the lock. We must not wake up another task in
3555+ * any case.
3556+ */
3557+ return 1;
3558+}
3559+
3560+
3561+/* caller is responsible for locking */
3562+int edf_set_hp_task(struct pi_semaphore *sem)
3563+{
3564+ struct list_head *tmp, *next;
3565+ struct task_struct *queued;
3566+ int ret = 0;
3567+
3568+ sem->hp.task = NULL;
3569+ list_for_each_safe(tmp, next, &sem->wait.task_list) {
3570+ queued = ((struct wq_pair*)
3571+ list_entry(tmp, wait_queue_t,
3572+ task_list)->private)->tsk;
3573+
3574+ /* Compare task prios, find high prio task. */
3575+ if (edf_higher_prio(queued, sem->hp.task)) {
3576+ sem->hp.task = queued;
3577+ ret = 1;
3578+ }
3579+ }
3580+ return ret;
3581+}
3582+
3583+
3584+/* caller is responsible for locking */
3585+int edf_set_hp_cpu_task(struct pi_semaphore *sem, int cpu)
3586+{
3587+ struct list_head *tmp, *next;
3588+ struct task_struct *queued;
3589+ int ret = 0;
3590+
3591+ sem->hp.cpu_task[cpu] = NULL;
3592+ list_for_each_safe(tmp, next, &sem->wait.task_list) {
3593+ queued = ((struct wq_pair*)
3594+ list_entry(tmp, wait_queue_t,
3595+ task_list)->private)->tsk;
3596+
3597+ /* Compare task prios, find high prio task. */
3598+ if (get_partition(queued) == cpu &&
3599+ edf_higher_prio(queued, sem->hp.cpu_task[cpu])) {
3600+ sem->hp.cpu_task[cpu] = queued;
3601+ ret = 1;
3602+ }
3603+ }
3604+ return ret;
3605+}
3606+
3607+
3608+/* Initialize PI semaphores at boot time. */
3609+static int __init pi_sema_boot_init(void)
3610+{
3611+ pi_sema_id sem_id;
3612+
3613+ printk("Initializing PI semaphores...");
3614+ for (sem_id = 0; sem_id < MAX_PI_SEMAPHORES; sem_id++)
3615+ pi_sems[sem_id].used = 0;
3616+ printk(" done!\n");
3617+
3618+ return 0;
3619+}
3620+__initcall(pi_sema_boot_init);
3621+
3622+/* Find a free semaphore and return. */
3623+asmlinkage long sys_pi_sema_init (void)
3624+{
3625+ pi_sema_id sem_id;
3626+ int i = 0;
3627+
3628+ for (sem_id = 0; sem_id < MAX_PI_SEMAPHORES; sem_id++) {
3629+ if (!cmpxchg(&pi_sems[sem_id].used, 0, 1)) {
3630+ atomic_set(&pi_sems[sem_id].count, 1);
3631+ pi_sems[sem_id].sleepers = 0;
3632+ init_waitqueue_head(&pi_sems[sem_id].wait);
3633+ pi_sems[sem_id].hp.task = NULL;
3634+ pi_sems[sem_id].holder = NULL;
3635+ for (i = 0; i < NR_CPUS; i++)
3636+ pi_sems[sem_id].hp.cpu_task[i] = NULL;
3637+ return sem_id;
3638+ }
3639+ }
3640+ return -ENOMEM;
3641+}
3642+
3643+asmlinkage long sys_pi_down(pi_sema_id sem_id)
3644+{
3645+ struct pi_semaphore * sem;
3646+ unsigned long flags;
3647+ struct task_struct *tsk = current;
3648+ struct wq_pair pair;
3649+ long ret = -EINVAL;
3650+ wait_queue_t wait = {
3651+ .private = &pair,
3652+ .func = rt_pi_wake_up,
3653+ .task_list = {NULL, NULL}
3654+ };
3655+
3656+ preempt_disable();
3657+ TS_PI_DOWN_START;
3658+
3659+ if (sem_id < 0 || sem_id >= MAX_PI_SEMAPHORES)
3660+ goto out;
3661+
3662+ if (!pi_sems[sem_id].used)
3663+ goto out;
3664+
3665+ sem = pi_sems + sem_id;
3666+ pair.tsk = tsk;
3667+ pair.sem = sem;
3668+ spin_lock_irqsave(&sem->wait.lock, flags);
3669+
3670+ if (atomic_dec_return(&sem->count) < 0 ||
3671+ waitqueue_active(&sem->wait)) {
3672+ /* we need to suspend */
3673+ tsk->state = TASK_UNINTERRUPTIBLE;
3674+ add_wait_queue_exclusive_locked(&sem->wait, &wait);
3675+
3676+ TRACE_CUR("suspends on PI lock %p\n", sem);
3677+ curr_sched_plugin->pi_block(sem, tsk);
3678+
3679+ /* release lock before sleeping */
3680+ spin_unlock_irqrestore(&sem->wait.lock, flags);
3681+
3682+ TS_PI_DOWN_END;
3683+ preempt_enable_no_resched();
3684+
3685+
3686+		/* We depend on the FIFO order of the wait queue:
3687+		 * there is exactly one wake-up per release, so we are
3688+		 * guaranteed to hold the lock when we wake up and do
3689+		 * not need to re-check the semaphore count.
3690+ */
3691+ schedule();
3692+
3693+ TRACE_CUR("woke up, now owns PI lock %p\n", sem);
3694+
3695+ /* try_to_wake_up() set our state to TASK_RUNNING,
3696+ * all we need to do is to remove our wait queue entry
3697+ */
3698+ remove_wait_queue(&sem->wait, &wait);
3699+ } else {
3700+ /* no priority inheritance necessary, since there are no queued
3701+ * tasks.
3702+ */
3703+ TRACE_CUR("acquired PI lock %p, no contention\n", sem);
3704+ sem->holder = tsk;
3705+ sem->hp.task = tsk;
3706+ curr_sched_plugin->inherit_priority(sem, tsk);
3707+ spin_unlock_irqrestore(&sem->wait.lock, flags);
3708+ out:
3709+ TS_PI_DOWN_END;
3710+ preempt_enable();
3711+ }
3713+	return ret;
3714+}
3715+
3716+asmlinkage long sys_pi_up(pi_sema_id sem_id)
3717+{
3718+ unsigned long flags;
3719+ long ret = -EINVAL;
3720+ struct pi_semaphore * sem;
3721+
3722+ preempt_disable();
3723+ TS_PI_UP_START;
3724+
3725+ if (sem_id < 0 || sem_id >= MAX_PI_SEMAPHORES)
3726+ goto out;
3727+
3728+ if (!pi_sems[sem_id].used)
3729+ goto out;
3730+
3731+ sem = pi_sems + sem_id;
3732+ spin_lock_irqsave(&sem->wait.lock, flags);
3733+
3734+ TRACE_CUR("releases PI lock %p\n", sem);
3735+ curr_sched_plugin->return_priority(sem);
3736+ sem->holder = NULL;
3737+ if (atomic_inc_return(&sem->count) < 1)
3738+ /* there is a task queued */
3739+ wake_up_locked(&sem->wait);
3740+
3741+ spin_unlock_irqrestore(&sem->wait.lock, flags);
3742+
3743+ ret = 0;
3744+ out:
3745+ TS_PI_UP_END;
3746+ preempt_enable();
3747+ return ret;
3748+}
3749+
3750+/* Clear wait queue and wakeup waiting tasks, and free semaphore. */
3751+asmlinkage long sys_pi_sema_free(pi_sema_id sem_id)
3752+{
3753+ struct list_head *tmp, *next;
3754+ unsigned long flags;
3755+
3756+ if (sem_id < 0 || sem_id >= MAX_PI_SEMAPHORES)
3757+ return -EINVAL;
3758+
3759+ if (!pi_sems[sem_id].used)
3760+ return -EINVAL;
3761+
3762+ spin_lock_irqsave(&pi_sems[sem_id].wait.lock, flags);
3763+ if (waitqueue_active(&pi_sems[sem_id].wait)) {
3764+ list_for_each_safe(tmp, next,
3765+ &pi_sems[sem_id].wait.task_list) {
3766+ wait_queue_t *curr = list_entry(tmp, wait_queue_t,
3767+ task_list);
3768+ list_del(tmp);
3769+			set_rt_flags(((struct wq_pair*)curr->private)->tsk,
3770+				     RT_F_EXIT_SEM);
3771+ curr->func(curr,
3772+ TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
3773+ 0, NULL);
3774+ }
3775+ }
3776+
3777+ spin_unlock_irqrestore(&pi_sems[sem_id].wait.lock, flags);
3778+ pi_sems[sem_id].used = 0;
3779+
3780+ return 0;
3781+}
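+/* Illustrative user-space life cycle of a PI semaphore (sketch; the
+ * wrapper names pi_sema_init()/pi_down()/pi_up()/pi_sema_free() are
+ * placeholders for whatever glue issues the system calls below):
+ *
+ *	int sem = pi_sema_init();	sys_pi_sema_init(), returns an id
+ *	if (sem < 0)
+ *		error();
+ *	pi_down(sem);			sys_pi_down(); may suspend and lets
+ *					the holder inherit our priority
+ *	... critical section ...
+ *	pi_up(sem);			sys_pi_up(); wakes at most one
+ *					FIFO-ordered waiter
+ *	pi_sema_free(sem);		sys_pi_sema_free(); flushes any
+ *					remaining waiters
+ */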
3782+
3783+
3784+
3785+
3786+/* ************************************************************************** */
3787+/* STACK RESOURCE POLICY */
3788+/* ************************************************************************** */
3789+
3790+
3791+struct srp_priority {
3792+ struct list_head list;
3793+ unsigned int period;
3794+ pid_t pid;
3795+};
3796+
3797+#define list2prio(l) list_entry(l, struct srp_priority, list)
3798+
3799+static int srp_higher_prio(struct srp_priority* first,
3800+ struct srp_priority* second)
3801+{
3802+ if (!first->period)
3803+ return 0;
3804+ else
3805+ return !second->period ||
3806+ first->period < second->period || (
3807+ first->period == second->period &&
3808+ first->pid < second->pid);
3809+}
3810+
3811+struct srp {
3812+ struct list_head ceiling;
3813+ wait_queue_head_t ceiling_blocked;
3814+};
3815+
3816+#define system_ceiling(srp) list2prio(srp->ceiling.next)
3817+
3818+static int srp_exceeds_ceiling(struct task_struct* first,
3819+ struct srp* srp)
3820+{
3821+ return list_empty(&srp->ceiling) ||
3822+ get_rt_period(first) < system_ceiling(srp)->period ||
3823+ (get_rt_period(first) == system_ceiling(srp)->period &&
3824+ first->pid < system_ceiling(srp)->pid);
3825+}
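+/* Worked example: suppose the per-CPU ceiling list contains priorities
+ * with periods {20, 50}, so the system ceiling is the period-20 entry.
+ * A task with get_rt_period() == 10 exceeds the ceiling and may proceed;
+ * a task with period 30 does not and blocks in srp_ceiling_block() until
+ * the period-20 resource is released via sys_srp_up().
+ */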
3826+
3827+static void srp_add_prio(struct srp* srp, struct srp_priority* prio)
3828+{
3829+ struct list_head *pos;
3830+ if (in_list(&prio->list)) {
3831+ TRACE_CUR("WARNING: SRP violation detected, prio is already in "
3832+ "ceiling list!\n");
3833+ return;
3834+ }
3835+ list_for_each(pos, &srp->ceiling)
3836+ if (unlikely(srp_higher_prio(prio, list2prio(pos)))) {
3837+ __list_add(&prio->list, pos->prev, pos);
3838+ return;
3839+ }
3840+
3841+ list_add_tail(&prio->list, &srp->ceiling);
3842+}
3843+
3844+/* struct for uniprocessor SRP "semaphore" */
3845+struct srp_semaphore {
3846+ struct srp_priority ceiling;
3847+ int cpu; /* cpu associated with this "semaphore" and resource */
3848+ int claimed; /* is the resource claimed (ceiling should be used)? */
3849+ int used; /* is the semaphore being used? */
3850+};
3851+
3852+
3853+struct srp_semaphore srp_sems[MAX_SRP_SEMAPHORES]; /* all SRP sems */
3854+typedef int srp_sema_id; /* Userspace ID of a srp_semaphore */
3855+
3856+DEFINE_PER_CPU(struct srp, srp);
3857+
3858+/* Initialize SRP semaphores at boot time. */
3859+static int __init srp_sema_boot_init(void)
3860+{
3861+ srp_sema_id sem_id;
3862+ int i;
3863+
3864+ printk("Initializing SRP semaphores...");
3865+ for (sem_id = 0; sem_id < MAX_SRP_SEMAPHORES; sem_id++) {
3866+ srp_sems[sem_id].used = 0;
3867+ srp_sems[sem_id].claimed = 0;
3868+ srp_sems[sem_id].cpu = -1;
3869+ INIT_LIST_HEAD(&srp_sems[sem_id].ceiling.list);
3870+ }
3871+ for (i = 0; i < NR_CPUS; i++) {
3872+ init_waitqueue_head(&per_cpu(srp, i).ceiling_blocked);
3873+ INIT_LIST_HEAD(&per_cpu(srp, i).ceiling);
3874+ }
3875+ printk(" done!\n");
3876+
3877+ return 0;
3878+}
3879+__initcall(srp_sema_boot_init);
3880+
3881+/* Find a free semaphore and return. */
3882+asmlinkage long sys_srp_sema_init (void)
3883+{
3884+ srp_sema_id sem_id;
3885+
3886+ if (!is_realtime(current))
3887+ return -EPERM;
3888+
3889+ for (sem_id = 0; sem_id < MAX_SRP_SEMAPHORES; sem_id++) {
3890+ if (!cmpxchg(&srp_sems[sem_id].used, 0, 1)) {
3891+ srp_sems[sem_id].ceiling.period = 0;
3892+ srp_sems[sem_id].cpu = get_partition(current);
3893+ return sem_id;
3894+ }
3895+ }
3896+ return -ENOMEM;
3897+}
3898+
3899+/* SRP task priority comparison (see srp_higher_prio() above): tasks with
3900+ * smaller periods have higher priority; ties are broken by PID.
3901+ */
3902+
3903+/* Adjust the system-wide priority ceiling if resource is claimed. */
3904+asmlinkage long sys_srp_down(srp_sema_id sem_id)
3905+{
3906+ int cpu;
3907+ int ret = -EINVAL;
3908+
3909+ /* disabling preemptions is sufficient protection since
3910+ * SRP is strictly per CPU and we don't interfere with any
3911+ * interrupt handlers
3912+ */
3913+ preempt_disable();
3914+ TS_SRP_DOWN_START;
3915+
3916+
3917+ cpu = smp_processor_id();
3918+
3919+ if (sem_id < 0 || sem_id >= MAX_SRP_SEMAPHORES ||
3920+ srp_sems[sem_id].cpu != cpu)
3921+ goto out;
3922+
3923+ if (!srp_sems[sem_id].used)
3924+ goto out;
3925+
3926+ /* claim... */
3927+ srp_sems[sem_id].claimed = 1;
3928+ /* ...and update ceiling */
3929+ srp_add_prio(&__get_cpu_var(srp), &srp_sems[sem_id].ceiling);
3930+
3931+ ret = 0;
3932+ out:
3933+ TS_SRP_DOWN_END;
3934+ preempt_enable();
3935+ return ret;
3936+}
3937+
3938+/* Adjust the system-wide priority ceiling if resource is freed. */
3939+asmlinkage long sys_srp_up(srp_sema_id sem_id)
3940+{
3941+ int cpu;
3942+ int ret = -EINVAL;
3943+
3944+ preempt_disable();
3945+ TS_SRP_UP_START;
3946+
3947+ cpu = smp_processor_id();
3948+
3949+ if (sem_id < 0 || sem_id >= MAX_SRP_SEMAPHORES ||
3950+ srp_sems[sem_id].cpu != cpu)
3951+ goto out;
3952+
3953+ if (!srp_sems[sem_id].used)
3954+ goto out;
3955+
3956+ srp_sems[sem_id].claimed = 0;
3957+ /* Determine new system priority ceiling for this CPU. */
3958+ if (in_list(&srp_sems[sem_id].ceiling.list))
3959+ list_del(&srp_sems[sem_id].ceiling.list);
3960+ else
3961+ TRACE_CUR("WARNING: SRP violation detected, prio not in ceiling"
3962+ " list!\n");
3963+
3964+ /* Wake tasks on this CPU, if they exceed current ceiling. */
3965+ wake_up_all(&__get_cpu_var(srp).ceiling_blocked);
3966+ ret = 0;
3967+ out:
3968+ TS_SRP_UP_END;
3969+ preempt_enable();
3970+ return ret;
3971+}
3972+
3973+/* Indicate that a task will use a resource associated with a given
3974+ * semaphore. Should be done *a priori*, before the RT task system is
3975+ * executed, so this does *not* update the system priority
3976+ * ceiling! (The ceiling would be meaningless anyway, as the SRP
3977+ * breaks without this a priori knowledge.)
3978+ */
3979+asmlinkage long sys_reg_task_srp_sem(srp_sema_id sem_id, pid_t t_pid)
3980+{
3981+ struct pid *task_pid;
3982+ struct task_struct *t;
3983+ struct srp_priority t_prio;
3984+
3985+ if (sem_id < 0 || sem_id >= MAX_SRP_SEMAPHORES)
3986+ return -EINVAL;
3987+
3988+ task_pid = find_get_pid(t_pid);
3989+ if (!task_pid)
3990+ return -EINVAL;
3991+
3992+ t = get_pid_task(task_pid, PIDTYPE_PID);
3993+ if (!t)
3994+ return -EINVAL;
3995+
3996+ if (!is_realtime(t))
3997+ return -EPERM;
3998+
3999+ if (!srp_sems[sem_id].used)
4000+ return -EINVAL;
4001+
4002+ if (srp_sems[sem_id].cpu != get_partition(t))
4003+ return -EINVAL;
4004+
4005+ preempt_disable();
4006+ t->rt_param.subject_to_srp = 1;
4007+ t_prio.period = get_rt_period(t);
4008+ t_prio.pid = t->pid;
4009+ if (srp_higher_prio(&t_prio, &srp_sems[sem_id].ceiling)) {
4010+ srp_sems[sem_id].ceiling.period = t_prio.period;
4011+ srp_sems[sem_id].ceiling.pid = t_prio.pid;
4012+ }
4013+
4014+ preempt_enable();
4015+
4016+ return 0;
4017+}
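+/* Typical per-partition setup order (sketch): the ceilings must be known
+ * before the task system starts, so user space is expected to do roughly
+ *
+ *	sem = sys_srp_sema_init();		from a real-time task
+ *	sys_reg_task_srp_sem(sem, pid);		once per task using sem
+ *	... release the task system ...
+ *	sys_srp_down(sem);			raises this CPU's ceiling
+ *	... critical section ...
+ *	sys_srp_up(sem);			lowers it and wakes blocked jobs
+ */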
4018+
4019+static int srp_wake_up(wait_queue_t *wait, unsigned mode, int sync,
4020+ void *key)
4021+{
4022+ int cpu = smp_processor_id();
4023+ struct task_struct *tsk = wait->private;
4024+ if (cpu != get_partition(tsk))
4025+		TRACE_TASK(tsk, "srp_wake_up on wrong cpu, partition is %d\n",
4026+ get_partition(tsk));
4027+ else if (srp_exceeds_ceiling(tsk, &__get_cpu_var(srp)))
4028+ return default_wake_function(wait, mode, sync, key);
4029+ return 0;
4030+}
4031+
4032+
4033+/* Wait for current task priority to exceed system-wide priority ceiling.
4034+ * Can be used to determine when it is safe to run a job after its release.
4035+ */
4036+void srp_ceiling_block(void)
4037+{
4038+ struct task_struct *tsk = current;
4039+ wait_queue_t wait = {
4040+ .private = tsk,
4041+ .func = srp_wake_up,
4042+ .task_list = {NULL, NULL}
4043+ };
4044+
4045+ preempt_disable();
4046+ if (!srp_exceeds_ceiling(tsk, &__get_cpu_var(srp))) {
4047+ tsk->state = TASK_UNINTERRUPTIBLE;
4048+ add_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait);
4049+ TRACE_CUR("is priority ceiling blocked.\n");
4050+ preempt_enable_no_resched();
4051+ schedule();
4052+ /* Access to CPU var must occur with preemptions disabled, otherwise
4053+ * Linux debug code complains loudly, even if it is ok here.
4054+ */
4055+ preempt_disable();
4056+ TRACE_CUR("finally exceeds system ceiling.\n");
4057+ remove_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait);
4058+ preempt_enable();
4059+ } else {
4060+ TRACE_CUR("is not priority ceiling blocked\n");
4061+ preempt_enable();
4062+ }
4063+}
4064+
4065+/* Free semaphore, adjusting the system-wide priority ceiling if necessary. */
4066+asmlinkage long sys_srp_sema_free(srp_sema_id sem_id)
4067+{
4068+ int cpu;
4069+ int ret = 0;
4070+
4071+ preempt_disable();
4072+ cpu = smp_processor_id();
4073+
4074+ if (sem_id < 0 || sem_id >= MAX_SRP_SEMAPHORES ||
4075+ srp_sems[sem_id].cpu != cpu) {
4076+ ret = -EINVAL;
4077+ goto out;
4078+ }
4079+
4080+ srp_sems[sem_id].claimed = 0;
4081+ srp_sems[sem_id].used = 0;
4082+
4083+out:
4084+ preempt_enable();
4085+ return ret;
4086+}
4087+
4088+
4089+
4090+/* ************************************************************************** */
4091+
4092+
4093+
4094diff --git a/kernel/pfair_common.c b/kernel/pfair_common.c
4095new file mode 100644
4096index 0000000..c50fdab
4097--- /dev/null
4098+++ b/kernel/pfair_common.c
4099@@ -0,0 +1,237 @@
4100+/*
4101+ * Common functions for PFAIR based scheduler.
4102+ */
4103+
4104+#include <linux/percpu.h>
4105+#include <linux/sched.h>
4106+#include <linux/list.h>
4107+
4108+#include <linux/litmus.h>
4109+#include <linux/sched_plugin.h>
4110+#include <linux/sched_trace.h>
4111+
4112+#include <linux/pfair_common.h>
4113+#include <linux/pfair_math.h>
4114+/* Comparison of two tasks: returns nonzero iff
4115+ * the lhs has higher PD^2 priority than the rhs. */
4116+int is_pfair_hp(struct task_struct *lhs, struct task_struct *rhs)
4117+{
4118+ /* Favor subtasks with earlier deadlines */
4119+ if(time_before(get_deadline(lhs), get_deadline(rhs)))
4120+ return 1;
4121+ if(get_deadline(lhs) == get_deadline(rhs)) {
4122+ /* If deadlines are equal,
4123+ * favor non-zero b-bit (a heavy task) */
4124+ if(lhs->rt_param.times.b_bit > rhs->rt_param.times.b_bit)
4125+ return 1;
4126+
4127+ if(lhs->rt_param.times.b_bit == rhs->rt_param.times.b_bit &&
4128+ lhs->rt_param.times.b_bit == 1)
4129+ /* If b-bit is 1, favor tasks with later
4130+ * group deadline */
4131+ return time_after(lhs->rt_param.times.group_deadline,
4132+ rhs->rt_param.times.group_deadline);
4133+
4134+ }
4135+ return 0;
4136+}
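+/* Worked example: for two subtasks with equal deadlines, one with
+ * b_bit == 1 beats one with b_bit == 0. If both have b_bit == 1, the one
+ * with the later group deadline wins; if those are equal as well,
+ * is_pfair_hp() returns 0 and the existing queue order breaks the tie.
+ */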
4137+
4138+void pfair_domain_init(pfair_domain_t *pfair)
4139+{
4140+ BUG_ON(!pfair);
4141+ INIT_LIST_HEAD(&pfair->ready_queue);
4142+ INIT_LIST_HEAD(&pfair->release_queue);
4143+ queue_lock_init(&pfair->pfair_lock);
4144+ cpus_setall(pfair->domain_cpus);
4145+ /* Use cpu 0 to keep the system alive
4146+ * TODO: Remove later or make it configurable
4147+ * */
4148+ cpu_clear(0, pfair->domain_cpus);
4149+}
4150+
4151+
4152+/* add_ready - add a real-time task to the PFAIR ready queue.
4153+ * It must be runnable. Global domain lock must be held before
4154+ * calling this function.
4155+ *
4156+ * @new: the newly released task
4157+ */
4158+void pfair_add_ready(pfair_domain_t* pfair, struct task_struct *new)
4159+{
4160+ struct list_head *pos;
4161+ struct task_struct *queued;
4162+
4163+ BUG_ON(!new);
4164+	/* find the first queued task with lower priority than new */
4165+ list_for_each(pos, &pfair->ready_queue) {
4166+ queued = list_entry(pos, struct task_struct, rt_list);
4167+ if (unlikely(is_pfair_hp(new, queued))) {
4168+			/* the task at pos has lower priority */
4169+ /* insert the new task in front of it */
4170+ __list_add(&new->rt_list, pos->prev, pos);
4171+ return;
4172+ }
4173+ }
4174+ /* if we get to this point either the list is empty or new has the
4175+ * lowest priority. Let's add it to the end. */
4176+ list_add_tail(&new->rt_list, &pfair->ready_queue);
4177+}
4178+/**
4179+ * Extraction function.
4180+ */
4181+struct task_struct* __pfair_take_ready(pfair_domain_t* pfair)
4182+{
4183+ struct task_struct *t = NULL;
4184+ /* either not yet released, preempted, or non-rt */
4185+ if (!list_empty(&pfair->ready_queue)) {
4186+
4187+ /* take next rt task */
4188+ t = list_entry(pfair->ready_queue.next, struct task_struct,
4189+ rt_list);
4190+
4191+ /* kick it out of the ready list */
4192+ list_del(&t->rt_list);
4193+ }
4194+ return t;
4195+}
4196+
4197+
4198+/* add_release - add a real-time task to the PFAIR release queue.
4199+ * Domain lock must be acquired before the function is called.
4200+ *
4201+ * @task: the sleeping task
4202+ */
4203+void pfair_add_release(pfair_domain_t* pfair, struct task_struct *task)
4204+{
4205+ struct list_head *pos;
4206+ struct task_struct *queued;
4207+
4208+ BUG_ON(!task);
4209+	/* find the last queued task that is released before us */
4210+ list_for_each_prev(pos, &pfair->release_queue) {
4211+ queued = list_entry(pos, struct task_struct, rt_list);
4212+ if ((unlikely(time_before(queued->rt_param.times.release,
4213+ task->rt_param.times.release)))) {
4214+ /* the task at pos has an earlier release */
4215+			/* insert the new task behind it */
4216+ __list_add(&task->rt_list, pos, pos->next);
4217+ return;
4218+ }
4219+ }
4220+ /* if we get to this point either the list is empty or task has the
4221+ * earliest release. Let's add it to the front. */
4222+ list_add(&task->rt_list, &pfair->release_queue);
4223+}
4224+/**
4225+ * This function is called from the tick handler; it acquires the lock
4226+ * itself. Only one processor effectively merges the queues.
4227+ */
4228+void pfair_try_release_pending(pfair_domain_t* pfair)
4229+{
4230+ unsigned long flags;
4231+ struct list_head *pos, *save;
4232+ struct task_struct *queued;
4233+ queue_lock_irqsave(&pfair->pfair_lock, flags);
4234+
4235+ list_for_each_safe(pos, save, &pfair->release_queue) {
4236+ queued = list_entry(pos, struct task_struct, rt_list);
4237+ if (likely(time_before_eq(
4238+ queued->rt_param.times.release, jiffies))) {
4239+ /* this one is ready to go*/
4240+ list_del(pos);
4241+ set_rt_flags(queued, RT_F_RUNNING);
4242+
4243+ sched_trace_job_release(queued);
4244+ /* now it can be picked up */
4245+ barrier();
4246+ pfair_add_ready(pfair, queued);
4247+ }
4248+ else
4249+ /* the release queue is ordered */
4250+ break;
4251+ }
4252+ queue_unlock_irqrestore(&pfair->pfair_lock, flags);
4253+}
4254+/*
4255+ * Subtask preparation. Assumes that last_release
4256+ * denotes the time when the job was released.
4257+ */
4258+void pfair_prepare_next_subtask(struct task_struct *t)
4259+{
4260+ BUG_ON(!t);
4261+ /* assign subtask release time, deadline, b-bit,
4262+ * and group deadline
4263+ */
4264+ t->rt_param.times.release = t->rt_param.times.last_release
4265+ +release_time(t);
4266+ t->rt_param.times.deadline = t->rt_param.times.last_release
4267+ +pfair_deadline(t);
4268+ t->rt_param.times.b_bit = b_bit(t);
4269+ t->rt_param.times.group_deadline = t->rt_param.times.last_release
4270+ +group_deadline(t);
4271+}
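+/* Background: in the standard Pfair/PD^2 formulation, which the helpers
+ * from pfair_math.h are assumed to implement, subtask i (counting from 1)
+ * of a task with weight w = e/p has, relative to the job release,
+ *
+ *	release   r(T_i) = floor((i-1)/w)
+ *	deadline  d(T_i) = ceil(i/w)
+ *	b-bit     b(T_i) = ceil(i/w) - floor(i/w)
+ *
+ * e.g. for w = 3/5 and i = 2 this gives r = 1, d = 4, b = 1 (the windows
+ * of subtasks 2 and 3 overlap).
+ */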
4272+
4273+void pfair_prepare_next_job(struct task_struct *t)
4274+{
4275+ BUG_ON(!t);
4276+
4277+ /* prepare next job release */
4278+	/* reset the count of consumed quanta so that new release times
4279+	 * and deadlines for subtasks can be computed correctly
4280+ */
4281+ t->rt_param.times.exec_time = 0;
4282+ /* assign job-wide release time,
4283+ * this is the starting point to
4284+ * compute subtask releases, deadlines and group deadlines
4285+ */
4286+ t->rt_param.times.last_release = t->rt_param.times.last_release
4287+ +get_rt_period(t);
4288+ /* Release the first subtask. */
4289+ pfair_prepare_next_subtask(t);
4290+ t->first_time_slice = 0;
4291+ /* Increase job sequence number */
4292+ t->rt_param.times.job_no++;
4293+}
4294+
4295+void __pfair_prepare_new_release(struct task_struct *t, jiffie_t start)
4296+{
4297+ t->rt_param.times.release = start;
4298+ t->rt_param.times.last_release = start;
4299+ t->rt_param.times.exec_time = 0;
4300+ t->first_time_slice = 0;
4301+ pfair_prepare_next_subtask(t);
4302+ set_rt_flags(t, RT_F_RUNNING);
4303+}
4304+
4305+void pfair_prepare_new_releases(pfair_domain_t *pfair, jiffie_t start)
4306+{
4307+ unsigned long flags;
4308+ struct list_head tmp_list;
4309+ struct list_head *pos, *n;
4310+ struct task_struct *t;
4311+
4312+ INIT_LIST_HEAD(&tmp_list);
4313+
4314+ queue_lock_irqsave(&pfair->pfair_lock, flags);
4315+
4316+
4317+ while (!list_empty(&pfair->release_queue)) {
4318+ pos = pfair->release_queue.next;
4319+ list_del(pos);
4320+ list_add(pos, &tmp_list);
4321+ }
4322+ while (!list_empty(&pfair->ready_queue)) {
4323+ pos = pfair->ready_queue.next;
4324+ list_del(pos);
4325+ list_add(pos, &tmp_list);
4326+ }
4327+
4328+ list_for_each_safe(pos, n, &tmp_list) {
4329+ t = list_entry(pos, struct task_struct, rt_list);
4330+ list_del(pos);
4331+ __pfair_prepare_new_release(t, start);
4332+ pfair_add_release(pfair, t);
4333+ }
4334+ queue_unlock_irqrestore(&pfair->pfair_lock, flags);
4335+}
4336+
4337diff --git a/kernel/rt_domain.c b/kernel/rt_domain.c
4338new file mode 100644
4339index 0000000..4875c53
4340--- /dev/null
4341+++ b/kernel/rt_domain.c
4342@@ -0,0 +1,185 @@
4343+/*
4344+ * kernel/rt_domain.c
4345+ *
4346+ * LITMUS real-time infrastructure. This file contains the
4347+ * functions that manipulate RT domains. RT domains are an abstraction
4348+ * of a ready queue and a release queue.
4349+ */
4350+
4351+#include <linux/percpu.h>
4352+#include <linux/sched.h>
4353+#include <linux/list.h>
4354+
4355+#include <linux/litmus.h>
4356+#include <linux/sched_plugin.h>
4357+#include <linux/sched_trace.h>
4358+
4359+#include <linux/rt_domain.h>
4360+
4361+
4362+static int dummy_resched(rt_domain_t *rt)
4363+{
4364+ return 0;
4365+}
4366+
4367+static int dummy_order(struct list_head* a, struct list_head* b)
4368+{
4369+ return 0;
4370+}
4371+
4372+int release_order(struct list_head* a, struct list_head* b)
4373+{
4374+ return earlier_release(
4375+ list_entry(a, struct task_struct, rt_list),
4376+ list_entry(b, struct task_struct, rt_list));
4377+}
4378+
4379+
4380+void rt_domain_init(rt_domain_t *rt,
4381+ check_resched_needed_t f,
4382+ list_cmp_t order)
4383+{
4384+ BUG_ON(!rt);
4385+ if (!f)
4386+ f = dummy_resched;
4387+ if (!order)
4388+ order = dummy_order;
4389+ INIT_LIST_HEAD(&rt->ready_queue);
4390+ INIT_LIST_HEAD(&rt->release_queue);
4391+ rt->ready_lock = RW_LOCK_UNLOCKED;
4392+ rt->release_lock = SPIN_LOCK_UNLOCKED;
4393+ rt->check_resched = f;
4394+ rt->order = order;
4395+}
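+/* Illustrative use (sketch; the callback names are placeholders): a
+ * scheduler plugin initializes its domain with a priority order and a
+ * resched hook, roughly as follows:
+ *
+ *	static rt_domain_t my_domain;
+ *
+ *	static int my_ready_order(struct list_head *a, struct list_head *b)
+ *	{
+ *		return edf_higher_prio(rt_list2task(a), rt_list2task(b));
+ *	}
+ *
+ *	static int my_check_resched(rt_domain_t *rt)
+ *	{
+ *		... set need_resched or send an IPI if a preemption
+ *		    is required ...
+ *		return 0;
+ *	}
+ *
+ *	rt_domain_init(&my_domain, my_check_resched, my_ready_order);
+ */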
4396+
4397+/* add_ready - add a real-time task to the rt ready queue. It must be runnable.
4398+ * @new: the newly released task
4399+ */
4400+void __add_ready(rt_domain_t* rt, struct task_struct *new)
4401+{
4402+ TRACE("rt: adding %s/%d (%u, %u) to ready queue\n",
4403+ new->comm, new->pid, get_exec_cost(new), get_rt_period(new));
4404+
4405+ if (!list_insert(&new->rt_list, &rt->ready_queue, rt->order))
4406+ rt->check_resched(rt);
4407+}
4408+
4409+struct task_struct* __take_ready(rt_domain_t* rt)
4410+{
4411+ struct task_struct *t = __peek_ready(rt);
4412+
4413+ /* kick it out of the ready list */
4414+ if (t)
4415+ list_del(&t->rt_list);
4416+ return t;
4417+}
4418+
4419+struct task_struct* __peek_ready(rt_domain_t* rt)
4420+{
4421+ if (!list_empty(&rt->ready_queue))
4422+ return next_ready(rt);
4423+ else
4424+ return NULL;
4425+}
4426+
4427+struct task_struct* __take_ready_rq(rt_domain_t* rt, runqueue_t* rq, int cpu)
4428+{
4429+ struct task_struct *task = __take_ready(rt);
4430+
4431+ if (task) {
4432+ set_task_cpu(task, cpu);
4433+ __activate_task(task, rq);
4434+ }
4435+ return task;
4436+}
4437+
4438+/* add_release - add a real-time task to the rt release queue.
4439+ * @task: the sleeping task
4440+ */
4441+void __add_release(rt_domain_t* rt, struct task_struct *task)
4442+{
4443+ TRACE("rt: adding %s/%d (%u, %u) rel=%d to release queue\n",
4444+ task->comm, task->pid, get_exec_cost(task), get_rt_period(task),
4445+ get_release(task));
4446+
4447+ list_insert(&task->rt_list, &rt->release_queue, release_order);
4448+}
4449+
4450+void __release_pending(rt_domain_t* rt)
4451+{
4452+ struct list_head *pos, *save;
4453+ struct task_struct *queued;
4454+ list_for_each_safe(pos, save, &rt->release_queue) {
4455+ queued = list_entry(pos, struct task_struct, rt_list);
4456+ if (likely(is_released(queued))) {
4457+ /* this one is ready to go*/
4458+ list_del(pos);
4459+ set_rt_flags(queued, RT_F_RUNNING);
4460+
4461+ sched_trace_job_release(queued);
4462+
4463+ /* now it can be picked up */
4464+ barrier();
4465+ add_ready(rt, queued);
4466+ }
4467+ else
4468+ /* the release queue is ordered */
4469+ break;
4470+ }
4471+}
4472+
4473+void try_release_pending(rt_domain_t* rt)
4474+{
4475+ unsigned long flags;
4476+
4477+ if (spin_trylock_irqsave(&rt->release_lock, flags)) {
4478+ __release_pending(rt);
4479+ spin_unlock_irqrestore(&rt->release_lock, flags);
4480+ }
4481+}
4482+
4483+void rerelease_all(rt_domain_t *rt,
4484+ release_at_t release)
4485+{
4486+ unsigned long flags;
4487+
4488+ spin_lock_irqsave(&rt->release_lock, flags);
4489+ write_lock(&rt->ready_lock);
4490+
4491+ __rerelease_all(rt, release);
4492+
4493+ write_unlock(&rt->ready_lock);
4494+ spin_unlock_irqrestore(&rt->release_lock, flags);
4495+}
4496+
4497+void __rerelease_all(rt_domain_t *rt,
4498+ release_at_t release)
4499+{
4500+ jiffie_t start = jiffies + 10;
4501+ struct list_head tmp_list;
4502+ struct list_head *pos, *n;
4503+ struct task_struct *t;
4504+
4505+ INIT_LIST_HEAD(&tmp_list);
4506+
4507+ while (!list_empty(&rt->release_queue)) {
4508+ pos = rt->release_queue.next;
4509+ list_del(pos);
4510+ list_add(pos, &tmp_list);
4511+ }
4512+ while (!list_empty(&rt->ready_queue)) {
4513+ pos = rt->ready_queue.next;
4514+ list_del(pos);
4515+ list_add(pos, &tmp_list);
4516+ }
4517+
4518+ list_for_each_safe(pos, n, &tmp_list) {
4519+ t = list_entry(pos, struct task_struct, rt_list);
4520+ list_del(pos);
4521+ release(t, start);
4522+ __add_release(rt, t);
4523+ }
4524+
4525+}
4526+
4527+
4528diff --git a/kernel/sched.c b/kernel/sched.c
4529index cca93cc..5ad4276 100644
4530--- a/kernel/sched.c
4531+++ b/kernel/sched.c
4532@@ -56,6 +56,16 @@
4533
4534 #include <asm/unistd.h>
4535
4536+#include <linux/litmus.h>
4537+#define __SCHED_C__
4538+#include <linux/sched_plugin.h>
4539+#include <linux/sched_trace.h>
4540+#include <linux/rt_param.h>
4541+#include <linux/trace.h>
4542+
4543+/* LITMUS: avoid races with multiple task wake-ups */
4544+DEFINE_SPINLOCK(litmus_task_set_lock);
4545+
4546 /*
4547 * Convert user-nice values [ -20 ... 0 ... 19 ]
4548 * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
4549@@ -836,7 +846,7 @@ static int effective_prio(struct task_struct *p)
4550 * keep the priority unchanged. Otherwise, update priority
4551 * to the normal priority:
4552 */
4553- if (!rt_prio(p->prio))
4554+ if (!rt_prio(p->prio) && !is_realtime(p))
4555 return p->normal_prio;
4556 return p->prio;
4557 }
4558@@ -844,7 +854,7 @@ static int effective_prio(struct task_struct *p)
4559 /*
4560 * __activate_task - move a task to the runqueue.
4561 */
4562-static void __activate_task(struct task_struct *p, struct rq *rq)
4563+void __activate_task(struct task_struct *p, struct rq *rq)
4564 {
4565 struct prio_array *target = rq->active;
4566
4567@@ -999,7 +1009,7 @@ out:
4568 /*
4569 * deactivate_task - remove a task from the runqueue.
4570 */
4571-static void deactivate_task(struct task_struct *p, struct rq *rq)
4572+void deactivate_task(struct task_struct *p, struct rq *rq)
4573 {
4574 dec_nr_running(p, rq);
4575 dequeue_task(p, p->array);
4576@@ -1408,13 +1418,44 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
4577 #endif
4578
4579 rq = task_rq_lock(p, &flags);
4580+
4581+ if (is_realtime(p))
4582+ TRACE("try_to_wake_up(%s/%d)\n", p->comm, p->pid);
4583+
4584 old_state = p->state;
4585 if (!(old_state & state))
4586- goto out;
4587+ goto out;
4588
4589 if (p->array)
4590 goto out_running;
4591
4592+
4593+ spin_lock(&litmus_task_set_lock);
4594+ if (p->rt_param.litmus_controlled) {
4595+ /* Already included. This can happen
4596+ * if the task dropped all locks to call
4597+ * schedule() but a wake up raced and came in
4598+ * early.
4599+ */
4600+
4601+ spin_unlock(&litmus_task_set_lock);
4602+ goto out_running;
4603+ }
4604+
4605+ sched_trace_task_arrival(p);
4606+ if (is_realtime(p)) {
4607+ p->rt_param.litmus_controlled = 1;
4608+ curr_sched_plugin->wake_up_task(p);
4609+
4610+ spin_unlock(&litmus_task_set_lock);
4611+ goto out_running;
4612+ }
4613+
4614+ p->rt_param.litmus_controlled = 0;
4615+ spin_unlock(&litmus_task_set_lock);
4616+
4617+
4618+
4619 cpu = task_cpu(p);
4620 this_cpu = smp_processor_id();
4621
4622@@ -1580,6 +1621,7 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags)
4623 cpu = sched_balance_self(cpu, SD_BALANCE_FORK);
4624 #endif
4625 set_task_cpu(p, cpu);
4626+ clear_rt_params(p);
4627
4628 /*
4629 * We mark the process as running here, but have not actually
4630@@ -1595,6 +1637,10 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags)
4631 p->prio = current->normal_prio;
4632
4633 INIT_LIST_HEAD(&p->run_list);
4634+ INIT_LIST_HEAD(&p->rt_list);
4635+ p->rt_param.basic_params.class = RT_CLASS_BEST_EFFORT;
4636+ p->rt_param.litmus_controlled = 0;
4637+ p->rt_param.inh_task = NULL;
4638 p->array = NULL;
4639 #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
4640 if (unlikely(sched_info_on()))
4641@@ -1647,6 +1693,12 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
4642 unsigned long flags;
4643 int this_cpu, cpu;
4644
4645+ if (clone_flags & CLONE_REALTIME) {
4646+ /* just mark the task as stopped */
4647+ p->state = TASK_STOPPED;
4648+ return;
4649+ }
4650+
4651 rq = task_rq_lock(p, &flags);
4652 BUG_ON(p->state != TASK_RUNNING);
4653 this_cpu = smp_processor_id();
4654@@ -1730,6 +1782,9 @@ void fastcall sched_exit(struct task_struct *p)
4655 unsigned long flags;
4656 struct rq *rq;
4657
4658+ if (is_realtime(p))
4659+ return;
4660+
4661 /*
4662 * If the child was a (relative-) CPU hog then decrease
4663 * the sleep_avg of the parent as well.
4664@@ -1801,6 +1856,13 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
4665 */
4666 prev_state = prev->state;
4667 finish_arch_switch(prev);
4668+ /* Requeue previous real-time task before we drop the rq lock, cause
4669+ * that may lead to a preemption.
4670+ */
4671+ curr_sched_plugin->finish_switch(prev);
4672+ sched_trace_task_scheduled(current);
4673+ /* trace before IRQs are enabled */
4674+ TS_CXS_END;
4675 finish_lock_switch(rq, prev);
4676 if (mm)
4677 mmdrop(mm);
4678@@ -1811,7 +1873,7 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev)
4679 */
4680 kprobe_flush_task(prev);
4681 put_task_struct(prev);
4682- }
4683+ }
4684 }
4685
4686 /**
4687@@ -2990,7 +3052,7 @@ static inline void idle_balance(int cpu, struct rq *rq)
4688 static inline void wake_priority_sleeper(struct rq *rq)
4689 {
4690 #ifdef CONFIG_SCHED_SMT
4691- if (!rq->nr_running)
4692+ if (!rq->nr_running || get_rt_mode() == MODE_RT_RUN)
4693 return;
4694
4695 spin_lock(&rq->lock);
4696@@ -3220,14 +3282,30 @@ void scheduler_tick(void)
4697
4698 update_cpu_clock(p, rq, now);
4699
4700- if (p == rq->idle)
4701- /* Task on the idle queue */
4702- wake_priority_sleeper(rq);
4703- else
4704- task_running_tick(rq, p);
4705+ /* check whether the RT scheduler plugin requires a call to
4706+ * schedule
4707+ */
4708+ TS_PLUGIN_TICK_START;
4709+ if (rt_scheduler_tick() == FORCE_RESCHED)
4710+ set_tsk_need_resched(p);
4711+ TS_PLUGIN_TICK_END;
4712+
4713+ /* real-time accounting is done by the plugin
4714+ * call linux functions only for background tasks
4715+ */
4716+ if (!is_realtime(p)) {
4717+ if (p == rq->idle)
4718+ /* Task on the idle queue */
4719+ wake_priority_sleeper(rq);
4720+ else
4721+ task_running_tick(rq, p);
4722+ }
4723+ send_scheduler_signals();
4724+
4725 #ifdef CONFIG_SMP
4726 update_load(rq);
4727- if (time_after_eq(jiffies, rq->next_balance))
4728+ if (time_after_eq(jiffies, rq->next_balance) &&
4729+ get_rt_mode() == MODE_NON_RT)
4730 raise_softirq(SCHED_SOFTIRQ);
4731 #endif
4732 }
4733@@ -3420,6 +3498,7 @@ asmlinkage void __sched schedule(void)
4734 long *switch_count;
4735 struct rq *rq;
4736
4737+
4738 /*
4739 * Test if we are atomic. Since do_exit() needs to call into
4740 * schedule() atomically, we ignore that path for now.
4741@@ -3427,8 +3506,9 @@ asmlinkage void __sched schedule(void)
4742 */
4743 if (unlikely(in_atomic() && !current->exit_state)) {
4744 printk(KERN_ERR "BUG: scheduling while atomic: "
4745- "%s/0x%08x/%d\n",
4746- current->comm, preempt_count(), current->pid);
4747+ "%s/0x%08x/%d %s\n",
4748+ current->comm, preempt_count(), current->pid,
4749+ is_realtime(current) ? "rt" : "non-rt");
4750 debug_show_held_locks(current);
4751 if (irqs_disabled())
4752 print_irqtrace_events(current);
4753@@ -3438,6 +3518,7 @@ asmlinkage void __sched schedule(void)
4754
4755 need_resched:
4756 preempt_disable();
4757+ TS_SCHED_START;
4758 prev = current;
4759 release_kernel_lock(prev);
4760 need_resched_nonpreemptible:
4761@@ -3470,6 +3551,7 @@ need_resched_nonpreemptible:
4762 spin_lock_irq(&rq->lock);
4763
4764 switch_count = &prev->nivcsw;
4765+ /* check for blocking tasks */
4766 if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
4767 switch_count = &prev->nvcsw;
4768 if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
4769@@ -3478,13 +3560,66 @@ need_resched_nonpreemptible:
4770 else {
4771 if (prev->state == TASK_UNINTERRUPTIBLE)
4772 rq->nr_uninterruptible++;
4773+ /* we need to remove real-time tasks from the runqueue*/
4774+
4775+ /* protect against races with signal delivery and IO
4776+ * interrupts on other CPUs
4777+ *
4778+ * FIXME: This is probably not sufficient,
4779+ * as (in theory) after
4780+ * unlocking the task_set_lock this task could
4781+	 * be scheduled elsewhere before we switched away
4782+ * from it. This has not been observed
4783+ * yet. To get this locking right is tricky.
4784+ */
4785+ spin_lock(&litmus_task_set_lock);
4786+ if (prev->rt_param.litmus_controlled)
4787+ prev->rt_param.litmus_controlled = 0;
4788+ spin_unlock(&litmus_task_set_lock);
4789+
4790+ if (is_realtime(prev)) {
4791+ TRACE("schedule: %s/%d blocks. state = %d\n",
4792+ prev->comm, prev->pid, prev->state);
4793+ curr_sched_plugin->task_blocks(prev);
4794+ /* Enable this for all tasks to get _a lot_ of
4795+ * data. Can be helpful for debugging.
4796+ */
4797+ sched_trace_task_departure(prev);
4798+ }
4799+
4800+ /* only indirect switching is supported in the current
4801+ * version of LITMUS
4802+ */
4803 deactivate_task(prev, rq);
4804 }
4805 }
4806
4807+ next = NULL;
4808+
4809+ /* consult the real-time plugin */
4810+ TS_PLUGIN_SCHED_START;
4811+ curr_sched_plugin->schedule(prev, &next, rq);
4812+ TS_PLUGIN_SCHED_END;
4813+ /* If the real-time plugin wants to switch to a specific task
4814+ * it'll be on the rq and have the highest priority. There will
4815+	 * be exactly one such task, thus the selection of the next task
4816+ * is unambiguous and the following code can only get
4817+ * triggered if there are no RT tasks pending (on this CPU). Thus,
4818+ * we may as well skip it.
4819+ */
4820+ if (next)
4821+ goto switch_tasks;
4822+
4823 cpu = smp_processor_id();
4824 if (unlikely(!rq->nr_running)) {
4825- idle_balance(cpu, rq);
4826+ /* only load-balance if we are not in RT mode
4827+ *
4828+	 * TODO: Maybe this can be relaxed by modifying the
4829+ * load-balancing routines in such a way that they never touch
4830+ * real-time tasks.
4831+ */
4832+ if (get_rt_mode() == MODE_NON_RT)
4833+ idle_balance(cpu, rq);
4834 if (!rq->nr_running) {
4835 next = rq->idle;
4836 rq->expired_timestamp = 0;
4837@@ -3528,7 +3663,7 @@ need_resched_nonpreemptible:
4838 }
4839 }
4840 next->sleep_type = SLEEP_NORMAL;
4841- if (dependent_sleeper(cpu, rq, next))
4842+ if (get_rt_mode() == MODE_NON_RT && dependent_sleeper(cpu, rq, next))
4843 next = rq->idle;
4844 switch_tasks:
4845 if (next == rq->idle)
4846@@ -3546,7 +3681,11 @@ switch_tasks:
4847 prev->timestamp = prev->last_ran = now;
4848
4849 sched_info_switch(prev, next);
4850+ TS_SCHED_END;
4851 if (likely(prev != next)) {
4852+ TS_CXS_START;
4853+ if (is_running(prev))
4854+ sched_trace_task_preemption(prev, next);
4855 next->timestamp = now;
4856 rq->nr_switches++;
4857 rq->curr = next;
4858@@ -3560,9 +3699,12 @@ switch_tasks:
4859 * CPUs since it called schedule(), thus the 'rq' on its stack
4860 * frame will be invalid.
4861 */
4862- finish_task_switch(this_rq(), prev);
4863- } else
4864+ finish_task_switch(this_rq(), prev);
4865+ } else {
4866 spin_unlock_irq(&rq->lock);
4867+ }
4868+
4869+ send_scheduler_signals();
4870
4871 prev = current;
4872 if (unlikely(reacquire_kernel_lock(prev) < 0))
4873@@ -3691,6 +3833,7 @@ static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
4874 }
4875 }
4876
4877+
4878 /**
4879 * __wake_up - wake up threads blocked on a waitqueue.
4880 * @q: the waitqueue
4881@@ -3709,6 +3852,7 @@ void fastcall __wake_up(wait_queue_head_t *q, unsigned int mode,
4882 }
4883 EXPORT_SYMBOL(__wake_up);
4884
4885+
4886 /*
4887 * Same as __wake_up but called with the spinlock in wait_queue_head_t held.
4888 */
4889@@ -3717,6 +3861,7 @@ void fastcall __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
4890 __wake_up_common(q, mode, 1, 0, NULL);
4891 }
4892
4893+
4894 /**
4895 * __wake_up_sync - wake up threads blocked on a waitqueue.
4896 * @q: the waitqueue
4897@@ -4175,7 +4320,7 @@ static inline struct task_struct *find_process_by_pid(pid_t pid)
4898 }
4899
4900 /* Actually do priority change: must hold rq lock. */
4901-static void __setscheduler(struct task_struct *p, int policy, int prio)
4902+void __setscheduler(struct task_struct *p, int policy, int prio)
4903 {
4904 BUG_ON(p->array);
4905
4906@@ -6877,7 +7022,7 @@ void __init sched_init_smp(void)
4907 BUG();
4908 }
4909 #else
4910-void __init sched_init_smp(void)
4911+void __init linux_sched_init_smp(void)
4912 {
4913 }
4914 #endif /* CONFIG_SMP */
4915diff --git a/kernel/sched_adaptive.c b/kernel/sched_adaptive.c
4916new file mode 100644
4917index 0000000..319ebbc
4918--- /dev/null
4919+++ b/kernel/sched_adaptive.c
4920@@ -0,0 +1,1454 @@
4921+
4922+
4923+/*
4924+ * kernel/sched_adaptive.c
4925+ *
4926+ * Implementation of Aaron's adaptive global EDF scheduling algorithm. It is
4927+ * based on the GSN-EDF scheduler. However, it does not support synchronization
4928+ * primitives.
4929+ *
4930+ * It implements a version of FC-GEDF with a bunch of linearity assumptions for
4931+ * the optimizer and the weight-transfer function. The code is meant to be
4932+ * clear, however you really need to read the paper if you want to understand
4933+ * what is going on here.
4934+ *
4935+ * Block et al., "Feedback-Controlled Adaptive Multiprocessor Real-Time
4936+ * Systems", submitted to RTAS 2008.
4937+ */
4938+
4939+#include <linux/percpu.h>
4940+#include <linux/sched.h>
4941+#include <linux/list.h>
4942+
4943+#include <linux/queuelock.h>
4944+#include <linux/litmus.h>
4945+#include <linux/sched_plugin.h>
4946+#include <linux/edf_common.h>
4947+#include <linux/sched_trace.h>
4948+#include <asm/uaccess.h>
4949+
4950+#include <linux/fpmath.h>
4951+
4952+/* Overview of GSN-EDF operations.
4953+ *
4954+ * For a detailed explanation of GSN-EDF have a look at the FMLP paper. This
4955+ * description only covers how the individual operations are implemented in
4956+ * LITMUS.
4957+ *
4958+ * link_task_to_cpu(T, cpu) - Low-level operation to update the linkage
4959+ * structure (NOT the actually scheduled
4960+ * task). If there is another linked task To
4961+ * already it will set To->linked_on = NO_CPU
4962+ * (thereby removing its association with this
4963+ * CPU). However, it will not requeue the
4964+ * previously linked task (if any). It will set
4965+ * T's state to RT_F_RUNNING and check whether
4966+ * it is already running somewhere else. If T
4967+ * is scheduled somewhere else it will link
4968+ * it to that CPU instead (and pull the linked
4969+ * task to cpu). T may be NULL.
4970+ *
4971+ * unlink(T) - Unlink removes T from all scheduler data
4972+ * structures. If it is linked to some CPU it
4973+ * will link NULL to that CPU. If it is
4974+ * currently queued in the gsnedf queue it will
4975+ * be removed from the T->rt_list. It is safe to
4976+ * call unlink(T) if T is not linked. T may not
4977+ * be NULL.
4978+ *
4979+ * requeue(T) - Requeue will insert T into the appropriate
4980+ * queue. If the system is in real-time mode and
4981+ *                T is released already, it will go into the
4982+ *                ready queue. If the system is not in
4983+ *                real-time mode, then T will go into the
4984+ * release queue. If T's release time is in the
4985+ * future, it will go into the release
4986+ * queue. That means that T's release time/job
4987+ *                no/etc. has to be updated before requeue(T) is
4988+ * called. It is not safe to call requeue(T)
4989+ * when T is already queued. T may not be NULL.
4990+ *
4991+ * gsnedf_job_arrival(T) - This is the catch all function when T enters
4992+ * the system after either a suspension or at a
4993+ * job release. It will queue T (which means it
4994+ * is not safe to call gsnedf_job_arrival(T) if
4995+ * T is already queued) and then check whether a
4996+ * preemption is necessary. If a preemption is
4997+ * necessary it will update the linkage
4998+ *                              accordingly and cause schedule() to be called
4999+ * (either with an IPI or need_resched). It is
5000+ * safe to call gsnedf_job_arrival(T) if T's
5001+ * next job has not been actually released yet
5002+ *                              (release time in the future). T will be put
5003+ * on the release queue in that case.
5004+ *
5005+ * job_completion(T) - Take care of everything that needs to be done
5006+ * to prepare T for its next release and place
5007+ * it in the right queue with
5008+ * gsnedf_job_arrival().
5009+ *
5010+ *
5011+ * When we know that T is linked to CPU then link_task_to_cpu(NULL, CPU) is
5012+ * equivalent to unlink(T). Note that if you unlink a task from a CPU, none of
5013+ * the functions will automatically propagate a pending task from the ready
5014+ * queue to the vacated link. This is the job of the calling function (by means of
5015+ * __take_ready).
5016+ */
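+/* Concrete example: if T1 is linked to CPU 0 and link_task_to_cpu(T2, 0)
+ * is called for a higher-priority T2, then T1->linked_on becomes NO_CPU
+ * but T1 is *not* requeued; the caller has to put T1 back into the ready
+ * queue itself, e.g. before pulling the next task with __take_ready().
+ */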
5017+
5018+static void unlink(struct task_struct* t);
5019+static void adaptive_job_arrival(struct task_struct* task);
5020+
5021+/* cpu_entry_t - maintain the linked and scheduled state
5022+ */
5023+typedef struct {
5024+ int cpu;
5025+ struct task_struct* linked; /* only RT tasks */
5026+ struct task_struct* scheduled; /* only RT tasks */
5027+ struct list_head list;
5028+ atomic_t will_schedule; /* prevent unneeded IPIs */
5029+} cpu_entry_t;
5030+DEFINE_PER_CPU(cpu_entry_t, adaptive_cpu_entries);
5031+
5032+#define set_will_schedule() \
5033+ (atomic_set(&__get_cpu_var(adaptive_cpu_entries).will_schedule, 1))
5034+#define clear_will_schedule() \
5035+ (atomic_set(&__get_cpu_var(adaptive_cpu_entries).will_schedule, 0))
5036+#define test_will_schedule(cpu) \
5037+ (atomic_read(&per_cpu(adaptive_cpu_entries, cpu).will_schedule))
5038+
5039+
5040+#define NO_CPU 0xffffffff
5041+
5042+/* The adaptive_lock is used to serialize all scheduling events.
5043+ * It protects the per-CPU linkage, the CPU queue, and the rt domain.
5044+ */
5045+static queuelock_t adaptive_lock;
5046+/* the cpus queue themselves according to priority in here */
5047+static LIST_HEAD(adaptive_cpu_queue);
5048+
5049+static rt_domain_t adaptive;
5050+
5051+/* feedback control parameters */
5052+static fp_t fc_a, fc_b;
5053+
5054+/* optimizer trigger */
5055+static jiffie_t last_optimizer_run;
5056+static jiffie_t optimizer_min_invocation_sep;
5057+static jiffie_t optimizer_period;
5058+static fp_t task_error_threshold;
5059+
5060+static fp_t system_capacity;
5061+/* total actual weight of the task system */
5062+static fp_t total_weight;
5063+
5064+/* optimizer time snapshot */
5065+jiffie_t opt_time;
5066+
5067+/* Delayed weight increase notification list.
5068+ * This list gets clobbered on each optimizer run.
5069+ */
5070+static LIST_HEAD(adaptive_inc_list);
5071+
5072+/* comment out to disable optimizer debugging */
5073+#define ENABLE_OPTIMIZER_DEBUGGING
5074+
5075+#ifdef ENABLE_OPTIMIZER_DEBUGGING
5076+#define OPT_DBG TRACE
5077+#define OPT_DBG_T TRACE_TASK
5078+#else
5079+#define OPT_DBG(x...)
5080+#define OPT_DBG_T(t, x...)
5081+#endif
5082+
5083+/******************************************************************************/
5084+/* OPTIMIZER MATH */
5085+/******************************************************************************/
5086+
5087+/* All time dependent functions
5088+ * rely on opt_time.
5089+ * Update in the optimizer before use!
5090+ */
5091+
5092+static inline fp_t ideal(fp_t weight, jiffie_t delta_t)
5093+{
5094+ return _mul(weight, FP(delta_t));
5095+}
5096+
5097+static noinline long ideal_exec_time(struct task_struct* t)
5098+{
5099+ jiffie_t delta = opt_time - get_last_release(t);
5100+ return _round(ideal(get_est_weight(t), delta));
5101+}
5102+
5103+/* this makes a whole bunch of linearity assumptions */
5104+static noinline fp_t weight_transfer(struct task_struct* t,
5105+ unsigned int from, unsigned int to,
5106+ fp_t act_weight)
5107+{
5108+ fp_t rel_from, rel_to, ret;
5109+ rel_from = get_sl(t, from).weight;
5110+ rel_to = get_sl(t, to).weight;
5111+ ret.val = (act_weight.val * rel_to.val) / rel_from.val;
5112+ OPT_DBG("weight_transfer(%ld, %ld, %ld) => %ld to=%u from=%u\n",
5113+ rel_from.val, rel_to.val, act_weight.val, ret.val, from, to);
5114+
5115+ return ret;
5116+}
5117+
5118+static noinline fp_t est_weight_at(struct task_struct* t, unsigned int level)
5119+{
5120+ if (t->rt_param.no_service_levels)
5121+ return weight_transfer(t, get_cur_sl(t), level,
5122+ get_est_weight(t));
5123+ else
5124+ return get_est_weight(t);
5125+
5126+}
5127+
5128+static noinline void update_estimate(predictor_state_t *state, fp_t actual_weight,
5129+ fp_t a, fp_t b)
5130+{
5131+ fp_t err, new;
5132+
5133+ OPT_DBG("OLD ESTIMATE Weight" _FP_ " ActWt " _FP_ " A:" _FP_ ", B:" _FP_
5134+ "\n", fp2str(state->estimate), fp2str(actual_weight), fp2str(a),
5135+ fp2str(b));
5136+ err = _sub(actual_weight, state->estimate);
5137+ new = _add(_mul(a, err),
5138+ _mul(b, state->accumulated));
5139+
5140+ total_weight = _sub(total_weight, state->estimate);
5141+ state->estimate = new;
5142+ total_weight = _add(total_weight, state->estimate);
5143+
5144+ state->accumulated = _add(state->accumulated, err);
5145+ OPT_DBG("ERROR " _FP_ ", NEW " _FP_ ", ACC" _FP_ "\n", fp2str(err),
5146+ fp2str(new), fp2str(state->accumulated));
5147+
5148+}
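+/* In equation form, the update above is a proportional/integral style
+ * feedback step: with estimate e, actual weight w, and accumulated error
+ * acc,
+ *
+ *	err  = w - e
+ *	e'   = a * err + b * acc
+ *	acc' = acc + err
+ *
+ * where a and b are the feedback-control gains (cf. fc_a and fc_b above);
+ * total_weight is adjusted so that it always equals the sum of the
+ * current estimates.
+ */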
5149+
5150+static noinline fp_t linear_metric(struct task_struct* t)
5151+{
5152+ fp_t v1, vmax, g1, gmax;
5153+ fp_t est_w;
5154+ unsigned int l = t->rt_param.no_service_levels;
5155+ unsigned int lcur;
5156+
5157+ if (l <= 1)
5158+ return FP(0);
5159+
5160+	lcur = get_cur_sl(t);
5161+ est_w = get_est_weight(t);
5162+
5163+ OPT_DBG_T(t, " linear_metric: lcur=%u l=%u est_w=" _FP_ "\n",
5164+ lcur, l, est_w);
5165+ OPT_DBG_T(t, " linear_metric: est_w.val=%ld\n", est_w.val);
5166+
5167+
5168+ v1 = t->rt_param.service_level[0].value;
5169+ vmax = t->rt_param.service_level[l - 1].value;
5170+
5171+ OPT_DBG_T(t, " linear_metric: v1=" _FP_ " vmax=" _FP_ "\n", v1, vmax);
5172+ OPT_DBG_T(t, " linear_metric: v1=%ld vmax=%ld\n", v1.val, vmax.val);
5173+
5174+
5175+ g1 = weight_transfer(t, lcur, 0, est_w);
5176+ gmax = weight_transfer(t, lcur, l - 1, est_w);
5177+
5178+ OPT_DBG_T(t, " linear_metric: g1=" _FP_ " gmax=" _FP_ "\n", g1, gmax);
5179+	OPT_DBG_T(t, " linear_metric: g1=%ld gmax=%ld\n", g1.val, gmax.val);
5180+
5181+
5182+ TRACE_BUG_ON(_eq(_sub(gmax, g1), FP(0)));
5183+ if (_eq(_sub(gmax, g1), FP(0)))
5184+ return FP(0);
5185+ return _div(_sub(vmax, v1),
5186+ _sub(gmax, g1));
5187+}
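+/* Intuition: the returned metric is the slope (vmax - v1) / (gmax - g1),
+ * i.e. how much value the task gains per unit of additional estimated
+ * weight over its service-level range; tasks with steeper slopes are
+ * served first when spare capacity is handed out (see the sort by
+ * by_linear_metric() in the optimizer below).
+ */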
5188+
5189+static noinline unsigned long reweighted_period(fp_t ow, fp_t nw,
5190+ unsigned long alloc,
5191+ jiffie_t deadline,
5192+ jiffie_t release)
5193+{
5194+ fp_t dl;
5195+ dl = _mul(FP(deadline - release), ow);
5196+ dl = _sub(dl, FP(alloc));
5197+ if(_eq(nw, FP(0)))
5198+ return 0;
5199+ dl = _div(dl, nw);
5200+ return _round(dl);
5201+}
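+/* Worked example: with ow = 1/2, nw = 1/4, deadline - release = 100
+ * jiffies and alloc = 30 quanta already consumed, the remaining ideal
+ * allocation is 0.5 * 100 - 30 = 20, which at the new weight takes
+ * 20 / 0.25 = 80 jiffies; the reweighted job thus gets a relative
+ * deadline of 80.
+ */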
5202+
5203+static noinline int is_under_allocated(struct task_struct* t)
5204+{
5205+ return ideal_exec_time(t) >= t->rt_param.times.exec_time;
5206+}
5207+
5208+static noinline jiffie_t dec_equal_point_delay(struct task_struct* t)
5209+{
5210+ if (_lt(FP(0), get_est_weight(t)))
5211+ /* when t was released plus time needed to equalize
5212+ * minus now
5213+ */
5214+ return get_last_release(t) +
5215+ _round(_div( FP(t->rt_param.times.exec_time),
5216+ get_est_weight(t))) -
5217+ opt_time;
5218+ else
5219+ /* if the weight is zero we just take the
5220+ * deadline
5221+ */
5222+ return t->rt_param.times.deadline;
5223+}
5224+
5225+static noinline jiffie_t inc_equal_point_delay(struct task_struct* t)
5226+{
5227+ if (_lt(FP(0), t->rt_param.opt_nw))
5228+ /* when t was released plus time needed to equalize
5229+ * minus now
5230+ */
5231+ return get_last_release(t) +
5232+ _round(_div( FP(t->rt_param.times.exec_time),
5233+ t->rt_param.opt_nw)) -
5234+ opt_time;
5235+ else
5236+ /* if the weight is zero we just take the
5237+ * deadline
5238+ */
5239+ return t->rt_param.times.deadline;
5240+}
5241+
5242+static noinline jiffie_t decrease_delay(struct task_struct* t)
5243+{
5244+ if (has_active_job(t) && !is_under_allocated(t))
5245+ return dec_equal_point_delay(t);
5246+ return 0;
5247+}
5248+
5249+
5250+
5251+/******************************************************************************/
5252+/* SORT ORDERS */
5253+/******************************************************************************/
5254+
5255+static int by_linear_metric(struct list_head* a, struct list_head* b)
5256+{
5257+ struct task_struct *ta, *tb;
5258+ ta = list_entry(a, struct task_struct, rt_param.opt_list);
5259+ tb = list_entry(b, struct task_struct, rt_param.opt_list);
5260+ return _gt(ta->rt_param.opt_order, tb->rt_param.opt_order);
5261+}
5262+
5263+static int by_delta_weight(struct list_head* a, struct list_head* b)
5264+{
5265+ struct task_struct *ta, *tb;
5266+ ta = list_entry(a, struct task_struct, rt_param.opt_list);
5267+ tb = list_entry(b, struct task_struct, rt_param.opt_list);
5268+ return _lt(ta->rt_param.opt_dw, tb->rt_param.opt_dw);
5269+}
5270+
5271+static int by_enactment_time(struct list_head* a, struct list_head* b)
5272+{
5273+ struct task_struct *ta, *tb;
5274+ ta = list_entry(a, struct task_struct, rt_param.opt_list);
5275+ tb = list_entry(b, struct task_struct, rt_param.opt_list);
5276+ return ta->rt_param.opt_change < tb->rt_param.opt_change;
5277+}
5278+
5279+/******************************************************************************/
5280+/* WEIGHT CHANGE MECHANICS */
5281+/******************************************************************************/
5282+
5283+static void set_service_level(struct task_struct* t, unsigned int level)
5284+{
5285+ service_level_t *new;
5286+ unsigned int old;
5287+ BUG_ON(!t);
5288+ BUG_ON(t->rt_param.no_service_levels <= level);
5289+
5290+ old = t->rt_param.cur_service_level;
5291+ t->rt_param.cur_service_level = level;
5292+ new = t->rt_param.service_level + level;
5293+ t->rt_param.basic_params.period = new->period;
5294+ t->rt_param.basic_params.exec_cost = _round(_mul(new->weight,
5295+ FP(new->period)));
5296+
5297+ scheduler_signal(t, SIGUSR1);
5298+
5299+ sched_trace_service_level_change(t, old, level);
5300+ OPT_DBG_T(t, "service level %u activated\n", level);
5301+}
5302+
5303+/* call this _before_ updating deadline and release of t */
5304+static void update_weight_estimate(struct task_struct* t)
5305+{
5306+ fp_t nw, ow;
5307+ jiffie_t sl_period, exec_time;
5308+
5309+ ow = get_est_weight(t);
5310+ nw = t->rt_param.opt_nw;
5311+ exec_time = t->rt_param.times.exec_time;
5312+ sl_period = get_sl(t, get_opt_sl(t)).period;
5313+
5314+ OPT_DBG("ow=" _FP_ " nw=" _FP_ ", r-d " _FP_
5315+ ", deadline %d, release %d, exec_time=%ld sl_period=%lu\n",
5316+ fp2str(ow), fp2str(nw),
5317+ fp2str(FP(get_deadline(t) - get_last_release(t))),
5318+ get_deadline(t), get_last_release(t), exec_time, sl_period);
5319+
5320+ total_weight = _sub(total_weight, get_est_weight(t));
5321+ t->rt_param.predictor_state.estimate = nw;
5322+ OPT_DBG_T(t, "update_weight_estimate from " _FP_ " to "_FP_"\n",
5323+ fp2str(ow), fp2str(nw));
5324+ total_weight = _add(total_weight, get_est_weight(t));
5325+
5326+ OPT_DBG_T(t, " update_weight_estimate: " _FP_ " => " _FP_ "\n",
5327+ fp2str(ow), fp2str(get_est_weight(t)));
5328+}
5329+
5330+
5331+static void decrease_weight(struct task_struct* t)
5332+{
5333+ fp_t ow, nw;
5334+ jiffie_t last, period, delay;
5335+
5336+ ow = get_sl(t, get_cur_sl(t)).weight;
5337+ nw = get_sl(t, get_opt_sl(t)).weight;
5338+ last = t->rt_param.times.last_release;
5339+ period = reweighted_period(ow, nw, t->rt_param.times.exec_time,
5340+ t->rt_param.times.deadline, last);
5341+
5342+ /* necessary delay has already been computed by optimizer */
5343+ delay = t->rt_param.opt_change;
5344+
5345+ update_weight_estimate(t);
5346+
5347+ if (!delay)
5348+ t->rt_param.times.last_release = opt_time;
5349+ t->rt_param.times.release = opt_time + delay;
5350+ t->rt_param.times.deadline = opt_time + delay + period;
5351+
5352+ set_service_level(t, get_opt_sl(t));
5353+
5354+ /* take out of queue/link structure */
5355+ unlink(t);
5356+ /* present as a new job */
5357+ adaptive_job_arrival(t);
5358+}
5359+
5360+
5361+static void increase_weight(struct task_struct* t)
5362+{
5363+ fp_t ow, nw;
5364+ jiffie_t last, period, delay;
5365+
5366+ ow = get_sl(t, get_cur_sl(t)).weight;
5367+ nw = get_sl(t, get_opt_sl(t)).weight;
5368+ last = t->rt_param.times.last_release;
5369+ period = reweighted_period(ow, nw, t->rt_param.times.exec_time,
5370+ t->rt_param.times.deadline, last);
5371+
5372+ if (t->rt_param.opt_change == 0) {
5373+ /* can be enacted now */
5374+ if (is_under_allocated(t) ||
5375+ time_before(opt_time + period, get_deadline(t)))
5376+ /* do it now */
5377+ delay = 0;
5378+ else {
5379+ if (is_under_allocated(t)) {
5380+ t->rt_param.opt_change += opt_time;
5381+ /* The next job release will notice that opt !=
5382+ * sl and initiate a weight change.
5383+ */
5384+ return;
5385+ } else
5386+ /* nope, wait for equal point */
5387+ delay = inc_equal_point_delay(t);
5388+ }
5389+
5390+ update_weight_estimate(t);
5391+
5392+ if (!delay)
5393+ t->rt_param.times.last_release = opt_time;
5394+ t->rt_param.times.release = opt_time + delay;
5395+ t->rt_param.times.deadline = opt_time + delay + period;
5396+
5397+ set_service_level(t, get_opt_sl(t));
5398+
5399+ /* take out of queue/link structure */
5400+ unlink(t);
5401+ /* present as a new job */
5402+ adaptive_job_arrival(t);
5403+
5404+ } else {
5405+ /* must wait until capacity is released */
5406+ t->rt_param.opt_change += opt_time;
5407+ list_insert(&t->rt_param.opt_list, &adaptive_inc_list,
5408+ by_enactment_time);
5409+ }
5410+}
5411+
5412+static void delayed_increase_weight(void)
5413+{
5414+ struct list_head *p, *extra;
5415+ struct task_struct* t;
5416+
5417+ opt_time = jiffies;
5418+ list_for_each_safe(p, extra, &adaptive_inc_list) {
5419+ t = list_entry(p, struct task_struct, rt_param.opt_list);
5420+ if (time_before_eq(t->rt_param.opt_change, opt_time)) {
5421+ list_del(p);
5422+ /* prevent recursion */
5423+ t->rt_param.opt_change = 0;
5424+ /* this takes care of everything */
5425+ increase_weight(t);
5426+ } else
5427+ /* list is sorted */
5428+ break;
5429+ }
5430+}
5431+
5432+static void change_weight(struct task_struct* t)
5433+{
5434+ if (get_cur_sl(t) < get_opt_sl(t))
5435+ increase_weight(t);
5436+ else
5437+ decrease_weight(t);
5438+ OPT_DBG_T(t, "after change_weight: last_rel:%d rel:%d dl:%d\n",
5439+ get_last_release(t),
5440+ get_release(t),
5441+ get_deadline(t));
5442+}
5443+
5444+/******************************************************************************/
5445+/* OPTIMIZER */
5446+/******************************************************************************/
5447+
5448+/* only invoke with adaptive_lock held */
5449+void adaptive_optimize(void)
5450+{
5451+ struct list_head list;
5452+ struct list_head inc, dec;
5453+ struct list_head *p, *extra;
5454+ cpu_entry_t *cpu;
5455+ struct task_struct* t;
5456+ fp_t M = FP(0), w0, wl, tmp, estU = FP(0);
5457+ unsigned int l;
5458+ jiffie_t enactment_time;
5459+
5460+ if (time_before(jiffies,
5461+ last_optimizer_run + optimizer_min_invocation_sep))
5462+ return;
5463+
5464+ OPT_DBG(":::::: running adaptive optimizer\n");
5465+ opt_time = jiffies;
5466+
5467+ INIT_LIST_HEAD(&list);
5468+
5469+ /* 1) gather all tasks */
5470+ list_for_each(p, &adaptive.ready_queue)
5471+ list_add(&(rt_list2task(p)->rt_param.opt_list), &list);
5472+ list_for_each(p, &adaptive.release_queue)
5473+ list_add(&(rt_list2task(p)->rt_param.opt_list), &list);
5474+ list_for_each(p, &adaptive_cpu_queue) {
5475+ cpu = list_entry(p, cpu_entry_t, list);
5476+ if (cpu->linked)
5477+ list_add(&cpu->linked->rt_param.opt_list, &list);
5478+ }
5479+
5480+ /* 2) determine current system capacity */
5481+ M = system_capacity;
5482+ OPT_DBG("opt: system capacity: " _FP_ "\n", fp2str(M));
5483+
5484+ /* 3) Compute L value for all tasks,
5485+ * and set tasks to service level 0,
5486+ * also account for weight.
5487+ * Also establish current estimated utilization
5488+ */
5489+ list_for_each_safe(p, extra, &list) {
5490+ t = list_entry(p, struct task_struct, rt_param.opt_list);
5491+ if (time_before(opt_time, get_last_release(t))) {
5492+ list_del(p);
5493+ continue;
5494+ }
5495+ t->rt_param.opt_order = linear_metric(t);
5496+ OPT_DBG_T(t, "est_w = " _FP_ " L = " _FP_ "\n",
5497+ get_est_weight(t),
5498+ fp2str(t->rt_param.opt_order));
5499+ t->rt_param.opt_level = 0;
5500+ M = _sub(M, est_weight_at(t, 0));
5501+ estU = _add(estU, get_est_weight(t));
5502+ }
5503+ OPT_DBG("opt: estimated utilization: " _FP_ "\n", fp2str(estU));
5504+ OPT_DBG("opt: estimated capacity at all sl=0: " _FP_ "\n", fp2str(M));
5505+
5506+
5507+ /* 4) sort list by decreasing linear metric */
5508+ list_qsort(&list, by_linear_metric);
5509+
5510+ /* 5) assign each task a service level */
5511+ list_for_each(p, &list) {
5512+ t = list_entry(p, struct task_struct, rt_param.opt_list);
5513+ l = t->rt_param.no_service_levels;
5514+ w0 = est_weight_at(t, 0);
5515+ while (l > 1) {
5516+ l--;
5517+ wl = est_weight_at(t, l);
5518+ tmp = _sub(M, _sub(wl, w0));
5519+ if (_leq(FP(0), tmp)) {
5520+ /* this level fits in */
5521+ M = tmp;
5522+ t->rt_param.opt_level = l;
5523+ t->rt_param.opt_dw = _sub(wl,
5524+ get_est_weight(t));
5525+ t->rt_param.opt_nw = wl;
5526+ break; /* proceed to next task */
5527+ }
5528+ }
5529+ OPT_DBG_T(t, " will run at sl=%u, prior=%u dw=" _FP_ "\n",
5530+ l, get_cur_sl(t), fp2str(t->rt_param.opt_dw));
5531+
5532+ }
5533+
5534+ /* 6) filter tasks that reweight */
5535+ INIT_LIST_HEAD(&inc);
5536+ INIT_LIST_HEAD(&dec);
5537+ list_for_each_safe(p, extra, &list) {
5538+ t = list_entry(p, struct task_struct, rt_param.opt_list);
5539+ list_del(p);
5540+ if (t->rt_param.opt_level < get_cur_sl(t)) {
5541+ list_add(p, &dec);
5542+ t->rt_param.opt_change = decrease_delay(t);
5543+ } else if (t->rt_param.opt_level > get_cur_sl(t)) {
5544+ list_add(p, &inc);
5545+ t->rt_param.opt_change = 0;
5546+ }
5547+ /* if t doesn't change we can ignore it from now on */
5548+ }
5549+
5550+ /* 7) sort dec and inc list */
5551+ list_qsort(&dec, by_enactment_time);
5552+ list_qsort(&inc, by_delta_weight);
5553+
5554+ /* 8) now figure out when we can enact weight increases
5555+ * It works like this: We know the current system utilization.
5556+ * Thus, we know the remaining capacity. We also know when
5557+ * decreases are going to be enacted (=> capacity increases).
5558+ * Now we only need to find a spot where the weight increase will
5559+ * not drive the system into overload.
5560+ */
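+	/* Illustrative example (hypothetical numbers): with
+	 * system_capacity M = 2.0 and estU = 1.8, an increase of
+	 * dw = +0.3 does not fit now (1.8 + 0.3 > 2.0). If a decrease of
+	 * dw = -0.2 is enacted at time t1, estU drops to 1.6, and the
+	 * increase can also be enacted at t1 (1.6 + 0.3 <= 2.0).
+	 */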
5561+
5562+ /* Very ugly jump, but we need to force enactment_time = 0
5563+ * during the first iteration.
5564+ */
5565+ M = system_capacity;
5566+ enactment_time = 0;
5567+ goto first_iteration;
5568+
5569+ while (!list_empty(&inc)) {
5570+ enactment_time = list_entry(dec.next, struct task_struct,
5571+ rt_param.opt_list)
5572+ ->rt_param.opt_change;
5573+ first_iteration:
5574+ /* Start by collapsing the next decrease.
5575+		 * Except in the first iteration, it will always
5576+ * pick off at least one task.
5577+ */
5578+ list_for_each_safe(p, extra, &dec) {
5579+ t = list_entry(p, struct task_struct,
5580+ rt_param.opt_list);
5581+ if (t->rt_param.opt_change == enactment_time) {
5582+ list_del(p);
5583+ /* opt_dw is negative */
5584+ estU = _add(estU, t->rt_param.opt_dw);
5585+ list_add(p, &list);
5586+
5587+ OPT_DBG_T(t, " weight decrease at %ld => estU="
5588+ _FP_ "\n", enactment_time,
5589+ fp2str(estU));
5590+
5591+ } else
5592+ /* stop decrease loop */
5593+ break;
5594+ }
5595+
5596+ /* now start setting enactment times for increases */
5597+ while (!list_empty(&inc)) {
5598+ p = inc.next;
5599+ t = list_entry(p, struct task_struct,
5600+ rt_param.opt_list);
5601+ tmp = _add(estU, t->rt_param.opt_dw);
5602+ if (_leq(tmp, M)) {
5603+ /* it fits */
5604+ estU = tmp;
5605+ t->rt_param.opt_change = enactment_time;
5606+ list_del(p);
5607+ list_add(p, &list);
5608+
5609+ OPT_DBG_T(t, " weight increase at %ld => estU="
5610+ _FP_ "\n", enactment_time,
5611+ fp2str(estU));
5612+
5613+ } else
5614+ /* stop increase loop */
5615+ break;
5616+ }
5617+
5618+ TRACE_BUG_ON(list_empty(&dec) && !list_empty(&inc));
5619+ if (list_empty(&dec) && !list_empty(&inc))
5620+ /* break out in case of bug */
5621+ break;
5622+ }
5623+
5624+	/* 9) Wow. We made it. Every task now has a new service level
5625+	 * assigned, together with a correct (earliest) enactment time.
5626+	 * All we have left to do now is to enact changes that did not get
5627+	 * delayed. Also convert the change fields to actual timestamps to be
5628+	 * nice to scheduler_tick().
5629+	 */
5630+ INIT_LIST_HEAD(&adaptive_inc_list);
5631+ list_for_each_safe(p, extra, &list) {
5632+ t = list_entry(p, struct task_struct, rt_param.opt_list);
5633+ list_del(p);
5634+ change_weight(t);
5635+ }
5636+
5637+ last_optimizer_run = jiffies;
5638+ OPT_DBG(":::::: optimizer run complete\n");
5639+}
5640+
5641+/* update_cpu_position - Move the cpu entry to the correct place to maintain
5642+ * order in the cpu queue. Caller must hold adaptive lock.
5643+ */
5644+static void update_cpu_position(cpu_entry_t *entry)
5645+{
5646+ cpu_entry_t *other;
5647+ struct list_head *pos;
5648+ list_del(&entry->list);
5649+ /* if we do not execute real-time jobs we just move
5650+ * to the end of the queue
5651+ */
5652+ if (entry->linked) {
5653+ list_for_each(pos, &adaptive_cpu_queue) {
5654+ other = list_entry(pos, cpu_entry_t, list);
5655+ if (edf_higher_prio(entry->linked, other->linked)) {
5656+ __list_add(&entry->list, pos->prev, pos);
5657+ return;
5658+ }
5659+ }
5660+ }
5661+ /* if we get this far we have the lowest priority job */
5662+ list_add_tail(&entry->list, &adaptive_cpu_queue);
5663+}
5664+
5665+/* link_task_to_cpu - Update the link of a CPU.
5666+ * Handles the case where the to-be-linked task is already
5667+ * scheduled on a different CPU.
5668+ */
5669+static noinline void link_task_to_cpu(struct task_struct* linked,
5670+ cpu_entry_t *entry)
5671+
5672+{
5673+ cpu_entry_t *sched;
5674+ struct task_struct* tmp;
5675+ int on_cpu;
5676+
5677+ BUG_ON(linked && !is_realtime(linked));
5678+
5679+ /* Currently linked task is set to be unlinked. */
5680+ if (entry->linked)
5681+ entry->linked->rt_param.linked_on = NO_CPU;
5682+
5683+ /* Link new task to CPU. */
5684+ if (linked) {
5685+ set_rt_flags(linked, RT_F_RUNNING);
5686+		/* handle the case that the task is already scheduled somewhere! */
5687+ on_cpu = linked->rt_param.scheduled_on;
5688+ if (on_cpu != NO_CPU) {
5689+ sched = &per_cpu(adaptive_cpu_entries, on_cpu);
5690+ /* this should only happen if not linked already */
5691+ BUG_ON(sched->linked == linked);
5692+
5693+ /* If we are already scheduled on the CPU to which we
5694+ * wanted to link, we don't need to do the swap --
5695+ * we just link ourselves to the CPU and depend on
5696+ * the caller to get things right.
5697+ */
5698+ if (entry != sched) {
5699+ tmp = sched->linked;
5700+ linked->rt_param.linked_on = sched->cpu;
5701+ sched->linked = linked;
5702+ update_cpu_position(sched);
5703+ linked = tmp;
5704+ }
5705+ }
5706+ if (linked) /* might be NULL due to swap */
5707+ linked->rt_param.linked_on = entry->cpu;
5708+ }
5709+ entry->linked = linked;
5710+ update_cpu_position(entry);
5711+}
5712+
5713+/* unlink - Make sure a task is not linked any longer to an entry
5714+ * where it was linked before. Must hold adaptive_lock.
5715+ */
5716+static void unlink(struct task_struct* t)
5717+{
5718+ cpu_entry_t *entry;
5719+
5720+ if (unlikely(!t)) {
5721+ TRACE_BUG_ON(!t);
5722+ return;
5723+ }
5724+
5725+ if (t->rt_param.linked_on != NO_CPU) {
5726+ /* unlink */
5727+ entry = &per_cpu(adaptive_cpu_entries, t->rt_param.linked_on);
5728+ t->rt_param.linked_on = NO_CPU;
5729+ link_task_to_cpu(NULL, entry);
5730+ } else if (in_list(&t->rt_list)) {
5731+ /* This is an interesting situation: t is scheduled,
5732+ * but was just recently unlinked. It cannot be
5733+ * linked anywhere else (because then it would have
5734+ * been relinked to this CPU), thus it must be in some
5735+ * queue. We must remove it from the list in this
5736+ * case.
5737+ */
5738+ list_del(&t->rt_list);
5739+ }
5740+}
5741+
5742+
5743+/* preempt - force a CPU to reschedule
5744+ */
5745+static noinline void preempt(cpu_entry_t *entry)
5746+{
5747+ /* We cannot make the is_np() decision here if it is a remote CPU
5748+ * because requesting exit_np() requires that we currently use the
5749+ * address space of the task. Thus, in the remote case we just send
5750+ * the IPI and let schedule() handle the problem.
5751+ */
5752+
5753+ if (smp_processor_id() == entry->cpu) {
5754+ if (entry->scheduled && is_np(entry->scheduled))
5755+ request_exit_np(entry->scheduled);
5756+ else
5757+ set_tsk_need_resched(current);
5758+ } else
5759+		/* in case it is a remote CPU we have to defer the
5760+		 * decision to the remote CPU
5761+ */
5762+ if (!test_will_schedule(entry->cpu))
5763+ smp_send_reschedule(entry->cpu);
5764+}
5765+
5766+/* requeue - Put an unlinked task into the adaptive domain.
5767+ * Caller must hold adaptive_lock.
5768+ */
5769+static noinline void requeue(struct task_struct* task)
5770+{
5771+ BUG_ON(!task);
5772+ /* sanity check rt_list before insertion */
5773+ BUG_ON(in_list(&task->rt_list));
5774+
5775+ if (get_rt_flags(task) == RT_F_SLEEP ||
5776+ get_rt_mode() != MODE_RT_RUN) {
5777+		/* this task has expired;
5778+		 * _schedule has already taken care of updating
5779+		 * the release and the deadline.
5780+		 * We only need to check whether it has been released.
5781+ */
5782+ if (is_released(task) && get_rt_mode() == MODE_RT_RUN)
5783+ __add_ready(&adaptive, task);
5784+ else {
5785+ /* it has got to wait */
5786+ __add_release(&adaptive, task);
5787+ }
5788+
5789+ } else
5790+		/* this is a forced preemption,
5791+		 * thus the task stays in the ready_queue;
5792+		 * we only need to make it available to others
5793+ */
5794+ __add_ready(&adaptive, task);
5795+}
5796+
5797+/* adaptive_job_arrival: task is either resumed or released */
5798+static void adaptive_job_arrival(struct task_struct* task)
5799+{
5800+ cpu_entry_t* last;
5801+
5802+ BUG_ON(list_empty(&adaptive_cpu_queue));
5803+ BUG_ON(!task);
5804+
5805+ TRACE_TASK(task, "job_arrival: last_rel=%d rel=%d dl=%d now=%d\n",
5806+ get_last_release(task), get_release(task),
5807+ get_deadline(task),
5808+ jiffies);
5809+
5810+
5811+ /* first queue arriving job */
5812+ requeue(task);
5813+
5814+ /* then check for any necessary preemptions */
5815+ last = list_entry(adaptive_cpu_queue.prev, cpu_entry_t, list);
5816+ if (edf_preemption_needed(&adaptive, last->linked)) {
5817+ /* preemption necessary */
5818+ task = __take_ready(&adaptive);
5819+
5820+ TRACE("job_arrival: task %d linked to %d\n",
5821+ task->pid, last->cpu);
5822+
5823+ if (last->linked)
5824+ requeue(last->linked);
5825+
5826+ link_task_to_cpu(task, last);
5827+ preempt(last);
5828+ }
5829+}
5830+
5831+/* check for current job releases */
5832+static noinline void adaptive_release_jobs(void)
5833+{
5834+ struct list_head *pos, *save;
5835+ struct task_struct *queued;
5836+
5837+ list_for_each_safe(pos, save, &adaptive.release_queue) {
5838+ queued = list_entry(pos, struct task_struct, rt_list);
5839+ if (likely(is_released(queued))) {
5840+ TRACE_TASK(queued, "released rel=%d now=%d\n",
5841+ get_release(queued), jiffies);
5842+			/* this one is ready to go */
5843+ list_del(pos);
5844+ set_rt_flags(queued, RT_F_RUNNING);
5845+ queued->rt_param.times.last_release =
5846+ queued->rt_param.times.release;
5847+
5848+ /* check for delayed weight increase */
5849+ if (get_opt_sl(queued) != get_cur_sl(queued) &&
5850+ time_before_eq(queued->rt_param.opt_change, jiffies)) {
5851+ opt_time = jiffies;
5852+ set_service_level(queued, get_opt_sl(queued));
5853+ queued->rt_param.times.deadline =
5854+ get_last_release(queued) +
5855+ get_rt_period(queued);
5856+ total_weight = _sub(total_weight, get_est_weight(queued));
5857+ queued->rt_param.predictor_state.estimate =
5858+ queued->rt_param.opt_nw;
5859+ total_weight = _add(total_weight, get_est_weight(queued));
5860+ }
5861+
5862+ sched_trace_job_release(queued);
5863+ adaptive_job_arrival(queued);
5864+ }
5865+ else
5866+ /* the release queue is ordered */
5867+ break;
5868+ }
5869+}
5870+
5871+/* adaptive_scheduler_tick - this function is called for every local timer
5872+ * interrupt.
5873+ *
5874+ *                    Checks whether the current task has expired and, if it
5875+ *                    has not, whether we need to preempt it.
5876+ */
5877+static reschedule_check_t adaptive_scheduler_tick(void)
5878+{
5879+ unsigned long flags;
5880+ struct task_struct* t = current;
5881+ reschedule_check_t want_resched = NO_RESCHED;
5882+
5883+ /* Account for exec time.
5884+ * Since we don't preempt forcefully, nothing else needs to be done.
5885+ */
5886+ if (is_realtime(t))
5887+ t->rt_param.times.exec_time++;
5888+
5889+	/* release jobs and run the optimizer only while in real-time mode */
5890+ if (get_rt_mode() == MODE_RT_RUN) {
5891+ queue_lock_irqsave(&adaptive_lock, flags);
5892+
5893+ /* (1) run the optimizer if it did not trigger often enough */
5894+ if (time_before_eq(last_optimizer_run + optimizer_period, jiffies)) {
5895+
5896+ OPT_DBG("adaptive: optimizing due to period threshold\n");
5897+
5898+ adaptive_optimize();
5899+ }
5900+
5901+ /* (2) enact delayed weight increases */
5902+ delayed_increase_weight();
5903+
5904+ /* (3) try to release pending jobs */
5905+ adaptive_release_jobs();
5906+
5907+ /* we don't need to check linked != scheduled since
5908+ * set_tsk_need_resched has been set by preempt() if necessary
5909+ */
5910+
5911+ queue_unlock_irqrestore(&adaptive_lock, flags);
5912+ }
5913+
5914+ return want_resched;
5915+}
5916+
5917+/* caller holds adaptive_lock */
5918+static noinline void job_completion(struct task_struct *t)
5919+{
5920+ long delta;
5921+ fp_t actual_weight, old_estimate;
5922+ unsigned int lcurr = get_cur_sl(t);
5923+ fp_t v = t->rt_param.service_level[lcurr].value;
5924+
5925+ int non_zero_weight;
5926+ fp_t error_percentage;
5927+ int exceeds_threshold;
5928+
5929+ BUG_ON(!t);
5930+
5931+ TRACE_TASK(t, " completion, last_rel=%d rel=%d dl=%d now=%d "
5932+ "period=%d\n",
5933+ get_last_release(t), get_release(t), get_deadline(t),
5934+ jiffies, get_rt_period(t));
5935+
5936+ sched_trace_job_completion(t);
5937+ delta = t->rt_param.times.exec_time -
5938+ t->rt_param.basic_params.exec_cost;
5939+
5940+ OPT_DBG_T(t, "job %d completes, delta WCET = %d\n",
5941+ t->rt_param.times.job_no, delta);
5942+
5943+ actual_weight = _frac(t->rt_param.times.exec_time,
5944+ t->rt_param.basic_params.period);
5945+ sched_trace_weight_error(t, actual_weight);
5946+ old_estimate = get_est_weight(t);
5947+ update_estimate(&t->rt_param.predictor_state, actual_weight,
5948+ fc_a, fc_b);
5949+
5950+ OPT_DBG_T(t, "Job %d completes. Current value " _FP_
5951+ ", Weight estimation: error=" _FP_ " weight="
5952+ _FP_ " => " _FP_ "\n",t->rt_param.times.job_no, v,
5953+ _sub(get_est_weight(t), old_estimate),
5954+ old_estimate, get_est_weight(t));
5955+
5956+ /* Now we have determined the task error.
5957+ * Next we release the next job.
5958+ * Then we optimize. It's easier for the optimizer to deal
5959+ * with just-released jobs.
5960+ */
5961+
5962+ /* prepare for next period */
5963+ edf_prepare_for_next_period(t);
5964+
5965+ TRACE_TASK(t, " prepped, last_rel=%d rel=%d dl=%d now=%d\n",
5966+ get_last_release(t), get_release(t), get_deadline(t),
5967+ jiffies);
5968+
5969+ if (is_released(t)) {
5970+ /* set flags */
5971+ /* prevent fake completions */
5972+ set_rt_flags(t, RT_F_RUNNING);
5973+ t->rt_param.times.last_release =
5974+ t->rt_param.times.release;
5975+ }
5976+
5977+
5978+ non_zero_weight = !_eq(get_est_weight(t),FP(0));
5979+ if (non_zero_weight)
5980+ error_percentage = _div(_abs(_sub(get_est_weight(t),
5981+ old_estimate)),
5982+ get_est_weight(t));
5983+ else
5984+ error_percentage = FP(0);
5985+ exceeds_threshold = _gt(error_percentage, task_error_threshold);
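+	/* Worked example (hypothetical values): if the old estimate was 0.20
+	 * and the new estimate is 0.25, then error_percentage =
+	 * |0.25 - 0.20| / 0.25 = 0.2, which does not exceed the
+	 * task_error_threshold of 1/2 set in init_adaptive_plugin(), so the
+	 * error check alone does not trigger the optimizer.
+	 */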
5986+
5987+
5988+ if (exceeds_threshold) {
5989+ OPT_DBG("adaptive: optimizing due to task error threshold\n");
5990+ adaptive_optimize();
5991+ } else if (_gt(total_weight, system_capacity)) {
5992+ OPT_DBG("adaptive: optimizing due to system capacity exceeded\n");
5993+ adaptive_optimize();
5994+ }
5995+
5996+
5997+ /* unlink */
5998+ unlink(t);
5999+	/* requeue,
6000+	 * but don't requeue a blocking task. */
6001+ if (is_running(t))
6002+ adaptive_job_arrival(t);
6003+}
6004+
6005+
6006+/* Getting schedule() right is a bit tricky. schedule() may not make any
6007+ * assumptions on the state of the current task since it may be called for a
6008+ * number of reasons. The reasons include a scheduler_tick() determined that it
6009+ * number of reasons: because scheduler_tick() determined that it
6010+ * subsystem determined so, or even (in the worst case) because there is a bug
6011+ * hidden somewhere. Thus, we must take extreme care to determine what the
6012+ * current state is.
6013+ *
6014+ * The CPU could currently be scheduling a task (or not) and be linked to a task (or not).
6015+ *
6016+ * The following assertions for the scheduled task could hold:
6017+ *
6018+ * - !is_running(scheduled) // the job blocks
6019+ * - get_rt_flags() == RT_F_SLEEP  // the job completed (by syscall)
6020+ * - linked != scheduled // we need to reschedule (for any reason)
6021+ *
6022+ * Any of these can occur together.
6023+ */
6024+static int adaptive_schedule(struct task_struct * prev,
6025+ struct task_struct ** next,
6026+ runqueue_t * rq)
6027+{
6028+ cpu_entry_t* entry = &__get_cpu_var(adaptive_cpu_entries);
6029+ int sleep, preempt, exists,
6030+ rt, blocks;
6031+ struct task_struct* linked;
6032+
6033+ /* Will be released in finish_switch. */
6034+ queue_lock(&adaptive_lock);
6035+ clear_will_schedule();
6036+
6037+ /* sanity checking */
6038+ BUG_ON(entry->scheduled && entry->scheduled != prev);
6039+ BUG_ON(entry->scheduled && !is_realtime(prev));
6040+
6041+ /* (0) Determine state */
6042+ exists = entry->scheduled != NULL;
6043+ blocks = exists && !is_running(entry->scheduled);
6044+ sleep = exists && get_rt_flags(entry->scheduled) == RT_F_SLEEP;
6045+ preempt = entry->scheduled != entry->linked;
6046+ rt = get_rt_mode() == MODE_RT_RUN;
6047+
6048+ /* If a task blocks we have no choice but to reschedule.
6049+ */
6050+ if (blocks)
6051+ unlink(entry->scheduled);
6052+
6053+ /* Task wants to sleep -> job is done.
6054+ */
6055+ if (sleep)
6056+ job_completion(entry->scheduled);
6057+
6058+ /* Stop real-time tasks when we leave real-time mode
6059+ */
6060+ if (!rt && entry->linked) {
6061+ /* task will be preempted once it is preemptable
6062+ * (which it may be already)
6063+ */
6064+ linked = entry->linked;
6065+ unlink(linked);
6066+ requeue(linked);
6067+ }
6068+
6069+ /* Link pending task if we became unlinked.
6070+ */
6071+ if (rt && !entry->linked)
6072+ link_task_to_cpu(__take_ready(&adaptive), entry);
6073+
6074+ /* The final scheduling decision. Do we need to switch for some reason?
6075+	 * If linked differs from scheduled, select linked as next.
6076+ */
6077+ if (entry->linked != entry->scheduled) {
6078+ /* Take care of a previously scheduled
6079+ * job by taking it out of the Linux runqueue.
6080+ */
6081+ if (entry->scheduled)
6082+ if (prev->array)
6083+ /* take it out of the run queue */
6084+ deactivate_task(prev, rq);
6085+
6086+ /* Schedule a linked job? */
6087+ if (entry->linked) {
6088+ *next = entry->linked;
6089+ /* mark the task as executing on this cpu */
6090+ set_task_cpu(*next, smp_processor_id());
6091+ /* stick the task into the runqueue */
6092+ __activate_task(*next, rq);
6093+ }
6094+ } else
6095+		/* Only override the Linux scheduler if we have a real-time task
6096+ * scheduled that needs to continue.
6097+ */
6098+ if (exists)
6099+ *next = prev;
6100+
6101+ /* Unlock in case that we don't affect real-time tasks or
6102+ * if nothing changed and finish_switch won't be called.
6103+ */
6104+ if (prev == *next || (!is_realtime(prev) && !*next))
6105+ queue_unlock(&adaptive_lock);
6106+
6107+ return 0;
6108+}
6109+
6110+
6111+/* _finish_switch - we just finished the switch away from prev
6112+ */
6113+static void adaptive_finish_switch(struct task_struct *prev)
6114+{
6115+ cpu_entry_t* entry = &__get_cpu_var(adaptive_cpu_entries);
6116+
6117+ if (is_realtime(current))
6118+ entry->scheduled = current;
6119+ else
6120+ entry->scheduled = NULL;
6121+
6122+ prev->rt_param.scheduled_on = NO_CPU;
6123+ current->rt_param.scheduled_on = smp_processor_id();
6124+
6125+ /* unlock in case schedule() left it locked */
6126+ if (is_realtime(current) || is_realtime(prev))
6127+ queue_unlock(&adaptive_lock);
6128+}
6129+
6130+
6131+/* Prepare a task for running in RT mode.
6132+ * Enqueues the task into the master queue data structure.
6133+ * Returns
6134+ * -EPERM if the task is not TASK_STOPPED.
6135+ */
6136+static long adaptive_prepare_task(struct task_struct * t)
6137+{
6138+ unsigned long flags;
6139+
6140+ TRACE("adaptive: prepare task %d\n", t->pid);
6141+
6142+ if (t->state == TASK_STOPPED) {
6143+ __setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1);
6144+
6145+ t->rt_param.scheduled_on = NO_CPU;
6146+ t->rt_param.linked_on = NO_CPU;
6147+ if (t->rt_param.no_service_levels) {
6148+ t->rt_param.predictor_state.estimate =
6149+ get_sl(t, 0).weight;
6150+ } else
6151+ t->rt_param.predictor_state.estimate =
6152+ _frac(get_exec_cost(t), get_rt_period(t));
6153+
6154+ TRACE_TASK(t, "est_weight=" _FP_ "\n", get_est_weight(t));
6155+
6156+ if (get_rt_mode() == MODE_RT_RUN)
6157+			/* Real-time mode is already active.
6158+			 * Prepare an immediate release.
6159+ */
6160+ edf_release_now(t);
6161+ /* The task should be running in the queue, otherwise signal
6162+ * code will try to wake it up with fatal consequences.
6163+ */
6164+ t->state = TASK_RUNNING;
6165+
6166+ queue_lock_irqsave(&adaptive_lock, flags);
6167+ total_weight = _add(total_weight, get_est_weight(t));
6168+ requeue(t);
6169+ queue_unlock_irqrestore(&adaptive_lock, flags);
6170+ return 0;
6171+ }
6172+ else
6173+ return -EPERM;
6174+}
6175+
6176+static void adaptive_wake_up_task(struct task_struct *task)
6177+{
6178+ unsigned long flags;
6179+	/* We must determine whether the task should go into the release
6180+	 * queue or into the ready queue. It may enter the ready queue
6181+	 * if it has credit left in its time slice and has not yet reached
6182+	 * its deadline. If it is now past its deadline we assume this is the
6183+	 * arrival of a new sporadic job and thus put it in the ready queue
6184+	 * anyway. If it has zero budget and the next release is in the future
6185+	 * it has to go to the release queue.
6186+ */
6187+
6188+ TRACE("adaptive: %d unsuspends\n", task->pid);
6189+
6190+ task->state = TASK_RUNNING;
6191+
6192+ if (is_tardy(task)) {
6193+ /* new sporadic release */
6194+ edf_release_now(task);
6195+ sched_trace_job_release(task);
6196+ }
6197+ else if (task->time_slice)
6198+ /* came back in time before deadline */
6199+ set_rt_flags(task, RT_F_RUNNING);
6200+
6201+ queue_lock_irqsave(&adaptive_lock, flags);
6202+ total_weight = _add(total_weight, get_est_weight(task));
6203+ adaptive_job_arrival(task);
6204+ queue_unlock_irqrestore(&adaptive_lock, flags);
6205+}
6206+
6207+static void adaptive_task_blocks(struct task_struct *t)
6208+{
6209+ unsigned long flags;
6210+
6211+ /* unlink if necessary */
6212+ queue_lock_irqsave(&adaptive_lock, flags);
6213+ total_weight = _sub(total_weight, get_est_weight(t));
6214+ unlink(t);
6215+ queue_unlock_irqrestore(&adaptive_lock, flags);
6216+
6217+ BUG_ON(!is_realtime(t));
6218+
6219+ TRACE("task %d suspends\n", t->pid);
6220+
6221+ BUG_ON(t->rt_list.next != LIST_POISON1);
6222+ BUG_ON(t->rt_list.prev != LIST_POISON2);
6223+}
6224+
6225+
6226+/* When _tear_down is called, the task should not be in any queue any more
6227+ * as it must have blocked first. We don't have any internal state for the task,
6228+ * it is all in the task_struct.
6229+ */
6230+static long adaptive_tear_down(struct task_struct * t)
6231+{
6232+ BUG_ON(!is_realtime(t));
6233+ TRACE_TASK(t, "RIP\n");
6234+ BUG_ON(t->array);
6235+ BUG_ON(t->rt_list.next != LIST_POISON1);
6236+ BUG_ON(t->rt_list.prev != LIST_POISON2);
6237+ return 0;
6238+}
6239+
6240+static int adaptive_mode_change(int new_mode)
6241+{
6242+ unsigned long flags;
6243+ int cpu;
6244+ cpu_entry_t *entry;
6245+ struct task_struct* t;
6246+ struct list_head* pos;
6247+
6248+ if (new_mode == MODE_RT_RUN) {
6249+ queue_lock_irqsave(&adaptive_lock, flags);
6250+
6251+ system_capacity = FP(0);
6252+ for_each_online_cpu(cpu)
6253+ system_capacity = _add(system_capacity, FP(1));
6254+
6255+ __rerelease_all(&adaptive, edf_release_at);
6256+
6257+ total_weight = FP(0);
6258+ list_for_each(pos, &adaptive.release_queue) {
6259+ t = list_entry(pos, struct task_struct, rt_list);
6260+ total_weight = _add(total_weight, get_est_weight(t));
6261+ }
6262+ TRACE("adaptive: total weight: " _FP_
6263+ " (at mode change)\n", total_weight);
6264+
6265+
6266+ /* get old cruft out of the way in case we reenter real-time
6267+ * mode for a second time
6268+ */
6269+ while (!list_empty(&adaptive_cpu_queue))
6270+ list_del(adaptive_cpu_queue.next);
6271+ /* reinitialize */
6272+ for_each_online_cpu(cpu) {
6273+ entry = &per_cpu(adaptive_cpu_entries, cpu);
6274+ atomic_set(&entry->will_schedule, 0);
6275+ entry->linked = NULL;
6276+ entry->scheduled = NULL;
6277+ list_add(&entry->list, &adaptive_cpu_queue);
6278+ }
6279+
6280+ adaptive_optimize();
6281+
6282+ queue_unlock_irqrestore(&adaptive_lock, flags);
6283+
6284+ }
6285+ return 0;
6286+}
6287+
6288+
6289+typedef enum {
6290+ ADAPTIVE_SET_MIN_OPT_SEP = 1
6291+} adaptive_cmds_t;
6292+
6293+
6294+static int adaptive_setup(int cmd, void __user *up)
6295+{
6296+	int error = -EINVAL;
6297+ unsigned int val;
6298+
6299+ if (copy_from_user(&val, up, sizeof(unsigned int))) {
6300+ error = -EFAULT;
6301+ goto out;
6302+ }
6303+
6304+ switch (cmd) {
6305+ case ADAPTIVE_SET_MIN_OPT_SEP:
6306+ optimizer_min_invocation_sep = val;
6307+ TRACE("adaptive: min opt sep set to %d\n",
6308+ optimizer_min_invocation_sep);
6309+ return 0;
6310+ break;
6311+ }
6312+
6313+out:
6314+ return error;
6315+}
6316+
6317+
6318+/* Plugin object */
6319+static sched_plugin_t s_plugin __cacheline_aligned_in_smp = {
6320+ .ready_to_use = 0
6321+};
6322+
6323+
6324+/*
6325+ * Plugin initialization code.
6326+ */
6327+#define INIT_SCHED_PLUGIN (struct sched_plugin){ \
6328+ .plugin_name = "ADAPTIVE", \
6329+ .ready_to_use = 1, \
6330+ .scheduler_tick = adaptive_scheduler_tick, \
6331+ .prepare_task = adaptive_prepare_task, \
6332+ .sleep_next_period = edf_sleep_next_period, \
6333+ .tear_down = adaptive_tear_down, \
6334+ .schedule = adaptive_schedule, \
6335+ .finish_switch = adaptive_finish_switch, \
6336+ .mode_change = adaptive_mode_change, \
6337+ .wake_up_task = adaptive_wake_up_task, \
6338+ .task_blocks = adaptive_task_blocks, \
6339+ .scheduler_setup = adaptive_setup \
6340+}
6341+
6342+
6343+sched_plugin_t *__init init_adaptive_plugin(void)
6344+{
6345+ int cpu;
6346+ cpu_entry_t *entry;
6347+
6348+ /* magic values given in the paper */
6349+ fc_a = _frac( 102, 1000);
6350+ fc_b = _frac( 303, 1000);
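+	/* i.e. fc_a = 102/1000 = 0.102 and fc_b = 303/1000 = 0.303; these are
+	 * the two parameters passed to update_estimate() for the per-task
+	 * weight predictor (descriptive note added for clarity).
+	 */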
6351+
6352+ optimizer_period = 1000;
6353+ optimizer_min_invocation_sep = 200;
6354+ task_error_threshold = _frac(1, 2);
6355+
6356+ if (!s_plugin.ready_to_use)
6357+ {
6358+ /* initialize CPU state */
6359+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
6360+ entry = &per_cpu(adaptive_cpu_entries, cpu);
6361+ atomic_set(&entry->will_schedule, 0);
6362+ entry->linked = NULL;
6363+ entry->scheduled = NULL;
6364+ entry->cpu = cpu;
6365+ }
6366+
6367+ queue_lock_init(&adaptive_lock);
6368+ edf_domain_init(&adaptive, NULL);
6369+ s_plugin = INIT_SCHED_PLUGIN;
6370+ }
6371+ return &s_plugin;
6372+}
6373+
6374+
6375diff --git a/kernel/sched_edf_hsb.c b/kernel/sched_edf_hsb.c
6376new file mode 100644
6377index 0000000..a2f670d
6378--- /dev/null
6379+++ b/kernel/sched_edf_hsb.c
6380@@ -0,0 +1,1724 @@
6381+/*
6382+ * kernel/sched_edf_hsb.c
6383+ *
6384+ * Implementation of the EDF-HSB scheduler plugin.
6385+ *
6386+ */
6387+
6388+#include <asm/uaccess.h>
6389+#include <linux/percpu.h>
6390+#include <linux/sched.h>
6391+#include <linux/list.h>
6392+
6393+#include <linux/litmus.h>
6394+#include <linux/sched_plugin.h>
6395+#include <linux/edf_common.h>
6396+#include <linux/fifo_common.h>
6397+#include <linux/sched_trace.h>
6398+
6399+/* undefine to remove capacity sharing */
6400+#define HSB_CAP_SHARE_ENABLED
6401+
6402+/* fake server PIDs */
6403+#define HRT_BASE_PID 50000
6404+#define SRT_BASE_PID 60000
6405+
6406+
6407+/******************************************************************************/
6408+/* Capacity queue */
6409+/******************************************************************************/
6410+
6411+int cap_check_resched(jiffie_t deadline);
6412+
6413+typedef struct {
6414+ int budget;
6415+ jiffie_t deadline;
6416+ pid_t donor;
6417+
6418+ struct list_head list;
6419+} capacity_t;
6420+
6421+typedef struct {
6422+ spinlock_t lock;
6423+ struct list_head queue;
6424+} capacity_queue_t;
6425+
6426+#define next_cap(q) list_entry((q)->queue.next, capacity_t, list)
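+/* The capacity queue is kept ordered by non-decreasing deadline (see
+ * __add_capacity() below), so next_cap() yields the capacity with the
+ * earliest deadline (descriptive note added for clarity).
+ */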
6427+
6428+void capacity_queue_init(capacity_queue_t* queue)
6429+{
6430+ queue->lock = SPIN_LOCK_UNLOCKED;
6431+ INIT_LIST_HEAD(&queue->queue);
6432+}
6433+
6434+void __add_capacity(capacity_queue_t* queue, capacity_t *cap)
6435+{
6436+ struct list_head* pos;
6437+ capacity_t* queued;
6438+
6439+ list_for_each_prev(pos, &queue->queue) {
6440+ queued = list_entry(pos, capacity_t, list);
6441+ if ( time_before_eq(queued->deadline, cap->deadline)) {
6442+ __list_add(&cap->list, pos, pos->next);
6443+ return;
6444+ }
6445+ }
6446+ list_add(&cap->list, &queue->queue);
6447+}
6448+
6449+int __capacity_available(capacity_queue_t* queue)
6450+{
6451+ capacity_t *cap;
6452+
6453+ while (!list_empty(&queue->queue)) {
6454+ cap = list_entry(queue->queue.next, capacity_t, list);
6455+
6456+
6457+ if (time_before_eq(cap->deadline, jiffies)) {
6458+ list_del(queue->queue.next);
6459+ kfree(cap);
6460+ cap = NULL;
6461+ } else
6462+ break;
6463+ }
6464+
6465+ return !list_empty(&queue->queue);
6466+}
6467+
6468+void __return_capacity(capacity_queue_t* queue, capacity_t *cap)
6469+{
6470+ if (!cap->budget || time_before_eq(cap->deadline, jiffies))
6471+ kfree(cap);
6472+ else
6473+ __add_capacity(queue, cap);
6474+}
6475+
6476+
6477+void return_capacity(capacity_queue_t* queue, capacity_t *cap)
6478+
6479+{
6480+ unsigned long flags;
6481+
6482+ if (!cap->budget || time_before_eq(cap->deadline, jiffies))
6483+ kfree(cap);
6484+ else {
6485+ spin_lock_irqsave(&queue->lock, flags);
6486+ __add_capacity(queue, cap);
6487+ spin_unlock_irqrestore(&queue->lock, flags);
6488+ }
6489+}
6490+
6491+
6492+#define MIN_TIME_DELTA 1
6493+#define MIN_BUDGET 1
6494+
6495+#ifdef HSB_CAP_SHARE_ENABLED
6496+void release_capacity(capacity_queue_t* queue, unsigned int budget,
6497+ jiffie_t deadline, struct task_struct* t)
6498+{
6499+ capacity_t* cap;
6500+ unsigned long flags;
6501+
6502+ if (deadline >= jiffies + MIN_TIME_DELTA && budget >= MIN_BUDGET) {
6503+ cap = kmalloc(sizeof(capacity_t), GFP_ATOMIC);
6504+ if (cap) {
6505+ cap->budget = budget;
6506+ cap->deadline = deadline;
6507+ if (t)
6508+ cap->donor = t->pid;
6509+ else
6510+ cap->donor = 0;
6511+ spin_lock_irqsave(&queue->lock, flags);
6512+ __add_capacity(queue, cap);
6513+ cap_check_resched(next_cap(queue)->deadline);
6514+ spin_unlock_irqrestore(&queue->lock, flags);
6515+ if (t)
6516+ sched_trace_capacity_release(t);
6517+ }
6518+ }
6519+}
6520+
6521+void __release_capacity(capacity_queue_t* queue, unsigned int budget,
6522+ jiffie_t deadline, struct task_struct* t)
6523+{
6524+ capacity_t* cap;
6525+
6526+ if (deadline >= jiffies + MIN_TIME_DELTA && budget >= MIN_BUDGET) {
6527+ cap = kmalloc(sizeof(capacity_t), GFP_ATOMIC);
6528+ if (cap) {
6529+ cap->budget = budget;
6530+ cap->deadline = deadline;
6531+ if (t)
6532+ cap->donor = t->pid;
6533+ else
6534+ cap->donor = 0;
6535+ /* no locking, no resched check -- called from schedule */
6536+ __add_capacity(queue, cap);
6537+ if (t)
6538+ sched_trace_capacity_release(t);
6539+ }
6540+ }
6541+}
6542+
6543+
6544+capacity_t* __take_capacity(capacity_queue_t* queue, jiffie_t deadline, int deadline_matters)
6545+{
6546+ capacity_t* cap = NULL;
6547+
6548+ while (!list_empty(&queue->queue)) {
6549+ cap = list_entry(queue->queue.next, capacity_t, list);
6550+
6551+ if (deadline_matters && time_before(deadline, cap->deadline)) {
6552+ cap = NULL;
6553+ break;
6554+ }
6555+
6556+ list_del(queue->queue.next);
6557+ if (cap->deadline > jiffies) {
6558+ if (cap->deadline - jiffies < cap->budget)
6559+ cap->budget = cap->deadline - jiffies;
6560+ break;
6561+ }
6562+ kfree(cap);
6563+ cap = NULL;
6564+ }
6565+
6566+ return cap;
6567+}
6568+#else
6569+
6570+/* no capacity sharing */
6571+void release_capacity(capacity_queue_t* queue, unsigned int budget,
6572+ jiffie_t deadline, struct task_struct* t)
6573+{
6574+}
6575+
6576+capacity_t* __take_capacity(capacity_queue_t* queue, jiffie_t deadline, int deadline_matters)
6577+{
6578+ return NULL;
6579+}
6580+#endif
6581+
6582+
6583+/******************************************************************************/
6584+/* server abstractions */
6585+/******************************************************************************/
6586+
6587+
6588+/* hrt_server_t - Abstraction of a hard real-time server.
6589+ *
6590+ * One HRT server per CPU. If it is unused, period and wcet may be zero.
6591+ * HRT servers are strictly periodic and retain their budget.
6592+ */
6593+typedef struct {
6594+ rt_domain_t domain;
6595+
6596+ unsigned int period;
6597+ unsigned int wcet;
6598+
6599+ jiffie_t deadline;
6600+ int budget;
6601+} hrt_server_t;
6602+
6603+/* be_server_t - Abstraction of best-effort server.
6604+ *
6605+ * This is pretty much only an accounting abstraction.
6606+ */
6607+typedef struct {
6608+ unsigned int period;
6609+ unsigned int wcet;
6610+
6611+ jiffie_t deadline;
6612+ jiffie_t release;
6613+ int budget;
6614+
6615+ struct list_head list;
6616+ pid_t pid;
6617+} be_server_t;
6618+
6619+/* cast to int to allow for negative slack, i.e. tardiness */
6620+#define server_slack(srv) \
6621+ ( ((int) (srv)->deadline - (int) jiffies) - (int) (srv)->budget )
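+/* Illustrative example (hypothetical numbers): with deadline = jiffies + 10
+ * and budget = 4, server_slack() yields 6; a negative value indicates
+ * tardiness, as noted above.
+ */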
6622+
6623+typedef struct {
6624+ int cpu;
6625+
6626+ hrt_server_t hrt;
6627+ be_server_t* be;
6628+ capacity_t* cap;
6629+
6630+ task_class_t exec_class;
6631+ jiffie_t cur_deadline;
6632+ atomic_t will_schedule;
6633+
6634+ struct list_head list;
6635+ spinlock_t lock;
6636+} cpu_state_t;
6637+
6638+
6639+DEFINE_PER_CPU(cpu_state_t, hsb_cpu_state);
6640+
6641+#define hrt_dom(cpu) (&per_cpu(hsb_cpu_state, cpu).hrt.domain)
6642+
6643+#define set_will_schedule() \
6644+ (atomic_set(&__get_cpu_var(hsb_cpu_state).will_schedule, 1))
6645+#define clear_will_schedule() \
6646+ (atomic_set(&__get_cpu_var(hsb_cpu_state).will_schedule, 0))
6647+#define test_will_schedule(cpu) \
6648+ (atomic_read(&per_cpu(hsb_cpu_state, cpu).will_schedule))
6649+
6650+
6651+static void prepare_hrt_release(hrt_server_t *srv, jiffie_t start)
6652+{
6653+ if (srv->period && srv->wcet) {
6654+ srv->deadline = start;
6655+ srv->budget = 0;
6656+ }
6657+}
6658+
6659+static void check_for_hrt_release(hrt_server_t *srv) {
6660+ if (srv->wcet && srv->period &&
6661+ time_before_eq(srv->deadline, jiffies)) {
6662+ srv->deadline += srv->period;
6663+ srv->budget = srv->wcet;
6664+ sched_trace_server_release(HRT_BASE_PID + smp_processor_id(),
6665+ srv->budget, srv->period, RT_CLASS_HARD);
6666+ }
6667+}
6668+
6669+/* An HRT client is eligible if either its deadline is before
6670+ * the server deadline or if the server has no positive slack. The server
6671+ * must have budget left.
6672+ */
6673+static inline int hrt_client_eligible(hrt_server_t *srv)
6674+{
6675+ if (!list_empty(&srv->domain.ready_queue))
6676+ return srv->budget && (
6677+ time_before(get_deadline(next_ready(&srv->domain)),
6678+ srv->deadline)
6679+ || server_slack(srv) <= 0);
6680+ else
6681+ return 0;
6682+}
6683+
6684+static void hsb_cpu_state_init(cpu_state_t* cpu_state,
6685+ check_resched_needed_t check,
6686+ int cpu)
6687+{
6688+ edf_domain_init(&cpu_state->hrt.domain, check);
6689+ cpu_state->hrt.budget = 0;
6690+ cpu_state->hrt.deadline = 0;
6691+ cpu_state->hrt.period = 0;
6692+ cpu_state->hrt.wcet = 0;
6693+
6694+ cpu_state->be = NULL;
6695+ cpu_state->cap = NULL;
6696+
6697+ cpu_state->cur_deadline = 0;
6698+ cpu_state->cpu = cpu;
6699+ cpu_state->lock = SPIN_LOCK_UNLOCKED;
6700+ cpu_state->exec_class = RT_CLASS_BEST_EFFORT;
6701+
6702+ atomic_set(&cpu_state->will_schedule, 0);
6703+ INIT_LIST_HEAD(&cpu_state->list);
6704+}
6705+
6706+/******************************************************************************/
6707+/* BE queue functions - mostly like edf_common.c */
6708+/******************************************************************************/
6709+
6710+#define be_earlier_deadline(a, b) (time_before(\
6711+ (a)->deadline, (b)->deadline))
6712+#define be_earlier_release(a, b) (time_before(\
6713+ (a)->release, (b)->release))
6714+
6715+
6716+static void be_add_ready(rt_domain_t* edf, be_server_t *new)
6717+{
6718+ unsigned long flags;
6719+ struct list_head *pos;
6720+ be_server_t *queued;
6721+ unsigned int passed = 0;
6722+
6723+ BUG_ON(!new);
6724+ /* first we need the write lock for rt_ready_queue */
6725+ write_lock_irqsave(&edf->ready_lock, flags);
6726+ /* find a spot where our deadline is earlier than the next */
6727+ list_for_each(pos, &edf->ready_queue) {
6728+ queued = list_entry(pos, be_server_t, list);
6729+ if (unlikely(be_earlier_deadline(new, queued))) {
6730+ __list_add(&new->list, pos->prev, pos);
6731+ goto out;
6732+ }
6733+ passed++;
6734+ }
6735+ /* if we get to this point either the list is empty or new has the
6736+ * lowest priority. Let's add it to the end. */
6737+ list_add_tail(&new->list, &edf->ready_queue);
6738+ out:
6739+ if (!passed)
6740+ edf->check_resched(edf);
6741+ write_unlock_irqrestore(&edf->ready_lock, flags);
6742+}
6743+
6744+static be_server_t* be_take_ready(rt_domain_t* edf)
6745+{
6746+ be_server_t *t = NULL;
6747+
6748+ if (!list_empty(&edf->ready_queue)) {
6749+ t = list_entry(edf->ready_queue.next, be_server_t, list);
6750+ /* kick it out of the ready list */
6751+ list_del(&t->list);
6752+ }
6753+ return t;
6754+}
6755+
6756+/*static be_server_t* get_be_server(rt_domain_t* edf)
6757+{
6758+ be_server_t *t = NULL;
6759+
6760+ spin_lock(&edf->release_lock);
6761+ write_lock(&edf->ready_lock);
6762+ t = be_take_ready(edf);
6763+
6764+ if (!t && !list_empty(&edf->release_queue)) {
6765+ t = list_entry(edf->release_queue.next, be_server_t, list);
6766+
6767+ list_del(&t->list);
6768+ }
6769+
6770+ write_unlock(&edf->ready_lock);
6771+ spin_unlock(&edf->release_lock);
6772+ return t;
6773+}*/
6774+
6775+static void be_add_release(rt_domain_t* edf, be_server_t *srv)
6776+{
6777+ unsigned long flags;
6778+ struct list_head *pos;
6779+ be_server_t *queued;
6780+
6781+ spin_lock_irqsave(&edf->release_lock, flags);
6782+ list_for_each_prev(pos, &edf->release_queue) {
6783+ queued = list_entry(pos, be_server_t, list);
6784+ if ((unlikely(be_earlier_release(queued, srv)))) {
6785+ /* the task at pos has an earlier release */
6786+			/* insert the new task behind it */
6787+ __list_add(&srv->list, pos, pos->next);
6788+ goto out;
6789+ }
6790+ }
6791+
6792+ list_add(&srv->list, &edf->release_queue);
6793+ out:
6794+ spin_unlock_irqrestore(&edf->release_lock, flags);
6795+}
6796+
6797+static void be_try_release_pending(rt_domain_t* edf)
6798+{
6799+ unsigned long flags;
6800+ struct list_head *pos, *save;
6801+ be_server_t *queued;
6802+
6803+ if (spin_trylock_irqsave(&edf->release_lock, flags)) {
6804+ list_for_each_safe(pos, save, &edf->release_queue) {
6805+ queued = list_entry(pos, be_server_t, list);
6806+ if (likely(time_before_eq(
6807+ queued->release,
6808+ jiffies))) {
6809+ list_del(pos);
6810+ be_add_ready(edf, queued);
6811+ sched_trace_server_release(
6812+ queued->pid, queued->budget,
6813+ queued->period, RT_CLASS_BEST_EFFORT);
6814+ } else
6815+ /* the release queue is ordered */
6816+ break;
6817+ }
6818+ spin_unlock_irqrestore(&edf->release_lock, flags);
6819+ }
6820+}
6821+
6822+static void be_prepare_new_release(be_server_t *t, jiffie_t start) {
6823+ t->release = start;
6824+ t->deadline = t->release + t->period;
6825+ t->budget = t->wcet;
6826+}
6827+
6828+static void be_prepare_new_releases(rt_domain_t *edf, jiffie_t start)
6829+{
6830+ unsigned long flags;
6831+ struct list_head tmp_list;
6832+ struct list_head *pos, *n;
6833+ be_server_t *t;
6834+
6835+ INIT_LIST_HEAD(&tmp_list);
6836+
6837+ spin_lock_irqsave(&edf->release_lock, flags);
6838+ write_lock(&edf->ready_lock);
6839+
6840+
6841+ while (!list_empty(&edf->release_queue)) {
6842+ pos = edf->release_queue.next;
6843+ list_del(pos);
6844+ list_add(pos, &tmp_list);
6845+ }
6846+
6847+ while (!list_empty(&edf->ready_queue)) {
6848+ pos = edf->ready_queue.next;
6849+ list_del(pos);
6850+ list_add(pos, &tmp_list);
6851+
6852+ }
6853+
6854+ write_unlock(&edf->ready_lock);
6855+ spin_unlock_irqrestore(&edf->release_lock, flags);
6856+
6857+ list_for_each_safe(pos, n, &tmp_list) {
6858+ t = list_entry(pos, be_server_t, list);
6859+ list_del(pos);
6860+ be_prepare_new_release(t, start);
6861+ be_add_release(edf, t);
6862+ }
6863+
6864+}
6865+
6866+static void be_prepare_for_next_period(be_server_t *t)
6867+{
6868+ BUG_ON(!t);
6869+ /* prepare next release */
6870+ t->release = t->deadline;
6871+ t->deadline += t->period;
6872+ t->budget = t->wcet;
6873+}
6874+
6875+#define be_next_ready(edf) \
6876+ list_entry((edf)->ready_queue.next, be_server_t, list)
6877+
6878+
6879+/* be_preemption_needed - check whether the current CPU needs to be
6880+ *                        preempted by a best-effort server.
6881+ */
6882+static inline int be_preemption_needed(rt_domain_t* edf, cpu_state_t* state)
6883+{
6884+ /* we need the read lock for rt_ready_queue */
6885+ if (!list_empty(&edf->ready_queue))
6886+ {
6887+
6888+ if (state->exec_class == RT_CLASS_SOFT) {
6889+ if (state->cap)
6890+ return time_before(
6891+ be_next_ready(edf)->deadline,
6892+ state->cap->deadline);
6893+ else
6894+ return time_before(
6895+ be_next_ready(edf)->deadline,
6896+ state->cur_deadline);
6897+ } else
6898+ return 1;
6899+ }
6900+ return 0;
6901+}
6902+
6903+static void be_enqueue(rt_domain_t* edf, be_server_t* srv)
6904+{
6905+ int new_release = 0;
6906+ if (!srv->budget) {
6907+ be_prepare_for_next_period(srv);
6908+ new_release = 1;
6909+ }
6910+
6911+ if (time_before_eq(srv->release, jiffies) &&
6912+ get_rt_mode() == MODE_RT_RUN) {
6913+ be_add_ready(edf, srv);
6914+ if (new_release)
6915+ sched_trace_server_release(
6916+ srv->pid, srv->budget,
6917+ srv->period, RT_CLASS_BEST_EFFORT);
6918+ } else
6919+ be_add_release(edf, srv);
6920+}
6921+
6922+static void be_preempt(rt_domain_t *be, cpu_state_t *state)
6923+{
6924+ be_server_t *srv;
6925+
6926+ spin_lock(&state->lock);
6927+ srv = state->be;
6928+ state->be = NULL;
6929+ spin_unlock(&state->lock);
6930+
6931+ /* add outside of lock to avoid deadlock */
6932+ if (srv)
6933+ be_enqueue(be, srv);
6934+}
6935+
6936+
6937+/******************************************************************************/
6938+/* Actual HSB implementation */
6939+/******************************************************************************/
6940+
6941+/* always acquire the cpu lock as the last lock to avoid deadlocks */
6942+static spinlock_t hsb_cpu_lock = SPIN_LOCK_UNLOCKED;
6943+/* the cpus queue themselves according to priority in here */
6944+static LIST_HEAD(hsb_cpu_queue);
6945+
6946+
6947+/* the global soft real-time domain */
6948+static rt_domain_t srt;
6949+/* the global best-effort server domain
6950+ * belongs conceptually to the srt domain, but has
6951+ * be_server_t* queued instead of struct task_struct*
6952+ */
6953+static rt_domain_t be;
6954+
6955+static rt_domain_t hsb_fifo;
6956+
6957+static capacity_queue_t cap_queue;
6958+
6959+
6960+
6961+
6962+/* adjust_cpu_queue - Move the cpu entry to the correct place to maintain
6963+ * order in the cpu queue.
6964+ *
6965+ */
6966+static void adjust_cpu_queue(task_class_t class, jiffie_t deadline,
6967+ be_server_t *be)
6968+{
6969+ struct list_head *pos;
6970+ cpu_state_t *other;
6971+ cpu_state_t *entry;
6972+
6973+ spin_lock(&hsb_cpu_lock);
6974+
6975+ entry = &__get_cpu_var(hsb_cpu_state);
6976+
6977+ spin_lock(&entry->lock);
6978+ entry->exec_class = class;
6979+ entry->cur_deadline = deadline;
6980+ entry->be = be;
6981+
6982+ spin_unlock(&entry->lock);
6983+
6984+
6985+
6986+ if (be)
6987+ sched_trace_server_scheduled(
6988+ be->pid, RT_CLASS_BEST_EFFORT, be->budget,
6989+ be->deadline);
6990+ else if (class == RT_CLASS_HARD)
6991+ sched_trace_server_scheduled(
6992+ HRT_BASE_PID + smp_processor_id(), RT_CLASS_HARD,
6993+ entry->hrt.budget, entry->hrt.deadline);
6994+
6995+ list_del(&entry->list);
6996+ /* If we do not execute real-time jobs we just move
6997+	 * to the end of the queue.
6998+	 * If we execute hard real-time jobs we move to the start
6999+ * of the queue.
7000+ */
7001+
7002+ switch (entry->exec_class) {
7003+ case RT_CLASS_HARD:
7004+ list_add(&entry->list, &hsb_cpu_queue);
7005+ break;
7006+
7007+ case RT_CLASS_SOFT:
7008+ list_for_each(pos, &hsb_cpu_queue) {
7009+ other = list_entry(pos, cpu_state_t, list);
7010+ if (other->exec_class > RT_CLASS_SOFT ||
7011+ time_before_eq(entry->cur_deadline,
7012+ other->cur_deadline))
7013+ {
7014+ __list_add(&entry->list, pos->prev, pos);
7015+ goto out;
7016+ }
7017+ }
7018+ /* possible fall through if lowest SRT priority */
7019+
7020+ case RT_CLASS_BEST_EFFORT:
7021+ list_add_tail(&entry->list, &hsb_cpu_queue);
7022+ break;
7023+
7024+ default:
7025+ /* something wrong in the variable */
7026+ BUG();
7027+ }
7028+ out:
7029+ spin_unlock(&hsb_cpu_lock);
7030+}
7031+
7032+
7033+/* hrt_check_resched - check whether the HRT server on the given CPU needs to
7034+ * preempt the running task.
7035+ */
7036+static int hrt_check_resched(rt_domain_t *edf)
7037+{
7038+ hrt_server_t *srv = container_of(edf, hrt_server_t, domain);
7039+ cpu_state_t *state = container_of(srv, cpu_state_t, hrt);
7040+ int ret = 0;
7041+
7042+ spin_lock(&state->lock);
7043+
7044+ if (hrt_client_eligible(srv)) {
7045+ if (state->exec_class > RT_CLASS_HARD ||
7046+ time_before(
7047+ get_deadline(next_ready(edf)),
7048+ state->cur_deadline)
7049+ ) {
7050+ if (state->cpu == smp_processor_id())
7051+ set_tsk_need_resched(current);
7052+ else
7053+ smp_send_reschedule(state->cpu);
7054+ }
7055+ }
7056+
7057+ spin_unlock(&state->lock);
7058+ return ret;
7059+}
7060+
7061+
7062+/* srt_check_resched - Check whether another CPU needs to switch to an SRT task.
7063+ *
7064+ * The function only checks and kicks the last CPU. It will reschedule and
7065+ * kick the next if necessary, and so on. The caller is responsible for making
7066+ * sure that it is not the last entry or that a reschedule is not necessary.
7067+ *
7068+ * Caller must hold edf->ready_lock!
7069+ */
7070+static int srt_check_resched(rt_domain_t *edf)
7071+{
7072+ cpu_state_t *last;
7073+ int ret = 0;
7074+
7075+ spin_lock(&hsb_cpu_lock);
7076+
7077+ if (!list_empty(&srt.ready_queue)) {
7078+ last = list_entry(hsb_cpu_queue.prev, cpu_state_t, list);
7079+ /* guard against concurrent updates */
7080+ spin_lock(&last->lock);
7081+ if (last->exec_class == RT_CLASS_BEST_EFFORT || (
7082+ last->exec_class == RT_CLASS_SOFT &&
7083+ time_before(get_deadline(next_ready(&srt)),
7084+ last->cur_deadline)))
7085+ {
7086+ if (smp_processor_id() == last->cpu)
7087+ set_tsk_need_resched(current);
7088+ else
7089+ if (!test_will_schedule(last->cpu))
7090+ smp_send_reschedule(last->cpu);
7091+ ret = 1;
7092+ }
7093+ spin_unlock(&last->lock);
7094+ }
7095+
7096+ spin_unlock(&hsb_cpu_lock);
7097+ return ret;
7098+}
7099+
7100+
7101+/* be_check_resched - Check whether another CPU needs to switch to a BE server.
7102+ *
7103+ * Caller must hold edf->ready_lock!
7104+ */
7105+static int be_check_resched(rt_domain_t *edf)
7106+{
7107+ cpu_state_t *last;
7108+ int soft, bg;
7109+ int ret = 0;
7110+
7111+ spin_lock(&hsb_cpu_lock);
7112+
7113+ if (!list_empty(&be.ready_queue)) {
7114+ last = list_entry(hsb_cpu_queue.prev, cpu_state_t, list);
7115+ /* guard against concurrent updates */
7116+ spin_lock(&last->lock);
7117+
7118+ bg = last->exec_class == RT_CLASS_BEST_EFFORT;
7119+ soft = last->exec_class == RT_CLASS_SOFT;
7120+
7121+ if (bg || (soft && time_before(be_next_ready(&be)->deadline,
7122+ last->cur_deadline)))
7123+ {
7124+ if (smp_processor_id() == last->cpu)
7125+ set_tsk_need_resched(current);
7126+ else
7127+ if (!test_will_schedule(last->cpu))
7128+ smp_send_reschedule(last->cpu);
7129+ ret = 1;
7130+ }
7131+
7132+ spin_unlock(&last->lock);
7133+ }
7134+
7135+ spin_unlock(&hsb_cpu_lock);
7136+ return ret;
7137+}
7138+
7139+
7140+int cap_check_resched(jiffie_t deadline)
7141+{
7142+ unsigned long flags;
7143+ cpu_state_t *last;
7144+ int soft, bg;
7145+ int ret = 0;
7146+
7147+
7148+
7149+ if (get_rt_mode() == MODE_RT_RUN) {
7150+ spin_lock_irqsave(&hsb_cpu_lock, flags);
7151+
7152+ last = list_entry(hsb_cpu_queue.prev, cpu_state_t, list);
7153+ /* guard against concurrent updates */
7154+ spin_lock(&last->lock);
7155+
7156+ bg = last->exec_class == RT_CLASS_BEST_EFFORT;
7157+ soft = last->exec_class == RT_CLASS_SOFT;
7158+
7159+ if (bg || (soft && time_before(deadline,
7160+ last->cur_deadline)))
7161+ {
7162+ if (smp_processor_id() == last->cpu)
7163+ set_tsk_need_resched(current);
7164+ else
7165+ if (!test_will_schedule(last->cpu))
7166+ smp_send_reschedule(last->cpu);
7167+ ret = 1;
7168+ }
7169+
7170+ spin_unlock(&last->lock);
7171+
7172+ spin_unlock_irqrestore(&hsb_cpu_lock, flags);
7173+ }
7174+ return ret;
7175+}
7176+
7177+int fifo_check_resched(void)
7178+{
7179+ unsigned long flags;
7180+ cpu_state_t *last;
7181+ int ret = 0;
7182+
7183+ if (get_rt_mode() == MODE_RT_RUN) {
7184+ spin_lock_irqsave(&hsb_cpu_lock, flags);
7185+
7186+
7187+ last = list_entry(hsb_cpu_queue.prev, cpu_state_t, list);
7188+ /* guard against concurrent updates */
7189+
7190+ spin_lock(&last->lock);
7191+
7192+ if (last->exec_class == RT_CLASS_BEST_EFFORT)
7193+ {
7194+ if (smp_processor_id() == last->cpu)
7195+ set_tsk_need_resched(current);
7196+ else
7197+ if (!test_will_schedule(last->cpu))
7198+ smp_send_reschedule(last->cpu);
7199+ ret = 1;
7200+ }
7201+
7202+ spin_unlock(&last->lock);
7203+
7204+ spin_unlock_irqrestore(&hsb_cpu_lock, flags);
7205+ }
7206+ return ret;
7207+}
7208+
7209+
7210+
7211+static inline int hsb_preemption_needed(rt_domain_t* edf, cpu_state_t* state)
7212+{
7213+ /* we need the read lock for rt_ready_queue */
7214+ if (!list_empty(&edf->ready_queue))
7215+ {
7216+ if (state->exec_class == RT_CLASS_SOFT) {
7217+ if (state->cap)
7218+ return time_before(get_deadline(next_ready(edf))
7219+ , state->cap->deadline);
7220+ else
7221+ return time_before(get_deadline(next_ready(edf))
7222+ , state->cur_deadline);
7223+ } else
7224+ return 1;
7225+ }
7226+ return 0;
7227+}
7228+
7229+static inline int cap_preemption_needed(capacity_queue_t* q, cpu_state_t* state)
7230+{
7231+ /* we need the read lock for rt_ready_queue */
7232+ if (!list_empty(&q->queue))
7233+ {
7234+ if (state->exec_class == RT_CLASS_SOFT) {
7235+ if (state->cap)
7236+ return time_before(next_cap(q)->deadline
7237+ , state->cap->deadline);
7238+ else
7239+ return time_before(next_cap(q)->deadline
7240+ , state->cur_deadline);
7241+ } else
7242+ return 1;
7243+ }
7244+ return 0;
7245+}
7246+
7247+/* hsb_scheduler_tick - this function is called for every local timer
7248+ * interrupt.
7249+ *
7250+ *                    Checks whether the current task has expired and, if it
7251+ *                    has not, whether we need to preempt it.
7252+ */
7253+static reschedule_check_t hsb_scheduler_tick(void)
7254+{
7255+ unsigned long flags;
7256+ struct task_struct *t = current;
7257+ int resched = 0;
7258+
7259+ cpu_state_t *state = &__get_cpu_var(hsb_cpu_state);
7260+
7261+ /* expire tasks even if not in real-time mode
7262+ * this makes sure that at the end of real-time mode
7263+ * no tasks "run away forever".
7264+ */
7265+
7266+	/* charge the BE server only if we are not running on spare capacity */
7267+ if (state->be && !state->cap && --state->be->budget <= 0) {
7268+ sched_trace_server_completion(state->be->pid, 0,
7269+ state->be->deadline,
7270+ RT_CLASS_BEST_EFFORT);
7271+ be_preempt(&be, state);
7272+ resched = 1;
7273+ }
7274+
7275+ if (state->cap)
7276+ if (--state->cap->budget <= 0 ||
7277+ time_before_eq(state->cap->deadline, jiffies)) {
7278+ kfree(state->cap);
7279+ state->cap = NULL;
7280+ resched = 1;
7281+ }
7282+
7283+ if (is_realtime(t)) {
7284+ if (is_hrt(t) && (--state->hrt.budget <= 0)) {
7285+ sched_trace_server_completion(
7286+ HRT_BASE_PID + smp_processor_id(), 0,
7287+ state->hrt.deadline, RT_CLASS_HARD);
7288+ resched = 1;
7289+ }
7290+
7291+ /* account for received service... */
7292+ t->rt_param.times.exec_time++;
7293+
7294+ /* ...and charge current budget */
7295+ if (!state->cap) {
7296+ --t->time_slice;
7297+			/* a task should always be able to finish its job */
7298+ BUG_ON(!is_be(t) && !t->time_slice && !job_completed(t));
7299+ }
7300+
7301+ if (job_completed(t) || (is_be(t) && !t->time_slice)) {
7302+ sched_trace_job_completion(t);
7303+ set_rt_flags(t, RT_F_SLEEP);
7304+ resched = 1;
7305+ }
7306+ }
7307+
7308+
7309+ if (get_rt_mode() == MODE_RT_RUN)
7310+ {
7311+ try_release_pending(&state->hrt.domain);
7312+ check_for_hrt_release(&state->hrt);
7313+ try_release_pending(&srt);
7314+ be_try_release_pending(&be);
7315+
7316+ if (!resched)
7317+ switch (state->exec_class) {
7318+ case RT_CLASS_HARD:
7319+ read_lock_irqsave(&state->hrt.domain.ready_lock,
7320+ flags);
7321+ resched = edf_preemption_needed(
7322+ &state->hrt.domain,
7323+ t);
7324+ read_unlock_irqrestore(
7325+ &state->hrt.domain.ready_lock, flags);
7326+ break;
7327+
7328+ case RT_CLASS_SOFT:
7329+ case RT_CLASS_BEST_EFFORT:
7330+ local_irq_save(flags);
7331+
7332+ /* check for HRT jobs */
7333+ read_lock(&state->hrt.domain.ready_lock);
7334+ resched = hrt_client_eligible(&state->hrt);
7335+ read_unlock(&state->hrt.domain.ready_lock);
7336+
7337+ /* check for spare capacities */
7338+ if (!resched) {
7339+ spin_lock(&cap_queue.lock);
7340+ resched =
7341+ cap_preemption_needed(&cap_queue,
7342+ state);
7343+ spin_unlock(&cap_queue.lock);
7344+ }
7345+
7346+ /* check for SRT jobs */
7347+ if (!resched) {
7348+ read_lock(&srt.ready_lock);
7349+ resched = hsb_preemption_needed(
7350+ &srt, state);
7351+ read_unlock(&srt.ready_lock);
7352+ }
7353+
7354+ /* check for BE jobs */
7355+ if (!resched) {
7356+ read_lock(&be.ready_lock);
7357+ resched = be_preemption_needed(
7358+ &be, state);
7359+ read_unlock(&be.ready_lock);
7360+ }
7361+
7362+ /* check for background jobs */
7363+ if (!resched && !is_realtime(current))
7364+ resched = jobs_pending(&hsb_fifo);
7365+ local_irq_restore(flags);
7366+ break;
7367+
7368+ default:
7369+ /* something wrong in the variable */
7370+ BUG();
7371+ }
7372+ }
7373+
7374+ if (resched) {
7375+ set_will_schedule();
7376+ return FORCE_RESCHED;
7377+ } else
7378+ return NO_RESCHED;
7379+}
7380+
7381+static int schedule_hrt(struct task_struct * prev,
7382+ struct task_struct ** next, runqueue_t * rq)
7383+{
7384+ unsigned long flags;
7385+ int deactivate = 1;
7386+ cpu_state_t *state;
7387+
7388+
7389+ state = &__get_cpu_var(hsb_cpu_state);
7390+
7391+ write_lock_irqsave(&state->hrt.domain.ready_lock, flags);
7392+
7393+
7394+ if (state->cap) {
7395+ /* hrt_schedule does not have the cap_queue lock */
7396+ return_capacity(&cap_queue, state->cap);
7397+ state->cap = NULL;
7398+ }
7399+
7400+ if (is_hrt(prev) && is_released(prev) && is_running(prev)
7401+ && !edf_preemption_needed(&state->hrt.domain, prev)) {
7402+ /* This really should only happen if the task has
7403+ * 100% utilization or when we got a bogus/delayed
7404+ * resched IPI.
7405+ */
7406+ TRACE("HRT: prev will be next, already released\n");
7407+ *next = prev;
7408+ deactivate = 0;
7409+ } else {
7410+ /* either not yet released, preempted, or non-rt */
7411+ *next = __take_ready(&state->hrt.domain);
7412+ /* the logic in hsb_schedule makes sure *next must exist
7413+ * if we get here */
7414+ BUG_ON(!*next);
7415+ /* stick the task into the runqueue */
7416+ __activate_task(*next, rq);
7417+ set_task_cpu(*next, smp_processor_id());
7418+ }
7419+
7420+ set_rt_flags(*next, RT_F_RUNNING);
7421+ adjust_cpu_queue(RT_CLASS_HARD, get_deadline(*next), NULL);
7422+ clear_will_schedule();
7423+
7424+ write_unlock_irqrestore(&state->hrt.domain.ready_lock, flags);
7425+ return deactivate;
7426+}
7427+
7428+
7429+static struct task_struct* find_min_slack_task(struct task_struct *prev,
7430+ rt_domain_t* edf)
7431+{
7432+ struct list_head *pos;
7433+ struct task_struct* tsk = NULL;
7434+ struct task_struct* cur;
7435+
7436+ if (is_realtime(prev) && is_running(prev) &&
7437+ get_rt_flags(prev) != RT_F_SLEEP)
7438+ tsk = prev;
7439+ list_for_each(pos, &edf->ready_queue) {
7440+ cur = list_entry(pos, struct task_struct, rt_list);
7441+ if (!tsk || task_slack(tsk) > task_slack(cur))
7442+ tsk = cur;
7443+ }
7444+ return tsk;
7445+}
7446+
7447+static struct task_struct* null_heuristic(struct task_struct *prev,
7448+ rt_domain_t* edf,
7449+ rt_domain_t* fifo)
7450+{
7451+ if (jobs_pending(fifo))
7452+ return NULL;
7453+ else if (!list_empty(&edf->ready_queue))
7454+ return list_entry(edf->ready_queue.next,
7455+ struct task_struct, rt_list);
7456+ else
7457+ return NULL;
7458+}
7459+
7460+/* caller holds all locks
7461+ */
7462+
7463+static int schedule_capacity(struct task_struct *prev,
7464+ struct task_struct **next, runqueue_t *rq)
7465+{
7466+ cpu_state_t *state = &__get_cpu_var(hsb_cpu_state);
7467+ capacity_t* old;
7468+
7469+ if (state->cap) {
7470+ old = state->cap;
7471+ state->cap = __take_capacity(&cap_queue, old->deadline, 1);
7472+ if (!state->cap)
7473+ state->cap = old;
7474+ else
7475+ __return_capacity(&cap_queue, old);
7476+ } else
7477+ state->cap = __take_capacity(&cap_queue, 0, 0);
7478+
7479+
7480+ /* pick a task likely to be tardy */
7481+ *next = find_min_slack_task(prev, &srt);
7482+
7483+ /* only give away spare capacities if there is no task that
7484+ * is going to be tardy
7485+ */
7486+ if (*next && task_slack(*next) >= 0)
7487+ *next = null_heuristic(prev, &srt, &hsb_fifo);
7488+ if (*next && *next != prev)
7489+ list_del(&(*next)->rt_list);
7490+
7491+
7492+ /* if there is none pick a BE job */
7493+ if (!*next) {
7494+ if (is_realtime(prev) && is_be(prev) && is_running(prev) &&
7495+ get_rt_flags(prev) != RT_F_SLEEP)
7496+ *next = prev;
7497+ else
7498+ *next = take_ready(&hsb_fifo);
7499+ }
7500+
7501+ if (state->be)
7502+ be_preempt(&be, state);
7503+ BUG_ON(!state->cap);
7504+ if (*next && state->cap->donor) {
7505+ sched_trace_capacity_allocation(
7506+ *next, state->cap->budget, state->cap->deadline,
7507+ state->cap->donor);
7508+ }
7509+
7510+ return *next != prev;
7511+}
7512+
7513+
7514+
7515+#define BG 0
7516+#define SRT 1
7517+#define BE 2
7518+#define CAP 3
7519+
7520+static inline int what_first(rt_domain_t *be, rt_domain_t *srt, capacity_queue_t* q)
7521+{
7522+ jiffie_t sdl = 0, bdl= 0, cdl = 0, cur;
7523+ int _srt = !list_empty(&srt->ready_queue);
7524+ int _be = !list_empty(&be->ready_queue);
7525+ int _cap = __capacity_available(q);
7526+
7527+
7528+	int ret = BG; /* nothing ready => background mode */
7529+ cur = 0;
7530+
7531+ if (_srt)
7532+ sdl = get_deadline(next_ready(srt));
7533+ if (_be)
7534+ bdl = be_next_ready(be)->deadline;
7535+ if (_cap)
7536+ cdl = next_cap(q)->deadline;
7537+
7538+
7539+
7540+ if (_cap) {
7541+ ret = CAP;
7542+ cur = cdl;
7543+ }
7544+ if (_srt && (time_before(sdl, cur) || !ret)) {
7545+ ret = SRT;
7546+ cur = sdl;
7547+ }
7548+ if (_be && (time_before(bdl, cur) || !ret)) {
7549+ ret = BE;
7550+ cur = bdl;
7551+ }
7552+ return ret;
7553+}
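
what_first() above decides which source to serve next by comparing the earliest deadline in the capacity queue, the SRT domain, and the BE server queue, falling back to background mode when all three are empty. A minimal standalone sketch of the same selection (the demo_* identifiers are hypothetical; the kernel's wraparound-safe time_before() is approximated here by a signed-difference macro):

#include <stdio.h>

typedef unsigned long demo_jiffie_t;

/* wraparound-safe "a is earlier than b", in the style of time_before() */
#define demo_time_before(a, b) ((long)((a) - (b)) < 0)

enum { DEMO_BG, DEMO_SRT, DEMO_BE, DEMO_CAP };

/* pick the class whose earliest deadline comes first; BG if nothing is ready */
static int demo_what_first(int have_srt, demo_jiffie_t sdl,
			   int have_be,  demo_jiffie_t bdl,
			   int have_cap, demo_jiffie_t cdl)
{
	int ret = DEMO_BG;
	demo_jiffie_t cur = 0;

	if (have_cap) {
		ret = DEMO_CAP;
		cur = cdl;
	}
	if (have_srt && (ret == DEMO_BG || demo_time_before(sdl, cur))) {
		ret = DEMO_SRT;
		cur = sdl;
	}
	if (have_be && (ret == DEMO_BG || demo_time_before(bdl, cur))) {
		ret = DEMO_BE;
		cur = bdl;
	}
	return ret;
}

int main(void)
{
	/* SRT deadline 200, BE server deadline 150, spare capacity at 180 */
	printf("winner=%d (2 means BE)\n", demo_what_first(1, 200, 1, 150, 1, 180));
	return 0;
}

The signed-difference trick keeps the comparison correct even when the jiffies counter wraps around.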
7554+
7555+
7556+
7557+static int schedule_srt_be_cap(struct task_struct *prev,
7558+ struct task_struct **next, runqueue_t *rq)
7559+{
7560+ task_class_t class = RT_CLASS_BEST_EFFORT;
7561+ jiffie_t deadline = 0;
7562+ unsigned long flags;
7563+ int deactivate = 1;
7564+ be_server_t* bes;
7565+ cpu_state_t* state;
7566+ int type = BG;
7567+
7568+reschedule:
7569+ write_lock_irqsave(&srt.ready_lock, flags);
7570+ write_lock(&be.ready_lock);
7571+ spin_lock(&cap_queue.lock);
7572+
7573+
7574+ state = &__get_cpu_var(hsb_cpu_state);
7575+ bes = NULL;
7576+
7577+ clear_will_schedule();
7578+
7579+ if (is_realtime(prev) && (is_released(prev) || is_be(prev)) &&
7580+ is_running(prev) && !hsb_preemption_needed(&srt, state) &&
7581+ !be_preemption_needed(&be, state)
7582+ ) {
7583+ /* Our current task's next job has already been
7584+ * released and has higher priority than the highest
7585+		 * priority waiting task; in other words: it is tardy.
7586+ * We just keep it.
7587+ */
7588+ TRACE("prev will be next, already released\n");
7589+ *next = prev;
7590+ class = prev->rt_param.basic_params.class;
7591+ deadline = get_deadline(*next);
7592+ deactivate = 0;
7593+ } else {
7594+ /* either not yet released, preempted, or non-rt */
7595+ type = what_first(&be, &srt, &cap_queue);
7596+ switch (type) {
7597+ case CAP:
7598+ /* capacity */
7599+ deactivate = schedule_capacity(prev, next, rq);
7600+ deadline = state->cap->deadline;
7601+ if (*next)
7602+ class = RT_CLASS_SOFT;
7603+ else
7604+ class = RT_CLASS_BEST_EFFORT;
7605+ break;
7606+ case BE:
7607+ /* be */
7608+ *next = NULL;
7609+ bes = be_take_ready(&be);
7610+ if (bes) {
7611+ class = RT_CLASS_SOFT;
7612+ deadline = bes->deadline;
7613+ *next = take_ready(&hsb_fifo);
7614+ if (!*next) {
7615+ /* deactivate */
7616+ __release_capacity(&cap_queue,
7617+ bes->budget,
7618+ bes->deadline, NULL);
7619+ bes->budget = 0;
7620+ barrier();
7621+ spin_unlock(&cap_queue.lock);
7622+ write_unlock(&be.ready_lock);
7623+ write_unlock_irqrestore(&srt.ready_lock,
7624+ flags);
7625+ be_enqueue(&be, bes);
7626+ goto reschedule;
7627+ }
7628+ }
7629+ break;
7630+ case SRT:
7631+ /* srt */
7632+ *next = __take_ready(&srt);
7633+ if (*next) {
7634+ class = RT_CLASS_SOFT;
7635+ deadline = get_deadline(*next);
7636+ }
7637+ break;
7638+ case BG:
7639+ /* background server mode */
7640+ class = RT_CLASS_BEST_EFFORT;
7641+ deadline = 0;
7642+ *next = take_ready(&hsb_fifo);
7643+ break;
7644+ }
7645+
7646+
7647+ /* give back capacities */
7648+ if (type != CAP && state->cap) {
7649+ __return_capacity(&cap_queue, state->cap);
7650+ state->cap = NULL;
7651+ }
7652+ if (*next && deactivate) {
7653+ /* mark the task as executing on this cpu */
7654+ set_task_cpu(*next, smp_processor_id());
7655+ /* stick the task into the runqueue */
7656+ __activate_task(*next, rq);
7657+ }
7658+ }
7659+
7660+ adjust_cpu_queue(class, deadline, bes);
7661+
7662+ switch (type) {
7663+ case BG:
7664+ break;
7665+ case BE:
7666+ be.check_resched(&be);
7667+ break;
7668+ case SRT:
7669+ srt.check_resched(&srt);
7670+ break;
7671+ case CAP:
7672+ if (!list_empty(&cap_queue.queue))
7673+ cap_check_resched(list_entry(cap_queue.queue.next,
7674+ capacity_t, list)->deadline);
7675+ break;
7676+ }
7677+
7678+
7679+ if(*next)
7680+ set_rt_flags(*next, RT_F_RUNNING);
7681+
7682+ spin_unlock(&cap_queue.lock);
7683+ write_unlock(&be.ready_lock);
7684+ write_unlock_irqrestore(&srt.ready_lock, flags);
7685+ return deactivate;
7686+}
7687+
7688+
7689+static int hsb_schedule(struct task_struct * prev, struct task_struct ** next,
7690+ runqueue_t * rq)
7691+{
7692+ int need_deactivate = 1;
7693+ cpu_state_t *state = NULL;
7694+
7695+ preempt_disable();
7696+
7697+ state = &__get_cpu_var(hsb_cpu_state);
7698+
7699+ be_preempt(&be, state);
7700+
7701+
7702+ if (is_realtime(prev) && !is_be(prev) &&
7703+ get_rt_flags(prev) == RT_F_SLEEP)
7704+ {
7705+ TRACE("preparing %d for next period\n", prev->pid);
7706+ release_capacity(&cap_queue, prev->time_slice,
7707+ prev->rt_param.times.deadline, prev);
7708+ edf_prepare_for_next_period(prev);
7709+ }
7710+
7711+ if (get_rt_mode() == MODE_RT_RUN) {
7712+		/* we need to schedule hrt if an hrt job is pending or when
7713+		 * we have a non-expired hrt job on the cpu
7714+ */
7715+
7716+ if (hrt_client_eligible(&state->hrt) ||
7717+ unlikely((is_hrt(prev) && is_running(prev) &&
7718+ get_rt_flags(prev) != RT_F_SLEEP))) {
7719+ if (state->cap) {
7720+ return_capacity(&cap_queue, state->cap);
7721+ state->cap = NULL;
7722+ }
7723+ need_deactivate = schedule_hrt(prev, next, rq);
7724+ } else
7725+ need_deactivate = schedule_srt_be_cap(prev, next, rq);
7726+
7727+ }
7728+
7729+ if (is_realtime(prev) && need_deactivate && prev->array) {
7730+ /* take it out of the run queue */
7731+ deactivate_task(prev, rq);
7732+ }
7733+
7734+ preempt_enable();
7735+
7736+ return 0;
7737+}
7738+
7739+/* put task into correct queue */
7740+static inline void hsb_add_release(struct task_struct *t)
7741+{
7742+ if (is_hrt(t))
7743+ add_release(hrt_dom(get_partition(t)), t);
7744+ else if (is_srt(t))
7745+ add_release(&srt, t);
7746+ else if (is_be(t)) {
7747+ t->time_slice = 0;
7748+ add_ready(&hsb_fifo, t);
7749+ fifo_check_resched();
7750+ } else
7751+ BUG();
7752+
7753+}
7754+
7755+/* put task into correct queue */
7756+static inline void hsb_add_ready(struct task_struct *t)
7757+{
7758+ if (is_hrt(t))
7759+ add_ready(hrt_dom(get_partition(t)), t);
7760+ else if (is_srt(t))
7761+ add_ready(&srt, t);
7762+ else if (is_be(t)) {
7763+ add_ready(&hsb_fifo, t);
7764+ fifo_check_resched();
7765+ }
7766+ else
7767+ BUG();
7768+}
7769+
7770+
7771+/* _finish_switch - we just finished the switch away from prev
7772+ * it is now safe to requeue the task
7773+ */
7774+static void hsb_finish_switch(struct task_struct *prev)
7775+{
7776+ if (!is_realtime(prev) || !is_running(prev))
7777+ return;
7778+
7779+ TRACE("finish switch for %d\n", prev->pid);
7780+
7781+ if (is_be(prev)) {
7782+ add_ready(&hsb_fifo, prev);
7783+ return;
7784+ }
7785+
7786+ if (get_rt_flags(prev) == RT_F_SLEEP ||
7787+ get_rt_mode() != MODE_RT_RUN) {
7788+ /* this task has expired
7789+ * _schedule has already taken care of updating
7790+ * the release and
7791+		 * deadline. We just have to check whether it has been released.
7792+ */
7793+ if (is_released(prev) && get_rt_mode() == MODE_RT_RUN) {
7794+ sched_trace_job_release(prev);
7795+ hsb_add_ready(prev);
7796+ TRACE("%d goes straight to ready queue\n", prev->pid);
7797+ }
7798+ else
7799+ /* it has got to wait */
7800+ hsb_add_release(prev);
7801+ }
7802+ else {
7803+ /* this is a forced preemption
7804+ * thus the task stays in the ready_queue
7805+ * we only must make it available to other cpus
7806+ */
7807+ hsb_add_ready(prev);
7808+ }
7809+}
7810+
7811+
7812+/* Prepare a task for running in RT mode
7813+ * Enqueues the task into master queue data structure
7814+ * returns
7815+ * -EPERM if task is not TASK_STOPPED
7816+ */
7817+static long hsb_prepare_task(struct task_struct * t)
7818+{
7819+ TRACE("edf-hsb: prepare task %d\n", t->pid);
7820+
7821+ if (t->state == TASK_STOPPED) {
7822+ __setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1);
7823+
7824+ if (get_rt_mode() == MODE_RT_RUN && !is_be(t))
7825+ /* The action is already on.
7826+ * Prepare immediate release
7827+ */
7828+ edf_release_now(t);
7829+ /* The task should be running in the queue, otherwise signal
7830+ * code will try to wake it up with fatal consequences.
7831+ */
7832+ t->state = TASK_RUNNING;
7833+ if (is_be(t))
7834+ t->rt_param.times.deadline = 0;
7835+ hsb_add_release(t);
7836+ return 0;
7837+ }
7838+ else
7839+ return -EPERM;
7840+}
7841+
7842+static void hsb_wake_up_task(struct task_struct *task)
7843+{
7844+ /* We must determine whether task should go into the release
7845+ * queue or into the ready queue. It may enter the ready queue
7846+ * if it has credit left in its time slice and has not yet reached
7847+	 * its deadline. If it is now past its deadline we assume this is the
7848+	 * arrival of a new sporadic job and thus put it in the ready queue
7849+	 * anyway. If it has zero budget and the next release is in the future
7850+ * it has to go to the release queue.
7851+ */
7852+ TRACE("edf-hsb: wake up %d with budget=%d\n",
7853+ task->pid, task->time_slice);
7854+ task->state = TASK_RUNNING;
7855+
7856+ if (is_be(task)) {
7857+ task->rt_param.times.last_release = jiffies;
7858+ hsb_add_release(task);
7859+ }
7860+ else if (is_tardy(task)) {
7861+ /* new sporadic release */
7862+ edf_release_now(task);
7863+ sched_trace_job_release(task);
7864+ hsb_add_ready(task);
7865+ }
7866+ else if (task->time_slice) {
7867+ /* came back in time before deadline
7868+ */
7869+ set_rt_flags(task, RT_F_RUNNING);
7870+ hsb_add_ready(task);
7871+ }
7872+ else {
7873+ hsb_add_release(task);
7874+ }
7875+
7876+}
7877+
7878+static void hsb_task_blocks(struct task_struct *t)
7879+{
7880+ /* not really anything to do since it can only block if
7881+ * it is running, and when it is not running it is not in any
7882+ * queue anyway.
7883+ */
7884+ TRACE("task %d blocks with budget=%d\n", t->pid, t->time_slice);
7885+ if (is_be(t))
7886+ sched_trace_job_completion(t);
7887+}
7888+
7889+
7890+static int hsb_mode_change(int new_mode)
7891+{
7892+ int cpu;
7893+ cpu_state_t *entry;
7894+ jiffie_t start;
7895+
7896+ TRACE("[%d] edf-hsb: mode changed to %d\n", smp_processor_id(),
7897+ new_mode);
7898+ if (new_mode == MODE_RT_RUN) {
7899+ start = jiffies + 20;
7900+ rerelease_all(&srt, edf_release_at);
7901+ be_prepare_new_releases(&be, start);
7902+
7903+ /* initialize per CPU state
7904+ * we can't do this at boot time because we don't know
7905+ * which CPUs will be online and we can't put non-existing
7906+ * cpus into the queue
7907+ */
7908+ spin_lock(&hsb_cpu_lock);
7909+ /* get old cruft out of the way in case we reenter real-time
7910+ * mode for a second time
7911+ */
7912+ while (!list_empty(&hsb_cpu_queue))
7913+ list_del(hsb_cpu_queue.next);
7914+ /* reinitialize */
7915+ for_each_online_cpu(cpu) {
7916+ entry = &per_cpu(hsb_cpu_state, cpu);
7917+ atomic_set(&entry->will_schedule, 0);
7918+ entry->exec_class = RT_CLASS_BEST_EFFORT;
7919+ entry->cur_deadline = 0;
7920+ list_add(&entry->list, &hsb_cpu_queue);
7921+
7922+ rerelease_all(&entry->hrt.domain, edf_release_at);
7923+ prepare_hrt_release(&entry->hrt, start);
7924+ }
7925+ spin_unlock(&hsb_cpu_lock);
7926+
7927+ }
7928+ TRACE("[%d] edf-hsb: mode change done\n", smp_processor_id());
7929+ return 0;
7930+}
7931+
7932+
7933+typedef enum {
7934+ EDF_HSB_SET_HRT,
7935+ EDF_HSB_GET_HRT,
7936+ EDF_HSB_CREATE_BE
7937+} edf_hsb_setup_cmds_t;
7938+
7939+typedef struct {
7940+ int cpu;
7941+ unsigned int wcet;
7942+ unsigned int period;
7943+} setup_hrt_param_t;
7944+
7945+typedef struct {
7946+ unsigned int wcet;
7947+ unsigned int period;
7948+} create_be_param_t;
7949+
7950+typedef struct {
7951+ union {
7952+ setup_hrt_param_t setup_hrt;
7953+ create_be_param_t create_be;
7954+ };
7955+} param_t;
7956+
7957+static pid_t next_be_server_pid = SRT_BASE_PID;
7958+
7959+static int hsb_scheduler_setup(int cmd, void __user* up)
7960+{
7961+ unsigned long flags;
7962+ int error = -EINVAL;
7963+ cpu_state_t* state;
7964+ be_server_t* srv;
7965+ param_t param;
7966+
7967+ switch (cmd) {
7968+ case EDF_HSB_SET_HRT:
7969+ if (copy_from_user(&param, up, sizeof(setup_hrt_param_t))) {
7970+ error = -EFAULT;
7971+ goto out;
7972+ }
7973+ if (!cpu_online(param.setup_hrt.cpu)) {
7974+ printk(KERN_WARNING "scheduler setup: "
7975+ "CPU %d is not online!\n", param.setup_hrt.cpu);
7976+ error = -EINVAL;
7977+ goto out;
7978+ }
7979+ if (param.setup_hrt.period < param.setup_hrt.wcet) {
7980+ printk(KERN_WARNING "period < wcet!\n");
7981+ error = -EINVAL;
7982+ goto out;
7983+ }
7984+
7985+ state = &per_cpu(hsb_cpu_state, param.setup_hrt.cpu);
7986+ spin_lock_irqsave(&state->lock, flags);
7987+
7988+ state->hrt.wcet = param.setup_hrt.wcet;
7989+ state->hrt.period = param.setup_hrt.period;
7990+
7991+ spin_unlock_irqrestore(&state->lock, flags);
7992+
7993+ printk(KERN_WARNING "edf-hsb: set HRT #%d to (%d, %d)\n",
7994+ param.setup_hrt.cpu, param.setup_hrt.wcet,
7995+ param.setup_hrt.period);
7996+
7997+ error = 0;
7998+
7999+ break;
8000+
8001+ case EDF_HSB_GET_HRT:
8002+ if (copy_from_user(&param, up, sizeof(setup_hrt_param_t))) {
8003+ error = -EFAULT;
8004+ goto out;
8005+ }
8006+ if (!cpu_online(param.setup_hrt.cpu)) {
8007+ error = -EINVAL;
8008+ goto out;
8009+ }
8010+ state = &per_cpu(hsb_cpu_state, param.setup_hrt.cpu);
8011+ spin_lock_irqsave(&state->lock, flags);
8012+
8013+ param.setup_hrt.wcet = state->hrt.wcet;
8014+ param.setup_hrt.period = state->hrt.period;
8015+
8016+ spin_unlock_irqrestore(&state->lock, flags);
8017+
8018+ if (copy_to_user(up, &param, sizeof(setup_hrt_param_t))) {
8019+ error = -EFAULT;
8020+ goto out;
8021+ }
8022+ error = 0;
8023+ break;
8024+
8025+ case EDF_HSB_CREATE_BE:
8026+ if (copy_from_user(&param, up, sizeof(create_be_param_t))) {
8027+ error = -EFAULT;
8028+ goto out;
8029+ }
8030+ if (param.create_be.period < param.create_be.wcet ||
8031+ !param.create_be.period || !param.create_be.wcet) {
8032+ error = -EINVAL;
8033+ goto out;
8034+ }
8035+ srv = (be_server_t*) kmalloc(sizeof(be_server_t), GFP_KERNEL);
8036+ if (!srv) {
8037+ error = -ENOMEM;
8038+ goto out;
8039+ }
8040+ srv->wcet = param.create_be.wcet;
8041+ srv->period = param.create_be.period;
8042+ srv->pid = next_be_server_pid++;
8043+ INIT_LIST_HEAD(&srv->list);
8044+ be_prepare_new_release(srv, jiffies);
8045+ be_enqueue(&be, srv);
8046+
8047+ printk(KERN_WARNING "edf-hsb: created a BE with (%d, %d)\n",
8048+ param.create_be.wcet, param.create_be.period);
8049+
8050+ error = 0;
8051+ break;
8052+
8053+ default:
8054+ printk(KERN_WARNING "edf-hsb: unknown command %d\n", cmd);
8055+ }
8056+
8057+out:
8058+ return error;
8059+}
8060+
8061+/* Plugin object */
8062+static sched_plugin_t s_plugin __cacheline_aligned_in_smp = {
8063+ .ready_to_use = 0
8064+};
8065+
8066+
8067+/*
8068+ * Plugin initialization code.
8069+ */
8070+#define INIT_SCHED_PLUGIN (struct sched_plugin){\
8071+ .plugin_name = "EDF-HSB",\
8072+ .ready_to_use = 1,\
8073+ .scheduler_tick = hsb_scheduler_tick,\
8074+ .prepare_task = hsb_prepare_task,\
8075+ .sleep_next_period = edf_sleep_next_period,\
8076+ .schedule = hsb_schedule,\
8077+ .finish_switch = hsb_finish_switch,\
8078+ .mode_change = hsb_mode_change,\
8079+ .wake_up_task = hsb_wake_up_task,\
8080+ .task_blocks = hsb_task_blocks, \
8081+ .scheduler_setup = hsb_scheduler_setup \
8082+}
8083+
8084+
8085+sched_plugin_t *__init init_edf_hsb_plugin(void)
8086+{
8087+ int i;
8088+
8089+ if (!s_plugin.ready_to_use)
8090+ {
8091+ capacity_queue_init(&cap_queue);
8092+ edf_domain_init(&srt, srt_check_resched);
8093+ edf_domain_init(&be, be_check_resched);
8094+ fifo_domain_init(&hsb_fifo, NULL);
8095+ for (i = 0; i < NR_CPUS; i++)
8096+ {
8097+ hsb_cpu_state_init(&per_cpu(hsb_cpu_state, i),
8098+ hrt_check_resched, i);
8099+ printk("HRT server %d initialized.\n", i);
8100+ }
8101+ s_plugin = INIT_SCHED_PLUGIN;
8102+ }
8103+ return &s_plugin;
8104+}
8105diff --git a/kernel/sched_global_edf.c b/kernel/sched_global_edf.c
8106new file mode 100644
8107index 0000000..4b36bc5
8108--- /dev/null
8109+++ b/kernel/sched_global_edf.c
8110@@ -0,0 +1,550 @@
8111+/*
8112+ * kernel/sched_global_edf.c
8113+ *
8114+ * Re-implementation of the Global EDF scheduler.
8115+ *
8116+ * This version works without using the struct queue. It uses the
8117+ * built-in kernel lists.
8118+ */
8119+
8120+#include <linux/percpu.h>
8121+#include <linux/sched.h>
8122+#include <linux/list.h>
8123+
8124+#include <linux/litmus.h>
8125+#include <linux/sched_plugin.h>
8126+
8127+#include <linux/edf_common.h>
8128+#include <linux/sched_trace.h>
8129+
8130+
8131+/* cpu_entry_t - maintain the priority state of each cpu's current task
8132+ * this is needed to check for priority inversions.
8133+ */
8134+typedef struct {
8135+ int cpu;
8136+ int executes_realtime;
8137+ jiffie_t cur_deadline;
8138+ struct list_head list;
8139+ atomic_t will_schedule;
8140+} cpu_entry_t;
8141+DEFINE_PER_CPU(cpu_entry_t, gedf_cpu_entries);
8142+
8143+#define set_will_schedule() \
8144+ (atomic_set(&__get_cpu_var(gedf_cpu_entries).will_schedule, 1))
8145+#define clear_will_schedule() \
8146+ (atomic_set(&__get_cpu_var(gedf_cpu_entries).will_schedule, 0))
8147+#define test_will_schedule(cpu) \
8148+ (atomic_read(&per_cpu(gedf_cpu_entries, cpu).will_schedule))
8149+
8150+
8151+/* always acquire the cpu lock as the last lock to avoid deadlocks */
8152+static spinlock_t gedf_cpu_lock = SPIN_LOCK_UNLOCKED;
8153+/* the cpus queue themselves according to priority in here */
8154+static LIST_HEAD(gedf_cpu_queue);
8155+
8156+
8157+static rt_domain_t gedf;
8158+
8159+#define DUMP(args...) TRACE(args)
8160+
8161+/* adjust_cpu_queue - Move the cpu entry to the correct place to maintain
8162+ * order in the cpu queue. Caller must hold ready write lock.
8163+ *
8164+ */
8165+static void adjust_cpu_queue(int exec_rt, jiffie_t deadline)
8166+{
8167+ struct list_head *pos;
8168+ cpu_entry_t *other;
8169+ cpu_entry_t *entry;
8170+
8171+ spin_lock(&gedf_cpu_lock);
8172+
8173+ entry = &__get_cpu_var(gedf_cpu_entries);
8174+ entry->executes_realtime = exec_rt;
8175+ entry->cur_deadline = deadline;
8176+
8177+ list_del(&entry->list);
8178+ /* if we do not execute real-time jobs we just move
8179+ * to the end of the queue
8180+ */
8181+ if (entry->executes_realtime)
8182+ list_for_each(pos, &gedf_cpu_queue) {
8183+ other = list_entry(pos, cpu_entry_t, list);
8184+ if (!other->executes_realtime ||
8185+ time_before_eq(entry->cur_deadline,
8186+ other->cur_deadline))
8187+ {
8188+ __list_add(&entry->list, pos->prev, pos);
8189+ goto out;
8190+ }
8191+ }
8192+ /* if we get this far we have the lowest priority task */
8193+ list_add_tail(&entry->list, &gedf_cpu_queue);
8194+
8195+ out:
8196+ spin_unlock(&gedf_cpu_lock);
8197+}
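
adjust_cpu_queue() keeps the global CPU list sorted by the deadline of each CPU's current task, with CPUs running non-real-time work at the tail, so that gedf_check_resched() only ever has to look at the last entry to find a preemption candidate. The following standalone sketch shows the same ordered insertion over a plain singly linked list (the demo_* names are illustrative assumptions, not kernel code):

#include <stdio.h>
#include <stddef.h>

/* illustrative CPU entry; the real code embeds a struct list_head instead */
typedef struct demo_cpu {
	int cpu;
	int executes_realtime;
	unsigned long cur_deadline;
	struct demo_cpu* next;
} demo_cpu_t;

/* insert so that CPUs running real-time work come first, ordered by
 * increasing deadline, and CPUs running background work sink to the tail */
static demo_cpu_t* demo_insert(demo_cpu_t* head, demo_cpu_t* e)
{
	demo_cpu_t** pos = &head;

	if (e->executes_realtime)
		while (*pos && (*pos)->executes_realtime &&
		       (*pos)->cur_deadline < e->cur_deadline)
			pos = &(*pos)->next;
	else
		while (*pos)
			pos = &(*pos)->next;
	e->next = *pos;
	*pos = e;
	return head;
}

int main(void)
{
	demo_cpu_t a = {0, 1, 100, NULL}, b = {1, 0, 0, NULL}, c = {2, 1, 50, NULL};
	demo_cpu_t* head = NULL;
	demo_cpu_t* p;

	head = demo_insert(head, &a);
	head = demo_insert(head, &b);
	head = demo_insert(head, &c);
	for (p = head; p; p = p->next)
		printf("cpu%d rt=%d deadline=%lu\n",
		       p->cpu, p->executes_realtime, p->cur_deadline);
	return 0;
}

In this ordering the tail of the list is always the lowest-priority CPU, i.e. the one worth kicking when a higher-priority job becomes ready.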
8198+
8199+
8200+/* gedf_check_resched - Check whether another CPU needs to reschedule.
8201+ *
8202+ * The function only checks and kicks the last CPU. It will reschedule and
8203+ * kick the next if necessary, and so on. The caller is responsible for making
8204+ * sure that it is not the last entry or that a reschedule is not necessary.
8205+ *
8206+ */
8207+static int gedf_check_resched(rt_domain_t *edf)
8208+{
8209+ cpu_entry_t *last;
8210+ int ret = 0;
8211+
8212+ spin_lock(&gedf_cpu_lock);
8213+
8214+ if (!list_empty(&edf->ready_queue)) {
8215+ last = list_entry(gedf_cpu_queue.prev, cpu_entry_t, list);
8216+ if (!last->executes_realtime ||
8217+ time_before(next_ready(edf)->rt_param.times.deadline,
8218+ last->cur_deadline))
8219+ {
8220+ if (smp_processor_id() == last->cpu)
8221+ set_tsk_need_resched(current);
8222+ else
8223+ if (!test_will_schedule(last->cpu))
8224+ smp_send_reschedule(last->cpu);
8225+ ret = 1;
8226+ }
8227+ }
8228+
8229+ spin_unlock(&gedf_cpu_lock);
8230+ return ret;
8231+}
8232+
8233+
8234+
8235+/* gedf_scheduler_tick - this function is called for every local timer
8236+ * interrupt.
8237+ *
8238+ * checks whether the current task has expired and checks
8239+ * whether we need to preempt it if it has not expired
8240+ */
8241+static reschedule_check_t gedf_scheduler_tick(void)
8242+{
8243+ unsigned long flags;
8244+ struct task_struct *t = current;
8245+ reschedule_check_t want_resched = NO_RESCHED;
8246+
8247+ /* expire tasks even if not in real-time mode
8248+ * this makes sure that at the end of real-time mode
8249+ * no tasks "run away forever".
8250+ */
8251+ BUG_ON(is_realtime(t) && t->time_slice > 100000);
8252+ if (is_realtime(t) && (!--t->time_slice)) {
8253+ /* this task has exhausted its budget in this period */
8254+ set_rt_flags(t, RT_F_SLEEP);
8255+ want_resched = FORCE_RESCHED;
8256+ set_will_schedule();
8257+ sched_trace_job_completion(t);
8258+ }
8259+ if (get_rt_mode() == MODE_RT_RUN)
8260+ {
8261+ /* check whether anything is waiting to be released
8262+ * this could probably be moved to the global timer
8263+ * interrupt handler since the state will only change
8264+ * once per jiffie
8265+ */
8266+ try_release_pending(&gedf);
8267+ if (want_resched != FORCE_RESCHED)
8268+ {
8269+ read_lock_irqsave(&gedf.ready_lock, flags);
8270+ if (edf_preemption_needed(&gedf, t))
8271+ {
8272+ want_resched = FORCE_RESCHED;
8273+ set_will_schedule();
8274+ }
8275+ read_unlock_irqrestore(&gedf.ready_lock, flags);
8276+ }
8277+ }
8278+ return want_resched;
8279+}
8280+
8281+/* This is the main Global EDF schedule function
8282+ *
8283+ * Assumes the caller holds the lock for rq and that irqs are disabled
8284+ * This function only works for indirect switching
8285+ */
8286+static int gedf_schedule(struct task_struct * prev,
8287+ struct task_struct ** next,
8288+ runqueue_t * rq)
8289+{
8290+ int need_deactivate = 1;
8291+ int rt;
8292+ jiffie_t deadline;
8293+ unsigned long flags;
8294+
8295+
8296+ if (is_realtime(prev) && get_rt_flags(prev) == RT_F_SLEEP)
8297+ {
8298+ DUMP("preparing %d for next period\n", prev->pid);
8299+ edf_prepare_for_next_period(prev);
8300+ }
8301+
8302+ if (get_rt_mode() == MODE_RT_RUN) {
8303+ write_lock_irqsave(&gedf.ready_lock, flags);
8304+
8305+ clear_will_schedule();
8306+
8307+ if (is_realtime(prev) && is_released(prev) && is_running(prev)
8308+ && !edf_preemption_needed(&gedf, prev)) {
8309+ /* Our current task's next job has already been
8310+ * released and has higher priority than the highest
8311+			 * priority waiting task; in other words: it is tardy.
8312+ * We just keep it.
8313+ */
8314+ DUMP("prev will be next, already released\n");
8315+ *next = prev;
8316+ rt = 1;
8317+ deadline = prev->rt_param.times.deadline;
8318+ need_deactivate = 0;
8319+ } else {
8320+ /* either not yet released, preempted, or non-rt */
8321+ *next = __take_ready(&gedf);
8322+ if (*next) {
8323+ /* mark the task as executing on this cpu */
8324+ set_task_cpu(*next, smp_processor_id());
8325+
8326+ /* stick the task into the runqueue */
8327+ __activate_task(*next, rq);
8328+ rt = 1;
8329+ deadline = (*next)->rt_param.times.deadline;
8330+ }
8331+ else
8332+ rt = deadline = 0;
8333+ }
8334+
8335+ adjust_cpu_queue(rt, deadline);
8336+
8337+ if (rt) {
8338+ set_rt_flags(*next, RT_F_RUNNING);
8339+ gedf.check_resched(&gedf);
8340+ }
8341+ write_unlock_irqrestore(&gedf.ready_lock, flags);
8342+ }
8343+
8344+ if (is_realtime(prev) && need_deactivate && prev->array) {
8345+ /* take it out of the run queue */
8346+ deactivate_task(prev, rq);
8347+ }
8348+
8349+ /* don't put back into release yet.
8350+ * We first need to actually switch
8351+ * stacks before we can execute it
8352+ * on a different CPU */
8353+
8354+ /* in the current implementation nobody cares about the return value */
8355+ return 0;
8356+}
8357+
8358+
8359+/* _finish_switch - we just finished the switch away from prev
8360+ * it is now safe to requeue the task
8361+ */
8362+static void gedf_finish_switch(struct task_struct *prev)
8363+{
8364+ if (!is_realtime(prev) || !is_running(prev))
8365+ return;
8366+
8367+ /*printk(KERN_INFO "gedf finish switch for %d\n", prev->pid);*/
8368+ if (get_rt_flags(prev) == RT_F_SLEEP ||
8369+ get_rt_mode() != MODE_RT_RUN) {
8370+ /* this task has expired
8371+ * _schedule has already taken care of updating
8372+ * the release and
8373+		 * deadline. We just have to check whether it has been released.
8374+ */
8375+ if (time_before_eq(prev->rt_param.times.release, jiffies)
8376+ && get_rt_mode() == MODE_RT_RUN) {
8377+ /* already released */
8378+ add_ready(&gedf, prev);
8379+ DUMP("%d goes straight to ready queue\n", prev->pid);
8380+ }
8381+ else
8382+ /* it has got to wait */
8383+ add_release(&gedf, prev);
8384+ }
8385+ else {
8386+ /* this is a forced preemption
8387+ * thus the task stays in the ready_queue
8388+ * we only must make it available to others
8389+ */
8390+ add_ready(&gedf, prev);
8391+ }
8392+}
8393+
8394+
8395+/* Prepare a task for running in RT mode
8396+ * Enqueues the task into master queue data structure
8397+ * returns
8398+ * -EPERM if task is not TASK_STOPPED
8399+ */
8400+static long gedf_prepare_task(struct task_struct * t)
8401+{
8402+ TRACE("global edf: prepare task %d\n", t->pid);
8403+
8404+ if (t->state == TASK_STOPPED) {
8405+ __setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1);
8406+
8407+ if (get_rt_mode() == MODE_RT_RUN)
8408+ /* The action is already on.
8409+ * Prepare immediate release
8410+ */
8411+ edf_release_now(t);
8412+ /* The task should be running in the queue, otherwise signal
8413+ * code will try to wake it up with fatal consequences.
8414+ */
8415+ t->state = TASK_RUNNING;
8416+ add_release(&gedf, t);
8417+ return 0;
8418+ }
8419+ else
8420+ return -EPERM;
8421+}
8422+
8423+static void gedf_wake_up_task(struct task_struct *task)
8424+{
8425+ /* We must determine whether task should go into the release
8426+ * queue or into the ready queue. It may enter the ready queue
8427+ * if it has credit left in its time slice and has not yet reached
8428+	 * its deadline. If it is now past its deadline we assume this is the
8429+	 * arrival of a new sporadic job and thus put it in the ready queue
8430+	 * anyway. If it has zero budget and the next release is in the future
8431+ * it has to go to the release queue.
8432+ */
8433+ TRACE("global edf: wake up %d with budget=%d\n",
8434+ task->pid, task->time_slice);
8435+ task->state = TASK_RUNNING;
8436+ if (is_tardy(task)) {
8437+ /* new sporadic release */
8438+ edf_release_now(task);
8439+ sched_trace_job_release(task);
8440+ add_ready(&gedf, task);
8441+ }
8442+ else if (task->time_slice) {
8443+ /* came back in time before deadline
8444+ */
8445+ set_rt_flags(task, RT_F_RUNNING);
8446+ add_ready(&gedf, task);
8447+ }
8448+ else {
8449+ add_release(&gedf, task);
8450+ }
8451+
8452+}
8453+
8454+static void gedf_task_blocks(struct task_struct *t)
8455+{
8456+ BUG_ON(!is_realtime(t));
8457+ /* not really anything to do since it can only block if
8458+ * it is running, and when it is not running it is not in any
8459+ * queue anyway.
8460+ *
8461+ */
8462+ TRACE("task %d blocks with budget=%d\n", t->pid, t->time_slice);
8463+ BUG_ON(t->rt_list.next != LIST_POISON1);
8464+ BUG_ON(t->rt_list.prev != LIST_POISON2);
8465+}
8466+
8467+
8468+/* When _tear_down is called, the task should not be in any queue any more
8469+ * as it must have blocked first. We don't have any internal state for the task,
8470+ * it is all in the task_struct.
8471+ */
8472+static long gedf_tear_down(struct task_struct * t)
8473+{
8474+ BUG_ON(!is_realtime(t));
8475+ TRACE("global edf: tear down called for %d \n", t->pid);
8476+ BUG_ON(t->array);
8477+ BUG_ON(t->rt_list.next != LIST_POISON1);
8478+ BUG_ON(t->rt_list.prev != LIST_POISON2);
8479+ return 0;
8480+}
8481+
8482+
8483+static int gedf_mode_change(int new_mode)
8484+{
8485+ int cpu;
8486+ cpu_entry_t *entry;
8487+
8488+/* printk(KERN_INFO "[%d] global edf: mode changed to %d\n", smp_processor_id(),
8489+ new_mode);*/
8490+ if (new_mode == MODE_RT_RUN) {
8491+ rerelease_all(&gedf, edf_release_at);
8492+
8493+ /* initialize per CPU state
8494+ * we can't do this at boot time because we don't know
8495+ * which CPUs will be online and we can't put non-existing
8496+ * cpus into the queue
8497+ */
8498+ spin_lock(&gedf_cpu_lock);
8499+ /* get old cruft out of the way in case we reenter real-time
8500+ * mode for a second time
8501+ */
8502+ while (!list_empty(&gedf_cpu_queue))
8503+ list_del(gedf_cpu_queue.next);
8504+ /* reinitialize */
8505+ for_each_online_cpu(cpu) {
8506+ entry = &per_cpu(gedf_cpu_entries, cpu);
8507+ atomic_set(&entry->will_schedule, 0);
8508+ entry->executes_realtime = 0;
8509+ entry->cur_deadline = 0;
8510+ entry->cpu = cpu;
8511+ list_add(&entry->list, &gedf_cpu_queue);
8512+ }
8513+ spin_unlock(&gedf_cpu_lock);
8514+ }
8515+ /*printk(KERN_INFO "[%d] global edf: mode change done\n", smp_processor_id()); */
8516+ return 0;
8517+}
8518+
8519+
8520+/* Plugin object */
8521+static sched_plugin_t s_plugin __cacheline_aligned_in_smp = {
8522+ .ready_to_use = 0
8523+};
8524+
8525+
8526+/*
8527+ * Plugin initialization code.
8528+ */
8529+#define INIT_SCHED_PLUGIN (struct sched_plugin){\
8530+ .plugin_name = "Global EDF",\
8531+ .ready_to_use = 1,\
8532+ .scheduler_tick = gedf_scheduler_tick,\
8533+ .prepare_task = gedf_prepare_task,\
8534+ .sleep_next_period = edf_sleep_next_period,\
8535+ .tear_down = gedf_tear_down,\
8536+ .schedule = gedf_schedule,\
8537+ .finish_switch = gedf_finish_switch,\
8538+ .mode_change = gedf_mode_change,\
8539+ .wake_up_task = gedf_wake_up_task,\
8540+ .task_blocks = gedf_task_blocks \
8541+ }
8542+
8543+
8544+sched_plugin_t *__init init_global_edf_plugin(void)
8545+{
8546+ if (!s_plugin.ready_to_use)
8547+ {
8548+ edf_domain_init(&gedf, gedf_check_resched);
8549+ s_plugin = INIT_SCHED_PLUGIN;
8550+ }
8551+ return &s_plugin;
8552+}
8553+
8554+
8555+
8556+/*****************************************************************************/
8557+/*****************************************************************************/
8558+/*****************************************************************************/
8559+/* NON-PREEMPTIVE GLOBAL EDF */
8560+
8561+
8562+/* gedf_np_scheduler_tick - this function is called for every local timer
8563+ * interrupt.
8564+ *
8565+ * checks whether the current task has expired and checks
8566+ * whether we need to preempt it if it has not expired
8567+ */
8568+static reschedule_check_t gedf_np_scheduler_tick(void)
8569+{
8570+ if (get_rt_mode() == MODE_RT_RUN)
8571+ {
8572+ /* check whether anything is waiting to be released
8573+ * this could probably be moved to the global timer
8574+ * interrupt handler since the state will only change
8575+ * once per jiffie
8576+ */
8577+ try_release_pending(&gedf);
8578+ }
8579+
8580+ /* expire tasks even if not in real-time mode
8581+ * this makes sure that at the end of real-time mode
8582+ * no tasks "run away forever".
8583+ */
8584+ BUG_ON(current->time_slice > 1000);
8585+ if (is_realtime(current) && (!--current->time_slice)) {
8586+ /* this task has exhausted its budget in this period */
8587+ set_rt_flags(current, RT_F_SLEEP);
8588+ return FORCE_RESCHED;
8589+ }
8590+ else
8591+ return NO_RESCHED;
8592+}
8593+
8594+/* gedf_np_check_resched - Check whether another CPU needs to reschedule.
8595+ *
8596+ * The function only checks and kicks the last CPU. It will reschedule and
8597+ * kick the next if necessary, and so on. The caller is responsible for making
8598+ * sure that it is not the last entry or that a reschedule is not necessary.
8599+ *
8600+ */
8601+static int gedf_np_check_resched(rt_domain_t *edf)
8602+{
8603+ cpu_entry_t *last;
8604+ int ret = 0;
8605+
8606+ spin_lock(&gedf_cpu_lock);
8607+
8608+ if (!list_empty(&edf->ready_queue)) {
8609+ last = list_entry(gedf_cpu_queue.prev, cpu_entry_t, list);
8610+ /* preemption happens only for non-realtime tasks */
8611+ if (!last->executes_realtime)
8612+ {
8613+ if (smp_processor_id() == last->cpu)
8614+ set_tsk_need_resched(current);
8615+ else
8616+ smp_send_reschedule(last->cpu);
8617+ ret = 1;
8618+ goto out;
8619+ }
8620+ }
8621+
8622+ out:
8623+ spin_unlock(&gedf_cpu_lock);
8624+ return ret;
8625+}
8626+
8627+
8628+/* non-preemptive global EDF
8629+ *
8630+ * Non-preemptive EDF is almost the same as normal EDF. We only have to
8631+ * adjust the scheduler tick and the resched function.
8632+ */
8633+#define INIT_SCHED_PLUGIN_NP (struct sched_plugin){\
8634+ .plugin_name = "Non-Preemptive Global EDF",\
8635+ .ready_to_use = 1,\
8636+ .scheduler_tick = gedf_np_scheduler_tick,\
8637+ .prepare_task = gedf_prepare_task,\
8638+ .sleep_next_period = edf_sleep_next_period,\
8639+ .tear_down = gedf_tear_down,\
8640+ .schedule = gedf_schedule,\
8641+ .finish_switch = gedf_finish_switch,\
8642+ .mode_change = gedf_mode_change,\
8643+ .wake_up_task = gedf_wake_up_task,\
8644+ .task_blocks = gedf_task_blocks \
8645+ }
8646+
8647+
8648+/* as we only set the plugin at boot time,
8649+ * we use the same structure as preemptive EDF. This simplifies a lot
8650+ * of the functions.
8651+ */
8652+sched_plugin_t* __init init_global_edf_np_plugin(void)
8653+{
8654+ if (!s_plugin.ready_to_use)
8655+ {
8656+ edf_domain_init(&gedf, gedf_np_check_resched);
8657+ s_plugin = INIT_SCHED_PLUGIN_NP;
8658+ }
8659+ return &s_plugin;
8660+}
8661diff --git a/kernel/sched_gsn_edf.c b/kernel/sched_gsn_edf.c
8662new file mode 100644
8663index 0000000..27d1b37
8664--- /dev/null
8665+++ b/kernel/sched_gsn_edf.c
8666@@ -0,0 +1,814 @@
8667+/*
8668+ * kernel/sched_gsn_edf.c
8669+ *
8670+ * Implementation of the GSN-EDF scheduling algorithm.
8671+ *
8672+ * This version uses the simple approach and serializes all scheduling
8673+ * decisions by the use of a queue lock. This is probably not the
8674+ * best way to do it, but it should suffice for now. It should not
8675+ * affect the benchmarks since all synchronization primitives will
8676+ * take the same performance hit, if any.
8677+ */
8678+
8679+#include <linux/percpu.h>
8680+#include <linux/sched.h>
8681+#include <linux/list.h>
8682+
8683+#include <linux/queuelock.h>
8684+#include <linux/litmus.h>
8685+#include <linux/sched_plugin.h>
8686+#include <linux/edf_common.h>
8687+#include <linux/sched_trace.h>
8688+
8689+/* Overview of GSN-EDF operations.
8690+ *
8691+ * For a detailed explanation of GSN-EDF have a look at the FMLP paper. This
8692+ * description only covers how the individual operations are implemented in
8693+ * LITMUS.
8694+ *
8695+ * link_task_to_cpu(T, cpu) - Low-level operation to update the linkage
8696+ * structure (NOT the actually scheduled
8697+ * task). If there is another linked task To
8698+ * already it will set To->linked_on = NO_CPU
8699+ * (thereby removing its association with this
8700+ * CPU). However, it will not requeue the
8701+ * previously linked task (if any). It will set
8702+ * T's state to RT_F_RUNNING and check whether
8703+ * it is already running somewhere else. If T
8704+ * is scheduled somewhere else it will link
8705+ * it to that CPU instead (and pull the linked
8706+ * task to cpu). T may be NULL.
8707+ *
8708+ * unlink(T) - Unlink removes T from all scheduler data
8709+ * structures. If it is linked to some CPU it
8710+ * will link NULL to that CPU. If it is
8711+ * currently queued in the gsnedf queue it will
8712+ * be removed from the T->rt_list. It is safe to
8713+ * call unlink(T) if T is not linked. T may not
8714+ * be NULL.
8715+ *
8716+ * requeue(T) - Requeue will insert T into the appropriate
8717+ * queue. If the system is in real-time mode and
8718+ *			  T is already released, it will go into the
8719+ *			  ready queue. If the system is not in
8720+ *			  real-time mode, then T will go into the
8721+ * release queue. If T's release time is in the
8722+ * future, it will go into the release
8723+ *			  queue. That means that T's release time/job
8724+ *			  number/etc. has to be updated before requeue(T) is
8725+ * called. It is not safe to call requeue(T)
8726+ * when T is already queued. T may not be NULL.
8727+ *
8728+ * gsnedf_job_arrival(T) - This is the catch all function when T enters
8729+ * the system after either a suspension or at a
8730+ * job release. It will queue T (which means it
8731+ * is not safe to call gsnedf_job_arrival(T) if
8732+ * T is already queued) and then check whether a
8733+ * preemption is necessary. If a preemption is
8734+ * necessary it will update the linkage
8735+ *			  accordingly and cause schedule() to be called
8736+ * (either with an IPI or need_resched). It is
8737+ * safe to call gsnedf_job_arrival(T) if T's
8738+ * next job has not been actually released yet
8739+ *			  (release time in the future). T will be put
8740+ * on the release queue in that case.
8741+ *
8742+ * job_completion(T) - Take care of everything that needs to be done
8743+ * to prepare T for its next release and place
8744+ * it in the right queue with
8745+ * gsnedf_job_arrival().
8746+ *
8747+ *
8748+ * When we know that T is linked to a CPU, then link_task_to_cpu(NULL, CPU) is
8749+ * equivalent to unlink(T). Note that if you unlink a task from a CPU none of
8750+ * the functions will automatically propagate pending tasks from the ready queue
8751+ * to a linked task. This is the job of the calling function (by means of
8752+ * __take_ready).
8753+ */
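
The linked/scheduled split described above is the heart of GSN-EDF: linking records the scheduler's decision immediately, while the scheduled field only catches up once the context switch has completed, and any mismatch between the two means a reschedule is still pending. A toy standalone model of that invariant (the demo_* names are illustrative, not taken from the patch):

#include <stdio.h>
#include <stddef.h>

#define DEMO_NO_CPU (-1)

/* toy model of the linked/scheduled split: linked is the decision,
 * scheduled is what the context switch has actually established so far */
typedef struct { int pid; int linked_on; } demo_task_t;
typedef struct { int cpu; demo_task_t* linked; demo_task_t* scheduled; } demo_cpu_t;

/* link a task to a CPU without touching what is currently scheduled there */
static void demo_link(demo_task_t* t, demo_cpu_t* e)
{
	if (e->linked)
		e->linked->linked_on = DEMO_NO_CPU;
	e->linked = t;
	if (t)
		t->linked_on = e->cpu;
}

int main(void)
{
	demo_task_t a = {1, DEMO_NO_CPU}, b = {2, DEMO_NO_CPU};
	demo_cpu_t cpu0 = {0, NULL, NULL};

	demo_link(&a, &cpu0);
	cpu0.scheduled = &a;	/* the context switch to A completes         */
	demo_link(&b, &cpu0);	/* B preempts: linked changes, scheduled not */
	printf("linked=%d scheduled=%d => reschedule pending\n",
	       cpu0.linked->pid, cpu0.scheduled->pid);
	return 0;
}

This is the same condition that gsnedf_schedule() below evaluates as preempt = entry->scheduled != entry->linked.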
8754+
8755+
8756+/* cpu_entry_t - maintain the linked and scheduled state
8757+ */
8758+typedef struct {
8759+ int cpu;
8760+ struct task_struct* linked; /* only RT tasks */
8761+ struct task_struct* scheduled; /* only RT tasks */
8762+ struct list_head list;
8763+ atomic_t will_schedule; /* prevent unneeded IPIs */
8764+} cpu_entry_t;
8765+DEFINE_PER_CPU(cpu_entry_t, gsnedf_cpu_entries);
8766+
8767+#define set_will_schedule() \
8768+ (atomic_set(&__get_cpu_var(gsnedf_cpu_entries).will_schedule, 1))
8769+#define clear_will_schedule() \
8770+ (atomic_set(&__get_cpu_var(gsnedf_cpu_entries).will_schedule, 0))
8771+#define test_will_schedule(cpu) \
8772+ (atomic_read(&per_cpu(gsnedf_cpu_entries, cpu).will_schedule))
8773+
8774+
8775+#define NO_CPU 0xffffffff
8776+
8777+/* The gsnedf_lock is used to serialize all scheduling events.
8778+ * It protects
8779+ */
8780+static queuelock_t gsnedf_lock;
8781+/* the cpus queue themselves according to priority in here */
8782+static LIST_HEAD(gsnedf_cpu_queue);
8783+
8784+static rt_domain_t gsnedf;
8785+
8786+
8787+/* update_cpu_position - Move the cpu entry to the correct place to maintain
8788+ * order in the cpu queue. Caller must hold gsnedf lock.
8789+ */
8790+static void update_cpu_position(cpu_entry_t *entry)
8791+{
8792+ cpu_entry_t *other;
8793+ struct list_head *pos;
8794+ list_del(&entry->list);
8795+	/* if we have no linked task we just move
8796+ * to the end of the queue
8797+ */
8798+ if (entry->linked) {
8799+ list_for_each(pos, &gsnedf_cpu_queue) {
8800+ other = list_entry(pos, cpu_entry_t, list);
8801+ if (edf_higher_prio(entry->linked, other->linked)) {
8802+ __list_add(&entry->list, pos->prev, pos);
8803+ return;
8804+ }
8805+ }
8806+ }
8807+ /* if we get this far we have the lowest priority job */
8808+ list_add_tail(&entry->list, &gsnedf_cpu_queue);
8809+}
8810+
8811+/* link_task_to_cpu - Update the link of a CPU.
8812+ * Handles the case where the to-be-linked task is already
8813+ * scheduled on a different CPU.
8814+ */
8815+static noinline void link_task_to_cpu(struct task_struct* linked,
8816+ cpu_entry_t *entry)
8817+
8818+{
8819+ cpu_entry_t *sched;
8820+ struct task_struct* tmp;
8821+ int on_cpu;
8822+
8823+ BUG_ON(linked && !is_realtime(linked));
8824+
8825+ /* Currently linked task is set to be unlinked. */
8826+ if (entry->linked) {
8827+ entry->linked->rt_param.linked_on = NO_CPU;
8828+ }
8829+
8830+ /* Link new task to CPU. */
8831+ if (linked) {
8832+ set_rt_flags(linked, RT_F_RUNNING);
8833+ /* handle task is already scheduled somewhere! */
8834+ on_cpu = linked->rt_param.scheduled_on;
8835+ if (on_cpu != NO_CPU) {
8836+ sched = &per_cpu(gsnedf_cpu_entries, on_cpu);
8837+ /* this should only happen if not linked already */
8838+ BUG_ON(sched->linked == linked);
8839+
8840+ /* If we are already scheduled on the CPU to which we
8841+ * wanted to link, we don't need to do the swap --
8842+ * we just link ourselves to the CPU and depend on
8843+ * the caller to get things right.
8844+ */
8845+ if (entry != sched) {
8846+ tmp = sched->linked;
8847+ linked->rt_param.linked_on = sched->cpu;
8848+ sched->linked = linked;
8849+ update_cpu_position(sched);
8850+ linked = tmp;
8851+ }
8852+ }
8853+ if (linked) /* might be NULL due to swap */
8854+ linked->rt_param.linked_on = entry->cpu;
8855+ }
8856+ entry->linked = linked;
8857+ update_cpu_position(entry);
8858+}
8859+
8860+/* unlink - Make sure a task is not linked any longer to an entry
8861+ * where it was linked before. Must hold gsnedf_lock.
8862+ */
8863+static noinline void unlink(struct task_struct* t)
8864+{
8865+ cpu_entry_t *entry;
8866+
8867+ if (unlikely(!t)) {
8868+ TRACE_BUG_ON(!t);
8869+ return;
8870+ }
8871+
8872+ if (t->rt_param.linked_on != NO_CPU) {
8873+ /* unlink */
8874+ entry = &per_cpu(gsnedf_cpu_entries, t->rt_param.linked_on);
8875+ t->rt_param.linked_on = NO_CPU;
8876+ link_task_to_cpu(NULL, entry);
8877+ } else if (in_list(&t->rt_list)) {
8878+ /* This is an interesting situation: t is scheduled,
8879+ * but was just recently unlinked. It cannot be
8880+ * linked anywhere else (because then it would have
8881+ * been relinked to this CPU), thus it must be in some
8882+ * queue. We must remove it from the list in this
8883+ * case.
8884+ */
8885+ list_del(&t->rt_list);
8886+ }
8887+}
8888+
8889+
8890+/* preempt - force a CPU to reschedule
8891+ */
8892+static noinline void preempt(cpu_entry_t *entry)
8893+{
8894+ /* We cannot make the is_np() decision here if it is a remote CPU
8895+ * because requesting exit_np() requires that we currently use the
8896+ * address space of the task. Thus, in the remote case we just send
8897+ * the IPI and let schedule() handle the problem.
8898+ */
8899+
8900+ if (smp_processor_id() == entry->cpu) {
8901+ if (entry->scheduled && is_np(entry->scheduled))
8902+ request_exit_np(entry->scheduled);
8903+ else
8904+ set_tsk_need_resched(current);
8905+ } else
8906+		/* in case that it is a remote CPU we have to defer
8907+		 * the decision to the remote CPU
8908+ * FIXME: We could save a few IPI's here if we leave the flag
8909+ * set when we are waiting for a np_exit().
8910+ */
8911+ if (!test_will_schedule(entry->cpu))
8912+ smp_send_reschedule(entry->cpu);
8913+}
8914+
8915+/* requeue - Put an unlinked task into gsn-edf domain.
8916+ * Caller must hold gsnedf_lock.
8917+ */
8918+static noinline void requeue(struct task_struct* task)
8919+{
8920+ BUG_ON(!task);
8921+ /* sanity check rt_list before insertion */
8922+ BUG_ON(in_list(&task->rt_list));
8923+
8924+ if (get_rt_flags(task) == RT_F_SLEEP ||
8925+ get_rt_mode() != MODE_RT_RUN) {
8926+ /* this task has expired
8927+ * _schedule has already taken care of updating
8928+ * the release and
8929+ * deadline. We just must check if it has been released.
8930+ */
8931+ if (is_released(task) && get_rt_mode() == MODE_RT_RUN)
8932+ __add_ready(&gsnedf, task);
8933+ else {
8934+ /* it has got to wait */
8935+ __add_release(&gsnedf, task);
8936+ }
8937+
8938+ } else
8939+ /* this is a forced preemption
8940+ * thus the task stays in the ready_queue
8941+ * we only must make it available to others
8942+ */
8943+ __add_ready(&gsnedf, task);
8944+}
8945+
8946+/* gsnedf_job_arrival: task is either resumed or released */
8947+static noinline void gsnedf_job_arrival(struct task_struct* task)
8948+{
8949+ cpu_entry_t* last;
8950+
8951+ BUG_ON(list_empty(&gsnedf_cpu_queue));
8952+ BUG_ON(!task);
8953+
8954+ /* first queue arriving job */
8955+ requeue(task);
8956+
8957+ /* then check for any necessary preemptions */
8958+ last = list_entry(gsnedf_cpu_queue.prev, cpu_entry_t, list);
8959+ if (edf_preemption_needed(&gsnedf, last->linked)) {
8960+ /* preemption necessary */
8961+ task = __take_ready(&gsnedf);
8962+ TRACE("job_arrival: task %d linked to %d\n",
8963+ task->pid, last->cpu);
8964+ if (last->linked)
8965+ requeue(last->linked);
8966+
8967+ link_task_to_cpu(task, last);
8968+ preempt(last);
8969+ }
8970+}
8971+
8972+/* check for current job releases */
8973+static noinline void gsnedf_release_jobs(void)
8974+{
8975+ struct list_head *pos, *save;
8976+ struct task_struct *queued;
8977+
8978+ list_for_each_safe(pos, save, &gsnedf.release_queue) {
8979+ queued = list_entry(pos, struct task_struct, rt_list);
8980+ if (likely(is_released(queued))) {
8981+ /* this one is ready to go*/
8982+			/* this one is ready to go */
8983+ set_rt_flags(queued, RT_F_RUNNING);
8984+
8985+ sched_trace_job_release(queued);
8986+ gsnedf_job_arrival(queued);
8987+ }
8988+ else
8989+ /* the release queue is ordered */
8990+ break;
8991+ }
8992+}
8993+
8994+/* gsnedf_scheduler_tick - this function is called for every local timer
8995+ * interrupt.
8996+ *
8997+ * checks whether the current task has expired and checks
8998+ * whether we need to preempt it if it has not expired
8999+ */
9000+static reschedule_check_t gsnedf_scheduler_tick(void)
9001+{
9002+ unsigned long flags;
9003+ struct task_struct* t = current;
9004+ reschedule_check_t want_resched = NO_RESCHED;
9005+
9006+ /* expire tasks even if not in real-time mode
9007+ * this makes sure that at the end of real-time mode
9008+ * no task "runs away forever".
9009+ */
9010+ if (is_realtime(t))
9011+ TRACE_CUR("before dec: time_slice == %u\n", t->time_slice);
9012+
9013+ if (is_realtime(t) && t->time_slice && !--t->time_slice) {
9014+ if (!is_np(t)) { /* np tasks will be preempted when they become
9015+ preemptable again */
9016+ want_resched = FORCE_RESCHED;
9017+ set_will_schedule();
9018+ TRACE("gsnedf_scheduler_tick: "
9019+ "%d is preemptable "
9020+ " => FORCE_RESCHED\n", t->pid);
9021+ } else {
9022+ TRACE("gsnedf_scheduler_tick: "
9023+ "%d is non-preemptable, "
9024+ "preemption delayed.\n", t->pid);
9025+ request_exit_np(t);
9026+ }
9027+ }
9028+
9029+ /* only the first CPU needs to release jobs */
9030+ if (get_rt_mode() == MODE_RT_RUN && smp_processor_id() == 0) {
9031+ queue_lock_irqsave(&gsnedf_lock, flags);
9032+
9033+ /* (1) try to release pending jobs */
9034+ gsnedf_release_jobs();
9035+
9036+ /* we don't need to check linked != scheduled since
9037+		 * set_tsk_need_resched() has already been called by preempt() if necessary
9038+ */
9039+
9040+ queue_unlock_irqrestore(&gsnedf_lock, flags);
9041+ }
9042+
9043+ return want_resched;
9044+}
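
The tick handler above charges one tick of budget to the running job and requests a reschedule only once the budget is exhausted (deferring it via request_exit_np() while the task is non-preemptable). A tiny standalone sketch of just the budget-accounting step (demo_tick() is an illustrative assumption, not kernel code):

#include <stdio.h>

/* per-tick budget accounting: consume one tick and report whether the job
 * just exhausted its budget (and therefore needs to be rescheduled) */
static int demo_tick(unsigned int* time_slice)
{
	if (*time_slice && !--*time_slice)
		return 1;	/* budget exhausted in this period */
	return 0;
}

int main(void)
{
	unsigned int budget = 3;
	int tick, resched;

	for (tick = 1; tick <= 4; tick++) {
		resched = demo_tick(&budget);
		printf("tick %d: force_resched=%d remaining=%u\n",
		       tick, resched, budget);
	}
	return 0;
}

The guard on a non-zero time_slice keeps an already-exhausted job from repeatedly triggering forced reschedules on later ticks.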
9045+
9046+/* caller holds gsnedf_lock */
9047+static noinline void job_completion(struct task_struct *t)
9048+{
9049+ BUG_ON(!t);
9050+
9051+ sched_trace_job_completion(t);
9052+
9053+ TRACE_TASK(t, "job_completion().\n");
9054+
9055+ /* set flags */
9056+ set_rt_flags(t, RT_F_SLEEP);
9057+ /* prepare for next period */
9058+ edf_prepare_for_next_period(t);
9059+ /* unlink */
9060+ unlink(t);
9061+ /* requeue
9062+ * But don't requeue a blocking task. */
9063+ if (is_running(t))
9064+ gsnedf_job_arrival(t);
9065+}
9066+
9067+
9068+/* Getting schedule() right is a bit tricky. schedule() may not make any
9069+ * assumptions on the state of the current task since it may be called for a
9070+ * number of reasons. The reasons include a scheduler_tick() determined that it
9071+ * was necessary, because sys_exit_np() was called, because some Linux
9072+ * subsystem determined so, or even (in the worst case) because there is a bug
9073+ * hidden somewhere. Thus, we must take extreme care to determine what the
9074+ * current state is.
9075+ *
9076+ * The CPU could currently be scheduling a task (or not), be linked (or not).
9077+ *
9078+ * The following assertions for the scheduled task could hold:
9079+ *
9080+ * - !is_running(scheduled) // the job blocks
9081+ * - scheduled->timeslice == 0 // the job completed (forcefully)
9082+ * - get_rt_flag() == RT_F_SLEEP // the job completed (by syscall)
9083+ * - linked != scheduled // we need to reschedule (for any reason)
9084+ * - is_np(scheduled) // rescheduling must be delayed,
9085+ * sys_exit_np must be requested
9086+ *
9087+ * Any of these can occur together.
9088+ */
9089+static int gsnedf_schedule(struct task_struct * prev,
9090+ struct task_struct ** next,
9091+ runqueue_t * rq)
9092+{
9093+ cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries);
9094+ int out_of_time, sleep, preempt, np, exists,
9095+ rt, blocks;
9096+ struct task_struct* linked;
9097+
9098+ /* Will be released in finish_switch. */
9099+ queue_lock(&gsnedf_lock);
9100+ clear_will_schedule();
9101+
9102+ /* sanity checking */
9103+ BUG_ON(entry->scheduled && entry->scheduled != prev);
9104+ BUG_ON(entry->scheduled && !is_realtime(prev));
9105+
9106+ /* (0) Determine state */
9107+ exists = entry->scheduled != NULL;
9108+ blocks = exists && !is_running(entry->scheduled);
9109+ out_of_time = exists && !entry->scheduled->time_slice;
9110+ np = exists && is_np(entry->scheduled);
9111+ sleep = exists && get_rt_flags(entry->scheduled) == RT_F_SLEEP;
9112+ preempt = entry->scheduled != entry->linked;
9113+ rt = get_rt_mode() == MODE_RT_RUN;
9114+
9115+ /* If a task blocks we have no choice but to reschedule.
9116+ */
9117+ if (blocks)
9118+ unlink(entry->scheduled);
9119+
9120+ /* Request a sys_exit_np() call if we would like to preempt but cannot.
9121+ * We need to make sure to update the link structure anyway in case
9122+ * that we are still linked. Multiple calls to request_exit_np() don't
9123+ * hurt.
9124+ */
9125+ if (np && (out_of_time || preempt || sleep)) {
9126+ unlink(entry->scheduled);
9127+ request_exit_np(entry->scheduled);
9128+ }
9129+
9130+ /* Any task that is preemptable and either exhausts its execution
9131+ * budget or wants to sleep completes. We may have to reschedule after
9132+ * this.
9133+ */
9134+ if (!np && (out_of_time || sleep))
9135+ job_completion(entry->scheduled);
9136+
9137+ /* Stop real-time tasks when we leave real-time mode
9138+ */
9139+ if (!rt && entry->linked) {
9140+ /* task will be preempted once it is preemptable
9141+ * (which it may be already)
9142+ */
9143+ linked = entry->linked;
9144+ unlink(linked);
9145+ requeue(linked);
9146+ }
9147+
9148+ /* Link pending task if we became unlinked.
9149+ */
9150+ if (rt && !entry->linked)
9151+ link_task_to_cpu(__take_ready(&gsnedf), entry);
9152+
9153+ /* The final scheduling decision. Do we need to switch for some reason?
9154+	 * If linked is different from scheduled, select linked as next.
9155+ */
9156+ if ((!np || blocks) &&
9157+ entry->linked != entry->scheduled) {
9158+ /* Take care of a previously scheduled
9159+ * job by taking it out of the Linux runqueue.
9160+ */
9161+ if (entry->scheduled) {
9162+ if (prev->array)
9163+ /* take it out of the run queue */
9164+ deactivate_task(prev, rq);
9165+ }
9166+
9167+ /* Schedule a linked job? */
9168+ if (entry->linked) {
9169+ *next = entry->linked;
9170+ /* mark the task as executing on this cpu */
9171+ set_task_cpu(*next, smp_processor_id());
9172+ /* stick the task into the runqueue */
9173+ __activate_task(*next, rq);
9174+ }
9175+ } else
9176+		/* Only override the Linux scheduler if we have a real-time task
9177+ * scheduled that needs to continue.
9178+ */
9179+ if (exists)
9180+ *next = prev;
9181+
9182+ /* Unlock in case that we don't affect real-time tasks or
9183+ * if nothing changed and finish_switch won't be called.
9184+ */
9185+ if (prev == *next || (!is_realtime(prev) && !*next))
9186+ queue_unlock(&gsnedf_lock);
9187+
9188+ return 0;
9189+}
9190+
9191+
9192+/* _finish_switch - we just finished the switch away from prev
9193+ */
9194+static void gsnedf_finish_switch(struct task_struct *prev)
9195+{
9196+ cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries);
9197+
9198+ if (is_realtime(current))
9199+ entry->scheduled = current;
9200+ else
9201+ entry->scheduled = NULL;
9202+
9203+ prev->rt_param.scheduled_on = NO_CPU;
9204+ current->rt_param.scheduled_on = smp_processor_id();
9205+
9206+ /* unlock in case schedule() left it locked */
9207+ if (is_realtime(current) || is_realtime(prev))
9208+ queue_unlock(&gsnedf_lock);
9209+}
9210+
9211+
9212+/* Prepare a task for running in RT mode
9213+ * Enqueues the task into master queue data structure
9214+ * returns
9215+ * -EPERM if task is not TASK_STOPPED
9216+ */
9217+static long gsnedf_prepare_task(struct task_struct * t)
9218+{
9219+ unsigned long flags;
9220+ TRACE("gsn edf: prepare task %d\n", t->pid);
9221+
9222+ if (t->state == TASK_STOPPED) {
9223+ __setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1);
9224+
9225+ t->rt_param.scheduled_on = NO_CPU;
9226+ t->rt_param.linked_on = NO_CPU;
9227+ if (get_rt_mode() == MODE_RT_RUN)
9228+ /* The action is already on.
9229+ * Prepare immediate release
9230+ */
9231+ edf_release_now(t);
9232+ /* The task should be running in the queue, otherwise signal
9233+ * code will try to wake it up with fatal consequences.
9234+ */
9235+ t->state = TASK_RUNNING;
9236+
9237+ queue_lock_irqsave(&gsnedf_lock, flags);
9238+ requeue(t);
9239+ queue_unlock_irqrestore(&gsnedf_lock, flags);
9240+ return 0;
9241+ }
9242+ else
9243+ return -EPERM;
9244+}
9245+
9246+static void gsnedf_wake_up_task(struct task_struct *task)
9247+{
9248+ unsigned long flags;
9249+ /* We must determine whether task should go into the release
9250+ * queue or into the ready queue. It may enter the ready queue
9251+ * if it has credit left in its time slice and has not yet reached
9252+	 * its deadline. If it is now past its deadline we assume this is the
9253+	 * arrival of a new sporadic job and thus put it in the ready queue
9254+	 * anyway. If it has zero budget and the next release is in the future
9255+ * it has to go to the release queue.
9256+ */
9257+ TRACE("gsnedf: %d unsuspends with budget=%d\n",
9258+ task->pid, task->time_slice);
9259+ task->state = TASK_RUNNING;
9260+
9261+ /* We need to take suspensions because of semaphores into
9262+ * account! If a job resumes after being suspended due to acquiring
9263+ * a semaphore, it should never be treated as a new job release.
9264+ */
9265+ if (get_rt_flags(task) == RT_F_EXIT_SEM) {
9266+ set_rt_flags(task, RT_F_RUNNING);
9267+ } else {
9268+ if (is_tardy(task)) {
9269+ /* new sporadic release */
9270+ edf_release_now(task);
9271+ sched_trace_job_release(task);
9272+ }
9273+ else if (task->time_slice)
9274+ /* came back in time before deadline
9275+ */
9276+ set_rt_flags(task, RT_F_RUNNING);
9277+ }
9278+
9279+ queue_lock_irqsave(&gsnedf_lock, flags);
9280+ gsnedf_job_arrival(task);
9281+ queue_unlock_irqrestore(&gsnedf_lock, flags);
9282+}
9283+
9284+static void gsnedf_task_blocks(struct task_struct *t)
9285+{
9286+ unsigned long flags;
9287+
9288+ /* unlink if necessary */
9289+ queue_lock_irqsave(&gsnedf_lock, flags);
9290+ unlink(t);
9291+ queue_unlock_irqrestore(&gsnedf_lock, flags);
9292+
9293+ BUG_ON(!is_realtime(t));
9294+ TRACE("task %d suspends with budget=%d\n", t->pid, t->time_slice);
9295+ BUG_ON(t->rt_list.next != LIST_POISON1);
9296+ BUG_ON(t->rt_list.prev != LIST_POISON2);
9297+}
9298+
9299+
9300+/* When _tear_down is called, the task should not be in any queue any more
9301+ * as it must have blocked first. We don't have any internal state for the task,
9302+ * it is all in the task_struct.
9303+ */
9304+static long gsnedf_tear_down(struct task_struct * t)
9305+{
9306+ BUG_ON(!is_realtime(t));
9307+ TRACE_TASK(t, "RIP\n");
9308+ BUG_ON(t->array);
9309+ BUG_ON(t->rt_list.next != LIST_POISON1);
9310+ BUG_ON(t->rt_list.prev != LIST_POISON2);
9311+ return 0;
9312+}
9313+
9314+static long gsnedf_pi_block(struct pi_semaphore *sem,
9315+ struct task_struct *new_waiter)
9316+{
9317+ /* This callback has to handle the situation where a new waiter is
9318+ * added to the wait queue of the semaphore.
9319+ *
9320+	 * We must check if it has a higher priority than the currently
9321+ * highest-priority task, and then potentially reschedule.
9322+ */
9323+
9324+ BUG_ON(!new_waiter);
9325+
9326+ if (edf_higher_prio(new_waiter, sem->hp.task)) {
9327+ TRACE_TASK(new_waiter, " boosts priority\n");
9328+ /* called with IRQs disabled */
9329+ queue_lock(&gsnedf_lock);
9330+ /* store new highest-priority task */
9331+ sem->hp.task = new_waiter;
9332+ if (sem->holder) {
9333+ /* let holder inherit */
9334+ sem->holder->rt_param.inh_task = new_waiter;
9335+ unlink(sem->holder);
9336+ gsnedf_job_arrival(sem->holder);
9337+ }
9338+ queue_unlock(&gsnedf_lock);
9339+ }
9340+
9341+ return 0;
9342+}
9343+
9344+static long gsnedf_inherit_priority(struct pi_semaphore *sem,
9345+ struct task_struct *new_owner)
9346+{
9347+ /* We don't need to acquire the gsnedf_lock since at the time of this
9348+ * call new_owner isn't actually scheduled yet (it's still sleeping)
9349+ * and since the calling function already holds sem->wait.lock, which
9350+ * prevents concurrent sem->hp.task changes.
9351+ */
9352+
9353+ if (sem->hp.task && sem->hp.task != new_owner) {
9354+ new_owner->rt_param.inh_task = sem->hp.task;
9355+ TRACE_TASK(new_owner, "inherited priority from %s/%d\n",
9356+ sem->hp.task->comm, sem->hp.task->pid);
9357+ } else
9358+ TRACE_TASK(new_owner,
9359+ "cannot inherit priority, "
9360+ "no higher priority job waits.\n");
9361+ return 0;
9362+}
9363+
9364+/* This function is called on a semaphore release, and assumes that
9365+ * the current task is also the semaphore holder.
9366+ */
9367+static long gsnedf_return_priority(struct pi_semaphore *sem)
9368+{
9369+ struct task_struct* t = current;
9370+ int ret = 0;
9371+
9372+ /* Find new highest-priority semaphore task
9373+ * if holder task is the current hp.task.
9374+ *
9375+ * Calling function holds sem->wait.lock.
9376+ */
9377+ if (t == sem->hp.task)
9378+ edf_set_hp_task(sem);
9379+
9380+ TRACE_CUR("gsnedf_return_priority for lock %p\n", sem);
9381+
9382+ if (t->rt_param.inh_task) {
9383+ /* interrupts already disabled by PI code */
9384+ queue_lock(&gsnedf_lock);
9385+
9386+ /* Reset inh_task to NULL. */
9387+ t->rt_param.inh_task = NULL;
9388+
9389+ /* Check if rescheduling is necessary */
9390+ unlink(t);
9391+ gsnedf_job_arrival(t);
9392+ queue_unlock(&gsnedf_lock);
9393+ }
9394+
9395+ return ret;
9396+}
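The three callbacks above implement one hand-off: gsnedf_pi_block() records the highest-priority waiter and lets the holder inherit it, gsnedf_inherit_priority() passes that priority to the next owner, and gsnedf_return_priority() drops it again. A much simplified userspace model of that hand-off is sketched below; the structs are stand-ins for task_struct and pi_semaphore, not the real kernel types.

#include <stdio.h>
#include <stddef.h>

struct task { const char *name; long deadline; struct task *inh_task; };
struct sem  { struct task *holder; struct task *hp_task; };

/* EDF order: earlier deadline means higher priority; NULL loses to anything */
static int higher_prio(const struct task *a, const struct task *b)
{
	return a && (!b || a->deadline < b->deadline);
}

/* models gsnedf_pi_block(): a new waiter may boost the current holder */
static void pi_block(struct sem *s, struct task *waiter)
{
	if (higher_prio(waiter, s->hp_task)) {
		s->hp_task = waiter;
		if (s->holder)
			s->holder->inh_task = waiter;
	}
}

/* models gsnedf_return_priority(): drop inherited priority on release */
static void return_priority(struct task *t)
{
	t->inh_task = NULL;
}

int main(void)
{
	struct task holder = { "holder", 100, NULL };
	struct task waiter = { "waiter",  10, NULL };
	struct sem  s      = { &holder, NULL };

	pi_block(&s, &waiter);
	printf("holder inherits: %s\n",
	       holder.inh_task ? holder.inh_task->name : "none");
	return_priority(&holder);
	return 0;
}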
9397+
9398+static int gsnedf_mode_change(int new_mode)
9399+{
9400+ unsigned long flags;
9401+ int cpu;
9402+ cpu_entry_t *entry;
9403+
9404+ if (new_mode == MODE_RT_RUN) {
9405+ queue_lock_irqsave(&gsnedf_lock, flags);
9406+
9407+ __rerelease_all(&gsnedf, edf_release_at);
9408+
9409+ /* get old cruft out of the way in case we reenter real-time
9410+ * mode for a second time
9411+ */
9412+ while (!list_empty(&gsnedf_cpu_queue))
9413+ list_del(gsnedf_cpu_queue.next);
9414+ /* reinitialize */
9415+ for_each_online_cpu(cpu) {
9416+ entry = &per_cpu(gsnedf_cpu_entries, cpu);
9417+ atomic_set(&entry->will_schedule, 0);
9418+ entry->linked = NULL;
9419+ entry->scheduled = NULL;
9420+ list_add(&entry->list, &gsnedf_cpu_queue);
9421+ }
9422+
9423+ queue_unlock_irqrestore(&gsnedf_lock, flags);
9424+
9425+ }
9426+ return 0;
9427+}
9428+
9429+
9430+/* Plugin object */
9431+static sched_plugin_t s_plugin __cacheline_aligned_in_smp = {
9432+ .ready_to_use = 0
9433+};
9434+
9435+
9436+/*
9437+ * Plugin initialization code.
9438+ */
9439+#define INIT_SCHED_PLUGIN (struct sched_plugin){ \
9440+ .plugin_name = "GSN-EDF", \
9441+ .ready_to_use = 1, \
9442+ .scheduler_tick = gsnedf_scheduler_tick, \
9443+ .prepare_task = gsnedf_prepare_task, \
9444+ .sleep_next_period = edf_sleep_next_period, \
9445+ .tear_down = gsnedf_tear_down, \
9446+ .schedule = gsnedf_schedule, \
9447+ .finish_switch = gsnedf_finish_switch, \
9448+ .mode_change = gsnedf_mode_change, \
9449+ .wake_up_task = gsnedf_wake_up_task, \
9450+ .task_blocks = gsnedf_task_blocks, \
9451+ .inherit_priority = gsnedf_inherit_priority, \
9452+ .return_priority = gsnedf_return_priority, \
9453+ .pi_block = gsnedf_pi_block \
9454+}
9455+
9456+
9457+sched_plugin_t *__init init_gsn_edf_plugin(void)
9458+{
9459+ int cpu;
9460+ cpu_entry_t *entry;
9461+
9462+ if (!s_plugin.ready_to_use)
9463+ {
9464+ /* initialize CPU state */
9465+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
9466+ entry = &per_cpu(gsnedf_cpu_entries, cpu);
9467+ atomic_set(&entry->will_schedule, 0);
9468+ entry->linked = NULL;
9469+ entry->scheduled = NULL;
9470+ entry->cpu = cpu;
9471+ }
9472+
9473+ queue_lock_init(&gsnedf_lock);
9474+ edf_domain_init(&gsnedf, NULL);
9475+ s_plugin = INIT_SCHED_PLUGIN;
9476+ }
9477+ return &s_plugin;
9478+}
9479+
9480+
9481diff --git a/kernel/sched_part_edf.c b/kernel/sched_part_edf.c
9482new file mode 100644
9483index 0000000..a792ac5
9484--- /dev/null
9485+++ b/kernel/sched_part_edf.c
9486@@ -0,0 +1,340 @@
9487+/*
9488+ * kernel/sched_part_edf.c
9489+ *
9490+ * Implementation of the partitioned EDF scheduler plugin.
9491+ */
9492+
9493+#include <linux/percpu.h>
9494+#include <linux/sched.h>
9495+#include <linux/list.h>
9496+#include <linux/spinlock.h>
9497+
9498+#include <linux/litmus.h>
9499+#include <linux/sched_plugin.h>
9500+#include <linux/edf_common.h>
9501+
9502+
9503+typedef struct {
9504+ rt_domain_t domain;
9505+ int cpu;
9506+ struct task_struct* scheduled; /* only RT tasks */
9507+ spinlock_t lock;
9508+} part_edf_domain_t;
9509+
9510+
9511+#define local_edf (&__get_cpu_var(part_edf_domains).domain)
9512+#define local_pedf (&__get_cpu_var(part_edf_domains))
9513+#define remote_edf(cpu) (&per_cpu(part_edf_domains, cpu).domain)
9514+#define remote_pedf(cpu) (&per_cpu(part_edf_domains, cpu))
9515+#define task_edf(task) remote_edf(get_partition(task))
9516+
9517+static void part_edf_domain_init(part_edf_domain_t* pedf,
9518+ check_resched_needed_t check,
9519+ int cpu)
9520+{
9521+ edf_domain_init(&pedf->domain, check);
9522+ pedf->cpu = cpu;
9523+ pedf->lock = SPIN_LOCK_UNLOCKED;
9524+ pedf->scheduled = NULL;
9525+}
9526+
9527+DEFINE_PER_CPU(part_edf_domain_t, part_edf_domains);
9528+
9529+/* This check is trivial in partitioned systems as we only have to consider
9530+ * the CPU of the partition.
9531+ *
9532+ */
9533+static int part_edf_check_resched(rt_domain_t *edf)
9534+{
9535+ part_edf_domain_t *pedf = container_of(edf, part_edf_domain_t, domain);
9536+ int ret = 0;
9537+
9538+ spin_lock(&pedf->lock);
9539+
9540+ /* because this is a callback from rt_domain_t we already hold
9541+ * the necessary lock for the ready queue
9542+ */
9543+ if (edf_preemption_needed(edf, pedf->scheduled)) {
9544+ if (pedf->cpu == smp_processor_id())
9545+ set_tsk_need_resched(current);
9546+ else
9547+ smp_send_reschedule(pedf->cpu);
9548+ ret = 1;
9549+ }
9550+ spin_unlock(&pedf->lock);
9551+ return ret;
9552+}
9553+
9554+
9555+static reschedule_check_t part_edf_scheduler_tick(void)
9556+{
9557+ unsigned long flags;
9558+ struct task_struct *t = current;
9559+ reschedule_check_t want_resched = NO_RESCHED;
9560+ rt_domain_t *edf = local_edf;
9561+ part_edf_domain_t *pedf = local_pedf;
9562+
9563+ /* Check for inconsistency. We don't need the lock for this since
9564+ * ->scheduled is only changed in schedule, which obviously is not
9565+ * executing in parallel on this CPU
9566+ */
9567+ BUG_ON(is_realtime(t) && t != pedf->scheduled);
9568+
9569+ /* expire tasks even if not in real-time mode
9570+ * this makes sure that at the end of real-time mode
9571+ * no tasks "run away forever".
9572+ */
9573+ if (is_realtime(t) && (!--t->time_slice)) {
9574+ /* this task has exhausted its budget in this period */
9575+ set_rt_flags(t, RT_F_SLEEP);
9576+ want_resched = FORCE_RESCHED;
9577+ }
9578+ if (get_rt_mode() == MODE_RT_RUN)
9579+ {
9580+ /* check whether anything is waiting to be released
9581+ * this could probably be moved to the global timer
9582+ * interrupt handler since the state will only change
9583+ * once per jiffie
9584+ */
9585+ try_release_pending(edf);
9586+ if (want_resched != FORCE_RESCHED)
9587+ {
9588+ read_lock_irqsave(&edf->ready_lock, flags);
9589+ if (edf_preemption_needed(edf, t))
9590+ want_resched = FORCE_RESCHED;
9591+ read_unlock_irqrestore(&edf->ready_lock, flags);
9592+ }
9593+ }
9594+ return want_resched;
9595+}
9596+
9597+static int part_edf_schedule(struct task_struct * prev,
9598+ struct task_struct ** next,
9599+ runqueue_t * rq)
9600+{
9601+ int need_deactivate = 1;
9602+ part_edf_domain_t* pedf = local_pedf;
9603+ rt_domain_t* edf = &pedf->domain;
9604+
9605+
9606+ if (is_realtime(prev) && get_rt_flags(prev) == RT_F_SLEEP)
9607+ edf_prepare_for_next_period(prev);
9608+
9609+ if (get_rt_mode() == MODE_RT_RUN) {
9610+ write_lock(&edf->ready_lock);
9611+ if (is_realtime(prev) && is_released(prev) && is_running(prev)
9612+ && !edf_preemption_needed(edf, prev)) {
9613+ /* this really should only happen if the task has
9614+ * 100% utilization...
9615+ */
9616+ TRACE("prev will be next, already released\n");
9617+ *next = prev;
9618+ need_deactivate = 0;
9619+ } else {
9620+ /* either not yet released, preempted, or non-rt */
9621+ *next = __take_ready(edf);
9622+ if (*next) {
9623+ /* stick the task into the runqueue */
9624+ __activate_task(*next, rq);
9625+ set_task_cpu(*next, smp_processor_id());
9626+ }
9627+ }
9628+ spin_lock(&pedf->lock);
9629+ pedf->scheduled = *next;
9630+ spin_unlock(&pedf->lock);
9631+ if (*next)
9632+ set_rt_flags(*next, RT_F_RUNNING);
9633+
9634+ write_unlock(&edf->ready_lock);
9635+ }
9636+
9637+ if (is_realtime(prev) && need_deactivate && prev->array) {
9638+ /* take it out of the run queue */
9639+ deactivate_task(prev, rq);
9640+ }
9641+
9642+ return 0;
9643+}
9644+
9645+
9646+static void part_edf_finish_switch(struct task_struct *prev)
9647+{
9648+ rt_domain_t* edf = local_edf;
9649+
9650+ if (!is_realtime(prev) || !is_running(prev))
9651+ return;
9652+
9653+ if (get_rt_flags(prev) == RT_F_SLEEP ||
9654+ get_rt_mode() != MODE_RT_RUN) {
9655+		 * this task has expired.
9656+		 * _schedule has already taken care of updating
9657+		 * the release and the deadline.
9658+		 * We just have to check whether it has been released.
9659+ */
9660+ if (is_released(prev) && get_rt_mode() == MODE_RT_RUN) {
9661+ /* already released */
9662+ add_ready(edf, prev);
9663+ TRACE("%d goes straight to ready queue\n", prev->pid);
9664+ } else
9665+ /* it has got to wait */
9666+ add_release(edf, prev);
9667+ } else {
9668+		 * this is a forced preemption,
9669+		 * thus the task stays in the ready queue;
9670+		 * we only have to make it available to others
9671+ */
9672+ add_ready(edf, prev);
9673+ }
9674+}
9675+
9676+
9677+/* Prepare a task for running in RT mode
9678+ * Enqueues the task into master queue data structure
9679+ * returns
9680+ * -EPERM if task is not TASK_STOPPED
9681+ */
9682+static long part_edf_prepare_task(struct task_struct * t)
9683+{
9684+ rt_domain_t* edf = task_edf(t);
9685+
9686+
9687+ TRACE("[%d] part edf: prepare task %d on CPU %d\n",
9688+ smp_processor_id(), t->pid, get_partition(t));
9689+ if (t->state == TASK_STOPPED) {
9690+ __setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1);
9691+
9692+ if (get_rt_mode() == MODE_RT_RUN)
9693+ /* The action is already on.
9694+ * Prepare immediate release.
9695+ */
9696+ edf_release_now(t);
9697+ /* The task should be running in the queue, otherwise signal
9698+ * code will try to wake it up with fatal consequences.
9699+ */
9700+ t->state = TASK_RUNNING;
9701+ add_release(edf, t);
9702+ return 0;
9703+ } else
9704+ return -EPERM;
9705+}
9706+
9707+static void part_edf_wake_up_task(struct task_struct *task)
9708+{
9709+ rt_domain_t* edf;
9710+
9711+ edf = task_edf(task);
9712+
9713+ /* We must determine whether task should go into the release
9714+ * queue or into the ready queue. It may enter the ready queue
9715+ * if it has credit left in its time slice and has not yet reached
9716+	 * its deadline. If it is now past its deadline, we assume this is the
9717+	 * arrival of a new sporadic job and thus put it in the ready queue
9718+	 * anyway. If it has zero budget and the next release is in the future,
9719+ * it has to go to the release queue.
9720+ */
9721+ TRACE("part edf: wake up %d with budget=%d for cpu %d\n",
9722+ task->pid, task->time_slice, get_partition(task));
9723+ task->state = TASK_RUNNING;
9724+ if (is_tardy(task)) {
9725+ /* new sporadic release */
9726+ edf_release_now(task);
9727+ add_ready(edf, task);
9728+
9729+ } else if (task->time_slice) {
9730+		 * Came back in time before the deadline. This may cause
9731+		 * deadline overruns, but since we don't handle suspensions
9732+		 * in the analytical model, we can't guarantee
9733+		 * anything at all if tasks block.
9734+ */
9735+ set_rt_flags(task, RT_F_RUNNING);
9736+ add_ready(edf, task);
9737+
9738+ } else {
9739+ add_release(edf, task);
9740+ }
9741+
9742+}
9743+
9744+static void part_edf_task_blocks(struct task_struct *t)
9745+{
9746+ BUG_ON(!is_realtime(t));
9747+ /* not really anything to do since it can only block if
9748+ * it is running, and when it is not running it is not in any
9749+ * queue anyway.
9750+ *
9751+ */
9752+ TRACE("task %d blocks with budget=%d\n", t->pid, t->time_slice);
9753+ BUG_ON(in_list(&t->rt_list));
9754+}
9755+
9756+
9757+/* When _tear_down is called, the task should not be in any queue any more
9758+ * as it must have blocked first. We don't have any internal state for the task,
9759+ * it is all in the task_struct.
9760+ */
9761+static long part_edf_tear_down(struct task_struct * t)
9762+{
9763+ BUG_ON(!is_realtime(t));
9764+ TRACE("part edf: tear down called for %d \n", t->pid);
9765+ BUG_ON(t->array);
9766+ BUG_ON(in_list(&t->rt_list));
9767+ return 0;
9768+}
9769+
9770+
9771+static int part_edf_mode_change(int new_mode)
9772+{
9773+ int cpu;
9774+
9775+ if (new_mode == MODE_RT_RUN)
9776+ for_each_online_cpu(cpu)
9777+ rerelease_all(remote_edf(cpu), edf_release_at);
9778+ TRACE("[%d] part edf: mode changed to %d\n",
9779+ smp_processor_id(), new_mode);
9780+ return 0;
9781+}
9782+
9783+
9784+/* Plugin object */
9785+static sched_plugin_t s_plugin __cacheline_aligned_in_smp = {
9786+ .ready_to_use = 0
9787+};
9788+
9789+
9790+/*
9791+ * Plugin initialization code.
9792+ */
9793+#define INIT_SCHED_PLUGIN (struct sched_plugin) {\
9794+ .plugin_name = "Partitioned EDF",\
9795+ .ready_to_use = 1,\
9796+ .scheduler_tick = part_edf_scheduler_tick,\
9797+ .prepare_task = part_edf_prepare_task,\
9798+ .sleep_next_period = edf_sleep_next_period,\
9799+ .tear_down = part_edf_tear_down,\
9800+ .schedule = part_edf_schedule,\
9801+ .finish_switch = part_edf_finish_switch,\
9802+ .mode_change = part_edf_mode_change,\
9803+ .wake_up_task = part_edf_wake_up_task,\
9804+ .task_blocks = part_edf_task_blocks \
9805+}
9806+
9807+
9808+sched_plugin_t *__init init_part_edf_plugin(void)
9809+{
9810+ int i;
9811+
9812+ if (!s_plugin.ready_to_use)
9813+ {
9814+ for (i = 0; i < NR_CPUS; i++)
9815+ {
9816+ part_edf_domain_init(remote_pedf(i),
9817+ part_edf_check_resched, i);
9818+			printk("CPU partition %d initialized.\n", i);
9819+ }
9820+ s_plugin = INIT_SCHED_PLUGIN;
9821+ }
9822+ return &s_plugin;
9823+}
9824+
9825+
9826+
9827diff --git a/kernel/sched_pfair.c b/kernel/sched_pfair.c
9828new file mode 100644
9829index 0000000..1a6a790
9830--- /dev/null
9831+++ b/kernel/sched_pfair.c
9832@@ -0,0 +1,503 @@
9833+/*
9834+ *
9835+ * Implementation of the synchronized PFAIR PD2 scheduler
9836+ *
9837+ */
9838+
9839+#include <linux/percpu.h>
9840+#include <linux/sched.h>
9841+#include <linux/list.h>
9842+
9843+#include <linux/litmus.h>
9844+#include <linux/sched_plugin.h>
9845+#include <linux/pfair_common.h>
9846+#include <linux/sched_trace.h>
9847+#include <linux/queuelock.h>
9848+
9849+struct cpu_state {
9850+ struct task_struct * t;
9851+ volatile jiffie_t jiffie_marker;
9852+};
9853+/* PFAIR scheduling domain, release and ready queues */
9854+static pfair_domain_t pfair __cacheline_aligned_in_smp;
9855+
9856+/* An indicator that quantum boundary was crossed
9857+ * and a decision has to be made
9858+ */
9859+static int sync_go[NR_CPUS];
9860+
9861+
9862+/* A collection of CPU states protected by pfair lock */
9863+DEFINE_PER_CPU(struct cpu_state, states);
9864+
9865+/*
9866+ * This function gets called by the timer code, with HZ frequency
9867+ * with interrupts disabled.
9868+ *
9869+ * The function merges the release queue with the ready queue
9870+ * and indicates that a quantum boundary was crossed.
9871+ *
9872+ * It also suggests scheduling off the currently running
9873+ * real-time task if the mode is non-real-time.
9874+ */
9875+static reschedule_check_t pfair_scheduler_tick(void)
9876+{
9877+ int want_resched = NO_RESCHED;
9878+ sync_go[smp_processor_id()] = 0;
9879+ if (!cpu_isset(smp_processor_id(), pfair.domain_cpus))
9880+ goto out;
9881+ /* Now determine if we want current task to be preempted */
9882+ if (get_rt_mode() == MODE_RT_RUN) {
9883+ pfair_try_release_pending(&pfair);
9884+ want_resched = FORCE_RESCHED;
9885+ /* indicate that the interrupt fired */
9886+ sync_go[smp_processor_id()] = 1;
9887+ barrier();
9888+ } else if (is_realtime(current) && is_running(current)) {
9889+ /* In non real-time mode we want to
9890+ * schedule off real-time tasks */
9891+ want_resched = FORCE_RESCHED;
9892+ } else if (is_realtime(current) && !is_running(current)) {
9893+		TRACE("[%d] %d Timer interrupt on not running %d\n",
9894+ smp_processor_id(),
9895+ jiffies-rt_start_time, current->pid);
9896+ }
9897+out:
9898+ return want_resched;
9899+}
9900+
9901+/**
9902+ * This function is called by the processor
9903+ * that performs rescheduling. It saves the timing
9904+ * parameters of currently running jobs that have not been rescheduled yet
9905+ * and releases the next subtask for these jobs, placing them into
9906+ * the release and ready queues.
9907+ */
9908+static void pretend_release(cpumask_t p)
9909+{
9910+ int i = 0;
9911+ struct task_struct * t = NULL;
9912+ /* for all the tasks increment the number of used quanta
9913+ * and release next subtask or job depending on the number
9914+ * of used quanta
9915+ */
9916+ for_each_cpu_mask(i, p) {
9917+ t = per_cpu(states, i).t;
9918+ if (t != NULL) {
9919+ backup_times(t);
9920+ inc_passed_quanta(t);
9921+ if ( get_passed_quanta(t) == get_exec_cost(t)) {
9922+ pfair_prepare_next_job(t);
9923+ } else {
9924+ pfair_prepare_next_subtask(t);
9925+ }
9926+ /*
9927+ TRACE("[%d] %d pretending release %d with (%d, %d)\n",
9928+ smp_processor_id(),
9929+ jiffies-rt_start_time,t->pid,
9930+ get_release(t)-rt_start_time,
9931+ get_deadline(t)-rt_start_time);*/
9932+ /* detect if the job or subtask has to be released now*/
9933+ if (time_before_eq(get_release(t), jiffies))
9934+ pfair_add_ready(&pfair, t);
9935+ else
9936+ pfair_add_release(&pfair, t);
9937+ }
9938+ }
9939+}
9940+/*
9941+ * Roll back the pretended release of tasks.
9942+ * Timing parameters are restored and tasks are removed
9943+ * from the queues, as they were before schedule() was called.
9944+ *
9945+ */
9946+static void rollback_release(cpumask_t p)
9947+{
9948+ int i = -1;
9949+ struct task_struct * t = NULL;
9950+ /*
9951+ * Rollback the pretended changes
9952+ */
9953+ for_each_cpu_mask(i, p) {
9954+ t = per_cpu(states, i).t;
9955+ if (t != NULL) {
9956+ restore_times(t);
9957+ if(t->rt_list.prev != LIST_POISON1 ||
9958+ t->rt_list.next != LIST_POISON2) {
9959+ /* Delete the task from a queue */
9960+ list_del(&t->rt_list);
9961+ }
9962+ }
9963+ }
9964+}
9965+
9966+/*
9967+ * The procedure creates a list of CPUs whose tasks have not been
9968+ * rescheduled yet. These are CPUs whose jiffie marker differs from
9969+ * the current value of jiffies.
9970+ */
9971+static void find_participants(cpumask_t * target)
9972+{
9973+	cpumask_t res; int i;
9974+ cpus_clear(res);
9975+ for_each_online_cpu(i) {
9976+ if(per_cpu(states, i).jiffie_marker != jiffies)
9977+ cpu_set(i, res);
9978+ }
9979+ /* Examine only cpus in the domain */
9980+ cpus_and(res, pfair.domain_cpus, res);
9981+ (*target) = res;
9982+}
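The marker scheme used here can be pictured with a small standalone sketch: a CPU that has already made its decision in the current quantum stamps its jiffie marker with the current jiffies value, and only CPUs with a stale marker take part in the next selection. Plain arrays stand in for cpumask_t and the per-CPU state below; the names are illustrative only.

#include <stdio.h>

#define NCPUS 4

/* A CPU participates if its marker was not stamped in the current jiffy. */
static void pick_participants(const unsigned long marker[NCPUS],
                              unsigned long jiffies_now,
                              int participates[NCPUS])
{
	int i;
	for (i = 0; i < NCPUS; i++)
		participates[i] = (marker[i] != jiffies_now);
}

int main(void)
{
	unsigned long jiffies_now = 1000;
	unsigned long marker[NCPUS] = { 1000, 999, 1000, 998 };
	int participates[NCPUS];
	int i;

	pick_participants(marker, jiffies_now, participates);
	for (i = 0; i < NCPUS; i++)
		printf("cpu %d participates: %d\n", i, participates[i]);
	return 0;
}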
9983+
9984+/*
9985+ * This is the main PFAIR schedule function:
9986+ * each processor pretends that some currently running tasks are
9987+ * released in the next quantum and determines whether it should
9988+ * keep the task that is currently running (this is usually the case
9989+ * for heavy tasks).
9990+*/
9991+static int pfair_schedule(struct task_struct *prev,
9992+ struct task_struct **next,
9993+ runqueue_t * rq)
9994+{
9995+ int cpu =-1;
9996+ int k =-1;
9997+ int need_deactivate = 1;
9998+ int keep =0;
9999+ unsigned long flags;
10000+ cpumask_t participants;
10001+ /* A temporary array */
10002+ struct task_struct * rs_old_ptr[NR_CPUS];
10003+
10004+ *next = NULL;
10005+ cpu = smp_processor_id();
10006+ /* CPU's not in the domain just bypass */
10007+ if (!cpu_isset(cpu, pfair.domain_cpus)) {
10008+ goto out;
10009+ }
10010+ queue_lock_irqsave(&pfair.pfair_lock, flags);
10011+
10012+ /* If we happen to run in non-realtime mode
10013+ * then we have to schedule off currently running tasks
10014+ * */
10015+ if (get_rt_mode() != MODE_RT_RUN) {
10016+ if (is_realtime(prev)) {
10017+ per_cpu(states, cpu).t = NULL;
10018+ TRACE("[%d] %d Suspending %d\n",
10019+ cpu, jiffies - rt_start_time,
10020+ prev->pid);
10021+ /* Move the task to the
10022+ * release queue for future runs
10023+ * FIXME: Do something smarter.
10024+ * For example create a set where
10025+ * prepared or inactive tasks are placed
10026+ * and then released.
10027+ * */
10028+ set_release(prev, get_release(prev) + 1000);
10029+ pfair_add_release(&pfair, prev);
10030+ }
10031+ goto out_deactivate;
10032+ }
10033+ /* If the current task stops or dies */
10034+ if (is_realtime(prev) && !is_running(prev)) {
10035+ /* remove it from the running set */
10036+ per_cpu(states, cpu).t = NULL;
10037+ }
10038+ /* Make pfair decisions at quantum boundaries only,
10039+ * but schedule off stopped or dead tasks */
10040+
10041+ if ((sync_go[cpu]--) != 1)
10042+ goto out_deactivate;
10043+
10044+ /*TRACE("[%d] %d Scheduler activation", cpu, jiffies-rt_start_time);
10045+ cpus_and(res, pfair.domain_cpus, cpu_online_map);
10046+ for_each_cpu_mask(k, res) {
10047+ TRACE("%d" ,(per_cpu(states, k).jiffie_marker!=jiffies));
10048+ }
10049+ TRACE("\n");*/
10050+
10051+ /* Find processors that have not rescheduled yet */
10052+ find_participants(&participants);
10053+ /* For each task on remote cpu's pretend release */
10054+ pretend_release(participants);
10055+ /* Clear temporary array */
10056+ for_each_possible_cpu(k) { rs_old_ptr[k] = NULL; }
10057+ /* Select a new subset of eligible tasks */
10058+ for_each_cpu_mask(k, participants) {
10059+ rs_old_ptr[k] = __pfair_take_ready (&pfair);
10060+ /* Check if our current task must be scheduled in the next quantum */
10061+ if (rs_old_ptr[k] == per_cpu(states, cpu).t) {
10062+ /* this is our current task, keep it */
10063+ *next = per_cpu(states, cpu).t;
10064+ need_deactivate = 0;
10065+ keep = 1;
10066+ break;
10067+ }
10068+ }
10069+ /* Put all the extracted tasks back into the ready queue */
10070+ for_each_cpu_mask(k, participants) {
10071+ if (rs_old_ptr[k] != NULL){
10072+ pfair_add_ready(&pfair, rs_old_ptr[k]);
10073+ rs_old_ptr[k] = NULL;
10074+ }
10075+ }
10076+ /* Rollback the pretended release,
10077+ * task parameters are restored and running tasks are removed
10078+ * from queues */
10079+ rollback_release(participants);
10080+ /*
10081+ * If the current task is not scheduled in the next quantum
10082+ * then select a new pfair task
10083+ */
10084+ if(!keep) {
10085+ *next = per_cpu(states, cpu).t = __pfair_take_ready(&pfair);
10086+ if (*next != NULL) {
10087+ /*TRACE("[%d] %d Scheduling %d with (%d, %d)\n",
10088+ cpu, jiffies-rt_start_time,
10089+ get_release(*next),
10090+ get_deadline(*next));
10091+ */
10092+ set_task_cpu(*next, cpu);
10093+ __activate_task(*next, rq);
10094+ }
10095+ } else {
10096+ if (is_realtime(prev)) {
10097+ /*TRACE("[%d] %d prev==next %d\n",
10098+ cpu,jiffies-rt_start_time,
10099+ (prev)->pid);*/
10100+
10101+ /* The task will not be switched off but we
10102+ * need to track the execution time
10103+ */
10104+ inc_passed_quanta(prev);
10105+ }
10106+ }
10107+
10108+ /*Show that our task does not participate in subsequent selections*/
10109+ __get_cpu_var(states).jiffie_marker = jiffies;
10110+
10111+out_deactivate:
10112+ if ( is_realtime(prev) && need_deactivate && prev->array) {
10113+ /* take prev out of the linux run queue */
10114+ deactivate_task(prev, rq);
10115+ }
10116+ queue_unlock_irqrestore(&pfair.pfair_lock, flags);
10117+out:
10118+ return 0;
10119+}
10120+
10121+static void pfair_finish_task_switch(struct task_struct *t)
10122+{
10123+ if (!is_realtime(t) || !is_running(t))
10124+ return;
10125+
10126+ queue_lock(&pfair.pfair_lock);
10127+ /* Release in real-time mode only,
10128+ * if the mode is non real-time, then
10129+ * the task is already in the release queue
10130+ * with the time far in the future
10131+ */
10132+ if (get_rt_mode() == MODE_RT_RUN) {
10133+ inc_passed_quanta(t);
10134+ if ( get_passed_quanta(t) == get_exec_cost(t)) {
10135+ sched_trace_job_completion(t);
10136+ pfair_prepare_next_job(t);
10137+ } else {
10138+ pfair_prepare_next_subtask(t);
10139+ }
10140+ /*TRACE("[%d] %d releasing %d with (%d, %d)\n",
10141+ smp_processor_id(),
10142+ jiffies-rt_start_time,
10143+ t->pid,
10144+ get_release(t)-rt_start_time,
10145+ get_deadline(t)-rt_start_time);*/
10146+ if (time_before_eq(get_release(t), jiffies))
10147+ pfair_add_ready(&pfair, t);
10148+ else
10149+ pfair_add_release(&pfair, t);
10150+ }
10151+ queue_unlock(&pfair.pfair_lock);
10152+}
10153+
10154+/* Prepare a task for running in RT mode
10155+ * Enqueues the task into master queue data structure
10156+ * returns
10157+ * -EPERM if task is not TASK_STOPPED
10158+ */
10159+static long pfair_prepare_task(struct task_struct * t)
10160+{
10161+ unsigned long flags;
10162+ TRACE("pfair: prepare task %d\n", t->pid);
10163+ if (t->state == TASK_STOPPED) {
10164+ __setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1);
10165+
10166+ if (get_rt_mode() == MODE_RT_RUN)
10167+ /* The action is already on.
10168+ * Prepare immediate release
10169+ */
10170+ __pfair_prepare_new_release(t, jiffies);
10171+ /* The task should be running in the queue, otherwise signal
10172+ * code will try to wake it up with fatal consequences.
10173+ */
10174+ t->state = TASK_RUNNING;
10175+ queue_lock_irqsave(&pfair.pfair_lock, flags);
10176+ pfair_add_release(&pfair, t);
10177+ queue_unlock_irqrestore(&pfair.pfair_lock, flags);
10178+ return 0;
10179+ } else
10180+ return -EPERM;
10181+}
10182+
10183+
10184+
10185+static void pfair_wake_up_task(struct task_struct *task)
10186+{
10187+
10188+ unsigned long flags;
10189+
10190+ /* We must determine whether task should go into the release
10191+ * queue or into the ready queue.
10192+ * The task enters the ready queue if the previous deadline was missed,
10193+ * so we treat the invoked job as a new sporadic release.
10194+ *
10195+ * The job can also enter the ready queue if it was invoked before its
10196+	 * global deadline, but its budget must be clipped down to one quantum
10197+ */
10198+ task->state = TASK_RUNNING;
10199+ if (time_after_eq(jiffies, task->rt_param.times.last_release
10200+ + get_rt_period(task))) {
10201+ /* new sporadic release */
10202+ TRACE("[%d] Sporadic release of %d at %d\n",
10203+ smp_processor_id(),
10204+ jiffies-rt_start_time,
10205+ task->pid);
10206+ __pfair_prepare_new_release(task, jiffies);
10207+ queue_lock_irqsave(&pfair.pfair_lock, flags);
10208+ sched_trace_job_release(task);
10209+ pfair_add_ready(&pfair, task);
10210+ queue_unlock_irqrestore(&pfair.pfair_lock, flags);
10211+ } else if (task->time_slice) {
10212+ /* came back in time before deadline
10213+ * clip the budget to be the last subtask of a job or
10214+ * the new job.
10215+ */
10216+ task->rt_param.times.exec_time = get_exec_cost(task) - 1;
10217+ if (task->rt_param.times.exec_time == 0) {
10218+ pfair_prepare_next_job(task);
10219+ } else {
10220+ pfair_prepare_next_subtask(task);
10221+ }
10222+ TRACE("[%d] %d Resume of %d with %d, %d, %d\n",
10223+ smp_processor_id(), jiffies-rt_start_time,
10224+ task->pid, get_release(task)-rt_start_time,
10225+ get_deadline(task)-rt_start_time,
10226+ get_passed_quanta(task));
10227+
10228+ set_rt_flags(task, RT_F_RUNNING);
10229+ queue_lock_irqsave(&pfair.pfair_lock, flags);
10230+ sched_trace_job_release(task);
10231+ if (time_after_eq(jiffies, get_release(task))) {
10232+ pfair_add_ready(&pfair, task);
10233+ } else {
10234+ pfair_add_release(&pfair, task);
10235+ }
10236+ queue_unlock_irqrestore(&pfair.pfair_lock, flags);
10237+
10238+ } else {
10239+ TRACE("[%d] %d Strange release of %d with %d, %d, %d\n",
10240+ smp_processor_id(), jiffies-rt_start_time,
10241+ task->pid,
10242+ get_release(task), get_deadline(task),
10243+ get_passed_quanta(task));
10244+
10245+ queue_lock_irqsave(&pfair.pfair_lock, flags);
10246+ pfair_add_release(&pfair, task);
10247+ queue_unlock_irqrestore(&pfair.pfair_lock, flags);
10248+ }
10249+}
10250+
10251+
10252+static void pfair_task_blocks(struct task_struct *t)
10253+{
10254+ unsigned long flags;
10255+ int i;
10256+ cpumask_t res;
10257+ BUG_ON(!is_realtime(t));
10258+ /* If the task blocks, then it must be removed from the running set */
10259+ queue_lock_irqsave(&pfair.pfair_lock, flags);
10260+ cpus_and(res,pfair.domain_cpus, cpu_online_map);
10261+ for_each_cpu_mask(i, res) {
10262+ if (per_cpu(states, i).t == t)
10263+ per_cpu(states, i).t = NULL;
10264+ }
10265+	/* If the task is running and is in some
10266+	 * list, it might have been released by another
10267+	 * processor.
10268+ */
10269+ if((t->rt_list.next != LIST_POISON1 ||
10270+ t->rt_list.prev != LIST_POISON2)) {
10271+ TRACE("[%d] %d task %d is deleted from the list\n",
10272+ smp_processor_id(),
10273+ jiffies-rt_start_time, t->pid);
10274+ list_del(&t->rt_list);
10275+ }
10276+ queue_unlock_irqrestore(&pfair.pfair_lock, flags);
10277+ TRACE("[%d] %d task %d blocks with budget=%d state=%d\n",
10278+ smp_processor_id(), jiffies-rt_start_time,
10279+ t->pid, t->time_slice, t->state);
10280+}
10281+
10282+static long pfair_tear_down(struct task_struct * t)
10283+{
10284+ BUG_ON(!is_realtime(t));
10285+ TRACE("pfair: tear down called for %d \n", t->pid);
10286+ BUG_ON(t->array);
10287+ BUG_ON(t->rt_list.next != LIST_POISON1);
10288+ BUG_ON(t->rt_list.prev != LIST_POISON2);
10289+ return 0;
10290+}
10291+
10292+static int pfair_mode_change(int new_mode)
10293+{
10294+ printk(KERN_INFO "[%d] pfair mode change %d\n",
10295+ smp_processor_id(), new_mode);
10296+ if (new_mode == MODE_RT_RUN) {
10297+ pfair_prepare_new_releases(&pfair, jiffies + 10);
10298+ }
10299+ printk(KERN_INFO "[%d] pfair: mode change done\n", smp_processor_id());
10300+ return 0;
10301+}
10302+
10303+/* Plugin object */
10304+static sched_plugin_t s_plugin __cacheline_aligned_in_smp = {
10305+ .ready_to_use = 0
10306+};
10307+/*
10308+* PFAIR plugin initialization macro.
10309+*/
10310+#define INIT_PFAIR_PLUGIN (struct sched_plugin){\
10311+ .plugin_name = "PFAIR",\
10312+ .ready_to_use = 1,\
10313+ .scheduler_tick = pfair_scheduler_tick,\
10314+ .prepare_task = pfair_prepare_task,\
10315+ .tear_down = pfair_tear_down,\
10316+ .schedule = pfair_schedule,\
10317+ .finish_switch = pfair_finish_task_switch,\
10318+ .mode_change = pfair_mode_change,\
10319+ .wake_up_task = pfair_wake_up_task,\
10320+ .task_blocks = pfair_task_blocks \
10321+ }
10322+
10323+sched_plugin_t* __init init_pfair_plugin(void)
10324+{
10325+ int i=0;
10326+ if (!s_plugin.ready_to_use) {
10327+ pfair_domain_init(&pfair);
10328+ for (i=0; i<NR_CPUS; i++) {
10329+ sync_go[i] = 0;
10330+ per_cpu(states, i).t = NULL;
10331+ }
10332+ s_plugin = INIT_PFAIR_PLUGIN;
10333+ }
10334+ return &s_plugin;
10335+}
10336diff --git a/kernel/sched_plugin.c b/kernel/sched_plugin.c
10337new file mode 100644
10338index 0000000..1f759b7
10339--- /dev/null
10340+++ b/kernel/sched_plugin.c
10341@@ -0,0 +1,108 @@
10342+/* sched_plugin.c -- core infrastructure for the scheduler plugin system
10343+ *
10344+ * This file includes the initialization of the plugin system, the no-op Linux
10345+ * scheduler plugin and some dummy functions.
10346+ */
10347+
10348+
10349+#include <linux/litmus.h>
10350+#include <linux/sched_plugin.h>
10351+
10352+
10353+/*************************************************************
10354+ * Dummy plugin functions *
10355+ *************************************************************/
10356+
10357+void litmus_dummy_finish_switch(struct task_struct * prev)
10358+{
10359+}
10360+
10361+int litmus_dummy_schedule(struct task_struct * prev,
10362+ struct task_struct** next,
10363+ runqueue_t* q)
10364+{
10365+ return 0;
10366+}
10367+
10368+reschedule_check_t litmus_dummy_scheduler_tick(void)
10369+{
10370+ return NO_RESCHED;
10371+}
10372+
10373+
10374+long litmus_dummy_prepare_task(struct task_struct *t)
10375+{
10376+ return 0;
10377+}
10378+
10379+void litmus_dummy_wake_up_task(struct task_struct *task)
10380+{
10381+ printk(KERN_WARNING "task %d: unhandled real-time wake up!\n",
10382+ task->pid);
10383+}
10384+
10385+void litmus_dummy_task_blocks(struct task_struct *task)
10386+{
10387+}
10388+
10389+long litmus_dummy_tear_down(struct task_struct *task)
10390+{
10391+ return 0;
10392+}
10393+
10394+int litmus_dummy_scheduler_setup(int cmd, void __user *parameter)
10395+{
10396+ return -EPERM;
10397+}
10398+
10399+long litmus_dummy_sleep_next_period(void)
10400+{
10401+ return -EPERM;
10402+}
10403+
10404+long litmus_dummy_inherit_priority(struct pi_semaphore *sem,
10405+ struct task_struct *new_owner)
10406+{
10407+ return -EPERM;
10408+}
10409+
10410+long litmus_dummy_return_priority(struct pi_semaphore *sem)
10411+{
10412+ return -EPERM;
10413+}
10414+
10415+long litmus_dummy_pi_block(struct pi_semaphore *sem,
10416+ struct task_struct *new_waiter)
10417+{
10418+ return -EPERM;
10419+}
10420+
10421+
10422+/* The default scheduler plugin. It doesn't do anything and lets Linux do its
10423+ * job.
10424+ */
10425+
10426+sched_plugin_t linux_sched_plugin = {
10427+ .plugin_name = "Linux",
10428+ .ready_to_use = 1,
10429+ .scheduler_tick = litmus_dummy_scheduler_tick,
10430+ .prepare_task = litmus_dummy_prepare_task,
10431+ .tear_down = litmus_dummy_tear_down,
10432+ .wake_up_task = litmus_dummy_wake_up_task,
10433+ .task_blocks = litmus_dummy_task_blocks,
10434+ .sleep_next_period = litmus_dummy_sleep_next_period,
10435+ .schedule = litmus_dummy_schedule,
10436+ .finish_switch = litmus_dummy_finish_switch,
10437+ .scheduler_setup = litmus_dummy_scheduler_setup,
10438+ .inherit_priority = litmus_dummy_inherit_priority,
10439+ .return_priority = litmus_dummy_return_priority,
10440+ .pi_block = litmus_dummy_pi_block
10441+};
10442+
10443+/*
10444+ * The reference to the current plugin that is used to schedule tasks within
10445+ * the system. It stores references to the actual function implementations.
10446+ * It should be initialized by calling "init_***_plugin()".
10447+ */
10448+sched_plugin_t *curr_sched_plugin = &linux_sched_plugin;
10449+
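The indirection described in the comment above amounts to a table of function pointers with no-op defaults that a concrete plugin replaces at initialization time. The standalone sketch below illustrates the pattern only; it is not the kernel interface, and the names are made up.

#include <stdio.h>

struct plugin {
	const char *name;
	long (*prepare_task)(int pid);
};

/* no-op default, analogous to the litmus_dummy_* functions */
static long dummy_prepare_task(int pid)
{
	(void) pid;
	return 0;
}

static struct plugin default_plugin = { "Linux", dummy_prepare_task };

static long demo_prepare_task(int pid)
{
	printf("preparing task %d\n", pid);
	return 0;
}

static struct plugin demo_plugin = { "DEMO", demo_prepare_task };

/* analogous to curr_sched_plugin: all callers dispatch through this pointer */
static struct plugin *curr_plugin = &default_plugin;

int main(void)
{
	curr_plugin->prepare_task(42);   /* no-op default */
	curr_plugin = &demo_plugin;      /* what init_xxx_plugin() achieves */
	curr_plugin->prepare_task(42);   /* now dispatches into the plugin */
	printf("active plugin: %s\n", curr_plugin->name);
	return 0;
}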
10450diff --git a/kernel/sched_psn_edf.c b/kernel/sched_psn_edf.c
10451new file mode 100644
10452index 0000000..9e4f4ab
10453--- /dev/null
10454+++ b/kernel/sched_psn_edf.c
10455@@ -0,0 +1,523 @@
10456+
10457+/*
10458+ * kernel/sched_psn_edf.c
10459+ *
10460+ * Implementation of the PSN-EDF scheduler plugin.
10461+ * Based on kernel/sched_part_edf.c and kernel/sched_gsn_edf.c.
10462+ *
10463+ * Suspensions and non-preemptable sections are supported.
10464+ * Priority inheritance is not supported.
10465+ */
10466+
10467+#include <linux/percpu.h>
10468+#include <linux/sched.h>
10469+#include <linux/list.h>
10470+#include <linux/spinlock.h>
10471+
10472+#include <linux/litmus.h>
10473+#include <linux/sched_plugin.h>
10474+#include <linux/edf_common.h>
10475+
10476+
10477+typedef struct {
10478+ rt_domain_t domain;
10479+ int cpu;
10480+ struct task_struct* scheduled; /* only RT tasks */
10481+ spinlock_t lock; /* protects the domain and
10482+ * serializes scheduling decisions
10483+ */
10484+} psnedf_domain_t;
10485+
10486+DEFINE_PER_CPU(psnedf_domain_t, psnedf_domains);
10487+
10488+#define local_edf (&__get_cpu_var(psnedf_domains).domain)
10489+#define local_pedf (&__get_cpu_var(psnedf_domains))
10490+#define remote_edf(cpu) (&per_cpu(psnedf_domains, cpu).domain)
10491+#define remote_pedf(cpu) (&per_cpu(psnedf_domains, cpu))
10492+#define task_edf(task) remote_edf(get_partition(task))
10493+#define task_pedf(task) remote_pedf(get_partition(task))
10494+
10495+
10496+static void psnedf_domain_init(psnedf_domain_t* pedf,
10497+ check_resched_needed_t check,
10498+ int cpu)
10499+{
10500+ edf_domain_init(&pedf->domain, check);
10501+ pedf->cpu = cpu;
10502+ pedf->lock = SPIN_LOCK_UNLOCKED;
10503+ pedf->scheduled = NULL;
10504+}
10505+
10506+static void requeue(struct task_struct* t, rt_domain_t *edf)
10507+{
10508+ /* only requeue if t is actually running */
10509+ BUG_ON(!is_running(t));
10510+
10511+ if (t->state != TASK_RUNNING)
10512+ TRACE_TASK(t, "requeue: !TASK_RUNNING");
10513+
10514+ set_rt_flags(t, RT_F_RUNNING);
10515+ if (!is_released(t) ||
10516+ get_rt_mode() != MODE_RT_RUN)
10517+ __add_release(edf, t); /* it has got to wait */
10518+ else
10519+ __add_ready(edf, t);
10520+}
10521+
10522+/* we assume the lock is being held */
10523+static void preempt(psnedf_domain_t *pedf)
10524+{
10525+ if (smp_processor_id() == pedf->cpu) {
10526+ if (pedf->scheduled && is_np(pedf->scheduled))
10527+ request_exit_np(pedf->scheduled);
10528+ else
10529+ set_tsk_need_resched(current);
10530+ } else
10531+		/* in case it is a remote CPU we have to defer
10532+		 * the decision to the remote CPU
10533+ */
10534+ smp_send_reschedule(pedf->cpu);
10535+}
10536+
10537+/* This check is trivial in partitioned systems as we only have to consider
10538+ * the CPU of the partition.
10539+ */
10540+static int psnedf_check_resched(rt_domain_t *edf)
10541+{
10542+ psnedf_domain_t *pedf = container_of(edf, psnedf_domain_t, domain);
10543+ int ret = 0;
10544+
10545+ /* because this is a callback from rt_domain_t we already hold
10546+ * the necessary lock for the ready queue
10547+ */
10548+ if (edf_preemption_needed(edf, pedf->scheduled)) {
10549+ preempt(pedf);
10550+ ret = 1;
10551+ }
10552+ return ret;
10553+}
10554+
10555+
10556+static reschedule_check_t psnedf_scheduler_tick(void)
10557+{
10558+ unsigned long flags;
10559+ struct task_struct *t = current;
10560+ reschedule_check_t want_resched = NO_RESCHED;
10561+ rt_domain_t *edf = local_edf;
10562+ psnedf_domain_t *pedf = local_pedf;
10563+
10564+ /* Check for inconsistency. We don't need the lock for this since
10565+ * ->scheduled is only changed in schedule, which obviously is not
10566+ * executing in parallel on this CPU
10567+ */
10568+ BUG_ON(is_realtime(t) && t != pedf->scheduled);
10569+
10570+ if (is_realtime(t))
10571+ TRACE("%s/%d was hit by scheduler tick\n", t->comm, t->pid);
10572+
10573+ /* expire tasks even if not in real-time mode
10574+ * this makes sure that at the end of real-time mode
10575+ * no tasks "run away forever".
10576+ */
10577+ if (is_realtime(t) && t->time_slice && !--t->time_slice) {
10578+ if (!is_np(t)) {
10579+ want_resched = FORCE_RESCHED;
10580+ } else {
10581+ TRACE("psnedf_scheduler_tick: "
10582+ "%d is non-preemptable, "
10583+ "preemption delayed.\n", t->pid);
10584+ request_exit_np(t);
10585+ }
10586+ }
10587+
10588+ if (get_rt_mode() == MODE_RT_RUN)
10589+ {
10590+ /* check whether anything is waiting to be released
10591+ * this could probably be moved to the global timer
10592+ * interrupt handler since the state will only change
10593+ * once per jiffie
10594+ */
10595+ spin_lock_irqsave(&pedf->lock, flags);
10596+ __release_pending(edf);
10597+ if (want_resched != FORCE_RESCHED &&
10598+ edf_preemption_needed(edf, t))
10599+ want_resched = FORCE_RESCHED;
10600+
10601+ spin_unlock_irqrestore(&pedf->lock, flags);
10602+
10603+ }
10604+ return want_resched;
10605+}
10606+
10607+static void job_completion(struct task_struct* t)
10608+{
10609+ TRACE_TASK(t, "job_completion().\n");
10610+ set_rt_flags(t, RT_F_SLEEP);
10611+ edf_prepare_for_next_period(t);
10612+}
10613+
10614+static int psnedf_schedule(struct task_struct * prev,
10615+ struct task_struct ** next,
10616+ runqueue_t * rq)
10617+{
10618+ psnedf_domain_t* pedf = local_pedf;
10619+ rt_domain_t* edf = &pedf->domain;
10620+
10621+ int out_of_time, sleep, preempt,
10622+ np, exists, rt, blocks, resched;
10623+
10624+ spin_lock(&pedf->lock);
10625+
10626+ /* sanity checking */
10627+ BUG_ON(pedf->scheduled && pedf->scheduled != prev);
10628+ BUG_ON(pedf->scheduled && !is_realtime(prev));
10629+
10630+ /* (0) Determine state */
10631+ exists = pedf->scheduled != NULL;
10632+ blocks = exists && !is_running(pedf->scheduled);
10633+ out_of_time = exists && !pedf->scheduled->time_slice;
10634+ np = exists && is_np(pedf->scheduled);
10635+ sleep = exists && get_rt_flags(pedf->scheduled) == RT_F_SLEEP;
10636+ preempt = edf_preemption_needed(edf, prev);
10637+ rt = get_rt_mode() == MODE_RT_RUN;
10638+
10639+
10640+ /* If we need to preempt do so.
10641+ * The following checks set resched to 1 in case of special
10642+ * circumstances.
10643+ */
10644+ resched = preempt;
10645+
10646+ /* If a task blocks we have no choice but to reschedule.
10647+ */
10648+ if (blocks)
10649+ resched = 1;
10650+
10651+ /* Request a sys_exit_np() call if we would like to preempt but cannot.
10652+ * Multiple calls to request_exit_np() don't hurt.
10653+ */
10654+ if (np && (out_of_time || preempt || sleep))
10655+ request_exit_np(pedf->scheduled);
10656+
10657+ /* Any task that is preemptable and either exhausts its execution
10658+ * budget or wants to sleep completes. We may have to reschedule after
10659+ * this.
10660+ */
10661+ if (!np && (out_of_time || sleep)) {
10662+ job_completion(pedf->scheduled);
10663+ resched = 1;
10664+ }
10665+
10666+ /* Stop real-time tasks when we leave real-time mode
10667+ */
10668+ if (!rt && exists)
10669+ resched = 1;
10670+
10671+ /* The final scheduling decision. Do we need to switch for some reason?
10672+ * Switch if we are in RT mode and have no task or if we need to
10673+ * resched.
10674+ */
10675+ *next = NULL;
10676+ if ((!np || blocks) && (resched || (!exists && rt))) {
10677+ /* Take care of a previously scheduled
10678+ * job by taking it out of the Linux runqueue.
10679+ */
10680+ if (pedf->scheduled) {
10681+ /* as opposed to global schedulers that switch without
10682+ * a lock being held we can requeue already here since
10683+ * no other CPU will schedule from this domain.
10684+ */
10685+ if (!blocks)
10686+ requeue(pedf->scheduled, edf);
10687+ if (prev->array)
10688+ /* take it out of the run queue */
10689+ deactivate_task(prev, rq);
10690+ }
10691+
10692+ /* only pick tasks if we are actually in RT mode */
10693+ if (rt)
10694+ *next = __take_ready(edf);
10695+ if (*next) {
10696+ /* stick the task into the runqueue */
10697+ __activate_task(*next, rq);
10698+ set_task_cpu(*next, smp_processor_id());
10699+ }
10700+
10701+ } else
10702+ /* Only override Linux scheduler if we have a real-time task
10703+ * scheduled that needs to continue.
10704+ */
10705+ if (exists)
10706+ *next = prev;
10707+
10708+ if (*next)
10709+ set_rt_flags(*next, RT_F_RUNNING);
10710+
10711+ pedf->scheduled = *next;
10712+ spin_unlock(&pedf->lock);
10713+ return 0;
10714+}
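The switch decision above is driven entirely by the boolean state flags computed at the top of the function. The following standalone sketch restates just that boolean logic (none of the queue handling), with the same flag names; it is an illustration, not kernel code.

#include <stdio.h>

/* Returns 1 if psnedf_schedule()-style logic would pick a new task. */
static int need_switch(int exists, int blocks, int out_of_time, int np,
                       int sleep, int preempt, int rt_mode)
{
	int resched = preempt;

	if (blocks)
		resched = 1;            /* a blocking task forces a switch  */
	if (!np && (out_of_time || sleep))
		resched = 1;            /* job completion (budget or sleep) */
	if (!rt_mode && exists)
		resched = 1;            /* leaving real-time mode           */

	/* non-preemptable sections delay the switch unless the task blocks */
	return (!np || blocks) && (resched || (!exists && rt_mode));
}

int main(void)
{
	/* an out-of-budget, preemptable job in RT mode must be switched out */
	printf("%d\n", need_switch(1, 0, 1, 0, 0, 0, 1));
	return 0;
}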
10715+
10716+
10717+/* Prepare a task for running in RT mode
10718+ * Enqueues the task into master queue data structure
10719+ * returns
10720+ * -EPERM if task is not TASK_STOPPED
10721+ */
10722+static long psnedf_prepare_task(struct task_struct * t)
10723+{
10724+ rt_domain_t* edf = task_edf(t);
10725+ psnedf_domain_t* pedf = task_pedf(t);
10726+ unsigned long flags;
10727+
10728+ TRACE("[%d] psn edf: prepare task %d on CPU %d\n",
10729+ smp_processor_id(), t->pid, get_partition(t));
10730+ if (t->state == TASK_STOPPED) {
10731+ __setscheduler(t, SCHED_FIFO, MAX_RT_PRIO - 1);
10732+
10733+ if (get_rt_mode() == MODE_RT_RUN)
10734+ /* The action is already on.
10735+ * Prepare immediate release.
10736+ */
10737+ edf_release_now(t);
10738+ /* The task should be running in the queue, otherwise signal
10739+ * code will try to wake it up with fatal consequences.
10740+ */
10741+ t->state = TASK_RUNNING;
10742+ spin_lock_irqsave(&pedf->lock, flags);
10743+ __add_release(edf, t);
10744+ spin_unlock_irqrestore(&pedf->lock, flags);
10745+ return 0;
10746+ } else
10747+ return -EPERM;
10748+}
10749+
10750+static void psnedf_wake_up_task(struct task_struct *task)
10751+{
10752+ unsigned long flags;
10753+ psnedf_domain_t* pedf = task_pedf(task);
10754+ rt_domain_t* edf = task_edf(task);
10755+
10756+ TRACE("psnedf: %d unsuspends with budget=%d\n",
10757+ task->pid, task->time_slice);
10758+
10759+ /* After fixing the litmus_controlled bug,
10760+ * this should hold again.
10761+ */
10762+ BUG_ON(in_list(&task->rt_list));
10763+
10764+ task->state = TASK_RUNNING;
10765+
10766+ /* We need to take suspensions because of semaphores into
10767+ * account! If a job resumes after being suspended due to acquiring
10768+ * a semaphore, it should never be treated as a new job release.
10769+ */
10770+ if (is_tardy(task) && get_rt_flags(task) != RT_F_EXIT_SEM) {
10771+ /* new sporadic release */
10772+ edf_release_now(task);
10773+ sched_trace_job_release(task);
10774+ }
10775+
10776+ spin_lock_irqsave(&pedf->lock, flags);
10777+ requeue(task, edf);
10778+ spin_unlock_irqrestore(&pedf->lock, flags);
10779+}
10780+
10781+static void psnedf_task_blocks(struct task_struct *t)
10782+{
10783+ BUG_ON(!is_realtime(t));
10784+ /* not really anything to do since it can only block if
10785+ * it is running, and when it is not running it is not in any
10786+ * queue anyway.
10787+ */
10788+ TRACE("task %d blocks with budget=%d\n", t->pid, t->time_slice);
10789+ BUG_ON(in_list(&t->rt_list));
10790+}
10791+
10792+
10793+/* When _tear_down is called, the task should not be in any queue any more
10794+ * as it must have blocked first. We don't have any internal state for the task,
10795+ * it is all in the task_struct.
10796+ */
10797+static long psnedf_tear_down(struct task_struct * t)
10798+{
10799+ BUG_ON(!is_realtime(t));
10800+ TRACE_TASK(t, "tear down called");
10801+ BUG_ON(t->array);
10802+ BUG_ON(in_list(&t->rt_list));
10803+ return 0;
10804+}
10805+
10806+static long psnedf_pi_block(struct pi_semaphore *sem,
10807+ struct task_struct *new_waiter)
10808+{
10809+ psnedf_domain_t* pedf;
10810+ rt_domain_t* edf;
10811+ struct task_struct* t;
10812+ int cpu = get_partition(new_waiter);
10813+
10814+ BUG_ON(!new_waiter);
10815+
10816+ if (edf_higher_prio(new_waiter, sem->hp.cpu_task[cpu])) {
10817+ TRACE_TASK(new_waiter, " boosts priority\n");
10818+ pedf = task_pedf(new_waiter);
10819+ edf = task_edf(new_waiter);
10820+
10821+ /* interrupts already disabled */
10822+ spin_lock(&pedf->lock);
10823+
10824+ /* store new highest-priority task */
10825+ sem->hp.cpu_task[cpu] = new_waiter;
10826+ if (sem->holder &&
10827+ get_partition(sem->holder) == get_partition(new_waiter)) {
10828+ /* let holder inherit */
10829+ sem->holder->rt_param.inh_task = new_waiter;
10830+ t = sem->holder;
10831+ if (in_list(&t->rt_list)) {
10832+ /* queued in domain*/
10833+ list_del(&t->rt_list);
10834+ /* readd to make priority change take place */
10835+ if (is_released(t))
10836+ __add_ready(edf, t);
10837+ else
10838+ __add_release(edf, t);
10839+ }
10840+ }
10841+
10842+ /* check if we need to reschedule */
10843+ if (edf_preemption_needed(edf, current))
10844+ preempt(pedf);
10845+
10846+ spin_unlock(&pedf->lock);
10847+ }
10848+
10849+ return 0;
10850+}
10851+
10852+static long psnedf_inherit_priority(struct pi_semaphore *sem,
10853+ struct task_struct *new_owner)
10854+{
10855+ int cpu = get_partition(new_owner);
10856+
10857+ new_owner->rt_param.inh_task = sem->hp.cpu_task[cpu];
10858+ if (sem->hp.cpu_task[cpu] && new_owner != sem->hp.cpu_task[cpu]) {
10859+ TRACE_TASK(new_owner,
10860+ "inherited priority from %s/%d\n",
10861+ sem->hp.cpu_task[cpu]->comm,
10862+ sem->hp.cpu_task[cpu]->pid);
10863+ } else
10864+ TRACE_TASK(new_owner,
10865+ "cannot inherit priority: "
10866+ "no higher priority job waits on this CPU!\n");
10867+ /* make new owner non-preemptable as required by FMLP under
10868+ * PSN-EDF.
10869+ */
10870+ make_np(new_owner);
10871+ return 0;
10872+}
10873+
10874+
10875+/* This function is called on a semaphore release, and assumes that
10876+ * the current task is also the semaphore holder.
10877+ */
10878+static long psnedf_return_priority(struct pi_semaphore *sem)
10879+{
10880+ struct task_struct* t = current;
10881+ psnedf_domain_t* pedf = task_pedf(t);
10882+ rt_domain_t* edf = task_edf(t);
10883+ int ret = 0;
10884+ int cpu = get_partition(current);
10885+
10886+
10887+ /* Find new highest-priority semaphore task
10888+ * if holder task is the current hp.cpu_task[cpu].
10889+ *
10890+ * Calling function holds sem->wait.lock.
10891+ */
10892+ if (t == sem->hp.cpu_task[cpu])
10893+ edf_set_hp_cpu_task(sem, cpu);
10894+
10895+ take_np(t);
10896+ if (current->rt_param.inh_task) {
10897+ TRACE_CUR("return priority of %s/%d\n",
10898+ current->rt_param.inh_task->comm,
10899+ current->rt_param.inh_task->pid);
10900+ spin_lock(&pedf->lock);
10901+
10902+ /* Reset inh_task to NULL. */
10903+ current->rt_param.inh_task = NULL;
10904+
10905+ /* check if we need to reschedule */
10906+ if (edf_preemption_needed(edf, current))
10907+ preempt(pedf);
10908+
10909+ spin_unlock(&pedf->lock);
10910+ } else
10911+ TRACE_CUR(" no priority to return %p\n", sem);
10912+
10913+ return ret;
10914+}
10915+
10916+
10917+static int psnedf_mode_change(int new_mode)
10918+{
10919+ int cpu;
10920+
10921+ if (new_mode == MODE_RT_RUN)
10922+ for_each_online_cpu(cpu) {
10923+ spin_lock(&remote_pedf(cpu)->lock);
10924+ __rerelease_all(remote_edf(cpu), edf_release_at);
10925+ spin_unlock(&remote_pedf(cpu)->lock);
10926+ }
10927+
10928+ TRACE("[%d] psn edf: mode changed to %d\n",
10929+ smp_processor_id(), new_mode);
10930+ return 0;
10931+}
10932+
10933+
10934+/* Plugin object */
10935+static sched_plugin_t s_plugin __cacheline_aligned_in_smp = {
10936+ .ready_to_use = 0
10937+};
10938+
10939+
10940+/*
10941+ * Plugin initialization code.
10942+ */
10943+#define INIT_SCHED_PLUGIN (struct sched_plugin) {\
10944+ .plugin_name = "PSN-EDF",\
10945+ .ready_to_use = 1,\
10946+ .scheduler_tick = psnedf_scheduler_tick,\
10947+ .prepare_task = psnedf_prepare_task,\
10948+ .sleep_next_period = edf_sleep_next_period,\
10949+ .tear_down = psnedf_tear_down,\
10950+ .schedule = psnedf_schedule,\
10951+ .mode_change = psnedf_mode_change,\
10952+ .wake_up_task = psnedf_wake_up_task,\
10953+ .task_blocks = psnedf_task_blocks, \
10954+ .pi_block = psnedf_pi_block, \
10955+ .inherit_priority = psnedf_inherit_priority, \
10956+ .return_priority = psnedf_return_priority \
10957+}
10958+
10959+
10960+sched_plugin_t *__init init_psn_edf_plugin(void)
10961+{
10962+ int i;
10963+
10964+ if (!s_plugin.ready_to_use)
10965+ {
10966+ for (i = 0; i < NR_CPUS; i++)
10967+ {
10968+ psnedf_domain_init(remote_pedf(i),
10969+ psnedf_check_resched, i);
10970+ printk("PSN-EDF: CPU partition %d initialized.\n", i);
10971+ }
10972+ s_plugin = INIT_SCHED_PLUGIN;
10973+ }
10974+ return &s_plugin;
10975+}
10976+
10977+
10978+
10979diff --git a/kernel/sched_trace.c b/kernel/sched_trace.c
10980new file mode 100644
10981index 0000000..4cfe0c4
10982--- /dev/null
10983+++ b/kernel/sched_trace.c
10984@@ -0,0 +1,755 @@
10985+/* sched_trace.c -- record scheduling events to a byte stream.
10986+ *
10987+ * TODO: Move ring buffer to a lockfree implementation.
10988+ */
10989+
10990+#include <linux/spinlock.h>
10991+#include <linux/fs.h>
10992+#include <linux/cdev.h>
10993+#include <asm/semaphore.h>
10994+#include <asm/uaccess.h>
10995+#include <linux/module.h>
10996+
10997+#include <linux/queuelock.h>
10998+#include <linux/sched_trace.h>
10999+#include <linux/litmus.h>
11000+
11001+
11002+typedef struct {
11003+ /* guard read and write pointers */
11004+ spinlock_t lock;
11005+ /* guard against concurrent freeing of buffer */
11006+ rwlock_t del_lock;
11007+
11008+ /* memory allocated for ring buffer */
11009+ unsigned long order;
11010+ char* buf;
11011+ char* end;
11012+
11013+ /* Read/write pointer. May not cross.
11014+ * They point to the position of next write and
11015+ * last read.
11016+ */
11017+ char* writep;
11018+ char* readp;
11019+
11020+} ring_buffer_t;
11021+
11022+#define EMPTY_RING_BUFFER { \
11023+ .lock = SPIN_LOCK_UNLOCKED, \
11024+ .del_lock = RW_LOCK_UNLOCKED, \
11025+ .buf = NULL, \
11026+ .end = NULL, \
11027+ .writep = NULL, \
11028+ .readp = NULL \
11029+}
11030+
11031+void rb_init(ring_buffer_t* buf)
11032+{
11033+ *buf = (ring_buffer_t) EMPTY_RING_BUFFER;
11034+}
11035+
11036+int rb_alloc_buf(ring_buffer_t* buf, unsigned long order)
11037+{
11038+ unsigned long flags;
11039+ int error = 0;
11040+ char *mem;
11041+
11042+ /* do memory allocation while not atomic */
11043+ mem = (char *) __get_free_pages(GFP_KERNEL, order);
11044+ if (!mem)
11045+ return -ENOMEM;
11046+ write_lock_irqsave(&buf->del_lock, flags);
11047+ BUG_ON(buf->buf);
11048+ buf->buf = mem;
11049+ buf->end = buf->buf + PAGE_SIZE * (1 << order) - 1;
11050+ memset(buf->buf, 0xff, buf->end - buf->buf);
11051+ buf->order = order;
11052+ buf->writep = buf->buf + 1;
11053+ buf->readp = buf->buf;
11054+ write_unlock_irqrestore(&buf->del_lock, flags);
11055+ return error;
11056+}
11057+
11058+int rb_free_buf(ring_buffer_t* buf)
11059+{
11060+ unsigned long flags;
11061+ int error = 0;
11062+ write_lock_irqsave(&buf->del_lock, flags);
11063+ BUG_ON(!buf->buf);
11064+ free_pages((unsigned long) buf->buf, buf->order);
11065+ buf->buf = NULL;
11066+ buf->end = NULL;
11067+ buf->writep = NULL;
11068+ buf->readp = NULL;
11069+ write_unlock_irqrestore(&buf->del_lock, flags);
11070+ return error;
11071+}
11072+
11073+/* Assumption: concurrent writes are serialized externally
11074+ *
11075+ * Will only succeed if there is enough space for all len bytes.
11076+ */
11077+int rb_put(ring_buffer_t* buf, char* mem, size_t len)
11078+{
11079+ unsigned long flags;
11080+ char* r , *w;
11081+ int error = 0;
11082+ read_lock_irqsave(&buf->del_lock, flags);
11083+ if (!buf->buf) {
11084+ error = -ENODEV;
11085+ goto out;
11086+ }
11087+ spin_lock(&buf->lock);
11088+ r = buf->readp;
11089+ w = buf->writep;
11090+ spin_unlock(&buf->lock);
11091+ if (r < w && buf->end - w >= len - 1) {
11092+ /* easy case: there is enough space in the buffer
11093+		 * to write it in one continuous chunk */
11094+ memcpy(w, mem, len);
11095+ w += len;
11096+ if (w > buf->end)
11097+ /* special case: fit exactly into buffer
11098+ * w is now buf->end + 1
11099+ */
11100+ w = buf->buf;
11101+ } else if (w < r && r - w >= len) { /* >= len because may not cross */
11102+		/* we are constrained by the read pointer but there
11103+ * is enough space
11104+ */
11105+ memcpy(w, mem, len);
11106+ w += len;
11107+ } else if (r <= w && buf->end - w < len - 1) {
11108+ /* the wrap around case: there may or may not be space */
11109+ if ((buf->end - w) + (r - buf->buf) >= len - 1) {
11110+ /* copy chunk that fits at the end */
11111+ memcpy(w, mem, buf->end - w + 1);
11112+ mem += buf->end - w + 1;
11113+ len -= (buf->end - w + 1);
11114+ w = buf->buf;
11115+ /* copy the rest */
11116+ memcpy(w, mem, len);
11117+ w += len;
11118+ }
11119+ else
11120+ error = -ENOMEM;
11121+ } else {
11122+ error = -ENOMEM;
11123+ }
11124+ if (!error) {
11125+ spin_lock(&buf->lock);
11126+ buf->writep = w;
11127+ spin_unlock(&buf->lock);
11128+ }
11129+ out:
11130+ read_unlock_irqrestore(&buf->del_lock, flags);
11131+ return error;
11132+}
11133+
11134+/* Assumption: concurrent reads are serialized externally */
11135+int rb_get(ring_buffer_t* buf, char* mem, size_t len)
11136+{
11137+ unsigned long flags;
11138+ char* r , *w;
11139+ int error = 0;
11140+ read_lock_irqsave(&buf->del_lock, flags);
11141+ if (!buf->buf) {
11142+ error = -ENODEV;
11143+ goto out;
11144+ }
11145+ spin_lock(&buf->lock);
11146+ r = buf->readp;
11147+ w = buf->writep;
11148+ spin_unlock(&buf->lock);
11149+
11150+ if (w <= r && buf->end - r >= len) {
11151+ /* easy case: there is enough data in the buffer
11152+		 * to get it in one chunk */
11153+ memcpy(mem, r + 1, len);
11154+ r += len;
11155+ error = len;
11156+
11157+ } else if (r + 1 < w && w - r - 1 >= len) {
11158+ /* we are constrained by the write pointer but
11159+ * there is enough data
11160+ */
11161+ memcpy(mem, r + 1, len);
11162+ r += len;
11163+ error = len;
11164+
11165+ } else if (r + 1 < w && w - r - 1 < len) {
11166+ /* we are constrained by the write pointer and there
11167+		 * is not enough data
11168+ */
11169+ memcpy(mem, r + 1, w - r - 1);
11170+ error = w - r - 1;
11171+ r += w - r - 1;
11172+
11173+ } else if (w <= r && buf->end - r < len) {
11174+		/* the wrap around case: there may or may not be enough data;
11175+		 * first, let's get what is available
11176+ */
11177+ memcpy(mem, r + 1, buf->end - r);
11178+ error += (buf->end - r);
11179+ mem += (buf->end - r);
11180+ len -= (buf->end - r);
11181+ r += (buf->end - r);
11182+
11183+ if (w > buf->buf) {
11184+ /* there is more to get */
11185+ r = buf->buf - 1;
11186+ if (w - r >= len) {
11187+ /* plenty */
11188+ memcpy(mem, r + 1, len);
11189+ error += len;
11190+ r += len;
11191+ } else {
11192+ memcpy(mem, r + 1, w - r - 1);
11193+ error += w - r - 1;
11194+ r += w - r - 1;
11195+ }
11196+ }
11197+ } /* nothing available */
11198+
11199+ if (error > 0) {
11200+ spin_lock(&buf->lock);
11201+ buf->readp = r;
11202+ spin_unlock(&buf->lock);
11203+ }
11204+ out:
11205+ read_unlock_irqrestore(&buf->del_lock, flags);
11206+ return error;
11207+}
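
For orientation, here is a minimal sketch of how the rb_*() API above fits together. It is illustrative only and not part of the patch; the demo_* names are made up, error handling is abbreviated, and the declarations from this file are assumed to be in scope.

    /* Sketch: exercise rb_init/rb_alloc_buf/rb_put/rb_get/rb_free_buf. */
    static ring_buffer_t demo_rb;

    static int demo_ring_buffer(void)
    {
            char in[] = "hello";
            char out[8];
            int n;

            rb_init(&demo_rb);
            if (rb_alloc_buf(&demo_rb, 0))          /* order 0 = one page */
                    return -ENOMEM;

            /* rb_put() is all-or-nothing: it returns -ENOMEM unless all
             * sizeof(in) bytes fit into the free space. */
            if (rb_put(&demo_rb, in, sizeof(in)) == 0) {
                    /* rb_get() returns the number of bytes copied, which may
                     * be fewer than requested, or 0 if the buffer is empty. */
                    n = rb_get(&demo_rb, out, sizeof(out));
                    printk(KERN_INFO "demo: got %d bytes\n", n);
            }
            return rb_free_buf(&demo_rb);
    }
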
11208+
11209+
11210+
11211+/******************************************************************************/
11212+/* DEVICE FILE DRIVER */
11213+/******************************************************************************/
11214+
11215+
11216+
11217+/* Allocate a buffer of about 1 MB per CPU.
11218+ *
11219+ */
11220+#define BUFFER_ORDER 8
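
As a quick check on the "about 1 MB" figure: with the usual 4 KiB i386 page size, an order-8 allocation is 2^8 = 256 contiguous pages, i.e. 256 * 4096 bytes = 1,048,576 bytes = 1 MiB per buffer, and thus per CPU for the per-CPU trace buffers below.
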
11221+
11222+typedef struct {
11223+ ring_buffer_t buf;
11224+ atomic_t reader_cnt;
11225+ struct semaphore reader_mutex;
11226+} trace_buffer_t;
11227+
11228+
11229+/* This does not initialize the semaphore!! */
11230+
11231+#define EMPTY_TRACE_BUFFER \
11232+ { .buf = EMPTY_RING_BUFFER, .reader_cnt = ATOMIC_INIT(0)}
11233+
11234+static DEFINE_PER_CPU(trace_buffer_t, trace_buffer);
11235+
11236+#ifdef CONFIG_SCHED_DEBUG_TRACE
11237+static spinlock_t log_buffer_lock = SPIN_LOCK_UNLOCKED;
11238+#endif
11239+static trace_buffer_t log_buffer = EMPTY_TRACE_BUFFER;
11240+
11241+static void init_buffers(void)
11242+{
11243+ int i;
11244+
11245+ for (i = 0; i < NR_CPUS; i++) {
11246+ rb_init(&per_cpu(trace_buffer, i).buf);
11247+ init_MUTEX(&per_cpu(trace_buffer, i).reader_mutex);
11248+ atomic_set(&per_cpu(trace_buffer, i).reader_cnt, 0);
11249+ }
11250+ /* only initialize the mutex, the rest was initialized as part
11251+ * of the static initialization macro
11252+ */
11253+ init_MUTEX(&log_buffer.reader_mutex);
11254+}
11255+
11256+static int trace_release(struct inode *in, struct file *filp)
11257+{
11258+	int error = 0;
11259+ trace_buffer_t* buf = filp->private_data;
11260+
11261+ BUG_ON(!filp->private_data);
11262+
11263+ if (down_interruptible(&buf->reader_mutex)) {
11264+ error = -ERESTARTSYS;
11265+ goto out;
11266+ }
11267+
11268+ /* last release must deallocate buffers */
11269+ if (atomic_dec_return(&buf->reader_cnt) == 0) {
11270+ error = rb_free_buf(&buf->buf);
11271+ }
11272+
11273+ up(&buf->reader_mutex);
11274+ out:
11275+ return error;
11276+}
11277+
11278+static ssize_t trace_read(struct file *filp, char __user *to, size_t len,
11279+ loff_t *f_pos)
11280+{
11281+ /* we ignore f_pos, this is strictly sequential */
11282+
11283+ ssize_t error = -EINVAL;
11284+ char* mem;
11285+ trace_buffer_t *buf = filp->private_data;
11286+
11287+ if (down_interruptible(&buf->reader_mutex)) {
11288+ error = -ERESTARTSYS;
11289+ goto out;
11290+ }
11291+
11292+ if (len > 64 * 1024)
11293+ len = 64 * 1024;
11294+ mem = kmalloc(len, GFP_KERNEL);
11295+ if (!mem) {
11296+ error = -ENOMEM;
11297+ goto out_unlock;
11298+ }
11299+
11300+ error = rb_get(&buf->buf, mem, len);
11301+ while (!error) {
11302+ set_current_state(TASK_INTERRUPTIBLE);
11303+ schedule_timeout(110);
11304+ if (signal_pending(current))
11305+ error = -ERESTARTSYS;
11306+ else
11307+ error = rb_get(&buf->buf, mem, len);
11308+ }
11309+
11310+ if (error > 0 && copy_to_user(to, mem, error))
11311+ error = -EFAULT;
11312+
11313+ kfree(mem);
11314+ out_unlock:
11315+ up(&buf->reader_mutex);
11316+ out:
11317+ return error;
11318+}
11319+
11320+
11321+/* trace_open - Open one of the per-CPU sched_trace buffers.
11322+ */
11323+static int trace_open(struct inode *in, struct file *filp)
11324+{
11325+ int error = -EINVAL;
11326+ int cpu = MINOR(in->i_rdev);
11327+ trace_buffer_t* buf;
11328+
11329+ if (!cpu_online(cpu)) {
11330+ printk(KERN_WARNING "sched trace: "
11331+ "CPU #%d is not online. (open failed)\n", cpu);
11332+ error = -ENODEV;
11333+ goto out;
11334+ }
11335+
11336+ buf = &per_cpu(trace_buffer, cpu);
11337+
11338+ if (down_interruptible(&buf->reader_mutex)) {
11339+ error = -ERESTARTSYS;
11340+ goto out;
11341+ }
11342+
11343+ /* first open must allocate buffers */
11344+ if (atomic_inc_return(&buf->reader_cnt) == 1) {
11345+ if ((error = rb_alloc_buf(&buf->buf, BUFFER_ORDER)))
11346+ {
11347+ atomic_dec(&buf->reader_cnt);
11348+ goto out_unlock;
11349+ }
11350+ }
11351+
11352+ error = 0;
11353+ filp->private_data = buf;
11354+
11355+ out_unlock:
11356+ up(&buf->reader_mutex);
11357+ out:
11358+ return error;
11359+}
11360+
11361+/* log_open - open the global log message ring buffer.
11362+ */
11363+static int log_open(struct inode *in, struct file *filp)
11364+{
11365+ int error = -EINVAL;
11366+ trace_buffer_t* buf;
11367+
11368+ buf = &log_buffer;
11369+
11370+ if (down_interruptible(&buf->reader_mutex)) {
11371+ error = -ERESTARTSYS;
11372+ goto out;
11373+ }
11374+
11375+ /* first open must allocate buffers */
11376+ if (atomic_inc_return(&buf->reader_cnt) == 1) {
11377+ if ((error = rb_alloc_buf(&buf->buf, BUFFER_ORDER)))
11378+ {
11379+ atomic_dec(&buf->reader_cnt);
11380+ goto out_unlock;
11381+ }
11382+ }
11383+
11384+ error = 0;
11385+ filp->private_data = buf;
11386+
11387+ out_unlock:
11388+ up(&buf->reader_mutex);
11389+ out:
11390+ return error;
11391+}
11392+
11393+/******************************************************************************/
11394+/* Device Registration */
11395+/******************************************************************************/
11396+
11397+/* the major numbers are from the unassigned/local use block
11398+ *
11399+ * This should be converted to dynamic allocation at some point...
11400+ */
11401+#define TRACE_MAJOR 250
11402+#define LOG_MAJOR 251
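
As the comment above suggests, the hard-coded majors could eventually be replaced by dynamic allocation. A minimal sketch of what that might look like with the stock chardev API; it is not part of the patch, and the sched_trace_region name and printk text are illustrative only.

    /* Sketch: let the kernel pick a free major instead of TRACE_MAJOR. */
    static dev_t sched_trace_region;

    static int __init alloc_sched_trace_region(void)
    {
            /* minors 0 .. NR_CPUS-1, one per per-CPU trace buffer */
            int error = alloc_chrdev_region(&sched_trace_region, 0, NR_CPUS,
                                            "schedtrace");
            if (error)
                    return error;
            printk(KERN_INFO "sched trace: allocated major %d\n",
                   MAJOR(sched_trace_region));
            return 0;
    }
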
11403+
11404+/* trace_fops - The file operations for accessing the per-CPU scheduling event
11405+ * trace buffers.
11406+ */
11407+struct file_operations trace_fops = {
11408+ .owner = THIS_MODULE,
11409+ .open = trace_open,
11410+ .release = trace_release,
11411+ .read = trace_read,
11412+};
11413+
11414+/* log_fops - The file operations for accessing the global LITMUS log message
11415+ * buffer.
11416+ *
11417+ * Except for opening the device file it uses the same operations as trace_fops.
11418+ */
11419+struct file_operations log_fops = {
11420+ .owner = THIS_MODULE,
11421+ .open = log_open,
11422+ .release = trace_release,
11423+ .read = trace_read,
11424+};
11425+
11426+static int __init register_buffer_dev(const char* name,
11427+ struct file_operations* fops,
11428+ int major, int count)
11429+{
11430+ dev_t trace_dev;
11431+ struct cdev *cdev;
11432+ int error = 0;
11433+
11434+ trace_dev = MKDEV(major, 0);
11435+ error = register_chrdev_region(trace_dev, count, name);
11436+ if (error)
11437+ {
11438+ printk(KERN_WARNING "sched trace: "
11439+ "Could not register major/minor number %d\n", major);
11440+ return error;
11441+ }
11442+ cdev = cdev_alloc();
11443+ if (!cdev) {
11444+ printk(KERN_WARNING "sched trace: "
11445+ "Could not get a cdev for %s.\n", name);
11446+ return -ENOMEM;
11447+ }
11448+ cdev->owner = THIS_MODULE;
11449+ cdev->ops = fops;
11450+ error = cdev_add(cdev, trace_dev, count);
11451+ if (error) {
11452+ printk(KERN_WARNING "sched trace: "
11453+		       "cdev_add failed for %s.\n", name);
11454+ return -ENOMEM;
11455+ }
11456+ return error;
11457+
11458+}
11459+
11460+static int __init init_sched_trace(void)
11461+{
11462+ int error1 = 0, error2 = 0;
11463+
11464+ printk("Initializing scheduler trace device\n");
11465+ init_buffers();
11466+
11467+ error1 = register_buffer_dev("schedtrace", &trace_fops,
11468+ TRACE_MAJOR, NR_CPUS);
11469+
11470+ error2 = register_buffer_dev("litmus_log", &log_fops,
11471+ LOG_MAJOR, 1);
11472+ if (error1 || error2)
11473+ return min(error1, error2);
11474+ else
11475+ return 0;
11476+}
11477+
11478+module_init(init_sched_trace);
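
In user space, the resulting devices can be read once matching device nodes exist; the patch itself does not create them. A rough sketch, assuming nodes created by hand (e.g. "mknod /dev/litmus_log c 251 0" and "mknod /dev/schedtrace0 c 250 0", where the schedtrace minor number selects the CPU):

    /* Sketch of a user-space reader for the litmus_log device. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
            char buf[4096];
            ssize_t n;
            int fd = open("/dev/litmus_log", O_RDONLY);

            if (fd < 0) {
                    perror("open");
                    return 1;
            }
            /* trace_read() blocks, re-polling the ring buffer, until data
             * arrives or a signal is delivered. */
            while ((n = read(fd, buf, sizeof(buf))) > 0)
                    fwrite(buf, 1, n, stdout);
            close(fd);
            return 0;
    }
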
11479+
11480+/******************************************************************************/
11481+/* KERNEL API */
11482+/******************************************************************************/
11483+
11484+/* Per-CPU scratch buffer used to format LITMUS log messages. Don't put it on the
11485+ * stack; it is too big for that, and the kernel gets very picky with nested interrupts and small stacks.
11486+ */
11487+
11488+#ifdef CONFIG_SCHED_DEBUG_TRACE
11489+
11490+#define MSG_SIZE 255
11491+static DEFINE_PER_CPU(char[MSG_SIZE], fmt_buffer);
11492+
11493+/* sched_trace_log_message - This is the only function that accesses the
11494+ * log buffer inside the kernel for writing.
11495+ * Concurrent access to it is serialized via the
11496+ * log_buffer_lock.
11497+ *
11498+ * The maximum length of a formatted message is MSG_SIZE - 1 (254) characters;
11499+ * longer messages are truncated by vscnprintf().
11499+ */
11500+void sched_trace_log_message(const char* fmt, ...)
11501+{
11502+ unsigned long flags;
11503+ va_list args;
11504+ size_t len;
11505+ char* buf;
11506+
11507+ va_start(args, fmt);
11508+ local_irq_save(flags);
11509+
11510+ /* format message */
11511+ buf = __get_cpu_var(fmt_buffer);
11512+ len = vscnprintf(buf, MSG_SIZE, fmt, args);
11513+
11514+ spin_lock(&log_buffer_lock);
11515+	/* Don't copy the trailing null byte; we don't want null bytes
11516+ * in a text file.
11517+ */
11518+ rb_put(&log_buffer.buf, buf, len);
11519+ spin_unlock(&log_buffer_lock);
11520+
11521+ local_irq_restore(flags);
11522+ va_end(args);
11523+}
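
A call site uses it like a restricted printk(); for example (a sketch only, the message text and arguments here are made up):

    /* Sketch: log a debug message from scheduler code. */
    sched_trace_log_message("cpu %d: scheduling pid %d\n",
                            raw_smp_processor_id(), current->pid);
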
11524+
11525+#endif
11526+
11527+#ifdef CONFIG_SCHED_TASK_TRACE
11528+
11529+static inline void __put_trace(char* mem, size_t size)
11530+{
11531+ trace_buffer_t* buf = &__get_cpu_var(trace_buffer);
11532+ rb_put(&buf->buf, mem, size);
11533+}
11534+
11535+#define put_trace(obj) \
11536+ if (get_rt_mode() == MODE_RT_RUN) \
11537+ __put_trace((char *) &obj, sizeof(obj))
11538+
11539+#define header(rec, type) \
11540+{ \
11541+ rec.header.trace = type; \
11542+ rec.header.timestamp = sched_clock(); \
11543+ rec.header.size = sizeof(rec); \
11544+}
11545+
11546+#define tinfo(info, t) \
11547+{ \
11548+ info.is_rt = is_realtime(t); \
11549+ info.is_server = 0; \
11550+ info.class = get_class(t); \
11551+ info.budget = (t)->time_slice; \
11552+ info.pid = (t)->pid; \
11553+ info.deadline = (t)->rt_param.times.deadline; \
11554+}
11555+
11556+#define rtinfo(info, t) \
11557+{ \
11558+ info.wcet = get_exec_cost(t); \
11559+ info.period = get_rt_period(t); \
11560+}
11561+
11562+void sched_trace_scheduler_invocation(void)
11563+{
11564+ invocation_record_t rec;
11565+ header(rec, ST_INVOCATION);
11566+ rec.flags = current->flags;
11567+ put_trace(rec);
11568+}
11569+
11570+void sched_trace_task_arrival(struct task_struct *t)
11571+{
11572+ arrival_record_t rec;
11573+ header(rec, ST_ARRIVAL);
11574+ tinfo(rec.task, t);
11575+ put_trace(rec);
11576+}
11577+
11578+
11579+void sched_trace_task_departure(struct task_struct *t)
11580+{
11581+ departure_record_t rec;
11582+ header(rec, ST_DEPARTURE);
11583+ tinfo(rec.task, t);
11584+ put_trace(rec);
11585+}
11586+
11587+void sched_trace_task_preemption(struct task_struct *t, struct task_struct* by)
11588+{
11589+ preemption_record_t rec;
11590+ header(rec, ST_PREEMPTION);
11591+ tinfo(rec.task, t);
11592+ tinfo(rec.by, by);
11593+ put_trace(rec);
11594+}
11595+
11596+
11597+void sched_trace_task_scheduled(struct task_struct *t)
11598+{
11599+ scheduled_record_t rec;
11600+ header(rec, ST_SCHEDULED);
11601+ tinfo(rec.task, t);
11602+ put_trace(rec);
11603+}
11604+
11605+
11606+void sched_trace_job_release(struct task_struct *t)
11607+{
11608+ release_record_t rec;
11609+ header(rec, ST_JOB_RELEASE);
11610+ tinfo(rec.task, t);
11611+ rtinfo(rec, t);
11612+ put_trace(rec);
11613+}
11614+
11615+void sched_trace_job_completion(struct task_struct *t)
11616+{
11617+ completion_record_t rec;
11618+ header(rec, ST_JOB_COMPLETION);
11619+ tinfo(rec.task, t);
11620+ rtinfo(rec, t);
11621+ rec.tardiness = jiffies - t->rt_param.times.deadline;
11622+ rec.job_no = t->rt_param.times.job_no;
11623+	TRACE_TASK(t, "tardiness: %d\n", rec.tardiness);
11624+ put_trace(rec);
11625+}
11626+
11627+
11628+void sched_trace_server_scheduled(int id, task_class_t class,
11629+ unsigned int budget, jiffie_t deadline)
11630+{
11631+ scheduled_record_t rec;
11632+ header(rec, ST_SCHEDULED);
11633+ rec.task.pid = id;
11634+ rec.task.is_rt = 1;
11635+ rec.task.is_server = 1;
11636+ rec.task.class = class;
11637+ rec.task.budget = budget;
11638+ rec.task.deadline = deadline;
11639+ put_trace(rec);
11640+}
11641+
11642+void sched_trace_server_release(int id, unsigned int wcet,
11643+ unsigned int period, task_class_t class)
11644+{
11645+ release_record_t rec;
11646+ header(rec, ST_JOB_RELEASE);
11647+ rec.task.pid = id;
11648+ rec.task.is_rt = 1;
11649+ rec.task.is_server = 1;
11650+ rec.task.class = class;
11651+ rec.task.budget = wcet;
11652+ rec.period = period;
11653+ rec.wcet = wcet;
11654+ put_trace(rec);
11655+}
11656+
11657+void sched_trace_server_completion(int id, unsigned int budget,
11658+ jiffie_t deadline, task_class_t class)
11659+{
11660+ completion_record_t rec;
11661+ header(rec, ST_JOB_COMPLETION);
11662+ rec.task.pid = id;
11663+ rec.task.is_rt = 1;
11664+ rec.task.is_server = 1;
11665+ rec.task.class = class;
11666+ rec.task.budget = budget;
11667+ rec.task.deadline = deadline;
11668+ rec.period = 0;
11669+ rec.tardiness = jiffies - deadline;
11670+ put_trace(rec);
11671+
11672+}
11673+
11674+void sched_trace_capacity_release(struct task_struct *t)
11675+{
11676+ cap_release_record_t rec;
11677+ header(rec, ST_CAPACITY_RELEASE);
11678+ tinfo(rec.task, t);
11679+ put_trace(rec);
11680+}
11681+
11682+void sched_trace_capacity_allocation(struct task_struct *t, u16 budget, u32 deadline,
11683+ pid_t donor)
11684+{
11685+ cap_allocation_record_t rec;
11686+ header(rec, ST_CAPACITY_ALLOCATION);
11687+ tinfo(rec.task, t);
11688+ rec.donor = donor;
11689+ rec.budget = budget;
11690+ rec.deadline = deadline;
11691+ put_trace(rec);
11692+}
11693+
11694+void sched_trace_capacity_alloc_srv(pid_t srv, u32 srv_dl, task_class_t cls,
11695+ u16 srv_budget,
11696+ u16 budget, u32 deadline, pid_t donor)
11697+{
11698+ cap_allocation_record_t rec;
11699+ header(rec, ST_CAPACITY_ALLOCATION);
11700+ rec.task.pid = srv;
11701+ rec.task.is_rt = 1;
11702+ rec.task.is_server = 1;
11703+ rec.task.class = cls;
11704+ rec.task.budget = srv_budget;
11705+ rec.task.deadline = srv_dl;
11706+ rec.donor = donor;
11707+ rec.budget = budget;
11708+ rec.deadline = deadline;
11709+ put_trace(rec);
11710+}
11711+
11712+void sched_trace_service_level_change(struct task_struct *t,
11713+ unsigned int from,
11714+ unsigned int to)
11715+{
11716+ service_level_change_record_t rec;
11717+ header(rec, ST_SERVICE_LEVEL_CHANGE);
11718+ tinfo(rec.task, t);
11719+ rec.to = to;
11720+ rec.from = from;
11721+ rec.new_level =
11722+ t->rt_param.service_level[to];
11723+ rec.old_level =
11724+ t->rt_param.service_level[from];
11725+ put_trace(rec);
11726+}
11727+
11728+void sched_trace_weight_error(struct task_struct* t, fp_t actual)
11729+{
11730+ weight_error_record_t rec;
11731+ header(rec, ST_WEIGHT_ERROR);
11732+ rec.task = t->pid;
11733+ rec.actual = actual;
11734+ rec.estimate = get_est_weight(t);
11735+ put_trace(rec);
11736+}
11737+
11738+
11739+#endif
11740diff --git a/kernel/timer.c b/kernel/timer.c
11741index c2a8ccf..77a1b6b 100644
11742--- a/kernel/timer.c
11743+++ b/kernel/timer.c
11744@@ -737,6 +737,27 @@ static inline s64 __get_nsec_offset(void)
11745 return ns_offset;
11746 }
11747
11748+/* Non-static, non-inline, public version of __get_nsec_offset() above.
11749+ * It is up to the caller to decide how to use it; no guarantees
11750+ * are made here.
11751+ */
11752+s64 get_nsec_offset(void)
11753+{
11754+ cycle_t cycle_now, cycle_delta;
11755+ s64 ns_offset;
11756+
11757+ /* read clocksource: */
11758+ cycle_now = clocksource_read(clock);
11759+
11760+ /* calculate the delta since the last update_wall_time: */
11761+ cycle_delta = (cycle_now - clock->cycle_last) & clock->mask;
11762+
11763+ /* convert to nanoseconds: */
11764+ ns_offset = cyc2ns(clock, cycle_delta);
11765+
11766+ return ns_offset;
11767+}
11768+
11769 /**
11770 * __get_realtime_clock_ts - Returns the time of day in a timespec
11771 * @ts: pointer to the timespec to be set
11772@@ -789,6 +810,7 @@ void do_gettimeofday(struct timeval *tv)
11773 }
11774
11775 EXPORT_SYMBOL(do_gettimeofday);
11776+
11777 /**
11778 * do_settimeofday - Sets the time of day
11779 * @tv: pointer to the timespec variable containing the new time
11780diff --git a/kernel/trace.c b/kernel/trace.c
11781new file mode 100644
11782index 0000000..6119574
11783--- /dev/null
11784+++ b/kernel/trace.c
11785@@ -0,0 +1,302 @@
11786+#include <linux/fs.h>
11787+#include <linux/cdev.h>
11788+#include <asm/semaphore.h>
11789+#include <asm/uaccess.h>
11790+#include <linux/module.h>
11791+
11792+#include <linux/trace.h>
11793+
11794+/******************************************************************************/
11795+/* Allocation */
11796+/******************************************************************************/
11797+
11798+struct ft_buffer* trace_ts_buf = NULL;
11799+
11800+static unsigned int ts_seq_no = 0;
11801+
11802+feather_callback void save_timestamp(unsigned long event)
11803+{
11804+ unsigned int seq_no = fetch_and_inc((int *) &ts_seq_no);
11805+ struct timestamp *ts;
11806+ if (ft_buffer_start_write(trace_ts_buf, (void**) &ts)) {
11807+ ts->event = event;
11808+ ts->timestamp = ft_read_tsc();
11809+ ts->seq_no = seq_no;
11810+ ts->cpu = raw_smp_processor_id();
11811+ ft_buffer_finish_write(trace_ts_buf, ts);
11812+ }
11813+}
11814+
11815+static struct ft_buffer* alloc_ft_buffer(unsigned int count, size_t size)
11816+{
11817+ struct ft_buffer* buf;
11818+ size_t total = (size + 1) * count;
11819+ char* mem;
11820+ int order = 0, pages = 1;
11821+
11822+ buf = kmalloc(sizeof(struct ft_buffer), GFP_KERNEL);
11823+ if (!buf)
11824+ return NULL;
11825+
11826+ total = (total / PAGE_SIZE) + (total % PAGE_SIZE != 0);
11827+ while (pages < total) {
11828+ order++;
11829+ pages *= 2;
11830+ }
11831+
11832+ mem = (char*) __get_free_pages(GFP_KERNEL, order);
11833+ if (!mem) {
11834+ kfree(buf);
11835+ return NULL;
11836+ }
11837+
11838+ if (!init_ft_buffer(buf, count, size,
11839+ mem + (count * size), /* markers at the end */
11840+ mem)) { /* buffer objects */
11841+ free_pages((unsigned long) mem, order);
11842+ kfree(buf);
11843+ return NULL;
11844+ }
11845+ return buf;
11846+}
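
To make the rounding above concrete: for count = 1000 slots of size = 16 bytes, total = (16 + 1) * 1000 = 17,000 bytes, which with 4 KiB pages is 5 pages; the while loop then rounds up to the next power of two, 8 pages, i.e. order 3 is passed to __get_free_pages(). The extra byte per slot accounts for the marker area placed at the end of the region.
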
11847+
11848+static void free_ft_buffer(struct ft_buffer* buf)
11849+{
11850+ int order = 0, pages = 1;
11851+ size_t total;
11852+
11853+ if (buf) {
11854+ total = (buf->slot_size + 1) * buf->slot_count;
11855+ total = (total / PAGE_SIZE) + (total % PAGE_SIZE != 0);
11856+ while (pages < total) {
11857+ order++;
11858+ pages *= 2;
11859+ }
11860+ free_pages((unsigned long) buf->buffer_mem, order);
11861+ kfree(buf);
11862+ }
11863+}
11864+
11865+
11866+/******************************************************************************/
11867+/* DEVICE FILE DRIVER */
11868+/******************************************************************************/
11869+
11870+#define NO_TIMESTAMPS 262144
11871+
11872+static DECLARE_MUTEX(feather_lock);
11873+static int use_count = 0;
11874+
11875+static int trace_release(struct inode *in, struct file *filp)
11876+{
11877+	int err = 0;
11878+
11879+ if (down_interruptible(&feather_lock)) {
11880+ err = -ERESTARTSYS;
11881+ goto out;
11882+ }
11883+
11884+ printk(KERN_ALERT "%s/%d disconnects from feather trace device. "
11885+ "use_count=%d\n",
11886+ current->comm, current->pid, use_count);
11887+
11888+ if (use_count == 1) {
11889+ /* disable events */
11890+ ft_disable_all_events();
11891+
11892+ /* wait for any pending events to complete */
11893+ set_current_state(TASK_UNINTERRUPTIBLE);
11894+ schedule_timeout(HZ);
11895+
11896+ printk(KERN_ALERT "Failed trace writes: %u\n",
11897+ trace_ts_buf->failed_writes);
11898+
11899+ free_ft_buffer(trace_ts_buf);
11900+ trace_ts_buf = NULL;
11901+ }
11902+
11903+ use_count--;
11904+ up(&feather_lock);
11905+out:
11906+ return err;
11907+}
11908+
11909+
11910+static ssize_t trace_read(struct file *filp, char __user *to, size_t len,
11911+ loff_t *f_pos)
11912+{
11913+ /* we ignore f_pos, this is strictly sequential */
11914+ ssize_t error = 0;
11915+ struct timestamp ts;
11916+
11917+ if (down_interruptible(&feather_lock)) {
11918+ error = -ERESTARTSYS;
11919+ goto out;
11920+ }
11921+
11922+
11923+ while (len >= sizeof(struct timestamp)) {
11924+ if (ft_buffer_read(trace_ts_buf, &ts)) {
11925+ if (copy_to_user(to, &ts, sizeof(struct timestamp))) {
11926+ error = -EFAULT;
11927+ break;
11928+ } else {
11929+ len -= sizeof(struct timestamp);
11930+ to += sizeof(struct timestamp);
11931+ error += sizeof(struct timestamp);
11932+ }
11933+ } else {
11934+ set_current_state(TASK_INTERRUPTIBLE);
11935+ schedule_timeout(50);
11936+ if (signal_pending(current)) {
11937+ error = -ERESTARTSYS;
11938+ break;
11939+ }
11940+ }
11941+ }
11942+ up(&feather_lock);
11943+out:
11944+ return error;
11945+}
11946+
11947+#define ENABLE_CMD 0
11948+#define DISABLE_CMD 1
11949+
11950+static ssize_t trace_write(struct file *filp, const char __user *from,
11951+ size_t len, loff_t *f_pos)
11952+{
11953+ ssize_t error = -EINVAL;
11954+ unsigned long cmd;
11955+ unsigned long id;
11956+
11957+ if (len % sizeof(long) || len < 2 * sizeof(long))
11958+ goto out;
11959+
11960+ if (copy_from_user(&cmd, from, sizeof(long))) {
11961+ error = -EFAULT;
11962+ goto out;
11963+ }
11964+ len -= sizeof(long);
11965+ from += sizeof(long);
11966+
11967+ if (cmd != ENABLE_CMD && cmd != DISABLE_CMD)
11968+ goto out;
11969+
11970+ if (down_interruptible(&feather_lock)) {
11971+ error = -ERESTARTSYS;
11972+ goto out;
11973+ }
11974+
11975+ error = sizeof(long);
11976+ while (len) {
11977+ if (copy_from_user(&id, from, sizeof(long))) {
11978+ error = -EFAULT;
11979+ goto out;
11980+ }
11981+ len -= sizeof(long);
11982+ from += sizeof(long);
11983+ if (cmd) {
11984+ printk(KERN_INFO
11985+ "Disabling feather-trace event %lu.\n", id);
11986+ ft_disable_event(id);
11987+ } else {
11988+ printk(KERN_INFO
11989+ "Enabling feather-trace event %lu.\n", id);
11990+ ft_enable_event(id);
11991+ }
11992+ error += sizeof(long);
11993+ }
11994+
11995+ up(&feather_lock);
11996+ out:
11997+ return error;
11998+}
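
The write() protocol implemented above is a sequence of native longs: one command word (ENABLE_CMD = 0 or DISABLE_CMD = 1) followed by one or more event ids. A user-space sketch; the event ids 100/101 are illustrative, and the device node must be created by hand (e.g. "mknod /dev/ft_trace c 252 0", matching FT_TRACE_MAJOR below):

    /* Sketch: enable two feather-trace events. */
    #include <fcntl.h>
    #include <unistd.h>

    int enable_demo_events(void)
    {
            long cmd[3] = { 0 /* ENABLE_CMD */, 100, 101 };
            int fd = open("/dev/ft_trace", O_WRONLY);

            if (fd < 0)
                    return -1;
            /* trace_write() returns the number of bytes it consumed. */
            if (write(fd, cmd, sizeof(cmd)) != sizeof(cmd)) {
                    close(fd);
                    return -1;
            }
            close(fd);
            return 0;
    }
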
11999+
12000+static int trace_open(struct inode *in, struct file *filp)
12001+{
12002+ int err = 0;
12003+ unsigned int count = NO_TIMESTAMPS;
12004+
12005+ if (down_interruptible(&feather_lock)) {
12006+ err = -ERESTARTSYS;
12007+ goto out;
12008+ }
12009+
12010+ while (count && !trace_ts_buf) {
12011+ printk("trace: trying to allocate %u time stamps.\n", count);
12012+ trace_ts_buf = alloc_ft_buffer(count, sizeof(struct timestamp));
12013+ count /= 2;
12014+ }
12015+ if (!trace_ts_buf)
12016+ err = -ENOMEM;
12017+ else
12018+ use_count++;
12019+
12020+ up(&feather_lock);
12021+out:
12022+ return err;
12023+}
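
trace_open() sizes the timestamp buffer optimistically: it first asks for NO_TIMESTAMPS = 262,144 slots and, if alloc_ft_buffer() cannot get the pages, retries with 131,072, then 65,536, and so on, halving until an allocation succeeds or the count reaches zero. The resulting memory footprint depends on sizeof(struct timestamp), which is defined elsewhere in the patch.
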
12024+
12025+/******************************************************************************/
12026+/* Device Registration */
12027+/******************************************************************************/
12028+
12029+#define FT_TRACE_MAJOR 252
12030+
12031+struct file_operations ft_trace_fops = {
12032+ .owner = THIS_MODULE,
12033+ .open = trace_open,
12034+ .release = trace_release,
12035+ .write = trace_write,
12036+ .read = trace_read,
12037+};
12038+
12039+
12040+static int __init register_buffer_dev(const char* name,
12041+ struct file_operations* fops,
12042+ int major, int count)
12043+{
12044+ dev_t trace_dev;
12045+ struct cdev *cdev;
12046+ int error = 0;
12047+
12048+ trace_dev = MKDEV(major, 0);
12049+ error = register_chrdev_region(trace_dev, count, name);
12050+ if (error)
12051+ {
12052+ printk(KERN_WARNING "trace: "
12053+ "Could not register major/minor number %d\n", major);
12054+ return error;
12055+ }
12056+ cdev = cdev_alloc();
12057+ if (!cdev) {
12058+ printk(KERN_WARNING "trace: "
12059+ "Could not get a cdev for %s.\n", name);
12060+ return -ENOMEM;
12061+ }
12062+ cdev->owner = THIS_MODULE;
12063+ cdev->ops = fops;
12064+ error = cdev_add(cdev, trace_dev, count);
12065+ if (error) {
12066+ printk(KERN_WARNING "trace: "
12067+		       "cdev_add failed for %s.\n", name);
12068+ return -ENOMEM;
12069+ }
12070+ return error;
12071+
12072+}
12073+
12074+static int __init init_sched_trace(void)
12075+{
12076+ int error = 0;
12077+
12078+ printk("Initializing Feather-Trace device\n");
12079+ /* dummy entry to make linker happy */
12080+ ft_event0(666, save_timestamp);
12081+
12082+ error = register_buffer_dev("ft_trace", &ft_trace_fops,
12083+ FT_TRACE_MAJOR, 1);
12084+ return error;
12085+}
12086+
12087+module_init(init_sched_trace);
12088diff --git a/lib/semaphore-sleepers.c b/lib/semaphore-sleepers.c
12089index 1281805..3f4d543 100644
12090--- a/lib/semaphore-sleepers.c
12091+++ b/lib/semaphore-sleepers.c
12092@@ -108,7 +108,7 @@ fastcall int __sched __down_interruptible(struct semaphore * sem)
12093 /*
12094 * With signals pending, this turns into
12095 * the trylock failure case - we won't be
12096- * sleeping, and we* can't get the lock as
12097+ * sleeping, and we can't get the lock as
12098 * it has contention. Just correct the count
12099 * and exit.
12100 */
diff --git a/index.html b/index.html
index bbdcf1a..623a80f 100644
--- a/index.html
+++ b/index.html
@@ -30,13 +30,26 @@
30 kernel with focus on multiprocessor real-time scheduling and 30 kernel with focus on multiprocessor real-time scheduling and
31 synchronization. The Linux kernel is modified to support the sporadic task 31 synchronization. The Linux kernel is modified to support the sporadic task
32 model and modular scheduler plugins. Both partitioned and global scheduling 32 model and modular scheduler plugins. Both partitioned and global scheduling
33 is supported. In the current version (2007.1), scheduler plugins that 33 is supported. In the current version (2007.2), plugins for the following
34 implement various EDF variants and PFAIR scheduling are included. 34 scheduling policies are included:
35 <ul>
36 <li> Partitioned EDF (P-EDF)</li>
37 <li> Partitioned EDF with synchronization support (PSN-EDF)</li>
38 <li> Global EDF (G-EDF)</li>
39 <li> Global EDF with synchronization support (GSN-EDF)</li>
40 <li> Global non-preemptive EDF (G-NP-EDF)</li>
41 <li> Global Feedback-Controlled EDF (FC-EDF)</li>
42 <li> EDF for heterogeneous task systems (EDF-HSB)</li>
43 <li> PFAIR (both staggered and aligned quanta are supported)</li>
44 </ul>
45
46 The latest public release of LITMUS<sup>RT</sup> occurred on 10/29/2007.
35 </p> 47 </p>
36 48
37 <p class="nobottommargin"> 49<!-- <p class="nobottommargin">
38 <em>To be continued...</em> 50 <em>To be continued...</em>
39 </p> 51 </p>
52-->
40 </div> 53 </div>
41 54
42 <h2 id="support">Support</h2> 55 <h2 id="support">Support</h2>
@@ -142,7 +155,8 @@
142 General Public License (GPL)</a>. 155 General Public License (GPL)</a>.
143 </p> 156 </p>
144 <p> 157 <p>
145 The current release (2007.1) consists of 158 The latest version of LITMUS<sup>RT</sup> is 2007.2 and was released on 10/29/2007.
159 It consists of
146 our Linux kernel modifications in the form of 160 our Linux kernel modifications in the form of
147 a patch against Linux 2.6.20, 161 a patch against Linux 2.6.20,
148 <span class="src">liblitmus</span>, the user-space API for real-time tasks, 162 <span class="src">liblitmus</span>, the user-space API for real-time tasks,
@@ -152,32 +166,47 @@
152 166
153 <p class="nobottommargin"> 167 <p class="nobottommargin">
154 <ul> 168 <ul>
155 <li><a href="download/litmus-rt-2007.1.patch">litmus-rt-2007.1.patch</a> 169 <li><a href="download/litmus-rt-2007.2.patch">litmus-rt-2007.2.patch</a>
156 (266 KB)<br/> 170 (328 KB)<br/>
157 Applies 171 Applies
158 against Linux 2.6.20 (see <a href="#install">Section Install</a> below).</li> 172 against Linux 2.6.20 (see <a href="#install">Section Install</a> below).</li>
159 173
160 <li><a href="download/liblitmus-2007.1.tgz">liblitmus-2007.1.tgz</a> 174 <li><a href="download/liblitmus-2007.2.tgz">liblitmus-2007.2.tgz</a>
161 (6.8 KB) 175 (11 KB)
162 </li> 176 </li>
163 177
164 <li><a href="download/libso-2007.1.tgz">libso-2007.1.tgz</a> 178 <li><a href="download/libso-2007.2.tgz">libso-2007.2.tgz</a>
165 (16 KB) 179 (16 KB)
166 </li> 180 </li>
167 </ul> 181 </ul>
168 Please note that the current implementation is a <em>prototype</em> with 182 Please note that the current implementation is a <em>prototype</em> with
169 certain limitations. Most notably, it is not safe in a multiuser context, 183 certain limitations. Most notably, it is not secure in a multiuser context,
170 <em>i.e.</em>, real-time system calls do not check for superuser 184 <em>i.e.</em>, real-time system calls do not require superuser
171 privileges. Further, some resources (<em>e.g.</em> semaphores) that 185 privileges. Further, some resources (<em>e.g.</em> semaphores) that
172 should be dynamically allocated are allocated statically in the current version. 186 should be dynamically allocated are allocated statically in the current version.
173 </p> 187 </p>
188
189 <p class="nobottommargin">
190 Old releases:
191 <ul>
192 <li> 2007.1 (May 2007)<br/>
193 Based on Linux 2.6.20. <br/>
194 <a href="download/litmus-rt-2007.1.patch">litmus-rt-2007.1.patch</a>
195 (266 KB) <br/>
196 <a href="download/liblitmus-2007.1.tgz">liblitmus-2007.1.tgz</a>
197 (6.8 KB) <br/>
198 <a href="download/libso-2007.1.tgz">libso-2007.1.tgz</a>
199 (16 KB) <br/>
200 </li>
201 </ul>
202 </p>
174 </div> 203 </div>
175 204
176 205
177 <h2 id="install">Installation</h2> 206 <h2 id="install">Installation</h2>
178 <div class="box"> 207 <div class="box">
179 <p class="notopmargin"> 208 <p class="notopmargin">
180 The current release of LITMUS<sup>RT</sup>, version 2007.1, consists of an 209 The current release of LITMUS<sup>RT</sup>, version 2007.2, consists of an
181 extension of the Linux kernel that adds support for the sporadic task 210 extension of the Linux kernel that adds support for the sporadic task
182 model, a scheduler plugin infrastructure, and some scheduler plugins, as 211 model, a scheduler plugin infrastructure, and some scheduler plugins, as
183 well as two user-space libraries that provide the LITMUS<sup>RT</sup> 212 well as two user-space libraries that provide the LITMUS<sup>RT</sup>
@@ -202,11 +231,11 @@ cd $DIR
202# get Linux 2.6.20 231# get Linux 2.6.20
203wget http://www.kernel.org/pub/linux/kernel/v2.6/linux-2.6.20.tar.bz2 232wget http://www.kernel.org/pub/linux/kernel/v2.6/linux-2.6.20.tar.bz2
204tar xjf linux-2.6.20.tar.bz2 233tar xjf linux-2.6.20.tar.bz2
205wget http://www.cs.unc.edu/~anderson/litmus-rt/download/litmus-rt-2007.1.patch 234wget http://www.cs.unc.edu/~anderson/litmus-rt/download/litmus-rt-2007.2.patch
206mv linux-2.6.20 litmus-rt 235mv linux-2.6.20 litmus-rt
207# apply the LITMUS RT patch 236# apply the LITMUS RT patch
208cd litmus-rt 237cd litmus-rt
209patch -p1 < ../litmus-rt-2007.1.patch 238patch -p1 < ../litmus-rt-2007.2.patch
210# create a working kernel configuration with HZ=1000 239# create a working kernel configuration with HZ=1000
211make gconfig 240make gconfig
212# compile the kernel 241# compile the kernel
@@ -223,7 +252,7 @@ make modules
223 class="src">rtsched</span> kernel parameter. 252 class="src">rtsched</span> kernel parameter.
224 </p> 253 </p>
225<pre class="shell"> 254<pre class="shell">
226rtsched={linux, pfair, part_edf, global_edf, global_edf_np, edf_hsb, gsn_edf, psn_edf} 255rtsched={linux, pfair, part_edf, global_edf, global_edf_np, edf_hsb, gsn_edf, psn_edf, adaptive}
227</pre> 256</pre>
228 <p> 257 <p>
229 For example, on our test machine, we use the 258 For example, on our test machine, we use the
@@ -246,8 +275,8 @@ initrd /boot/kernel-2.6.20-LITMUSRT.img
246 </p> 275 </p>
247<pre class="shell"> 276<pre class="shell">
248cd $DIR 277cd $DIR
249wget http://www.cs.unc.edu/~anderson/litmus-rt/download/liblitmus-2007.1.tgz 278wget http://www.cs.unc.edu/~anderson/litmus-rt/download/liblitmus-2007.2.tgz
250tar xzf liblitmus-2007.1.tgz 279tar xzf liblitmus-2007.2.tgz
251cd liblitmus 280cd liblitmus
252make 281make
253</pre> 282</pre>
@@ -260,8 +289,8 @@ make
260 </p> 289 </p>
261<pre class="shell"> 290<pre class="shell">
262cd $DIR 291cd $DIR
263wget http://www.cs.unc.edu/~anderson/litmus-rt/download/libso-2007.1.tgz 292wget http://www.cs.unc.edu/~anderson/litmus-rt/download/libso-2007.2.tgz
264tar xzf libso-2007.1.tgz 293tar xzf libso-2007.2.tgz
265cd libso 294cd libso
266make 295make
267make tests 296make tests
@@ -277,13 +306,27 @@ make tests
277 306
278 <h2 id="doc">Documentation</h2> 307 <h2 id="doc">Documentation</h2>
279 <div class="box"> 308 <div class="box">
309
280 <p class="nomargin"> 310 <p class="nomargin">
311 Most of the documentation has yet to be written. To get an overview of
312 the architecture of the kernel extension, we recommend to read the paper
313 <a href="http://www.cs.unc.edu/~anderson/papers/rtlws07.pdf">&ldquo;LITMUS<sup>RT</sup>:
314 A Status Report&rdquo;</a>.
315 <br/>
316 <br/>
317 Please contact <span class="src">bbb[AT]cs.unc.edu</span> if you have any
318 questions.
319 </p>
320
321<!-- <p class="nomargin">
281 <em>To be written...</em> 322 <em>To be written...</em>
282 <ul class="nomargin"> 323 <ul class="nomargin">
283 <li>How to use LITMUS<sup>RT</sup></li> 324 <li>How to use LITMUS<sup>RT</sup></li>
284 <li>A real-time &quot;Hello World!&quot;</li> 325 <li>A real-time &quot;Hello World!&quot;</li>
285 </ul> 326 </ul>
286 </p> 327 </p>
328
329-->
287 </div> 330 </div>
288 331
289 <hr/> 332 <hr/>