Diffstat (limited to 'litmus')
-rw-r--r--  litmus/Kconfig             303
-rw-r--r--  litmus/Makefile             33
-rw-r--r--  litmus/affinity.c           42
-rw-r--r--  litmus/bheap.c             314
-rw-r--r--  litmus/binheap.c           388
-rw-r--r--  litmus/budget.c            113
-rw-r--r--  litmus/clustered.c         111
-rw-r--r--  litmus/ctrldev.c           160
-rw-r--r--  litmus/edf_common.c        200
-rw-r--r--  litmus/fdso.c              306
-rw-r--r--  litmus/fp_common.c         119
-rw-r--r--  litmus/ft_event.c           43
-rw-r--r--  litmus/ftdev.c             446
-rw-r--r--  litmus/jobs.c               73
-rw-r--r--  litmus/litmus.c            579
-rw-r--r--  litmus/litmus_proc.c       347
-rw-r--r--  litmus/locking.c           236
-rw-r--r--  litmus/preempt.c           137
-rw-r--r--  litmus/rt_domain.c         349
-rw-r--r--  litmus/sched_cedf.c        857
-rw-r--r--  litmus/sched_gsn_edf.c    1030
-rw-r--r--  litmus/sched_litmus.c      330
-rw-r--r--  litmus/sched_pfair.c      1074
-rw-r--r--  litmus/sched_pfp.c        1751
-rw-r--r--  litmus/sched_plugin.c      227
-rw-r--r--  litmus/sched_psn_edf.c     999
-rw-r--r--  litmus/sched_task_trace.c  272
-rw-r--r--  litmus/sched_trace.c       252
-rw-r--r--  litmus/srp.c               305
-rw-r--r--  litmus/sync.c              152
-rw-r--r--  litmus/trace.c             300
-rw-r--r--  litmus/uncachedev.c        102
32 files changed, 11950 insertions, 0 deletions
diff --git a/litmus/Kconfig b/litmus/Kconfig
new file mode 100644
index 00000000000..795fbe1a769
--- /dev/null
+++ b/litmus/Kconfig
@@ -0,0 +1,303 @@
1menu "LITMUS^RT"
2
3menu "Scheduling"
4
5config PLUGIN_CEDF
6 bool "Clustered-EDF"
7 depends on X86 && SYSFS
8 default y
9 help
10 Include the Clustered EDF (C-EDF) plugin in the kernel.
11 This is appropriate for large platforms with shared caches.
12 On smaller platforms (e.g., ARM PB11MPCore), using C-EDF
13 makes little sense since there aren't any shared caches.
14
15config PLUGIN_PFAIR
16 bool "PFAIR"
17 depends on HIGH_RES_TIMERS && !NO_HZ
18 default y
19 help
20 Include the PFAIR plugin (i.e., the PD^2 scheduler) in the kernel.
21 The PFAIR plugin requires high resolution timers (for staggered quanta)
22 and does not support NO_HZ (quanta could be missed when the system is idle).
23
24 If unsure, say Yes.
25
26config RELEASE_MASTER
27 bool "Release-master Support"
28 depends on ARCH_HAS_SEND_PULL_TIMERS
29 default n
30 help
31 Allow one processor to act as a dedicated interrupt processor
32 that services all timer interrupts, but that does not schedule
33 real-time tasks. See RTSS'09 paper for details
34 (http://www.cs.unc.edu/~anderson/papers.html).
35 Currently only supported by GSN-EDF.
36
37endmenu
38
39menu "Real-Time Synchronization"
40
41config NP_SECTION
42 bool "Non-preemptive section support"
43 default n
44 help
45 Allow tasks to become non-preemptable.
46 Note that plugins still need to explicitly support non-preemptivity.
47 Currently, only GSN-EDF and PSN-EDF have such support.
48
49 This is required to support locking protocols such as the FMLP.
50 If disabled, all tasks will be considered preemptable at all times.
51
52config LITMUS_LOCKING
53 bool "Support for real-time locking protocols"
54 depends on NP_SECTION
55 default n
56 help
57 Enable LITMUS^RT's deterministic multiprocessor real-time
58 locking protocols.
59
60 Say Yes if you want to include locking protocols such as the FMLP and
61 Baker's SRP.
62
63endmenu
64
65menu "Performance Enhancements"
66
67config SCHED_CPU_AFFINITY
68 bool "Local Migration Affinity"
69 depends on X86
70 default y
71 help
72 Rescheduled tasks prefer CPUs near to their previously used CPU. This
73 may improve performance through possible preservation of cache affinity.
74
75 Warning: May make bugs harder to find since tasks may migrate less often.
76
77 NOTES:
78 * Feature is not utilized by PFair/PD^2.
79
80 Say Yes if unsure.
81
82config ALLOW_EARLY_RELEASE
83 bool "Allow Early Releasing"
84 default y
85 help
86 Allow tasks to release jobs early (while still maintaining job
87 precedence constraints). Only supported by EDF schedulers. Early
88 releasing must be explicitly requested by real-time tasks via
89 the task_params passed to sys_set_task_rt_param().
90
91 Early releasing can improve job response times while maintaining
92 real-time correctness. However, it can easily peg your CPUs
93 since tasks never suspend to wait for their next job. As such, early
94 releasing is really only useful in the context of implementing
95 bandwidth servers, interrupt handling threads, or short-lived
96 computations.
97
98 Beware that early releasing may affect real-time analysis
99 if using locking protocols or I/O.
100
101 Say Yes if unsure.
102
103choice
104 prompt "EDF Tie-Break Behavior"
105 default EDF_TIE_BREAK_LATENESS_NORM
106 help
107 Allows the configuration of tie-breaking behavior when the deadlines
108 of two EDF-scheduled tasks are equal.
109
110 config EDF_TIE_BREAK_LATENESS
111 bool "Lateness-based Tie Break"
112 help
113 Break ties between two jobs, A and B, based upon the lateness of their
114 prior jobs. The job with the greatest lateness has priority. Note that
115 lateness has a negative value if the prior job finished before its
116 deadline.
117
118 config EDF_TIE_BREAK_LATENESS_NORM
119 bool "Normalized Lateness-based Tie Break"
120 help
121 Break ties between two jobs, A and B, based upon the lateness, normalized
122 by relative deadline, of their prior jobs. The job with the greatest
123 normalized lateness has priority. Note that lateness has a negative value
124 if the prior job finished before its deadline.
125
126 Normalized lateness tie-breaks are likely desirable over non-normalized
127 tie-breaks if the execution times and/or relative deadlines of tasks in a
128 task set vary greatly.
129
130 config EDF_TIE_BREAK_HASH
131 bool "Hash-based Tie Breaks"
132 help
133 Break ties between two jobs, A and B, with equal deadlines by using a
134 uniform hash; i.e.: hash(A.pid, A.job_num) < hash(B.pid, B.job_num). Job
135 A has a ~50% chance of winning a given tie-break.
136
137 config EDF_PID_TIE_BREAK
138 bool "PID-based Tie Breaks"
139 help
140 Break ties based upon OS-assigned thread IDs. Use this option if
141 required by the algorithm's real-time analysis or if per-task
142 response-time jitter must be minimized.
143
144 NOTES:
145 * This tie-breaking method was the default in Litmus 2012.2 and before.
146
147endchoice
148
149endmenu
150
151menu "Tracing"
152
153config FEATHER_TRACE
154 bool "Feather-Trace Infrastructure"
155 default y
156 help
157 Feather-Trace basic tracing infrastructure. Includes device file
158 driver and instrumentation point support.
159
160 There are actually two implementations of Feather-Trace.
161 1) A slower, but portable, default implementation.
162 2) Architecture-specific implementations that rewrite kernel .text at runtime.
163
164 If enabled, Feather-Trace will be based on 2) if available (currently only for x86).
165 However, if DEBUG_RODATA=y, then Feather-Trace will choose option 1) in any case
166 to avoid problems with write-protected .text pages.
167
168 Bottom line: to avoid increased overheads, choose DEBUG_RODATA=n.
169
170 Note that this option only enables the basic Feather-Trace infrastructure;
171 you still need to enable SCHED_TASK_TRACE and/or SCHED_OVERHEAD_TRACE to
172 actually enable any events.
173
174config SCHED_TASK_TRACE
175 bool "Trace real-time tasks"
176 depends on FEATHER_TRACE
177 default y
178 help
179 Include support for the sched_trace_XXX() tracing functions. This
180 allows the collection of real-time task events such as job
181 completions, job releases, early completions, etc. This results in a
182 small overhead in the scheduling code. Disable if the overhead is not
183 acceptable (e.g., benchmarking).
184
185 Say Yes for debugging.
186 Say No for overhead tracing.
187
188config SCHED_TASK_TRACE_SHIFT
189 int "Buffer size for sched_trace_xxx() events"
190 depends on SCHED_TASK_TRACE
191 range 8 13
192 default 9
193 help
194
195 Select the buffer size of sched_trace_xxx() events as a power of two.
196 These buffers are statically allocated as per-CPU data. Each event
197 requires 24 bytes storage plus one additional flag byte. Too large
198 buffers can cause issues with the per-cpu allocator (and waste
199 memory). Too small buffers can cause scheduling events to be lost. The
200 "right" size is workload dependent and depends on the number of tasks,
201 each task's period, each task's number of suspensions, and how often
202 the buffer is flushed.
203
204 Examples: 12 => 4k events
205 10 => 1k events
206 8 => 512 events
207
208config SCHED_LITMUS_TRACEPOINT
209 bool "Enable Event/Tracepoint Tracing for real-time task tracing"
210 depends on TRACEPOINTS
211 default n
212 help
213 Enable kernel-style events (tracepoint) for Litmus. Litmus events
214 trace the same functions as the above sched_trace_XXX(), but can
215 be enabled independently.
216 Litmus tracepoints can be recorded and analyzed together (single
217 time reference) with all other kernel tracing events (e.g.,
218 sched:sched_switch, etc.).
219
220 This also enables a quick way to visualize schedule traces using
221 the trace-cmd utility and the kernelshark visualizer.
222
223 Say Yes for debugging and visualization purposes.
224 Say No for overhead tracing.
225
226config SCHED_OVERHEAD_TRACE
227 bool "Record timestamps for overhead measurements"
228 depends on FEATHER_TRACE
229 default n
230 help
231 Export event stream for overhead tracing.
232 Say Yes for overhead tracing.
233
234config SCHED_DEBUG_TRACE
235 bool "TRACE() debugging"
236 default y
237 help
238 Include support for sched_trace_log_message(), which is used to
239 implement TRACE(). If disabled, no TRACE() messages will be included
240 in the kernel, and no overheads due to debugging statements will be
241 incurred by the scheduler. Disable if the overhead is not acceptable
242 (e.g. benchmarking).
243
244 Say Yes for debugging.
245 Say No for overhead tracing.
246
247config SCHED_DEBUG_TRACE_SHIFT
248 int "Buffer size for TRACE() buffer"
249 depends on SCHED_DEBUG_TRACE
250 range 14 22
251 default 18
252 help
253
254 Select the amount of memory needed for the TRACE() buffer, as a
255 power of two. The TRACE() buffer is global and statically allocated. If
256 the buffer is too small, there will be holes in the TRACE() log if the
257 buffer-flushing task is starved.
258
259 The default should be sufficient for most systems. Increase the buffer
260 size if the log contains holes. Reduce the buffer size when running on
261 a memory-constrained system.
262
263 Examples: 14 => 16KB
264 18 => 256KB
265 20 => 1MB
266
267 This buffer is exported to userspace using a misc device as
268 'litmus/log'. On a system with default udev rules, a corresponding
269 character device node should be created at /dev/litmus/log. The buffer
270 can be flushed using cat, e.g., 'cat /dev/litmus/log > my_log_file.txt'.
271
272config SCHED_DEBUG_TRACE_CALLER
273 bool "Include [function@file:line] tag in TRACE() log"
274 depends on SCHED_DEBUG_TRACE
275 default n
276 help
277 With this option enabled, TRACE() prepends
278
279 "[<function name>@<filename>:<line number>]"
280
281 to each message in the debug log. Enable this to aid in figuring out
282 what was called in which order. The downside is that it adds a lot of
283 clutter.
284
285 If unsure, say No.
286
287config PREEMPT_STATE_TRACE
288 bool "Trace preemption state machine transitions"
289 depends on SCHED_DEBUG_TRACE && DEBUG_KERNEL
290 default n
291 help
292 With this option enabled, each CPU will log when it transitions
293 states in the preemption state machine. This state machine is
294 used to determine how to react to IPIs (avoid races with in-flight IPIs).
295
296 Warning: this creates a lot of information in the debug trace. Only
297 recommended when you are debugging preemption-related races.
298
299 If unsure, say No.
300
301endmenu
302
303endmenu
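The SCHED_TASK_TRACE_SHIFT and SCHED_DEBUG_TRACE_SHIFT help texts above describe their buffers as powers of two (roughly 24 bytes plus one flag byte per sched_trace event, per CPU, and 2^shift bytes for the global TRACE() buffer). The following standalone sketch is not part of the patch; it merely computes the memory footprint implied by a given configuration, and the shift values and CPU count are illustrative.

#include <stdio.h>

/* Illustrative only: estimate memory implied by the *_SHIFT options above,
 * using the per-event size stated in the Kconfig help text. */
int main(void)
{
        int task_trace_shift = 9;    /* CONFIG_SCHED_TASK_TRACE_SHIFT default */
        int debug_trace_shift = 18;  /* CONFIG_SCHED_DEBUG_TRACE_SHIFT default */
        int cpus = 4;                /* example machine */

        long events_per_cpu = 1L << task_trace_shift;
        long task_trace_bytes = events_per_cpu * (24 + 1) * cpus;
        long debug_trace_bytes = 1L << debug_trace_shift;

        printf("sched_trace buffers: %ld events/CPU, ~%ld bytes total\n",
               events_per_cpu, task_trace_bytes);
        printf("TRACE() buffer:      %ld bytes (global)\n", debug_trace_bytes);
        return 0;
}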
diff --git a/litmus/Makefile b/litmus/Makefile
new file mode 100644
index 00000000000..2bddc94a399
--- /dev/null
+++ b/litmus/Makefile
@@ -0,0 +1,33 @@
1#
2# Makefile for LITMUS^RT
3#
4
5obj-y = sched_plugin.o litmus.o \
6 preempt.o \
7 litmus_proc.o \
8 budget.o \
9 clustered.o \
10 jobs.o \
11 sync.o \
12 rt_domain.o \
13 edf_common.o \
14 fp_common.o \
15 fdso.o \
16 locking.o \
17 srp.o \
18 bheap.o \
19 binheap.o \
20 ctrldev.o \
21 uncachedev.o \
22 sched_gsn_edf.o \
23 sched_psn_edf.o \
24 sched_pfp.o
25
26obj-$(CONFIG_PLUGIN_CEDF) += sched_cedf.o
27obj-$(CONFIG_PLUGIN_PFAIR) += sched_pfair.o
28obj-$(CONFIG_SCHED_CPU_AFFINITY) += affinity.o
29
30obj-$(CONFIG_FEATHER_TRACE) += ft_event.o ftdev.o
31obj-$(CONFIG_SCHED_TASK_TRACE) += sched_task_trace.o
32obj-$(CONFIG_SCHED_DEBUG_TRACE) += sched_trace.o
33obj-$(CONFIG_SCHED_OVERHEAD_TRACE) += trace.o
diff --git a/litmus/affinity.c b/litmus/affinity.c
new file mode 100644
index 00000000000..3fa6dd78940
--- /dev/null
+++ b/litmus/affinity.c
@@ -0,0 +1,42 @@
1#include <linux/cpu.h>
2
3#include <litmus/affinity.h>
4
5struct neighborhood neigh_info[NR_CPUS];
6
7/* called by _init_litmus() */
8void init_topology(void) {
9 int cpu;
10 int i;
11 int chk;
12 int depth = num_cache_leaves;
13
14 if (depth > NUM_CACHE_LEVELS)
15 depth = NUM_CACHE_LEVELS;
16
17 for_each_online_cpu(cpu) {
18 for (i = 0; i < depth; ++i) {
19 chk = get_shared_cpu_map((struct cpumask *)&neigh_info[cpu].neighbors[i], cpu, i);
20 if (chk) {
21 /* failed */
22 neigh_info[cpu].size[i] = 0;
23 } else {
24 /* size = num bits in mask */
25 neigh_info[cpu].size[i] =
26 cpumask_weight((struct cpumask *)&neigh_info[cpu].neighbors[i]);
27 }
28 printk("CPU %d has %d neighbors at level %d. (mask = %lx)\n",
29 cpu, neigh_info[cpu].size[i], i,
30 *cpumask_bits(neigh_info[cpu].neighbors[i]));
31 }
32
33 /* set data for non-existent levels */
34 for (; i < NUM_CACHE_LEVELS; ++i) {
35 neigh_info[cpu].size[i] = 0;
36
37 printk("CPU %d has %d neighbors at level %d. (mask = %lx)\n",
38 cpu, neigh_info[cpu].size[i], i, 0lu);
39 }
40 }
41}
42
diff --git a/litmus/bheap.c b/litmus/bheap.c
new file mode 100644
index 00000000000..528af97f18a
--- /dev/null
+++ b/litmus/bheap.c
@@ -0,0 +1,314 @@
1#include "linux/kernel.h"
2#include "litmus/bheap.h"
3
4void bheap_init(struct bheap* heap)
5{
6 heap->head = NULL;
7 heap->min = NULL;
8}
9
10void bheap_node_init(struct bheap_node** _h, void* value)
11{
12 struct bheap_node* h = *_h;
13 h->parent = NULL;
14 h->next = NULL;
15 h->child = NULL;
16 h->degree = NOT_IN_HEAP;
17 h->value = value;
18 h->ref = _h;
19}
20
21
22/* make child a subtree of root */
23static void __bheap_link(struct bheap_node* root,
24 struct bheap_node* child)
25{
26 child->parent = root;
27 child->next = root->child;
28 root->child = child;
29 root->degree++;
30}
31
32/* merge root lists */
33static struct bheap_node* __bheap_merge(struct bheap_node* a,
34 struct bheap_node* b)
35{
36 struct bheap_node* head = NULL;
37 struct bheap_node** pos = &head;
38
39 while (a && b) {
40 if (a->degree < b->degree) {
41 *pos = a;
42 a = a->next;
43 } else {
44 *pos = b;
45 b = b->next;
46 }
47 pos = &(*pos)->next;
48 }
49 if (a)
50 *pos = a;
51 else
52 *pos = b;
53 return head;
54}
55
56/* reverse a linked list of nodes. also clears parent pointer */
57static struct bheap_node* __bheap_reverse(struct bheap_node* h)
58{
59 struct bheap_node* tail = NULL;
60 struct bheap_node* next;
61
62 if (!h)
63 return h;
64
65 h->parent = NULL;
66 while (h->next) {
67 next = h->next;
68 h->next = tail;
69 tail = h;
70 h = next;
71 h->parent = NULL;
72 }
73 h->next = tail;
74 return h;
75}
76
77static void __bheap_min(bheap_prio_t higher_prio, struct bheap* heap,
78 struct bheap_node** prev, struct bheap_node** node)
79{
80 struct bheap_node *_prev, *cur;
81 *prev = NULL;
82
83 if (!heap->head) {
84 *node = NULL;
85 return;
86 }
87
88 *node = heap->head;
89 _prev = heap->head;
90 cur = heap->head->next;
91 while (cur) {
92 if (higher_prio(cur, *node)) {
93 *node = cur;
94 *prev = _prev;
95 }
96 _prev = cur;
97 cur = cur->next;
98 }
99}
100
101static void __bheap_union(bheap_prio_t higher_prio, struct bheap* heap,
102 struct bheap_node* h2)
103{
104 struct bheap_node* h1;
105 struct bheap_node *prev, *x, *next;
106 if (!h2)
107 return;
108 h1 = heap->head;
109 if (!h1) {
110 heap->head = h2;
111 return;
112 }
113 h1 = __bheap_merge(h1, h2);
114 prev = NULL;
115 x = h1;
116 next = x->next;
117 while (next) {
118 if (x->degree != next->degree ||
119 (next->next && next->next->degree == x->degree)) {
120 /* nothing to do, advance */
121 prev = x;
122 x = next;
123 } else if (higher_prio(x, next)) {
124 /* x becomes the root of next */
125 x->next = next->next;
126 __bheap_link(x, next);
127 } else {
128 /* next becomes the root of x */
129 if (prev)
130 prev->next = next;
131 else
132 h1 = next;
133 __bheap_link(next, x);
134 x = next;
135 }
136 next = x->next;
137 }
138 heap->head = h1;
139}
140
141static struct bheap_node* __bheap_extract_min(bheap_prio_t higher_prio,
142 struct bheap* heap)
143{
144 struct bheap_node *prev, *node;
145 __bheap_min(higher_prio, heap, &prev, &node);
146 if (!node)
147 return NULL;
148 if (prev)
149 prev->next = node->next;
150 else
151 heap->head = node->next;
152 __bheap_union(higher_prio, heap, __bheap_reverse(node->child));
153 return node;
154}
155
156/* insert (and reinitialize) a node into the heap */
157void bheap_insert(bheap_prio_t higher_prio, struct bheap* heap,
158 struct bheap_node* node)
159{
160 struct bheap_node *min;
161 node->child = NULL;
162 node->parent = NULL;
163 node->next = NULL;
164 node->degree = 0;
165 if (heap->min && higher_prio(node, heap->min)) {
166 /* swap min cache */
167 min = heap->min;
168 min->child = NULL;
169 min->parent = NULL;
170 min->next = NULL;
171 min->degree = 0;
172 __bheap_union(higher_prio, heap, min);
173 heap->min = node;
174 } else
175 __bheap_union(higher_prio, heap, node);
176}
177
178void bheap_uncache_min(bheap_prio_t higher_prio, struct bheap* heap)
179{
180 struct bheap_node* min;
181 if (heap->min) {
182 min = heap->min;
183 heap->min = NULL;
184 bheap_insert(higher_prio, heap, min);
185 }
186}
187
188/* merge addition into target */
189void bheap_union(bheap_prio_t higher_prio,
190 struct bheap* target, struct bheap* addition)
191{
192 /* first insert any cached minima, if necessary */
193 bheap_uncache_min(higher_prio, target);
194 bheap_uncache_min(higher_prio, addition);
195 __bheap_union(higher_prio, target, addition->head);
196 /* this is a destructive merge */
197 addition->head = NULL;
198}
199
200struct bheap_node* bheap_peek(bheap_prio_t higher_prio,
201 struct bheap* heap)
202{
203 if (!heap->min)
204 heap->min = __bheap_extract_min(higher_prio, heap);
205 return heap->min;
206}
207
208struct bheap_node* bheap_take(bheap_prio_t higher_prio,
209 struct bheap* heap)
210{
211 struct bheap_node *node;
212 if (!heap->min)
213 heap->min = __bheap_extract_min(higher_prio, heap);
214 node = heap->min;
215 heap->min = NULL;
216 if (node)
217 node->degree = NOT_IN_HEAP;
218 return node;
219}
220
221int bheap_decrease(bheap_prio_t higher_prio, struct bheap_node* node)
222{
223 struct bheap_node *parent;
224 struct bheap_node** tmp_ref;
225 void* tmp;
226
227 /* bubble up */
228 parent = node->parent;
229 while (parent && higher_prio(node, parent)) {
230 /* swap parent and node */
231 tmp = parent->value;
232 parent->value = node->value;
233 node->value = tmp;
234 /* swap references */
235 *(parent->ref) = node;
236 *(node->ref) = parent;
237 tmp_ref = parent->ref;
238 parent->ref = node->ref;
239 node->ref = tmp_ref;
240 /* step up */
241 node = parent;
242 parent = node->parent;
243 }
244
245 return parent != NULL;
246}
247
248void bheap_delete(bheap_prio_t higher_prio, struct bheap* heap,
249 struct bheap_node* node)
250{
251 struct bheap_node *parent, *prev, *pos;
252 struct bheap_node** tmp_ref;
253 void* tmp;
254
255 if (heap->min != node) {
256 /* bubble up */
257 parent = node->parent;
258 while (parent) {
259 /* swap parent and node */
260 tmp = parent->value;
261 parent->value = node->value;
262 node->value = tmp;
263 /* swap references */
264 *(parent->ref) = node;
265 *(node->ref) = parent;
266 tmp_ref = parent->ref;
267 parent->ref = node->ref;
268 node->ref = tmp_ref;
269 /* step up */
270 node = parent;
271 parent = node->parent;
272 }
273 /* now delete:
274 * first find prev */
275 prev = NULL;
276 pos = heap->head;
277 while (pos != node) {
278 prev = pos;
279 pos = pos->next;
280 }
281 /* we have prev, now remove node */
282 if (prev)
283 prev->next = node->next;
284 else
285 heap->head = node->next;
286 __bheap_union(higher_prio, heap, __bheap_reverse(node->child));
287 } else
288 heap->min = NULL;
289 node->degree = NOT_IN_HEAP;
290}
291
292/* allocate a heap node for value and insert into the heap */
293int bheap_add(bheap_prio_t higher_prio, struct bheap* heap,
294 void* value, int gfp_flags)
295{
296 struct bheap_node* hn = bheap_node_alloc(gfp_flags);
297 if (likely(hn)) {
298 bheap_node_init(&hn, value);
299 bheap_insert(higher_prio, heap, hn);
300 }
301 return hn != NULL;
302}
303
304void* bheap_take_del(bheap_prio_t higher_prio,
305 struct bheap* heap)
306{
307 struct bheap_node* hn = bheap_take(higher_prio, heap);
308 void* ret = NULL;
309 if (hn) {
310 ret = hn->value;
311 bheap_node_free(hn);
312 }
313 return ret;
314}
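A minimal sketch of driving the binomial-heap API defined above, using only what is visible in this file (bheap_init(), bheap_node_init(), bheap_insert(), bheap_take(), and the comparator convention of returning nonzero when the first node has higher priority). The my_item structure, my_prio comparator, and static node storage are illustrative and not part of the patch; the pointer-to-pointer argument of bheap_node_init() exists so the heap can keep the caller's reference up to date as node contents are swapped.

#include <litmus/bheap.h>

/* Illustrative only: my_item and my_prio are not part of the patch. */
struct my_item {
        int key;
        struct bheap_node *hn;  /* stable location; the heap stores a ref to it */
};

static int my_prio(struct bheap_node *a, struct bheap_node *b)
{
        struct my_item *x = a->value, *y = b->value;
        return x->key < y->key; /* smaller key wins (min-heap) */
}

static void bheap_example(void)
{
        static struct bheap_node storage[2];
        struct bheap heap;
        struct my_item a = { .key = 10, .hn = &storage[0] };
        struct my_item b = { .key =  5, .hn = &storage[1] };
        struct bheap_node *min;

        bheap_init(&heap);
        bheap_node_init(&a.hn, &a);     /* node now points back at item a */
        bheap_node_init(&b.hn, &b);
        bheap_insert(my_prio, &heap, a.hn);
        bheap_insert(my_prio, &heap, b.hn);

        min = bheap_take(my_prio, &heap);
        /* min->value == &b, since b has the smaller key */
        (void) min;
}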
diff --git a/litmus/binheap.c b/litmus/binheap.c
new file mode 100644
index 00000000000..40a913f4b5a
--- /dev/null
+++ b/litmus/binheap.c
@@ -0,0 +1,388 @@
1#include <litmus/binheap.h>
2
3/* Returns true if the root ancestor of node is the root of the given heap. */
4int binheap_is_in_this_heap(struct binheap_node *node,
5 struct binheap* heap)
6{
7 if(!binheap_is_in_heap(node)) {
8 return 0;
9 }
10
11 while(node->parent != NULL) {
12 node = node->parent;
13 }
14
15 return (node == heap->root);
16}
17
18
19/* Update the node reference pointers. Same logic as Litmus binomial heap. */
20static void __update_ref(struct binheap_node *parent,
21 struct binheap_node *child)
22{
23 *(parent->ref_ptr) = child;
24 *(child->ref_ptr) = parent;
25
26 swap(parent->ref_ptr, child->ref_ptr);
27}
28
29
30/* Swaps data between two nodes. */
31static void __binheap_swap(struct binheap_node *parent,
32 struct binheap_node *child)
33{
34 swap(parent->data, child->data);
35 __update_ref(parent, child);
36}
37
38
39/* Swaps memory and data between two nodes. Actual nodes swap instead of
40 * just data. Needed when we delete nodes from the heap.
41 */
42static void __binheap_swap_safe(struct binheap *handle,
43 struct binheap_node *a,
44 struct binheap_node *b)
45{
46 swap(a->data, b->data);
47 __update_ref(a, b);
48
49 if((a->parent != NULL) && (a->parent == b->parent)) {
50 /* special case: shared parent */
51 swap(a->parent->left, a->parent->right);
52 }
53 else {
54 /* Update pointers to swap parents. */
55
56 if(a->parent) {
57 if(a == a->parent->left) {
58 a->parent->left = b;
59 }
60 else {
61 a->parent->right = b;
62 }
63 }
64
65 if(b->parent) {
66 if(b == b->parent->left) {
67 b->parent->left = a;
68 }
69 else {
70 b->parent->right = a;
71 }
72 }
73
74 swap(a->parent, b->parent);
75 }
76
77 /* swap children */
78
79 if(a->left) {
80 a->left->parent = b;
81
82 if(a->right) {
83 a->right->parent = b;
84 }
85 }
86
87 if(b->left) {
88 b->left->parent = a;
89
90 if(b->right) {
91 b->right->parent = a;
92 }
93 }
94
95 swap(a->left, b->left);
96 swap(a->right, b->right);
97
98
99 /* update next/last/root pointers */
100
101 if(a == handle->next) {
102 handle->next = b;
103 }
104 else if(b == handle->next) {
105 handle->next = a;
106 }
107
108 if(a == handle->last) {
109 handle->last = b;
110 }
111 else if(b == handle->last) {
112 handle->last = a;
113 }
114
115 if(a == handle->root) {
116 handle->root = b;
117 }
118 else if(b == handle->root) {
119 handle->root = a;
120 }
121}
122
123
124/**
125 * Update the pointer to the last node in the complete binary tree.
126 * Called internally after the root node has been deleted.
127 */
128static void __binheap_update_last(struct binheap *handle)
129{
130 struct binheap_node *temp = handle->last;
131
132 /* find a "bend" in the tree. */
133 while(temp->parent && (temp == temp->parent->left)) {
134 temp = temp->parent;
135 }
136
137 /* step over to sibling if we're not at root */
138 if(temp->parent != NULL) {
139 temp = temp->parent->left;
140 }
141
142 /* now travel right as far as possible. */
143 while(temp->right != NULL) {
144 temp = temp->right;
145 }
146
147 /* take one step to the left if we're not at the bottom-most level. */
148 if(temp->left != NULL) {
149 temp = temp->left;
150 }
151
152 handle->last = temp;
153}
154
155
156/**
157 * Update the pointer to the node that will take the next inserted node.
158 * Called internally after a node has been inserted.
159 */
160static void __binheap_update_next(struct binheap *handle)
161{
162 struct binheap_node *temp = handle->next;
163
164 /* find a "bend" in the tree. */
165 while(temp->parent && (temp == temp->parent->right)) {
166 temp = temp->parent;
167 }
168
169 /* step over to sibling if we're not at root */
170 if(temp->parent != NULL) {
171 temp = temp->parent->right;
172 }
173
174 /* now travel left as far as possible. */
175 while(temp->left != NULL) {
176 temp = temp->left;
177 }
178
179 handle->next = temp;
180}
181
182
183
184/* bubble node up towards root */
185static void __binheap_bubble_up(struct binheap *handle,
186 struct binheap_node *node)
187{
188 /* let BINHEAP_POISON data bubble to the top */
189
190 while((node->parent != NULL) &&
191 ((node->data == BINHEAP_POISON) ||
192 handle->compare(node, node->parent))) {
193 __binheap_swap(node->parent, node);
194 node = node->parent;
195 }
196}
197
198
199/* bubble node down, swapping with min-child */
200static void __binheap_bubble_down(struct binheap *handle)
201{
202 struct binheap_node *node = handle->root;
203
204 while(node->left != NULL) {
205 if(node->right && handle->compare(node->right, node->left)) {
206 if(handle->compare(node->right, node)) {
207 __binheap_swap(node, node->right);
208 node = node->right;
209 }
210 else {
211 break;
212 }
213 }
214 else {
215 if(handle->compare(node->left, node)) {
216 __binheap_swap(node, node->left);
217 node = node->left;
218 }
219 else {
220 break;
221 }
222 }
223 }
224}
225
226
227void __binheap_add(struct binheap_node *new_node,
228 struct binheap *handle,
229 void *data)
230{
231 new_node->data = data;
232 new_node->ref = new_node;
233 new_node->ref_ptr = &(new_node->ref);
234
235 if(!binheap_empty(handle)) {
236 /* insert left side first */
237 if(handle->next->left == NULL) {
238 handle->next->left = new_node;
239 new_node->parent = handle->next;
240 new_node->left = NULL;
241 new_node->right = NULL;
242
243 handle->last = new_node;
244
245 __binheap_bubble_up(handle, new_node);
246 }
247 else {
248 /* left occupied. insert right. */
249 handle->next->right = new_node;
250 new_node->parent = handle->next;
251 new_node->left = NULL;
252 new_node->right = NULL;
253
254 handle->last = new_node;
255
256 __binheap_update_next(handle);
257 __binheap_bubble_up(handle, new_node);
258 }
259 }
260 else {
261 /* first node in heap */
262
263 new_node->parent = NULL;
264 new_node->left = NULL;
265 new_node->right = NULL;
266
267 handle->root = new_node;
268 handle->next = new_node;
269 handle->last = new_node;
270 }
271}
272
273
274/**
275 * Removes the root node from the heap. The node is removed after coalescing
276 * the binheap_node with its original data pointer at the root of the tree.
277 *
278 * The 'last' node in the tree is then swapped up to the root and bubbled
279 * down.
280 */
281void __binheap_delete_root(struct binheap *handle,
282 struct binheap_node *container)
283{
284 struct binheap_node *root = handle->root;
285
286 if(root != container) {
287 /* coalesce */
288 __binheap_swap_safe(handle, root, container);
289 root = container;
290 }
291
292 if(handle->last != root) {
293 /* swap 'last' node up to root and bubble it down. */
294
295 struct binheap_node *to_move = handle->last;
296
297 if(to_move->parent != root) {
298 handle->next = to_move->parent;
299
300 if(handle->next->right == to_move) {
301 /* disconnect from parent */
302 to_move->parent->right = NULL;
303 handle->last = handle->next->left;
304 }
305 else {
306 /* find new 'last' before we disconnect */
307 __binheap_update_last(handle);
308
309 /* disconnect from parent */
310 to_move->parent->left = NULL;
311 }
312 }
313 else {
314 /* 'last' is direct child of root */
315
316 handle->next = to_move;
317
318 if(to_move == to_move->parent->right) {
319 to_move->parent->right = NULL;
320 handle->last = to_move->parent->left;
321 }
322 else {
323 to_move->parent->left = NULL;
324 handle->last = to_move;
325 }
326 }
327 to_move->parent = NULL;
328
329 /* reconnect as root. We can't just swap data ptrs since root node
330 * may be freed after this function returns.
331 */
332 to_move->left = root->left;
333 to_move->right = root->right;
334 if(to_move->left != NULL) {
335 to_move->left->parent = to_move;
336 }
337 if(to_move->right != NULL) {
338 to_move->right->parent = to_move;
339 }
340
341 handle->root = to_move;
342
343 /* bubble down */
344 __binheap_bubble_down(handle);
345 }
346 else {
347 /* removing last node in tree */
348 handle->root = NULL;
349 handle->next = NULL;
350 handle->last = NULL;
351 }
352
353 /* mark as removed */
354 container->parent = BINHEAP_POISON;
355}
356
357
358/**
359 * Delete an arbitrary node. Bubble node to delete up to the root,
360 * and then delete the root.
361 */
362void __binheap_delete(struct binheap_node *node_to_delete,
363 struct binheap *handle)
364{
365 struct binheap_node *target = node_to_delete->ref;
366 void *temp_data = target->data;
367
368 /* temporarily set data to BINHEAP_POISON to allow the node to bubble up to the root. */
369 target->data = BINHEAP_POISON;
370
371 __binheap_bubble_up(handle, target);
372 __binheap_delete_root(handle, node_to_delete);
373
374 node_to_delete->data = temp_data; /* restore node data pointer */
375}
376
377
378/**
379 * Bubble up a node whose key has decreased in value (i.e., gained priority).
380 */
381void __binheap_decrease(struct binheap_node *orig_node,
382 struct binheap *handle)
383{
384 struct binheap_node *target = orig_node->ref;
385
386 __binheap_bubble_up(handle, target);
387}
388
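By contrast, here is a sketch of the binary heap above, assuming only the handle fields this file itself touches (root, next, last, compare) and the data field of binheap_node; the real <litmus/binheap.h> presumably supplies initializer macros and typed wrappers, so treat this as illustrative rather than the intended calling convention.

#include <litmus/binheap.h>

/* Illustrative only: 'job' and 'earlier_dl' are not part of the patch. */
struct job {
        long deadline;
        struct binheap_node hn;
};

static int earlier_dl(struct binheap_node *a, struct binheap_node *b)
{
        struct job *x = a->data, *y = b->data;
        return x->deadline < y->deadline;       /* min-heap ordered by deadline */
}

static void binheap_example(void)
{
        /* zero-initialized handle plus a comparator; the header's init macro
         * would normally do this */
        struct binheap h = { .root = NULL, .next = NULL, .last = NULL,
                             .compare = earlier_dl };
        struct job j1 = { .deadline = 20 }, j2 = { .deadline = 10 };

        __binheap_add(&j1.hn, &h, &j1);
        __binheap_add(&j2.hn, &h, &j2);

        /* j2's data is now at the root; deleting it uses the BINHEAP_POISON
         * bubble-up trick implemented above */
        __binheap_delete(&j2.hn, &h);
}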
diff --git a/litmus/budget.c b/litmus/budget.c
new file mode 100644
index 00000000000..f7712be29ad
--- /dev/null
+++ b/litmus/budget.c
@@ -0,0 +1,113 @@
1#include <linux/sched.h>
2#include <linux/percpu.h>
3#include <linux/hrtimer.h>
4
5#include <litmus/litmus.h>
6#include <litmus/preempt.h>
7
8#include <litmus/budget.h>
9
10struct enforcement_timer {
11 /* The enforcement timer is used to accurately police
12 * slice budgets. */
13 struct hrtimer timer;
14 int armed;
15};
16
17DEFINE_PER_CPU(struct enforcement_timer, budget_timer);
18
19static enum hrtimer_restart on_enforcement_timeout(struct hrtimer *timer)
20{
21 struct enforcement_timer* et = container_of(timer,
22 struct enforcement_timer,
23 timer);
24 unsigned long flags;
25
26 local_irq_save(flags);
27 TRACE("enforcement timer fired.\n");
28 et->armed = 0;
29 /* activate scheduler */
30 litmus_reschedule_local();
31 local_irq_restore(flags);
32
33 return HRTIMER_NORESTART;
34}
35
36/* assumes called with IRQs off */
37static void cancel_enforcement_timer(struct enforcement_timer* et)
38{
39 int ret;
40
41 TRACE("cancelling enforcement timer.\n");
42
43 /* Since interrupts are disabled and et->armed is only
44 * modified locally, we do not need any locks.
45 */
46
47 if (et->armed) {
48 ret = hrtimer_try_to_cancel(&et->timer);
49 /* Should never be inactive. */
50 BUG_ON(ret == 0);
51 /* Should never be running concurrently. */
52 BUG_ON(ret == -1);
53
54 et->armed = 0;
55 }
56}
57
58/* assumes called with IRQs off */
59static void arm_enforcement_timer(struct enforcement_timer* et,
60 struct task_struct* t)
61{
62 lt_t when_to_fire;
63 TRACE_TASK(t, "arming enforcement timer.\n");
64
65 /* Calling this when there is no budget left for the task
66 * makes no sense, unless the task is non-preemptive. */
67 BUG_ON(budget_exhausted(t) && (!is_np(t)));
68
69 /* __hrtimer_start_range_ns() cancels the timer
70 * anyway, so we don't have to check whether it is still armed */
71
72 if (likely(!is_np(t))) {
73 when_to_fire = litmus_clock() + budget_remaining(t);
74 __hrtimer_start_range_ns(&et->timer,
75 ns_to_ktime(when_to_fire),
76 0 /* delta */,
77 HRTIMER_MODE_ABS_PINNED,
78 0 /* no wakeup */);
79 et->armed = 1;
80 }
81}
82
83
84/* expects to be called with IRQs off */
85void update_enforcement_timer(struct task_struct* t)
86{
87 struct enforcement_timer* et = &__get_cpu_var(budget_timer);
88
89 if (t && budget_precisely_enforced(t)) {
90 /* Make sure we call into the scheduler when this budget
91 * expires. */
92 arm_enforcement_timer(et, t);
93 } else if (et->armed) {
94 /* Make sure we don't cause unnecessary interrupts. */
95 cancel_enforcement_timer(et);
96 }
97}
98
99
100static int __init init_budget_enforcement(void)
101{
102 int cpu;
103 struct enforcement_timer* et;
104
105 for (cpu = 0; cpu < NR_CPUS; cpu++) {
106 et = &per_cpu(budget_timer, cpu);
107 hrtimer_init(&et->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
108 et->timer.function = on_enforcement_timeout;
109 }
110 return 0;
111}
112
113module_init(init_budget_enforcement);
diff --git a/litmus/clustered.c b/litmus/clustered.c
new file mode 100644
index 00000000000..6fe1b512f62
--- /dev/null
+++ b/litmus/clustered.c
@@ -0,0 +1,111 @@
1#include <linux/gfp.h>
2#include <linux/cpumask.h>
3#include <linux/list.h>
4
5#include <litmus/clustered.h>
6
7#ifndef CONFIG_X86
8/* fake get_shared_cpu_map() on non-x86 architectures */
9
10int get_shared_cpu_map(cpumask_var_t mask, unsigned int cpu, int index)
11{
12 if (index != 1)
13 return 1;
14 else {
15 /* Fake L1: CPU is all by itself. */
16 cpumask_clear(mask);
17 cpumask_set_cpu(cpu, mask);
18 return 0;
19 }
20}
21
22#endif
23
24int get_cluster_size(enum cache_level level)
25{
26 cpumask_var_t mask;
27 int ok;
28 int num_cpus;
29
30 if (level == GLOBAL_CLUSTER)
31 return num_online_cpus();
32 else {
33 if (!zalloc_cpumask_var(&mask, GFP_ATOMIC))
34 return -ENOMEM;
35 /* assumes CPU 0 is representative of all CPUs */
36 ok = get_shared_cpu_map(mask, 0, level);
37 /* ok == 0 means we got the map; otherwise it's an invalid cache level */
38 if (ok == 0)
39 num_cpus = cpumask_weight(mask);
40 free_cpumask_var(mask);
41
42 if (ok == 0)
43 return num_cpus;
44 else
45 return -EINVAL;
46 }
47}
48
49int assign_cpus_to_clusters(enum cache_level level,
50 struct scheduling_cluster* clusters[],
51 unsigned int num_clusters,
52 struct cluster_cpu* cpus[],
53 unsigned int num_cpus)
54{
55 cpumask_var_t mask;
56 unsigned int i, free_cluster = 0, low_cpu;
57 int err = 0;
58
59 if (!zalloc_cpumask_var(&mask, GFP_ATOMIC))
60 return -ENOMEM;
61
62 /* clear cluster pointers */
63 for (i = 0; i < num_cpus; i++) {
64 cpus[i]->id = i;
65 cpus[i]->cluster = NULL;
66 }
67
68 /* initialize clusters */
69 for (i = 0; i < num_clusters; i++) {
70 clusters[i]->id = i;
71 INIT_LIST_HEAD(&clusters[i]->cpus);
72 }
73
74 /* Assign each CPU. Two assumptions are made:
75 * 1) The index of a cpu in cpus corresponds to its processor id (i.e., the index in a cpu mask).
76 * 2) All cpus that belong to some cluster are online.
77 */
78 for_each_online_cpu(i) {
79 /* get lowest-id CPU in cluster */
80 if (level != GLOBAL_CLUSTER) {
81 err = get_shared_cpu_map(mask, cpus[i]->id, level);
82 if (err != 0) {
83 /* ugh... wrong cache level? Either caller screwed up
84 * or the CPU topology is weird. */
85 printk(KERN_ERR "Could not set up clusters for L%d sharing (max: L%d).\n",
86 level, err);
87 err = -EINVAL;
88 goto out;
89 }
90 low_cpu = cpumask_first(mask);
91 } else
92 low_cpu = 0;
93 if (low_cpu == i) {
94 /* caller must provide an appropriate number of clusters */
95 BUG_ON(free_cluster >= num_clusters);
96
97 /* create new cluster */
98 cpus[i]->cluster = clusters[free_cluster++];
99 } else {
100 /* low_cpu points to the right cluster
101 * Assumption: low_cpu is actually online and was processed earlier. */
102 cpus[i]->cluster = cpus[low_cpu]->cluster;
103 }
104 /* enqueue in cpus list */
105 list_add_tail(&cpus[i]->cluster_list, &cpus[i]->cluster->cpus);
106 printk(KERN_INFO "Assigning CPU%u to cluster %u\n.", i, cpus[i]->cluster->id);
107 }
108out:
109 free_cpumask_var(mask);
110 return err;
111}
diff --git a/litmus/ctrldev.c b/litmus/ctrldev.c
new file mode 100644
index 00000000000..41919b2714c
--- /dev/null
+++ b/litmus/ctrldev.c
@@ -0,0 +1,160 @@
1#include <linux/sched.h>
2#include <linux/mm.h>
3#include <linux/fs.h>
4#include <linux/miscdevice.h>
5#include <linux/module.h>
6
7#include <litmus/litmus.h>
8
9/* only one page for now, but we might want to add a RO version at some point */
10
11#define CTRL_NAME "litmus/ctrl"
12
13/* allocate t->rt_param.ctrl_page*/
14static int alloc_ctrl_page(struct task_struct *t)
15{
16 int err = 0;
17
18 /* only allocate if the task doesn't have one yet */
19 if (!tsk_rt(t)->ctrl_page) {
20 tsk_rt(t)->ctrl_page = (void*) get_zeroed_page(GFP_KERNEL);
21 if (!tsk_rt(t)->ctrl_page)
22 err = -ENOMEM;
23 /* will get de-allocated in task teardown */
24 TRACE_TASK(t, "%s ctrl_page = %p\n", __FUNCTION__,
25 tsk_rt(t)->ctrl_page);
26 }
27 return err;
28}
29
30static int map_ctrl_page(struct task_struct *t, struct vm_area_struct* vma)
31{
32 int err;
33
34 struct page* ctrl = virt_to_page(tsk_rt(t)->ctrl_page);
35
36 TRACE_CUR(CTRL_NAME
37 ": mapping %p (pfn:%lx) to 0x%lx (prot:%lx)\n",
38 tsk_rt(t)->ctrl_page,page_to_pfn(ctrl), vma->vm_start,
39 vma->vm_page_prot);
40
41 /* Map it into the vma. */
42 err = vm_insert_page(vma, vma->vm_start, ctrl);
43
44 if (err)
45 TRACE_CUR(CTRL_NAME ": vm_insert_page() failed (%d)\n", err);
46
47 return err;
48}
49
50static void litmus_ctrl_vm_close(struct vm_area_struct* vma)
51{
52 TRACE_CUR("%s flags=0x%x prot=0x%x\n", __FUNCTION__,
53 vma->vm_flags, vma->vm_page_prot);
54
55 TRACE_CUR(CTRL_NAME
56 ": %p:%p vma:%p vma->vm_private_data:%p closed.\n",
57 (void*) vma->vm_start, (void*) vma->vm_end, vma,
58 vma->vm_private_data);
59}
60
61static int litmus_ctrl_vm_fault(struct vm_area_struct* vma,
62 struct vm_fault* vmf)
63{
64 TRACE_CUR("%s flags=0x%x (off:%ld)\n", __FUNCTION__,
65 vma->vm_flags, vmf->pgoff);
66
67 /* This function should never be called, since all pages should have
68 * been mapped by mmap() already. */
69 WARN_ONCE(1, "Page faults should be impossible in the control page\n");
70
71 return VM_FAULT_SIGBUS;
72}
73
74static struct vm_operations_struct litmus_ctrl_vm_ops = {
75 .close = litmus_ctrl_vm_close,
76 .fault = litmus_ctrl_vm_fault,
77};
78
79static int litmus_ctrl_mmap(struct file* filp, struct vm_area_struct* vma)
80{
81 int err = 0;
82
83 /* first make sure mapper knows what he's doing */
84
85 /* you can only get one page */
86 if (vma->vm_end - vma->vm_start != PAGE_SIZE)
87 return -EINVAL;
88
89 /* you can only map the "first" page */
90 if (vma->vm_pgoff != 0)
91 return -EINVAL;
92
93 /* you can't share it with anyone */
94 if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
95 return -EINVAL;
96
97 vma->vm_ops = &litmus_ctrl_vm_ops;
98 /* This mapping should not be kept across forks,
99 * cannot be expanded, and is not a "normal" page. */
100 vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_IO;
101
102 /* We don't want the first write access to trigger a "minor" page fault
103 * to mark the page as dirty. This is transient, private memory, we
104 * don't care if it was touched or not. __S011 means RW access, but not
105 * execute, and avoids copy-on-write behavior.
106 * See protection_map in mmap.c. */
107 vma->vm_page_prot = __S011;
108
109 err = alloc_ctrl_page(current);
110 if (!err)
111 err = map_ctrl_page(current, vma);
112
113 TRACE_CUR("%s flags=0x%x prot=0x%lx\n",
114 __FUNCTION__, vma->vm_flags, vma->vm_page_prot);
115
116 return err;
117}
118
119static struct file_operations litmus_ctrl_fops = {
120 .owner = THIS_MODULE,
121 .mmap = litmus_ctrl_mmap,
122};
123
124static struct miscdevice litmus_ctrl_dev = {
125 .name = CTRL_NAME,
126 .minor = MISC_DYNAMIC_MINOR,
127 .fops = &litmus_ctrl_fops,
128};
129
130static int __init init_litmus_ctrl_dev(void)
131{
132 int err;
133
134 BUILD_BUG_ON(sizeof(struct control_page) > PAGE_SIZE);
135
136 BUILD_BUG_ON(sizeof(union np_flag) != sizeof(uint64_t));
137
138 BUILD_BUG_ON(offsetof(struct control_page, sched.raw)
139 != LITMUS_CP_OFFSET_SCHED);
140 BUILD_BUG_ON(offsetof(struct control_page, irq_count)
141 != LITMUS_CP_OFFSET_IRQ_COUNT);
142 BUILD_BUG_ON(offsetof(struct control_page, ts_syscall_start)
143 != LITMUS_CP_OFFSET_TS_SC_START);
144 BUILD_BUG_ON(offsetof(struct control_page, irq_syscall_start)
145 != LITMUS_CP_OFFSET_IRQ_SC_START);
146
147 printk("Initializing LITMUS^RT control device.\n");
148 err = misc_register(&litmus_ctrl_dev);
149 if (err)
150 printk("Could not allocate %s device (%d).\n", CTRL_NAME, err);
151 return err;
152}
153
154static void __exit exit_litmus_ctrl_dev(void)
155{
156 misc_deregister(&litmus_ctrl_dev);
157}
158
159module_init(init_litmus_ctrl_dev);
160module_exit(exit_litmus_ctrl_dev);
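For reference, a minimal userspace sketch (not part of the patch) of how the misc device above would be mapped. The /dev/litmus/ctrl path assumes default udev rules, analogous to the /dev/litmus/log node mentioned in the Kconfig help; interpreting the page contents as a struct control_page is left to a userspace library.

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        long page = sysconf(_SC_PAGESIZE);
        int fd = open("/dev/litmus/ctrl", O_RDWR);
        void *ctrl;

        if (fd < 0) {
                perror("open");
                return 1;
        }

        /* exactly one page, offset 0, private mapping: anything else is
         * rejected by litmus_ctrl_mmap() above */
        ctrl = mmap(NULL, page, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, 0);
        if (ctrl == MAP_FAILED) {
                perror("mmap");
                close(fd);
                return 1;
        }

        /* a real library would interpret this as a struct control_page */
        printf("control page mapped at %p\n", ctrl);

        munmap(ctrl, page);
        close(fd);
        return 0;
}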
diff --git a/litmus/edf_common.c b/litmus/edf_common.c
new file mode 100644
index 00000000000..5aca2934a7b
--- /dev/null
+++ b/litmus/edf_common.c
@@ -0,0 +1,200 @@
1/*
2 * kernel/edf_common.c
3 *
4 * Common functions for EDF based scheduler.
5 */
6
7#include <linux/percpu.h>
8#include <linux/sched.h>
9#include <linux/list.h>
10
11#include <litmus/litmus.h>
12#include <litmus/sched_plugin.h>
13#include <litmus/sched_trace.h>
14
15#include <litmus/edf_common.h>
16
17#ifdef CONFIG_EDF_TIE_BREAK_LATENESS_NORM
18#include <litmus/fpmath.h>
19#endif
20
21#ifdef CONFIG_EDF_TIE_BREAK_HASH
22#include <linux/hash.h>
23static inline long edf_hash(struct task_struct *t)
24{
25 /* pid is 32 bits, so normally we would shove that into the
26 * upper 32 bits and put the job number in the bottom
27 * and hash the 64-bit number with hash_64(). Sadly,
28 * in testing, hash_64() doesn't distribute keys where the
29 * upper bits are close together (as would be the case with
30 * pids) and job numbers are equal (as would be the case with
31 * synchronous task sets with all relative deadlines equal).
32 *
33 * A 2006 Linux patch proposed the following solution
34 * (but for some reason it wasn't accepted...).
35 *
36 * At least this workaround works for 32-bit systems as well.
37 */
38 return hash_32(hash_32((u32)tsk_rt(t)->job_params.job_no, 32) ^ t->pid, 32);
39}
40#endif
41
42
43/* edf_higher_prio - returns true if first has a higher EDF priority
44 * than second. Deadline ties are broken by PID.
45 *
46 * both first and second may be NULL
47 */
48int edf_higher_prio(struct task_struct* first,
49 struct task_struct* second)
50{
51 struct task_struct *first_task = first;
52 struct task_struct *second_task = second;
53
54 /* There is no point in comparing a task to itself. */
55 if (first && first == second) {
56 TRACE_TASK(first,
57 "WARNING: pointless edf priority comparison.\n");
58 return 0;
59 }
60
61
62 /* check for NULL tasks */
63 if (!first || !second)
64 return first && !second;
65
66#ifdef CONFIG_LITMUS_LOCKING
67
68 /* Check for inherited priorities. Change task
69 * used for comparison in such a case.
70 */
71 if (unlikely(first->rt_param.inh_task))
72 first_task = first->rt_param.inh_task;
73 if (unlikely(second->rt_param.inh_task))
74 second_task = second->rt_param.inh_task;
75
76 /* Check for priority boosting. Tie-break by start of boosting.
77 */
78 if (unlikely(is_priority_boosted(first_task))) {
79 /* first_task is boosted, how about second_task? */
80 if (!is_priority_boosted(second_task) ||
81 lt_before(get_boost_start(first_task),
82 get_boost_start(second_task)))
83 return 1;
84 else
85 return 0;
86 } else if (unlikely(is_priority_boosted(second_task)))
87 /* second_task is boosted, first is not */
88 return 0;
89
90#endif
91
92 if (earlier_deadline(first_task, second_task)) {
93 return 1;
94 }
95 else if (get_deadline(first_task) == get_deadline(second_task)) {
96 /* Need to tie break. All methods must set pid_break to 0/1 if
97 * first_task does not have priority over second_task.
98 */
99 int pid_break;
100
101
102#if defined(CONFIG_EDF_TIE_BREAK_LATENESS)
103 /* Tie break by lateness. Jobs with greater lateness get
104 * priority. This should spread tardiness across all tasks,
105 * especially in task sets where all tasks have the same
106 * period and relative deadlines.
107 */
108 if (get_lateness(first_task) > get_lateness(second_task)) {
109 return 1;
110 }
111 pid_break = (get_lateness(first_task) == get_lateness(second_task));
112
113
114#elif defined(CONFIG_EDF_TIE_BREAK_LATENESS_NORM)
115 /* Tie break by lateness, normalized by relative deadline. Jobs with
116 * greater normalized lateness get priority.
117 *
118 * Note: Considered using the algebraically equivalent
119 * lateness(first)*relative_deadline(second) >
120 lateness(second)*relative_deadline(first)
121 * to avoid fixed-point math, but values are prone to overflow if inputs
122 * are on the order of several seconds, even in 64-bit.
123 */
124 fp_t fnorm = _frac(get_lateness(first_task),
125 get_rt_relative_deadline(first_task));
126 fp_t snorm = _frac(get_lateness(second_task),
127 get_rt_relative_deadline(second_task));
128 if (_gt(fnorm, snorm)) {
129 return 1;
130 }
131 pid_break = _eq(fnorm, snorm);
132
133
134#elif defined(CONFIG_EDF_TIE_BREAK_HASH)
135 /* Tie break by comparing hashes of (pid, job#) tuples. There should be
136 * a 50% chance that first_task has a higher priority than second_task.
137 */
138 long fhash = edf_hash(first_task);
139 long shash = edf_hash(second_task);
140 if (fhash < shash) {
141 return 1;
142 }
143 pid_break = (fhash == shash);
144#else
145
146
147 /* CONFIG_EDF_PID_TIE_BREAK */
148 pid_break = 1; // fall through to tie-break by pid;
149#endif
150
151 /* Tie break by pid */
152 if(pid_break) {
153 if (first_task->pid < second_task->pid) {
154 return 1;
155 }
156 else if (first_task->pid == second_task->pid) {
157 /* If the PIDs are the same then the task with the
158 * inherited priority wins.
159 */
160 if (!second->rt_param.inh_task) {
161 return 1;
162 }
163 }
164 }
165 }
166 return 0; /* fall-through. prio(second_task) > prio(first_task) */
167}
168
169int edf_ready_order(struct bheap_node* a, struct bheap_node* b)
170{
171 return edf_higher_prio(bheap2task(a), bheap2task(b));
172}
173
174void edf_domain_init(rt_domain_t* rt, check_resched_needed_t resched,
175 release_jobs_t release)
176{
177 rt_domain_init(rt, edf_ready_order, resched, release);
178}
179
180/* need_to_preempt - check whether the task t needs to be preempted
181 * call only with irqs disabled and with ready_lock acquired
182 * THIS DOES NOT TAKE NON-PREEMPTIVE SECTIONS INTO ACCOUNT!
183 */
184int edf_preemption_needed(rt_domain_t* rt, struct task_struct *t)
185{
186 /* we need the read lock for edf_ready_queue */
187 /* no need to preempt if there is nothing pending */
188 if (!__jobs_pending(rt))
189 return 0;
190 /* we need to reschedule if t doesn't exist */
191 if (!t)
192 return 1;
193
194 /* NOTE: We cannot check for non-preemptibility since we
195 * don't know what address space we're currently in.
196 */
197
198 /* make sure to get non-rt stuff out of the way */
199 return !is_realtime(t) || edf_higher_prio(__next_ready(rt), t);
200}
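The CONFIG_EDF_TIE_BREAK_LATENESS_NORM branch above notes that the algebraically equivalent cross-multiplied comparison was rejected because it overflows 64-bit arithmetic for inputs on the order of seconds. A small standalone check of that claim, not part of the patch, with illustrative values:

#include <stdio.h>
#include <stdint.h>

/* Illustrative only: shows why the cross-multiplied tie-break comparison
 * can overflow 64-bit math, motivating the _frac()-based normalized-lateness
 * comparison used above. */
int main(void)
{
        uint64_t lateness = 5000000000ULL;      /* 5 s expressed in ns */
        uint64_t rel_deadline = 5000000000ULL;  /* 5 s expressed in ns */

        /* lateness * rel_deadline = 2.5e19, but INT64_MAX is ~9.22e18 */
        printf("product   = %.3e\n", (double) lateness * (double) rel_deadline);
        printf("INT64_MAX = %.3e\n", (double) INT64_MAX);
        return 0;
}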
diff --git a/litmus/fdso.c b/litmus/fdso.c
new file mode 100644
index 00000000000..41852d7b14d
--- /dev/null
+++ b/litmus/fdso.c
@@ -0,0 +1,306 @@
1/* fdso.c - file descriptor attached shared objects
2 *
3 * (c) 2007 B. Brandenburg, LITMUS^RT project
4 *
5 * Notes:
6 * - object descriptor (OD) tables are not cloned during a fork.
7 * - objects are created on-demand, and freed after the last reference
8 * is dropped.
9 * - for now, object types are hard coded.
10 * - As long as we have live objects, we keep a reference to the inode.
11 */
12
13#include <linux/errno.h>
14#include <linux/sched.h>
15#include <linux/mutex.h>
16#include <linux/file.h>
17#include <asm/uaccess.h>
18
19#include <litmus/fdso.h>
20
21extern struct fdso_ops generic_lock_ops;
22
23static const struct fdso_ops* fdso_ops[] = {
24 &generic_lock_ops, /* FMLP_SEM */
25 &generic_lock_ops, /* SRP_SEM */
26 &generic_lock_ops, /* MPCP_SEM */
27 &generic_lock_ops, /* MPCP_VS_SEM */
28 &generic_lock_ops, /* DPCP_SEM */
29 &generic_lock_ops, /* PCP_SEM */
30 &generic_lock_ops, /* DGL_SEM */
31};
32
33static int fdso_create(void** obj_ref, obj_type_t type, void* __user config)
34{
35 BUILD_BUG_ON(ARRAY_SIZE(fdso_ops) != MAX_OBJ_TYPE + 1);
36
37 if (fdso_ops[type]->create)
38 return fdso_ops[type]->create(obj_ref, type, config);
39 else
40 return -EINVAL;
41}
42
43static void fdso_destroy(obj_type_t type, void* obj)
44{
45 fdso_ops[type]->destroy(type, obj);
46}
47
48static int fdso_open(struct od_table_entry* entry, void* __user config)
49{
50 if (fdso_ops[entry->obj->type]->open)
51 return fdso_ops[entry->obj->type]->open(entry, config);
52 else
53 return 0;
54}
55
56static int fdso_close(struct od_table_entry* entry)
57{
58 if (fdso_ops[entry->obj->type]->close)
59 return fdso_ops[entry->obj->type]->close(entry);
60 else
61 return 0;
62}
63
64/* inode must be locked already */
65static int alloc_inode_obj(struct inode_obj_id** obj_ref,
66 struct inode* inode,
67 obj_type_t type,
68 unsigned int id,
69 void* __user config)
70{
71 struct inode_obj_id* obj;
72 void* raw_obj;
73 int err;
74
75 obj = kmalloc(sizeof(*obj), GFP_KERNEL);
76 if (!obj) {
77 return -ENOMEM;
78 }
79
80 err = fdso_create(&raw_obj, type, config);
81 if (err != 0) {
82 kfree(obj);
83 return err;
84 }
85
86 INIT_LIST_HEAD(&obj->list);
87 atomic_set(&obj->count, 1);
88 obj->type = type;
89 obj->id = id;
90 obj->obj = raw_obj;
91 obj->inode = inode;
92
93 list_add(&obj->list, &inode->i_obj_list);
94 atomic_inc(&inode->i_count);
95
96 printk(KERN_DEBUG "alloc_inode_obj(%p, %d, %d): object created\n", inode, type, id);
97
98 *obj_ref = obj;
99 return 0;
100}
101
102/* inode must be locked already */
103static struct inode_obj_id* get_inode_obj(struct inode* inode,
104 obj_type_t type,
105 unsigned int id)
106{
107 struct list_head* pos;
108 struct inode_obj_id* obj = NULL;
109
110 list_for_each(pos, &inode->i_obj_list) {
111 obj = list_entry(pos, struct inode_obj_id, list);
112 if (obj->id == id && obj->type == type) {
113 atomic_inc(&obj->count);
114 return obj;
115 }
116 }
117 printk(KERN_DEBUG "get_inode_obj(%p, %d, %d): couldn't find object\n", inode, type, id);
118 return NULL;
119}
120
121
122static void put_inode_obj(struct inode_obj_id* obj)
123{
124 struct inode* inode;
125 int let_go = 0;
126
127 inode = obj->inode;
128 if (atomic_dec_and_test(&obj->count)) {
129
130 mutex_lock(&inode->i_obj_mutex);
131 /* no new references can be obtained */
132 if (!atomic_read(&obj->count)) {
133 list_del(&obj->list);
134 fdso_destroy(obj->type, obj->obj);
135 kfree(obj);
136 let_go = 1;
137 }
138 mutex_unlock(&inode->i_obj_mutex);
139 if (let_go)
140 iput(inode);
141 }
142}
143
144static struct od_table_entry* get_od_entry(struct task_struct* t)
145{
146 struct od_table_entry* table;
147 int i;
148
149
150 table = t->od_table;
151 if (!table) {
152 table = kzalloc(sizeof(*table) * MAX_OBJECT_DESCRIPTORS,
153 GFP_KERNEL);
154 t->od_table = table;
155 }
156
157 for (i = 0; table && i < MAX_OBJECT_DESCRIPTORS; i++)
158 if (!table[i].used) {
159 table[i].used = 1;
160 return table + i;
161 }
162 return NULL;
163}
164
165static int put_od_entry(struct od_table_entry* od)
166{
167 put_inode_obj(od->obj);
168 od->used = 0;
169 return 0;
170}
171
172static long close_od_entry(struct od_table_entry *od)
173{
174 long ret;
175
176 /* Give the class a chance to reject the close. */
177 ret = fdso_close(od);
178 if (ret == 0)
179 ret = put_od_entry(od);
180
181 return ret;
182}
183
184void exit_od_table(struct task_struct* t)
185{
186 int i;
187
188 if (t->od_table) {
189 for (i = 0; i < MAX_OBJECT_DESCRIPTORS; i++)
190 if (t->od_table[i].used)
191 close_od_entry(t->od_table + i);
192 kfree(t->od_table);
193 t->od_table = NULL;
194 }
195}
196
197static int do_sys_od_open(struct file* file, obj_type_t type, int id,
198 void* __user config)
199{
200 int idx = 0, err = 0;
201 struct inode* inode;
202 struct inode_obj_id* obj = NULL;
203 struct od_table_entry* entry;
204
205 inode = file->f_dentry->d_inode;
206
207 entry = get_od_entry(current);
208 if (!entry)
209 return -ENOMEM;
210
211 mutex_lock(&inode->i_obj_mutex);
212 obj = get_inode_obj(inode, type, id);
213 if (!obj)
214 err = alloc_inode_obj(&obj, inode, type, id, config);
215 if (err != 0) {
216 obj = NULL;
217 idx = err;
218 entry->used = 0;
219 } else {
220 entry->obj = obj;
221 entry->class = fdso_ops[type];
222 idx = entry - current->od_table;
223 }
224
225 mutex_unlock(&inode->i_obj_mutex);
226
227 /* open only if creation succeeded */
228 if (!err)
229 err = fdso_open(entry, config);
230 if (err < 0) {
231 /* The class rejected the open call.
232 * We need to clean up and tell user space.
233 */
234 if (obj)
235 put_od_entry(entry);
236 idx = err;
237 }
238
239 return idx;
240}
241
242struct od_table_entry* get_entry_for_od(int od)
243{
244 struct task_struct *t = current;
245
246 if (!t->od_table)
247 return NULL;
248 if (od < 0 || od >= MAX_OBJECT_DESCRIPTORS)
249 return NULL;
250 if (!t->od_table[od].used)
251 return NULL;
252 return t->od_table + od;
253}
254
255asmlinkage long sys_od_open(int fd, int type, int obj_id, void* __user config)
256{
257 int ret = 0;
258 struct file* file;
259
260 /*
261 1) get file from fd, get inode from file
262 2) lock inode
263 3) try to lookup object
264 4) if not present create and enqueue object, inc inode refcnt
265 5) increment refcnt of object
266 6) alloc od_table_entry, setup ptrs
267 7) unlock inode
268 8) return offset in od_table as OD
269 */
270
271 if (type < MIN_OBJ_TYPE || type > MAX_OBJ_TYPE) {
272 ret = -EINVAL;
273 goto out;
274 }
275
276 file = fget(fd);
277 if (!file) {
278 ret = -EBADF;
279 goto out;
280 }
281
282 ret = do_sys_od_open(file, type, obj_id, config);
283
284 fput(file);
285
286out:
287 return ret;
288}
289
290
291asmlinkage long sys_od_close(int od)
292{
293 int ret = -EINVAL;
294 struct task_struct *t = current;
295
296 if (od < 0 || od >= MAX_OBJECT_DESCRIPTORS)
297 return ret;
298
299 if (!t->od_table || !t->od_table[od].used)
300 return ret;
301
302
303 ret = close_od_entry(t->od_table + od);
304
305 return ret;
306}
diff --git a/litmus/fp_common.c b/litmus/fp_common.c
new file mode 100644
index 00000000000..964a4729def
--- /dev/null
+++ b/litmus/fp_common.c
@@ -0,0 +1,119 @@
1/*
2 * litmus/fp_common.c
3 *
4 * Common functions for fixed-priority scheduler.
5 */
6
7#include <linux/percpu.h>
8#include <linux/sched.h>
9#include <linux/list.h>
10
11#include <litmus/litmus.h>
12#include <litmus/sched_plugin.h>
13#include <litmus/sched_trace.h>
14
15#include <litmus/fp_common.h>
16
17/* fp_higher_prio - returns true if first has a higher static priority
18 * than second. Ties are broken by PID.
19 *
20 * both first and second may be NULL
21 */
22int fp_higher_prio(struct task_struct* first,
23 struct task_struct* second)
24{
25 struct task_struct *first_task = first;
26 struct task_struct *second_task = second;
27
28 /* There is no point in comparing a task to itself. */
29 if (unlikely(first && first == second)) {
30 TRACE_TASK(first,
31 "WARNING: pointless FP priority comparison.\n");
32 return 0;
33 }
34
35
36 /* check for NULL tasks */
37 if (!first || !second)
38 return first && !second;
39
40 if (!is_realtime(second_task))
41 return 1;
42
43#ifdef CONFIG_LITMUS_LOCKING
44
45 /* Check for inherited priorities. Change task
46 * used for comparison in such a case.
47 */
48 if (unlikely(first->rt_param.inh_task))
49 first_task = first->rt_param.inh_task;
50 if (unlikely(second->rt_param.inh_task))
51 second_task = second->rt_param.inh_task;
52
53 /* Check for priority boosting. Tie-break by start of boosting.
54 */
55 if (unlikely(is_priority_boosted(first_task))) {
56 /* first_task is boosted, how about second_task? */
57 if (is_priority_boosted(second_task))
58 /* break by priority point */
59 return lt_before(get_boost_start(first_task),
60 get_boost_start(second_task));
61 else
62 /* priority boosting wins. */
63 return 1;
64 } else if (unlikely(is_priority_boosted(second_task)))
66 /* second_task is boosted, first is not */
66 return 0;
67
68#endif
69
70 /* Comparisons to itself are not expected; priority inheritance
71 * should also not cause this to happen. */
72 BUG_ON(first_task == second_task);
73
74 if (get_priority(first_task) < get_priority(second_task))
75 return 1;
76 else if (get_priority(first_task) == get_priority(second_task))
77 /* Break by PID. */
78 return first_task->pid < second_task->pid;
79 else
80 return 0;
81}
82
83int fp_ready_order(struct bheap_node* a, struct bheap_node* b)
84{
85 return fp_higher_prio(bheap2task(a), bheap2task(b));
86}
87
88void fp_domain_init(rt_domain_t* rt, check_resched_needed_t resched,
89 release_jobs_t release)
90{
91 rt_domain_init(rt, fp_ready_order, resched, release);
92}
93
94/* need_to_preempt - check whether the task t needs to be preempted
95 */
96int fp_preemption_needed(struct fp_prio_queue *q, struct task_struct *t)
97{
98 struct task_struct *pending;
99
100 pending = fp_prio_peek(q);
101
102 if (!pending)
103 return 0;
104 if (!t)
105 return 1;
106
107 /* make sure to get non-rt stuff out of the way */
108 return !is_realtime(t) || fp_higher_prio(pending, t);
109}
110
111void fp_prio_queue_init(struct fp_prio_queue* q)
112{
113 int i;
114
115 for (i = 0; i < FP_PRIO_BIT_WORDS; i++)
116 q->bitmask[i] = 0;
117 for (i = 0; i < LITMUS_MAX_PRIORITY; i++)
118 bheap_init(&q->queue[i]);
119}
diff --git a/litmus/ft_event.c b/litmus/ft_event.c
new file mode 100644
index 00000000000..399a07becca
--- /dev/null
+++ b/litmus/ft_event.c
@@ -0,0 +1,43 @@
1#include <linux/types.h>
2
3#include <litmus/feather_trace.h>
4
5#if !defined(CONFIG_ARCH_HAS_FEATHER_TRACE) || defined(CONFIG_DEBUG_RODATA)
6/* provide dummy implementation */
7
8int ft_events[MAX_EVENTS];
9
10int ft_enable_event(unsigned long id)
11{
12 if (id < MAX_EVENTS) {
13 ft_events[id]++;
14 return 1;
15 } else
16 return 0;
17}
18
19int ft_disable_event(unsigned long id)
20{
21 if (id < MAX_EVENTS && ft_events[id]) {
22 ft_events[id]--;
23 return 1;
24 } else
25 return 0;
26}
27
28int ft_disable_all_events(void)
29{
30 int i;
31
32 for (i = 0; i < MAX_EVENTS; i++)
33 ft_events[i] = 0;
34
35 return MAX_EVENTS;
36}
37
38int ft_is_event_enabled(unsigned long id)
39{
40 return id < MAX_EVENTS && ft_events[id];
41}
42
43#endif
diff --git a/litmus/ftdev.c b/litmus/ftdev.c
new file mode 100644
index 00000000000..99bc39ffbce
--- /dev/null
+++ b/litmus/ftdev.c
@@ -0,0 +1,446 @@
1#include <linux/sched.h>
2#include <linux/fs.h>
3#include <linux/slab.h>
4#include <linux/cdev.h>
5#include <asm/uaccess.h>
6#include <linux/module.h>
7#include <linux/device.h>
8
9#include <litmus/litmus.h>
10#include <litmus/feather_trace.h>
11#include <litmus/ftdev.h>
12
13struct ft_buffer* alloc_ft_buffer(unsigned int count, size_t size)
14{
15 struct ft_buffer* buf;
16 size_t total = (size + 1) * count;
17 char* mem;
18 int order = 0, pages = 1;
19
20 buf = kmalloc(sizeof(*buf), GFP_KERNEL);
21 if (!buf)
22 return NULL;
23
24 total = (total / PAGE_SIZE) + (total % PAGE_SIZE != 0);
25 while (pages < total) {
26 order++;
27 pages *= 2;
28 }
29
30 mem = (char*) __get_free_pages(GFP_KERNEL, order);
31 if (!mem) {
32 kfree(buf);
33 return NULL;
34 }
35
36 if (!init_ft_buffer(buf, count, size,
37 mem + (count * size), /* markers at the end */
38 mem)) { /* buffer objects */
39 free_pages((unsigned long) mem, order);
40 kfree(buf);
41 return NULL;
42 }
43 return buf;
44}
45
46void free_ft_buffer(struct ft_buffer* buf)
47{
48 int order = 0, pages = 1;
49 size_t total;
50
51 if (buf) {
52 total = (buf->slot_size + 1) * buf->slot_count;
53 total = (total / PAGE_SIZE) + (total % PAGE_SIZE != 0);
54 while (pages < total) {
55 order++;
56 pages *= 2;
57 }
58 free_pages((unsigned long) buf->buffer_mem, order);
59 kfree(buf);
60 }
61}
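Both alloc_ft_buffer() and free_ft_buffer() recompute the same buddy-allocator order from the slot geometry. A user-space sketch of that computation, assuming PAGE_SIZE is 4096 (the helper name buddy_order() is made up for illustration):

#include <stdio.h>
#include <stddef.h>

#define PAGE_SIZE 4096

/* round total bytes up to pages, then up to the next power of two,
 * exactly as the allocation/free paths above do */
static int buddy_order(size_t total)
{
	size_t pages = (total / PAGE_SIZE) + (total % PAGE_SIZE != 0);
	size_t covered = 1;
	int order = 0;

	while (covered < pages) {
		order++;
		covered *= 2;
	}
	return order;
}

int main(void)
{
	/* e.g., 4096 slots of 16 bytes plus one status byte each */
	size_t total = (16 + 1) * 4096;

	printf("%zu bytes -> order %d (%zu pages)\n",
	       total, buddy_order(total), (size_t)1 << buddy_order(total));
	return 0;
}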
62
63struct ftdev_event {
64 int id;
65 struct ftdev_event* next;
66};
67
68static int activate(struct ftdev_event** chain, int id)
69{
70 struct ftdev_event* ev = kmalloc(sizeof(*ev), GFP_KERNEL);
71 if (ev) {
72 printk(KERN_INFO
73 "Enabling feather-trace event %d.\n", (int) id);
74 ft_enable_event(id);
75 ev->id = id;
76 ev->next = *chain;
77 *chain = ev;
78 }
79 return ev ? 0 : -ENOMEM;
80}
81
82static void deactivate(struct ftdev_event** chain, int id)
83{
84 struct ftdev_event **cur = chain;
85 struct ftdev_event *nxt;
86 while (*cur) {
87 if ((*cur)->id == id) {
88 nxt = (*cur)->next;
89 kfree(*cur);
90 *cur = nxt;
91 printk(KERN_INFO
92 "Disabling feather-trace event %d.\n", (int) id);
93 ft_disable_event(id);
94 break;
95 }
96 cur = &(*cur)->next;
97 }
98}
99
100static int ftdev_open(struct inode *in, struct file *filp)
101{
102 struct ftdev* ftdev;
103 struct ftdev_minor* ftdm;
104 unsigned int buf_idx = iminor(in);
105 int err = 0;
106
107 ftdev = container_of(in->i_cdev, struct ftdev, cdev);
108
109 if (buf_idx >= ftdev->minor_cnt) {
110 err = -ENODEV;
111 goto out;
112 }
113 if (ftdev->can_open && (err = ftdev->can_open(ftdev, buf_idx)))
114 goto out;
115
116 ftdm = ftdev->minor + buf_idx;
117 ftdm->ftdev = ftdev;
118 filp->private_data = ftdm;
119
120 if (mutex_lock_interruptible(&ftdm->lock)) {
121 err = -ERESTARTSYS;
122 goto out;
123 }
124
125 if (!ftdm->readers && ftdev->alloc)
126 err = ftdev->alloc(ftdev, buf_idx);
127 if (0 == err)
128 ftdm->readers++;
129
130 mutex_unlock(&ftdm->lock);
131out:
132 return err;
133}
134
135static int ftdev_release(struct inode *in, struct file *filp)
136{
137 struct ftdev* ftdev;
138 struct ftdev_minor* ftdm;
139 unsigned int buf_idx = iminor(in);
140 int err = 0;
141
142 ftdev = container_of(in->i_cdev, struct ftdev, cdev);
143
144 if (buf_idx >= ftdev->minor_cnt) {
145 err = -ENODEV;
146 goto out;
147 }
148 ftdm = ftdev->minor + buf_idx;
149
150 if (mutex_lock_interruptible(&ftdm->lock)) {
151 err = -ERESTARTSYS;
152 goto out;
153 }
154
155 if (ftdm->readers == 1) {
156 while (ftdm->events)
157 deactivate(&ftdm->events, ftdm->events->id);
158
159 /* wait for any pending events to complete */
160 set_current_state(TASK_UNINTERRUPTIBLE);
161 schedule_timeout(HZ);
162
163 printk(KERN_ALERT "Failed trace writes: %u\n",
164 ftdm->buf->failed_writes);
165
166 if (ftdev->free)
167 ftdev->free(ftdev, buf_idx);
168 }
169
170 ftdm->readers--;
171 mutex_unlock(&ftdm->lock);
172out:
173 return err;
174}
175
176/* based on ft_buffer_read
177 * @returns < 0 : page fault
178 * = 0 : no data available
179 * = 1 : one slot copied
180 */
181static int ft_buffer_copy_to_user(struct ft_buffer* buf, char __user *dest)
182{
183 unsigned int idx;
184 int err = 0;
185 if (buf->free_count != buf->slot_count) {
186 /* data available */
187 idx = buf->read_idx % buf->slot_count;
188 if (buf->slots[idx] == SLOT_READY) {
189 err = copy_to_user(dest, ((char*) buf->buffer_mem) +
190 idx * buf->slot_size,
191 buf->slot_size);
192 if (err == 0) {
193 /* copy ok */
194 buf->slots[idx] = SLOT_FREE;
195 buf->read_idx++;
196 fetch_and_inc(&buf->free_count);
197 err = 1;
198 }
199 }
200 }
201 return err;
202}
203
204static ssize_t ftdev_read(struct file *filp,
205 char __user *to, size_t len, loff_t *f_pos)
206{
207 /* we ignore f_pos, this is strictly sequential */
208
209 ssize_t err = 0;
210 size_t chunk;
211 int copied;
212 struct ftdev_minor* ftdm = filp->private_data;
213
214 if (mutex_lock_interruptible(&ftdm->lock)) {
215 err = -ERESTARTSYS;
216 goto out;
217 }
218
219
220 chunk = ftdm->buf->slot_size;
221 while (len >= chunk) {
222 copied = ft_buffer_copy_to_user(ftdm->buf, to);
223 if (copied == 1) {
224 len -= chunk;
225 to += chunk;
226 err += chunk;
227 } else if (err == 0 && copied == 0 && ftdm->events) {
228 /* Only wait if there are any events enabled and only
229 * if we haven't copied some data yet. We cannot wait
230 * here with copied data because that data would get
231 * lost if the task is interrupted (e.g., killed).
232 */
233 mutex_unlock(&ftdm->lock);
234 set_current_state(TASK_INTERRUPTIBLE);
235
236 schedule_timeout(50);
237
238 if (signal_pending(current)) {
239 if (err == 0)
240 /* nothing read yet, signal problem */
241 err = -ERESTARTSYS;
242 goto out;
243 }
244 if (mutex_lock_interruptible(&ftdm->lock)) {
245 err = -ERESTARTSYS;
246 goto out;
247 }
248 } else if (copied < 0) {
249 /* page fault */
250 err = copied;
251 break;
252 } else
253 /* nothing left to get, return to user space */
254 break;
255 }
256 mutex_unlock(&ftdm->lock);
257out:
258 return err;
259}
260
261static long ftdev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
262{
263 long err = -ENOIOCTLCMD;
264 struct ftdev_minor* ftdm = filp->private_data;
265
266 if (mutex_lock_interruptible(&ftdm->lock)) {
267 err = -ERESTARTSYS;
268 goto out;
269 }
270
271 /* FIXME: check id against list of acceptable events */
272
273 switch (cmd) {
274 case FTDEV_ENABLE_CMD:
275 if (activate(&ftdm->events, arg))
276 err = -ENOMEM;
277 else
278 err = 0;
279 break;
280
281 case FTDEV_DISABLE_CMD:
282 deactivate(&ftdm->events, arg);
283 err = 0;
284 break;
285
286 default:
287 printk(KERN_DEBUG "ftdev: strange ioctl (%u, %lu)\n", cmd, arg);
288 };
289
290 mutex_unlock(&ftdm->lock);
291out:
292 return err;
293}
294
295static ssize_t ftdev_write(struct file *filp, const char __user *from,
296 size_t len, loff_t *f_pos)
297{
298 struct ftdev_minor* ftdm = filp->private_data;
299 ssize_t err = -EINVAL;
300 struct ftdev* ftdev = ftdm->ftdev;
301
302 /* dispatch write to buffer-specific code, if available */
303 if (ftdev->write)
304 err = ftdev->write(ftdm->buf, len, from);
305
306 return err;
307}
308
309struct file_operations ftdev_fops = {
310 .owner = THIS_MODULE,
311 .open = ftdev_open,
312 .release = ftdev_release,
313 .write = ftdev_write,
314 .read = ftdev_read,
315 .unlocked_ioctl = ftdev_ioctl,
316};
317
318int ftdev_init( struct ftdev* ftdev, struct module* owner,
319 const int minor_cnt, const char* name)
320{
321 int i, err;
322
323 BUG_ON(minor_cnt < 1);
324
325 cdev_init(&ftdev->cdev, &ftdev_fops);
326 ftdev->name = name;
327 ftdev->minor_cnt = minor_cnt;
328 ftdev->cdev.owner = owner;
329 ftdev->cdev.ops = &ftdev_fops;
330 ftdev->alloc = NULL;
331 ftdev->free = NULL;
332 ftdev->can_open = NULL;
333 ftdev->write = NULL;
334
335 ftdev->minor = kcalloc(ftdev->minor_cnt, sizeof(*ftdev->minor),
336 GFP_KERNEL);
337 if (!ftdev->minor) {
338 printk(KERN_WARNING "ftdev(%s): Could not allocate memory\n",
339 ftdev->name);
340 err = -ENOMEM;
341 goto err_out;
342 }
343
344 for (i = 0; i < ftdev->minor_cnt; i++) {
345 mutex_init(&ftdev->minor[i].lock);
346 ftdev->minor[i].readers = 0;
347 ftdev->minor[i].buf = NULL;
348 ftdev->minor[i].events = NULL;
349 }
350
351 ftdev->class = class_create(owner, ftdev->name);
352 if (IS_ERR(ftdev->class)) {
353 err = PTR_ERR(ftdev->class);
354 printk(KERN_WARNING "ftdev(%s): "
355 "Could not create device class.\n", ftdev->name);
356 goto err_dealloc;
357 }
358
359 return 0;
360
361err_dealloc:
362 kfree(ftdev->minor);
363err_out:
364 return err;
365}
366
367/*
368 * Destroy minor devices up to, but not including, up_to.
369 */
370static void ftdev_device_destroy(struct ftdev* ftdev, unsigned int up_to)
371{
372 dev_t minor_cntr;
373
374 if (up_to < 1)
375 up_to = (ftdev->minor_cnt < 1) ? 0 : ftdev->minor_cnt;
376
377 for (minor_cntr = 0; minor_cntr < up_to; ++minor_cntr)
378 device_destroy(ftdev->class, MKDEV(ftdev->major, minor_cntr));
379}
380
381void ftdev_exit(struct ftdev* ftdev)
382{
383 printk("ftdev(%s): Exiting\n", ftdev->name);
384 ftdev_device_destroy(ftdev, -1);
385 cdev_del(&ftdev->cdev);
386 unregister_chrdev_region(MKDEV(ftdev->major, 0), ftdev->minor_cnt);
387 class_destroy(ftdev->class);
388 kfree(ftdev->minor);
389}
390
391int register_ftdev(struct ftdev* ftdev)
392{
393 struct device **device;
394 dev_t trace_dev_tmp, minor_cntr;
395 int err;
396
397 err = alloc_chrdev_region(&trace_dev_tmp, 0, ftdev->minor_cnt,
398 ftdev->name);
399 if (err) {
400 printk(KERN_WARNING "ftdev(%s): "
401 "Could not allocate char. device region (%d minors)\n",
402 ftdev->name, ftdev->minor_cnt);
403 goto err_out;
404 }
405
406 ftdev->major = MAJOR(trace_dev_tmp);
407
408 err = cdev_add(&ftdev->cdev, trace_dev_tmp, ftdev->minor_cnt);
409 if (err) {
410 printk(KERN_WARNING "ftdev(%s): "
411 "Could not add cdev for major %u with %u minor(s).\n",
412 ftdev->name, ftdev->major, ftdev->minor_cnt);
413 goto err_unregister;
414 }
415
416 /* create the minor device(s) */
417 for (minor_cntr = 0; minor_cntr < ftdev->minor_cnt; ++minor_cntr)
418 {
419 trace_dev_tmp = MKDEV(ftdev->major, minor_cntr);
420 device = &ftdev->minor[minor_cntr].device;
421
422 *device = device_create(ftdev->class, NULL, trace_dev_tmp, NULL,
423 "litmus/%s%d", ftdev->name, minor_cntr);
424 if (IS_ERR(*device)) {
425 err = PTR_ERR(*device);
426 printk(KERN_WARNING "ftdev(%s): "
427 "Could not create device major/minor number "
428 "%u/%u\n", ftdev->name, ftdev->major,
429 minor_cntr);
430 printk(KERN_WARNING "ftdev(%s): "
431 "will attempt deletion of allocated devices.\n",
432 ftdev->name);
433 goto err_minors;
434 }
435 }
436
437 return 0;
438
439err_minors:
440 ftdev_device_destroy(ftdev, minor_cntr);
441 cdev_del(&ftdev->cdev);
442err_unregister:
443 unregister_chrdev_region(MKDEV(ftdev->major, 0), ftdev->minor_cnt);
444err_out:
445 return err;
446}
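A hypothetical sketch of how a tracing client is expected to combine the pieces above: initialize the ftdev, hook per-minor buffer callbacks, and register the character device. The callback bodies, the device name, and the slot geometry (256 slots of 64 bytes) are made-up placeholders, and cleanup on module exit is omitted; this is not part of the patch.

#include <linux/module.h>
#include <linux/errno.h>

#include <litmus/ftdev.h>

static struct ftdev overhead_dev;   /* hypothetical client device */

/* called on first open of a minor (see ftdev_open() above) */
static int overhead_alloc(struct ftdev *dev, unsigned int idx)
{
	dev->minor[idx].buf = alloc_ft_buffer(256, 64);
	return dev->minor[idx].buf ? 0 : -ENOMEM;
}

/* called on last release of a minor (see ftdev_release() above) */
static void overhead_free(struct ftdev *dev, unsigned int idx)
{
	free_ft_buffer(dev->minor[idx].buf);
	dev->minor[idx].buf = NULL;
}

static int __init overhead_dev_init(void)
{
	int err = ftdev_init(&overhead_dev, THIS_MODULE, 1, "overheads");

	if (err)
		return err;
	overhead_dev.alloc = overhead_alloc;
	overhead_dev.free  = overhead_free;
	return register_ftdev(&overhead_dev);
}
module_init(overhead_dev_init);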
diff --git a/litmus/jobs.c b/litmus/jobs.c
new file mode 100644
index 00000000000..7bc75bba863
--- /dev/null
+++ b/litmus/jobs.c
@@ -0,0 +1,73 @@
1/* litmus/jobs.c - common job control code
2 */
3
4#include <linux/sched.h>
5
6#include <litmus/litmus.h>
7#include <litmus/jobs.h>
8#include <litmus/trace.h>
9
10static inline void setup_release(struct task_struct *t, lt_t release)
11{
12 /* prepare next release */
13 tsk_rt(t)->job_params.release = release;
14 tsk_rt(t)->job_params.deadline = release + get_rt_relative_deadline(t);
15 tsk_rt(t)->job_params.exec_time = 0;
16 /* update job sequence number */
17 tsk_rt(t)->job_params.job_no++;
18
19 /* don't confuse Linux */
20 t->rt.time_slice = 1;
21}
22
23void prepare_for_next_period(struct task_struct *t)
24{
25 BUG_ON(!t);
26
27 /* Record lateness before we set up the next job's
28 * release and deadline. Lateness may be negative.
29 */
30 t->rt_param.job_params.lateness =
31 (long long)litmus_clock() -
32 (long long)t->rt_param.job_params.deadline;
33
34 setup_release(t, get_release(t) + get_rt_period(t));
35}
36
37void release_at(struct task_struct *t, lt_t start)
38{
39 BUG_ON(!t);
40 setup_release(t, start);
41 tsk_rt(t)->completed = 0;
42}
43
44
45/*
46 * Deactivate current task until the beginning of the next period.
47 */
48long complete_job(void)
49{
50 lt_t amount;
51 lt_t now = litmus_clock();
52 lt_t exec_time = tsk_rt(current)->job_params.exec_time;
53
54 tsk_rt(current)->tot_exec_time += exec_time;
55 if (lt_before(tsk_rt(current)->max_exec_time, exec_time))
56 tsk_rt(current)->max_exec_time = exec_time;
57
58 if (is_tardy(current, now)) {
59 amount = now - get_deadline(current);
60 if (lt_after(amount, tsk_rt(current)->max_tardy))
61 tsk_rt(current)->max_tardy = amount;
62 tsk_rt(current)->total_tardy += amount;
63 ++tsk_rt(current)->missed;
64 }
65
66 /* Mark that we do not execute anymore */
67 tsk_rt(current)->completed = 1;
68 /* call schedule, this will return when a new job arrives
69 * it also takes care of preparing for the next release
70 */
71 schedule();
72 return 0;
73}
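For context, the user-space pattern that complete_job() supports is a simple job loop: do one job's worth of work, then suspend until the next release. A minimal sketch, assuming a liblitmus-style wrapper sleep_next_period() around sys_complete_job() and a hypothetical do_work() function:

#include <stdio.h>

/* assumed user-space wrapper around sys_complete_job() (as in liblitmus) */
extern int sleep_next_period(void);

/* hypothetical per-job workload */
static void do_work(unsigned int job)
{
	printf("running job %u\n", job);
}

int main(void)
{
	unsigned int job;

	for (job = 0; job < 1000; job++) {
		do_work(job);
		/* mark the job complete and sleep until the next release */
		if (sleep_next_period() != 0)
			break;
	}
	return 0;
}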
diff --git a/litmus/litmus.c b/litmus/litmus.c
new file mode 100644
index 00000000000..dc94be71bfb
--- /dev/null
+++ b/litmus/litmus.c
@@ -0,0 +1,579 @@
1/*
2 * litmus.c -- Implementation of the LITMUS syscalls,
3 * the LITMUS initialization code,
4 * and the procfs interface.
5 */
6#include <asm/uaccess.h>
7#include <linux/uaccess.h>
8#include <linux/sysrq.h>
9#include <linux/sched.h>
10#include <linux/module.h>
11#include <linux/slab.h>
12#include <linux/reboot.h>
13#include <linux/stop_machine.h>
14
15#include <litmus/litmus.h>
16#include <litmus/bheap.h>
17#include <litmus/trace.h>
18#include <litmus/rt_domain.h>
19#include <litmus/litmus_proc.h>
20#include <litmus/sched_trace.h>
21
22#ifdef CONFIG_SCHED_CPU_AFFINITY
23#include <litmus/affinity.h>
24#endif
25
26/* Number of RT tasks that exist in the system */
27atomic_t rt_task_count = ATOMIC_INIT(0);
28
29/* Give log messages sequential IDs. */
30atomic_t __log_seq_no = ATOMIC_INIT(0);
31
32#ifdef CONFIG_RELEASE_MASTER
33/* current master CPU for handling timer IRQs */
34atomic_t release_master_cpu = ATOMIC_INIT(NO_CPU);
35#endif
36
37static struct kmem_cache * bheap_node_cache;
38extern struct kmem_cache * release_heap_cache;
39
40struct bheap_node* bheap_node_alloc(int gfp_flags)
41{
42 return kmem_cache_alloc(bheap_node_cache, gfp_flags);
43}
44
45void bheap_node_free(struct bheap_node* hn)
46{
47 kmem_cache_free(bheap_node_cache, hn);
48}
49
50struct release_heap* release_heap_alloc(int gfp_flags);
51void release_heap_free(struct release_heap* rh);
52
53/*
54 * sys_set_task_rt_param
55 * @pid: PID of the task whose scheduling parameters are to be changed
56 * @param: New real-time extension parameters such as the execution cost and
57 * period
58 * Syscall for manipulating a task's rt extension params.
59 * Returns EFAULT if copying param from user space failed,
60 * ESRCH if pid does not correspond
61 * to a valid task,
62 * EINVAL if param is NULL or either period or execution cost is <= 0,
63 * EBUSY if pid already refers to a real-time task,
64 * 0 on success.
65 *
66 * Only non-real-time tasks may be configured with this system call
67 * to avoid races with the scheduler. In practice, this means that a
68 * task's parameters must be set _before_ calling sys_prepare_rt_task()
69 *
70 * find_task_by_vpid() assumes that we are in the same namespace as the
71 * target.
72 */
73asmlinkage long sys_set_rt_task_param(pid_t pid, struct rt_task __user * param)
74{
75 struct rt_task tp;
76 struct task_struct *target;
77 int retval = -EINVAL;
78
79 printk("Setting up rt task parameters for process %d.\n", pid);
80
81 if (pid < 0 || param == 0) {
82 goto out;
83 }
84 if (copy_from_user(&tp, param, sizeof(tp))) {
85 retval = -EFAULT;
86 goto out;
87 }
88
89 /* Task search and manipulation must be protected */
90 read_lock_irq(&tasklist_lock);
91 if (!(target = find_task_by_vpid(pid))) {
92 retval = -ESRCH;
93 goto out_unlock;
94 }
95
96 if (is_realtime(target)) {
97 /* The task is already a real-time task.
98 * We cannot allow parameter changes at this point.
99 */
100 retval = -EBUSY;
101 goto out_unlock;
102 }
103
104 /* set relative deadline to be implicit if left unspecified */
105 if (tp.relative_deadline == 0)
106 tp.relative_deadline = tp.period;
107
108 if (tp.exec_cost <= 0)
109 goto out_unlock;
110 if (tp.period <= 0)
111 goto out_unlock;
112 if (!cpu_online(tp.cpu))
113 goto out_unlock;
114 if (min(tp.relative_deadline, tp.period) < tp.exec_cost) /*density check*/
115 {
116 printk(KERN_INFO "litmus: real-time task %d rejected "
117 "because task density > 1.0\n", pid);
118 goto out_unlock;
119 }
120 if (tp.cls != RT_CLASS_HARD &&
121 tp.cls != RT_CLASS_SOFT &&
122 tp.cls != RT_CLASS_BEST_EFFORT)
123 {
124 printk(KERN_INFO "litmus: real-time task %d rejected "
125 "because its class is invalid\n", pid);
126 goto out_unlock;
127 }
128 if (tp.budget_policy != NO_ENFORCEMENT &&
129 tp.budget_policy != QUANTUM_ENFORCEMENT &&
130 tp.budget_policy != PRECISE_ENFORCEMENT)
131 {
132 printk(KERN_INFO "litmus: real-time task %d rejected "
133 "because unsupported budget enforcement policy "
134 "specified (%d)\n",
135 pid, tp.budget_policy);
136 goto out_unlock;
137 }
138
139 target->rt_param.task_params = tp;
140
141 retval = 0;
142 out_unlock:
143 read_unlock_irq(&tasklist_lock);
144 out:
145 return retval;
146}
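The parameter checks above can be summarized as: fill in an implicit deadline if none was given, require positive cost and period, and reject any task whose cost exceeds min(relative deadline, period). A small user-space illustration of that acceptance test (all values in nanoseconds; the numbers are made up):

#include <stdio.h>

/* mirrors the validation order used by sys_set_rt_task_param() above */
static int params_ok(long long exec_cost, long long period,
		     long long relative_deadline)
{
	long long min_window;

	if (relative_deadline == 0)
		relative_deadline = period;       /* implicit deadline */
	if (exec_cost <= 0 || period <= 0)
		return 0;
	min_window = relative_deadline < period ? relative_deadline : period;
	return min_window >= exec_cost;           /* "density" check */
}

int main(void)
{
	/* 10 ms of budget every 100 ms: accepted */
	printf("%d\n", params_ok(10000000LL, 100000000LL, 0));
	/* 60 ms of budget with a 50 ms constrained deadline: rejected */
	printf("%d\n", params_ok(60000000LL, 100000000LL, 50000000LL));
	return 0;
}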
147
148/*
149 * Getter of a task's RT params
150 * returns EINVAL if param is NULL or pid is invalid
151 * returns ESRCH if pid does not correspond to a valid task
152 * returns EFAULT if copying of parameters has failed.
153 *
154 * find_task_by_vpid() assumes that we are in the same namespace as the
155 * target.
156 */
157asmlinkage long sys_get_rt_task_param(pid_t pid, struct rt_task __user * param)
158{
159 int retval = -EINVAL;
160 struct task_struct *source;
161 struct rt_task lp;
162 if (param == 0 || pid < 0)
163 goto out;
164 read_lock(&tasklist_lock);
165 if (!(source = find_task_by_vpid(pid))) {
166 retval = -ESRCH;
167 goto out_unlock;
168 }
169 lp = source->rt_param.task_params;
170 read_unlock(&tasklist_lock);
171 /* Do copying outside the lock */
172 retval =
173 copy_to_user(param, &lp, sizeof(lp)) ? -EFAULT : 0;
174 return retval;
175 out_unlock:
176 read_unlock(&tasklist_lock);
177 out:
178 return retval;
179
180}
181
182/*
183 * This is the crucial function for the periodic task implementation.
184 * It checks whether a task is periodic, whether such a sleep
185 * is permitted, and calls the plugin-specific sleep, which puts the
186 * task into a wait array.
187 * returns 0 on successful wakeup
188 * returns EPERM if current conditions do not permit such sleep
189 * returns EINVAL if current task is not able to go to sleep
190 */
191asmlinkage long sys_complete_job(void)
192{
193 int retval = -EPERM;
194 if (!is_realtime(current)) {
195 retval = -EINVAL;
196 goto out;
197 }
198 /* Task with negative or zero period cannot sleep */
199 if (get_rt_period(current) <= 0) {
200 retval = -EINVAL;
201 goto out;
202 }
203 /* The plugin has to put the task into an
204 * appropriate queue and call schedule
205 */
206 retval = litmus->complete_job();
207 out:
208 return retval;
209}
210
211/* This is an "improved" version of sys_complete_job that
212 * addresses the problem of unintentionally missing a job after
213 * an overrun.
214 *
215 * returns 0 on successful wakeup
216 * returns EPERM if current conditions do not permit such sleep
217 * returns EINVAL if current task is not able to go to sleep
218 */
219asmlinkage long sys_wait_for_job_release(unsigned int job)
220{
221 int retval = -EPERM;
222 if (!is_realtime(current)) {
223 retval = -EINVAL;
224 goto out;
225 }
226
227 /* Task with negative or zero period cannot sleep */
228 if (get_rt_period(current) <= 0) {
229 retval = -EINVAL;
230 goto out;
231 }
232
233 retval = 0;
234
235 /* first wait until we have "reached" the desired job
236 *
237 * This implementation has at least two problems:
238 *
239 * 1) It doesn't gracefully handle the wrap around of
240 * job_no. Since LITMUS is a prototype, this is not much
241 * of a problem right now.
242 *
243 * 2) It is theoretically racy if a job release occurs
244 * between checking job_no and calling sleep_next_period().
245 * A proper solution would require adding another callback
246 * in the plugin structure and testing the condition with
247 * interrupts disabled.
248 *
249 * FIXME: At least problem 2 should be taken care of eventually.
250 */
251 while (!retval && job > current->rt_param.job_params.job_no)
252 /* If the last job overran then job <= job_no and we
253 * don't send the task to sleep.
254 */
255 retval = litmus->complete_job();
256 out:
257 return retval;
258}
259
260/* This is a helper syscall to query the current job sequence number.
261 *
262 * returns 0 on successful query
263 * returns EPERM if task is not a real-time task.
264 * returns EFAULT if &job is not a valid pointer.
265 */
266asmlinkage long sys_query_job_no(unsigned int __user *job)
267{
268 int retval = -EPERM;
269 if (is_realtime(current))
270 retval = put_user(current->rt_param.job_params.job_no, job);
271
272 return retval;
273}
274
275/* sys_null_call() is only used for determining raw system call
276 * overheads (kernel entry, kernel exit). It has no useful side effects.
277 * If ts is non-NULL, then the current Feather-Trace time is recorded.
278 */
279asmlinkage long sys_null_call(cycles_t __user *ts)
280{
281 long ret = 0;
282 cycles_t now;
283
284 if (ts) {
285 now = get_cycles();
286 ret = put_user(now, ts);
287 }
288
289 return ret;
290}
291
292/* p is a real-time task. Re-init its state as a best-effort task. */
293static void reinit_litmus_state(struct task_struct* p, int restore)
294{
295 struct rt_task user_config = {};
296 void* ctrl_page = NULL;
297
298 if (restore) {
299 /* Save the user-space provided configuration data
300 * and the allocated control page. */
301 user_config = p->rt_param.task_params;
302 ctrl_page = p->rt_param.ctrl_page;
303 }
304
305 /* We probably should not be inheriting any task's priority
306 * at this point in time.
307 */
308 WARN_ON(p->rt_param.inh_task);
309
310 /* Cleanup everything else. */
311 memset(&p->rt_param, 0, sizeof(p->rt_param));
312
313 /* Restore preserved fields. */
314 if (restore) {
315 p->rt_param.task_params = user_config;
316 p->rt_param.ctrl_page = ctrl_page;
317 }
318}
319
320long litmus_admit_task(struct task_struct* tsk)
321{
322 long retval = 0;
323
324 BUG_ON(is_realtime(tsk));
325
326 tsk_rt(tsk)->heap_node = NULL;
327 tsk_rt(tsk)->rel_heap = NULL;
328
329 if (get_rt_relative_deadline(tsk) == 0 ||
330 get_exec_cost(tsk) >
331 min(get_rt_relative_deadline(tsk), get_rt_period(tsk)) ) {
332 TRACE_TASK(tsk,
333 "litmus admit: invalid task parameters "
334 "(e = %lu, p = %lu, d = %lu)\n",
335 get_exec_cost(tsk), get_rt_period(tsk),
336 get_rt_relative_deadline(tsk));
337 retval = -EINVAL;
338 goto out;
339 }
340
341 if (!cpu_online(get_partition(tsk))) {
342 TRACE_TASK(tsk, "litmus admit: cpu %d is not online\n",
343 get_partition(tsk));
344 retval = -EINVAL;
345 goto out;
346 }
347
348 INIT_LIST_HEAD(&tsk_rt(tsk)->list);
349
350 /* allocate heap node for this task */
351 tsk_rt(tsk)->heap_node = bheap_node_alloc(GFP_ATOMIC);
352 tsk_rt(tsk)->rel_heap = release_heap_alloc(GFP_ATOMIC);
353
354 if (!tsk_rt(tsk)->heap_node || !tsk_rt(tsk)->rel_heap) {
355 printk(KERN_WARNING "litmus: no more heap node memory!?\n");
356
357 retval = -ENOMEM;
358 goto out;
359 } else {
360 bheap_node_init(&tsk_rt(tsk)->heap_node, tsk);
361 }
362
363 preempt_disable();
364
365 retval = litmus->admit_task(tsk);
366
367 if (!retval) {
368 sched_trace_task_name(tsk);
369 sched_trace_task_param(tsk);
370 atomic_inc(&rt_task_count);
371 }
372
373 preempt_enable();
374
375out:
376 if (retval) {
377 bheap_node_free(tsk_rt(tsk)->heap_node);
378 release_heap_free(tsk_rt(tsk)->rel_heap);
379 }
380 return retval;
381}
382
383void litmus_exit_task(struct task_struct* tsk)
384{
385 if (is_realtime(tsk)) {
386 sched_trace_task_completion(tsk, 1);
387
388 litmus->task_exit(tsk);
389
390 BUG_ON(bheap_node_in_heap(tsk_rt(tsk)->heap_node));
391 bheap_node_free(tsk_rt(tsk)->heap_node);
392 release_heap_free(tsk_rt(tsk)->rel_heap);
393
394 atomic_dec(&rt_task_count);
395 reinit_litmus_state(tsk, 1);
396 }
397}
398
399static int do_plugin_switch(void *_plugin)
400{
401 int ret;
402 struct sched_plugin* plugin = _plugin;
403
404 /* don't switch if there are active real-time tasks */
405 if (atomic_read(&rt_task_count) == 0) {
406 ret = litmus->deactivate_plugin();
407 if (0 != ret)
408 goto out;
409 ret = plugin->activate_plugin();
410 if (0 != ret) {
411 printk(KERN_INFO "Can't activate %s (%d).\n",
412 plugin->plugin_name, ret);
413 plugin = &linux_sched_plugin;
414 }
415 printk(KERN_INFO "Switching to LITMUS^RT plugin %s.\n", plugin->plugin_name);
416 litmus = plugin;
417 } else
418 ret = -EBUSY;
419out:
420 return ret;
421}
422
423/* Switching a plugin in use is tricky.
424 * We must watch out that no real-time tasks exist
425 * (and that none is created in parallel) and that the plugin is not
426 * currently in use on any processor (in theory).
427 */
428int switch_sched_plugin(struct sched_plugin* plugin)
429{
430 BUG_ON(!plugin);
431
432 if (atomic_read(&rt_task_count) == 0)
433 return stop_machine(do_plugin_switch, plugin, NULL);
434 else
435 return -EBUSY;
436}
437
438/* Called upon fork.
439 * p is the newly forked task.
440 */
441void litmus_fork(struct task_struct* p)
442{
443 if (is_realtime(p)) {
444 /* clean out any litmus related state, don't preserve anything */
445 reinit_litmus_state(p, 0);
446 /* Don't let the child be a real-time task. */
447 p->sched_reset_on_fork = 1;
448 } else
449 /* non-rt tasks might have ctrl_page set */
450 tsk_rt(p)->ctrl_page = NULL;
451
452 /* od tables are never inherited across a fork */
453 p->od_table = NULL;
454}
455
456/* Called upon execve().
457 * current is doing the exec.
458 * Don't let address space specific stuff leak.
459 */
460void litmus_exec(void)
461{
462 struct task_struct* p = current;
463
464 if (is_realtime(p)) {
465 WARN_ON(p->rt_param.inh_task);
466 if (tsk_rt(p)->ctrl_page) {
467 free_page((unsigned long) tsk_rt(p)->ctrl_page);
468 tsk_rt(p)->ctrl_page = NULL;
469 }
470 }
471}
472
473void exit_litmus(struct task_struct *dead_tsk)
474{
475 /* Non-RT tasks are also allowed to
476 * allocate control pages, which enables
477 * measurements with non-RT tasks.
478 * So check whether we need to free the page
479 * in any case.
480 */
481 if (tsk_rt(dead_tsk)->ctrl_page) {
482 TRACE_TASK(dead_tsk,
483 "freeing ctrl_page %p\n",
484 tsk_rt(dead_tsk)->ctrl_page);
485 free_page((unsigned long) tsk_rt(dead_tsk)->ctrl_page);
486 }
487
488 /* main cleanup only for RT tasks */
489 if (is_realtime(dead_tsk))
490 litmus_exit_task(dead_tsk);
491}
492
493
494#ifdef CONFIG_MAGIC_SYSRQ
495int sys_kill(int pid, int sig);
496
497static void sysrq_handle_kill_rt_tasks(int key)
498{
499 struct task_struct *t;
500 read_lock(&tasklist_lock);
501 for_each_process(t) {
502 if (is_realtime(t)) {
503 sys_kill(t->pid, SIGKILL);
504 }
505 }
506 read_unlock(&tasklist_lock);
507}
508
509static struct sysrq_key_op sysrq_kill_rt_tasks_op = {
510 .handler = sysrq_handle_kill_rt_tasks,
511 .help_msg = "quit-rt-tasks(X)",
512 .action_msg = "sent SIGKILL to all LITMUS^RT real-time tasks",
513};
514#endif
515
516extern struct sched_plugin linux_sched_plugin;
517
518static int litmus_shutdown_nb(struct notifier_block *unused1,
519 unsigned long unused2, void *unused3)
520{
521 /* Attempt to switch back to regular Linux scheduling.
522 * Forces the active plugin to clean up.
523 */
524 if (litmus != &linux_sched_plugin) {
525 int ret = switch_sched_plugin(&linux_sched_plugin);
526 if (ret) {
527 printk("Auto-shutdown of active Litmus plugin failed.\n");
528 }
529 }
530 return NOTIFY_DONE;
531}
532
533static struct notifier_block shutdown_notifier = {
534 .notifier_call = litmus_shutdown_nb,
535};
536
537static int __init _init_litmus(void)
538{
539 /* Common initializers;
540 * the mode change lock is used to enforce a single mode-change
541 * operation.
542 */
543 printk("Starting LITMUS^RT kernel\n");
544
545 register_sched_plugin(&linux_sched_plugin);
546
547 bheap_node_cache = KMEM_CACHE(bheap_node, SLAB_PANIC);
548 release_heap_cache = KMEM_CACHE(release_heap, SLAB_PANIC);
549
550#ifdef CONFIG_MAGIC_SYSRQ
551 /* offer some debugging help */
552 if (!register_sysrq_key('x', &sysrq_kill_rt_tasks_op))
553 printk("Registered kill rt tasks magic sysrq.\n");
554 else
555 printk("Could not register kill rt tasks magic sysrq.\n");
556#endif
557
558 init_litmus_proc();
559
560#ifdef CONFIG_SCHED_CPU_AFFINITY
561 init_topology();
562#endif
563
564 register_reboot_notifier(&shutdown_notifier);
565
566 return 0;
567}
568
569static void _exit_litmus(void)
570{
571 unregister_reboot_notifier(&shutdown_notifier);
572
573 exit_litmus_proc();
574 kmem_cache_destroy(bheap_node_cache);
575 kmem_cache_destroy(release_heap_cache);
576}
577
578module_init(_init_litmus);
579module_exit(_exit_litmus);
diff --git a/litmus/litmus_proc.c b/litmus/litmus_proc.c
new file mode 100644
index 00000000000..4bf725a36c9
--- /dev/null
+++ b/litmus/litmus_proc.c
@@ -0,0 +1,347 @@
1/*
2 * litmus_proc.c -- Implementation of the /proc/litmus directory tree.
3 */
4
5#include <linux/sched.h>
6#include <linux/uaccess.h>
7
8#include <litmus/litmus.h>
9#include <litmus/litmus_proc.h>
10
11#include <litmus/clustered.h>
12
13/* in litmus/litmus.c */
14extern atomic_t rt_task_count;
15
16static struct proc_dir_entry *litmus_dir = NULL,
17 *curr_file = NULL,
18 *stat_file = NULL,
19 *plugs_dir = NULL,
20#ifdef CONFIG_RELEASE_MASTER
21 *release_master_file = NULL,
22#endif
23 *plugs_file = NULL;
24
25/* in litmus/sync.c */
26int count_tasks_waiting_for_release(void);
27
28static int proc_read_stats(char *page, char **start,
29 off_t off, int count,
30 int *eof, void *data)
31{
32 int len;
33
34 len = snprintf(page, PAGE_SIZE,
35 "real-time tasks = %d\n"
36 "ready for release = %d\n",
37 atomic_read(&rt_task_count),
38 count_tasks_waiting_for_release());
39 return len;
40}
41
42static int proc_read_plugins(char *page, char **start,
43 off_t off, int count,
44 int *eof, void *data)
45{
46 int len;
47
48 len = print_sched_plugins(page, PAGE_SIZE);
49 return len;
50}
51
52static int proc_read_curr(char *page, char **start,
53 off_t off, int count,
54 int *eof, void *data)
55{
56 int len;
57
58 len = snprintf(page, PAGE_SIZE, "%s\n", litmus->plugin_name);
59 return len;
60}
61
62/* in litmus/litmus.c */
63int switch_sched_plugin(struct sched_plugin*);
64
65static int proc_write_curr(struct file *file,
66 const char *buffer,
67 unsigned long count,
68 void *data)
69{
70 int len, ret;
71 char name[65];
72 struct sched_plugin* found;
73
74 len = copy_and_chomp(name, sizeof(name), buffer, count);
75 if (len < 0)
76 return len;
77
78 found = find_sched_plugin(name);
79
80 if (found) {
81 ret = switch_sched_plugin(found);
82 if (ret != 0)
83 printk(KERN_INFO "Could not switch plugin: %d\n", ret);
84 } else
85 printk(KERN_INFO "Plugin '%s' is unknown.\n", name);
86
87 return len;
88}
89
90#ifdef CONFIG_RELEASE_MASTER
91static int proc_read_release_master(char *page, char **start,
92 off_t off, int count,
93 int *eof, void *data)
94{
95 int len, master;
96 master = atomic_read(&release_master_cpu);
97 if (master == NO_CPU)
98 len = snprintf(page, PAGE_SIZE, "NO_CPU\n");
99 else
100 len = snprintf(page, PAGE_SIZE, "%d\n", master);
101 return len;
102}
103
104static int proc_write_release_master(struct file *file,
105 const char *buffer,
106 unsigned long count,
107 void *data)
108{
109 int cpu, err, len, online = 0;
110 char msg[64];
111
112 len = copy_and_chomp(msg, sizeof(msg), buffer, count);
113
114 if (len < 0)
115 return len;
116
117 if (strcmp(msg, "NO_CPU") == 0)
118 atomic_set(&release_master_cpu, NO_CPU);
119 else {
120 err = sscanf(msg, "%d", &cpu);
121 if (err == 1 && cpu >= 0 && (online = cpu_online(cpu))) {
122 atomic_set(&release_master_cpu, cpu);
123 } else {
124 TRACE("invalid release master: '%s' "
125 "(err:%d cpu:%d online:%d)\n",
126 msg, err, cpu, online);
127 len = -EINVAL;
128 }
129 }
130 return len;
131}
132#endif
133
134int __init init_litmus_proc(void)
135{
136 litmus_dir = proc_mkdir("litmus", NULL);
137 if (!litmus_dir) {
138 printk(KERN_ERR "Could not allocate LITMUS^RT procfs entry.\n");
139 return -ENOMEM;
140 }
141
142 curr_file = create_proc_entry("active_plugin",
143 0644, litmus_dir);
144 if (!curr_file) {
145 printk(KERN_ERR "Could not allocate active_plugin "
146 "procfs entry.\n");
147 return -ENOMEM;
148 }
149 curr_file->read_proc = proc_read_curr;
150 curr_file->write_proc = proc_write_curr;
151
152#ifdef CONFIG_RELEASE_MASTER
153 release_master_file = create_proc_entry("release_master",
154 0644, litmus_dir);
155 if (!release_master_file) {
156 printk(KERN_ERR "Could not allocate release_master "
157 "procfs entry.\n");
158 return -ENOMEM;
159 }
160 release_master_file->read_proc = proc_read_release_master;
161 release_master_file->write_proc = proc_write_release_master;
162#endif
163
164 stat_file = create_proc_read_entry("stats", 0444, litmus_dir,
165 proc_read_stats, NULL);
166
167 plugs_dir = proc_mkdir("plugins", litmus_dir);
168 if (!plugs_dir){
169 printk(KERN_ERR "Could not allocate plugins directory "
170 "procfs entry.\n");
171 return -ENOMEM;
172 }
173
174 plugs_file = create_proc_read_entry("loaded", 0444, plugs_dir,
175 proc_read_plugins, NULL);
176
177 return 0;
178}
179
180void exit_litmus_proc(void)
181{
182 if (plugs_file)
183 remove_proc_entry("loaded", plugs_dir);
184 if (plugs_dir)
185 remove_proc_entry("plugins", litmus_dir);
186 if (stat_file)
187 remove_proc_entry("stats", litmus_dir);
188 if (curr_file)
189 remove_proc_entry("active_plugin", litmus_dir);
190#ifdef CONFIG_RELEASE_MASTER
191 if (release_master_file)
192 remove_proc_entry("release_master", litmus_dir);
193#endif
194 if (litmus_dir)
195 remove_proc_entry("litmus", NULL);
196}
197
198long make_plugin_proc_dir(struct sched_plugin* plugin,
199 struct proc_dir_entry** pde_in)
200{
201 struct proc_dir_entry *pde_new = NULL;
202 long rv;
203
204 if (!plugin || !plugin->plugin_name){
205 printk(KERN_ERR "Invalid plugin struct passed to %s.\n",
206 __func__);
207 rv = -EINVAL;
208 goto out_no_pde;
209 }
210
211 if (!plugs_dir){
212 printk(KERN_ERR "Could not make plugin sub-directory, because "
213 "/proc/litmus/plugins does not exist.\n");
214 rv = -ENOENT;
215 goto out_no_pde;
216 }
217
218 pde_new = proc_mkdir(plugin->plugin_name, plugs_dir);
219 if (!pde_new){
220 printk(KERN_ERR "Could not make plugin sub-directory: "
221 "out of memory?\n");
222 rv = -ENOMEM;
223 goto out_no_pde;
224 }
225
226 rv = 0;
227 *pde_in = pde_new;
228 goto out_ok;
229
230out_no_pde:
231 *pde_in = NULL;
232out_ok:
233 return rv;
234}
235
236void remove_plugin_proc_dir(struct sched_plugin* plugin)
237{
238 if (!plugin || !plugin->plugin_name){
239 printk(KERN_ERR "Invalid plugin struct passed to %s.\n",
240 __func__);
241 return;
242 }
243 remove_proc_entry(plugin->plugin_name, plugs_dir);
244}
245
246
247
248/* misc. I/O helper functions */
249
250int copy_and_chomp(char *kbuf, unsigned long ksize,
251 __user const char* ubuf, unsigned long ulength)
252{
253 /* caller must provide buffer space */
254 BUG_ON(!ksize);
255
256 ksize--; /* leave space for null byte */
257
258 if (ksize > ulength)
259 ksize = ulength;
260
261 if(copy_from_user(kbuf, ubuf, ksize))
262 return -EFAULT;
263
264 kbuf[ksize] = '\0';
265
266 /* chomp kbuf */
267 if (ksize > 0 && kbuf[ksize - 1] == '\n')
268 kbuf[ksize - 1] = '\0';
269
270 return ksize;
271}
272
273/* helper functions for clustered plugins */
274static const char* cache_level_names[] = {
275 "ALL",
276 "L1",
277 "L2",
278 "L3",
279};
280
281int parse_cache_level(const char *cache_name, enum cache_level *level)
282{
283 int err = -EINVAL;
284 int i;
285 /* do a quick and dirty comparison to find the cluster size */
286 for (i = GLOBAL_CLUSTER; i <= L3_CLUSTER; i++)
287 if (!strcmp(cache_name, cache_level_names[i])) {
288 *level = (enum cache_level) i;
289 err = 0;
290 break;
291 }
292 return err;
293}
294
295const char* cache_level_name(enum cache_level level)
296{
297 int idx = level;
298
299 if (idx >= GLOBAL_CLUSTER && idx <= L3_CLUSTER)
300 return cache_level_names[idx];
301 else
302 return "INVALID";
303}
304
305
306/* proc file interface to configure the cluster size */
307static int proc_read_cluster_size(char *page, char **start,
308 off_t off, int count,
309 int *eof, void *data)
310{
311 return snprintf(page, PAGE_SIZE, "%s\n",
312 cache_level_name(*((enum cache_level*) data)));
313}
314
315static int proc_write_cluster_size(struct file *file,
316 const char *buffer,
317 unsigned long count,
318 void *data)
319{
320 int len;
321 char cache_name[8];
322
323 len = copy_and_chomp(cache_name, sizeof(cache_name), buffer, count);
324
325 if (len > 0 && parse_cache_level(cache_name, (enum cache_level*) data))
326 printk(KERN_INFO "Cluster '%s' is unknown.\n", cache_name);
327
328 return len;
329}
330
331struct proc_dir_entry* create_cluster_file(struct proc_dir_entry* parent,
332 enum cache_level* level)
333{
334 struct proc_dir_entry* cluster_file;
335
336 cluster_file = create_proc_entry("cluster", 0644, parent);
337 if (!cluster_file) {
338 printk(KERN_ERR "Could not allocate %s/cluster "
339 "procfs entry.\n", parent->name);
340 } else {
341 cluster_file->read_proc = proc_read_cluster_size;
342 cluster_file->write_proc = proc_write_cluster_size;
343 cluster_file->data = level;
344 }
345 return cluster_file;
346}
347
diff --git a/litmus/locking.c b/litmus/locking.c
new file mode 100644
index 00000000000..1d46d148e9e
--- /dev/null
+++ b/litmus/locking.c
@@ -0,0 +1,236 @@
1#include <linux/sched.h>
2#include <litmus/litmus.h>
3#include <litmus/fdso.h>
4
5#ifdef CONFIG_LITMUS_LOCKING
6
7#include <linux/sched.h>
8#include <litmus/litmus.h>
9#include <litmus/sched_plugin.h>
10#include <litmus/trace.h>
11#include <litmus/wait.h>
12
13static int create_generic_lock(void** obj_ref, obj_type_t type, void* __user arg);
14static int open_generic_lock(struct od_table_entry* entry, void* __user arg);
15static int close_generic_lock(struct od_table_entry* entry);
16static void destroy_generic_lock(obj_type_t type, void* sem);
17
18struct fdso_ops generic_lock_ops = {
19 .create = create_generic_lock,
20 .open = open_generic_lock,
21 .close = close_generic_lock,
22 .destroy = destroy_generic_lock
23};
24
25static int create_generic_lock(void** obj_ref, obj_type_t type, void* __user arg)
26{
27 struct litmus_lock* lock;
28 int err;
29
30 err = litmus->allocate_lock(&lock, type, arg);
31 if (err == 0)
32 *obj_ref = lock;
33 return err;
34}
35
36static int open_generic_lock(struct od_table_entry* entry, void* __user arg)
37{
38 struct litmus_lock* lock = get_lock(entry);
39 if (lock->ops->open)
40 return lock->ops->open(lock, arg);
41 else
42 return 0; /* default: any task can open it */
43}
44
45static int close_generic_lock(struct od_table_entry* entry)
46{
47 struct litmus_lock* lock = get_lock(entry);
48 if (lock->ops->close)
49 return lock->ops->close(lock);
50 else
51 return 0; /* default: closing succeeds */
52}
53
54static void destroy_generic_lock(obj_type_t type, void* obj)
55{
56 struct litmus_lock* lock = (struct litmus_lock*) obj;
57 lock->ops->deallocate(lock);
58}
59
60asmlinkage long sys_dynamic_group_lock(resource_mask_t lock_ods)
61{
62 long err = -EINVAL;
63 struct od_table_entry* entry;
64 struct litmus_lock* l;
65
66 TS_LOCK_START;
67
68 entry = get_entry_for_od(ffs(lock_ods)-1);
69 if (entry && is_lock(entry)) {
70 l = get_lock(entry);
71 if (l->type == DGL_SEM){
72 err = l->ops->dynamic_group_lock(l, lock_ods);
73 }else{
74 TRACE("Attempted to DG-lock type: %d\n", l->type);
75 }
76 } else {
77 TRACE_CUR("Attempted to lock invalid entry 0x%p\n", entry);
78 }
79
80 TS_LOCK_END;
81
82 return err;
83}
84
85asmlinkage long sys_dynamic_group_unlock(resource_mask_t lock_ods)
86{
87 long err = -EINVAL;
88 struct od_table_entry* entry;
89 struct litmus_lock* l;
90
91 TS_UNLOCK_START;
92
93 entry = get_entry_for_od(ffs(lock_ods)-1);
94 if (entry && is_lock(entry)) {
95 l = get_lock(entry);
96 if (l->type == DGL_SEM){
97 err = l->ops->dynamic_group_unlock(l, lock_ods);
98 } else{
99 TRACE_CUR("Attempted to DG-unlock type: %d\n", l->type);
100 }
101 } else {
102 TRACE_CUR("Attempted to unlock invalid entry 0x%p\n", entry);
103 }
104
105 TS_UNLOCK_END;
106
107 return err;
108}
109
110asmlinkage long sys_litmus_lock(int lock_od)
111{
112 long err = -EINVAL;
113 struct od_table_entry* entry;
114 struct litmus_lock* l;
115
116 TS_LOCK_START;
117
118 entry = get_entry_for_od(lock_od);
119 if (entry && is_lock(entry)) {
120 l = get_lock(entry);
121 TRACE_CUR("attempts to lock 0x%p\n", l);
122 err = l->ops->lock(l);
123 }
124
125 /* Note: task may have been suspended or preempted in between! Take
126 * this into account when computing overheads. */
127 TS_LOCK_END;
128
129 TS_SYSCALL_OUT_START;
130
131 return err;
132}
133
134asmlinkage long sys_litmus_unlock(int lock_od)
135{
136 long err = -EINVAL;
137 struct od_table_entry* entry;
138 struct litmus_lock* l;
139
140 TS_SYSCALL_IN_START;
141
142 TS_SYSCALL_IN_END;
143
144 TS_UNLOCK_START;
145
146 entry = get_entry_for_od(lock_od);
147 if (entry && is_lock(entry)) {
148 l = get_lock(entry);
149 TRACE_CUR("attempts to unlock 0x%p\n", l);
150 err = l->ops->unlock(l);
151 }
152
153 /* Note: task may have been preempted in between! Take this into
154 * account when computing overheads. */
155 TS_UNLOCK_END;
156
157 TS_SYSCALL_OUT_START;
158
159 return err;
160}
161
162struct task_struct* __waitqueue_remove_first(wait_queue_head_t *wq)
163{
164 wait_queue_t* q;
165 struct task_struct* t = NULL;
166
167 if (waitqueue_active(wq)) {
168 q = list_entry(wq->task_list.next,
169 wait_queue_t, task_list);
170 t = (struct task_struct*) q->private;
171 __remove_wait_queue(wq, q);
172 }
173 return(t);
174}
175
176struct task_struct* __waitqueue_peek_first(wait_queue_head_t *wq)
177{
178 wait_queue_t* q;
179 struct task_struct* t = NULL;
180
181 if (waitqueue_active(wq)) {
182 q = list_entry(wq->task_list.next,
183 wait_queue_t, task_list);
184 t = (struct task_struct*) q->private;
185 }
186 return(t);
187}
188
189unsigned int __add_wait_queue_prio_exclusive(
190 wait_queue_head_t* head,
191 prio_wait_queue_t *new)
192{
193 struct list_head *pos;
194 unsigned int passed = 0;
195
196 new->wq.flags |= WQ_FLAG_EXCLUSIVE;
197
198 /* find a spot where the new entry is less than the next */
199 list_for_each(pos, &head->task_list) {
200 prio_wait_queue_t* queued = list_entry(pos, prio_wait_queue_t,
201 wq.task_list);
202
203 if (unlikely(lt_before(new->priority, queued->priority) ||
204 (new->priority == queued->priority &&
205 new->tie_breaker < queued->tie_breaker))) {
206 /* new is less than the entry at pos, thus insert before it */
207 __list_add(&new->wq.task_list, pos->prev, pos);
208 goto out;
209 }
210 passed++;
211 }
212
213 /* if we get to this point, either the list is empty or every
214 * queued element is less than new.
215 * Let's add new to the end. */
216 list_add_tail(&new->wq.task_list, &head->task_list);
217out:
218 return passed;
219}
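The insertion rule above keeps the wait queue sorted by (priority, tie_breaker): the new entry goes in front of the first queued entry that it is strictly smaller than, and the return value counts how many entries it skipped. A user-space sketch of the same ordering over a plain array (types and values are illustrative only):

#include <stdio.h>

struct ent { unsigned long long priority; unsigned int tie_breaker; };

/* non-zero if a orders strictly before b, as in the list walk above */
static int before(const struct ent *a, const struct ent *b)
{
	return a->priority < b->priority ||
	       (a->priority == b->priority && a->tie_breaker < b->tie_breaker);
}

/* insert *new_ent into the sorted array q[0..n-1]; returns how many
 * entries were passed over, mirroring the function's return value */
static unsigned int insert_sorted(struct ent *q, unsigned int n,
				  const struct ent *new_ent)
{
	unsigned int i, j;

	for (i = 0; i < n && !before(new_ent, &q[i]); i++)
		;                                 /* skip smaller entries */
	for (j = n; j > i; j--)
		q[j] = q[j - 1];                  /* shift to make room */
	q[i] = *new_ent;
	return i;
}

int main(void)
{
	struct ent q[4] = { { 10, 0 }, { 20, 1 } };
	struct ent e = { 20, 0 };

	printf("passed %u entries\n", insert_sorted(q, 2, &e)); /* passed 1 */
	return 0;
}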
220
221
222#else
223
224struct fdso_ops generic_lock_ops = {};
225
226asmlinkage long sys_litmus_lock(int sem_od)
227{
228 return -ENOSYS;
229}
230
231asmlinkage long sys_litmus_unlock(int sem_od)
232{
233 return -ENOSYS;
234}
235
236#endif
diff --git a/litmus/preempt.c b/litmus/preempt.c
new file mode 100644
index 00000000000..a202d70c627
--- /dev/null
+++ b/litmus/preempt.c
@@ -0,0 +1,137 @@
1#include <linux/sched.h>
2
3#include <litmus/litmus.h>
4#include <litmus/preempt.h>
5#include <litmus/trace.h>
6
7/* The rescheduling state of each processor.
8 */
9DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, resched_state);
10
11void sched_state_will_schedule(struct task_struct* tsk)
12{
13 /* Litmus hack: we only care about processor-local invocations of
14 * set_tsk_need_resched(). We can't reliably set the flag remotely
15 * since it might race with other updates to the scheduling state. We
16 * can't rely on the runqueue lock protecting updates to the sched
17 * state since processors do not acquire the runqueue locks for all
18 * updates to the sched state (to avoid acquiring two runqueue locks at
19 * the same time). Further, if tsk is residing on a remote processor,
20 * then that processor doesn't actually know yet that it is going to
21 * reschedule; it still must receive an IPI (unless a local invocation
22 * races).
23 */
24 if (likely(task_cpu(tsk) == smp_processor_id())) {
25 VERIFY_SCHED_STATE(TASK_SCHEDULED | SHOULD_SCHEDULE | TASK_PICKED | WILL_SCHEDULE);
26 if (is_in_sched_state(TASK_PICKED | PICKED_WRONG_TASK))
27 set_sched_state(PICKED_WRONG_TASK);
28 else
29 set_sched_state(WILL_SCHEDULE);
30 } /* else */
31 /* /\* Litmus tasks should never be subject to a remote */
32 /* * set_tsk_need_resched(). *\/ */
33 /* BUG_ON(is_realtime(tsk)); */
34#ifdef CONFIG_PREEMPT_STATE_TRACE
35 TRACE_TASK(tsk, "set_tsk_need_resched() ret:%p\n",
36 __builtin_return_address(0));
37#endif
38}
39
40/* Called by the IPI handler after another CPU called smp_send_resched(). */
41void sched_state_ipi(void)
42{
43 /* If the IPI was slow, we might be in any state right now. The IPI is
44 * only meaningful if we are in SHOULD_SCHEDULE. */
45 if (is_in_sched_state(SHOULD_SCHEDULE)) {
46 /* Cause scheduler to be invoked.
47 * This will cause a transition to WILL_SCHEDULE. */
48 set_tsk_need_resched(current);
49 TRACE_STATE("IPI -> set_tsk_need_resched(%s/%d)\n",
50 current->comm, current->pid);
51 TS_SEND_RESCHED_END;
52 } else {
53 /* ignore */
54 TRACE_STATE("ignoring IPI in state %x (%s)\n",
55 get_sched_state(),
56 sched_state_name(get_sched_state()));
57 }
58}
59
60/* Called by plugins to cause a CPU to reschedule. IMPORTANT: the caller must
61 * hold the lock that is used to serialize scheduling decisions. */
62void litmus_reschedule(int cpu)
63{
64 int picked_transition_ok = 0;
65 int scheduled_transition_ok = 0;
66
67 /* The (remote) CPU could be in any state. */
68
69 /* The critical states are TASK_PICKED and TASK_SCHEDULED, as the CPU
70 * is not aware of the need to reschedule at this point. */
71
72 /* is a context switch in progress? */
73 if (cpu_is_in_sched_state(cpu, TASK_PICKED))
74 picked_transition_ok = sched_state_transition_on(
75 cpu, TASK_PICKED, PICKED_WRONG_TASK);
76
77 if (!picked_transition_ok &&
78 cpu_is_in_sched_state(cpu, TASK_SCHEDULED)) {
79 /* We either raced with the end of the context switch, or the
80 * CPU was in TASK_SCHEDULED anyway. */
81 scheduled_transition_ok = sched_state_transition_on(
82 cpu, TASK_SCHEDULED, SHOULD_SCHEDULE);
83 }
84
85 /* If the CPU was in state TASK_SCHEDULED, then we need to cause the
86 * scheduler to be invoked. */
87 if (scheduled_transition_ok) {
88 if (smp_processor_id() == cpu)
89 set_tsk_need_resched(current);
90 else {
91 TS_SEND_RESCHED_START(cpu);
92 smp_send_reschedule(cpu);
93 }
94 }
95
96 TRACE_STATE("%s picked-ok:%d sched-ok:%d\n",
97 __FUNCTION__,
98 picked_transition_ok,
99 scheduled_transition_ok);
100}
101
102void litmus_reschedule_local(void)
103{
104 if (is_in_sched_state(TASK_PICKED))
105 set_sched_state(PICKED_WRONG_TASK);
106 else if (is_in_sched_state(TASK_SCHEDULED | SHOULD_SCHEDULE)) {
107 set_sched_state(WILL_SCHEDULE);
108 set_tsk_need_resched(current);
109 }
110}
111
112#ifdef CONFIG_DEBUG_KERNEL
113
114void sched_state_plugin_check(void)
115{
116 if (!is_in_sched_state(TASK_PICKED | PICKED_WRONG_TASK)) {
117 TRACE("!!!! plugin did not call sched_state_task_picked()! "
118 "Calling sched_state_task_picked() is mandatory---fix this.\n");
119 set_sched_state(TASK_PICKED);
120 }
121}
122
123#define NAME_CHECK(x) case x: return #x
124const char* sched_state_name(int s)
125{
126 switch (s) {
127 NAME_CHECK(TASK_SCHEDULED);
128 NAME_CHECK(SHOULD_SCHEDULE);
129 NAME_CHECK(WILL_SCHEDULE);
130 NAME_CHECK(TASK_PICKED);
131 NAME_CHECK(PICKED_WRONG_TASK);
132 default:
133 return "UNKNOWN";
134 };
135}
136
137#endif
diff --git a/litmus/rt_domain.c b/litmus/rt_domain.c
new file mode 100644
index 00000000000..1683d384756
--- /dev/null
+++ b/litmus/rt_domain.c
@@ -0,0 +1,349 @@
1/*
2 * litmus/rt_domain.c
3 *
4 * LITMUS real-time infrastructure. This file contains the
5 * functions that manipulate RT domains. RT domains are an abstraction
6 * of a ready queue and a release queue.
7 */
8
9#include <linux/percpu.h>
10#include <linux/sched.h>
11#include <linux/list.h>
12#include <linux/slab.h>
13
14#include <litmus/litmus.h>
15#include <litmus/sched_plugin.h>
16#include <litmus/sched_trace.h>
17
18#include <litmus/rt_domain.h>
19
20#include <litmus/trace.h>
21
22#include <litmus/bheap.h>
23
24/* Uncomment when debugging timer races... */
25#if 0
26#define VTRACE_TASK TRACE_TASK
27#define VTRACE TRACE
28#else
29#define VTRACE_TASK(t, fmt, args...) /* shut up */
30#define VTRACE(fmt, args...) /* be quiet already */
31#endif
32
33static int dummy_resched(rt_domain_t *rt)
34{
35 return 0;
36}
37
38static int dummy_order(struct bheap_node* a, struct bheap_node* b)
39{
40 return 0;
41}
42
43/* default implementation: use default lock */
44static void default_release_jobs(rt_domain_t* rt, struct bheap* tasks)
45{
46 merge_ready(rt, tasks);
47}
48
49static unsigned int time2slot(lt_t time)
50{
51 return (unsigned int) time2quanta(time, FLOOR) % RELEASE_QUEUE_SLOTS;
52}
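time2slot() hashes an absolute release time into one of RELEASE_QUEUE_SLOTS buckets so that jobs sharing a release instant land in the same release_heap. A user-space illustration under assumed values (a 1 ms quantum and 127 slots; the real constants come from time2quanta() and the kernel configuration):

#include <stdio.h>

#define QUANTUM_NS          1000000ULL   /* assumed: 1 ms quantum */
#define RELEASE_QUEUE_SLOTS 127          /* assumed slot count */

static unsigned int time2slot(unsigned long long time_ns)
{
	/* FLOOR quantization, then hash into the slot table */
	return (unsigned int)((time_ns / QUANTUM_NS) % RELEASE_QUEUE_SLOTS);
}

int main(void)
{
	/* two tasks released at exactly the same instant share a slot
	 * (and hence a release_heap); a later release maps elsewhere */
	unsigned long long t0 = 500 * QUANTUM_NS;
	unsigned long long t1 = 512 * QUANTUM_NS;

	printf("slot(t0)=%u slot(t0)=%u slot(t1)=%u\n",
	       time2slot(t0), time2slot(t0), time2slot(t1));
	return 0;
}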
53
54static enum hrtimer_restart on_release_timer(struct hrtimer *timer)
55{
56 unsigned long flags;
57 struct release_heap* rh;
58 rh = container_of(timer, struct release_heap, timer);
59
60 TS_RELEASE_LATENCY(rh->release_time);
61
62 VTRACE("on_release_timer(0x%p) starts.\n", timer);
63
64 TS_RELEASE_START;
65
66
67 raw_spin_lock_irqsave(&rh->dom->release_lock, flags);
68 VTRACE("CB has the release_lock 0x%p\n", &rh->dom->release_lock);
69 /* remove from release queue */
70 list_del(&rh->list);
71 raw_spin_unlock_irqrestore(&rh->dom->release_lock, flags);
72 VTRACE("CB returned release_lock 0x%p\n", &rh->dom->release_lock);
73
74 /* call release callback */
75 rh->dom->release_jobs(rh->dom, &rh->heap);
76 /* WARNING: rh can be referenced from other CPUs from now on. */
77
78 TS_RELEASE_END;
79
80 VTRACE("on_release_timer(0x%p) ends.\n", timer);
81
82 return HRTIMER_NORESTART;
83}
84
85/* allocated in litmus.c */
86struct kmem_cache * release_heap_cache;
87
88struct release_heap* release_heap_alloc(int gfp_flags)
89{
90 struct release_heap* rh;
91 rh= kmem_cache_alloc(release_heap_cache, gfp_flags);
92 if (rh) {
93 /* initialize timer */
94 hrtimer_init(&rh->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
95 rh->timer.function = on_release_timer;
96 }
97 return rh;
98}
99
100void release_heap_free(struct release_heap* rh)
101{
102 /* make sure timer is no longer in use */
103 hrtimer_cancel(&rh->timer);
104 kmem_cache_free(release_heap_cache, rh);
105}
106
107/* Caller must hold release lock.
108 * Will return heap for given time. If no such heap exists prior to
109 * the invocation it will be created.
110 */
111static struct release_heap* get_release_heap(rt_domain_t *rt,
112 struct task_struct* t,
113 int use_task_heap)
114{
115 struct list_head* pos;
116 struct release_heap* heap = NULL;
117 struct release_heap* rh;
118 lt_t release_time = get_release(t);
119 unsigned int slot = time2slot(release_time);
120
121 /* initialize pos for the case that the list is empty */
122 pos = rt->release_queue.slot[slot].next;
123 list_for_each(pos, &rt->release_queue.slot[slot]) {
124 rh = list_entry(pos, struct release_heap, list);
125 if (release_time == rh->release_time) {
126 /* perfect match -- this happens on hyperperiod
127 * boundaries
128 */
129 heap = rh;
130 break;
131 } else if (lt_before(release_time, rh->release_time)) {
132 /* we need to insert a new node since rh is
133 * already in the future
134 */
135 break;
136 }
137 }
138 if (!heap && use_task_heap) {
139 /* use pre-allocated release heap */
140 rh = tsk_rt(t)->rel_heap;
141
142 rh->dom = rt;
143 rh->release_time = release_time;
144
145 /* add to release queue */
146 list_add(&rh->list, pos->prev);
147 heap = rh;
148 }
149 return heap;
150}
151
152static void reinit_release_heap(struct task_struct* t)
153{
154 struct release_heap* rh;
155
156 /* use pre-allocated release heap */
157 rh = tsk_rt(t)->rel_heap;
158
159 /* Make sure it is safe to use. The timer callback could still
160 * be executing on another CPU; hrtimer_cancel() will wait
161 * until the timer callback has completed. However, under no
162 * circumstances should the timer be active (= yet to be
163 * triggered).
164 *
165 * WARNING: If the CPU still holds the release_lock at this point,
166 * deadlock may occur!
167 */
168 BUG_ON(hrtimer_cancel(&rh->timer));
169
170 /* initialize */
171 bheap_init(&rh->heap);
172#ifdef CONFIG_RELEASE_MASTER
173 atomic_set(&rh->info.state, HRTIMER_START_ON_INACTIVE);
174#endif
175}
176/* arm_release_timer() - start local release timer or trigger
177 * remote timer (pull timer)
178 *
179 * Called by add_release() with:
180 * - tobe_lock taken
181 * - IRQ disabled
182 */
183#ifdef CONFIG_RELEASE_MASTER
184#define arm_release_timer(t) arm_release_timer_on((t), NO_CPU)
185static void arm_release_timer_on(rt_domain_t *_rt , int target_cpu)
186#else
187static void arm_release_timer(rt_domain_t *_rt)
188#endif
189{
190 rt_domain_t *rt = _rt;
191 struct list_head list;
192 struct list_head *pos, *safe;
193 struct task_struct* t;
194 struct release_heap* rh;
195
196 VTRACE("arm_release_timer() at %llu\n", litmus_clock());
197 list_replace_init(&rt->tobe_released, &list);
198
199 list_for_each_safe(pos, safe, &list) {
200 /* pick task off work list */
201 t = list_entry(pos, struct task_struct, rt_param.list);
202 sched_trace_task_release(t);
203 list_del(pos);
204
205 /* put into release heap while holding release_lock */
206 raw_spin_lock(&rt->release_lock);
207 VTRACE_TASK(t, "I have the release_lock 0x%p\n", &rt->release_lock);
208
209 rh = get_release_heap(rt, t, 0);
210 if (!rh) {
211 /* need to use our own, but drop lock first */
212 raw_spin_unlock(&rt->release_lock);
213 VTRACE_TASK(t, "Dropped release_lock 0x%p\n",
214 &rt->release_lock);
215
216 reinit_release_heap(t);
217 VTRACE_TASK(t, "release_heap ready\n");
218
219 raw_spin_lock(&rt->release_lock);
220 VTRACE_TASK(t, "Re-acquired release_lock 0x%p\n",
221 &rt->release_lock);
222
223 rh = get_release_heap(rt, t, 1);
224 }
225 bheap_insert(rt->order, &rh->heap, tsk_rt(t)->heap_node);
226 VTRACE_TASK(t, "arm_release_timer(): added to release heap\n");
227
228 raw_spin_unlock(&rt->release_lock);
229 VTRACE_TASK(t, "Returned the release_lock 0x%p\n", &rt->release_lock);
230
231 /* To avoid arming the timer multiple times, we only let the
232 * owner do the arming (which is the "first" task to reference
233 * this release_heap anyway).
234 */
235 if (rh == tsk_rt(t)->rel_heap) {
236 VTRACE_TASK(t, "arming timer 0x%p\n", &rh->timer);
237 /* we cannot arm the timer using hrtimer_start()
238 * as it may deadlock on rq->lock
239 *
240 * PINNED mode is ok on both local and remote CPU
241 */
242#ifdef CONFIG_RELEASE_MASTER
243 if (rt->release_master == NO_CPU &&
244 target_cpu == NO_CPU)
245#endif
246 __hrtimer_start_range_ns(&rh->timer,
247 ns_to_ktime(rh->release_time),
248 0, HRTIMER_MODE_ABS_PINNED, 0);
249#ifdef CONFIG_RELEASE_MASTER
250 else
251 hrtimer_start_on(
252 /* target_cpu overrides release master */
253 (target_cpu != NO_CPU ?
254 target_cpu : rt->release_master),
255 &rh->info, &rh->timer,
256 ns_to_ktime(rh->release_time),
257 HRTIMER_MODE_ABS_PINNED);
258#endif
259 } else
260 VTRACE_TASK(t, "0x%p is not my timer\n", &rh->timer);
261 }
262}
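/*
 * The lookup-retry pattern in arm_release_timer() above (look for an existing
 * release heap under release_lock; if none matches, drop the lock, reinit the
 * task's pre-allocated heap, re-acquire the lock, and retry with
 * use_task_heap set) is easier to see in isolation. Below is a minimal,
 * standalone user-space sketch of the same idea, assuming a plain linked list
 * of "release buckets" protected by a pthread mutex in place of the release
 * heaps and the raw spinlock. It is an illustrative model, not kernel code.
 */
#include <pthread.h>
#include <stdio.h>

struct bucket {
	unsigned long long release_time;
	struct bucket *next;
};

static struct bucket *buckets;
static pthread_mutex_t bucket_lock = PTHREAD_MUTEX_INITIALIZER;

/* look for an existing bucket; insert @own if non-NULL and nothing matches */
static struct bucket *get_bucket(unsigned long long t, struct bucket *own)
{
	struct bucket *b;

	for (b = buckets; b; b = b->next)
		if (b->release_time == t)
			return b;
	if (own) {
		own->release_time = t;
		own->next = buckets;
		buckets = own;
		return own;
	}
	return NULL;
}

/* mirrors the two-pass logic of arm_release_timer()/get_release_heap() */
static struct bucket *find_or_add_bucket(unsigned long long t,
					 struct bucket *preallocated)
{
	struct bucket *b;

	pthread_mutex_lock(&bucket_lock);
	b = get_bucket(t, NULL);	/* first pass: reuse an existing bucket only */
	if (!b) {
		pthread_mutex_unlock(&bucket_lock);
		/* "reinit" our pre-allocated bucket while the lock is dropped */
		preallocated->release_time = 0;
		preallocated->next = NULL;
		pthread_mutex_lock(&bucket_lock);
		/* second pass: someone may have inserted one in the meantime */
		b = get_bucket(t, preallocated);
	}
	pthread_mutex_unlock(&bucket_lock);
	return b;
}

int main(void)
{
	struct bucket own = { 0, NULL };
	struct bucket *b = find_or_add_bucket(100, &own);

	/* only the owner of the winning bucket would go on to arm the timer */
	printf("used own bucket: %s\n", b == &own ? "yes" : "no");
	return 0;
}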
263
264void rt_domain_init(rt_domain_t *rt,
265 bheap_prio_t order,
266 check_resched_needed_t check,
267 release_jobs_t release
268 )
269{
270 int i;
271
272 BUG_ON(!rt);
273 if (!check)
274 check = dummy_resched;
275 if (!release)
276 release = default_release_jobs;
277 if (!order)
278 order = dummy_order;
279
280#ifdef CONFIG_RELEASE_MASTER
281 rt->release_master = NO_CPU;
282#endif
283
284 bheap_init(&rt->ready_queue);
285 INIT_LIST_HEAD(&rt->tobe_released);
286 for (i = 0; i < RELEASE_QUEUE_SLOTS; i++)
287 INIT_LIST_HEAD(&rt->release_queue.slot[i]);
288
289 raw_spin_lock_init(&rt->ready_lock);
290 raw_spin_lock_init(&rt->release_lock);
291 raw_spin_lock_init(&rt->tobe_lock);
292
293 rt->check_resched = check;
294 rt->release_jobs = release;
295 rt->order = order;
296}
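/*
 * rt_domain_init() substitutes dummy callbacks for NULL arguments so that hot
 * paths such as __add_ready() below can invoke rt->check_resched(rt) without
 * a NULL check. A minimal standalone user-space sketch of that defaulting
 * pattern, assuming simplified stand-in types (illustrative model only, not
 * kernel code):
 */
#include <stdio.h>

struct toy_domain;
typedef void (*check_resched_fn)(struct toy_domain *dom);

struct toy_domain {
	const char *name;
	check_resched_fn check_resched;
};

static void dummy_resched(struct toy_domain *dom)
{
	(void)dom;	/* default: nothing to do */
}

static void toy_domain_init(struct toy_domain *dom, const char *name,
			    check_resched_fn check)
{
	dom->name = name;
	dom->check_resched = check ? check : dummy_resched;
}

static void my_check(struct toy_domain *dom)
{
	printf("%s: would trigger a reschedule here\n", dom->name);
}

int main(void)
{
	struct toy_domain with_cb, without_cb;

	toy_domain_init(&with_cb, "cluster0", my_check);
	toy_domain_init(&without_cb, "cluster1", NULL);

	/* both calls are safe; no NULL check is needed at the call site */
	with_cb.check_resched(&with_cb);
	without_cb.check_resched(&without_cb);
	return 0;
}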
297
298/* add_ready - add a real-time task to the rt ready queue. It must be runnable.
299 * @new: the newly released task
300 */
301void __add_ready(rt_domain_t* rt, struct task_struct *new)
302{
303 TRACE("rt: adding %s/%d (%llu, %llu, %llu) rel=%llu "
304 "to ready queue at %llu\n",
305 new->comm, new->pid,
306 get_exec_cost(new), get_rt_period(new), get_rt_relative_deadline(new),
307 get_release(new), litmus_clock());
308
309 BUG_ON(bheap_node_in_heap(tsk_rt(new)->heap_node));
310
311 bheap_insert(rt->order, &rt->ready_queue, tsk_rt(new)->heap_node);
312 rt->check_resched(rt);
313}
314
315/* merge_ready - Add a sorted set of tasks to the rt ready queue. They must be runnable.
316 * @tasks - the newly released tasks
317 */
318void __merge_ready(rt_domain_t* rt, struct bheap* tasks)
319{
320 bheap_union(rt->order, &rt->ready_queue, tasks);
321 rt->check_resched(rt);
322}
323
324
325#ifdef CONFIG_RELEASE_MASTER
326void __add_release_on(rt_domain_t* rt, struct task_struct *task,
327 int target_cpu)
328{
329 TRACE_TASK(task, "add_release_on(), rel=%llu, target=%d\n",
330 get_release(task), target_cpu);
331 list_add(&tsk_rt(task)->list, &rt->tobe_released);
332 task->rt_param.domain = rt;
333
334 arm_release_timer_on(rt, target_cpu);
335}
336#endif
337
338/* add_release - add a real-time task to the rt release queue.
339 * @task: the sleeping task
340 */
341void __add_release(rt_domain_t* rt, struct task_struct *task)
342{
343 TRACE_TASK(task, "add_release(), rel=%llu\n", get_release(task));
344 list_add(&tsk_rt(task)->list, &rt->tobe_released);
345 task->rt_param.domain = rt;
346
347 arm_release_timer(rt);
348}
349
diff --git a/litmus/sched_cedf.c b/litmus/sched_cedf.c
new file mode 100644
index 00000000000..6e1327bbf50
--- /dev/null
+++ b/litmus/sched_cedf.c
@@ -0,0 +1,857 @@
1/*
2 * litmus/sched_cedf.c
3 *
4 * Implementation of the C-EDF scheduling algorithm.
5 *
6 * This implementation is based on G-EDF:
7 * - CPUs are clustered around L2 or L3 caches.
8 * - Cluster topology is detected automatically (this is arch-dependent
9 * and currently works only on x86 --- and only with modern
10 * CPUs that export cpuid4 information)
11 * - The plugin _does not_ attempt to put tasks in the right cluster, i.e.,
12 * the programmer needs to be aware of the topology to place tasks
13 * in the desired cluster
14 * - default clustering is around L2 cache (cache index = 2)
15 * supported clusters are: L1 (private cache: pedf), L2, L3, ALL (all
16 * online_cpus are placed in a single cluster).
17 *
18 * For details on functions, take a look at sched_gsn_edf.c
19 *
20 * Currently, we do not support changes in the number of online cpus.
21 * If the num_online_cpus() dynamically changes, the plugin is broken.
22 *
23 * This version uses the simple approach and serializes all scheduling
24 * decisions by the use of a queue lock. This is probably not the
25 * best way to do it, but it should suffice for now.
26 */
27
28#include <linux/spinlock.h>
29#include <linux/percpu.h>
30#include <linux/sched.h>
31#include <linux/slab.h>
32
33#include <linux/module.h>
34
35#include <litmus/litmus.h>
36#include <litmus/jobs.h>
37#include <litmus/preempt.h>
38#include <litmus/budget.h>
39#include <litmus/sched_plugin.h>
40#include <litmus/edf_common.h>
41#include <litmus/sched_trace.h>
42
43#include <litmus/clustered.h>
44
45#include <litmus/bheap.h>
46
47#ifdef CONFIG_SCHED_CPU_AFFINITY
48#include <litmus/affinity.h>
49#endif
50
51/* to configure the cluster size */
52#include <litmus/litmus_proc.h>
53#include <linux/uaccess.h>
54
55/* Reference configuration variable. Determines which cache level is used to
56 * group CPUs into clusters. GLOBAL_CLUSTER, which is the default, means that
57 * all CPUs form a single cluster (just like GSN-EDF).
58 */
59static enum cache_level cluster_config = GLOBAL_CLUSTER;
60
61struct clusterdomain;
62
63/* cpu_entry_t - maintain the linked and scheduled state
64 *
65 * A cpu also contains a pointer to the cedf_domain_t cluster
66 * that owns it (struct clusterdomain*)
67 */
68typedef struct {
69 int cpu;
70 struct clusterdomain* cluster; /* owning cluster */
71 struct task_struct* linked; /* only RT tasks */
72 struct task_struct* scheduled; /* only RT tasks */
73 atomic_t will_schedule; /* prevent unneeded IPIs */
74 struct bheap_node* hn;
75} cpu_entry_t;
76
77/* one cpu_entry_t per CPU */
78DEFINE_PER_CPU(cpu_entry_t, cedf_cpu_entries);
79
80#define set_will_schedule() \
81 (atomic_set(&__get_cpu_var(cedf_cpu_entries).will_schedule, 1))
82#define clear_will_schedule() \
83 (atomic_set(&__get_cpu_var(cedf_cpu_entries).will_schedule, 0))
84#define test_will_schedule(cpu) \
85 (atomic_read(&per_cpu(cedf_cpu_entries, cpu).will_schedule))
86
87/*
88 * In C-EDF there is a cedf domain _per_ cluster
89 * The number of clusters is dynamically determined according to the
90 * total cpu number and the cluster size
91 */
92typedef struct clusterdomain {
93 /* rt_domain for this cluster */
94 rt_domain_t domain;
95 /* cpus in this cluster */
96 cpu_entry_t* *cpus;
97 /* map of this cluster's cpus */
98 cpumask_var_t cpu_map;
99 /* the cpus queue themselves according to priority in here */
100 struct bheap_node *heap_node;
101 struct bheap cpu_heap;
102 /* lock for this cluster */
103#define cluster_lock domain.ready_lock
104} cedf_domain_t;
105
106/* a cedf_domain per cluster; allocation is done at init/activation time */
107cedf_domain_t *cedf;
108
109#define remote_cluster(cpu) ((cedf_domain_t *) per_cpu(cedf_cpu_entries, cpu).cluster)
110#define task_cpu_cluster(task) remote_cluster(get_partition(task))
111
112/* Uncomment WANT_ALL_SCHED_EVENTS if you want to see all scheduling
113 * decisions in the TRACE() log; uncomment VERBOSE_INIT for verbose
114 * information during the initialization of the plugin (e.g., topology)
115#define WANT_ALL_SCHED_EVENTS
116 */
117#define VERBOSE_INIT
118
119static int cpu_lower_prio(struct bheap_node *_a, struct bheap_node *_b)
120{
121 cpu_entry_t *a, *b;
122 a = _a->value;
123 b = _b->value;
124 /* Note that a and b are inverted: we want the lowest-priority CPU at
125 * the top of the heap.
126 */
127 return edf_higher_prio(b->linked, a->linked);
128}
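/*
 * cpu_lower_prio() deliberately swaps its arguments when calling
 * edf_higher_prio(), so that the *lowest*-priority CPU ends up at the top of
 * the heap and lowest_prio_cpu() reduces to a peek. A minimal standalone
 * user-space sketch of the same inversion, using qsort() as a stand-in for
 * the heap and integer absolute deadlines as a stand-in for full EDF
 * priorities (illustrative model only, not kernel code):
 */
#include <stdio.h>
#include <stdlib.h>

/* "higher priority" under EDF: earlier absolute deadline */
static int edf_higher(long long a, long long b)
{
	return a < b;
}

/* inverted comparator: the latest deadline (lowest priority) sorts first */
static int cpu_lower_prio_cmp(const void *_a, const void *_b)
{
	long long a = *(const long long *)_a;
	long long b = *(const long long *)_b;

	if (edf_higher(b, a))
		return -1;	/* b has higher prio => a (lower prio) sorts earlier */
	if (edf_higher(a, b))
		return 1;
	return 0;
}

int main(void)
{
	/* deadline of the job currently linked to each CPU */
	long long linked_deadline[4] = { 30, 10, 50, 20 };

	qsort(linked_deadline, 4, sizeof(long long), cpu_lower_prio_cmp);

	/* element 0 plays the role of bheap_peek(): the best preemption target */
	printf("preempt the CPU whose linked job has deadline %lld\n",
	       linked_deadline[0]);
	return 0;
}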
129
130/* update_cpu_position - Move the cpu entry to the correct place to maintain
131 * order in the cpu queue. Caller must hold cedf lock.
132 */
133static void update_cpu_position(cpu_entry_t *entry)
134{
135 cedf_domain_t *cluster = entry->cluster;
136
137 if (likely(bheap_node_in_heap(entry->hn)))
138 bheap_delete(cpu_lower_prio,
139 &cluster->cpu_heap,
140 entry->hn);
141
142 bheap_insert(cpu_lower_prio, &cluster->cpu_heap, entry->hn);
143}
144
145/* caller must hold cedf lock */
146static cpu_entry_t* lowest_prio_cpu(cedf_domain_t *cluster)
147{
148 struct bheap_node* hn;
149 hn = bheap_peek(cpu_lower_prio, &cluster->cpu_heap);
150 return hn->value;
151}
152
153
154/* link_task_to_cpu - Update the link of a CPU.
155 * Handles the case where the to-be-linked task is already
156 * scheduled on a different CPU.
157 */
158static noinline void link_task_to_cpu(struct task_struct* linked,
159 cpu_entry_t *entry)
160{
161 cpu_entry_t *sched;
162 struct task_struct* tmp;
163 int on_cpu;
164
165 BUG_ON(linked && !is_realtime(linked));
166
167 /* Currently linked task is set to be unlinked. */
168 if (entry->linked) {
169 entry->linked->rt_param.linked_on = NO_CPU;
170 }
171
172 /* Link new task to CPU. */
173 if (linked) {
174 tsk_rt(linked)->completed = 0;
175 /* handle the case where the task is already scheduled somewhere! */
176 on_cpu = linked->rt_param.scheduled_on;
177 if (on_cpu != NO_CPU) {
178 sched = &per_cpu(cedf_cpu_entries, on_cpu);
179 /* this should only happen if not linked already */
180 BUG_ON(sched->linked == linked);
181
182 /* If we are already scheduled on the CPU to which we
183 * wanted to link, we don't need to do the swap --
184 * we just link ourselves to the CPU and depend on
185 * the caller to get things right.
186 */
187 if (entry != sched) {
188 TRACE_TASK(linked,
189 "already scheduled on %d, updating link.\n",
190 sched->cpu);
191 tmp = sched->linked;
192 linked->rt_param.linked_on = sched->cpu;
193 sched->linked = linked;
194 update_cpu_position(sched);
195 linked = tmp;
196 }
197 }
198 if (linked) /* might be NULL due to swap */
199 linked->rt_param.linked_on = entry->cpu;
200 }
201 entry->linked = linked;
202#ifdef WANT_ALL_SCHED_EVENTS
203 if (linked)
204 TRACE_TASK(linked, "linked to %d.\n", entry->cpu);
205 else
206 TRACE("NULL linked to %d.\n", entry->cpu);
207#endif
208 update_cpu_position(entry);
209}
210
211/* unlink - Make sure a task is not linked any longer to an entry
212 * where it was linked before. Must hold cedf_lock.
213 */
214static noinline void unlink(struct task_struct* t)
215{
216 cpu_entry_t *entry;
217
218 if (t->rt_param.linked_on != NO_CPU) {
219 /* unlink */
220 entry = &per_cpu(cedf_cpu_entries, t->rt_param.linked_on);
221 t->rt_param.linked_on = NO_CPU;
222 link_task_to_cpu(NULL, entry);
223 } else if (is_queued(t)) {
224 /* This is an interesting situation: t is scheduled,
225 * but was just recently unlinked. It cannot be
226 * linked anywhere else (because then it would have
227 * been relinked to this CPU), thus it must be in some
228 * queue. We must remove it from the list in this
229 * case.
230 *
231 * In the C-EDF case it should be somewhere in the queue of
232 * its domain; we can therefore get the domain using
233 * task_cpu_cluster().
234 */
235 remove(&(task_cpu_cluster(t))->domain, t);
236 }
237}
238
239
240/* preempt - force a CPU to reschedule
241 */
242static void preempt(cpu_entry_t *entry)
243{
244 preempt_if_preemptable(entry->scheduled, entry->cpu);
245}
246
247/* requeue - Put an unlinked task into the c-edf domain.
248 * Caller must hold cedf_lock.
249 */
250static noinline void requeue(struct task_struct* task)
251{
252 cedf_domain_t *cluster = task_cpu_cluster(task);
253 BUG_ON(!task);
254 /* sanity check before insertion */
255 BUG_ON(is_queued(task));
256
257 if (is_early_releasing(task) || is_released(task, litmus_clock()))
258 __add_ready(&cluster->domain, task);
259 else {
260 /* it has got to wait */
261 add_release(&cluster->domain, task);
262 }
263}
264
265#ifdef CONFIG_SCHED_CPU_AFFINITY
266static cpu_entry_t* cedf_get_nearest_available_cpu(
267 cedf_domain_t *cluster, cpu_entry_t *start)
268{
269 cpu_entry_t *affinity;
270
271 get_nearest_available_cpu(affinity, start, cedf_cpu_entries,
272#ifdef CONFIG_RELEASE_MASTER
273 cluster->domain.release_master
274#else
275 NO_CPU
276#endif
277 );
278
279 /* make sure CPU is in our cluster */
280 if (affinity && cpu_isset(affinity->cpu, *cluster->cpu_map))
281 return(affinity);
282 else
283 return(NULL);
284}
285#endif
286
287
288/* check for any necessary preemptions */
289static void check_for_preemptions(cedf_domain_t *cluster)
290{
291 struct task_struct *task;
292 cpu_entry_t *last;
293
294 for(last = lowest_prio_cpu(cluster);
295 edf_preemption_needed(&cluster->domain, last->linked);
296 last = lowest_prio_cpu(cluster)) {
297 /* preemption necessary */
298 task = __take_ready(&cluster->domain);
299 TRACE("check_for_preemptions: attempting to link task %d to %d\n",
300 task->pid, last->cpu);
301#ifdef CONFIG_SCHED_CPU_AFFINITY
302 {
303 cpu_entry_t *affinity =
304 cedf_get_nearest_available_cpu(cluster,
305 &per_cpu(cedf_cpu_entries, task_cpu(task)));
306 if(affinity)
307 last = affinity;
308 else if(requeue_preempted_job(last->linked))
309 requeue(last->linked);
310 }
311#else
312 if (requeue_preempted_job(last->linked))
313 requeue(last->linked);
314#endif
315 link_task_to_cpu(task, last);
316 preempt(last);
317 }
318}
319
320/* cedf_job_arrival: task is either resumed or released */
321static noinline void cedf_job_arrival(struct task_struct* task)
322{
323 cedf_domain_t *cluster = task_cpu_cluster(task);
324 BUG_ON(!task);
325
326 requeue(task);
327 check_for_preemptions(cluster);
328}
329
330static void cedf_release_jobs(rt_domain_t* rt, struct bheap* tasks)
331{
332 cedf_domain_t* cluster = container_of(rt, cedf_domain_t, domain);
333 unsigned long flags;
334
335 raw_spin_lock_irqsave(&cluster->cluster_lock, flags);
336
337 __merge_ready(&cluster->domain, tasks);
338 check_for_preemptions(cluster);
339
340 raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags);
341}
342
343/* caller holds cedf_lock */
344static noinline void job_completion(struct task_struct *t, int forced)
345{
346 BUG_ON(!t);
347
348 sched_trace_task_completion(t, forced);
349
350 TRACE_TASK(t, "job_completion().\n");
351
352 /* set flags */
353 tsk_rt(t)->completed = 1;
354 /* prepare for next period */
355 prepare_for_next_period(t);
356 if (is_early_releasing(t) || is_released(t, litmus_clock()))
357 sched_trace_task_release(t);
358 /* unlink */
359 unlink(t);
360 /* requeue
361 * But don't requeue a blocking task. */
362 if (is_running(t))
363 cedf_job_arrival(t);
364}
365
366/* cedf_tick - this function is called for every local timer
367 * interrupt.
368 *
369 * checks whether the current task has expired and checks
370 * whether we need to preempt it if it has not expired
371 */
372static void cedf_tick(struct task_struct* t)
373{
374 if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) {
375 if (!is_np(t)) {
376 /* np tasks will be preempted when they become
377 * preemptable again
378 */
379 litmus_reschedule_local();
380 set_will_schedule();
381 TRACE("cedf_scheduler_tick: "
382 "%d is preemptable "
383 " => FORCE_RESCHED\n", t->pid);
384 } else if (is_user_np(t)) {
385 TRACE("cedf_scheduler_tick: "
386 "%d is non-preemptable, "
387 "preemption delayed.\n", t->pid);
388 request_exit_np(t);
389 }
390 }
391}
392
393/* Getting schedule() right is a bit tricky. schedule() may not make any
394 * assumptions on the state of the current task since it may be called for a
395 * number of reasons. The reasons include a scheduler_tick() determined that it
396 * was necessary, because sys_exit_np() was called, because some Linux
397 * subsystem determined so, or even (in the worst case) because there is a bug
398 * hidden somewhere. Thus, we must take extreme care to determine what the
399 * current state is.
400 *
401 * The CPU could currently be scheduling a task (or not), be linked (or not).
402 *
403 * The following assertions for the scheduled task could hold:
404 *
405 * - !is_running(scheduled) // the job blocks
406 * - scheduled->timeslice == 0 // the job completed (forcefully)
407 * - is_completed() // the job completed (by syscall)
408 * - linked != scheduled // we need to reschedule (for any reason)
409 * - is_np(scheduled) // rescheduling must be delayed,
410 * sys_exit_np must be requested
411 *
412 * Any of these can occur together.
413 */
414static struct task_struct* cedf_schedule(struct task_struct * prev)
415{
416 cpu_entry_t* entry = &__get_cpu_var(cedf_cpu_entries);
417 cedf_domain_t *cluster = entry->cluster;
418 int out_of_time, sleep, preempt, np, exists, blocks;
419 struct task_struct* next = NULL;
420
421#ifdef CONFIG_RELEASE_MASTER
422 /* Bail out early if we are the release master.
423 * The release master never schedules any real-time tasks.
424 */
425 if (unlikely(cluster->domain.release_master == entry->cpu)) {
426 sched_state_task_picked();
427 return NULL;
428 }
429#endif
430
431 raw_spin_lock(&cluster->cluster_lock);
432 clear_will_schedule();
433
434 /* sanity checking */
435 BUG_ON(entry->scheduled && entry->scheduled != prev);
436 BUG_ON(entry->scheduled && !is_realtime(prev));
437 BUG_ON(is_realtime(prev) && !entry->scheduled);
438
439 /* (0) Determine state */
440 exists = entry->scheduled != NULL;
441 blocks = exists && !is_running(entry->scheduled);
442 out_of_time = exists &&
443 budget_enforced(entry->scheduled) &&
444 budget_exhausted(entry->scheduled);
445 np = exists && is_np(entry->scheduled);
446 sleep = exists && is_completed(entry->scheduled);
447 preempt = entry->scheduled != entry->linked;
448
449#ifdef WANT_ALL_SCHED_EVENTS
450 TRACE_TASK(prev, "invoked cedf_schedule.\n");
451#endif
452
453 if (exists)
454 TRACE_TASK(prev,
455 "blocks:%d out_of_time:%d np:%d sleep:%d preempt:%d "
456 "state:%d sig:%d\n",
457 blocks, out_of_time, np, sleep, preempt,
458 prev->state, signal_pending(prev));
459 if (entry->linked && preempt)
460 TRACE_TASK(prev, "will be preempted by %s/%d\n",
461 entry->linked->comm, entry->linked->pid);
462
463
464 /* If a task blocks we have no choice but to reschedule.
465 */
466 if (blocks)
467 unlink(entry->scheduled);
468
469 /* Request a sys_exit_np() call if we would like to preempt but cannot.
470 * We need to make sure to update the link structure anyway in case
471 * that we are still linked. Multiple calls to request_exit_np() don't
472 * hurt.
473 */
474 if (np && (out_of_time || preempt || sleep)) {
475 unlink(entry->scheduled);
476 request_exit_np(entry->scheduled);
477 }
478
479 /* Any task that is preemptable and either exhausts its execution
480 * budget or wants to sleep completes. We may have to reschedule after
481 * this. Don't do a job completion if we block (can't have timers running
482 * for blocked jobs).
483 */
484 if (!np && (out_of_time || sleep) && !blocks)
485 job_completion(entry->scheduled, !sleep);
486
487 /* Link pending task if we became unlinked.
488 */
489 if (!entry->linked)
490 link_task_to_cpu(__take_ready(&cluster->domain), entry);
491
492 /* The final scheduling decision. Do we need to switch for some reason?
493 * If linked is different from scheduled, then select linked as next.
494 */
495 if ((!np || blocks) &&
496 entry->linked != entry->scheduled) {
497 /* Schedule a linked job? */
498 if (entry->linked) {
499 entry->linked->rt_param.scheduled_on = entry->cpu;
500 next = entry->linked;
501 }
502 if (entry->scheduled) {
503 /* not gonna be scheduled soon */
504 entry->scheduled->rt_param.scheduled_on = NO_CPU;
505 TRACE_TASK(entry->scheduled, "scheduled_on = NO_CPU\n");
506 }
507 } else
508 /* Only override Linux scheduler if we have a real-time task
509 * scheduled that needs to continue.
510 */
511 if (exists)
512 next = prev;
513
514 sched_state_task_picked();
515 raw_spin_unlock(&cluster->cluster_lock);
516
517#ifdef WANT_ALL_SCHED_EVENTS
518 TRACE("cedf_lock released, next=0x%p\n", next);
519
520 if (next)
521 TRACE_TASK(next, "scheduled at %llu\n", litmus_clock());
522 else if (exists && !next)
523 TRACE("becomes idle at %llu.\n", litmus_clock());
524#endif
525
526
527 return next;
528}
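/*
 * The final decision in cedf_schedule() (and, identically, in
 * gsnedf_schedule()) boils down to a small predicate over the state bits
 * computed at the top of the function: switch only if the job is preemptable
 * or blocked, and the link changed. A standalone sketch of just that
 * predicate with a few asserted cases, using plain ints instead of task
 * pointers (illustrative model only, not kernel code):
 */
#include <assert.h>
#include <stdio.h>

/* returns 1 if the CPU should switch to its linked task, 0 to keep prev */
static int should_switch(int np, int blocks, int linked_is_scheduled)
{
	return (!np || blocks) && !linked_is_scheduled;
}

int main(void)
{
	/* preemptable and the link changed => switch */
	assert(should_switch(0, 0, 0) == 1);
	/* non-preemptive section and not blocked => keep running prev */
	assert(should_switch(1, 0, 0) == 0);
	/* non-preemptive but the job blocked => we must switch anyway */
	assert(should_switch(1, 1, 0) == 1);
	/* link unchanged => nothing to do */
	assert(should_switch(0, 0, 1) == 0);

	printf("all cases behave as the predicate in cedf_schedule()\n");
	return 0;
}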
529
530
531/* _finish_switch - we just finished the switch away from prev
532 */
533static void cedf_finish_switch(struct task_struct *prev)
534{
535 cpu_entry_t* entry = &__get_cpu_var(cedf_cpu_entries);
536
537 entry->scheduled = is_realtime(current) ? current : NULL;
538#ifdef WANT_ALL_SCHED_EVENTS
539 TRACE_TASK(prev, "switched away from\n");
540#endif
541}
542
543
544/* Prepare a task for running in RT mode
545 */
546static void cedf_task_new(struct task_struct * t, int on_rq, int running)
547{
548 unsigned long flags;
549 cpu_entry_t* entry;
550 cedf_domain_t* cluster;
551
552 TRACE("C-EDF: task new %d\n", t->pid);
553
554 /* the cluster doesn't change even if t is running */
555 cluster = task_cpu_cluster(t);
556
557 raw_spin_lock_irqsave(&cluster->cluster_lock, flags);
558
559 /* setup job params */
560 release_at(t, litmus_clock());
561
562 if (running) {
563 entry = &per_cpu(cedf_cpu_entries, task_cpu(t));
564 BUG_ON(entry->scheduled);
565
566#ifdef CONFIG_RELEASE_MASTER
567 if (entry->cpu != cluster->domain.release_master) {
568#endif
569 entry->scheduled = t;
570 tsk_rt(t)->scheduled_on = task_cpu(t);
571#ifdef CONFIG_RELEASE_MASTER
572 } else {
573 /* do not schedule on release master */
574 preempt(entry); /* force resched */
575 tsk_rt(t)->scheduled_on = NO_CPU;
576 }
577#endif
578 } else {
579 t->rt_param.scheduled_on = NO_CPU;
580 }
581 t->rt_param.linked_on = NO_CPU;
582
583 cedf_job_arrival(t);
584 raw_spin_unlock_irqrestore(&(cluster->cluster_lock), flags);
585}
586
587static void cedf_task_wake_up(struct task_struct *task)
588{
589 unsigned long flags;
590 lt_t now;
591 cedf_domain_t *cluster;
592
593 TRACE_TASK(task, "wake_up at %llu\n", litmus_clock());
594
595 cluster = task_cpu_cluster(task);
596
597 raw_spin_lock_irqsave(&cluster->cluster_lock, flags);
598 now = litmus_clock();
599 if (is_sporadic(task) && is_tardy(task, now)) {
600 /* new sporadic release */
601 release_at(task, now);
602 sched_trace_task_release(task);
603 }
604 else {
605 if (task->rt.time_slice) {
606 /* came back in time before deadline
607 */
608 tsk_rt(task)->completed = 0;
609 }
610 }
611 cedf_job_arrival(task);
612 raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags);
613}
614
615static void cedf_task_block(struct task_struct *t)
616{
617 unsigned long flags;
618 cedf_domain_t *cluster;
619
620 TRACE_TASK(t, "block at %llu\n", litmus_clock());
621
622 cluster = task_cpu_cluster(t);
623
624 /* unlink if necessary */
625 raw_spin_lock_irqsave(&cluster->cluster_lock, flags);
626 unlink(t);
627 raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags);
628
629 BUG_ON(!is_realtime(t));
630}
631
632
633static void cedf_task_exit(struct task_struct * t)
634{
635 unsigned long flags;
636 cedf_domain_t *cluster = task_cpu_cluster(t);
637
638 /* unlink if necessary */
639 raw_spin_lock_irqsave(&cluster->cluster_lock, flags);
640 unlink(t);
641 if (tsk_rt(t)->scheduled_on != NO_CPU) {
642 cpu_entry_t *cpu;
643 cpu = &per_cpu(cedf_cpu_entries, tsk_rt(t)->scheduled_on);
644 cpu->scheduled = NULL;
645 tsk_rt(t)->scheduled_on = NO_CPU;
646 }
647 raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags);
648
649 BUG_ON(!is_realtime(t));
650 TRACE_TASK(t, "RIP\n");
651}
652
653static long cedf_admit_task(struct task_struct* tsk)
654{
655 return (remote_cluster(task_cpu(tsk)) == task_cpu_cluster(tsk)) ?
656 0 : -EINVAL;
657}
658
659/* total number of clusters */
660static int num_clusters;
661/* we do not support clusters of different sizes */
662static unsigned int cluster_size;
663
664#ifdef VERBOSE_INIT
665static void print_cluster_topology(cpumask_var_t mask, int cpu)
666{
667 int chk;
668 char buf[255];
669
670 chk = cpulist_scnprintf(buf, 254, mask);
671 buf[chk] = '\0';
672 printk(KERN_INFO "CPU = %d, shared cpu(s) = %s\n", cpu, buf);
673
674}
675#endif
676
677static int clusters_allocated = 0;
678
679static void cleanup_cedf(void)
680{
681 int i;
682
683 if (clusters_allocated) {
684 for (i = 0; i < num_clusters; i++) {
685 kfree(cedf[i].cpus);
686 kfree(cedf[i].heap_node);
687 free_cpumask_var(cedf[i].cpu_map);
688 }
689
690 kfree(cedf);
691 }
692}
693
694static long cedf_activate_plugin(void)
695{
696 int i, j, cpu, ccpu, cpu_count;
697 cpu_entry_t *entry;
698
699 cpumask_var_t mask;
700 int chk = 0;
701
702 /* de-allocate old clusters, if any */
703 cleanup_cedf();
704
705 printk(KERN_INFO "C-EDF: Activate Plugin, cluster configuration = %d\n",
706 cluster_config);
707
708 /* need to get cluster_size first */
709 if(!zalloc_cpumask_var(&mask, GFP_ATOMIC))
710 return -ENOMEM;
711
712 if (unlikely(cluster_config == GLOBAL_CLUSTER)) {
713 cluster_size = num_online_cpus();
714 } else {
715 chk = get_shared_cpu_map(mask, 0, cluster_config);
716 if (chk) {
717 /* if chk != 0 then it is the max allowed index */
718 printk(KERN_INFO "C-EDF: Cluster configuration = %d "
719 "is not supported on this hardware.\n",
720 cluster_config);
721 /* User should notice that the configuration failed, so
722 * let's bail out. */
723 return -EINVAL;
724 }
725
726 cluster_size = cpumask_weight(mask);
727 }
728
729 if ((num_online_cpus() % cluster_size) != 0) {
730 /* this can't be right, some cpus are left out */
731 printk(KERN_ERR "C-EDF: Trying to group %d cpus in %d!\n",
732 num_online_cpus(), cluster_size);
733 return -1;
734 }
735
736 num_clusters = num_online_cpus() / cluster_size;
737 printk(KERN_INFO "C-EDF: %d cluster(s) of size = %d\n",
738 num_clusters, cluster_size);
739
740 /* initialize clusters */
741 cedf = kmalloc(num_clusters * sizeof(cedf_domain_t), GFP_ATOMIC);
742 for (i = 0; i < num_clusters; i++) {
743
744 cedf[i].cpus = kmalloc(cluster_size * sizeof(cpu_entry_t),
745 GFP_ATOMIC);
746 cedf[i].heap_node = kmalloc(
747 cluster_size * sizeof(struct bheap_node),
748 GFP_ATOMIC);
749 bheap_init(&(cedf[i].cpu_heap));
750 edf_domain_init(&(cedf[i].domain), NULL, cedf_release_jobs);
751
752 if(!zalloc_cpumask_var(&cedf[i].cpu_map, GFP_ATOMIC))
753 return -ENOMEM;
754#ifdef CONFIG_RELEASE_MASTER
755 cedf[i].domain.release_master = atomic_read(&release_master_cpu);
756#endif
757 }
758
759 /* cycle through clusters and add cpus to them */
760 for (i = 0; i < num_clusters; i++) {
761
762 for_each_online_cpu(cpu) {
763 /* check if the cpu is already in a cluster */
764 for (j = 0; j < num_clusters; j++)
765 if (cpumask_test_cpu(cpu, cedf[j].cpu_map))
766 break;
767 /* if it is in a cluster go to next cpu */
768 if (j < num_clusters &&
769 cpumask_test_cpu(cpu, cedf[j].cpu_map))
770 continue;
771
772 /* this cpu isn't in any cluster */
773 /* get the shared cpus */
774 if (unlikely(cluster_config == GLOBAL_CLUSTER))
775 cpumask_copy(mask, cpu_online_mask);
776 else
777 get_shared_cpu_map(mask, cpu, cluster_config);
778
779 cpumask_copy(cedf[i].cpu_map, mask);
780#ifdef VERBOSE_INIT
781 print_cluster_topology(mask, cpu);
782#endif
783 /* add cpus to current cluster and init cpu_entry_t */
784 cpu_count = 0;
785 for_each_cpu(ccpu, cedf[i].cpu_map) {
786
787 entry = &per_cpu(cedf_cpu_entries, ccpu);
788 cedf[i].cpus[cpu_count] = entry;
789 atomic_set(&entry->will_schedule, 0);
790 entry->cpu = ccpu;
791 entry->cluster = &cedf[i];
792 entry->hn = &(cedf[i].heap_node[cpu_count]);
793 bheap_node_init(&entry->hn, entry);
794
795 cpu_count++;
796
797 entry->linked = NULL;
798 entry->scheduled = NULL;
799#ifdef CONFIG_RELEASE_MASTER
800 /* only add CPUs that should schedule jobs */
801 if (entry->cpu != entry->cluster->domain.release_master)
802#endif
803 update_cpu_position(entry);
804 }
805 /* done with this cluster */
806 break;
807 }
808 }
809
810 free_cpumask_var(mask);
811 clusters_allocated = 1;
812 return 0;
813}
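/*
 * The cluster arithmetic in cedf_activate_plugin() is easy to check by hand.
 * A tiny standalone sketch, assuming a hypothetical 8-CPU machine on which
 * each L2 cache is shared by two cores (illustrative model only, not kernel
 * code):
 */
#include <stdio.h>

int main(void)
{
	int online_cpus = 8;	/* stands in for num_online_cpus() */
	int cluster_size = 2;	/* cpumask_weight() of one L2 sharing mask */

	if (online_cpus % cluster_size != 0) {
		/* mirrors the "Trying to group %d cpus in %d!" error above */
		printf("invalid configuration\n");
		return 1;
	}

	/* C-EDF would build 4 clusters of 2 CPUs each in this scenario */
	printf("%d cluster(s) of size %d\n",
	       online_cpus / cluster_size, cluster_size);
	return 0;
}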
814
815/* Plugin object */
816static struct sched_plugin cedf_plugin __cacheline_aligned_in_smp = {
817 .plugin_name = "C-EDF",
818 .finish_switch = cedf_finish_switch,
819 .tick = cedf_tick,
820 .task_new = cedf_task_new,
821 .complete_job = complete_job,
822 .task_exit = cedf_task_exit,
823 .schedule = cedf_schedule,
824 .task_wake_up = cedf_task_wake_up,
825 .task_block = cedf_task_block,
826 .admit_task = cedf_admit_task,
827 .activate_plugin = cedf_activate_plugin,
828};
829
830static struct proc_dir_entry *cluster_file = NULL, *cedf_dir = NULL;
831
832static int __init init_cedf(void)
833{
834 int err, fs;
835
836 err = register_sched_plugin(&cedf_plugin);
837 if (!err) {
838 fs = make_plugin_proc_dir(&cedf_plugin, &cedf_dir);
839 if (!fs)
840 cluster_file = create_cluster_file(cedf_dir, &cluster_config);
841 else
842 printk(KERN_ERR "Could not allocate C-EDF procfs dir.\n");
843 }
844 return err;
845}
846
847static void clean_cedf(void)
848{
849 cleanup_cedf();
850 if (cluster_file)
851 remove_proc_entry("cluster", cedf_dir);
852 if (cedf_dir)
853 remove_plugin_proc_dir(&cedf_plugin);
854}
855
856module_init(init_cedf);
857module_exit(clean_cedf);
diff --git a/litmus/sched_gsn_edf.c b/litmus/sched_gsn_edf.c
new file mode 100644
index 00000000000..5956978ccdb
--- /dev/null
+++ b/litmus/sched_gsn_edf.c
@@ -0,0 +1,1030 @@
1/*
2 * litmus/sched_gsn_edf.c
3 *
4 * Implementation of the GSN-EDF scheduling algorithm.
5 *
6 * This version uses the simple approach and serializes all scheduling
7 * decisions by the use of a queue lock. This is probably not the
8 * best way to do it, but it should suffice for now.
9 */
10
11#include <linux/spinlock.h>
12#include <linux/percpu.h>
13#include <linux/sched.h>
14#include <linux/slab.h>
15
16#include <litmus/litmus.h>
17#include <litmus/jobs.h>
18#include <litmus/sched_plugin.h>
19#include <litmus/edf_common.h>
20#include <litmus/sched_trace.h>
21#include <litmus/trace.h>
22
23#include <litmus/preempt.h>
24#include <litmus/budget.h>
25
26#include <litmus/bheap.h>
27
28#ifdef CONFIG_SCHED_CPU_AFFINITY
29#include <litmus/affinity.h>
30#endif
31
32#include <linux/module.h>
33
34/* Overview of GSN-EDF operations.
35 *
36 * For a detailed explanation of GSN-EDF have a look at the FMLP paper. This
37 * description only covers how the individual operations are implemented in
38 * LITMUS.
39 *
40 * link_task_to_cpu(T, cpu) - Low-level operation to update the linkage
41 * structure (NOT the actually scheduled
42 * task). If there is another linked task To
43 * already it will set To->linked_on = NO_CPU
44 * (thereby removing its association with this
45 * CPU). However, it will not requeue the
46 * previously linked task (if any). It will set
47 * T's state to not completed and check whether
48 * it is already running somewhere else. If T
49 * is scheduled somewhere else it will link
50 * it to that CPU instead (and pull the linked
51 * task to cpu). T may be NULL.
52 *
53 * unlink(T) - Unlink removes T from all scheduler data
54 * structures. If it is linked to some CPU it
55 * will link NULL to that CPU. If it is
56 * currently queued in the gsnedf queue it will
57 * be removed from the rt_domain. It is safe to
58 * call unlink(T) if T is not linked. T may not
59 * be NULL.
60 *
61 * requeue(T) - Requeue will insert T into the appropriate
62 * queue. If the system is in real-time mode and
63 * T is already released, it will go into the
64 * ready queue. If the system is not in
65 * real-time mode, then T will go into the
66 * release queue. If T's release time is in the
67 * future, it will go into the release
68 * queue. That means that T's release time/job
69 * no/etc. has to be updated before requeue(T) is
70 * called. It is not safe to call requeue(T)
71 * when T is already queued. T may not be NULL.
72 *
73 * gsnedf_job_arrival(T) - This is the catch all function when T enters
74 * the system after either a suspension or at a
75 * job release. It will queue T (which means it
76 * is not safe to call gsnedf_job_arrival(T) if
77 * T is already queued) and then check whether a
78 * preemption is necessary. If a preemption is
79 * necessary it will update the linkage
80 * accordingly and cause the scheduler to be invoked
81 * (either with an IPI or need_resched). It is
82 * safe to call gsnedf_job_arrival(T) if T's
83 * next job has not been actually released yet
84 * (release time in the future). T will be put
85 * on the release queue in that case.
86 *
87 * job_completion(T) - Take care of everything that needs to be done
88 * to prepare T for its next release and place
89 * it in the right queue with
90 * gsnedf_job_arrival().
91 *
92 *
93 * When we know that T is linked to a CPU, then link_task_to_cpu(NULL, CPU) is
94 * equivalent to unlink(T). Note that if you unlink a task from a CPU, none of
95 * the functions will automatically propagate a pending task from the ready queue
96 * to the now-empty link. This is the job of the calling function (by means of
97 * __take_ready).
98 */
99
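/*
 * The trickiest case in the overview above is link_task_to_cpu() when the
 * to-be-linked task is still scheduled on another CPU: the two CPUs swap
 * their linked tasks rather than forcing an extra migration. A standalone
 * user-space sketch of that swap, assuming trivially simplified task and CPU
 * structs (illustrative model only, not kernel code):
 */
#include <stdio.h>

#define NO_CPU (-1)

struct toy_task {
	const char *name;
	int linked_on;		/* CPU this task is linked to, or NO_CPU */
	int scheduled_on;	/* CPU this task is running on, or NO_CPU */
};

struct toy_cpu {
	int id;
	struct toy_task *linked;
};

/* simplified analogue of link_task_to_cpu(): updates only the link fields */
static void toy_link(struct toy_task *t, struct toy_cpu *cpus,
		     struct toy_cpu *entry)
{
	if (entry->linked)
		entry->linked->linked_on = NO_CPU;

	if (t && t->scheduled_on != NO_CPU && t->scheduled_on != entry->id) {
		/* t still runs elsewhere: swap links with that CPU */
		struct toy_cpu *sched = &cpus[t->scheduled_on];
		struct toy_task *tmp = sched->linked;

		t->linked_on = sched->id;
		sched->linked = t;
		t = tmp;	/* continue linking the displaced task (may be NULL) */
	}
	if (t)
		t->linked_on = entry->id;
	entry->linked = t;
}

int main(void)
{
	/* A runs on CPU 1 but is currently unlinked; B is linked to CPU 1 */
	struct toy_task a = { "A", NO_CPU, 1 };
	struct toy_task b = { "B", 1, NO_CPU };
	struct toy_cpu cpus[2] = { { 0, NULL }, { 1, &b } };

	/* linking A to CPU 0 swaps: A stays linked where it runs, B moves */
	toy_link(&a, cpus, &cpus[0]);

	printf("CPU0 linked: %s, CPU1 linked: %s\n",
	       cpus[0].linked ? cpus[0].linked->name : "none",
	       cpus[1].linked ? cpus[1].linked->name : "none");
	return 0;
}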
100
101/* cpu_entry_t - maintain the linked and scheduled state
102 */
103typedef struct {
104 int cpu;
105 struct task_struct* linked; /* only RT tasks */
106 struct task_struct* scheduled; /* only RT tasks */
107 struct bheap_node* hn;
108} cpu_entry_t;
109DEFINE_PER_CPU(cpu_entry_t, gsnedf_cpu_entries);
110
111cpu_entry_t* gsnedf_cpus[NR_CPUS];
112
113/* the cpus queue themselves according to priority in here */
114static struct bheap_node gsnedf_heap_node[NR_CPUS];
115static struct bheap gsnedf_cpu_heap;
116
117static rt_domain_t gsnedf;
118#define gsnedf_lock (gsnedf.ready_lock)
119
120
121/* Uncomment this if you want to see all scheduling decisions in the
122 * TRACE() log.
123#define WANT_ALL_SCHED_EVENTS
124 */
125
126static int cpu_lower_prio(struct bheap_node *_a, struct bheap_node *_b)
127{
128 cpu_entry_t *a, *b;
129 a = _a->value;
130 b = _b->value;
131 /* Note that a and b are inverted: we want the lowest-priority CPU at
132 * the top of the heap.
133 */
134 return edf_higher_prio(b->linked, a->linked);
135}
136
137/* update_cpu_position - Move the cpu entry to the correct place to maintain
138 * order in the cpu queue. Caller must hold gsnedf lock.
139 */
140static void update_cpu_position(cpu_entry_t *entry)
141{
142 if (likely(bheap_node_in_heap(entry->hn)))
143 bheap_delete(cpu_lower_prio, &gsnedf_cpu_heap, entry->hn);
144 bheap_insert(cpu_lower_prio, &gsnedf_cpu_heap, entry->hn);
145}
146
147/* caller must hold gsnedf lock */
148static cpu_entry_t* lowest_prio_cpu(void)
149{
150 struct bheap_node* hn;
151 hn = bheap_peek(cpu_lower_prio, &gsnedf_cpu_heap);
152 return hn->value;
153}
154
155
156/* link_task_to_cpu - Update the link of a CPU.
157 * Handles the case where the to-be-linked task is already
158 * scheduled on a different CPU.
159 */
160static noinline void link_task_to_cpu(struct task_struct* linked,
161 cpu_entry_t *entry)
162{
163 cpu_entry_t *sched;
164 struct task_struct* tmp;
165 int on_cpu;
166
167 BUG_ON(linked && !is_realtime(linked));
168
169 /* Currently linked task is set to be unlinked. */
170 if (entry->linked) {
171 entry->linked->rt_param.linked_on = NO_CPU;
172 }
173
174 /* Link new task to CPU. */
175 if (linked) {
176 tsk_rt(linked)->completed = 0;
177 /* handle the case where the task is already scheduled somewhere! */
178 on_cpu = linked->rt_param.scheduled_on;
179 if (on_cpu != NO_CPU) {
180 sched = &per_cpu(gsnedf_cpu_entries, on_cpu);
181 /* this should only happen if not linked already */
182 BUG_ON(sched->linked == linked);
183
184 /* If we are already scheduled on the CPU to which we
185 * wanted to link, we don't need to do the swap --
186 * we just link ourselves to the CPU and depend on
187 * the caller to get things right.
188 */
189 if (entry != sched) {
190 TRACE_TASK(linked,
191 "already scheduled on %d, updating link.\n",
192 sched->cpu);
193 tmp = sched->linked;
194 linked->rt_param.linked_on = sched->cpu;
195 sched->linked = linked;
196 update_cpu_position(sched);
197 linked = tmp;
198 }
199 }
200 if (linked) /* might be NULL due to swap */
201 linked->rt_param.linked_on = entry->cpu;
202 }
203 entry->linked = linked;
204#ifdef WANT_ALL_SCHED_EVENTS
205 if (linked)
206 TRACE_TASK(linked, "linked to %d.\n", entry->cpu);
207 else
208 TRACE("NULL linked to %d.\n", entry->cpu);
209#endif
210 update_cpu_position(entry);
211}
212
213/* unlink - Make sure a task is not linked any longer to an entry
214 * where it was linked before. Must hold gsnedf_lock.
215 */
216static noinline void unlink(struct task_struct* t)
217{
218 cpu_entry_t *entry;
219
220 if (t->rt_param.linked_on != NO_CPU) {
221 /* unlink */
222 entry = &per_cpu(gsnedf_cpu_entries, t->rt_param.linked_on);
223 t->rt_param.linked_on = NO_CPU;
224 link_task_to_cpu(NULL, entry);
225 } else if (is_queued(t)) {
226 /* This is an interesting situation: t is scheduled,
227 * but was just recently unlinked. It cannot be
228 * linked anywhere else (because then it would have
229 * been relinked to this CPU), thus it must be in some
230 * queue. We must remove it from the list in this
231 * case.
232 */
233 remove(&gsnedf, t);
234 }
235}
236
237
238/* preempt - force a CPU to reschedule
239 */
240static void preempt(cpu_entry_t *entry)
241{
242 preempt_if_preemptable(entry->scheduled, entry->cpu);
243}
244
245/* requeue - Put an unlinked task into gsn-edf domain.
246 * Caller must hold gsnedf_lock.
247 */
248static noinline void requeue(struct task_struct* task)
249{
250 BUG_ON(!task);
251 /* sanity check before insertion */
252 BUG_ON(is_queued(task));
253
254 if (is_early_releasing(task) || is_released(task, litmus_clock()))
255 __add_ready(&gsnedf, task);
256 else {
257 /* it has got to wait */
258 add_release(&gsnedf, task);
259 }
260}
261
262#ifdef CONFIG_SCHED_CPU_AFFINITY
263static cpu_entry_t* gsnedf_get_nearest_available_cpu(cpu_entry_t *start)
264{
265 cpu_entry_t *affinity;
266
267 get_nearest_available_cpu(affinity, start, gsnedf_cpu_entries,
268#ifdef CONFIG_RELEASE_MASTER
269 gsnedf.release_master
270#else
271 NO_CPU
272#endif
273 );
274
275 return(affinity);
276}
277#endif
278
279/* check for any necessary preemptions */
280static void check_for_preemptions(void)
281{
282 struct task_struct *task;
283 cpu_entry_t *last;
284
285 for (last = lowest_prio_cpu();
286 edf_preemption_needed(&gsnedf, last->linked);
287 last = lowest_prio_cpu()) {
288 /* preemption necessary */
289 task = __take_ready(&gsnedf);
290 TRACE("check_for_preemptions: attempting to link task %d to %d\n",
291 task->pid, last->cpu);
292
293#ifdef CONFIG_SCHED_CPU_AFFINITY
294 {
295 cpu_entry_t *affinity =
296 gsnedf_get_nearest_available_cpu(
297 &per_cpu(gsnedf_cpu_entries, task_cpu(task)));
298 if (affinity)
299 last = affinity;
300 else if (requeue_preempted_job(last->linked))
301 requeue(last->linked);
302 }
303#else
304 if (requeue_preempted_job(last->linked))
305 requeue(last->linked);
306#endif
307
308 link_task_to_cpu(task, last);
309 preempt(last);
310 }
311}
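/*
 * The loop in check_for_preemptions() repeatedly compares the best ready job
 * against the linked job of the lowest-priority CPU and links/preempts until
 * no further preemption is needed. A standalone sketch of that loop over
 * plain arrays, assuming earlier absolute deadline == higher priority and
 * deadline 0 == idle CPU (illustrative model only, not kernel code; displaced
 * jobs, which the real code requeues, are simply dropped here):
 */
#include <stdio.h>

int main(void)
{
	long long linked[3] = { 15, 40, 0 };	/* one linked job per CPU */
	long long ready[3]  = { 10, 20, 50 };	/* sorted ready queue */
	int next_ready = 0;

	for (;;) {
		/* find the CPU whose linked job is worst (idle or latest deadline) */
		int worst = 0, cpu;
		for (cpu = 1; cpu < 3; cpu++)
			if (linked[cpu] == 0 ||
			    (linked[worst] != 0 && linked[cpu] > linked[worst]))
				worst = cpu;

		/* stop if no ready job is left or it cannot beat the worst link */
		if (next_ready >= 3 ||
		    (linked[worst] != 0 && ready[next_ready] >= linked[worst]))
			break;

		printf("link job d=%lld to CPU %d (displacing d=%lld)\n",
		       ready[next_ready], worst, linked[worst]);
		linked[worst] = ready[next_ready++];
	}
	return 0;
}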
312
313/* gsnedf_job_arrival: task is either resumed or released */
314static noinline void gsnedf_job_arrival(struct task_struct* task)
315{
316 BUG_ON(!task);
317
318 requeue(task);
319 check_for_preemptions();
320}
321
322static void gsnedf_release_jobs(rt_domain_t* rt, struct bheap* tasks)
323{
324 unsigned long flags;
325
326 raw_spin_lock_irqsave(&gsnedf_lock, flags);
327
328 __merge_ready(rt, tasks);
329 check_for_preemptions();
330
331 raw_spin_unlock_irqrestore(&gsnedf_lock, flags);
332}
333
334/* caller holds gsnedf_lock */
335static noinline void job_completion(struct task_struct *t, int forced)
336{
337 BUG_ON(!t);
338
339 sched_trace_task_completion(t, forced);
340
341 TRACE_TASK(t, "job_completion().\n");
342
343 /* set flags */
344 tsk_rt(t)->completed = 1;
345 /* prepare for next period */
346 prepare_for_next_period(t);
347 if (is_early_releasing(t) || is_released(t, litmus_clock()))
348 sched_trace_task_release(t);
349 /* unlink */
350 unlink(t);
351 /* requeue
352 * But don't requeue a blocking task. */
353 if (is_running(t))
354 gsnedf_job_arrival(t);
355}
356
357/* gsnedf_tick - this function is called for every local timer
358 * interrupt.
359 *
360 * checks whether the current task has expired and checks
361 * whether we need to preempt it if it has not expired
362 */
363static void gsnedf_tick(struct task_struct* t)
364{
365 if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) {
366 if (!is_np(t)) {
367 /* np tasks will be preempted when they become
368 * preemptable again
369 */
370 litmus_reschedule_local();
371 TRACE("gsnedf_scheduler_tick: "
372 "%d is preemptable "
373 " => FORCE_RESCHED\n", t->pid);
374 } else if (is_user_np(t)) {
375 TRACE("gsnedf_scheduler_tick: "
376 "%d is non-preemptable, "
377 "preemption delayed.\n", t->pid);
378 request_exit_np(t);
379 }
380 }
381}
382
383/* Getting schedule() right is a bit tricky. schedule() may not make any
384 * assumptions on the state of the current task since it may be called for a
385 * number of reasons. The reasons include a scheduler_tick() determined that it
386 * was necessary, because sys_exit_np() was called, because some Linux
387 * subsystem determined so, or even (in the worst case) because there is a bug
388 * hidden somewhere. Thus, we must take extreme care to determine what the
389 * current state is.
390 *
391 * The CPU could currently be scheduling a task (or not), be linked (or not).
392 *
393 * The following assertions for the scheduled task could hold:
394 *
395 * - !is_running(scheduled) // the job blocks
396 * - scheduled->timeslice == 0 // the job completed (forcefully)
397 * - is_completed() // the job completed (by syscall)
398 * - linked != scheduled // we need to reschedule (for any reason)
399 * - is_np(scheduled) // rescheduling must be delayed,
400 * sys_exit_np must be requested
401 *
402 * Any of these can occur together.
403 */
404static struct task_struct* gsnedf_schedule(struct task_struct * prev)
405{
406 cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries);
407 int out_of_time, sleep, preempt, np, exists, blocks;
408 struct task_struct* next = NULL;
409
410#ifdef CONFIG_RELEASE_MASTER
411 /* Bail out early if we are the release master.
412 * The release master never schedules any real-time tasks.
413 */
414 if (unlikely(gsnedf.release_master == entry->cpu)) {
415 sched_state_task_picked();
416 return NULL;
417 }
418#endif
419
420 raw_spin_lock(&gsnedf_lock);
421
422 /* sanity checking */
423 BUG_ON(entry->scheduled && entry->scheduled != prev);
424 BUG_ON(entry->scheduled && !is_realtime(prev));
425 BUG_ON(is_realtime(prev) && !entry->scheduled);
426
427 /* (0) Determine state */
428 exists = entry->scheduled != NULL;
429 blocks = exists && !is_running(entry->scheduled);
430 out_of_time = exists && budget_enforced(entry->scheduled)
431 && budget_exhausted(entry->scheduled);
432 np = exists && is_np(entry->scheduled);
433 sleep = exists && is_completed(entry->scheduled);
434 preempt = entry->scheduled != entry->linked;
435
436#ifdef WANT_ALL_SCHED_EVENTS
437 TRACE_TASK(prev, "invoked gsnedf_schedule.\n");
438#endif
439
440 if (exists)
441 TRACE_TASK(prev,
442 "blocks:%d out_of_time:%d np:%d sleep:%d preempt:%d "
443 "state:%d sig:%d\n",
444 blocks, out_of_time, np, sleep, preempt,
445 prev->state, signal_pending(prev));
446 if (entry->linked && preempt)
447 TRACE_TASK(prev, "will be preempted by %s/%d\n",
448 entry->linked->comm, entry->linked->pid);
449
450
451 /* If a task blocks we have no choice but to reschedule.
452 */
453 if (blocks)
454 unlink(entry->scheduled);
455
456 /* Request a sys_exit_np() call if we would like to preempt but cannot.
457 * We need to make sure to update the link structure anyway in case
458 * that we are still linked. Multiple calls to request_exit_np() don't
459 * hurt.
460 */
461 if (np && (out_of_time || preempt || sleep)) {
462 unlink(entry->scheduled);
463 request_exit_np(entry->scheduled);
464 }
465
466 /* Any task that is preemptable and either exhausts its execution
467 * budget or wants to sleep completes. We may have to reschedule after
468 * this. Don't do a job completion if we block (can't have timers running
469 * for blocked jobs).
470 */
471 if (!np && (out_of_time || sleep) && !blocks)
472 job_completion(entry->scheduled, !sleep);
473
474 /* Link pending task if we became unlinked.
475 */
476 if (!entry->linked)
477 link_task_to_cpu(__take_ready(&gsnedf), entry);
478
479 /* The final scheduling decision. Do we need to switch for some reason?
480 * If linked is different from scheduled, then select linked as next.
481 */
482 if ((!np || blocks) &&
483 entry->linked != entry->scheduled) {
484 /* Schedule a linked job? */
485 if (entry->linked) {
486 entry->linked->rt_param.scheduled_on = entry->cpu;
487 next = entry->linked;
488 TRACE_TASK(next, "scheduled_on = P%d\n", smp_processor_id());
489 }
490 if (entry->scheduled) {
491 /* not gonna be scheduled soon */
492 entry->scheduled->rt_param.scheduled_on = NO_CPU;
493 TRACE_TASK(entry->scheduled, "scheduled_on = NO_CPU\n");
494 }
495 } else
496 /* Only override Linux scheduler if we have a real-time task
497 * scheduled that needs to continue.
498 */
499 if (exists)
500 next = prev;
501
502 sched_state_task_picked();
503
504 raw_spin_unlock(&gsnedf_lock);
505
506#ifdef WANT_ALL_SCHED_EVENTS
507 TRACE("gsnedf_lock released, next=0x%p\n", next);
508
509 if (next)
510 TRACE_TASK(next, "scheduled at %llu\n", litmus_clock());
511 else if (exists && !next)
512 TRACE("becomes idle at %llu.\n", litmus_clock());
513#endif
514
515
516 return next;
517}
518
519
520/* _finish_switch - we just finished the switch away from prev
521 */
522static void gsnedf_finish_switch(struct task_struct *prev)
523{
524 cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries);
525
526 entry->scheduled = is_realtime(current) ? current : NULL;
527#ifdef WANT_ALL_SCHED_EVENTS
528 TRACE_TASK(prev, "switched away from\n");
529#endif
530}
531
532
533/* Prepare a task for running in RT mode
534 */
535static void gsnedf_task_new(struct task_struct * t, int on_rq, int running)
536{
537 unsigned long flags;
538 cpu_entry_t* entry;
539
540 TRACE("gsn edf: task new %d\n", t->pid);
541
542 raw_spin_lock_irqsave(&gsnedf_lock, flags);
543
544 /* setup job params */
545 release_at(t, litmus_clock());
546
547 if (running) {
548 entry = &per_cpu(gsnedf_cpu_entries, task_cpu(t));
549 BUG_ON(entry->scheduled);
550
551#ifdef CONFIG_RELEASE_MASTER
552 if (entry->cpu != gsnedf.release_master) {
553#endif
554 entry->scheduled = t;
555 tsk_rt(t)->scheduled_on = task_cpu(t);
556#ifdef CONFIG_RELEASE_MASTER
557 } else {
558 /* do not schedule on release master */
559 preempt(entry); /* force resched */
560 tsk_rt(t)->scheduled_on = NO_CPU;
561 }
562#endif
563 } else {
564 t->rt_param.scheduled_on = NO_CPU;
565 }
566 t->rt_param.linked_on = NO_CPU;
567
568 gsnedf_job_arrival(t);
569 raw_spin_unlock_irqrestore(&gsnedf_lock, flags);
570}
571
572static void gsnedf_task_wake_up(struct task_struct *task)
573{
574 unsigned long flags;
575 lt_t now;
576
577 TRACE_TASK(task, "wake_up at %llu\n", litmus_clock());
578
579 raw_spin_lock_irqsave(&gsnedf_lock, flags);
580 now = litmus_clock();
581 if (is_sporadic(task) && is_tardy(task, now)) {
582 /* new sporadic release */
583 release_at(task, now);
584 sched_trace_task_release(task);
585 }
586 else {
587 if (task->rt.time_slice) {
588 /* came back in time before deadline
589 */
590 tsk_rt(task)->completed = 0;
591 }
592 }
593 gsnedf_job_arrival(task);
594 raw_spin_unlock_irqrestore(&gsnedf_lock, flags);
595}
596
597static void gsnedf_task_block(struct task_struct *t)
598{
599 unsigned long flags;
600
601 TRACE_TASK(t, "block at %llu\n", litmus_clock());
602
603 /* unlink if necessary */
604 raw_spin_lock_irqsave(&gsnedf_lock, flags);
605 unlink(t);
606 raw_spin_unlock_irqrestore(&gsnedf_lock, flags);
607
608 BUG_ON(!is_realtime(t));
609}
610
611
612static void gsnedf_task_exit(struct task_struct * t)
613{
614 unsigned long flags;
615
616 /* unlink if necessary */
617 raw_spin_lock_irqsave(&gsnedf_lock, flags);
618 unlink(t);
619 if (tsk_rt(t)->scheduled_on != NO_CPU) {
620 gsnedf_cpus[tsk_rt(t)->scheduled_on]->scheduled = NULL;
621 tsk_rt(t)->scheduled_on = NO_CPU;
622 }
623 raw_spin_unlock_irqrestore(&gsnedf_lock, flags);
624
625 BUG_ON(!is_realtime(t));
626 TRACE_TASK(t, "RIP\n");
627}
628
629
630static long gsnedf_admit_task(struct task_struct* tsk)
631{
632 return 0;
633}
634
635#ifdef CONFIG_LITMUS_LOCKING
636
637#include <litmus/fdso.h>
638
639/* called with IRQs off */
640static void set_priority_inheritance(struct task_struct* t, struct task_struct* prio_inh)
641{
642 int linked_on;
643 int check_preempt = 0;
644
645 raw_spin_lock(&gsnedf_lock);
646
647 TRACE_TASK(t, "inherits priority from %s/%d\n", prio_inh->comm, prio_inh->pid);
648 tsk_rt(t)->inh_task = prio_inh;
649
650 linked_on = tsk_rt(t)->linked_on;
651
652 /* If it is linked to a CPU, then we need to reorder the CPU heap. */
653 if (linked_on != NO_CPU) {
654 TRACE_TASK(t, "%s: linked on %d\n",
655 __FUNCTION__, linked_on);
656 /* Holder is scheduled; need to re-order CPUs.
657 * We can't use heap_decrease() here since
658 * the cpu_heap is ordered in reverse direction, so
659 * it is actually an increase. */
660 bheap_delete(cpu_lower_prio, &gsnedf_cpu_heap,
661 gsnedf_cpus[linked_on]->hn);
662 bheap_insert(cpu_lower_prio, &gsnedf_cpu_heap,
663 gsnedf_cpus[linked_on]->hn);
664 } else {
665 /* holder may be queued: first stop queue changes */
666 raw_spin_lock(&gsnedf.release_lock);
667 if (is_queued(t)) {
668 TRACE_TASK(t, "%s: is queued\n",
669 __FUNCTION__);
670 /* We need to update the position of holder in some
671 * heap. Note that this could be a release heap if
672 * budget enforcement is used and this job overran. */
673 check_preempt =
674 !bheap_decrease(edf_ready_order,
675 tsk_rt(t)->heap_node);
676 } else {
677 /* Nothing to do: if it is not queued and not linked
678 * then it is either sleeping or currently being moved
679 * by other code (e.g., a timer interrupt handler) that
680 * will use the correct priority when enqueuing the
681 * task. */
682 TRACE_TASK(t, "%s: is NOT queued => Done.\n",
683 __FUNCTION__);
684 }
685 raw_spin_unlock(&gsnedf.release_lock);
686
687 /* If holder was enqueued in a release heap, then the following
688 * preemption check is pointless, but we can't easily detect
689 * that case. If you want to fix this, then consider that
690 * simply adding a state flag requires O(n) time to update when
691 * releasing n tasks, which conflicts with the goal to have
692 * O(log n) merges. */
693 if (check_preempt) {
694 /* heap_decrease() hit the top level of the heap: make
695 * sure preemption checks get the right task, not the
696 * potentially stale cache. */
697 bheap_uncache_min(edf_ready_order,
698 &gsnedf.ready_queue);
699 check_for_preemptions();
700 }
701 }
702
703 raw_spin_unlock(&gsnedf_lock);
704}
705
706/* called with IRQs off */
707static void clear_priority_inheritance(struct task_struct* t)
708{
709 raw_spin_lock(&gsnedf_lock);
710
711 /* A job only stops inheriting a priority when it releases a
712 * resource. Thus we can make the following assumption.*/
713 BUG_ON(tsk_rt(t)->scheduled_on == NO_CPU);
714
715 TRACE_TASK(t, "priority restored\n");
716 tsk_rt(t)->inh_task = NULL;
717
718 /* Check if rescheduling is necessary. We can't use heap_decrease()
719 * since the priority was effectively lowered. */
720 unlink(t);
721 gsnedf_job_arrival(t);
722
723 raw_spin_unlock(&gsnedf_lock);
724}
725
726
727/* ******************** FMLP support ********************** */
728
729/* struct for semaphore with priority inheritance */
730struct fmlp_semaphore {
731 struct litmus_lock litmus_lock;
732
733 /* current resource holder */
734 struct task_struct *owner;
735
736 /* highest-priority waiter */
737 struct task_struct *hp_waiter;
738
739 /* FIFO queue of waiting tasks */
740 wait_queue_head_t wait;
741};
742
743static inline struct fmlp_semaphore* fmlp_from_lock(struct litmus_lock* lock)
744{
745 return container_of(lock, struct fmlp_semaphore, litmus_lock);
746}
747
748/* caller is responsible for locking */
749struct task_struct* find_hp_waiter(struct fmlp_semaphore *sem,
750 struct task_struct* skip)
751{
752 struct list_head *pos;
753 struct task_struct *queued, *found = NULL;
754
755 list_for_each(pos, &sem->wait.task_list) {
756 queued = (struct task_struct*) list_entry(pos, wait_queue_t,
757 task_list)->private;
758
759 /* Compare task prios, find high prio task. */
760 if (queued != skip && edf_higher_prio(queued, found))
761 found = queued;
762 }
763 return found;
764}
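/*
 * find_hp_waiter() above scans the FIFO wait queue for the highest-priority
 * waiter, optionally skipping one task (typically the waiter that is about to
 * become the owner). A standalone sketch of the same scan over a plain array,
 * assuming earlier absolute deadline == higher priority (illustrative model
 * only, not kernel code):
 */
#include <stdio.h>

struct toy_waiter {
	const char *name;
	long long deadline;
};

static const struct toy_waiter *find_hp(const struct toy_waiter *q, int n,
					const struct toy_waiter *skip)
{
	const struct toy_waiter *found = NULL;
	int i;

	for (i = 0; i < n; i++) {
		if (&q[i] == skip)
			continue;
		if (!found || q[i].deadline < found->deadline)
			found = &q[i];
	}
	return found;
}

int main(void)
{
	struct toy_waiter queue[3] = {
		{ "T1", 40 }, { "T2", 10 }, { "T3", 25 },
	};

	/* T2 becomes the owner next; T3 is now the highest-priority waiter */
	const struct toy_waiter *hp = find_hp(queue, 3, &queue[1]);

	printf("new hp_waiter: %s\n", hp ? hp->name : "none");
	return 0;
}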
765
766int gsnedf_fmlp_lock(struct litmus_lock* l)
767{
768 struct task_struct* t = current;
769 struct fmlp_semaphore *sem = fmlp_from_lock(l);
770 wait_queue_t wait;
771 unsigned long flags;
772
773 if (!is_realtime(t))
774 return -EPERM;
775
776 /* prevent nested lock acquisition --- not supported by FMLP */
777 if (tsk_rt(t)->num_locks_held)
778 return -EBUSY;
779
780 spin_lock_irqsave(&sem->wait.lock, flags);
781
782 if (sem->owner) {
783 /* resource is not free => must suspend and wait */
784
785 init_waitqueue_entry(&wait, t);
786
787 /* FIXME: interruptible would be nice some day */
788 set_task_state(t, TASK_UNINTERRUPTIBLE);
789
790 __add_wait_queue_tail_exclusive(&sem->wait, &wait);
791
792 /* check if we need to activate priority inheritance */
793 if (edf_higher_prio(t, sem->hp_waiter)) {
794 sem->hp_waiter = t;
795 if (edf_higher_prio(t, sem->owner))
796 set_priority_inheritance(sem->owner, sem->hp_waiter);
797 }
798
799 TS_LOCK_SUSPEND;
800
801 /* release lock before sleeping */
802 spin_unlock_irqrestore(&sem->wait.lock, flags);
803
804 /* We depend on the FIFO order. Thus, we don't need to recheck
805 * when we wake up; we are guaranteed to have the lock since
806 * there is only one wake up per release.
807 */
808
809 schedule();
810
811 TS_LOCK_RESUME;
812
813 /* Since we hold the lock, no other task will change
814 * ->owner. We can thus check it without acquiring the spin
815 * lock. */
816 BUG_ON(sem->owner != t);
817 } else {
818 /* it's ours now */
819 sem->owner = t;
820
821 spin_unlock_irqrestore(&sem->wait.lock, flags);
822 }
823
824 tsk_rt(t)->num_locks_held++;
825
826 return 0;
827}
828
829int gsnedf_fmlp_unlock(struct litmus_lock* l)
830{
831 struct task_struct *t = current, *next;
832 struct fmlp_semaphore *sem = fmlp_from_lock(l);
833 unsigned long flags;
834 int err = 0;
835
836 spin_lock_irqsave(&sem->wait.lock, flags);
837
838 if (sem->owner != t) {
839 err = -EINVAL;
840 goto out;
841 }
842
843 tsk_rt(t)->num_locks_held--;
844
845 /* check if there are jobs waiting for this resource */
846 next = __waitqueue_remove_first(&sem->wait);
847 if (next) {
848 /* next becomes the resource holder */
849 sem->owner = next;
850 TRACE_CUR("lock ownership passed to %s/%d\n", next->comm, next->pid);
851
852 /* determine new hp_waiter if necessary */
853 if (next == sem->hp_waiter) {
854 TRACE_TASK(next, "was highest-prio waiter\n");
855 /* next has the highest priority --- it doesn't need to
856 * inherit. However, we need to make sure that the
857 * next-highest priority in the queue is reflected in
858 * hp_waiter. */
859 sem->hp_waiter = find_hp_waiter(sem, next);
860 if (sem->hp_waiter)
861 TRACE_TASK(sem->hp_waiter, "is new highest-prio waiter\n");
862 else
863 TRACE("no further waiters\n");
864 } else {
865 /* Well, if next is not the highest-priority waiter,
866 * then it ought to inherit the highest-priority
867 * waiter's priority. */
868 set_priority_inheritance(next, sem->hp_waiter);
869 }
870
871 /* wake up next */
872 wake_up_process(next);
873 } else
874 /* becomes available */
875 sem->owner = NULL;
876
877 /* we lose the benefit of priority inheritance (if any) */
878 if (tsk_rt(t)->inh_task)
879 clear_priority_inheritance(t);
880
881out:
882 spin_unlock_irqrestore(&sem->wait.lock, flags);
883
884 return err;
885}
886
887int gsnedf_fmlp_close(struct litmus_lock* l)
888{
889 struct task_struct *t = current;
890 struct fmlp_semaphore *sem = fmlp_from_lock(l);
891 unsigned long flags;
892
893 int owner;
894
895 spin_lock_irqsave(&sem->wait.lock, flags);
896
897 owner = sem->owner == t;
898
899 spin_unlock_irqrestore(&sem->wait.lock, flags);
900
901 if (owner)
902 gsnedf_fmlp_unlock(l);
903
904 return 0;
905}
906
907void gsnedf_fmlp_free(struct litmus_lock* lock)
908{
909 kfree(fmlp_from_lock(lock));
910}
911
912static struct litmus_lock_ops gsnedf_fmlp_lock_ops = {
913 .close = gsnedf_fmlp_close,
914 .lock = gsnedf_fmlp_lock,
915 .unlock = gsnedf_fmlp_unlock,
916 .deallocate = gsnedf_fmlp_free,
917};
918
919static struct litmus_lock* gsnedf_new_fmlp(void)
920{
921 struct fmlp_semaphore* sem;
922
923 sem = kmalloc(sizeof(*sem), GFP_KERNEL);
924 if (!sem)
925 return NULL;
926
927 sem->owner = NULL;
928 sem->hp_waiter = NULL;
929 init_waitqueue_head(&sem->wait);
930 sem->litmus_lock.ops = &gsnedf_fmlp_lock_ops;
931
932 return &sem->litmus_lock;
933}
934
935/* **** lock constructor **** */
936
937
938static long gsnedf_allocate_lock(struct litmus_lock **lock, int type,
939 void* __user unused)
940{
941 int err = -ENXIO;
942
943 /* GSN-EDF currently only supports the FMLP for global resources. */
944 switch (type) {
945
946 case FMLP_SEM:
947 /* Flexible Multiprocessor Locking Protocol */
948 *lock = gsnedf_new_fmlp();
949 if (*lock)
950 err = 0;
951 else
952 err = -ENOMEM;
953 break;
954
955 };
956
957 return err;
958}
959
960#endif
961
962
963static long gsnedf_activate_plugin(void)
964{
965 int cpu;
966 cpu_entry_t *entry;
967
968 bheap_init(&gsnedf_cpu_heap);
969#ifdef CONFIG_RELEASE_MASTER
970 gsnedf.release_master = atomic_read(&release_master_cpu);
971#endif
972
973 for_each_online_cpu(cpu) {
974 entry = &per_cpu(gsnedf_cpu_entries, cpu);
975 bheap_node_init(&entry->hn, entry);
976 entry->linked = NULL;
977 entry->scheduled = NULL;
978#ifdef CONFIG_RELEASE_MASTER
979 if (cpu != gsnedf.release_master) {
980#endif
981 TRACE("GSN-EDF: Initializing CPU #%d.\n", cpu);
982 update_cpu_position(entry);
983#ifdef CONFIG_RELEASE_MASTER
984 } else {
985 TRACE("GSN-EDF: CPU %d is release master.\n", cpu);
986 }
987#endif
988 }
989 return 0;
990}
991
992/* Plugin object */
993static struct sched_plugin gsn_edf_plugin __cacheline_aligned_in_smp = {
994 .plugin_name = "GSN-EDF",
995 .finish_switch = gsnedf_finish_switch,
996 .tick = gsnedf_tick,
997 .task_new = gsnedf_task_new,
998 .complete_job = complete_job,
999 .task_exit = gsnedf_task_exit,
1000 .schedule = gsnedf_schedule,
1001 .task_wake_up = gsnedf_task_wake_up,
1002 .task_block = gsnedf_task_block,
1003 .admit_task = gsnedf_admit_task,
1004 .activate_plugin = gsnedf_activate_plugin,
1005#ifdef CONFIG_LITMUS_LOCKING
1006 .allocate_lock = gsnedf_allocate_lock,
1007#endif
1008};
1009
1010
1011static int __init init_gsn_edf(void)
1012{
1013 int cpu;
1014 cpu_entry_t *entry;
1015
1016 bheap_init(&gsnedf_cpu_heap);
1017 /* initialize CPU state */
1018 for (cpu = 0; cpu < NR_CPUS; cpu++) {
1019 entry = &per_cpu(gsnedf_cpu_entries, cpu);
1020 gsnedf_cpus[cpu] = entry;
1021 entry->cpu = cpu;
1022 entry->hn = &gsnedf_heap_node[cpu];
1023 bheap_node_init(&entry->hn, entry);
1024 }
1025 edf_domain_init(&gsnedf, NULL, gsnedf_release_jobs);
1026 return register_sched_plugin(&gsn_edf_plugin);
1027}
1028
1029
1030module_init(init_gsn_edf);
diff --git a/litmus/sched_litmus.c b/litmus/sched_litmus.c
new file mode 100644
index 00000000000..6b32cf09abb
--- /dev/null
+++ b/litmus/sched_litmus.c
@@ -0,0 +1,330 @@
1/* This file is included from kernel/sched.c */
2
3#include <litmus/litmus.h>
4#include <litmus/budget.h>
5#include <litmus/sched_plugin.h>
6#include <litmus/preempt.h>
7
8static void update_time_litmus(struct rq *rq, struct task_struct *p)
9{
10 u64 delta = rq->clock - p->se.exec_start;
11 if (unlikely((s64)delta < 0))
12 delta = 0;
13 /* per job counter */
14 p->rt_param.job_params.exec_time += delta;
15 /* task counter */
16 p->se.sum_exec_runtime += delta;
17 /* sched_clock() */
18 p->se.exec_start = rq->clock;
19 cpuacct_charge(p, delta);
20}
21
22static void double_rq_lock(struct rq *rq1, struct rq *rq2);
23static void double_rq_unlock(struct rq *rq1, struct rq *rq2);
24
25/*
26 * litmus_tick gets called by scheduler_tick() with HZ freq
27 * Interrupts are disabled
28 */
29static void litmus_tick(struct rq *rq, struct task_struct *p)
30{
31 TS_PLUGIN_TICK_START;
32
33 if (is_realtime(p))
34 update_time_litmus(rq, p);
35
36 /* plugin tick */
37 litmus->tick(p);
38
39 TS_PLUGIN_TICK_END;
40
41 return;
42}
43
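/* litmus_schedule() asks the active plugin for the next task and, if that
 * task was last scheduled on another CPU, performs the hand-off needed to
 * migrate it here. Summary of the steps implemented below:
 *   1) call litmus->schedule(prev) to obtain 'next';
 *   2) if 'next' belongs to another runqueue, drop this CPU's rq->lock and
 *      spin until 'next' has released its stack (stack_in_use == NO_CPU),
 *      bailing out after ~10ms to avoid a hard deadlock;
 *   3) re-acquire both runqueues, re-check whether 'prev' changed state
 *      while the lock was dropped, and verify that 'next' is still a
 *      runnable real-time task before claiming it for this CPU.
 */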
44static struct task_struct *
45litmus_schedule(struct rq *rq, struct task_struct *prev)
46{
47 struct rq* other_rq;
48 struct task_struct *next;
49
50 long was_running;
51 lt_t _maybe_deadlock = 0;
52
53 /* let the plugin schedule */
54 next = litmus->schedule(prev);
55
56 sched_state_plugin_check();
57
58 /* check if a global plugin pulled a task from a different RQ */
59 if (next && task_rq(next) != rq) {
60 /* we need to migrate the task */
61 other_rq = task_rq(next);
62 TRACE_TASK(next, "migrate from %d\n", other_rq->cpu);
63
64 /* while we drop the lock, the prev task could change its
65 * state
66 */
67 was_running = is_running(prev);
68 mb();
69 raw_spin_unlock(&rq->lock);
70
71 /* Don't race with a concurrent switch. This could deadlock in
72 * the case of cross or circular migrations. It's the job of
73 * the plugin to make sure that doesn't happen.
74 */
75 TRACE_TASK(next, "stack_in_use=%d\n",
76 next->rt_param.stack_in_use);
77 if (next->rt_param.stack_in_use != NO_CPU) {
78 TRACE_TASK(next, "waiting to deschedule\n");
79 _maybe_deadlock = litmus_clock();
80 }
81 while (next->rt_param.stack_in_use != NO_CPU) {
82 cpu_relax();
83 mb();
84 if (next->rt_param.stack_in_use == NO_CPU)
85 TRACE_TASK(next,"descheduled. Proceeding.\n");
86
87 if (lt_before(_maybe_deadlock + 10000000,
88 litmus_clock())) {
89 /* We've been spinning for 10ms.
90 * Something can't be right!
91 * Let's abandon the task and bail out; at least
92 * we will have debug info instead of a hard
93 * deadlock.
94 */
95 TRACE_TASK(next,"stack too long in use. "
96 "Deadlock?\n");
97 next = NULL;
98
99 /* bail out */
100 raw_spin_lock(&rq->lock);
101 return next;
102 }
103 }
104#ifdef __ARCH_WANT_UNLOCKED_CTXSW
105 if (next->on_cpu)
106 TRACE_TASK(next, "waiting for !oncpu");
107 while (next->on_cpu) {
108 cpu_relax();
109 mb();
110 }
111#endif
112 double_rq_lock(rq, other_rq);
113 mb();
114 if (is_realtime(prev) && is_running(prev) != was_running) {
115 TRACE_TASK(prev,
116 "state changed while we dropped"
117 " the lock: is_running=%d, was_running=%d\n",
118 is_running(prev), was_running);
119 if (is_running(prev) && !was_running) {
120 /* prev task became unblocked
121 * we need to simulate normal sequence of events
122 * to scheduler plugins.
123 */
124 litmus->task_block(prev);
125 litmus->task_wake_up(prev);
126 }
127 }
128
129 set_task_cpu(next, smp_processor_id());
130
131 /* DEBUG: now that we have the lock we need to make sure a
132 * couple of things still hold:
133 * - it is still a real-time task
134 * - it is still runnable (could have been stopped)
135 * If either is violated, then the active plugin is
136 * doing something wrong.
137 */
138 if (!is_realtime(next) || !is_running(next)) {
139 /* BAD BAD BAD */
140 TRACE_TASK(next,"BAD: migration invariant FAILED: "
141 "rt=%d running=%d\n",
142 is_realtime(next),
143 is_running(next));
144 /* drop the task */
145 next = NULL;
146 }
147 /* release the other CPU's runqueue, but keep ours */
148 raw_spin_unlock(&other_rq->lock);
149 }
150 if (next) {
151 next->rt_param.stack_in_use = rq->cpu;
152 next->se.exec_start = rq->clock;
153 }
154
155 update_enforcement_timer(next);
156 return next;
157}
158
159static void enqueue_task_litmus(struct rq *rq, struct task_struct *p,
160 int flags)
161{
162 if (flags & ENQUEUE_WAKEUP) {
163 sched_trace_task_resume(p);
164 tsk_rt(p)->present = 1;
165 /* LITMUS^RT plugins need to update the state
166 * _before_ making it available in global structures.
167 * Linux gets away with being lazy about the task state
168		 * update. We can't do that, hence we already update the
169		 * task state here.
170 *
171 * WARNING: this needs to be re-evaluated when porting
172 * to newer kernel versions.
173 */
174 p->state = TASK_RUNNING;
175 litmus->task_wake_up(p);
176
177 rq->litmus.nr_running++;
178 } else
179 TRACE_TASK(p, "ignoring an enqueue, not a wake up.\n");
180}
181
182static void dequeue_task_litmus(struct rq *rq, struct task_struct *p,
183 int flags)
184{
185 if (flags & DEQUEUE_SLEEP) {
186 litmus->task_block(p);
187 tsk_rt(p)->present = 0;
188 sched_trace_task_block(p);
189
190 rq->litmus.nr_running--;
191 } else
192 TRACE_TASK(p, "ignoring a dequeue, not going to sleep.\n");
193}
194
195static void yield_task_litmus(struct rq *rq)
196{
197 TS_SYSCALL_IN_START;
198 TS_SYSCALL_IN_END;
199
200 BUG_ON(rq->curr != current);
201 /* sched_yield() is called to trigger delayed preemptions.
202 * Thus, mark the current task as needing to be rescheduled.
203 * This will cause the scheduler plugin to be invoked, which can
204 * then determine if a preemption is still required.
205 */
206 clear_exit_np(current);
207 litmus_reschedule_local();
208
209 TS_SYSCALL_OUT_START;
210}
211
212/* Plugins are responsible for this.
213 */
214static void check_preempt_curr_litmus(struct rq *rq, struct task_struct *p, int flags)
215{
216}
217
218static void put_prev_task_litmus(struct rq *rq, struct task_struct *p)
219{
220}
221
222static void pre_schedule_litmus(struct rq *rq, struct task_struct *prev)
223{
224 update_time_litmus(rq, prev);
225 if (!is_running(prev))
226 tsk_rt(prev)->present = 0;
227}
228
229/* pick_next_task_litmus() - litmus_schedule() function
230 *
231 * return the next task to be scheduled
232 */
233static struct task_struct *pick_next_task_litmus(struct rq *rq)
234{
235 /* get the to-be-switched-out task (prev) */
236 struct task_struct *prev = rq->litmus.prev;
237 struct task_struct *next;
238
239 /* if not called from schedule() but from somewhere
240 * else (e.g., migration), return now!
241 */
242 if(!rq->litmus.prev)
243 return NULL;
244
245 rq->litmus.prev = NULL;
246
247 TS_PLUGIN_SCHED_START;
248 next = litmus_schedule(rq, prev);
249 TS_PLUGIN_SCHED_END;
250
251 return next;
252}
253
254static void task_tick_litmus(struct rq *rq, struct task_struct *p, int queued)
255{
256 /* nothing to do; tick related tasks are done by litmus_tick() */
257 return;
258}
259
260static void switched_to_litmus(struct rq *rq, struct task_struct *p)
261{
262}
263
264static void prio_changed_litmus(struct rq *rq, struct task_struct *p,
265 int oldprio)
266{
267}
268
269unsigned int get_rr_interval_litmus(struct rq *rq, struct task_struct *p)
270{
271 /* return infinity */
272 return 0;
273}
274
275/* This is called when a task became a real-time task, either due to a SCHED_*
276 * class transition or due to PI mutex inheritance. We don't handle Linux PI
277 * mutex inheritance yet (and probably never will). Use LITMUS provided
278 * synchronization primitives instead.
279 */
280static void set_curr_task_litmus(struct rq *rq)
281{
282 rq->curr->se.exec_start = rq->clock;
283}
284
285
286#ifdef CONFIG_SMP
287/* execve tries to rebalance the task in this scheduling domain.
288 * We don't care about the scheduling domain; this can get called
289 * from exec, fork, and wakeup.
290 */
291static int
292select_task_rq_litmus(struct task_struct *p, int sd_flag, int flags)
293{
294 /* preemption is already disabled.
295 * We don't want to change cpu here
296 */
297 return task_cpu(p);
298}
299#endif
300
301static const struct sched_class litmus_sched_class = {
302 /* From 34f971f6 the stop/migrate worker threads have a class on
303 * their own, which is the highest prio class. We don't support
304 * cpu-hotplug or cpu throttling. Allows Litmus to use up to 1.0
305 * CPU capacity.
306 */
307 .next = &stop_sched_class,
308 .enqueue_task = enqueue_task_litmus,
309 .dequeue_task = dequeue_task_litmus,
310 .yield_task = yield_task_litmus,
311
312 .check_preempt_curr = check_preempt_curr_litmus,
313
314 .pick_next_task = pick_next_task_litmus,
315 .put_prev_task = put_prev_task_litmus,
316
317#ifdef CONFIG_SMP
318 .select_task_rq = select_task_rq_litmus,
319
320 .pre_schedule = pre_schedule_litmus,
321#endif
322
323 .set_curr_task = set_curr_task_litmus,
324 .task_tick = task_tick_litmus,
325
326 .get_rr_interval = get_rr_interval_litmus,
327
328 .prio_changed = prio_changed_litmus,
329 .switched_to = switched_to_litmus,
330};
diff --git a/litmus/sched_pfair.c b/litmus/sched_pfair.c
new file mode 100644
index 00000000000..d5fb3a832ad
--- /dev/null
+++ b/litmus/sched_pfair.c
@@ -0,0 +1,1074 @@
1/*
2 * kernel/sched_pfair.c
3 *
4 * Implementation of the PD^2 pfair scheduling algorithm. This
5 * implementation realizes "early releasing," i.e., it is work-conserving.
6 *
7 */
8
9#include <asm/div64.h>
10#include <linux/delay.h>
11#include <linux/module.h>
12#include <linux/spinlock.h>
13#include <linux/percpu.h>
14#include <linux/sched.h>
15#include <linux/list.h>
16#include <linux/slab.h>
17
18#include <litmus/litmus.h>
19#include <litmus/jobs.h>
20#include <litmus/preempt.h>
21#include <litmus/rt_domain.h>
22#include <litmus/sched_plugin.h>
23#include <litmus/sched_trace.h>
24
25#include <litmus/bheap.h>
26
27/* to configure the cluster size */
28#include <litmus/litmus_proc.h>
29
30#include <litmus/clustered.h>
31
32static enum cache_level pfair_cluster_level = GLOBAL_CLUSTER;
33
34struct subtask {
35 /* measured in quanta relative to job release */
36 quanta_t release;
37 quanta_t deadline;
38 quanta_t overlap; /* called "b bit" by PD^2 */
39 quanta_t group_deadline;
40};
41
42struct pfair_param {
43 quanta_t quanta; /* number of subtasks */
44 quanta_t cur; /* index of current subtask */
45
46 quanta_t release; /* in quanta */
47 quanta_t period; /* in quanta */
48
49 quanta_t last_quantum; /* when scheduled last */
50 int last_cpu; /* where scheduled last */
51
52 struct pfair_cluster* cluster; /* where this task is scheduled */
53
54 struct subtask subtasks[0]; /* allocate together with pfair_param */
55};
56
57#define tsk_pfair(tsk) ((tsk)->rt_param.pfair)
58
59struct pfair_state {
60 struct cluster_cpu topology;
61
62 volatile quanta_t cur_tick; /* updated by the CPU that is advancing
63 * the time */
64 volatile quanta_t local_tick; /* What tick is the local CPU currently
65 * executing? Updated only by the local
66 * CPU. In QEMU, this may lag behind the
67 * current tick. In a real system, with
68 * proper timers and aligned quanta,
69 * that should only be the case for a
70 * very short time after the time
71 * advanced. With staggered quanta, it
72 * will lag for the duration of the
73 * offset.
74 */
75
76 struct task_struct* linked; /* the task that should be executing */
77 struct task_struct* local; /* the local copy of linked */
78 struct task_struct* scheduled; /* what is actually scheduled */
79
80 lt_t offset; /* stagger offset */
81 unsigned int missed_updates;
82 unsigned int missed_quanta;
83};
84
85struct pfair_cluster {
86 struct scheduling_cluster topology;
87
88 /* The "global" time in this cluster. */
89 quanta_t pfair_time; /* the "official" PFAIR clock */
90
91 /* The ready queue for this cluster. */
92 rt_domain_t pfair;
93
94 /* The set of jobs that should have their release enacted at the next
95 * quantum boundary.
96 */
97 struct bheap release_queue;
98 raw_spinlock_t release_lock;
99};
100
101#define RT_F_REQUEUE 0x2
102
103static inline struct pfair_cluster* cpu_cluster(struct pfair_state* state)
104{
105 return container_of(state->topology.cluster, struct pfair_cluster, topology);
106}
107
108static inline int cpu_id(struct pfair_state* state)
109{
110 return state->topology.id;
111}
112
113static inline struct pfair_state* from_cluster_list(struct list_head* pos)
114{
115 return list_entry(pos, struct pfair_state, topology.cluster_list);
116}
117
118static inline struct pfair_cluster* from_domain(rt_domain_t* rt)
119{
120 return container_of(rt, struct pfair_cluster, pfair);
121}
122
123static inline raw_spinlock_t* cluster_lock(struct pfair_cluster* cluster)
124{
125 /* The ready_lock is used to serialize all scheduling events. */
126 return &cluster->pfair.ready_lock;
127}
128
129static inline raw_spinlock_t* cpu_lock(struct pfair_state* state)
130{
131 return cluster_lock(cpu_cluster(state));
132}
133
134DEFINE_PER_CPU(struct pfair_state, pfair_state);
135struct pfair_state* *pstate; /* short cut */
136
137static struct pfair_cluster* pfair_clusters;
138static int num_pfair_clusters;
139
140/* Enable for lots of trace info.
141 * #define PFAIR_DEBUG
142 */
143
144#ifdef PFAIR_DEBUG
145#define PTRACE_TASK(t, f, args...) TRACE_TASK(t, f, ## args)
146#define PTRACE(f, args...) TRACE(f, ## args)
147#else
148#define PTRACE_TASK(t, f, args...)
149#define PTRACE(f, args...)
150#endif
151
152/* gcc will inline all of these accessor functions... */
153static struct subtask* cur_subtask(struct task_struct* t)
154{
155 return tsk_pfair(t)->subtasks + tsk_pfair(t)->cur;
156}
157
158static quanta_t cur_deadline(struct task_struct* t)
159{
160 return cur_subtask(t)->deadline + tsk_pfair(t)->release;
161}
162
163static quanta_t cur_release(struct task_struct* t)
164{
165 /* This is early releasing: only the release of the first subtask
166 * counts. */
167 return tsk_pfair(t)->release;
168}
169
170static quanta_t cur_overlap(struct task_struct* t)
171{
172 return cur_subtask(t)->overlap;
173}
174
175static quanta_t cur_group_deadline(struct task_struct* t)
176{
177 quanta_t gdl = cur_subtask(t)->group_deadline;
178 if (gdl)
179 return gdl + tsk_pfair(t)->release;
180 else
181 return gdl;
182}
183
184
185static int pfair_higher_prio(struct task_struct* first,
186 struct task_struct* second)
187{
188 return /* first task must exist */
189 first && (
190 /* Does the second task exist and is it a real-time task? If
191 * not, the first task (which is a RT task) has higher
192 * priority.
193 */
194 !second || !is_realtime(second) ||
195
196 /* Is the (subtask) deadline of the first task earlier?
197 * Then it has higher priority.
198 */
199 time_before(cur_deadline(first), cur_deadline(second)) ||
200
201 /* Do we have a deadline tie?
202 * Then break by B-bit.
203 */
204 (cur_deadline(first) == cur_deadline(second) &&
205 (cur_overlap(first) > cur_overlap(second) ||
206
207 /* Do we have a B-bit tie?
208 * Then break by group deadline.
209 */
210 (cur_overlap(first) == cur_overlap(second) &&
211 (time_after(cur_group_deadline(first),
212 cur_group_deadline(second)) ||
213
214 /* Do we have a group deadline tie?
215 * Then break by PID, which are unique.
216 */
217 (cur_group_deadline(first) ==
218 cur_group_deadline(second) &&
219 first->pid < second->pid))))));
220}
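/* Summary of the PD^2 priority test above (the first difference decides):
 *   1) the earlier subtask deadline wins;
 *   2) on a deadline tie, the larger b-bit (overlap) wins;
 *   3) on a b-bit tie, the *later* group deadline wins;
 *   4) on a group-deadline tie, the lower PID wins (PIDs are unique).
 */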
221
222int pfair_ready_order(struct bheap_node* a, struct bheap_node* b)
223{
224 return pfair_higher_prio(bheap2task(a), bheap2task(b));
225}
226
227static void pfair_release_jobs(rt_domain_t* rt, struct bheap* tasks)
228{
229 struct pfair_cluster* cluster = from_domain(rt);
230 unsigned long flags;
231
232 raw_spin_lock_irqsave(&cluster->release_lock, flags);
233
234 bheap_union(pfair_ready_order, &cluster->release_queue, tasks);
235
236 raw_spin_unlock_irqrestore(&cluster->release_lock, flags);
237}
238
239static void prepare_release(struct task_struct* t, quanta_t at)
240{
241 tsk_pfair(t)->release = at;
242 tsk_pfair(t)->cur = 0;
243}
244
245/* pull released tasks from the release queue */
246static void poll_releases(struct pfair_cluster* cluster)
247{
248 raw_spin_lock(&cluster->release_lock);
249 __merge_ready(&cluster->pfair, &cluster->release_queue);
250 raw_spin_unlock(&cluster->release_lock);
251}
252
253static void check_preempt(struct task_struct* t)
254{
255 int cpu = NO_CPU;
256 if (tsk_rt(t)->linked_on != tsk_rt(t)->scheduled_on &&
257 is_present(t)) {
258 /* the task can be scheduled and
259 * is not scheduled where it ought to be scheduled
260 */
261 cpu = tsk_rt(t)->linked_on != NO_CPU ?
262 tsk_rt(t)->linked_on :
263 tsk_rt(t)->scheduled_on;
264 PTRACE_TASK(t, "linked_on:%d, scheduled_on:%d\n",
265 tsk_rt(t)->linked_on, tsk_rt(t)->scheduled_on);
266 /* preempt */
267 litmus_reschedule(cpu);
268 }
269}
270
271/* caller must hold pfair.ready_lock */
272static void drop_all_references(struct task_struct *t)
273{
274 int cpu;
275 struct pfair_state* s;
276 struct pfair_cluster* cluster;
277 if (bheap_node_in_heap(tsk_rt(t)->heap_node)) {
278 /* It must be in the ready queue; drop references isn't called
279 * when the job is in a release queue. */
280 cluster = tsk_pfair(t)->cluster;
281 bheap_delete(pfair_ready_order, &cluster->pfair.ready_queue,
282 tsk_rt(t)->heap_node);
283 }
284 for (cpu = 0; cpu < num_online_cpus(); cpu++) {
285 s = &per_cpu(pfair_state, cpu);
286 if (s->linked == t)
287 s->linked = NULL;
288 if (s->local == t)
289 s->local = NULL;
290 if (s->scheduled == t)
291 s->scheduled = NULL;
292 }
293 /* make sure we don't have a stale linked_on field */
294 tsk_rt(t)->linked_on = NO_CPU;
295}
296
297static void pfair_prepare_next_period(struct task_struct* t)
298{
299 struct pfair_param* p = tsk_pfair(t);
300
301 prepare_for_next_period(t);
302 tsk_rt(t)->completed = 0;
303 p->release += p->period;
304}
305
306/* returns 1 if the task needs to go the release queue */
307static int advance_subtask(quanta_t time, struct task_struct* t, int cpu)
308{
309 struct pfair_param* p = tsk_pfair(t);
310 int to_relq;
311 p->cur = (p->cur + 1) % p->quanta;
312 if (!p->cur) {
313 if (is_present(t)) {
314 /* The job overran; we start a new budget allocation. */
315 pfair_prepare_next_period(t);
316 } else {
317 /* remove task from system until it wakes */
318 drop_all_references(t);
319 tsk_rt(t)->flags = RT_F_REQUEUE;
320 TRACE_TASK(t, "on %d advanced to subtask %lu (not present)\n",
321 cpu, p->cur);
322 return 0;
323 }
324 }
325 to_relq = time_after(cur_release(t), time);
326 TRACE_TASK(t, "on %d advanced to subtask %lu -> to_relq=%d (cur_release:%lu time:%lu)\n",
327 cpu, p->cur, to_relq, cur_release(t), time);
328 return to_relq;
329}
330
331static void advance_subtasks(struct pfair_cluster *cluster, quanta_t time)
332{
333 struct task_struct* l;
334 struct pfair_param* p;
335 struct list_head* pos;
336 struct pfair_state* cpu;
337
338 list_for_each(pos, &cluster->topology.cpus) {
339 cpu = from_cluster_list(pos);
340 l = cpu->linked;
341 cpu->missed_updates += cpu->linked != cpu->local;
342 if (l) {
343 p = tsk_pfair(l);
344 p->last_quantum = time;
345 p->last_cpu = cpu_id(cpu);
346 if (advance_subtask(time, l, cpu_id(cpu))) {
347 //cpu->linked = NULL;
348 PTRACE_TASK(l, "should go to release queue. "
349 "scheduled_on=%d present=%d\n",
350 tsk_rt(l)->scheduled_on,
351 tsk_rt(l)->present);
352 }
353 }
354 }
355}
356
357static int target_cpu(quanta_t time, struct task_struct* t, int default_cpu)
358{
359 int cpu;
360 if (tsk_rt(t)->scheduled_on != NO_CPU) {
361 /* always observe scheduled_on linkage */
362 default_cpu = tsk_rt(t)->scheduled_on;
363 } else if (tsk_pfair(t)->last_quantum == time - 1) {
364 /* back2back quanta */
365 /* Only observe last_quantum if no scheduled_on is in the way.
366 * This should only kick in if a CPU missed quanta, and that
367 * *should* only happen in QEMU.
368 */
369 cpu = tsk_pfair(t)->last_cpu;
370 if (!pstate[cpu]->linked ||
371 tsk_rt(pstate[cpu]->linked)->scheduled_on != cpu) {
372 default_cpu = cpu;
373 }
374 }
375 return default_cpu;
376}
377
378/* returns one if linking was redirected */
379static int pfair_link(quanta_t time, int cpu,
380 struct task_struct* t)
381{
382 int target = target_cpu(time, t, cpu);
383 struct task_struct* prev = pstate[cpu]->linked;
384 struct task_struct* other;
385 struct pfair_cluster* cluster = cpu_cluster(pstate[cpu]);
386
387 if (target != cpu) {
388 BUG_ON(pstate[target]->topology.cluster != pstate[cpu]->topology.cluster);
389 other = pstate[target]->linked;
390 pstate[target]->linked = t;
391 tsk_rt(t)->linked_on = target;
392 if (!other)
393 /* linked ok, but reschedule this CPU */
394 return 1;
395 if (target < cpu) {
396 /* link other to cpu instead */
397 tsk_rt(other)->linked_on = cpu;
398 pstate[cpu]->linked = other;
399 if (prev) {
400 /* prev got pushed back into the ready queue */
401 tsk_rt(prev)->linked_on = NO_CPU;
402 __add_ready(&cluster->pfair, prev);
403 }
404 /* we are done with this cpu */
405 return 0;
406 } else {
407			/* re-add other; its original CPU was not considered yet */
408 tsk_rt(other)->linked_on = NO_CPU;
409 __add_ready(&cluster->pfair, other);
410 /* reschedule this CPU */
411 return 1;
412 }
413 } else {
414 pstate[cpu]->linked = t;
415 tsk_rt(t)->linked_on = cpu;
416 if (prev) {
417 /* prev got pushed back into the ready queue */
418 tsk_rt(prev)->linked_on = NO_CPU;
419 __add_ready(&cluster->pfair, prev);
420 }
421 /* we are done with this CPU */
422 return 0;
423 }
424}
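/* Return-value contract of pfair_link(), as used by schedule_subtasks():
 *   0 -- 't' (or a task swapped in from the target CPU) is now linked to
 *        'cpu';
 *   1 -- 't' was linked to another CPU instead; this CPU's link is
 *        unchanged and the caller should retry with the next ready task.
 */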
425
426static void schedule_subtasks(struct pfair_cluster *cluster, quanta_t time)
427{
428 int retry;
429 struct list_head *pos;
430 struct pfair_state *cpu_state;
431
432 list_for_each(pos, &cluster->topology.cpus) {
433 cpu_state = from_cluster_list(pos);
434 retry = 1;
435#ifdef CONFIG_RELEASE_MASTER
436 /* skip release master */
437 if (cluster->pfair.release_master == cpu_id(cpu_state))
438 continue;
439#endif
440 while (retry) {
441 if (pfair_higher_prio(__peek_ready(&cluster->pfair),
442 cpu_state->linked))
443 retry = pfair_link(time, cpu_id(cpu_state),
444 __take_ready(&cluster->pfair));
445 else
446 retry = 0;
447 }
448 }
449}
450
451static void schedule_next_quantum(struct pfair_cluster *cluster, quanta_t time)
452{
453 struct pfair_state *cpu;
454 struct list_head* pos;
455
456 /* called with interrupts disabled */
457 PTRACE("--- Q %lu at %llu PRE-SPIN\n",
458 time, litmus_clock());
459 raw_spin_lock(cluster_lock(cluster));
460 PTRACE("<<< Q %lu at %llu\n",
461 time, litmus_clock());
462
463 sched_trace_quantum_boundary();
464
465 advance_subtasks(cluster, time);
466 poll_releases(cluster);
467 schedule_subtasks(cluster, time);
468
469 list_for_each(pos, &cluster->topology.cpus) {
470 cpu = from_cluster_list(pos);
471 if (cpu->linked)
472 PTRACE_TASK(cpu->linked,
473 " linked on %d.\n", cpu_id(cpu));
474 else
475 PTRACE("(null) linked on %d.\n", cpu_id(cpu));
476 }
477 /* We are done. Advance time. */
478 mb();
479 list_for_each(pos, &cluster->topology.cpus) {
480 cpu = from_cluster_list(pos);
481 if (cpu->local_tick != cpu->cur_tick) {
482 TRACE("BAD Quantum not acked on %d "
483 "(l:%lu c:%lu p:%lu)\n",
484 cpu_id(cpu),
485 cpu->local_tick,
486 cpu->cur_tick,
487 cluster->pfair_time);
488 cpu->missed_quanta++;
489 }
490 cpu->cur_tick = time;
491 }
492 PTRACE(">>> Q %lu at %llu\n",
493 time, litmus_clock());
494 raw_spin_unlock(cluster_lock(cluster));
495}
496
497static noinline void wait_for_quantum(quanta_t q, struct pfair_state* state)
498{
499 quanta_t loc;
500
501 goto first; /* skip mb() on first iteration */
502 do {
503 cpu_relax();
504 mb();
505 first: loc = state->cur_tick;
506 /* FIXME: what if loc > cur? */
507 } while (time_before(loc, q));
508 PTRACE("observed cur_tick:%lu >= q:%lu\n",
509 loc, q);
510}
511
512static quanta_t current_quantum(struct pfair_state* state)
513{
514 lt_t t = litmus_clock() - state->offset;
515 return time2quanta(t, FLOOR);
516}
517
518static void catchup_quanta(quanta_t from, quanta_t target,
519 struct pfair_state* state)
520{
521 quanta_t cur = from, time;
522 TRACE("+++< BAD catching up quanta from %lu to %lu\n",
523 from, target);
524 while (time_before(cur, target)) {
525 wait_for_quantum(cur, state);
526 cur++;
527 time = cmpxchg(&cpu_cluster(state)->pfair_time,
528 cur - 1, /* expected */
529 cur /* next */
530 );
531 if (time == cur - 1)
532 schedule_next_quantum(cpu_cluster(state), cur);
533 }
534 TRACE("+++> catching up done\n");
535}
536
537/* pfair_tick - this function is called for every local timer
538 * interrupt.
539 */
540static void pfair_tick(struct task_struct* t)
541{
542 struct pfair_state* state = &__get_cpu_var(pfair_state);
543 quanta_t time, cur;
544 int retry = 10;
545
546 do {
547 cur = current_quantum(state);
548 PTRACE("q %lu at %llu\n", cur, litmus_clock());
549
550 /* Attempt to advance time. First CPU to get here
551 * will prepare the next quantum.
552 */
553 time = cmpxchg(&cpu_cluster(state)->pfair_time,
554 cur - 1, /* expected */
555 cur /* next */
556 );
557 if (time == cur - 1) {
558 /* exchange succeeded */
559 wait_for_quantum(cur - 1, state);
560 schedule_next_quantum(cpu_cluster(state), cur);
561 retry = 0;
562 } else if (time_before(time, cur - 1)) {
563 /* the whole system missed a tick !? */
564 catchup_quanta(time, cur, state);
565 retry--;
566 } else if (time_after(time, cur)) {
567 /* our timer lagging behind!? */
568 TRACE("BAD pfair_time:%lu > cur:%lu\n", time, cur);
569 retry--;
570 } else {
571 /* Some other CPU already started scheduling
572 * this quantum. Let it do its job and then update.
573 */
574 retry = 0;
575 }
576 } while (retry);
577
578 /* Spin locally until time advances. */
579 wait_for_quantum(cur, state);
580
581 /* copy assignment */
582 /* FIXME: what if we race with a future update? Corrupted state? */
583 state->local = state->linked;
584 /* signal that we are done */
585 mb();
586 state->local_tick = state->cur_tick;
587
588 if (state->local != current
589 && (is_realtime(current) || is_present(state->local)))
590 litmus_reschedule_local();
591}
592
593static int safe_to_schedule(struct task_struct* t, int cpu)
594{
595 int where = tsk_rt(t)->scheduled_on;
596 if (where != NO_CPU && where != cpu) {
597 TRACE_TASK(t, "BAD: can't be scheduled on %d, "
598 "scheduled already on %d.\n", cpu, where);
599 return 0;
600 } else
601 return is_present(t) && !is_completed(t);
602}
603
604static struct task_struct* pfair_schedule(struct task_struct * prev)
605{
606 struct pfair_state* state = &__get_cpu_var(pfair_state);
607 struct pfair_cluster* cluster = cpu_cluster(state);
608 int blocks, completion, out_of_time;
609 struct task_struct* next = NULL;
610
611#ifdef CONFIG_RELEASE_MASTER
612 /* Bail out early if we are the release master.
613 * The release master never schedules any real-time tasks.
614 */
615 if (unlikely(cluster->pfair.release_master == cpu_id(state))) {
616 sched_state_task_picked();
617 return NULL;
618 }
619#endif
620
621 raw_spin_lock(cpu_lock(state));
622
623 blocks = is_realtime(prev) && !is_running(prev);
624 completion = is_realtime(prev) && is_completed(prev);
625 out_of_time = is_realtime(prev) && time_after(cur_release(prev),
626 state->local_tick);
627
628 if (is_realtime(prev))
629 PTRACE_TASK(prev, "blocks:%d completion:%d out_of_time:%d\n",
630 blocks, completion, out_of_time);
631
632 if (completion) {
633 sched_trace_task_completion(prev, 0);
634 pfair_prepare_next_period(prev);
635 prepare_release(prev, cur_release(prev));
636 }
637
638 if (!blocks && (completion || out_of_time)) {
639 drop_all_references(prev);
640 sched_trace_task_release(prev);
641 add_release(&cluster->pfair, prev);
642 }
643
644 if (state->local && safe_to_schedule(state->local, cpu_id(state)))
645 next = state->local;
646
647 if (prev != next) {
648 tsk_rt(prev)->scheduled_on = NO_CPU;
649 if (next)
650 tsk_rt(next)->scheduled_on = cpu_id(state);
651 }
652 sched_state_task_picked();
653 raw_spin_unlock(cpu_lock(state));
654
655 if (next)
656 TRACE_TASK(next, "scheduled rel=%lu at %lu (%llu)\n",
657 tsk_pfair(next)->release, cpu_cluster(state)->pfair_time, litmus_clock());
658 else if (is_realtime(prev))
659 TRACE("Becomes idle at %lu (%llu)\n", cpu_cluster(state)->pfair_time, litmus_clock());
660
661 return next;
662}
663
664static void pfair_task_new(struct task_struct * t, int on_rq, int running)
665{
666 unsigned long flags;
667 struct pfair_cluster* cluster;
668
669 TRACE("pfair: task new %d state:%d\n", t->pid, t->state);
670
671 cluster = tsk_pfair(t)->cluster;
672
673 raw_spin_lock_irqsave(cluster_lock(cluster), flags);
674
675 prepare_release(t, cluster->pfair_time + 1);
676
677 t->rt_param.scheduled_on = NO_CPU;
678
679 if (running) {
680#ifdef CONFIG_RELEASE_MASTER
681 if (task_cpu(t) != cluster->pfair.release_master)
682#endif
683 t->rt_param.scheduled_on = task_cpu(t);
684 __add_ready(&cluster->pfair, t);
685 }
686
687 check_preempt(t);
688
689 raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
690}
691
692static void pfair_task_wake_up(struct task_struct *t)
693{
694 unsigned long flags;
695 lt_t now;
696 int requeue = 0;
697 struct pfair_cluster* cluster;
698
699 cluster = tsk_pfair(t)->cluster;
700
701 TRACE_TASK(t, "wakes at %llu, release=%lu, pfair_time:%lu\n",
702 litmus_clock(), cur_release(t), cluster->pfair_time);
703
704 raw_spin_lock_irqsave(cluster_lock(cluster), flags);
705
706 /* If a task blocks and wakes before its next job release,
707 * then it may resume if it is currently linked somewhere
708 * (as if it never blocked at all). Otherwise, we have a
709 * new sporadic job release.
710 */
711 requeue = tsk_rt(t)->flags == RT_F_REQUEUE;
712 now = litmus_clock();
713 if (is_tardy(t, now)) {
714 TRACE_TASK(t, "sporadic release!\n");
715 release_at(t, now);
716 prepare_release(t, time2quanta(now, CEIL));
717 sched_trace_task_release(t);
718 }
719
720 /* only add to ready queue if the task isn't still linked somewhere */
721 if (requeue) {
722 TRACE_TASK(t, "requeueing required\n");
723 tsk_rt(t)->completed = 0;
724 __add_ready(&cluster->pfair, t);
725 }
726
727 check_preempt(t);
728
729 raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
730 TRACE_TASK(t, "wake up done at %llu\n", litmus_clock());
731}
732
733static void pfair_task_block(struct task_struct *t)
734{
735 BUG_ON(!is_realtime(t));
736 TRACE_TASK(t, "blocks at %llu, state:%d\n",
737 litmus_clock(), t->state);
738}
739
740static void pfair_task_exit(struct task_struct * t)
741{
742 unsigned long flags;
743 struct pfair_cluster *cluster;
744
745 BUG_ON(!is_realtime(t));
746
747 cluster = tsk_pfair(t)->cluster;
748
749	/* Remove the task from the release or ready queue, and ensure
750	 * that it is not the scheduled task for ANY CPU. We
751	 * do this blanket check because occasionally, when
752 * tasks exit while blocked, the task_cpu of the task
753 * might not be the same as the CPU that the PFAIR scheduler
754 * has chosen for it.
755 */
756 raw_spin_lock_irqsave(cluster_lock(cluster), flags);
757
758 TRACE_TASK(t, "RIP, state:%d\n", t->state);
759 drop_all_references(t);
760
761 raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
762
763 kfree(t->rt_param.pfair);
764 t->rt_param.pfair = NULL;
765}
766
767
768static void pfair_release_at(struct task_struct* task, lt_t start)
769{
770 unsigned long flags;
771 quanta_t release;
772
773 struct pfair_cluster *cluster;
774
775 cluster = tsk_pfair(task)->cluster;
776
777 BUG_ON(!is_realtime(task));
778
779 raw_spin_lock_irqsave(cluster_lock(cluster), flags);
780 release_at(task, start);
781 release = time2quanta(start, CEIL);
782
783 TRACE_TASK(task, "sys release at %lu\n", release);
784
785 drop_all_references(task);
786 prepare_release(task, release);
787 add_release(&cluster->pfair, task);
788
789 raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
790}
791
792static void init_subtask(struct subtask* sub, unsigned long i,
793 lt_t quanta, lt_t period)
794{
795 /* since i is zero-based, the formulas are shifted by one */
796 lt_t tmp;
797
798 /* release */
799 tmp = period * i;
800 do_div(tmp, quanta); /* floor */
801 sub->release = (quanta_t) tmp;
802
803 /* deadline */
804 tmp = period * (i + 1);
805 if (do_div(tmp, quanta)) /* ceil */
806 tmp++;
807 sub->deadline = (quanta_t) tmp;
808
809 /* next release */
810 tmp = period * (i + 1);
811 do_div(tmp, quanta); /* floor */
812 sub->overlap = sub->deadline - (quanta_t) tmp;
813
814 /* Group deadline.
815 * Based on the formula given in Uma's thesis.
816 */
817 if (2 * quanta >= period) {
818 /* heavy */
819 tmp = (sub->deadline - (i + 1)) * period;
820 if (period > quanta &&
821 do_div(tmp, (period - quanta))) /* ceil */
822 tmp++;
823 sub->group_deadline = (quanta_t) tmp;
824 } else
825 sub->group_deadline = 0;
826}
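/* Worked example (for illustration only): a task with quanta = 3 and
 * period = 5, i.e., weight 3/5 as computed by pfair_admit_task() below,
 * yields the following subtask parameters (all relative to job release):
 *
 *	i | release | deadline | b-bit | group deadline
 *	--+---------+----------+-------+---------------
 *	0 |    0    |    2     |   1   |       3
 *	1 |    1    |    4     |   1   |       5
 *	2 |    3    |    5     |   0   |       5
 *
 * The group deadlines are non-zero because 2 * quanta >= period, i.e., the
 * task is "heavy" per the branch above.
 */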
827
828static void dump_subtasks(struct task_struct* t)
829{
830 unsigned long i;
831 for (i = 0; i < t->rt_param.pfair->quanta; i++)
832 TRACE_TASK(t, "SUBTASK %lu: rel=%lu dl=%lu bbit:%lu gdl:%lu\n",
833 i + 1,
834 t->rt_param.pfair->subtasks[i].release,
835 t->rt_param.pfair->subtasks[i].deadline,
836 t->rt_param.pfair->subtasks[i].overlap,
837 t->rt_param.pfair->subtasks[i].group_deadline);
838}
839
840static long pfair_admit_task(struct task_struct* t)
841{
842 lt_t quanta;
843 lt_t period;
844 s64 quantum_length = ktime_to_ns(tick_period);
845 struct pfair_param* param;
846 unsigned long i;
847
848 /* first check that the task is in the right cluster */
849 if (cpu_cluster(pstate[tsk_rt(t)->task_params.cpu]) !=
850 cpu_cluster(pstate[task_cpu(t)]))
851 return -EINVAL;
852
853 if (get_rt_period(t) != get_rt_relative_deadline(t)) {
854 printk(KERN_INFO "%s: Admission rejected. "
855 "Only implicit deadlines are currently supported.\n",
856 litmus->plugin_name);
857 return -EINVAL;
858 }
859
860 /* Pfair is a tick-based method, so the time
861 * of interest is jiffies. Calculate tick-based
862 * times for everything.
863 * (Ceiling of exec cost, floor of period.)
864 */
865
866 quanta = get_exec_cost(t);
867 period = get_rt_period(t);
868
869 quanta = time2quanta(get_exec_cost(t), CEIL);
870
871 if (do_div(period, quantum_length))
872 printk(KERN_WARNING
873 "The period of %s/%d is not a multiple of %llu.\n",
874 t->comm, t->pid, (unsigned long long) quantum_length);
875
876 if (quanta == period) {
877 /* special case: task has weight 1.0 */
878 printk(KERN_INFO
879 "Admitting weight 1.0 task. (%s/%d, %llu, %llu).\n",
880 t->comm, t->pid, quanta, period);
881 quanta = 1;
882 period = 1;
883 }
884
885 param = kmalloc(sizeof(*param) +
886 quanta * sizeof(struct subtask), GFP_ATOMIC);
887
888 if (!param)
889 return -ENOMEM;
890
891 param->quanta = quanta;
892 param->cur = 0;
893 param->release = 0;
894 param->period = period;
895
896 param->cluster = cpu_cluster(pstate[tsk_rt(t)->task_params.cpu]);
897
898 for (i = 0; i < quanta; i++)
899 init_subtask(param->subtasks + i, i, quanta, period);
900
901 if (t->rt_param.pfair)
902 /* get rid of stale allocation */
903 kfree(t->rt_param.pfair);
904
905 t->rt_param.pfair = param;
906
907 /* spew out some debug info */
908 dump_subtasks(t);
909
910 return 0;
911}
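/* Conversion example (illustration only; the 1 ms quantum is an assumption,
 * the actual length is ktime_to_ns(tick_period)): with quantum_length ==
 * 1,000,000 ns, a task with an execution cost of 2.5 ms and a period of
 * 10 ms is admitted with quanta = 3 (ceiling) and period = 10 quanta, i.e.,
 * as a weight-0.3 Pfair task. A period that is not a multiple of the quantum
 * length only triggers the warning above; do_div() discards the remainder.
 */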
912
913static void pfair_init_cluster(struct pfair_cluster* cluster)
914{
915 rt_domain_init(&cluster->pfair, pfair_ready_order, NULL, pfair_release_jobs);
916 bheap_init(&cluster->release_queue);
917 raw_spin_lock_init(&cluster->release_lock);
918 INIT_LIST_HEAD(&cluster->topology.cpus);
919}
920
921static void cleanup_clusters(void)
922{
923 int i;
924
925 if (num_pfair_clusters)
926 kfree(pfair_clusters);
927 pfair_clusters = NULL;
928 num_pfair_clusters = 0;
929
930 /* avoid stale pointers */
931 for (i = 0; i < num_online_cpus(); i++) {
932 pstate[i]->topology.cluster = NULL;
933 printk("P%d missed %u updates and %u quanta.\n", cpu_id(pstate[i]),
934 pstate[i]->missed_updates, pstate[i]->missed_quanta);
935 }
936}
937
938static long pfair_activate_plugin(void)
939{
940 int err, i;
941 struct pfair_state* state;
942 struct pfair_cluster* cluster ;
943 quanta_t now;
944 int cluster_size;
945 struct cluster_cpu* cpus[NR_CPUS];
946 struct scheduling_cluster* clust[NR_CPUS];
947
948 cluster_size = get_cluster_size(pfair_cluster_level);
949
950 if (cluster_size <= 0 || num_online_cpus() % cluster_size != 0)
951 return -EINVAL;
952
953 num_pfair_clusters = num_online_cpus() / cluster_size;
954
955 pfair_clusters = kzalloc(num_pfair_clusters * sizeof(struct pfair_cluster), GFP_ATOMIC);
956 if (!pfair_clusters) {
957 num_pfair_clusters = 0;
958 printk(KERN_ERR "Could not allocate Pfair clusters!\n");
959 return -ENOMEM;
960 }
961
962 state = &__get_cpu_var(pfair_state);
963 now = current_quantum(state);
964 TRACE("Activating PFAIR at q=%lu\n", now);
965
966 for (i = 0; i < num_pfair_clusters; i++) {
967 cluster = &pfair_clusters[i];
968 pfair_init_cluster(cluster);
969 cluster->pfair_time = now;
970 clust[i] = &cluster->topology;
971#ifdef CONFIG_RELEASE_MASTER
972 cluster->pfair.release_master = atomic_read(&release_master_cpu);
973#endif
974 }
975
976 for (i = 0; i < num_online_cpus(); i++) {
977 state = &per_cpu(pfair_state, i);
978 state->cur_tick = now;
979 state->local_tick = now;
980 state->missed_quanta = 0;
981 state->missed_updates = 0;
982 state->offset = cpu_stagger_offset(i);
983 printk(KERN_ERR "cpus[%d] set; %d\n", i, num_online_cpus());
984 cpus[i] = &state->topology;
985 }
986
987 err = assign_cpus_to_clusters(pfair_cluster_level, clust, num_pfair_clusters,
988 cpus, num_online_cpus());
989
990 if (err < 0)
991 cleanup_clusters();
992
993 return err;
994}
995
996static long pfair_deactivate_plugin(void)
997{
998 cleanup_clusters();
999 return 0;
1000}
1001
1002/* Plugin object */
1003static struct sched_plugin pfair_plugin __cacheline_aligned_in_smp = {
1004 .plugin_name = "PFAIR",
1005 .tick = pfair_tick,
1006 .task_new = pfair_task_new,
1007 .task_exit = pfair_task_exit,
1008 .schedule = pfair_schedule,
1009 .task_wake_up = pfair_task_wake_up,
1010 .task_block = pfair_task_block,
1011 .admit_task = pfair_admit_task,
1012 .release_at = pfair_release_at,
1013 .complete_job = complete_job,
1014 .activate_plugin = pfair_activate_plugin,
1015 .deactivate_plugin = pfair_deactivate_plugin,
1016};
1017
1018
1019static struct proc_dir_entry *cluster_file = NULL, *pfair_dir = NULL;
1020
1021static int __init init_pfair(void)
1022{
1023 int cpu, err, fs;
1024 struct pfair_state *state;
1025
1026 /*
1027	 * initialize the pstate shortcut to the per-cpu pfair state;
1028 * there may be a problem here if someone removes a cpu
1029 * while we are doing this initialization... and if cpus
1030 * are added / removed later... but we don't support CPU hotplug atm anyway.
1031 */
1032 pstate = kmalloc(sizeof(struct pfair_state*) * num_online_cpus(), GFP_KERNEL);
1033
1034 /* initialize CPU state */
1035 for (cpu = 0; cpu < num_online_cpus(); cpu++) {
1036 state = &per_cpu(pfair_state, cpu);
1037 state->topology.id = cpu;
1038 state->cur_tick = 0;
1039 state->local_tick = 0;
1040 state->linked = NULL;
1041 state->local = NULL;
1042 state->scheduled = NULL;
1043 state->missed_quanta = 0;
1044 state->offset = cpu_stagger_offset(cpu);
1045 pstate[cpu] = state;
1046 }
1047
1048 pfair_clusters = NULL;
1049 num_pfair_clusters = 0;
1050
1051 err = register_sched_plugin(&pfair_plugin);
1052 if (!err) {
1053 fs = make_plugin_proc_dir(&pfair_plugin, &pfair_dir);
1054 if (!fs)
1055 cluster_file = create_cluster_file(pfair_dir, &pfair_cluster_level);
1056 else
1057 printk(KERN_ERR "Could not allocate PFAIR procfs dir.\n");
1058 }
1059
1060 return err;
1061}
1062
1063static void __exit clean_pfair(void)
1064{
1065 kfree(pstate);
1066
1067 if (cluster_file)
1068 remove_proc_entry("cluster", pfair_dir);
1069 if (pfair_dir)
1070 remove_plugin_proc_dir(&pfair_plugin);
1071}
1072
1073module_init(init_pfair);
1074module_exit(clean_pfair);
diff --git a/litmus/sched_pfp.c b/litmus/sched_pfp.c
new file mode 100644
index 00000000000..aade0904491
--- /dev/null
+++ b/litmus/sched_pfp.c
@@ -0,0 +1,1751 @@
1/*
2 * litmus/sched_pfp.c
3 *
4 * Implementation of partitioned fixed-priority scheduling.
5 * Based on PSN-EDF.
6 */
7
8#include <linux/percpu.h>
9#include <linux/sched.h>
10#include <linux/list.h>
11#include <linux/spinlock.h>
12#include <linux/module.h>
13
14#include <litmus/litmus.h>
15#include <litmus/wait.h>
16#include <litmus/jobs.h>
17#include <litmus/preempt.h>
18#include <litmus/fp_common.h>
19#include <litmus/sched_plugin.h>
20#include <litmus/sched_trace.h>
21#include <litmus/trace.h>
22#include <litmus/budget.h>
23
24#include <linux/uaccess.h>
25
26
27typedef struct {
28 rt_domain_t domain;
29 struct fp_prio_queue ready_queue;
30 int cpu;
31 struct task_struct* scheduled; /* only RT tasks */
32/*
33 * scheduling lock slock
34 * protects the domain and serializes scheduling decisions
35 */
36#define slock domain.ready_lock
37
38} pfp_domain_t;
39
40DEFINE_PER_CPU(pfp_domain_t, pfp_domains);
41
42pfp_domain_t* pfp_doms[NR_CPUS];
43
44#define local_pfp (&__get_cpu_var(pfp_domains))
45#define remote_dom(cpu) (&per_cpu(pfp_domains, cpu).domain)
46#define remote_pfp(cpu) (&per_cpu(pfp_domains, cpu))
47#define task_dom(task) remote_dom(get_partition(task))
48#define task_pfp(task) remote_pfp(get_partition(task))
49
50/* we assume the lock is being held */
51static void preempt(pfp_domain_t *pfp)
52{
53 preempt_if_preemptable(pfp->scheduled, pfp->cpu);
54}
55
56static unsigned int priority_index(struct task_struct* t)
57{
58#ifdef CONFIG_LITMUS_LOCKING
59 if (unlikely(t->rt_param.inh_task))
60 /* use effective priority */
61 t = t->rt_param.inh_task;
62
63 if (is_priority_boosted(t)) {
64 /* zero is reserved for priority-boosted tasks */
65 return 0;
66 } else
67#endif
68 return get_priority(t);
69}
70
71
72static void pfp_release_jobs(rt_domain_t* rt, struct bheap* tasks)
73{
74 pfp_domain_t *pfp = container_of(rt, pfp_domain_t, domain);
75 unsigned long flags;
76 struct task_struct* t;
77 struct bheap_node* hn;
78
79 raw_spin_lock_irqsave(&pfp->slock, flags);
80
81 while (!bheap_empty(tasks)) {
82 hn = bheap_take(fp_ready_order, tasks);
83 t = bheap2task(hn);
84 TRACE_TASK(t, "released (part:%d prio:%d)\n",
85 get_partition(t), get_priority(t));
86 fp_prio_add(&pfp->ready_queue, t, priority_index(t));
87 }
88
89 /* do we need to preempt? */
90 if (fp_higher_prio(fp_prio_peek(&pfp->ready_queue), pfp->scheduled)) {
91 TRACE_CUR("preempted by new release\n");
92 preempt(pfp);
93 }
94
95 raw_spin_unlock_irqrestore(&pfp->slock, flags);
96}
97
98static void pfp_preempt_check(pfp_domain_t *pfp)
99{
100 if (fp_higher_prio(fp_prio_peek(&pfp->ready_queue), pfp->scheduled))
101 preempt(pfp);
102}
103
104static void pfp_domain_init(pfp_domain_t* pfp,
105 int cpu)
106{
107 fp_domain_init(&pfp->domain, NULL, pfp_release_jobs);
108 pfp->cpu = cpu;
109 pfp->scheduled = NULL;
110 fp_prio_queue_init(&pfp->ready_queue);
111}
112
113static void requeue(struct task_struct* t, pfp_domain_t *pfp)
114{
115 BUG_ON(!is_running(t));
116
117 tsk_rt(t)->completed = 0;
118 if (is_released(t, litmus_clock()))
119 fp_prio_add(&pfp->ready_queue, t, priority_index(t));
120 else
121 add_release(&pfp->domain, t); /* it has got to wait */
122}
123
124static void job_completion(struct task_struct* t, int forced)
125{
126 sched_trace_task_completion(t,forced);
127 TRACE_TASK(t, "job_completion().\n");
128
129 tsk_rt(t)->completed = 1;
130 prepare_for_next_period(t);
131 if (is_released(t, litmus_clock()))
132 sched_trace_task_release(t);
133}
134
135static void pfp_tick(struct task_struct *t)
136{
137 pfp_domain_t *pfp = local_pfp;
138
139 /* Check for inconsistency. We don't need the lock for this since
140 * ->scheduled is only changed in schedule, which obviously is not
141 * executing in parallel on this CPU
142 */
143 BUG_ON(is_realtime(t) && t != pfp->scheduled);
144
145 if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) {
146 if (!is_np(t)) {
147 litmus_reschedule_local();
148 TRACE("pfp_scheduler_tick: "
149 "%d is preemptable "
150 " => FORCE_RESCHED\n", t->pid);
151 } else if (is_user_np(t)) {
152 TRACE("pfp_scheduler_tick: "
153 "%d is non-preemptable, "
154 "preemption delayed.\n", t->pid);
155 request_exit_np(t);
156 }
157 }
158}
159
160static struct task_struct* pfp_schedule(struct task_struct * prev)
161{
162 pfp_domain_t* pfp = local_pfp;
163 struct task_struct* next;
164
165 int out_of_time, sleep, preempt, np, exists, blocks, resched, migrate;
166
167 raw_spin_lock(&pfp->slock);
168
169 /* sanity checking
170	 * unlike under G-EDF, when a task exits (dies),
171	 * pfp->scheduled may be NULL while prev _is_ a real-time task
172 */
173 BUG_ON(pfp->scheduled && pfp->scheduled != prev);
174 BUG_ON(pfp->scheduled && !is_realtime(prev));
175
176 /* (0) Determine state */
177 exists = pfp->scheduled != NULL;
178 blocks = exists && !is_running(pfp->scheduled);
179 out_of_time = exists &&
180 budget_enforced(pfp->scheduled) &&
181 budget_exhausted(pfp->scheduled);
182 np = exists && is_np(pfp->scheduled);
183 sleep = exists && is_completed(pfp->scheduled);
184 migrate = exists && get_partition(pfp->scheduled) != pfp->cpu;
185 preempt = !blocks && (migrate || fp_preemption_needed(&pfp->ready_queue, prev));
186
187 /* If we need to preempt do so.
188 * The following checks set resched to 1 in case of special
189 * circumstances.
190 */
191 resched = preempt;
192
193 /* If a task blocks we have no choice but to reschedule.
194 */
195 if (blocks)
196 resched = 1;
197
198 /* Request a sys_exit_np() call if we would like to preempt but cannot.
199 * Multiple calls to request_exit_np() don't hurt.
200 */
201 if (np && (out_of_time || preempt || sleep))
202 request_exit_np(pfp->scheduled);
203
204 /* Any task that is preemptable and either exhausts its execution
205 * budget or wants to sleep completes. We may have to reschedule after
206 * this.
207 */
208 if (!np && (out_of_time || sleep) && !blocks && !migrate) {
209 job_completion(pfp->scheduled, !sleep);
210 resched = 1;
211 }
212
213 /* The final scheduling decision. Do we need to switch for some reason?
214 * Switch if we are in RT mode and have no task or if we need to
215 * resched.
216 */
217 next = NULL;
218 if ((!np || blocks) && (resched || !exists)) {
219 /* When preempting a task that does not block, then
220 * re-insert it into either the ready queue or the
221 * release queue (if it completed). requeue() picks
222 * the appropriate queue.
223 */
224 if (pfp->scheduled && !blocks && !migrate)
225 requeue(pfp->scheduled, pfp);
226 next = fp_prio_take(&pfp->ready_queue);
227 if (next == prev) {
228 struct task_struct *t = fp_prio_peek(&pfp->ready_queue);
229 TRACE_TASK(next, "next==prev sleep=%d oot=%d np=%d preempt=%d migrate=%d "
230 "boost=%d empty=%d prio-idx=%u prio=%u\n",
231 sleep, out_of_time, np, preempt, migrate,
232 is_priority_boosted(next),
233 t == NULL,
234 priority_index(next),
235 get_priority(next));
236 if (t)
237 TRACE_TASK(t, "waiter boost=%d prio-idx=%u prio=%u\n",
238 is_priority_boosted(t),
239 priority_index(t),
240 get_priority(t));
241 }
242 /* If preempt is set, we should not see the same task again. */
243 BUG_ON(preempt && next == prev);
244 /* Similarly, if preempt is set, then next may not be NULL,
245 * unless it's a migration. */
246 BUG_ON(preempt && !migrate && next == NULL);
247 } else
248 /* Only override Linux scheduler if we have a real-time task
249 * scheduled that needs to continue.
250 */
251 if (exists)
252 next = prev;
253
254 if (next) {
255 TRACE_TASK(next, "scheduled at %llu\n", litmus_clock());
256 tsk_rt(next)->completed = 0;
257 } else {
258 TRACE("becoming idle at %llu\n", litmus_clock());
259 }
260
261 pfp->scheduled = next;
262 sched_state_task_picked();
263 raw_spin_unlock(&pfp->slock);
264
265 return next;
266}
267
268#ifdef CONFIG_LITMUS_LOCKING
269
270/* prev is no longer scheduled --- see if it needs to migrate */
271static void pfp_finish_switch(struct task_struct *prev)
272{
273 pfp_domain_t *to;
274
275 if (is_realtime(prev) &&
276 is_running(prev) &&
277 get_partition(prev) != smp_processor_id()) {
278 TRACE_TASK(prev, "needs to migrate from P%d to P%d\n",
279 smp_processor_id(), get_partition(prev));
280
281 to = task_pfp(prev);
282
283 raw_spin_lock(&to->slock);
284
285 TRACE_TASK(prev, "adding to queue on P%d\n", to->cpu);
286 requeue(prev, to);
287 if (fp_preemption_needed(&to->ready_queue, to->scheduled))
288 preempt(to);
289
290 raw_spin_unlock(&to->slock);
291
292 }
293}
294
295#endif
296
297/* Prepare a task for running in RT mode
298 */
299static void pfp_task_new(struct task_struct * t, int on_rq, int running)
300{
301 pfp_domain_t* pfp = task_pfp(t);
302 unsigned long flags;
303
304 TRACE_TASK(t, "P-FP: task new, cpu = %d\n",
305 t->rt_param.task_params.cpu);
306
307 /* setup job parameters */
308 release_at(t, litmus_clock());
309
310 /* The task should be running in the queue, otherwise signal
311 * code will try to wake it up with fatal consequences.
312 */
313 raw_spin_lock_irqsave(&pfp->slock, flags);
314 if (running) {
315 /* there shouldn't be anything else running at the time */
316 BUG_ON(pfp->scheduled);
317 pfp->scheduled = t;
318 } else {
319 requeue(t, pfp);
320 /* maybe we have to reschedule */
321 pfp_preempt_check(pfp);
322 }
323 raw_spin_unlock_irqrestore(&pfp->slock, flags);
324}
325
326static void pfp_task_wake_up(struct task_struct *task)
327{
328 unsigned long flags;
329 pfp_domain_t* pfp = task_pfp(task);
330 lt_t now;
331
332 TRACE_TASK(task, "wake_up at %llu\n", litmus_clock());
333 raw_spin_lock_irqsave(&pfp->slock, flags);
334
335#ifdef CONFIG_LITMUS_LOCKING
336 /* Should only be queued when processing a fake-wake up due to a
337 * migration-related state change. */
338 if (unlikely(is_queued(task))) {
339 TRACE_TASK(task, "WARNING: waking task still queued. Is this right?\n");
340 goto out_unlock;
341 }
342#else
343 BUG_ON(is_queued(task));
344#endif
345 now = litmus_clock();
346 if (is_sporadic(task) && is_tardy(task, now)
347#ifdef CONFIG_LITMUS_LOCKING
348 /* We need to take suspensions because of semaphores into
349	 * account! If a job resumes after being suspended while waiting
350	 * to acquire a semaphore, it should never be treated as a new job release.
351 */
352 && !is_priority_boosted(task)
353#endif
354 ) {
355 /* new sporadic release */
356 release_at(task, now);
357 sched_trace_task_release(task);
358 }
359
360 /* Only add to ready queue if it is not the currently-scheduled
361 * task. This could be the case if a task was woken up concurrently
362 * on a remote CPU before the executing CPU got around to actually
363 * de-scheduling the task, i.e., wake_up() raced with schedule()
364 * and won. Also, don't requeue if it is still queued, which can
365	 * happen under the DPCP due to wake-ups racing with migrations.
366 */
367 if (pfp->scheduled != task) {
368 requeue(task, pfp);
369 pfp_preempt_check(pfp);
370 }
371
372#ifdef CONFIG_LITMUS_LOCKING
373out_unlock:
374#endif
375 raw_spin_unlock_irqrestore(&pfp->slock, flags);
376 TRACE_TASK(task, "wake up done\n");
377}
378
379static void pfp_task_block(struct task_struct *t)
380{
381 /* only running tasks can block, thus t is in no queue */
382 TRACE_TASK(t, "block at %llu, state=%d\n", litmus_clock(), t->state);
383
384 BUG_ON(!is_realtime(t));
385
386 /* If this task blocked normally, it shouldn't be queued. The exception is
387 * if this is a simulated block()/wakeup() pair from the pull-migration code path.
388 * This should only happen if the DPCP is being used.
389 */
390#ifdef CONFIG_LITMUS_LOCKING
391 if (unlikely(is_queued(t)))
392 TRACE_TASK(t, "WARNING: blocking task still queued. Is this right?\n");
393#else
394 BUG_ON(is_queued(t));
395#endif
396}
397
398static void pfp_task_exit(struct task_struct * t)
399{
400 unsigned long flags;
401 pfp_domain_t* pfp = task_pfp(t);
402 rt_domain_t* dom;
403
404 raw_spin_lock_irqsave(&pfp->slock, flags);
405 if (is_queued(t)) {
406 BUG(); /* This currently doesn't work. */
407 /* dequeue */
408 dom = task_dom(t);
409 remove(dom, t);
410 }
411 if (pfp->scheduled == t) {
412 pfp->scheduled = NULL;
413 preempt(pfp);
414 }
415 TRACE_TASK(t, "RIP, now reschedule\n");
416
417 raw_spin_unlock_irqrestore(&pfp->slock, flags);
418}
419
420#ifdef CONFIG_LITMUS_LOCKING
421
422#include <litmus/fdso.h>
423#include <litmus/srp.h>
424
425static void fp_dequeue(pfp_domain_t* pfp, struct task_struct* t)
426{
427 BUG_ON(pfp->scheduled == t && is_queued(t));
428 if (is_queued(t))
429 fp_prio_remove(&pfp->ready_queue, t, priority_index(t));
430}
431
432static void fp_set_prio_inh(pfp_domain_t* pfp, struct task_struct* t,
433 struct task_struct* prio_inh)
434{
435 int requeue;
436
437 if (!t || t->rt_param.inh_task == prio_inh) {
438 /* no update required */
439 if (t)
440 TRACE_TASK(t, "no prio-inh update required\n");
441 return;
442 }
443
444 requeue = is_queued(t);
445 TRACE_TASK(t, "prio-inh: is_queued:%d\n", requeue);
446
447 if (requeue)
448 /* first remove */
449 fp_dequeue(pfp, t);
450
451 t->rt_param.inh_task = prio_inh;
452
453 if (requeue)
454 /* add again to the right queue */
455 fp_prio_add(&pfp->ready_queue, t, priority_index(t));
456}
457
458static int effective_agent_priority(int prio)
459{
460 /* make sure agents have higher priority */
461 return prio - LITMUS_MAX_PRIORITY;
462}
463
464static lt_t prio_point(int eprio)
465{
466 /* make sure we have non-negative prio points */
467 return eprio + LITMUS_MAX_PRIORITY;
468}
469
470static int prio_from_point(lt_t prio_point)
471{
472 return ((int) prio_point) - LITMUS_MAX_PRIORITY;
473}
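/* Illustration (the numeric value is an assumption, not taken from this
 * file): if LITMUS_MAX_PRIORITY were 512, a locking-protocol agent acting
 * for a task of priority 7 would run at effective priority 7 - 512 = -505,
 * i.e., ahead of every regular task, and prio_point()/prio_from_point()
 * shift that value by +512/-512 so it can be stored as a non-negative
 * lt_t priority point.
 */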
474
475static void boost_priority(struct task_struct* t, lt_t priority_point)
476{
477 unsigned long flags;
478 pfp_domain_t* pfp = task_pfp(t);
479
480 raw_spin_lock_irqsave(&pfp->slock, flags);
481
482
483 TRACE_TASK(t, "priority boosted at %llu\n", litmus_clock());
484
485 tsk_rt(t)->priority_boosted = 1;
486 /* tie-break by protocol-specific priority point */
487 tsk_rt(t)->boost_start_time = priority_point;
488
489 /* Priority boosting currently only takes effect for already-scheduled
490 * tasks. This is sufficient since priority boosting only kicks in as
491 * part of lock acquisitions. */
492 BUG_ON(pfp->scheduled != t);
493
494 raw_spin_unlock_irqrestore(&pfp->slock, flags);
495}
496
497static void unboost_priority(struct task_struct* t)
498{
499 unsigned long flags;
500 pfp_domain_t* pfp = task_pfp(t);
501 lt_t now;
502
503 raw_spin_lock_irqsave(&pfp->slock, flags);
504 now = litmus_clock();
505
506 /* assumption: this only happens when the job is scheduled */
507 BUG_ON(pfp->scheduled != t);
508
509 TRACE_TASK(t, "priority restored at %llu\n", now);
510
511 /* priority boosted jobs must be scheduled */
512 BUG_ON(pfp->scheduled != t);
513
514 tsk_rt(t)->priority_boosted = 0;
515 tsk_rt(t)->boost_start_time = 0;
516
517 /* check if this changes anything */
518 if (fp_preemption_needed(&pfp->ready_queue, pfp->scheduled))
519 preempt(pfp);
520
521 raw_spin_unlock_irqrestore(&pfp->slock, flags);
522}
523
524/* ******************** SRP support ************************ */
525
526static unsigned int pfp_get_srp_prio(struct task_struct* t)
527{
528 return get_priority(t);
529}
530
531/* ******************** FMLP support ********************** */
532
533struct fmlp_semaphore {
534 struct litmus_lock litmus_lock;
535
536 /* current resource holder */
537 struct task_struct *owner;
538
539 /* FIFO queue of waiting tasks */
540 wait_queue_head_t wait;
541};
542
543static inline struct fmlp_semaphore* fmlp_from_lock(struct litmus_lock* lock)
544{
545 return container_of(lock, struct fmlp_semaphore, litmus_lock);
546}
547int pfp_fmlp_lock(struct litmus_lock* l)
548{
549 struct task_struct* t = current;
550 struct fmlp_semaphore *sem = fmlp_from_lock(l);
551 wait_queue_t wait;
552 unsigned long flags;
553 lt_t time_of_request;
554
555 if (!is_realtime(t))
556 return -EPERM;
557
558 /* prevent nested lock acquisition --- not supported by FMLP */
559 if (tsk_rt(t)->num_locks_held ||
560 tsk_rt(t)->num_local_locks_held)
561 return -EBUSY;
562
563 spin_lock_irqsave(&sem->wait.lock, flags);
564
565 /* tie-break by this point in time */
566 time_of_request = litmus_clock();
567
568 /* Priority-boost ourself *before* we suspend so that
569 * our priority is boosted when we resume. */
570 boost_priority(t, time_of_request);
571
572 if (sem->owner) {
573 /* resource is not free => must suspend and wait */
574
575 init_waitqueue_entry(&wait, t);
576
577 /* FIXME: interruptible would be nice some day */
578 set_task_state(t, TASK_UNINTERRUPTIBLE);
579
580 __add_wait_queue_tail_exclusive(&sem->wait, &wait);
581
582 TS_LOCK_SUSPEND;
583
584 /* release lock before sleeping */
585 spin_unlock_irqrestore(&sem->wait.lock, flags);
586
587 /* We depend on the FIFO order. Thus, we don't need to recheck
588 * when we wake up; we are guaranteed to have the lock since
589 * there is only one wake up per release.
590 */
591
592 schedule();
593
594 TS_LOCK_RESUME;
595
596 /* Since we hold the lock, no other task will change
597 * ->owner. We can thus check it without acquiring the spin
598 * lock. */
599 BUG_ON(sem->owner != t);
600 } else {
601 /* it's ours now */
602 sem->owner = t;
603
604 spin_unlock_irqrestore(&sem->wait.lock, flags);
605 }
606
607 tsk_rt(t)->num_locks_held++;
608
609 return 0;
610}
611
612int pfp_fmlp_unlock(struct litmus_lock* l)
613{
614 struct task_struct *t = current, *next;
615 struct fmlp_semaphore *sem = fmlp_from_lock(l);
616 unsigned long flags;
617 int err = 0;
618
619 spin_lock_irqsave(&sem->wait.lock, flags);
620
621 if (sem->owner != t) {
622 err = -EINVAL;
623 goto out;
624 }
625
626 tsk_rt(t)->num_locks_held--;
627
628 /* we lose the benefit of priority boosting */
629
630 unboost_priority(t);
631
632 /* check if there are jobs waiting for this resource */
633 next = __waitqueue_remove_first(&sem->wait);
634 if (next) {
635		/* next becomes the resource holder */
636 sem->owner = next;
637
638 /* Wake up next. The waiting job is already priority-boosted. */
639 wake_up_process(next);
640 } else
641 /* resource becomes available */
642 sem->owner = NULL;
643
644out:
645 spin_unlock_irqrestore(&sem->wait.lock, flags);
646 return err;
647}
648
649int pfp_fmlp_close(struct litmus_lock* l)
650{
651 struct task_struct *t = current;
652 struct fmlp_semaphore *sem = fmlp_from_lock(l);
653 unsigned long flags;
654
655 int owner;
656
657 spin_lock_irqsave(&sem->wait.lock, flags);
658
659 owner = sem->owner == t;
660
661 spin_unlock_irqrestore(&sem->wait.lock, flags);
662
663 if (owner)
664 pfp_fmlp_unlock(l);
665
666 return 0;
667}
668
669void pfp_fmlp_free(struct litmus_lock* lock)
670{
671 kfree(fmlp_from_lock(lock));
672}
673
674static struct litmus_lock_ops pfp_fmlp_lock_ops = {
675 .close = pfp_fmlp_close,
676 .lock = pfp_fmlp_lock,
677 .unlock = pfp_fmlp_unlock,
678 .deallocate = pfp_fmlp_free,
679};
680
681static struct litmus_lock* pfp_new_fmlp(void)
682{
683 struct fmlp_semaphore* sem;
684
685 sem = kmalloc(sizeof(*sem), GFP_KERNEL);
686 if (!sem)
687 return NULL;
688
689 sem->owner = NULL;
690 init_waitqueue_head(&sem->wait);
691 sem->litmus_lock.ops = &pfp_fmlp_lock_ops;
692
693 return &sem->litmus_lock;
694}
695
696/* ******************** MPCP support ********************** */
697
698struct mpcp_semaphore {
699 struct litmus_lock litmus_lock;
700
701 /* current resource holder */
702 struct task_struct *owner;
703
704 /* priority queue of waiting tasks */
705 wait_queue_head_t wait;
706
707 /* priority ceiling per cpu */
708 unsigned int prio_ceiling[NR_CPUS];
709
710 /* should jobs spin "virtually" for this resource? */
711 int vspin;
712};
713
714#define OMEGA_CEILING UINT_MAX
715
716/* Since jobs spin "virtually" while waiting to acquire a lock,
717 * they first must acquire a local per-cpu resource.
718 */
719static DEFINE_PER_CPU(wait_queue_head_t, mpcpvs_vspin_wait);
720static DEFINE_PER_CPU(struct task_struct*, mpcpvs_vspin);
721
722/* called with preemptions off <=> no local modifications */
723static void mpcp_vspin_enter(void)
724{
725 struct task_struct* t = current;
726
727 while (1) {
728 if (__get_cpu_var(mpcpvs_vspin) == NULL) {
729 /* good, we get to issue our request */
730 __get_cpu_var(mpcpvs_vspin) = t;
731 break;
732 } else {
733 /* some job is spinning => enqueue in request queue */
734 prio_wait_queue_t wait;
735 wait_queue_head_t* vspin = &__get_cpu_var(mpcpvs_vspin_wait);
736 unsigned long flags;
737
738 /* ordered by regular priority */
739 init_prio_waitqueue_entry(&wait, t, prio_point(get_priority(t)));
740
741 spin_lock_irqsave(&vspin->lock, flags);
742
743 set_task_state(t, TASK_UNINTERRUPTIBLE);
744
745 __add_wait_queue_prio_exclusive(vspin, &wait);
746
747 spin_unlock_irqrestore(&vspin->lock, flags);
748
749 TS_LOCK_SUSPEND;
750
751 preempt_enable_no_resched();
752
753 schedule();
754
755 preempt_disable();
756
757 TS_LOCK_RESUME;
758 /* Recheck if we got it --- some higher-priority process might
759 * have swooped in. */
760 }
761 }
762 /* ok, now it is ours */
763}
764
765/* called with preemptions off */
766static void mpcp_vspin_exit(void)
767{
768 struct task_struct* t = current, *next;
769 unsigned long flags;
770 wait_queue_head_t* vspin = &__get_cpu_var(mpcpvs_vspin_wait);
771
772 BUG_ON(__get_cpu_var(mpcpvs_vspin) != t);
773
774 /* no spinning job */
775 __get_cpu_var(mpcpvs_vspin) = NULL;
776
777 /* see if anyone is waiting for us to stop "spinning" */
778 spin_lock_irqsave(&vspin->lock, flags);
779 next = __waitqueue_remove_first(vspin);
780
781 if (next)
782 wake_up_process(next);
783
784 spin_unlock_irqrestore(&vspin->lock, flags);
785}
786
787static inline struct mpcp_semaphore* mpcp_from_lock(struct litmus_lock* lock)
788{
789 return container_of(lock, struct mpcp_semaphore, litmus_lock);
790}
791
792int pfp_mpcp_lock(struct litmus_lock* l)
793{
794 struct task_struct* t = current;
795 struct mpcp_semaphore *sem = mpcp_from_lock(l);
796 prio_wait_queue_t wait;
797 unsigned long flags;
798
799 if (!is_realtime(t))
800 return -EPERM;
801
802 /* prevent nested lock acquisition */
803 if (tsk_rt(t)->num_locks_held ||
804 tsk_rt(t)->num_local_locks_held)
805 return -EBUSY;
806
807 preempt_disable();
808
809 if (sem->vspin)
810 mpcp_vspin_enter();
811
812 /* Priority-boost ourself *before* we suspend so that
813 * our priority is boosted when we resume. Use the priority
814 * ceiling for the local partition. */
815 boost_priority(t, sem->prio_ceiling[get_partition(t)]);
816
817 spin_lock_irqsave(&sem->wait.lock, flags);
818
819 preempt_enable_no_resched();
820
821 if (sem->owner) {
822 /* resource is not free => must suspend and wait */
823
824 /* ordered by regular priority */
825 init_prio_waitqueue_entry(&wait, t, prio_point(get_priority(t)));
826
827 /* FIXME: interruptible would be nice some day */
828 set_task_state(t, TASK_UNINTERRUPTIBLE);
829
830 __add_wait_queue_prio_exclusive(&sem->wait, &wait);
831
832 TS_LOCK_SUSPEND;
833
834 /* release lock before sleeping */
835 spin_unlock_irqrestore(&sem->wait.lock, flags);
836
837		/* We depend on the priority order of the wait queue and on the
838		 * fact that there is only one wake-up per release; thus we don't
839		 * need to recheck when we wake up, since we are guaranteed to hold the lock.
840		 */
841
842 schedule();
843
844 TS_LOCK_RESUME;
845
846 /* Since we hold the lock, no other task will change
847 * ->owner. We can thus check it without acquiring the spin
848 * lock. */
849 BUG_ON(sem->owner != t);
850 } else {
851 /* it's ours now */
852 sem->owner = t;
853
854 spin_unlock_irqrestore(&sem->wait.lock, flags);
855 }
856
857 tsk_rt(t)->num_locks_held++;
858
859 return 0;
860}
861
862int pfp_mpcp_unlock(struct litmus_lock* l)
863{
864 struct task_struct *t = current, *next;
865 struct mpcp_semaphore *sem = mpcp_from_lock(l);
866 unsigned long flags;
867 int err = 0;
868
869 spin_lock_irqsave(&sem->wait.lock, flags);
870
871 if (sem->owner != t) {
872 err = -EINVAL;
873 goto out;
874 }
875
876
877 tsk_rt(t)->num_locks_held--;
878
879 /* we lose the benefit of priority boosting */
880
881 unboost_priority(t);
882
883 /* check if there are jobs waiting for this resource */
884 next = __waitqueue_remove_first(&sem->wait);
885 if (next) {
886		/* next becomes the resource holder */
887 sem->owner = next;
888
889 /* Wake up next. The waiting job is already priority-boosted. */
890 wake_up_process(next);
891 } else
892 /* resource becomes available */
893 sem->owner = NULL;
894
895out:
896 spin_unlock_irqrestore(&sem->wait.lock, flags);
897
898 if (sem->vspin && err == 0) {
899 preempt_disable();
900 mpcp_vspin_exit();
901 preempt_enable();
902 }
903
904 return err;
905}
906
907int pfp_mpcp_open(struct litmus_lock* l, void* config)
908{
909 struct task_struct *t = current;
910 struct mpcp_semaphore *sem = mpcp_from_lock(l);
911 int cpu, local_cpu;
912 unsigned long flags;
913
914 if (!is_realtime(t))
915 /* we need to know the real-time priority */
916 return -EPERM;
917
918 local_cpu = get_partition(t);
919
920 spin_lock_irqsave(&sem->wait.lock, flags);
921
922 for (cpu = 0; cpu < NR_CPUS; cpu++)
923 if (cpu != local_cpu)
924 {
925 sem->prio_ceiling[cpu] = min(sem->prio_ceiling[cpu],
926 get_priority(t));
927 TRACE_CUR("priority ceiling for sem %p is now %d on cpu %d\n",
928 sem, sem->prio_ceiling[cpu], cpu);
929 }
930
931 spin_unlock_irqrestore(&sem->wait.lock, flags);
932
933 return 0;
934}
935
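pfp_mpcp_open() above records, for every CPU other than the opener's own partition, the highest priority (numerically smallest value) of any remote task that may use the semaphore; this is the per-CPU ceiling later passed to boost_priority(). A standalone sketch of that update rule (not kernel code; NR_TOY_CPUS, TOY_OMEGA and the two task priorities are invented for illustration):

#include <assert.h>

#define NR_TOY_CPUS 4
#define TOY_OMEGA   (~0u)   /* plays the role of OMEGA_CEILING: "no ceiling yet" */

static unsigned int toy_min(unsigned int a, unsigned int b) { return a < b ? a : b; }

/* mirror of the loop in pfp_mpcp_open(): skip the opener's own partition */
static void toy_open(unsigned int ceiling[], int local_cpu, unsigned int prio)
{
	int cpu;
	for (cpu = 0; cpu < NR_TOY_CPUS; cpu++)
		if (cpu != local_cpu)
			ceiling[cpu] = toy_min(ceiling[cpu], prio);
}

int main(void)
{
	unsigned int ceiling[NR_TOY_CPUS] = { TOY_OMEGA, TOY_OMEGA, TOY_OMEGA, TOY_OMEGA };

	toy_open(ceiling, 0, 3);  /* user with priority 3, partitioned onto CPU 0 */
	toy_open(ceiling, 1, 7);  /* user with priority 7, partitioned onto CPU 1 */

	assert(ceiling[0] == 7);  /* CPU 0 only sees the remote user's priority */
	assert(ceiling[1] == 3);  /* CPU 1 only sees the priority-3 user        */
	assert(ceiling[2] == 3);  /* CPUs without local users see the minimum   */
	return 0;
}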
936int pfp_mpcp_close(struct litmus_lock* l)
937{
938 struct task_struct *t = current;
939 struct mpcp_semaphore *sem = mpcp_from_lock(l);
940 unsigned long flags;
941
942 int owner;
943
944 spin_lock_irqsave(&sem->wait.lock, flags);
945
946 owner = sem->owner == t;
947
948 spin_unlock_irqrestore(&sem->wait.lock, flags);
949
950 if (owner)
951 pfp_mpcp_unlock(l);
952
953 return 0;
954}
955
956void pfp_mpcp_free(struct litmus_lock* lock)
957{
958 kfree(mpcp_from_lock(lock));
959}
960
961static struct litmus_lock_ops pfp_mpcp_lock_ops = {
962 .close = pfp_mpcp_close,
963 .lock = pfp_mpcp_lock,
964 .open = pfp_mpcp_open,
965 .unlock = pfp_mpcp_unlock,
966 .deallocate = pfp_mpcp_free,
967};
968
969static struct litmus_lock* pfp_new_mpcp(int vspin)
970{
971 struct mpcp_semaphore* sem;
972 int cpu;
973
974 sem = kmalloc(sizeof(*sem), GFP_KERNEL);
975 if (!sem)
976 return NULL;
977
978 sem->owner = NULL;
979 init_waitqueue_head(&sem->wait);
980 sem->litmus_lock.ops = &pfp_mpcp_lock_ops;
981
982 for (cpu = 0; cpu < NR_CPUS; cpu++)
983 sem->prio_ceiling[cpu] = OMEGA_CEILING;
984
985 /* mark as virtual spinning */
986 sem->vspin = vspin;
987
988 return &sem->litmus_lock;
989}
990
991
992/* ******************** PCP support ********************** */
993
994
995struct pcp_semaphore {
996 struct litmus_lock litmus_lock;
997
998 struct list_head ceiling;
999
1000 /* current resource holder */
1001 struct task_struct *owner;
1002
1003 /* priority ceiling --- can be negative due to DPCP support */
1004 int prio_ceiling;
1005
1006 /* on which processor is this PCP semaphore allocated? */
1007 int on_cpu;
1008};
1009
1010static inline struct pcp_semaphore* pcp_from_lock(struct litmus_lock* lock)
1011{
1012 return container_of(lock, struct pcp_semaphore, litmus_lock);
1013}
1014
1015
1016struct pcp_state {
1017 struct list_head system_ceiling;
1018
1019 /* highest-priority waiting task */
1020 struct task_struct* hp_waiter;
1021
1022 /* list of jobs waiting to get past the system ceiling */
1023 wait_queue_head_t ceiling_blocked;
1024};
1025
1026static void pcp_init_state(struct pcp_state* s)
1027{
1028 INIT_LIST_HEAD(&s->system_ceiling);
1029 s->hp_waiter = NULL;
1030 init_waitqueue_head(&s->ceiling_blocked);
1031}
1032
1033static DEFINE_PER_CPU(struct pcp_state, pcp_state);
1034
1035/* assumes preemptions are off */
1036static struct pcp_semaphore* pcp_get_ceiling(void)
1037{
1038 struct list_head* top = __get_cpu_var(pcp_state).system_ceiling.next;
1039
1040 if (top)
1041 return list_entry(top, struct pcp_semaphore, ceiling);
1042 else
1043 return NULL;
1044}
1045
1046/* assumes preempt off */
1047static void pcp_add_ceiling(struct pcp_semaphore* sem)
1048{
1049 struct list_head *pos;
1050 struct list_head *in_use = &__get_cpu_var(pcp_state).system_ceiling;
1051 struct pcp_semaphore* held;
1052
1053 BUG_ON(sem->on_cpu != smp_processor_id());
1054 BUG_ON(in_list(&sem->ceiling));
1055
1056 list_for_each(pos, in_use) {
1057 held = list_entry(pos, struct pcp_semaphore, ceiling);
1058 if (held->prio_ceiling >= sem->prio_ceiling) {
1059 __list_add(&sem->ceiling, pos->prev, pos);
1060 return;
1061 }
1062 }
1063
1064 /* we hit the end of the list */
1065
1066 list_add_tail(&sem->ceiling, in_use);
1067}
1068
1069/* assumes preempt off */
1070static int pcp_exceeds_ceiling(struct pcp_semaphore* ceiling,
1071 struct task_struct* task,
1072 int effective_prio)
1073{
1074 return ceiling == NULL ||
1075 ceiling->prio_ceiling > effective_prio ||
1076 ceiling->owner == task;
1077}
1078
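pcp_exceeds_ceiling() above encodes the classic PCP rule: a request may proceed only if no resource currently contributes to the system ceiling, the requester's effective priority beats that ceiling, or the requester itself holds the ceiling resource. A standalone sketch of the three cases, assuming the usual fixed-priority convention that smaller numbers mean higher priority (not kernel code; the struct and values are hypothetical):

#include <assert.h>
#include <stddef.h>

struct toy_ceiling {
	int prio_ceiling;   /* priority ceiling of the held resource (smaller = higher) */
	int owner;          /* hypothetical id of the task holding it                   */
};

static int toy_exceeds_ceiling(const struct toy_ceiling *c, int task, int eprio)
{
	return c == NULL                 /* nothing held => no system ceiling   */
	    || c->prio_ceiling > eprio   /* requester outranks the ceiling      */
	    || c->owner == task;         /* the ceiling holder itself proceeds  */
}

int main(void)
{
	struct toy_ceiling c = { .prio_ceiling = 5, .owner = 1 };

	assert(toy_exceeds_ceiling(NULL, 2, 10));  /* no ceiling in place             */
	assert(toy_exceeds_ceiling(&c, 2, 3));     /* priority 3 beats ceiling 5      */
	assert(!toy_exceeds_ceiling(&c, 2, 7));    /* priority 7 is ceiling-blocked   */
	assert(toy_exceeds_ceiling(&c, 1, 7));     /* ... unless it holds the ceiling */
	return 0;
}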
1079/* assumes preempt off */
1080static void pcp_priority_inheritance(void)
1081{
1082 unsigned long flags;
1083 pfp_domain_t* pfp = local_pfp;
1084
1085 struct pcp_semaphore* ceiling = pcp_get_ceiling();
1086 struct task_struct *blocker, *blocked;
1087
1088 blocker = ceiling ? ceiling->owner : NULL;
1089 blocked = __get_cpu_var(pcp_state).hp_waiter;
1090
1091 raw_spin_lock_irqsave(&pfp->slock, flags);
1092
1093 /* Current is no longer inheriting anything by default. This should be
1094 * the currently scheduled job, and hence not currently queued. */
1095 BUG_ON(current != pfp->scheduled);
1096
1097 fp_set_prio_inh(pfp, current, NULL);
1098 fp_set_prio_inh(pfp, blocked, NULL);
1099 fp_set_prio_inh(pfp, blocker, NULL);
1100
1101
1102 /* Let blocking job inherit priority of blocked job, if required. */
1103 if (blocker && blocked &&
1104 fp_higher_prio(blocked, blocker)) {
1105 TRACE_TASK(blocker, "PCP inherits from %s/%d (prio %u -> %u) \n",
1106 blocked->comm, blocked->pid,
1107 get_priority(blocker), get_priority(blocked));
1108 fp_set_prio_inh(pfp, blocker, blocked);
1109 }
1110
1111 /* Check if anything changed. If the blocked job is current, then it is
1112 * just blocking and hence is going to call the scheduler anyway. */
1113 if (blocked != current &&
1114 fp_higher_prio(fp_prio_peek(&pfp->ready_queue), pfp->scheduled))
1115 preempt(pfp);
1116
1117 raw_spin_unlock_irqrestore(&pfp->slock, flags);
1118}
1119
1120/* called with preemptions off */
1121static void pcp_raise_ceiling(struct pcp_semaphore* sem,
1122 int effective_prio)
1123{
1124 struct task_struct* t = current;
1125 struct pcp_semaphore* ceiling;
1126 prio_wait_queue_t wait;
1127 unsigned int waiting_higher_prio;
1128
1129 do {
1130 ceiling = pcp_get_ceiling();
1131 if (pcp_exceeds_ceiling(ceiling, t, effective_prio))
1132 break;
1133
1134 TRACE_CUR("PCP ceiling-blocked, wanted sem %p, but %s/%d has the ceiling \n",
1135 sem, ceiling->owner->comm, ceiling->owner->pid);
1136
1137 /* we need to wait until the ceiling is lowered */
1138
1139 /* enqueue in priority order */
1140 init_prio_waitqueue_entry(&wait, t, prio_point(effective_prio));
1141 set_task_state(t, TASK_UNINTERRUPTIBLE);
1142 waiting_higher_prio = add_wait_queue_prio_exclusive(
1143 &__get_cpu_var(pcp_state).ceiling_blocked, &wait);
1144
1145 if (waiting_higher_prio == 0) {
1146 TRACE_CUR("PCP new highest-prio waiter => prio inheritance\n");
1147
1148 /* we are the new highest-priority waiting job
1149 * => update inheritance */
1150 __get_cpu_var(pcp_state).hp_waiter = t;
1151 pcp_priority_inheritance();
1152 }
1153
1154 TS_LOCK_SUSPEND;
1155
1156 preempt_enable_no_resched();
1157 schedule();
1158 preempt_disable();
1159
1160 /* pcp_resume_unblocked() removed us from wait queue */
1161
1162 TS_LOCK_RESUME;
1163 } while(1);
1164
1165 TRACE_CUR("PCP got the ceiling and sem %p\n", sem);
1166
1167 /* We are good to go. The semaphore should be available. */
1168 BUG_ON(sem->owner != NULL);
1169
1170 sem->owner = t;
1171
1172 pcp_add_ceiling(sem);
1173}
1174
1175static void pcp_resume_unblocked(void)
1176{
1177 wait_queue_head_t *blocked = &__get_cpu_var(pcp_state).ceiling_blocked;
1178 unsigned long flags;
1179 prio_wait_queue_t* q;
1180 struct task_struct* t = NULL;
1181
1182 struct pcp_semaphore* ceiling = pcp_get_ceiling();
1183
1184 spin_lock_irqsave(&blocked->lock, flags);
1185
1186 while (waitqueue_active(blocked)) {
1187 /* check first == highest-priority waiting job */
1188 q = list_entry(blocked->task_list.next,
1189 prio_wait_queue_t, wq.task_list);
1190 t = (struct task_struct*) q->wq.private;
1191
1192 /* can it proceed now? => let it go */
1193 if (pcp_exceeds_ceiling(ceiling, t,
1194 prio_from_point(q->priority))) {
1195 __remove_wait_queue(blocked, &q->wq);
1196 wake_up_process(t);
1197 } else {
1198 /* We are done. Update highest-priority waiter. */
1199 __get_cpu_var(pcp_state).hp_waiter = t;
1200 goto out;
1201 }
1202 }
1203 /* If we get here, then there are no more waiting
1204 * jobs. */
1205 __get_cpu_var(pcp_state).hp_waiter = NULL;
1206out:
1207 spin_unlock_irqrestore(&blocked->lock, flags);
1208}
1209
1210/* assumes preempt off */
1211static void pcp_lower_ceiling(struct pcp_semaphore* sem)
1212{
1213 BUG_ON(!in_list(&sem->ceiling));
1214 BUG_ON(sem->owner != current);
1215 BUG_ON(sem->on_cpu != smp_processor_id());
1216
1217 /* remove from ceiling list */
1218 list_del(&sem->ceiling);
1219
1220 /* release */
1221 sem->owner = NULL;
1222
1223 TRACE_CUR("PCP released sem %p\n", sem);
1224
1225 pcp_priority_inheritance();
1226
1227 /* Wake up all ceiling-blocked jobs that now pass the ceiling. */
1228 pcp_resume_unblocked();
1229}
1230
1231static void pcp_update_prio_ceiling(struct pcp_semaphore* sem,
1232 int effective_prio)
1233{
1234 /* This needs to be synchronized on something.
1235	 * Might as well use the waitqueue lock of the target processor.
1236	 * We assume this happens only before the task set starts execution
1237	 * (i.e., during initialization), but it may happen on multiple processors
1238 * at the same time.
1239 */
1240 unsigned long flags;
1241
1242 struct pcp_state* s = &per_cpu(pcp_state, sem->on_cpu);
1243
1244 spin_lock_irqsave(&s->ceiling_blocked.lock, flags);
1245
1246 sem->prio_ceiling = min(sem->prio_ceiling, effective_prio);
1247
1248 spin_unlock_irqrestore(&s->ceiling_blocked.lock, flags);
1249}
1250
1251static void pcp_init_semaphore(struct pcp_semaphore* sem, int cpu)
1252{
1253 sem->owner = NULL;
1254 INIT_LIST_HEAD(&sem->ceiling);
1255 sem->prio_ceiling = INT_MAX;
1256 sem->on_cpu = cpu;
1257}
1258
1259int pfp_pcp_lock(struct litmus_lock* l)
1260{
1261 struct task_struct* t = current;
1262 struct pcp_semaphore *sem = pcp_from_lock(l);
1263
1264 int eprio = effective_agent_priority(get_priority(t));
1265 int from = get_partition(t);
1266 int to = sem->on_cpu;
1267
1268 if (!is_realtime(t) || from != to)
1269 return -EPERM;
1270
1271	/* prevent nested lock acquisition while in a global critical section */
1272 if (tsk_rt(t)->num_locks_held)
1273 return -EBUSY;
1274
1275 preempt_disable();
1276
1277 pcp_raise_ceiling(sem, eprio);
1278
1279 preempt_enable();
1280
1281 tsk_rt(t)->num_local_locks_held++;
1282
1283 return 0;
1284}
1285
1286int pfp_pcp_unlock(struct litmus_lock* l)
1287{
1288 struct task_struct *t = current;
1289 struct pcp_semaphore *sem = pcp_from_lock(l);
1290
1291 int err = 0;
1292
1293 preempt_disable();
1294
1295 if (sem->on_cpu != smp_processor_id() || sem->owner != t) {
1296 err = -EINVAL;
1297 goto out;
1298 }
1299
1300 tsk_rt(t)->num_local_locks_held--;
1301
1302 /* give it back */
1303 pcp_lower_ceiling(sem);
1304
1305out:
1306 preempt_enable();
1307
1308 return err;
1309}
1310
1311int pfp_pcp_open(struct litmus_lock* l, void* __user config)
1312{
1313 struct task_struct *t = current;
1314 struct pcp_semaphore *sem = pcp_from_lock(l);
1315
1316 int cpu, eprio;
1317
1318 if (!is_realtime(t))
1319 /* we need to know the real-time priority */
1320 return -EPERM;
1321
1322 if (!config)
1323 cpu = get_partition(t);
1324 else if (get_user(cpu, (int*) config))
1325 return -EFAULT;
1326
1327 /* make sure the resource location matches */
1328 if (cpu != sem->on_cpu)
1329 return -EINVAL;
1330
1331 eprio = effective_agent_priority(get_priority(t));
1332
1333 pcp_update_prio_ceiling(sem, eprio);
1334
1335 return 0;
1336}
1337
1338int pfp_pcp_close(struct litmus_lock* l)
1339{
1340 struct task_struct *t = current;
1341 struct pcp_semaphore *sem = pcp_from_lock(l);
1342
1343 int owner = 0;
1344
1345 preempt_disable();
1346
1347 if (sem->on_cpu == smp_processor_id())
1348 owner = sem->owner == t;
1349
1350 preempt_enable();
1351
1352 if (owner)
1353 pfp_pcp_unlock(l);
1354
1355 return 0;
1356}
1357
1358void pfp_pcp_free(struct litmus_lock* lock)
1359{
1360 kfree(pcp_from_lock(lock));
1361}
1362
1363
1364static struct litmus_lock_ops pfp_pcp_lock_ops = {
1365 .close = pfp_pcp_close,
1366 .lock = pfp_pcp_lock,
1367 .open = pfp_pcp_open,
1368 .unlock = pfp_pcp_unlock,
1369 .deallocate = pfp_pcp_free,
1370};
1371
1372
1373static struct litmus_lock* pfp_new_pcp(int on_cpu)
1374{
1375 struct pcp_semaphore* sem;
1376
1377 sem = kmalloc(sizeof(*sem), GFP_KERNEL);
1378 if (!sem)
1379 return NULL;
1380
1381 sem->litmus_lock.ops = &pfp_pcp_lock_ops;
1382 pcp_init_semaphore(sem, on_cpu);
1383
1384 return &sem->litmus_lock;
1385}
1386
1387/* ******************** DPCP support ********************** */
1388
1389struct dpcp_semaphore {
1390 struct litmus_lock litmus_lock;
1391 struct pcp_semaphore pcp;
1392 int owner_cpu;
1393};
1394
1395static inline struct dpcp_semaphore* dpcp_from_lock(struct litmus_lock* lock)
1396{
1397 return container_of(lock, struct dpcp_semaphore, litmus_lock);
1398}
1399
1400/* called with preemptions disabled */
1401static void pfp_migrate_to(int target_cpu)
1402{
1403 struct task_struct* t = current;
1404 pfp_domain_t *from;
1405
1406 if (get_partition(t) == target_cpu)
1407 return;
1408
1409 /* make sure target_cpu makes sense */
1410 BUG_ON(!cpu_online(target_cpu));
1411
1412 local_irq_disable();
1413
1414 /* scheduled task should not be in any ready or release queue */
1415 BUG_ON(is_queued(t));
1416
1417	/* update the task's partition under its current domain's lock */
1418 from = task_pfp(t);
1419
1420 raw_spin_lock(&from->slock);
1421
1422 /* switch partitions */
1423 tsk_rt(t)->task_params.cpu = target_cpu;
1424
1425 raw_spin_unlock(&from->slock);
1426
1427 /* Don't trace scheduler costs as part of
1428 * locking overhead. Scheduling costs are accounted for
1429 * explicitly. */
1430 TS_LOCK_SUSPEND;
1431
1432 local_irq_enable();
1433 preempt_enable_no_resched();
1434
1435 /* deschedule to be migrated */
1436 schedule();
1437
1438 /* we are now on the target processor */
1439 preempt_disable();
1440
1441 /* start recording costs again */
1442 TS_LOCK_RESUME;
1443
1444 BUG_ON(smp_processor_id() != target_cpu);
1445}
1446
1447int pfp_dpcp_lock(struct litmus_lock* l)
1448{
1449 struct task_struct* t = current;
1450 struct dpcp_semaphore *sem = dpcp_from_lock(l);
1451 int eprio = effective_agent_priority(get_priority(t));
1452 int from = get_partition(t);
1453 int to = sem->pcp.on_cpu;
1454
1455 if (!is_realtime(t))
1456 return -EPERM;
1457
1458	/* prevent nested lock acquisition */
1459 if (tsk_rt(t)->num_locks_held ||
1460 tsk_rt(t)->num_local_locks_held)
1461 return -EBUSY;
1462
1463 preempt_disable();
1464
1465 /* Priority-boost ourself *before* we suspend so that
1466 * our priority is boosted when we resume. */
1467
1468 boost_priority(t, get_priority(t));
1469
1470 pfp_migrate_to(to);
1471
1472 pcp_raise_ceiling(&sem->pcp, eprio);
1473
1474 /* yep, we got it => execute request */
1475 sem->owner_cpu = from;
1476
1477 preempt_enable();
1478
1479 tsk_rt(t)->num_locks_held++;
1480
1481 return 0;
1482}
1483
1484int pfp_dpcp_unlock(struct litmus_lock* l)
1485{
1486 struct task_struct *t = current;
1487 struct dpcp_semaphore *sem = dpcp_from_lock(l);
1488 int err = 0;
1489 int home;
1490
1491 preempt_disable();
1492
1493 if (sem->pcp.on_cpu != smp_processor_id() || sem->pcp.owner != t) {
1494 err = -EINVAL;
1495 goto out;
1496 }
1497
1498 tsk_rt(t)->num_locks_held--;
1499
1500 home = sem->owner_cpu;
1501
1502 /* give it back */
1503 pcp_lower_ceiling(&sem->pcp);
1504
1505 /* we lose the benefit of priority boosting */
1506 unboost_priority(t);
1507
1508 pfp_migrate_to(home);
1509
1510out:
1511 preempt_enable();
1512
1513 return err;
1514}
1515
1516int pfp_dpcp_open(struct litmus_lock* l, void* __user config)
1517{
1518 struct task_struct *t = current;
1519 struct dpcp_semaphore *sem = dpcp_from_lock(l);
1520 int cpu, eprio;
1521
1522 if (!is_realtime(t))
1523 /* we need to know the real-time priority */
1524 return -EPERM;
1525
1526 if (get_user(cpu, (int*) config))
1527 return -EFAULT;
1528
1529 /* make sure the resource location matches */
1530 if (cpu != sem->pcp.on_cpu)
1531 return -EINVAL;
1532
1533 eprio = effective_agent_priority(get_priority(t));
1534
1535 pcp_update_prio_ceiling(&sem->pcp, eprio);
1536
1537 return 0;
1538}
1539
1540int pfp_dpcp_close(struct litmus_lock* l)
1541{
1542 struct task_struct *t = current;
1543 struct dpcp_semaphore *sem = dpcp_from_lock(l);
1544 int owner = 0;
1545
1546 preempt_disable();
1547
1548 if (sem->pcp.on_cpu == smp_processor_id())
1549 owner = sem->pcp.owner == t;
1550
1551 preempt_enable();
1552
1553 if (owner)
1554 pfp_dpcp_unlock(l);
1555
1556 return 0;
1557}
1558
1559void pfp_dpcp_free(struct litmus_lock* lock)
1560{
1561 kfree(dpcp_from_lock(lock));
1562}
1563
1564static struct litmus_lock_ops pfp_dpcp_lock_ops = {
1565 .close = pfp_dpcp_close,
1566 .lock = pfp_dpcp_lock,
1567 .open = pfp_dpcp_open,
1568 .unlock = pfp_dpcp_unlock,
1569 .deallocate = pfp_dpcp_free,
1570};
1571
1572static struct litmus_lock* pfp_new_dpcp(int on_cpu)
1573{
1574 struct dpcp_semaphore* sem;
1575
1576 sem = kmalloc(sizeof(*sem), GFP_KERNEL);
1577 if (!sem)
1578 return NULL;
1579
1580 sem->litmus_lock.ops = &pfp_dpcp_lock_ops;
1581 sem->owner_cpu = NO_CPU;
1582 pcp_init_semaphore(&sem->pcp, on_cpu);
1583
1584 return &sem->litmus_lock;
1585}
1586
1587
1588/* **** lock constructor **** */
1589
1590
1591static long pfp_allocate_lock(struct litmus_lock **lock, int type,
1592 void* __user config)
1593{
1594 int err = -ENXIO, cpu;
1595 struct srp_semaphore* srp;
1596
1597	/* P-FP currently supports the SRP and PCP for local resources and the
1598	 * FMLP, MPCP (with or without virtual spinning), and DPCP for global resources. */
1599 switch (type) {
1600 case FMLP_SEM:
1601		/* Flexible Multiprocessor Locking Protocol (FIFO-queued mutex) */
1602 *lock = pfp_new_fmlp();
1603 if (*lock)
1604 err = 0;
1605 else
1606 err = -ENOMEM;
1607 break;
1608
1609 case MPCP_SEM:
1610		/* Multiprocessor Priority Ceiling Protocol */
1611 *lock = pfp_new_mpcp(0);
1612 if (*lock)
1613 err = 0;
1614 else
1615 err = -ENOMEM;
1616 break;
1617
1618 case MPCP_VS_SEM:
1619		/* Multiprocessor Priority Ceiling Protocol with virtual spinning */
1620 *lock = pfp_new_mpcp(1);
1621 if (*lock)
1622 err = 0;
1623 else
1624 err = -ENOMEM;
1625 break;
1626
1627 case DPCP_SEM:
1628 /* Distributed Priority Ceiling Protocol */
1629 if (get_user(cpu, (int*) config))
1630 return -EFAULT;
1631
1632 if (!cpu_online(cpu))
1633 return -EINVAL;
1634
1635 *lock = pfp_new_dpcp(cpu);
1636 if (*lock)
1637 err = 0;
1638 else
1639 err = -ENOMEM;
1640 break;
1641
1642 case SRP_SEM:
1643 /* Baker's Stack Resource Policy */
1644 srp = allocate_srp_semaphore();
1645 if (srp) {
1646 *lock = &srp->litmus_lock;
1647 err = 0;
1648 } else
1649 err = -ENOMEM;
1650 break;
1651
1652 case PCP_SEM:
1653 /* Priority Ceiling Protocol */
1654 if (!config)
1655 cpu = get_partition(current);
1656 else if (get_user(cpu, (int*) config))
1657 return -EFAULT;
1658
1659 if (!cpu_online(cpu))
1660 return -EINVAL;
1661
1662 *lock = pfp_new_pcp(cpu);
1663 if (*lock)
1664 err = 0;
1665 else
1666 err = -ENOMEM;
1667 break;
1668 };
1669
1670 return err;
1671}
1672
1673#endif
1674
1675static long pfp_admit_task(struct task_struct* tsk)
1676{
1677 if (task_cpu(tsk) == tsk->rt_param.task_params.cpu &&
1678#ifdef CONFIG_RELEASE_MASTER
1679 /* don't allow tasks on release master CPU */
1680 task_cpu(tsk) != remote_dom(task_cpu(tsk))->release_master &&
1681#endif
1682 litmus_is_valid_fixed_prio(get_priority(tsk)))
1683 return 0;
1684 else
1685 return -EINVAL;
1686}
1687
1688static long pfp_activate_plugin(void)
1689{
1690#if defined(CONFIG_RELEASE_MASTER) || defined(CONFIG_LITMUS_LOCKING)
1691 int cpu;
1692#endif
1693
1694#ifdef CONFIG_RELEASE_MASTER
1695 for_each_online_cpu(cpu) {
1696 remote_dom(cpu)->release_master = atomic_read(&release_master_cpu);
1697 }
1698#endif
1699
1700#ifdef CONFIG_LITMUS_LOCKING
1701 get_srp_prio = pfp_get_srp_prio;
1702
1703 for_each_online_cpu(cpu) {
1704 init_waitqueue_head(&per_cpu(mpcpvs_vspin_wait, cpu));
1705 per_cpu(mpcpvs_vspin, cpu) = NULL;
1706
1707 pcp_init_state(&per_cpu(pcp_state, cpu));
1708 pfp_doms[cpu] = remote_pfp(cpu);
1709 }
1710
1711#endif
1712
1713 return 0;
1714}
1715
1716
1717/* Plugin object */
1718static struct sched_plugin pfp_plugin __cacheline_aligned_in_smp = {
1719 .plugin_name = "P-FP",
1720 .tick = pfp_tick,
1721 .task_new = pfp_task_new,
1722 .complete_job = complete_job,
1723 .task_exit = pfp_task_exit,
1724 .schedule = pfp_schedule,
1725 .task_wake_up = pfp_task_wake_up,
1726 .task_block = pfp_task_block,
1727 .admit_task = pfp_admit_task,
1728 .activate_plugin = pfp_activate_plugin,
1729#ifdef CONFIG_LITMUS_LOCKING
1730 .allocate_lock = pfp_allocate_lock,
1731 .finish_switch = pfp_finish_switch,
1732#endif
1733};
1734
1735
1736static int __init init_pfp(void)
1737{
1738 int i;
1739
1740	/* We do not really want to support CPU hotplug, do we? ;)
1741	 * However, if we were ever crazy enough to do so,
1742	 * iterating with num_online_cpus() here would not be safe.
1743	 */
1744 for (i = 0; i < num_online_cpus(); i++) {
1745 pfp_domain_init(remote_pfp(i), i);
1746 }
1747 return register_sched_plugin(&pfp_plugin);
1748}
1749
1750module_init(init_pfp);
1751
diff --git a/litmus/sched_plugin.c b/litmus/sched_plugin.c
new file mode 100644
index 00000000000..00a1900d645
--- /dev/null
+++ b/litmus/sched_plugin.c
@@ -0,0 +1,227 @@
1/* sched_plugin.c -- core infrastructure for the scheduler plugin system
2 *
3 * This file includes the initialization of the plugin system, the no-op Linux
4 * scheduler plugin, some dummy functions, and some helper functions.
5 */
6
7#include <linux/list.h>
8#include <linux/spinlock.h>
9#include <linux/sched.h>
10
11#include <litmus/litmus.h>
12#include <litmus/sched_plugin.h>
13#include <litmus/preempt.h>
14#include <litmus/jobs.h>
15
16/*
17 * Generic function to trigger preemption on either local or remote cpu
18 * from scheduler plugins. The key feature is that this function is
19 * non-preemptive section aware and does not invoke the scheduler / send
20 * IPIs if the to-be-preempted task is actually non-preemptive.
21 */
22void preempt_if_preemptable(struct task_struct* t, int cpu)
23{
24	/* t is the real-time task executing on CPU cpu. If t is NULL, then
25	 * CPU cpu is currently scheduling background work.
26 */
27
28 int reschedule = 0;
29
30 if (!t)
31 /* move non-real-time task out of the way */
32 reschedule = 1;
33 else {
34 if (smp_processor_id() == cpu) {
35 /* local CPU case */
36 /* check if we need to poke userspace */
37 if (is_user_np(t))
38 /* Yes, poke it. This doesn't have to be atomic since
39 * the task is definitely not executing. */
40 request_exit_np(t);
41 else if (!is_kernel_np(t))
42 /* only if we are allowed to preempt the
43 * currently-executing task */
44 reschedule = 1;
45 } else {
46 /* Remote CPU case. Only notify if it's not a kernel
47 * NP section and if we didn't set the userspace
48 * flag. */
49 reschedule = !(is_kernel_np(t) || request_exit_np_atomic(t));
50 }
51 }
52 if (likely(reschedule))
53 litmus_reschedule(cpu);
54}
55
56
57/*************************************************************
58 * Dummy plugin functions *
59 *************************************************************/
60
61static void litmus_dummy_finish_switch(struct task_struct * prev)
62{
63}
64
65static struct task_struct* litmus_dummy_schedule(struct task_struct * prev)
66{
67 sched_state_task_picked();
68 return NULL;
69}
70
71static void litmus_dummy_tick(struct task_struct* tsk)
72{
73}
74
75static long litmus_dummy_admit_task(struct task_struct* tsk)
76{
77 printk(KERN_CRIT "LITMUS^RT: Linux plugin rejects %s/%d.\n",
78 tsk->comm, tsk->pid);
79 return -EINVAL;
80}
81
82static void litmus_dummy_task_new(struct task_struct *t, int on_rq, int running)
83{
84}
85
86static void litmus_dummy_task_wake_up(struct task_struct *task)
87{
88}
89
90static void litmus_dummy_task_block(struct task_struct *task)
91{
92}
93
94static void litmus_dummy_task_exit(struct task_struct *task)
95{
96}
97
98static long litmus_dummy_complete_job(void)
99{
100 return -ENOSYS;
101}
102
103static long litmus_dummy_activate_plugin(void)
104{
105 return 0;
106}
107
108static long litmus_dummy_deactivate_plugin(void)
109{
110 return 0;
111}
112
113#ifdef CONFIG_LITMUS_LOCKING
114
115static long litmus_dummy_allocate_lock(struct litmus_lock **lock, int type,
116 void* __user config)
117{
118 return -ENXIO;
119}
120
121#endif
122
123
124/* The default scheduler plugin. It doesn't do anything and lets Linux do its
125 * job.
126 */
127struct sched_plugin linux_sched_plugin = {
128 .plugin_name = "Linux",
129 .tick = litmus_dummy_tick,
130 .task_new = litmus_dummy_task_new,
131 .task_exit = litmus_dummy_task_exit,
132 .task_wake_up = litmus_dummy_task_wake_up,
133 .task_block = litmus_dummy_task_block,
134 .complete_job = litmus_dummy_complete_job,
135 .schedule = litmus_dummy_schedule,
136 .finish_switch = litmus_dummy_finish_switch,
137 .activate_plugin = litmus_dummy_activate_plugin,
138 .deactivate_plugin = litmus_dummy_deactivate_plugin,
139#ifdef CONFIG_LITMUS_LOCKING
140 .allocate_lock = litmus_dummy_allocate_lock,
141#endif
142 .admit_task = litmus_dummy_admit_task
143};
144
145/*
146 * The reference to the current plugin that is used to schedule tasks within
147 * the system. It stores references to the actual function implementations.
148 * It should be initialized by calling "init_***_plugin()".
149 */
150struct sched_plugin *litmus = &linux_sched_plugin;
151
152/* the list of registered scheduling plugins */
153static LIST_HEAD(sched_plugins);
154static DEFINE_RAW_SPINLOCK(sched_plugins_lock);
155
156#define CHECK(func) {\
157 if (!plugin->func) \
158 plugin->func = litmus_dummy_ ## func;}
159
160/* FIXME: get reference to module */
161int register_sched_plugin(struct sched_plugin* plugin)
162{
163 printk(KERN_INFO "Registering LITMUS^RT plugin %s.\n",
164 plugin->plugin_name);
165
166 /* make sure we don't trip over null pointers later */
167 CHECK(finish_switch);
168 CHECK(schedule);
169 CHECK(tick);
170 CHECK(task_wake_up);
171 CHECK(task_exit);
172 CHECK(task_block);
173 CHECK(task_new);
174 CHECK(complete_job);
175 CHECK(activate_plugin);
176 CHECK(deactivate_plugin);
177#ifdef CONFIG_LITMUS_LOCKING
178 CHECK(allocate_lock);
179#endif
180 CHECK(admit_task);
181
182 if (!plugin->release_at)
183 plugin->release_at = release_at;
184
185 raw_spin_lock(&sched_plugins_lock);
186 list_add(&plugin->list, &sched_plugins);
187 raw_spin_unlock(&sched_plugins_lock);
188
189 return 0;
190}
191
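register_sched_plugin() fills every callback a plugin leaves NULL with the corresponding litmus_dummy_*() stub via CHECK(), so a plugin only has to implement the hooks it actually needs. A hypothetical minimal plugin is sketched below; the "DEMO" name and the demo_* callbacks are invented for illustration and only follow the signatures used in this file:

#include <linux/module.h>
#include <litmus/litmus.h>
#include <litmus/preempt.h>
#include <litmus/sched_plugin.h>

static struct task_struct* demo_schedule(struct task_struct *prev)
{
	sched_state_task_picked();  /* every schedule() callback must report this */
	return NULL;                /* defer to Linux for background work */
}

static long demo_admit_task(struct task_struct *tsk)
{
	return 0;                   /* accept every task (illustration only) */
}

static struct sched_plugin demo_plugin = {
	.plugin_name = "DEMO",
	.schedule    = demo_schedule,
	.admit_task  = demo_admit_task,
	/* all other callbacks stay NULL and are patched to litmus_dummy_*() */
};

static int __init init_demo(void)
{
	return register_sched_plugin(&demo_plugin);
}

module_init(init_demo);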
192
193/* FIXME: reference counting, etc. */
194struct sched_plugin* find_sched_plugin(const char* name)
195{
196 struct list_head *pos;
197 struct sched_plugin *plugin;
198
199 raw_spin_lock(&sched_plugins_lock);
200 list_for_each(pos, &sched_plugins) {
201 plugin = list_entry(pos, struct sched_plugin, list);
202 if (!strcmp(plugin->plugin_name, name))
203 goto out_unlock;
204 }
205 plugin = NULL;
206
207out_unlock:
208 raw_spin_unlock(&sched_plugins_lock);
209 return plugin;
210}
211
212int print_sched_plugins(char* buf, int max)
213{
214 int count = 0;
215 struct list_head *pos;
216 struct sched_plugin *plugin;
217
218 raw_spin_lock(&sched_plugins_lock);
219 list_for_each(pos, &sched_plugins) {
220 plugin = list_entry(pos, struct sched_plugin, list);
221 count += snprintf(buf + count, max - count, "%s\n", plugin->plugin_name);
222 if (max - count <= 0)
223 break;
224 }
225 raw_spin_unlock(&sched_plugins_lock);
226 return count;
227}
diff --git a/litmus/sched_psn_edf.c b/litmus/sched_psn_edf.c
new file mode 100644
index 00000000000..6f4d4adcec0
--- /dev/null
+++ b/litmus/sched_psn_edf.c
@@ -0,0 +1,999 @@
1/*
2 * kernel/sched_psn_edf.c
3 *
4 * Implementation of the PSN-EDF scheduler plugin.
5 * Based on kern/sched_part_edf.c and kern/sched_gsn_edf.c.
6 *
7 * Suspensions and non-preemptable sections are supported.
8 * Priority inheritance is not supported.
9 */
10
11#include <linux/percpu.h>
12#include <linux/sched.h>
13#include <linux/list.h>
14#include <linux/spinlock.h>
15#include <linux/module.h>
16
17#include <litmus/litmus.h>
18#include <litmus/jobs.h>
19#include <litmus/preempt.h>
20#include <litmus/budget.h>
21#include <litmus/sched_plugin.h>
22#include <litmus/edf_common.h>
23#include <litmus/sched_trace.h>
24#include <litmus/trace.h>
25#include <litmus/fdso.h>
26
27typedef struct {
28 rt_domain_t domain;
29 int cpu;
30 struct task_struct* scheduled; /* only RT tasks */
31/*
32 * scheduling lock slock
33 * protects the domain and serializes scheduling decisions
34 */
35#define slock domain.ready_lock
36
37} psnedf_domain_t;
38
39DEFINE_PER_CPU(psnedf_domain_t, psnedf_domains);
40
41#define local_edf (&__get_cpu_var(psnedf_domains).domain)
42#define local_pedf (&__get_cpu_var(psnedf_domains))
43#define remote_edf(cpu) (&per_cpu(psnedf_domains, cpu).domain)
44#define remote_pedf(cpu) (&per_cpu(psnedf_domains, cpu))
45#define task_edf(task) remote_edf(get_partition(task))
46#define task_pedf(task) remote_pedf(get_partition(task))
47
48
49static void psnedf_domain_init(psnedf_domain_t* pedf,
50 check_resched_needed_t check,
51 release_jobs_t release,
52 int cpu)
53{
54 edf_domain_init(&pedf->domain, check, release);
55 pedf->cpu = cpu;
56 pedf->scheduled = NULL;
57}
58
59static void requeue(struct task_struct* t, rt_domain_t *edf)
60{
61 if (t->state != TASK_RUNNING)
62 STRACE_TASK(t, "requeue: !TASK_RUNNING\n");
63
64 tsk_rt(t)->completed = 0;
65 if (is_early_releasing(t) || is_released(t, litmus_clock()))
66 __add_ready(edf, t);
67 else
68 add_release(edf, t); /* it has got to wait */
69}
70
71/* we assume the lock is being held */
72static void preempt(psnedf_domain_t *pedf)
73{
74 preempt_if_preemptable(pedf->scheduled, pedf->cpu);
75}
76
77#ifdef CONFIG_LITMUS_LOCKING
78
79static void boost_priority(struct task_struct* t)
80{
81 unsigned long flags;
82 psnedf_domain_t* pedf = task_pedf(t);
83 lt_t now;
84
85 raw_spin_lock_irqsave(&pedf->slock, flags);
86 now = litmus_clock();
87
88 STRACE_TASK(t, "priority boosted at %llu\n", now);
89
90 tsk_rt(t)->priority_boosted = 1;
91 tsk_rt(t)->boost_start_time = now;
92
93 if (pedf->scheduled != t) {
94 /* holder may be queued: first stop queue changes */
95 raw_spin_lock(&pedf->domain.release_lock);
96 if (is_queued(t) &&
97 /* If it is queued, then we need to re-order. */
98 bheap_decrease(edf_ready_order, tsk_rt(t)->heap_node) &&
99 /* If we bubbled to the top, then we need to check for preemptions. */
100 edf_preemption_needed(&pedf->domain, pedf->scheduled))
101 preempt(pedf);
102 raw_spin_unlock(&pedf->domain.release_lock);
103 } /* else: nothing to do since the job is not queued while scheduled */
104
105 raw_spin_unlock_irqrestore(&pedf->slock, flags);
106}
107
108static void unboost_priority(struct task_struct* t)
109{
110 unsigned long flags;
111 psnedf_domain_t* pedf = task_pedf(t);
112 lt_t now;
113
114 raw_spin_lock_irqsave(&pedf->slock, flags);
115 now = litmus_clock();
116
117 /* assumption: this only happens when the job is scheduled */
118 BUG_ON(pedf->scheduled != t);
119
120 STRACE_TASK(t, "priority restored at %llu\n", now);
121
122 /* priority boosted jobs must be scheduled */
123 BUG_ON(pedf->scheduled != t);
124
125 tsk_rt(t)->priority_boosted = 0;
126 tsk_rt(t)->boost_start_time = 0;
127
128 /* check if this changes anything */
129 if (edf_preemption_needed(&pedf->domain, pedf->scheduled))
130 preempt(pedf);
131
132 raw_spin_unlock_irqrestore(&pedf->slock, flags);
133}
134
135#endif
136
137static int psnedf_preempt_check(psnedf_domain_t *pedf)
138{
139 if (edf_preemption_needed(&pedf->domain, pedf->scheduled)) {
140 preempt(pedf);
141 return 1;
142 } else
143 return 0;
144}
145
146/* This check is trivial in partitioned systems as we only have to consider
147 * the CPU of the partition.
148 */
149static int psnedf_check_resched(rt_domain_t *edf)
150{
151 psnedf_domain_t *pedf = container_of(edf, psnedf_domain_t, domain);
152
153 /* because this is a callback from rt_domain_t we already hold
154 * the necessary lock for the ready queue
155 */
156 return psnedf_preempt_check(pedf);
157}
158
159static void job_completion(struct task_struct* t, int forced)
160{
161 sched_trace_task_completion(t,forced);
162 STRACE_TASK(t, "job_completion().\n");
163
164 tsk_rt(t)->completed = 1;
165 prepare_for_next_period(t);
166}
167
168static void psnedf_tick(struct task_struct *t)
169{
170 psnedf_domain_t *pedf = local_pedf;
171
172 /* Check for inconsistency. We don't need the lock for this since
173 * ->scheduled is only changed in schedule, which obviously is not
174 * executing in parallel on this CPU
175 */
176 BUG_ON(is_realtime(t) && t != pedf->scheduled);
177
178 if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) {
179 if (!is_np(t)) {
180 litmus_reschedule_local();
181 TRACE("psnedf_scheduler_tick: "
182 "%d is preemptable "
183 " => FORCE_RESCHED\n", t->pid);
184 } else if (is_user_np(t)) {
185 TRACE("psnedf_scheduler_tick: "
186 "%d is non-preemptable, "
187 "preemption delayed.\n", t->pid);
188 request_exit_np(t);
189 }
190 }
191}
192
193static struct task_struct* psnedf_schedule(struct task_struct * prev)
194{
195 psnedf_domain_t* pedf = local_pedf;
196 rt_domain_t* edf = &pedf->domain;
197 struct task_struct* next;
198
199 int out_of_time, sleep, preempt,
200 np, exists, blocks, resched;
201
202	if (prev && is_realtime(prev)) {
203		TRACE_TASK(prev, "Rescheduling\n");
204	} else {
205		TRACE("Rescheduling\n");
206	}
207
208 raw_spin_lock(&pedf->slock);
209
210	/* sanity checking:
211	 * unlike under G-EDF, when a task exits (dies),
212	 * pedf->scheduled may be NULL while prev _is_ a real-time task
213	 */
214 BUG_ON(pedf->scheduled && pedf->scheduled != prev);
215 BUG_ON(pedf->scheduled && !is_realtime(prev));
216
217 /* (0) Determine state */
218 exists = pedf->scheduled != NULL;
219 blocks = exists && !is_running(pedf->scheduled);
220 out_of_time = exists &&
221 budget_enforced(pedf->scheduled) &&
222 budget_exhausted(pedf->scheduled);
223 np = exists && is_np(pedf->scheduled);
224 sleep = exists && is_completed(pedf->scheduled);
225 preempt = edf_preemption_needed(edf, prev);
226
227 /* If we need to preempt do so.
228 * The following checks set resched to 1 in case of special
229 * circumstances.
230 */
231 resched = preempt;
232
233 /* If a task blocks we have no choice but to reschedule.
234 */
235 if (blocks)
236 resched = 1;
237
238 /* Request a sys_exit_np() call if we would like to preempt but cannot.
239 * Multiple calls to request_exit_np() don't hurt.
240 */
241 if (np && (out_of_time || preempt || sleep))
242 request_exit_np(pedf->scheduled);
243
244 /* Any task that is preemptable and either exhausts its execution
245 * budget or wants to sleep completes. We may have to reschedule after
246 * this.
247 */
248 if (!np && (out_of_time || sleep) && !blocks) {
249 job_completion(pedf->scheduled, !sleep);
250 resched = 1;
251 }
252
253 /* The final scheduling decision. Do we need to switch for some reason?
254 * Switch if we are in RT mode and have no task or if we need to
255 * resched.
256 */
257 next = NULL;
258 if ((!np || blocks) && (resched || !exists)) {
259 /* When preempting a task that does not block, then
260 * re-insert it into either the ready queue or the
261 * release queue (if it completed). requeue() picks
262 * the appropriate queue.
263 */
264 if (pedf->scheduled && !blocks)
265 requeue(pedf->scheduled, edf);
266 next = __take_ready(edf);
267 } else
268 /* Only override Linux scheduler if we have a real-time task
269 * scheduled that needs to continue.
270 */
271 if (exists)
272 next = prev;
273
274 if (next) {
275 STRACE_TASK(next, "scheduled at %llu\n", litmus_clock());
276 tsk_rt(next)->completed = 0;
277 } else {
278 STRACE("becoming idle at %llu\n", litmus_clock());
279 }
280
281 pedf->scheduled = next;
282 sched_state_task_picked();
283 raw_spin_unlock(&pedf->slock);
284
285 return next;
286}
287
288
289/* Prepare a task for running in RT mode
290 */
291static void psnedf_task_new(struct task_struct * t, int on_rq, int running)
292{
293 rt_domain_t* edf = task_edf(t);
294 psnedf_domain_t* pedf = task_pedf(t);
295 unsigned long flags;
296
297 TRACE_TASK(t, "psn edf: task new, cpu = %d\n",
298 t->rt_param.task_params.cpu);
299
300 /* setup job parameters */
301 release_at(t, litmus_clock());
302
303 /* The task should be running in the queue, otherwise signal
304 * code will try to wake it up with fatal consequences.
305 */
306 raw_spin_lock_irqsave(&pedf->slock, flags);
307 if (running) {
308 /* there shouldn't be anything else running at the time */
309 BUG_ON(pedf->scheduled);
310 pedf->scheduled = t;
311 } else {
312 requeue(t, edf);
313 /* maybe we have to reschedule */
314 psnedf_preempt_check(pedf);
315 }
316 raw_spin_unlock_irqrestore(&pedf->slock, flags);
317}
318
319static void psnedf_task_wake_up(struct task_struct *task)
320{
321 unsigned long flags;
322 psnedf_domain_t* pedf = task_pedf(task);
323 rt_domain_t* edf = task_edf(task);
324 lt_t now;
325
326 TRACE_TASK(task, "wake_up at %llu\n", litmus_clock());
327 raw_spin_lock_irqsave(&pedf->slock, flags);
328 BUG_ON(is_queued(task));
329 now = litmus_clock();
330 if (is_sporadic(task) && is_tardy(task, now)
331#ifdef CONFIG_LITMUS_LOCKING
332 /* We need to take suspensions because of semaphores into
333	    * account! If a job resumes after having been suspended while trying
334	    * to acquire a semaphore, it should never be treated as a new job release.
335 */
336 && !is_priority_boosted(task)
337#endif
338 ) {
339 /* new sporadic release */
340 release_at(task, now);
341 sched_trace_task_release(task);
342 }
343
344 /* Only add to ready queue if it is not the currently-scheduled
345 * task. This could be the case if a task was woken up concurrently
346 * on a remote CPU before the executing CPU got around to actually
347 * de-scheduling the task, i.e., wake_up() raced with schedule()
348 * and won.
349 */
350 if (pedf->scheduled != task) {
351 requeue(task, edf);
352 psnedf_preempt_check(pedf);
353 }
354
355 raw_spin_unlock_irqrestore(&pedf->slock, flags);
356 STRACE_TASK(task, "wake up done\n");
357}
358
359static void psnedf_task_block(struct task_struct *t)
360{
361 /* only running tasks can block, thus t is in no queue */
362 TRACE_TASK(t, "block at %llu, state=%d\n", litmus_clock(), t->state);
363
364 BUG_ON(!is_realtime(t));
365 BUG_ON(is_queued(t));
366}
367
368static void psnedf_task_exit(struct task_struct * t)
369{
370 unsigned long flags;
371 psnedf_domain_t* pedf = task_pedf(t);
372 rt_domain_t* edf;
373
374 raw_spin_lock_irqsave(&pedf->slock, flags);
375 if (is_queued(t)) {
376 /* dequeue */
377 edf = task_edf(t);
378 remove(edf, t);
379 }
380 if (pedf->scheduled == t)
381 pedf->scheduled = NULL;
382
383 TRACE_TASK(t, "RIP, now reschedule\n");
384
385 preempt(pedf);
386 raw_spin_unlock_irqrestore(&pedf->slock, flags);
387}
388
389#ifdef CONFIG_LITMUS_LOCKING
390
391#include <litmus/fdso.h>
392#include <litmus/srp.h>
393
394/* ******************** SRP support ************************ */
395
396static unsigned int psnedf_get_srp_prio(struct task_struct* t)
397{
398 /* assumes implicit deadlines */
399 return get_rt_period(t);
400}
401
402/* ******************** FMLP support ********************** */
403
404/* struct for semaphore with priority inheritance */
405struct fmlp_semaphore {
406 struct litmus_lock litmus_lock;
407
408 /* current resource holder */
409 struct task_struct *owner;
410
411 /* FIFO queue of waiting tasks */
412 wait_queue_head_t wait;
413};
414
415struct dgl_semaphore {
416 struct litmus_lock litmus_lock;
417
418 /* bitmask of resources that are currently locked. */
419 resource_mask_t locked;
420
421 /* bitmask of resources in the file descriptor table that are controlled by
422 * this dgl_semaphore.
423 */
424 resource_mask_t dgl_resources;
425
426 /* There can be no more than $m$ resource holders, because under
427 * partitioned scheduling, the resource holders are priority boosted, and
428 * it is impossible to have $>m$ boosted jobs.
429 */
430 bool boosted[NR_CPUS];
431
432 /* FIFO queue of waiting tasks */
433 wait_queue_head_t wait;
434};
435
436static inline struct fmlp_semaphore* fmlp_from_lock(struct litmus_lock* lock)
437{
438 return container_of(lock, struct fmlp_semaphore, litmus_lock);
439}
440
441static inline struct dgl_semaphore* dgl_from_lock(struct litmus_lock* lock)
442{
443 return container_of(lock, struct dgl_semaphore, litmus_lock);
444}
445
446int psnedf_fmlp_lock(struct litmus_lock* l)
447{
448 struct task_struct* t = current;
449 struct fmlp_semaphore *sem = fmlp_from_lock(l);
450 wait_queue_t wait;
451 unsigned long flags;
452
453 if (!is_realtime(t))
454 return -EPERM;
455
456 /* prevent nested lock acquisition --- not supported by FMLP */
457 if (tsk_rt(t)->num_locks_held ||
458 tsk_rt(t)->num_local_locks_held)
459 return -EBUSY;
460
461 spin_lock_irqsave(&sem->wait.lock, flags);
462
463 if (sem->owner) {
464 /* resource is not free => must suspend and wait */
465
466 init_waitqueue_entry(&wait, t);
467
468 /* FIXME: interruptible would be nice some day */
469 set_task_state(t, TASK_UNINTERRUPTIBLE);
470
471 __add_wait_queue_tail_exclusive(&sem->wait, &wait);
472
473 TS_LOCK_SUSPEND;
474
475 /* release lock before sleeping */
476 spin_unlock_irqrestore(&sem->wait.lock, flags);
477
478 /* We depend on the FIFO order. Thus, we don't need to recheck
479 * when we wake up; we are guaranteed to have the lock since
480 * there is only one wake up per release.
481 */
482
483 schedule();
484
485 TS_LOCK_RESUME;
486
487 /* Since we hold the lock, no other task will change
488 * ->owner. We can thus check it without acquiring the spin
489 * lock. */
490 BUG_ON(sem->owner != t);
491 } else {
492 /* it's ours now */
493 sem->owner = t;
494
495 /* mark the task as priority-boosted. */
496 boost_priority(t);
497
498 spin_unlock_irqrestore(&sem->wait.lock, flags);
499 }
500
501 tsk_rt(t)->num_locks_held++;
502
503 return 0;
504}
505
506int psnedf_fmlp_unlock(struct litmus_lock* l)
507{
508 struct task_struct *t = current, *next;
509 struct fmlp_semaphore *sem = fmlp_from_lock(l);
510 unsigned long flags;
511 int err = 0;
512
513 spin_lock_irqsave(&sem->wait.lock, flags);
514
515 if (sem->owner != t) {
516 err = -EINVAL;
517 goto out;
518 }
519
520 tsk_rt(t)->num_locks_held--;
521
522 /* we lose the benefit of priority boosting */
523
524 unboost_priority(t);
525
526 /* check if there are jobs waiting for this resource */
527 next = __waitqueue_remove_first(&sem->wait);
528 if (next) {
529 /* boost next job */
530 boost_priority(next);
531
532		/* next becomes the resource holder */
533 sem->owner = next;
534
535 /* wake up next */
536 wake_up_process(next);
537 } else
538 /* resource becomes available */
539 sem->owner = NULL;
540
541out:
542 spin_unlock_irqrestore(&sem->wait.lock, flags);
543 return err;
544}
545
546int psnedf_fmlp_close(struct litmus_lock* l)
547{
548 struct task_struct *t = current;
549 struct fmlp_semaphore *sem = fmlp_from_lock(l);
550 unsigned long flags;
551
552 int owner;
553
554 spin_lock_irqsave(&sem->wait.lock, flags);
555
556 owner = sem->owner == t;
557
558 spin_unlock_irqrestore(&sem->wait.lock, flags);
559
560 if (owner)
561 psnedf_fmlp_unlock(l);
562
563 return 0;
564}
565
566void psnedf_fmlp_free(struct litmus_lock* lock)
567{
568 kfree(fmlp_from_lock(lock));
569}
570
571static struct litmus_lock_ops psnedf_fmlp_lock_ops = {
572 .close = psnedf_fmlp_close,
573 .lock = psnedf_fmlp_lock,
574 .unlock = psnedf_fmlp_unlock,
575 .deallocate = psnedf_fmlp_free,
576};
577
578int psnedf_dgl_close(struct litmus_lock* l)
579{
580 return 0;
581}
582
583/* for compatibility, assume lock requests the whole group. */
584int psnedf_dgl_lock(struct litmus_lock* l)
585{
586 return l->ops->dynamic_group_lock(l, dgl_from_lock(l)->dgl_resources);
587}
588
589/* for compatibility, assume unlock releases the whole group. */
590int psnedf_dgl_unlock(struct litmus_lock* l)
591{
592 return l->ops->dynamic_group_unlock(l, dgl_from_lock(l)->dgl_resources);
593}
594
595/**
596 * This function checks to ensure that all resources requested in the mask are
597 * controlled by the dgl in l. This is validated in one of two ways.
598 *
599 * The dgl struct maintains a cache of resources known to be controlled by that
600 * particular dgl. If the requested resources are in that cache, return true.
601 *
602 * Note that this cache is not immediately updated when a resource is added to
603 * a group (because I didn't see an easy way to do it). The first time a resource
604 * is requested, the cache is updated (in the while loop). This is done by
605 * checking that the two fdso table entries point to the same lock object.
606 */
607bool is_mask_valid(struct litmus_lock* l, resource_mask_t mask)
608{
609 struct dgl_semaphore* d;
610 struct od_table_entry* entry;
611 resource_mask_t tmp;
612 int prev = -1;
613
614 if (l->type != DGL_SEM)
615 return false;
616
617 d = dgl_from_lock(l);
618
619 // mask -> d->dgl_resources (bitwise logical implication)
620 tmp = ~mask | d->dgl_resources;
621
622	// n.b. if tmp is 0xffffffff, then ffs(~tmp) - 1 == -1, and prev >= -1 always, so the loop stops.
623 while (prev < ffs(~tmp) - 1)
624 {
625 prev = ffs(~tmp) - 1;
626 entry = get_entry_for_od( ffs(~tmp) - 1);
627 if (entry && is_lock(entry) && get_lock(entry) == l){
628 d->dgl_resources = d->dgl_resources | ( 1 << (ffs(~tmp) -1) );
629 }
630 tmp = ~mask|d->dgl_resources;
631 }
632
633 // 2's complement: -1 is 0xffffffff
634
635 if ( tmp == -1){
636 return true;
637 } else {
638 return false;
639 }
640
641}
642
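The expression ~mask | d->dgl_resources above is the bitwise form of the implication "mask -> dgl_resources": it evaluates to all ones exactly when every resource bit set in mask is also set in dgl_resources. A standalone sketch of that subset test (not kernel code; a 32-bit resource_mask_t is assumed purely for illustration):

#include <assert.h>
#include <stdint.h>

typedef uint32_t resource_mask_t;   /* assumed width, for illustration only */

static int mask_subset_of(resource_mask_t mask, resource_mask_t group)
{
	/* "mask implies group": every bit set in mask is also set in group */
	return (resource_mask_t)(~mask | group) == UINT32_MAX;
}

int main(void)
{
	resource_mask_t group = 0x0000000f;          /* resources 0-3 form the group */

	assert(mask_subset_of(0x00000005, group));   /* {0, 2} lies inside the group */
	assert(!mask_subset_of(0x00000010, group));  /* resource 4 does not          */
	assert(mask_subset_of(0x00000000, group));   /* the empty request is trivial */
	return 0;
}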
643#define DGL_CONTAINER 1
644#define for_each_bit(field, idx) \
645 for (idx = find_first_bit(&field, sizeof(field)*8); \
646 idx < sizeof(field)*8; \
647 idx = find_next_bit(&field, sizeof(field)*8, idx))
648
649int psnedf_dgl_dynamic_group_lock(struct litmus_lock* l, resource_mask_t resources)
650{
651 struct task_struct* t = current;
652 struct dgl_semaphore *sem = dgl_from_lock(l);
653 wait_queue_t wait;
654 int resource;
655 unsigned long flags;
656
657 TRACE_CUR("Trying to lock a DGL\n");
658
659 if (!is_realtime(t))
660 return -EPERM;
661
662 if ( !is_mask_valid(l, resources) )
663 return -EINVAL;
664
665 t->resources = resources;
666
667 spin_lock_irqsave(&sem->wait.lock, flags);
668
669	// if (sem->locked & resources) == 0, then all requested resources are
670	// available; otherwise we must suspend.
671 if (sem->locked & resources){
672
673 STRACE("Resources locked, suspending\n");
674
675 init_waitqueue_entry(&wait, t);
676
677 set_task_state(t, TASK_UNINTERRUPTIBLE);
678
679 __add_wait_queue_tail_exclusive(&sem->wait, &wait);
680
681 TS_LOCK_SUSPEND;
682
683 spin_unlock_irqrestore(&sem->wait.lock, flags);
684
685 schedule();
686
687 TS_LOCK_RESUME;
688 } else {
689
690 STRACE("Acquired a resource\n");
691
692 sem->locked = sem->locked | resources;
693
694 // if a job requests a resource, then it was scheduled, and therefore
695		// there was no other boosted job on this CPU, so this is safe.
696 BUG_ON(sem->boosted[task_cpu(t)]);
697
698 boost_priority(t);
699
700 sem->boosted[task_cpu(t)] = true;
701
702 spin_unlock_irqrestore(&sem->wait.lock, flags);
703 }
704
705 for_each_bit(resources, resource)
706 sched_trace_server_switch_to(resource, 0, t->pid, get_job_no(t),
707 get_partition(t));
708
709 return 0;
710}
711
712static inline int num_boosted(struct dgl_semaphore *sem)
713{
714 int ret = 0;
715 int i;
716 for(i = 0; i < NR_CPUS; i++){
717 ret += sem->boosted[i];
718 }
719 return ret;
720}
721
722int psnedf_dgl_dynamic_group_unlock(struct litmus_lock* l, resource_mask_t resources)
723{
724 struct task_struct *t = current, *tsk;
725 struct dgl_semaphore *sem = dgl_from_lock(l);
726 unsigned long flags;
727 int err = 0, resource;
728 resource_mask_t logically_locked;
729 struct list_head *pos, *tmp;
730
731 TRACE_CUR("Trying to unlock a DGL\n");
732
733 //Unlocking but priority is not boosted
734 BUG_ON(!sem->boosted[task_cpu(t)]);
735
736 spin_lock_irqsave(&sem->wait.lock, flags);
737
738 // ~resources | t->resources checks that t owns the resources being released
739 // note that a job can release a subset of the resources it has acquired.
740 if ( !is_mask_valid(l, resources)){
741 STRACE("Invalid mask %d\n", resources);
742 err = -EINVAL;
743 goto out;
744 } else if ( (~resources | t->resources) != -1){
745 STRACE("Trying to unlock unowned resources: %d\t%d\n", resources, t->resources);
746 err = -EINVAL;
747 goto out;
748 } else {
749 sem->locked -= resources;
750 }
751
752 // if the job released all of the resources it owned, then unboost.
753 if (resources == t->resources){
754 STRACE("Released all resources\n");
755 unboost_priority(t);
756 sem->boosted[task_cpu(t)] = false;
757 } else {
758 // update t->resources to reflect the resources currently owned.
759 STRACE("Unlocked a subset of locked resources\n");
760 t->resources = t->resources & ~resources;
761 }
762
763 logically_locked = sem->locked;
764
765 list_for_each_safe(pos, tmp, &sem->wait.task_list) {
766 tsk = (struct task_struct*) list_entry(pos, wait_queue_t,
767 task_list)->private;
768 STRACE_TASK(tsk, "Evaluating\n");
769
770 if ( (logically_locked == -1) || (num_boosted(sem) == NR_CPUS) ){
771 STRACE_TASK(tsk, "All procs boosted, or all resources locked\n");
772 break;
773 }
774
775 //STRACE_TASK(tsk, "Logically locked: %o\n", logically_locked);
776 //STRACE_TASK(tsk, "tsk->resources: %o\n", tsk->resources);
777 //STRACE_TASK(tsk, "!(tsk->resources & logically_locked): %o\n", !(tsk->resources & logically_locked));
778 //STRACE_TASK(tsk, "!sem->boosted: %d\n", !sem->boosted[task_cpu(tsk)]);
779
780 // the resources requested are unlocked, tsk acquires its resources
781 if( !(tsk->resources & logically_locked) && !sem->boosted[task_cpu(tsk)]) {
782
783 STRACE_TASK(tsk, "Acquired a resource\n");
784
785 list_del_init(pos);
786
787 sem->locked = sem->locked | tsk->resources;
788
789 sem->boosted[task_cpu(tsk)] = true;
790 boost_priority(tsk);
791
792 wake_up_process(tsk);
793 }
794
795 logically_locked = logically_locked | tsk->resources;
796 }
797
798 for_each_bit(resources, resource)
799 sched_trace_server_switch_away(resource, 0, t->pid, get_job_no(t),
800 get_partition(t));
801
802out:
803 spin_unlock_irqrestore(&sem->wait.lock, flags);
804 return err;
805}
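
The wake-up pass above hands out resources in FIFO order: logically_locked accumulates the requests of earlier waiters whether or not they were woken, so a later waiter can never grab a resource that an earlier waiter is still missing. A minimal user-space simulation of that pass (not part of the patch; the per-CPU boosting constraint is omitted):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct waiter {
	const char *name;
	uint32_t    resources;  /* requested resource mask            */
	bool        running;    /* woken and holding all its resources */
};

/* One pass over the FIFO wait "queue": a waiter is woken only if none of its
 * resources are locked *or* reserved by an earlier waiter. */
static void wake_pass(uint32_t *locked, struct waiter *w, int n)
{
	uint32_t logically_locked = *locked;
	int i;

	for (i = 0; i < n; i++) {
		if (!w[i].running && !(w[i].resources & logically_locked)) {
			*locked |= w[i].resources;  /* grant the whole request */
			w[i].running = true;
			printf("%s acquires 0x%x\n", w[i].name,
			       (unsigned) w[i].resources);
		}
		/* earlier waiters keep their requests reserved either way */
		logically_locked |= w[i].resources;
	}
}

int main(void)
{
	/* resource 0 just got released; resource 1 is still held elsewhere */
	uint32_t locked = 0x2;
	struct waiter queue[] = {
		{ "A", 0x3, false },  /* wants 0 and 1: must keep waiting        */
		{ "B", 0x1, false },  /* wants only 0, but 0 stays reserved for A */
	};

	wake_pass(&locked, queue, 2);
	printf("A running: %d, B running: %d\n",
	       (int) queue[0].running, (int) queue[1].running);
	return 0;
}
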
806
807void psnedf_dgl_free(struct litmus_lock* l)
808{
809 /* Deliberately leaked for now: several od-table entries may refer to the same
810 * DGL (see psnedf_allocate_lock), so kfree() here could leave others dangling. */
811 TRACE("DGL not freed (deallocation deferred)\n");
812}
813
814static struct litmus_lock_ops psnedf_dgl_lock_ops = {
815 .close = psnedf_dgl_close,
816 .lock = psnedf_dgl_lock,
817 .unlock = psnedf_dgl_unlock,
818 .dynamic_group_lock = psnedf_dgl_dynamic_group_lock,
819 .dynamic_group_unlock = psnedf_dgl_dynamic_group_unlock,
820 .deallocate = psnedf_dgl_free,
821};
822
823static struct litmus_lock* psnedf_new_fmlp(void)
824{
825 struct fmlp_semaphore* sem;
826
827 sem = kmalloc(sizeof(*sem), GFP_KERNEL);
828 if (!sem)
829 return NULL;
830
831 sem->owner = NULL;
832 init_waitqueue_head(&sem->wait);
833 sem->litmus_lock.ops = &psnedf_fmlp_lock_ops;
834
835 return &sem->litmus_lock;
836}
837
838static struct litmus_lock* psnedf_new_dgl(void)
839{
840 struct dgl_semaphore* sem;
841 int i;
842
843 TRACE("Creating another DGL\n");
844
845 sem = kmalloc(sizeof(*sem), GFP_KERNEL);
846 if (!sem)
847 return NULL;
848
849 sem->locked = 0;
850 sem->dgl_resources = 0;
851
852 for(i = 0; i < NR_CPUS; i++)
853 sem->boosted[i] = false;
854
855 init_waitqueue_head(&sem->wait);
856 sem->litmus_lock.ops = &psnedf_dgl_lock_ops;
857 sem->litmus_lock.type = DGL_SEM;
858
859 sched_trace_container_param(DGL_CONTAINER, "dgl");
860 for (i = 0; i < sizeof(sem->dgl_resources)*8; ++i) {
861 sched_trace_server_param(i, DGL_CONTAINER, 0, 0);
862 }
863
864 return &sem->litmus_lock;
865}
866
867/* **** lock constructor **** */
868
869
870static long psnedf_allocate_lock(struct litmus_lock **lock, int type,
871 void* __user config)
872{
873 int err = -ENXIO;
874 int config_num;
875 struct srp_semaphore* srp;
876 struct od_table_entry* entry;
877
878 /* PSN-EDF currently supports the SRP for local resources and the FMLP
879 * for global resources. */
880 switch (type) {
881 case FMLP_SEM:
882 /* Flexible Multiprocessor Locking Protocol */
883 *lock = psnedf_new_fmlp();
884 if (*lock)
885 err = 0;
886 else
887 err = -ENOMEM;
888 break;
889
890 case SRP_SEM:
891 /* Baker's Stack Resource Policy */
892 srp = allocate_srp_semaphore();
893 if (srp) {
894 *lock = &srp->litmus_lock;
895 err = 0;
896 } else
897 err = -ENOMEM;
898 break;
899
900 case DGL_SEM:
901 /* config is interpreted as an int: config < 0 requests a new DGL,
902 * while config >= 0 attaches this resource to the existing DGL found
903 * at od-table entry number config.
904 */
905 config_num = *(int*)(config);
906 TRACE("config: %d\n", config_num);
907 if (config_num < 0){
908 *lock = psnedf_new_dgl();
909 if (*lock)
910 err = 0;
911 else
912 err = -ENOMEM;
913 /* In this case, we are adding a resource to an existing lock */
914 } else {
915 entry = get_entry_for_od(config_num);
916 if (entry && entry->obj && entry->obj->type == DGL_SEM){
917 *lock = (struct litmus_lock*) entry->obj->obj;
918 err = 0;
919 } else {
920 err = -EINVAL;
921 printk(KERN_DEBUG "Cannot add to that group: "
922 "od %d is not a DGL\n", config_num);
923 }
924 }
925
926 break;
927 };
928
929 return err;
930}
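
A toy model of the config convention handled in the DGL_SEM case above (plain user-space C, not liblitmus and not part of the patch): a negative config creates a fresh group, while a non-negative config reuses the group behind that table entry.

#include <stdio.h>
#include <stdlib.h>

#define MAX_ENTRIES 8

struct group { int id; };

static struct group *od_table[MAX_ENTRIES];  /* stands in for the od table */
static int next_id;

/* config < 0: allocate a new group; config >= 0: reuse od_table[config] */
static struct group *get_group(int config)
{
	struct group *g;

	if (config < 0) {
		g = malloc(sizeof(*g));
		g->id = next_id++;
	} else if (config < MAX_ENTRIES && od_table[config]) {
		g = od_table[config];
	} else {
		return NULL;  /* the kernel returns -EINVAL here */
	}
	return g;
}

int main(void)
{
	od_table[0] = get_group(-1);  /* entry 0: brand new group       */
	od_table[1] = get_group(0);   /* entry 1: same group as entry 0 */
	od_table[2] = get_group(-1);  /* entry 2: another new group     */

	printf("entry0=%d entry1=%d entry2=%d\n",
	       od_table[0]->id, od_table[1]->id, od_table[2]->id);
	return 0;  /* prints: entry0=0 entry1=0 entry2=1 */
}
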
931
932#endif
933
934static long psnedf_activate_plugin(void)
935{
936#ifdef CONFIG_RELEASE_MASTER
937 int cpu;
938
939 for_each_online_cpu(cpu) {
940 remote_edf(cpu)->release_master = atomic_read(&release_master_cpu);
941 }
942#endif
943
944#ifdef CONFIG_LITMUS_LOCKING
945 get_srp_prio = psnedf_get_srp_prio;
946#endif
947
948 return 0;
949}
950
951static long psnedf_admit_task(struct task_struct* tsk)
952{
953 if (task_cpu(tsk) == tsk->rt_param.task_params.cpu
954#ifdef CONFIG_RELEASE_MASTER
955 /* don't allow tasks on release master CPU */
956 && task_cpu(tsk) != remote_edf(task_cpu(tsk))->release_master
957#endif
958 )
959 return 0;
960 else
961 return -EINVAL;
962}
963
964/* Plugin object */
965static struct sched_plugin psn_edf_plugin __cacheline_aligned_in_smp = {
966 .plugin_name = "PSN-EDF",
967 .tick = psnedf_tick,
968 .task_new = psnedf_task_new,
969 .complete_job = complete_job,
970 .task_exit = psnedf_task_exit,
971 .schedule = psnedf_schedule,
972 .task_wake_up = psnedf_task_wake_up,
973 .task_block = psnedf_task_block,
974 .admit_task = psnedf_admit_task,
975 .activate_plugin = psnedf_activate_plugin,
976#ifdef CONFIG_LITMUS_LOCKING
977 .allocate_lock = psnedf_allocate_lock,
978#endif
979};
980
981
982static int __init init_psn_edf(void)
983{
984 int i;
985
986 /* CPU hotplug is not supported. If it ever were, iterating over
987 * num_online_cpus() here would not suffice: CPUs brought online later
988 * would never have their domains initialized.
989 */
990 for (i = 0; i < num_online_cpus(); i++) {
991 psnedf_domain_init(remote_pedf(i),
992 psnedf_check_resched,
993 NULL, i);
994 }
995 return register_sched_plugin(&psn_edf_plugin);
996}
997
998module_init(init_psn_edf);
999
diff --git a/litmus/sched_task_trace.c b/litmus/sched_task_trace.c
new file mode 100644
index 00000000000..3c42dfedac1
--- /dev/null
+++ b/litmus/sched_task_trace.c
@@ -0,0 +1,272 @@
1/*
2 * sched_task_trace.c -- record scheduling events to a byte stream
3 */
4
5#define NO_TASK_TRACE_DECLS
6
7#include <linux/module.h>
8#include <linux/sched.h>
9#include <linux/percpu.h>
10
11#include <litmus/ftdev.h>
12#include <litmus/litmus.h>
13
14#include <litmus/sched_trace.h>
15#include <litmus/feather_trace.h>
16#include <litmus/ftdev.h>
17
18#ifdef CONFIG_SCHED_LITMUS_TRACEPOINT
19#define CREATE_TRACE_POINTS
20#endif
21
22#define NO_EVENTS (1 << CONFIG_SCHED_TASK_TRACE_SHIFT)
23
24#define now() litmus_clock()
25
26struct local_buffer {
27 struct st_event_record record[NO_EVENTS];
28 char flag[NO_EVENTS];
29 struct ft_buffer ftbuf;
30};
31
32DEFINE_PER_CPU(struct local_buffer, st_event_buffer);
33
34static struct ftdev st_dev;
35
36static int st_dev_can_open(struct ftdev *dev, unsigned int cpu)
37{
38 return cpu_online(cpu) ? 0 : -ENODEV;
39}
40
41static int __init init_sched_task_trace(void)
42{
43 struct local_buffer* buf;
44 int i, ok = 0, err;
45 printk("Allocated %u sched_trace_xxx() events per CPU "
46 "(buffer size: %d bytes)\n",
47 NO_EVENTS, (int) sizeof(struct local_buffer));
48
49 err = ftdev_init(&st_dev, THIS_MODULE,
50 num_online_cpus(), "sched_trace");
51 if (err)
52 goto err_out;
53
54 for (i = 0; i < st_dev.minor_cnt; i++) {
55 buf = &per_cpu(st_event_buffer, i);
56 ok += init_ft_buffer(&buf->ftbuf, NO_EVENTS,
57 sizeof(struct st_event_record),
58 buf->flag,
59 buf->record);
60 st_dev.minor[i].buf = &buf->ftbuf;
61 }
62 if (ok == st_dev.minor_cnt) {
63 st_dev.can_open = st_dev_can_open;
64 err = register_ftdev(&st_dev);
65 if (err)
66 goto err_dealloc;
67 } else {
68 err = -EINVAL;
69 goto err_dealloc;
70 }
71
72 return 0;
73
74err_dealloc:
75 ftdev_exit(&st_dev);
76err_out:
77 printk(KERN_WARNING "Could not register sched_trace module\n");
78 return err;
79}
80
81static void __exit exit_sched_task_trace(void)
82{
83 ftdev_exit(&st_dev);
84}
85
86module_init(init_sched_task_trace);
87module_exit(exit_sched_task_trace);
88
89
90static inline struct st_event_record* get_record(u8 type, struct task_struct* t)
91{
92 struct st_event_record* rec = NULL;
93 struct local_buffer* buf;
94
95 buf = &get_cpu_var(st_event_buffer);
96 if (ft_buffer_start_write(&buf->ftbuf, (void**) &rec)) {
97 rec->hdr.type = type;
98 rec->hdr.cpu = smp_processor_id();
99 rec->hdr.pid = t ? t->pid : 0;
100 rec->hdr.job = t ? t->rt_param.job_params.job_no : 0;
101 } else {
102 put_cpu_var(st_event_buffer);
103 }
104 /* rec will be NULL if it failed */
105 return rec;
106}
107
108static inline void put_record(struct st_event_record* rec)
109{
110 struct local_buffer* buf;
111 buf = &__get_cpu_var(st_event_buffer);
112 ft_buffer_finish_write(&buf->ftbuf, rec);
113 put_cpu_var(st_event_buffer);
114}
115
116feather_callback void do_sched_trace_task_name(unsigned long id, unsigned long _task)
117{
118 struct task_struct *t = (struct task_struct*) _task;
119 struct st_event_record* rec = get_record(ST_NAME, t);
120 int i;
121 if (rec) {
122 for (i = 0; i < min(TASK_COMM_LEN, ST_NAME_LEN); i++)
123 rec->data.name.cmd[i] = t->comm[i];
124 put_record(rec);
125 }
126}
127
128feather_callback void do_sched_trace_task_param(unsigned long id, unsigned long _task)
129{
130 struct task_struct *t = (struct task_struct*) _task;
131 struct st_event_record* rec = get_record(ST_PARAM, t);
132 if (rec) {
133 rec->data.param.wcet = get_exec_cost(t);
134 rec->data.param.period = get_rt_period(t);
135 rec->data.param.phase = get_rt_phase(t);
136 rec->data.param.partition = get_partition(t);
137 rec->data.param.class = get_class(t);
138 put_record(rec);
139 }
140}
141
142feather_callback void do_sched_trace_task_release(unsigned long id, unsigned long _task)
143{
144 struct task_struct *t = (struct task_struct*) _task;
145 struct st_event_record* rec = get_record(ST_RELEASE, t);
146 if (rec) {
147 rec->data.release.release = get_release(t);
148 rec->data.release.deadline = get_deadline(t);
149 put_record(rec);
150 }
151}
152
153/* skipped: st_assigned_data, we don't use it atm */
154
155feather_callback void do_sched_trace_task_switch_to(unsigned long id,
156 unsigned long _task)
157{
158 struct task_struct *t = (struct task_struct*) _task;
159 struct st_event_record* rec;
160 if (is_realtime(t)) {
161 rec = get_record(ST_SWITCH_TO, t);
162 if (rec) {
163 rec->data.switch_to.when = now();
164 rec->data.switch_to.exec_time = get_exec_time(t);
165 put_record(rec);
166 }
167 }
168}
169
170feather_callback void do_sched_trace_task_switch_away(unsigned long id,
171 unsigned long _task)
172{
173 struct task_struct *t = (struct task_struct*) _task;
174 struct st_event_record* rec;
175 if (is_realtime(t)) {
176 rec = get_record(ST_SWITCH_AWAY, t);
177 if (rec) {
178 rec->data.switch_away.when = now();
179 rec->data.switch_away.exec_time = get_exec_time(t);
180 put_record(rec);
181 }
182 }
183}
184
185feather_callback void do_sched_trace_task_completion(unsigned long id,
186 unsigned long _task,
187 unsigned long forced)
188{
189 struct task_struct *t = (struct task_struct*) _task;
190 struct st_event_record* rec = get_record(ST_COMPLETION, t);
191 if (rec) {
192 rec->data.completion.when = get_exec_time(t);
193 rec->data.completion.forced = forced;
194 put_record(rec);
195 }
196}
197
198feather_callback void do_sched_trace_task_block(unsigned long id,
199 unsigned long _task)
200{
201 struct task_struct *t = (struct task_struct*) _task;
202 struct st_event_record* rec = get_record(ST_BLOCK, t);
203 if (rec) {
204 rec->data.block.when = now();
205 put_record(rec);
206 }
207}
208
209feather_callback void do_sched_trace_task_resume(unsigned long id,
210 unsigned long _task)
211{
212 struct task_struct *t = (struct task_struct*) _task;
213 struct st_event_record* rec = get_record(ST_RESUME, t);
214 if (rec) {
215 rec->data.resume.when = now();
216 put_record(rec);
217 }
218}
219
220feather_callback void do_sched_trace_sys_release(unsigned long id,
221 unsigned long _start)
222{
223 lt_t *start = (lt_t*) _start;
224 struct st_event_record* rec = get_record(ST_SYS_RELEASE, NULL);
225 if (rec) {
226 rec->data.sys_release.when = now();
227 rec->data.sys_release.release = *start;
228 put_record(rec);
229 }
230}
231
232feather_callback void do_sched_trace_task_exit(unsigned long id,
233 unsigned long _task)
234{
235 struct task_struct *t = (struct task_struct*) _task;
236 const lt_t max_exec_time = tsk_rt(t)->max_exec_time;
237 const lt_t avg_exec_time = tsk_rt(t)->tot_exec_time / (get_job_no(t) - 1);
238
239 struct st_event_record *rec = get_record(ST_TASK_EXIT, t);
240 if (rec) {
241 rec->data.task_exit.avg_exec_time = avg_exec_time;
242 rec->data.task_exit.max_exec_time = max_exec_time;
243 put_record(rec);
244 }
245}
246
247feather_callback void do_sched_trace_task_tardy(unsigned long id,
248 unsigned long _task)
249{
250 struct task_struct *t = (struct task_struct*) _task;
251 struct st_event_record *rec = get_record(ST_TASK_TARDY, t);
252 if (rec) {
253 rec->data.task_tardy.max_tardy = tsk_rt(t)->max_tardy;
254 rec->data.task_tardy.total_tardy = tsk_rt(t)->total_tardy;
255 rec->data.task_tardy.missed = tsk_rt(t)->missed;
256 put_record(rec);
257 }
258}
259
260feather_callback void do_sched_trace_action(unsigned long id,
261 unsigned long _task,
262 unsigned long action)
263{
264 struct task_struct *t = (struct task_struct*) _task;
265 struct st_event_record* rec = get_record(ST_ACTION, t);
266
267 if (rec) {
268 rec->data.action.when = now();
269 rec->data.action.action = action;
270 put_record(rec);
271 }
272}
diff --git a/litmus/sched_trace.c b/litmus/sched_trace.c
new file mode 100644
index 00000000000..f4171fddbbb
--- /dev/null
+++ b/litmus/sched_trace.c
@@ -0,0 +1,252 @@
1/*
2 * sched_trace.c -- record scheduling events to a byte stream.
3 */
4#include <linux/spinlock.h>
5#include <linux/mutex.h>
6
7#include <linux/fs.h>
8#include <linux/slab.h>
9#include <linux/miscdevice.h>
10#include <asm/uaccess.h>
11#include <linux/module.h>
12#include <linux/sysrq.h>
13
14#include <linux/kfifo.h>
15
16#include <litmus/sched_trace.h>
17#include <litmus/litmus.h>
18
19#define SCHED_TRACE_NAME "litmus/log"
20
21/* Compute size of TRACE() buffer */
22#define LITMUS_TRACE_BUF_SIZE (1 << CONFIG_SCHED_DEBUG_TRACE_SHIFT)
23
24/* Max length of one read from the buffer */
25#define MAX_READ_LEN (64 * 1024)
26
27/* Max length for one write --- by TRACE() --- to the buffer. This is used to
28 * allocate a per-cpu buffer for printf() formatting. */
29#define MSG_SIZE 255
30
31
32static DEFINE_MUTEX(reader_mutex);
33static atomic_t reader_cnt = ATOMIC_INIT(0);
34static DEFINE_KFIFO(debug_buffer, char, LITMUS_TRACE_BUF_SIZE);
35
36
37static DEFINE_RAW_SPINLOCK(log_buffer_lock);
38static DEFINE_PER_CPU(char[MSG_SIZE], fmt_buffer);
39
40/*
41 * sched_trace_log_message - Write to the trace buffer (log_buffer)
42 *
43 * This is the only function accessing the log_buffer from inside the
44 * kernel for writing.
45 * Concurrent access to sched_trace_log_message must be serialized using
46 * log_buffer_lock
47 * The maximum length of a formatted message is 255
48 */
49void sched_trace_log_message(const char* fmt, ...)
50{
51 unsigned long flags;
52 va_list args;
53 size_t len;
54 char* buf;
55
56 if (!atomic_read(&reader_cnt))
57 /* early exit if nobody is listening */
58 return;
59
60 va_start(args, fmt);
61 local_irq_save(flags);
62
63 /* format message */
64 buf = __get_cpu_var(fmt_buffer);
65 len = vscnprintf(buf, MSG_SIZE, fmt, args);
66
67 raw_spin_lock(&log_buffer_lock);
68 /* Don't copy the trailing null byte, we don't want null bytes in a
69 * text file.
70 */
71 kfifo_in(&debug_buffer, buf, len);
72 raw_spin_unlock(&log_buffer_lock);
73
74 local_irq_restore(flags);
75 va_end(args);
76}
77
78
79/*
80 * log_read - Read the trace buffer
81 *
82 * This function is called as a file operation from userspace.
83 * Readers can sleep. Access is serialized through reader_mutex
84 */
85static ssize_t log_read(struct file *filp,
86 char __user *to, size_t len,
87 loff_t *f_pos)
88{
89 /* we ignore f_pos, this is strictly sequential */
90
91 ssize_t error = -EINVAL;
92 char* mem;
93
94 if (mutex_lock_interruptible(&reader_mutex)) {
95 error = -ERESTARTSYS;
96 goto out;
97 }
98
99 if (len > MAX_READ_LEN)
100 len = MAX_READ_LEN;
101
102 mem = kmalloc(len, GFP_KERNEL);
103 if (!mem) {
104 error = -ENOMEM;
105 goto out_unlock;
106 }
107
108 error = kfifo_out(&debug_buffer, mem, len);
109 while (!error) {
110 set_current_state(TASK_INTERRUPTIBLE);
111 schedule_timeout(110);
112 if (signal_pending(current))
113 error = -ERESTARTSYS;
114 else
115 error = kfifo_out(&debug_buffer, mem, len);
116 }
117
118 if (error > 0 && copy_to_user(to, mem, error))
119 error = -EFAULT;
120
121 kfree(mem);
122 out_unlock:
123 mutex_unlock(&reader_mutex);
124 out:
125 return error;
126}
127
128/*
129 * Enable redirection of printk() messages to the trace buffer.
130 * Defined in kernel/printk.c
131 */
132extern int trace_override;
133extern int trace_recurse;
134
135/*
136 * log_open - open the global log message ring buffer.
137 */
138static int log_open(struct inode *in, struct file *filp)
139{
140 int error = -EINVAL;
141
142 if (mutex_lock_interruptible(&reader_mutex)) {
143 error = -ERESTARTSYS;
144 goto out;
145 }
146
147 atomic_inc(&reader_cnt);
148 error = 0;
149
150 printk(KERN_DEBUG
151 "sched_trace kfifo with buffer starting at: 0x%p\n",
152 debug_buffer.buf);
153
154 /* override printk() */
155 trace_override++;
156
157 mutex_unlock(&reader_mutex);
158 out:
159 return error;
160}
161
162static int log_release(struct inode *in, struct file *filp)
163{
164 int error = -EINVAL;
165
166 if (mutex_lock_interruptible(&reader_mutex)) {
167 error = -ERESTARTSYS;
168 goto out;
169 }
170
171 atomic_dec(&reader_cnt);
172
173 /* release printk() overriding */
174 trace_override--;
175
176 printk(KERN_DEBUG "sched_trace kfifo released\n");
177
178 mutex_unlock(&reader_mutex);
179 out:
180 return error;
181}
182
183/*
184 * log_fops - The file operations for accessing the global LITMUS log message
185 * buffer.
186 *
187 * Except for opening the device file it uses the same operations as trace_fops.
188 */
189static struct file_operations log_fops = {
190 .owner = THIS_MODULE,
191 .open = log_open,
192 .release = log_release,
193 .read = log_read,
194};
195
196static struct miscdevice litmus_log_dev = {
197 .name = SCHED_TRACE_NAME,
198 .minor = MISC_DYNAMIC_MINOR,
199 .fops = &log_fops,
200};
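
Opening the device increments reader_cnt, which enables TRACE() logging, and read() drains the kfifo. A minimal user-space reader (plain POSIX C, not part of the patch; /dev/litmus/log assumes the usual udev naming for misc devices):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	ssize_t n;
	int fd = open("/dev/litmus/log", O_RDONLY);

	if (fd < 0) {
		perror("open /dev/litmus/log");
		return 1;
	}
	/* read() blocks (the kernel polls the kfifo) until TRACE() output arrives */
	while ((n = read(fd, buf, sizeof(buf))) > 0)
		fwrite(buf, 1, n, stdout);

	close(fd);
	return 0;
}
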
201
202#ifdef CONFIG_MAGIC_SYSRQ
203void dump_trace_buffer(int max)
204{
205 char line[80];
206 int len;
207 int count = 0;
208
209 /* potential, but very unlikely, race... */
210 trace_recurse = 1;
211 while ((max == 0 || count++ < max) &&
212 (len = kfifo_out(&debug_buffer, line, sizeof(line) - 1)) > 0) {
213 line[len] = '\0';
214 printk("%s", line);
215 }
216 trace_recurse = 0;
217}
218
219static void sysrq_dump_trace_buffer(int key)
220{
221 dump_trace_buffer(100);
222}
223
224static struct sysrq_key_op sysrq_dump_trace_buffer_op = {
225 .handler = sysrq_dump_trace_buffer,
226 .help_msg = "dump-trace-buffer(Y)",
227 .action_msg = "writing content of TRACE() buffer",
228};
229#endif
230
231static int __init init_sched_trace(void)
232{
233 printk("Initializing TRACE() device\n");
234
235#ifdef CONFIG_MAGIC_SYSRQ
236 /* offer some debugging help */
237 if (!register_sysrq_key('y', &sysrq_dump_trace_buffer_op))
238 printk("Registered dump-trace-buffer(Y) magic sysrq.\n");
239 else
240 printk("Could not register dump-trace-buffer(Y) magic sysrq.\n");
241#endif
242
243 return misc_register(&litmus_log_dev);
244}
245
246static void __exit exit_sched_trace(void)
247{
248 misc_deregister(&litmus_log_dev);
249}
250
251module_init(init_sched_trace);
252module_exit(exit_sched_trace);
diff --git a/litmus/srp.c b/litmus/srp.c
new file mode 100644
index 00000000000..c88dbf2f580
--- /dev/null
+++ b/litmus/srp.c
@@ -0,0 +1,305 @@
1/* ************************************************************************** */
2/* STACK RESOURCE POLICY */
3/* ************************************************************************** */
4
5#include <asm/atomic.h>
6#include <linux/sched.h>
7#include <linux/wait.h>
8
9#include <litmus/litmus.h>
10#include <litmus/sched_plugin.h>
11#include <litmus/fdso.h>
12#include <litmus/trace.h>
13
14
15#ifdef CONFIG_LITMUS_LOCKING
16
17#include <litmus/srp.h>
18
19srp_prioritization_t get_srp_prio;
20
21struct srp {
22 struct list_head ceiling;
23 wait_queue_head_t ceiling_blocked;
24};
25#define system_ceiling(srp) list2prio(srp->ceiling.next)
26#define ceiling2sem(c) container_of(c, struct srp_semaphore, ceiling)
27
28#define UNDEF_SEM -2
29
30atomic_t srp_objects_in_use = ATOMIC_INIT(0);
31
32DEFINE_PER_CPU(struct srp, srp);
33
34/* Initialize SRP semaphores at boot time. */
35static int __init srp_init(void)
36{
37 int i;
38
39 printk("Initializing SRP per-CPU ceilings...");
40 for (i = 0; i < NR_CPUS; i++) {
41 init_waitqueue_head(&per_cpu(srp, i).ceiling_blocked);
42 INIT_LIST_HEAD(&per_cpu(srp, i).ceiling);
43 }
44 printk(" done!\n");
45
46 return 0;
47}
48module_init(srp_init);
49
50/* SRP task priority comparison function. Smaller numeric values have higher
51 * priority, tie-break is PID. Special case: priority == 0 <=> no priority
52 */
53static int srp_higher_prio(struct srp_priority* first,
54 struct srp_priority* second)
55{
56 if (!first->priority)
57 return 0;
58 else
59 return !second->priority ||
60 first->priority < second->priority || (
61 first->priority == second->priority &&
62 first->pid < second->pid);
63}
64
65
66static int srp_exceeds_ceiling(struct task_struct* first,
67 struct srp* srp)
68{
69 struct srp_priority prio;
70
71 if (list_empty(&srp->ceiling))
72 return 1;
73 else {
74 prio.pid = first->pid;
75 prio.priority = get_srp_prio(first);
76 return srp_higher_prio(&prio, system_ceiling(srp)) ||
77 ceiling2sem(system_ceiling(srp))->owner == first;
78 }
79}
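
The comparison rule used by the ceiling code: smaller numeric priorities win, PIDs break ties, and priority 0 means "no priority" and never wins. A standalone restatement with a few spot checks (not part of the patch):

#include <assert.h>

struct prio { unsigned int priority; int pid; };  /* 0 == no priority */

/* same rule as srp_higher_prio(): smaller numeric priority wins,
 * PID breaks ties, and "no priority" (0) never wins */
static int higher_prio(struct prio a, struct prio b)
{
	if (!a.priority)
		return 0;
	return !b.priority ||
	       a.priority < b.priority ||
	       (a.priority == b.priority && a.pid < b.pid);
}

int main(void)
{
	struct prio none = { 0, 10 }, hi = { 1, 20 }, lo = { 5, 30 }, lo2 = { 5, 31 };

	assert( higher_prio(hi, lo));    /* 1 beats 5                 */
	assert(!higher_prio(none, lo));  /* "no priority" never wins  */
	assert( higher_prio(lo, none));  /* ...and always loses       */
	assert( higher_prio(lo, lo2));   /* tie broken by smaller PID */
	return 0;
}
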
80
81static void srp_add_prio(struct srp* srp, struct srp_priority* prio)
82{
83 struct list_head *pos;
84 if (in_list(&prio->list)) {
85 printk(KERN_CRIT "WARNING: SRP violation detected, prio is already in "
86 "ceiling list! cpu=%d, srp=%p\n", smp_processor_id(), ceiling2sem(prio));
87 return;
88 }
89 list_for_each(pos, &srp->ceiling)
90 if (unlikely(srp_higher_prio(prio, list2prio(pos)))) {
91 __list_add(&prio->list, pos->prev, pos);
92 return;
93 }
94
95 list_add_tail(&prio->list, &srp->ceiling);
96}
97
98
99static int lock_srp_semaphore(struct litmus_lock* l)
100{
101 struct task_struct* t = current;
102 struct srp_semaphore* sem = container_of(l, struct srp_semaphore, litmus_lock);
103
104 if (!is_realtime(t))
105 return -EPERM;
106
107 /* prevent acquisition of local locks in global critical sections */
108 if (tsk_rt(t)->num_locks_held)
109 return -EBUSY;
110
111 preempt_disable();
112
113 /* Update ceiling. */
114 srp_add_prio(&__get_cpu_var(srp), &sem->ceiling);
115
116 /* SRP invariant: all resources available */
117 BUG_ON(sem->owner != NULL);
118
119 sem->owner = t;
120 TRACE_CUR("acquired srp 0x%p\n", sem);
121
122 tsk_rt(t)->num_local_locks_held++;
123
124 preempt_enable();
125
126 return 0;
127}
128
129static int unlock_srp_semaphore(struct litmus_lock* l)
130{
131 struct task_struct* t = current;
132 struct srp_semaphore* sem = container_of(l, struct srp_semaphore, litmus_lock);
133 int err = 0;
134
135 preempt_disable();
136
137 if (sem->owner != t) {
138 err = -EINVAL;
139 } else {
140 /* Determine new system priority ceiling for this CPU. */
141 BUG_ON(!in_list(&sem->ceiling.list));
142
143 list_del(&sem->ceiling.list);
144 sem->owner = NULL;
145
146 /* Wake tasks on this CPU, if they exceed current ceiling. */
147 TRACE_CUR("released srp 0x%p\n", sem);
148 wake_up_all(&__get_cpu_var(srp).ceiling_blocked);
149
150 tsk_rt(t)->num_local_locks_held--;
151 }
152
153 preempt_enable();
154 return err;
155}
156
157static int open_srp_semaphore(struct litmus_lock* l, void* __user arg)
158{
159 struct srp_semaphore* sem = container_of(l, struct srp_semaphore, litmus_lock);
160 int err = 0;
161 struct task_struct* t = current;
162 struct srp_priority t_prio;
163
164 if (!is_realtime(t))
165 return -EPERM;
166
167 TRACE_CUR("opening SRP semaphore %p, cpu=%d\n", sem, sem->cpu);
168
169 preempt_disable();
170
171 if (sem->owner != NULL)
172 err = -EBUSY;
173
174 if (err == 0) {
175 if (sem->cpu == UNDEF_SEM)
176 sem->cpu = get_partition(t);
177 else if (sem->cpu != get_partition(t))
178 err = -EPERM;
179 }
180
181 if (err == 0) {
182 t_prio.priority = get_srp_prio(t);
183 t_prio.pid = t->pid;
184 if (srp_higher_prio(&t_prio, &sem->ceiling)) {
185 sem->ceiling.priority = t_prio.priority;
186 sem->ceiling.pid = t_prio.pid;
187 }
188 }
189
190 preempt_enable();
191
192 return err;
193}
194
195static int close_srp_semaphore(struct litmus_lock* l)
196{
197 struct srp_semaphore* sem = container_of(l, struct srp_semaphore, litmus_lock);
198 int err = 0;
199
200 preempt_disable();
201
202 if (sem->owner == current)
203 unlock_srp_semaphore(l);
204
205 preempt_enable();
206
207 return err;
208}
209
210static void deallocate_srp_semaphore(struct litmus_lock* l)
211{
212 struct srp_semaphore* sem = container_of(l, struct srp_semaphore, litmus_lock);
213 atomic_dec(&srp_objects_in_use);
214 kfree(sem);
215}
216
217static struct litmus_lock_ops srp_lock_ops = {
218 .open = open_srp_semaphore,
219 .close = close_srp_semaphore,
220 .lock = lock_srp_semaphore,
221 .unlock = unlock_srp_semaphore,
222 .deallocate = deallocate_srp_semaphore,
223};
224
225struct srp_semaphore* allocate_srp_semaphore(void)
226{
227 struct srp_semaphore* sem;
228
229 sem = kmalloc(sizeof(*sem), GFP_KERNEL);
230 if (!sem)
231 return NULL;
232
233 INIT_LIST_HEAD(&sem->ceiling.list);
234 sem->ceiling.priority = 0;
235 sem->cpu = UNDEF_SEM;
236 sem->owner = NULL;
237
238 sem->litmus_lock.ops = &srp_lock_ops;
239
240 atomic_inc(&srp_objects_in_use);
241 return sem;
242}
243
244static int srp_wake_up(wait_queue_t *wait, unsigned mode, int sync,
245 void *key)
246{
247 int cpu = smp_processor_id();
248 struct task_struct *tsk = wait->private;
249 if (cpu != get_partition(tsk))
250 TRACE_TASK(tsk, "srp_wake_up on wrong cpu, partition is %d\n",
251 get_partition(tsk));
252 else if (srp_exceeds_ceiling(tsk, &__get_cpu_var(srp)))
253 return default_wake_function(wait, mode, sync, key);
254 return 0;
255}
256
257static void do_ceiling_block(struct task_struct *tsk)
258{
259 wait_queue_t wait = {
260 .private = tsk,
261 .func = srp_wake_up,
262 .task_list = {NULL, NULL}
263 };
264
265 tsk->state = TASK_UNINTERRUPTIBLE;
266 add_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait);
267 tsk->rt_param.srp_non_recurse = 1;
268 preempt_enable_no_resched();
269 schedule();
270 preempt_disable();
271 tsk->rt_param.srp_non_recurse = 0;
272 remove_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait);
273}
274
275/* Wait for current task priority to exceed system-wide priority ceiling.
276 * FIXME: the hotpath should be inline.
277 */
278void srp_ceiling_block(void)
279{
280 struct task_struct *tsk = current;
281
282 /* Only applies to real-time tasks, but optimize for RT tasks. */
283 if (unlikely(!is_realtime(tsk)))
284 return;
285
286 /* Avoid recursive ceiling blocking. */
287 if (unlikely(tsk->rt_param.srp_non_recurse))
288 return;
289
290 /* Bail out early if there aren't any SRP resources around. */
291 if (likely(!atomic_read(&srp_objects_in_use)))
292 return;
293
294 preempt_disable();
295 if (!srp_exceeds_ceiling(tsk, &__get_cpu_var(srp))) {
296 TRACE_CUR("is priority ceiling blocked.\n");
297 while (!srp_exceeds_ceiling(tsk, &__get_cpu_var(srp)))
298 do_ceiling_block(tsk);
299 TRACE_CUR("finally exceeds system ceiling.\n");
300 } else
301 TRACE_CUR("is not priority ceiling blocked\n");
302 preempt_enable();
303}
304
305#endif
diff --git a/litmus/sync.c b/litmus/sync.c
new file mode 100644
index 00000000000..3e79e0a12a5
--- /dev/null
+++ b/litmus/sync.c
@@ -0,0 +1,152 @@
1/* litmus/sync.c - Support for synchronous and asynchronous task system releases.
2 *
3 *
4 */
5
6#include <asm/atomic.h>
7#include <asm/uaccess.h>
8#include <linux/spinlock.h>
9#include <linux/list.h>
10#include <linux/sched.h>
11#include <linux/completion.h>
12
13#include <litmus/litmus.h>
14#include <litmus/sched_plugin.h>
15#include <litmus/jobs.h>
16
17#include <litmus/sched_trace.h>
18
19struct ts_release_wait {
20 struct list_head list;
21 struct completion completion;
22 lt_t ts_release_time;
23};
24
25#define DECLARE_TS_RELEASE_WAIT(symb) \
26 struct ts_release_wait symb = \
27 { \
28 LIST_HEAD_INIT(symb.list), \
29 COMPLETION_INITIALIZER_ONSTACK(symb.completion), \
30 0 \
31 }
32
33static LIST_HEAD(task_release_list);
34static DEFINE_MUTEX(task_release_lock);
35
36static long do_wait_for_ts_release(void)
37{
38 DECLARE_TS_RELEASE_WAIT(wait);
39
40 long ret = -ERESTARTSYS;
41
42 if (mutex_lock_interruptible(&task_release_lock))
43 goto out;
44
45 list_add(&wait.list, &task_release_list);
46
47 mutex_unlock(&task_release_lock);
48
49 /* We are enqueued, now we wait for someone to wake us up. */
50 ret = wait_for_completion_interruptible(&wait.completion);
51
52 if (!ret) {
53 /* Completion succeeded: program the release one period early... */
54 litmus->release_at(current, wait.ts_release_time
55 + current->rt_param.task_params.phase
56 - current->rt_param.task_params.period);
57 /* ...so that complete_job()'s one-period advance yields a first release at ts_release_time + phase */
58 ret = complete_job();
59 } else {
60 /* We were interrupted, must cleanup list. */
61 mutex_lock(&task_release_lock);
62 if (!wait.completion.done)
63 list_del(&wait.list);
64 mutex_unlock(&task_release_lock);
65 }
66
67out:
68 return ret;
69}
70
71int count_tasks_waiting_for_release(void)
72{
73 int task_count = 0;
74 struct list_head *pos;
75
76 mutex_lock(&task_release_lock);
77
78 list_for_each(pos, &task_release_list) {
79 task_count++;
80 }
81
82 mutex_unlock(&task_release_lock);
83
84
85 return task_count;
86}
87
88static long do_release_ts(lt_t start)
89{
90 long task_count = 0;
91
92 struct list_head *pos, *safe;
93 struct ts_release_wait *wait;
94
95 if (mutex_lock_interruptible(&task_release_lock)) {
96 task_count = -ERESTARTSYS;
97 goto out;
98 }
99
100 TRACE("<<<<<< synchronous task system release >>>>>>\n");
101 sched_trace_sys_release(&start);
102
103 task_count = 0;
104 list_for_each_safe(pos, safe, &task_release_list) {
105 wait = (struct ts_release_wait*)
106 list_entry(pos, struct ts_release_wait, list);
107
108 task_count++;
109 wait->ts_release_time = start;
110 complete(&wait->completion);
111 }
112
113 /* clear stale list */
114 INIT_LIST_HEAD(&task_release_list);
115
116 mutex_unlock(&task_release_lock);
117
118out:
119 return task_count;
120}
121
122
123asmlinkage long sys_wait_for_ts_release(void)
124{
125 long ret = -EPERM;
126 struct task_struct *t = current;
127
128 if (is_realtime(t))
129 ret = do_wait_for_ts_release();
130
131 return ret;
132}
133
134#define ONE_MS 1000000
135
136asmlinkage long sys_release_ts(lt_t __user *__delay)
137{
138 long ret;
139 lt_t delay;
140 lt_t start_time;
141
142 /* FIXME: check capabilities... */
143
144 ret = copy_from_user(&delay, __delay, sizeof(delay));
145 if (ret == 0) {
146 /* round up to next larger integral millisecond */
147 start_time = ((litmus_clock() / ONE_MS) + 1) * ONE_MS;
148 ret = do_release_ts(start_time + delay);
149 }
150
151 return ret;
152}
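
The release time computed above is the next full millisecond after the current time, plus the user-supplied delay. A small standalone check of that arithmetic (not part of the patch; lt_t is assumed to be in nanoseconds, matching ONE_MS above):

#include <assert.h>
#include <stdint.h>

typedef uint64_t lt_t;        /* nanoseconds, as in the kernel's lt_t */
#define ONE_MS 1000000ULL

/* same rounding as sys_release_ts(): next full millisecond, then + delay */
static lt_t release_time(lt_t now, lt_t delay)
{
	return ((now / ONE_MS) + 1) * ONE_MS + delay;
}

int main(void)
{
	/* 5.3 ms now, 2 ms delay -> release at 8 ms */
	assert(release_time(5300000ULL, 2 * ONE_MS) == 8 * ONE_MS);
	/* even an exact millisecond is pushed to the *next* one */
	assert(release_time(5 * ONE_MS, 0) == 6 * ONE_MS);
	return 0;
}
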
diff --git a/litmus/trace.c b/litmus/trace.c
new file mode 100644
index 00000000000..7dbb98e4a3c
--- /dev/null
+++ b/litmus/trace.c
@@ -0,0 +1,300 @@
1#include <linux/sched.h>
2#include <linux/module.h>
3#include <linux/uaccess.h>
4
5#include <litmus/ftdev.h>
6#include <litmus/litmus.h>
7#include <litmus/trace.h>
8
9/******************************************************************************/
10/* Allocation */
11/******************************************************************************/
12
13static struct ftdev overhead_dev;
14
15#define trace_ts_buf overhead_dev.minor[0].buf
16
17static unsigned int ts_seq_no = 0;
18
19DEFINE_PER_CPU(atomic_t, irq_fired_count);
20
21void ft_irq_fired(void)
22{
23 /* Only called with preemptions disabled. */
24 atomic_inc(&__get_cpu_var(irq_fired_count));
25
26 if (has_control_page(current))
27 get_control_page(current)->irq_count++;
28}
29
30static inline void clear_irq_fired(void)
31{
32 atomic_set(&__raw_get_cpu_var(irq_fired_count), 0);
33}
34
35static inline unsigned int get_and_clear_irq_fired(void)
36{
37 /* This is potentially not atomic since we might migrate if
38 * preemptions are not disabled. As a tradeoff between
39 * accuracy and tracing overheads, this seems acceptable.
40 * If it proves to be a problem, then one could add a callback
41 * from the migration code to invalidate irq_fired_count.
42 */
43 return atomic_xchg(&__raw_get_cpu_var(irq_fired_count), 0);
44}
45
46static inline void save_irq_flags(struct timestamp *ts, unsigned int irq_count)
47{
48 /* Store how many interrupts occurred. */
49 ts->irq_count = irq_count;
50 /* Extra flag because ts->irq_count overflows quickly. */
51 ts->irq_flag = irq_count > 0;
52
53}
54
55static inline void write_timestamp(uint8_t event,
56 uint8_t type,
57 uint8_t cpu,
58 uint16_t pid_fragment,
59 unsigned int irq_count,
60 int record_irq,
61 int hide_irq,
62 uint64_t timestamp,
63 int record_timestamp)
64{
65 unsigned long flags;
66 unsigned int seq_no;
67 struct timestamp *ts;
68
69 /* Avoid preemptions while recording the timestamp. This reduces the
70 * number of "out of order" timestamps in the stream and makes
71 * post-processing easier. */
72
73 local_irq_save(flags);
74
75 seq_no = fetch_and_inc((int *) &ts_seq_no);
76 if (ft_buffer_start_write(trace_ts_buf, (void**) &ts)) {
77 ts->event = event;
78 ts->seq_no = seq_no;
79
80 ts->task_type = type;
81 ts->pid = pid_fragment;
82
83 ts->cpu = cpu;
84
85 if (record_irq)
86 irq_count = get_and_clear_irq_fired();
87
88 save_irq_flags(ts, irq_count - hide_irq);
89
90 if (record_timestamp)
91 timestamp = ft_timestamp();
92
93 ts->timestamp = timestamp;
94 ft_buffer_finish_write(trace_ts_buf, ts);
95 }
96
97 local_irq_restore(flags);
98}
99
100static void __add_timestamp_user(struct timestamp *pre_recorded)
101{
102 unsigned long flags;
103 unsigned int seq_no;
104 struct timestamp *ts;
105
106
107 local_irq_save(flags);
108
109 seq_no = fetch_and_inc((int *) &ts_seq_no);
110 if (ft_buffer_start_write(trace_ts_buf, (void**) &ts)) {
111 *ts = *pre_recorded;
112 ts->seq_no = seq_no;
113 ts->cpu = raw_smp_processor_id();
114 save_irq_flags(ts, get_and_clear_irq_fired());
115 ft_buffer_finish_write(trace_ts_buf, ts);
116 }
117
118 local_irq_restore(flags);
119}
120
121feather_callback void save_timestamp(unsigned long event)
122{
123 write_timestamp(event, TSK_UNKNOWN,
124 raw_smp_processor_id(),
125 current->pid,
126 0, 1, 0,
127 0, 1);
128}
129
130feather_callback void save_timestamp_def(unsigned long event,
131 unsigned long type)
132{
133 write_timestamp(event, type,
134 raw_smp_processor_id(),
135 current->pid,
136 0, 1, 0,
137 0, 1);
138}
139
140feather_callback void save_timestamp_task(unsigned long event,
141 unsigned long t_ptr)
142{
143 struct task_struct *t = (struct task_struct *) t_ptr;
144 int rt = is_realtime(t);
145
146 write_timestamp(event, rt ? TSK_RT : TSK_BE,
147 raw_smp_processor_id(),
148 t->pid,
149 0, 1, 0,
150 0, 1);
151}
152
153feather_callback void save_timestamp_cpu(unsigned long event,
154 unsigned long cpu)
155{
156 write_timestamp(event, TSK_UNKNOWN, cpu, current->pid,
157 0, 1, 0,
158 0, 1);
159}
160
161feather_callback void save_task_latency(unsigned long event,
162 unsigned long when_ptr)
163{
164 lt_t now = litmus_clock();
165 lt_t *when = (lt_t*) when_ptr;
166
167 write_timestamp(event, TSK_RT, raw_smp_processor_id(), 0,
168 0, 1, 0,
169 now - *when, 0);
170}
171
172/* fake timestamp to user-reported time */
173feather_callback void save_timestamp_time(unsigned long event,
174 unsigned long ptr)
175{
176 uint64_t* time = (uint64_t*) ptr;
177
178 write_timestamp(event, is_realtime(current) ? TSK_RT : TSK_BE,
179 raw_smp_processor_id(), current->pid,
180 0, 1, 0,
181 *time, 0);
182}
183
184/* Record user-reported IRQ count */
185feather_callback void save_timestamp_irq(unsigned long event,
186 unsigned long irq_counter_ptr)
187{
188 uint64_t* irqs = (uint64_t*) irq_counter_ptr;
189
190 write_timestamp(event, is_realtime(current) ? TSK_RT : TSK_BE,
191 raw_smp_processor_id(), current->pid,
192 *irqs, 0, 0,
193 0, 1);
194}
195
196/* Suppress one IRQ from the irq count. Used by TS_SEND_RESCHED_END, which is
197 * called from within an interrupt that is expected. */
198feather_callback void save_timestamp_hide_irq(unsigned long event)
199{
200 write_timestamp(event, is_realtime(current) ? TSK_RT : TSK_BE,
201 raw_smp_processor_id(), current->pid,
202 0, 1, 1,
203 0, 1);
204}
205
206/******************************************************************************/
207/* DEVICE FILE DRIVER */
208/******************************************************************************/
209
210/*
211 * 2^17 timestamps of 16 bytes each (2 MB); a larger request may exceed what the
212 * buddy allocator (MAX_ORDER) can provide, so the loop below halves the count.
213 */
214#define NO_TIMESTAMPS (2 << 16)
215
216static int alloc_timestamp_buffer(struct ftdev* ftdev, unsigned int idx)
217{
218 unsigned int count = NO_TIMESTAMPS;
219
220 /* An overhead-tracing timestamp should be exactly 16 bytes long. */
221 BUILD_BUG_ON(sizeof(struct timestamp) != 16);
222
223 while (count && !trace_ts_buf) {
224 printk("time stamp buffer: trying to allocate %u time stamps.\n", count);
225 ftdev->minor[idx].buf = alloc_ft_buffer(count, sizeof(struct timestamp));
226 count /= 2;
227 }
228 return ftdev->minor[idx].buf ? 0 : -ENOMEM;
229}
230
231static void free_timestamp_buffer(struct ftdev* ftdev, unsigned int idx)
232{
233 free_ft_buffer(ftdev->minor[idx].buf);
234 ftdev->minor[idx].buf = NULL;
235}
236
237static ssize_t write_timestamp_from_user(struct ft_buffer* buf, size_t len,
238 const char __user *from)
239{
240 ssize_t consumed = 0;
241 struct timestamp ts;
242
243 /* don't give us partial timestamps */
244 if (len % sizeof(ts))
245 return -EINVAL;
246
247 while (len >= sizeof(ts)) {
248 if (copy_from_user(&ts, from, sizeof(ts))) {
249 consumed = -EFAULT;
250 goto out;
251 }
252 len -= sizeof(ts);
253 from += sizeof(ts);
254 consumed += sizeof(ts);
255
256 __add_timestamp_user(&ts);
257 }
258
259out:
260 return consumed;
261}
262
263static int __init init_ft_overhead_trace(void)
264{
265 int err, cpu;
266
267 printk("Initializing Feather-Trace overhead tracing device.\n");
268 err = ftdev_init(&overhead_dev, THIS_MODULE, 1, "ft_trace");
269 if (err)
270 goto err_out;
271
272 overhead_dev.alloc = alloc_timestamp_buffer;
273 overhead_dev.free = free_timestamp_buffer;
274 overhead_dev.write = write_timestamp_from_user;
275
276 err = register_ftdev(&overhead_dev);
277 if (err)
278 goto err_dealloc;
279
280 /* initialize per-CPU IRQ counters */
281 for (cpu = 0; cpu < NR_CPUS; cpu++) {
282 atomic_set(&per_cpu(irq_fired_count, cpu), 0);
283 }
284
285 return 0;
286
287err_dealloc:
288 ftdev_exit(&overhead_dev);
289err_out:
290 printk(KERN_WARNING "Could not register ft_trace module.\n");
291 return err;
292}
293
294static void __exit exit_ft_overhead_trace(void)
295{
296 ftdev_exit(&overhead_dev);
297}
298
299module_init(init_ft_overhead_trace);
300module_exit(exit_ft_overhead_trace);
diff --git a/litmus/uncachedev.c b/litmus/uncachedev.c
new file mode 100644
index 00000000000..06a6a7c1798
--- /dev/null
+++ b/litmus/uncachedev.c
@@ -0,0 +1,102 @@
1#include <linux/sched.h>
2#include <linux/kernel.h>
3#include <linux/mm.h>
4#include <linux/fs.h>
5#include <linux/errno.h>
6#include <linux/highmem.h>
7#include <asm/page.h>
8#include <linux/miscdevice.h>
9#include <linux/module.h>
10
11#include <litmus/litmus.h>
12
13/* device for allocating pages not cached by the CPU */
14
15#define UNCACHE_NAME "litmus/uncache"
16
17void litmus_uncache_vm_open(struct vm_area_struct *vma)
18{
19}
20
21void litmus_uncache_vm_close(struct vm_area_struct *vma)
22{
23}
24
25int litmus_uncache_vm_fault(struct vm_area_struct* vma,
26 struct vm_fault* vmf)
27{
28 /* modeled after SG DMA video4linux, but without DMA. */
29 /* (see drivers/media/video/videobuf-dma-sg.c) */
30 struct page *page;
31
32 page = alloc_page(GFP_USER);
33 if (!page)
34 return VM_FAULT_OOM;
35
36 clear_user_highpage(page, (unsigned long)vmf->virtual_address);
37 vmf->page = page;
38
39 return 0;
40}
41
42static struct vm_operations_struct litmus_uncache_vm_ops = {
43 .open = litmus_uncache_vm_open,
44 .close = litmus_uncache_vm_close,
45 .fault = litmus_uncache_vm_fault,
46};
47
48static int litmus_uncache_mmap(struct file* filp, struct vm_area_struct* vma)
49{
50 /* first make sure mapper knows what he's doing */
51
52 /* you can only map the "first" page */
53 if (vma->vm_pgoff != 0)
54 return -EINVAL;
55
56 /* you can't share it with anyone */
57 if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
58 return -EINVAL;
59
60 /* cannot be expanded, and is not a "normal" page. */
61 vma->vm_flags |= VM_DONTEXPAND;
62
63 /* noncached pages are not explicitly locked in memory (for now). */
64 vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
65
66 vma->vm_ops = &litmus_uncache_vm_ops;
67
68 return 0;
69}
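
As the checks above require, a mapping of this device must be private and start at offset 0; touching it then faults in uncached pages one at a time. A minimal user-space client (plain POSIX C, not part of the patch; assumes the usual /dev/litmus/uncache udev path):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/litmus/uncache", O_RDWR);
	char *mem;

	if (fd < 0) {
		perror("open /dev/litmus/uncache");
		return 1;
	}
	/* must be MAP_PRIVATE (shared mappings are rejected) and offset 0 */
	mem = mmap(NULL, sysconf(_SC_PAGESIZE), PROT_READ | PROT_WRITE,
		   MAP_PRIVATE, fd, 0);
	if (mem == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	strcpy(mem, "hello, uncached memory");  /* faults in one uncached page */
	puts(mem);

	munmap(mem, sysconf(_SC_PAGESIZE));
	close(fd);
	return 0;
}
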
70
71static struct file_operations litmus_uncache_fops = {
72 .owner = THIS_MODULE,
73 .mmap = litmus_uncache_mmap,
74};
75
76static struct miscdevice litmus_uncache_dev = {
77 .name = UNCACHE_NAME,
78 .minor = MISC_DYNAMIC_MINOR,
79 .fops = &litmus_uncache_fops,
80 /* pages are not locked in memory, so there is no reason
81 to prevent anyone from allocating uncached pages */
82 .mode = (S_IRUGO | S_IWUGO),
83};
84
85static int __init init_litmus_uncache_dev(void)
86{
87 int err;
88
89 printk("Initializing LITMUS^RT uncache device.\n");
90 err = misc_register(&litmus_uncache_dev);
91 if (err)
92 printk("Could not allocate %s device (%d).\n", UNCACHE_NAME, err);
93 return err;
94}
95
96static void __exit exit_litmus_uncache_dev(void)
97{
98 misc_deregister(&litmus_uncache_dev);
99}
100
101module_init(init_litmus_uncache_dev);
102module_exit(exit_litmus_uncache_dev);