author    | Bjoern B. Brandenburg <bbb@cs.unc.edu> | 2011-01-07 17:37:01 -0500
committer | Bjoern B. Brandenburg <bbb@cs.unc.edu> | 2011-02-01 17:00:27 -0500
commit    | 71efbc5459ef95ed902a6980eae646197529364e (patch)
tree      | 02d7d4f86b4d915a5aa5e2d03ce33ed3c6963335
parent    | 343d4ead3b12992f494134114cf50e4f37c656c5 (diff)
Pfair: support clustered scheduling
Just like C-EDF is a global scheduler that is split across several
clusters, Pfair can be applied on a per-cluster basis. This patch
changes the Pfair implementation to enable clustering based on the
recently added generic clustering support.
-rw-r--r-- | litmus/sched_pfair.c | 392
1 files changed, 274 insertions, 118 deletions
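Before the diff proper, a brief orientation: the patch moves every piece of formerly global Pfair state (the "official" PFAIR clock pfair_time, the release-queue wheel, and the ready queue) into a new per-cluster struct pfair_cluster, and each CPU reaches its cluster through an embedded topology member that is resolved with container_of(). The sketch below illustrates only that lookup pattern; it is a hypothetical, self-contained userspace mock whose struct layouts are simplified stand-ins, not the real definitions from <litmus/clustered.h> or the code in this commit.

/*
 * Minimal userspace sketch (NOT LITMUS^RT code) of the embedding pattern used
 * by the patch: each per-CPU pfair_state embeds a cluster_cpu, each
 * pfair_cluster embeds a scheduling_cluster, and container_of() maps from the
 * embedded topology member back to the enclosing per-cluster state.
 * All struct layouts here are simplified stand-ins for illustration only.
 */
#include <stdio.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct scheduling_cluster {
	int id;
};

struct cluster_cpu {
	int id;                              /* CPU number */
	struct scheduling_cluster *cluster;  /* cluster this CPU belongs to */
};

struct pfair_cluster {
	struct scheduling_cluster topology;
	unsigned long pfair_time;            /* per-cluster quantum clock */
};

struct pfair_state {
	struct cluster_cpu topology;
	/* ... per-CPU scheduling state would follow here ... */
};

/* mirrors cpu_cluster() in the patch: per-CPU state -> owning cluster */
static struct pfair_cluster *cpu_cluster(struct pfair_state *state)
{
	return container_of(state->topology.cluster,
			    struct pfair_cluster, topology);
}

int main(void)
{
	struct pfair_cluster cluster = {
		.topology = { .id = 0 },
		.pfair_time = 42,
	};
	struct pfair_state cpu0 = {
		.topology = { .id = 0, .cluster = &cluster.topology },
	};

	/* All cluster-wide state (clock, queues, lock) is reached via the CPU. */
	printf("CPU %d is in cluster %d at quantum %lu\n",
	       cpu0.topology.id,
	       cpu_cluster(&cpu0)->topology.id,
	       cpu_cluster(&cpu0)->pfair_time);
	return 0;
}

In the commit itself the same indirection also carries the locking: cluster_lock() returns the ready_lock of the cluster's embedded rt_domain_t, which replaces the old global pfair_lock, and pfair_cluster_level defaults to GLOBAL_CLUSTER, so until the "cluster" proc file created by create_cluster_file() is written, all CPUs should land in a single cluster and the plugin behaves like the previous global Pfair.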
diff --git a/litmus/sched_pfair.c b/litmus/sched_pfair.c
index c7d5cf7aa2b3..71ce993df5ca 100644
--- a/litmus/sched_pfair.c
+++ b/litmus/sched_pfair.c
@@ -23,6 +23,13 @@ | |||
23 | 23 | ||
24 | #include <litmus/bheap.h> | 24 | #include <litmus/bheap.h> |
25 | 25 | ||
26 | /* to configure the cluster size */ | ||
27 | #include <litmus/litmus_proc.h> | ||
28 | |||
29 | #include <litmus/clustered.h> | ||
30 | |||
31 | static enum cache_level pfair_cluster_level = GLOBAL_CLUSTER; | ||
32 | |||
26 | struct subtask { | 33 | struct subtask { |
27 | /* measured in quanta relative to job release */ | 34 | /* measured in quanta relative to job release */ |
28 | quanta_t release; | 35 | quanta_t release; |
@@ -43,25 +50,28 @@ struct pfair_param { | |||
43 | 50 | ||
44 | unsigned int sporadic_release; /* On wakeup, new sporadic release? */ | 51 | unsigned int sporadic_release; /* On wakeup, new sporadic release? */ |
45 | 52 | ||
53 | struct pfair_cluster* cluster; /* where this task is scheduled */ | ||
54 | |||
46 | struct subtask subtasks[0]; /* allocate together with pfair_param */ | 55 | struct subtask subtasks[0]; /* allocate together with pfair_param */ |
47 | }; | 56 | }; |
48 | 57 | ||
49 | #define tsk_pfair(tsk) ((tsk)->rt_param.pfair) | 58 | #define tsk_pfair(tsk) ((tsk)->rt_param.pfair) |
50 | 59 | ||
51 | struct pfair_state { | 60 | struct pfair_state { |
52 | int cpu; | 61 | struct cluster_cpu topology; |
62 | |||
53 | volatile quanta_t cur_tick; /* updated by the CPU that is advancing | 63 | volatile quanta_t cur_tick; /* updated by the CPU that is advancing |
54 | * the time */ | 64 | * the time */ |
55 | volatile quanta_t local_tick; /* What tick is the local CPU currently | 65 | volatile quanta_t local_tick; /* What tick is the local CPU currently |
56 | * executing? Updated only by the local | 66 | * executing? Updated only by the local |
57 | * CPU. In QEMU, this may lag behind the | 67 | * CPU. In QEMU, this may lag behind the |
58 | * current tick. In a real system, with | 68 | * current tick. In a real system, with |
59 | * proper timers and aligned quanta, | 69 | * proper timers and aligned quanta, |
60 | * that should only be the | 70 | * that should only be the case for a |
61 | * case for a very short time after the | 71 | * very short time after the time |
62 | * time advanced. With staggered quanta, | 72 | * advanced. With staggered quanta, it |
63 | * it will lag for the duration of the | 73 | * will lag for the duration of the |
64 | * offset. | 74 | * offset. |
65 | */ | 75 | */ |
66 | 76 | ||
67 | struct task_struct* linked; /* the task that should be executing */ | 77 | struct task_struct* linked; /* the task that should be executing */ |
@@ -79,25 +89,56 @@ struct pfair_state { | |||
79 | */ | 89 | */ |
80 | #define PFAIR_MAX_PERIOD 2000 | 90 | #define PFAIR_MAX_PERIOD 2000 |
81 | 91 | ||
82 | /* This is the release queue wheel. It is indexed by pfair_time % | 92 | struct pfair_cluster { |
83 | * PFAIR_MAX_PERIOD. Each heap is ordered by PFAIR priority, so that it can be | 93 | struct scheduling_cluster topology; |
84 | * merged with the ready queue. | ||
85 | */ | ||
86 | static struct bheap release_queue[PFAIR_MAX_PERIOD]; | ||
87 | 94 | ||
88 | DEFINE_PER_CPU(struct pfair_state, pfair_state); | 95 | /* The "global" time in this cluster. */ |
89 | struct pfair_state* *pstate; /* short cut */ | 96 | quanta_t pfair_time; /* the "official" PFAIR clock */ |
97 | quanta_t merge_time; /* Updated after the release queue has been | ||
98 | * merged. Used by drop_all_references(). | ||
99 | */ | ||
90 | 100 | ||
91 | static quanta_t pfair_time = 0; /* the "official" PFAIR clock */ | 101 | /* The ready queue for this cluster. */ |
92 | static quanta_t merge_time = 0; /* Updated after the release queue has been | 102 | rt_domain_t pfair; |
93 | * merged. Used by drop_all_references(). | ||
94 | */ | ||
95 | 103 | ||
96 | static rt_domain_t pfair; | 104 | /* This is the release queue wheel for this cluster. It is indexed by |
105 | * pfair_time % PFAIR_MAX_PERIOD. Each heap is ordered by PFAIR | ||
106 | * priority, so that it can be merged with the ready queue. | ||
107 | */ | ||
108 | struct bheap release_queue[PFAIR_MAX_PERIOD]; | ||
109 | }; | ||
97 | 110 | ||
98 | /* The pfair_lock is used to serialize all scheduling events. | 111 | static inline struct pfair_cluster* cpu_cluster(struct pfair_state* state) |
99 | */ | 112 | { |
100 | #define pfair_lock pfair.ready_lock | 113 | return container_of(state->topology.cluster, struct pfair_cluster, topology); |
114 | } | ||
115 | |||
116 | static inline int cpu_id(struct pfair_state* state) | ||
117 | { | ||
118 | return state->topology.id; | ||
119 | } | ||
120 | |||
121 | static inline struct pfair_state* from_cluster_list(struct list_head* pos) | ||
122 | { | ||
123 | return list_entry(pos, struct pfair_state, topology.cluster_list); | ||
124 | } | ||
125 | |||
126 | static inline raw_spinlock_t* cluster_lock(struct pfair_cluster* cluster) | ||
127 | { | ||
128 | /* The ready_lock is used to serialize all scheduling events. */ | ||
129 | return &cluster->pfair.ready_lock; | ||
130 | } | ||
131 | |||
132 | static inline raw_spinlock_t* cpu_lock(struct pfair_state* state) | ||
133 | { | ||
134 | return cluster_lock(cpu_cluster(state)); | ||
135 | } | ||
136 | |||
137 | DEFINE_PER_CPU(struct pfair_state, pfair_state); | ||
138 | struct pfair_state* *pstate; /* short cut */ | ||
139 | |||
140 | static struct pfair_cluster* pfair_clusters; | ||
141 | static int num_pfair_clusters; | ||
101 | 142 | ||
102 | /* Enable for lots of trace info. | 143 | /* Enable for lots of trace info. |
103 | * #define PFAIR_DEBUG | 144 | * #define PFAIR_DEBUG |
@@ -197,9 +238,9 @@ int pfair_ready_order(struct bheap_node* a, struct bheap_node* b) | |||
197 | } | 238 | } |
198 | 239 | ||
199 | /* return the proper release queue for time t */ | 240 | /* return the proper release queue for time t */ |
200 | static struct bheap* relq(quanta_t t) | 241 | static struct bheap* relq(struct pfair_cluster* cluster, quanta_t t) |
201 | { | 242 | { |
202 | struct bheap* rq = &release_queue[t % PFAIR_MAX_PERIOD]; | 243 | struct bheap* rq = cluster->release_queue + (t % PFAIR_MAX_PERIOD); |
203 | return rq; | 244 | return rq; |
204 | } | 245 | } |
205 | 246 | ||
@@ -215,17 +256,19 @@ static void __pfair_add_release(struct task_struct* t, struct bheap* queue) | |||
215 | tsk_rt(t)->heap_node); | 256 | tsk_rt(t)->heap_node); |
216 | } | 257 | } |
217 | 258 | ||
218 | static void pfair_add_release(struct task_struct* t) | 259 | static void pfair_add_release(struct pfair_cluster* cluster, |
260 | struct task_struct* t) | ||
219 | { | 261 | { |
220 | BUG_ON(bheap_node_in_heap(tsk_rt(t)->heap_node)); | 262 | BUG_ON(bheap_node_in_heap(tsk_rt(t)->heap_node)); |
221 | __pfair_add_release(t, relq(cur_release(t))); | 263 | __pfair_add_release(t, relq(cluster, cur_release(t))); |
222 | } | 264 | } |
223 | 265 | ||
224 | /* pull released tasks from the release queue */ | 266 | /* pull released tasks from the release queue */ |
225 | static void poll_releases(quanta_t time) | 267 | static void poll_releases(struct pfair_cluster* cluster, |
268 | quanta_t time) | ||
226 | { | 269 | { |
227 | __merge_ready(&pfair, relq(time)); | 270 | __merge_ready(&cluster->pfair, relq(cluster, time)); |
228 | merge_time = time; | 271 | cluster->merge_time = time; |
229 | } | 272 | } |
230 | 273 | ||
231 | static void check_preempt(struct task_struct* t) | 274 | static void check_preempt(struct task_struct* t) |
@@ -246,18 +289,20 @@ static void check_preempt(struct task_struct* t) | |||
246 | } | 289 | } |
247 | } | 290 | } |
248 | 291 | ||
249 | /* caller must hold pfair_lock */ | 292 | /* caller must hold pfair.ready_lock */ |
250 | static void drop_all_references(struct task_struct *t) | 293 | static void drop_all_references(struct task_struct *t) |
251 | { | 294 | { |
252 | int cpu; | 295 | int cpu; |
253 | struct pfair_state* s; | 296 | struct pfair_state* s; |
254 | struct bheap* q; | 297 | struct bheap* q; |
298 | struct pfair_cluster* cluster; | ||
255 | if (bheap_node_in_heap(tsk_rt(t)->heap_node)) { | 299 | if (bheap_node_in_heap(tsk_rt(t)->heap_node)) { |
256 | /* figure out what queue the node is in */ | 300 | /* figure out what queue the node is in */ |
257 | if (time_before_eq(cur_release(t), merge_time)) | 301 | cluster = tsk_pfair(t)->cluster; |
258 | q = &pfair.ready_queue; | 302 | if (time_before_eq(cur_release(t), cluster->merge_time)) |
303 | q = &cluster->pfair.ready_queue; | ||
259 | else | 304 | else |
260 | q = relq(cur_release(t)); | 305 | q = relq(cluster, cur_release(t)); |
261 | bheap_delete(pfair_ready_order, q, | 306 | bheap_delete(pfair_ready_order, q, |
262 | tsk_rt(t)->heap_node); | 307 | tsk_rt(t)->heap_node); |
263 | } | 308 | } |
@@ -301,22 +346,25 @@ static int advance_subtask(quanta_t time, struct task_struct* t, int cpu) | |||
301 | return to_relq; | 346 | return to_relq; |
302 | } | 347 | } |
303 | 348 | ||
304 | static void advance_subtasks(quanta_t time) | 349 | static void advance_subtasks(struct pfair_cluster *cluster, quanta_t time) |
305 | { | 350 | { |
306 | int cpu, missed; | 351 | int missed; |
307 | struct task_struct* l; | 352 | struct task_struct* l; |
308 | struct pfair_param* p; | 353 | struct pfair_param* p; |
354 | struct list_head* pos; | ||
355 | struct pfair_state* cpu; | ||
309 | 356 | ||
310 | for_each_online_cpu(cpu) { | 357 | list_for_each(pos, &cluster->topology.cpus) { |
311 | l = pstate[cpu]->linked; | 358 | cpu = from_cluster_list(pos); |
312 | missed = pstate[cpu]->linked != pstate[cpu]->local; | 359 | l = cpu->linked; |
360 | missed = cpu->linked != cpu->local; | ||
313 | if (l) { | 361 | if (l) { |
314 | p = tsk_pfair(l); | 362 | p = tsk_pfair(l); |
315 | p->last_quantum = time; | 363 | p->last_quantum = time; |
316 | p->last_cpu = cpu; | 364 | p->last_cpu = cpu_id(cpu); |
317 | if (advance_subtask(time, l, cpu)) { | 365 | if (advance_subtask(time, l, cpu_id(cpu))) { |
318 | pstate[cpu]->linked = NULL; | 366 | cpu->linked = NULL; |
319 | pfair_add_release(l); | 367 | pfair_add_release(cluster, l); |
320 | } | 368 | } |
321 | } | 369 | } |
322 | } | 370 | } |
@@ -350,8 +398,10 @@ static int pfair_link(quanta_t time, int cpu, | |||
350 | int target = target_cpu(time, t, cpu); | 398 | int target = target_cpu(time, t, cpu); |
351 | struct task_struct* prev = pstate[cpu]->linked; | 399 | struct task_struct* prev = pstate[cpu]->linked; |
352 | struct task_struct* other; | 400 | struct task_struct* other; |
401 | struct pfair_cluster* cluster = cpu_cluster(pstate[cpu]); | ||
353 | 402 | ||
354 | if (target != cpu) { | 403 | if (target != cpu) { |
404 | BUG_ON(pstate[target]->topology.cluster != pstate[cpu]->topology.cluster); | ||
355 | other = pstate[target]->linked; | 405 | other = pstate[target]->linked; |
356 | pstate[target]->linked = t; | 406 | pstate[target]->linked = t; |
357 | tsk_rt(t)->linked_on = target; | 407 | tsk_rt(t)->linked_on = target; |
@@ -365,14 +415,14 @@ static int pfair_link(quanta_t time, int cpu, | |||
365 | if (prev) { | 415 | if (prev) { |
366 | /* prev got pushed back into the ready queue */ | 416 | /* prev got pushed back into the ready queue */ |
367 | tsk_rt(prev)->linked_on = NO_CPU; | 417 | tsk_rt(prev)->linked_on = NO_CPU; |
368 | __add_ready(&pfair, prev); | 418 | __add_ready(&cluster->pfair, prev); |
369 | } | 419 | } |
370 | /* we are done with this cpu */ | 420 | /* we are done with this cpu */ |
371 | return 0; | 421 | return 0; |
372 | } else { | 422 | } else { |
373 | /* re-add other, it's original CPU was not considered yet */ | 423 | /* re-add other, it's original CPU was not considered yet */ |
374 | tsk_rt(other)->linked_on = NO_CPU; | 424 | tsk_rt(other)->linked_on = NO_CPU; |
375 | __add_ready(&pfair, other); | 425 | __add_ready(&cluster->pfair, other); |
376 | /* reschedule this CPU */ | 426 | /* reschedule this CPU */ |
377 | return 1; | 427 | return 1; |
378 | } | 428 | } |
@@ -382,71 +432,77 @@ static int pfair_link(quanta_t time, int cpu, | |||
382 | if (prev) { | 432 | if (prev) { |
383 | /* prev got pushed back into the ready queue */ | 433 | /* prev got pushed back into the ready queue */ |
384 | tsk_rt(prev)->linked_on = NO_CPU; | 434 | tsk_rt(prev)->linked_on = NO_CPU; |
385 | __add_ready(&pfair, prev); | 435 | __add_ready(&cluster->pfair, prev); |
386 | } | 436 | } |
387 | /* we are done with this CPU */ | 437 | /* we are done with this CPU */ |
388 | return 0; | 438 | return 0; |
389 | } | 439 | } |
390 | } | 440 | } |
391 | 441 | ||
392 | static void schedule_subtasks(quanta_t time) | 442 | static void schedule_subtasks(struct pfair_cluster *cluster, quanta_t time) |
393 | { | 443 | { |
394 | int cpu, retry; | 444 | int retry; |
445 | struct list_head *pos; | ||
446 | struct pfair_state *cpu_state; | ||
395 | 447 | ||
396 | for_each_online_cpu(cpu) { | 448 | list_for_each(pos, &cluster->topology.cpus) { |
449 | cpu_state = from_cluster_list(pos); | ||
397 | retry = 1; | 450 | retry = 1; |
398 | while (retry) { | 451 | while (retry) { |
399 | if (pfair_higher_prio(__peek_ready(&pfair), | 452 | if (pfair_higher_prio(__peek_ready(&cluster->pfair), |
400 | pstate[cpu]->linked)) | 453 | cpu_state->linked)) |
401 | retry = pfair_link(time, cpu, | 454 | retry = pfair_link(time, cpu_id(cpu_state), |
402 | __take_ready(&pfair)); | 455 | __take_ready(&cluster->pfair)); |
403 | else | 456 | else |
404 | retry = 0; | 457 | retry = 0; |
405 | } | 458 | } |
406 | } | 459 | } |
407 | } | 460 | } |
408 | 461 | ||
409 | static void schedule_next_quantum(quanta_t time) | 462 | static void schedule_next_quantum(struct pfair_cluster *cluster, quanta_t time) |
410 | { | 463 | { |
411 | int cpu; | 464 | struct pfair_state *cpu; |
465 | struct list_head* pos; | ||
412 | 466 | ||
413 | /* called with interrupts disabled */ | 467 | /* called with interrupts disabled */ |
414 | PTRACE("--- Q %lu at %llu PRE-SPIN\n", | 468 | PTRACE("--- Q %lu at %llu PRE-SPIN\n", |
415 | time, litmus_clock()); | 469 | time, litmus_clock()); |
416 | raw_spin_lock(&pfair_lock); | 470 | raw_spin_lock(cluster_lock(cluster)); |
417 | PTRACE("<<< Q %lu at %llu\n", | 471 | PTRACE("<<< Q %lu at %llu\n", |
418 | time, litmus_clock()); | 472 | time, litmus_clock()); |
419 | 473 | ||
420 | sched_trace_quantum_boundary(); | 474 | sched_trace_quantum_boundary(); |
421 | 475 | ||
422 | advance_subtasks(time); | 476 | advance_subtasks(cluster, time); |
423 | poll_releases(time); | 477 | poll_releases(cluster, time); |
424 | schedule_subtasks(time); | 478 | schedule_subtasks(cluster, time); |
425 | 479 | ||
426 | for (cpu = 0; cpu < num_online_cpus(); cpu++) | 480 | list_for_each(pos, &cluster->topology.cpus) { |
427 | if (pstate[cpu]->linked) | 481 | cpu = from_cluster_list(pos); |
482 | if (cpu->linked) | ||
428 | PTRACE_TASK(pstate[cpu]->linked, | 483 | PTRACE_TASK(pstate[cpu]->linked, |
429 | " linked on %d.\n", cpu); | 484 | " linked on %d.\n", cpu_id(cpu)); |
430 | else | 485 | else |
431 | PTRACE("(null) linked on %d.\n", cpu); | 486 | PTRACE("(null) linked on %d.\n", cpu_id(cpu)); |
432 | 487 | } | |
433 | /* We are done. Advance time. */ | 488 | /* We are done. Advance time. */ |
434 | mb(); | 489 | mb(); |
435 | for (cpu = 0; cpu < num_online_cpus(); cpu++) { | 490 | list_for_each(pos, &cluster->topology.cpus) { |
436 | if (pstate[cpu]->local_tick != pstate[cpu]->cur_tick) { | 491 | cpu = from_cluster_list(pos); |
492 | if (cpu->local_tick != cpu->cur_tick) { | ||
437 | TRACE("BAD Quantum not acked on %d " | 493 | TRACE("BAD Quantum not acked on %d " |
438 | "(l:%lu c:%lu p:%lu)\n", | 494 | "(l:%lu c:%lu p:%lu)\n", |
439 | cpu, | 495 | cpu_id(cpu), |
440 | pstate[cpu]->local_tick, | 496 | cpu->local_tick, |
441 | pstate[cpu]->cur_tick, | 497 | cpu->cur_tick, |
442 | pfair_time); | 498 | cluster->pfair_time); |
443 | pstate[cpu]->missed_quanta++; | 499 | cpu->missed_quanta++; |
444 | } | 500 | } |
445 | pstate[cpu]->cur_tick = time; | 501 | cpu->cur_tick = time; |
446 | } | 502 | } |
447 | PTRACE(">>> Q %lu at %llu\n", | 503 | PTRACE(">>> Q %lu at %llu\n", |
448 | time, litmus_clock()); | 504 | time, litmus_clock()); |
449 | raw_spin_unlock(&pfair_lock); | 505 | raw_spin_unlock(cluster_lock(cluster)); |
450 | } | 506 | } |
451 | 507 | ||
452 | static noinline void wait_for_quantum(quanta_t q, struct pfair_state* state) | 508 | static noinline void wait_for_quantum(quanta_t q, struct pfair_state* state) |
@@ -479,12 +535,12 @@ static void catchup_quanta(quanta_t from, quanta_t target, | |||
479 | while (time_before(cur, target)) { | 535 | while (time_before(cur, target)) { |
480 | wait_for_quantum(cur, state); | 536 | wait_for_quantum(cur, state); |
481 | cur++; | 537 | cur++; |
482 | time = cmpxchg(&pfair_time, | 538 | time = cmpxchg(&cpu_cluster(state)->pfair_time, |
483 | cur - 1, /* expected */ | 539 | cur - 1, /* expected */ |
484 | cur /* next */ | 540 | cur /* next */ |
485 | ); | 541 | ); |
486 | if (time == cur - 1) | 542 | if (time == cur - 1) |
487 | schedule_next_quantum(cur); | 543 | schedule_next_quantum(cpu_cluster(state), cur); |
488 | } | 544 | } |
489 | TRACE("+++> catching up done\n"); | 545 | TRACE("+++> catching up done\n"); |
490 | } | 546 | } |
@@ -505,14 +561,14 @@ static void pfair_tick(struct task_struct* t) | |||
505 | /* Attempt to advance time. First CPU to get here | 561 | /* Attempt to advance time. First CPU to get here |
506 | * will prepare the next quantum. | 562 | * will prepare the next quantum. |
507 | */ | 563 | */ |
508 | time = cmpxchg(&pfair_time, | 564 | time = cmpxchg(&cpu_cluster(state)->pfair_time, |
509 | cur - 1, /* expected */ | 565 | cur - 1, /* expected */ |
510 | cur /* next */ | 566 | cur /* next */ |
511 | ); | 567 | ); |
512 | if (time == cur - 1) { | 568 | if (time == cur - 1) { |
513 | /* exchange succeeded */ | 569 | /* exchange succeeded */ |
514 | wait_for_quantum(cur - 1, state); | 570 | wait_for_quantum(cur - 1, state); |
515 | schedule_next_quantum(cur); | 571 | schedule_next_quantum(cpu_cluster(state), cur); |
516 | retry = 0; | 572 | retry = 0; |
517 | } else if (time_before(time, cur - 1)) { | 573 | } else if (time_before(time, cur - 1)) { |
518 | /* the whole system missed a tick !? */ | 574 | /* the whole system missed a tick !? */ |
@@ -562,59 +618,65 @@ static struct task_struct* pfair_schedule(struct task_struct * prev) | |||
562 | int blocks; | 618 | int blocks; |
563 | struct task_struct* next = NULL; | 619 | struct task_struct* next = NULL; |
564 | 620 | ||
565 | raw_spin_lock(&pfair_lock); | 621 | raw_spin_lock(cpu_lock(state)); |
566 | 622 | ||
567 | blocks = is_realtime(prev) && !is_running(prev); | 623 | blocks = is_realtime(prev) && !is_running(prev); |
568 | 624 | ||
569 | if (state->local && safe_to_schedule(state->local, state->cpu)) | 625 | if (state->local && safe_to_schedule(state->local, cpu_id(state))) |
570 | next = state->local; | 626 | next = state->local; |
571 | 627 | ||
572 | if (prev != next) { | 628 | if (prev != next) { |
573 | tsk_rt(prev)->scheduled_on = NO_CPU; | 629 | tsk_rt(prev)->scheduled_on = NO_CPU; |
574 | if (next) | 630 | if (next) |
575 | tsk_rt(next)->scheduled_on = state->cpu; | 631 | tsk_rt(next)->scheduled_on = cpu_id(state); |
576 | } | 632 | } |
577 | sched_state_task_picked(); | 633 | sched_state_task_picked(); |
578 | raw_spin_unlock(&pfair_lock); | 634 | raw_spin_unlock(cpu_lock(state)); |
579 | 635 | ||
580 | if (next) | 636 | if (next) |
581 | TRACE_TASK(next, "scheduled rel=%lu at %lu (%llu)\n", | 637 | TRACE_TASK(next, "scheduled rel=%lu at %lu (%llu)\n", |
582 | tsk_pfair(next)->release, pfair_time, litmus_clock()); | 638 | tsk_pfair(next)->release, cpu_cluster(state)->pfair_time, litmus_clock()); |
583 | else if (is_realtime(prev)) | 639 | else if (is_realtime(prev)) |
584 | TRACE("Becomes idle at %lu (%llu)\n", pfair_time, litmus_clock()); | 640 | TRACE("Becomes idle at %lu (%llu)\n", cpu_cluster(state)->pfair_time, litmus_clock()); |
585 | 641 | ||
586 | return next; | 642 | return next; |
587 | } | 643 | } |
588 | 644 | ||
589 | static void pfair_task_new(struct task_struct * t, int on_rq, int running) | 645 | static void pfair_task_new(struct task_struct * t, int on_rq, int running) |
590 | { | 646 | { |
591 | unsigned long flags; | 647 | unsigned long flags; |
648 | struct pfair_cluster* cluster; | ||
592 | 649 | ||
593 | TRACE("pfair: task new %d state:%d\n", t->pid, t->state); | 650 | TRACE("pfair: task new %d state:%d\n", t->pid, t->state); |
594 | 651 | ||
595 | raw_spin_lock_irqsave(&pfair_lock, flags); | 652 | cluster = tsk_pfair(t)->cluster; |
653 | |||
654 | raw_spin_lock_irqsave(cluster_lock(cluster), flags); | ||
596 | if (running) | 655 | if (running) |
597 | t->rt_param.scheduled_on = task_cpu(t); | 656 | t->rt_param.scheduled_on = task_cpu(t); |
598 | else | 657 | else |
599 | t->rt_param.scheduled_on = NO_CPU; | 658 | t->rt_param.scheduled_on = NO_CPU; |
600 | 659 | ||
601 | prepare_release(t, pfair_time + 1); | 660 | prepare_release(t, cluster->pfair_time + 1); |
602 | tsk_pfair(t)->sporadic_release = 0; | 661 | tsk_pfair(t)->sporadic_release = 0; |
603 | pfair_add_release(t); | 662 | pfair_add_release(cluster, t); |
604 | check_preempt(t); | 663 | check_preempt(t); |
605 | 664 | ||
606 | raw_spin_unlock_irqrestore(&pfair_lock, flags); | 665 | raw_spin_unlock_irqrestore(cluster_lock(cluster), flags); |
607 | } | 666 | } |
608 | 667 | ||
609 | static void pfair_task_wake_up(struct task_struct *t) | 668 | static void pfair_task_wake_up(struct task_struct *t) |
610 | { | 669 | { |
611 | unsigned long flags; | 670 | unsigned long flags; |
612 | lt_t now; | 671 | lt_t now; |
672 | struct pfair_cluster* cluster; | ||
673 | |||
674 | cluster = tsk_pfair(t)->cluster; | ||
613 | 675 | ||
614 | TRACE_TASK(t, "wakes at %llu, release=%lu, pfair_time:%lu\n", | 676 | TRACE_TASK(t, "wakes at %llu, release=%lu, pfair_time:%lu\n", |
615 | litmus_clock(), cur_release(t), pfair_time); | 677 | litmus_clock(), cur_release(t), cluster->pfair_time); |
616 | 678 | ||
617 | raw_spin_lock_irqsave(&pfair_lock, flags); | 679 | raw_spin_lock_irqsave(cluster_lock(cluster), flags); |
618 | 680 | ||
619 | /* It is a little unclear how to deal with Pfair | 681 | /* It is a little unclear how to deal with Pfair |
620 | * tasks that block for a while and then wake. For now, | 682 | * tasks that block for a while and then wake. For now, |
@@ -629,13 +691,13 @@ static void pfair_task_wake_up(struct task_struct *t) | |||
629 | prepare_release(t, time2quanta(now, CEIL)); | 691 | prepare_release(t, time2quanta(now, CEIL)); |
630 | sched_trace_task_release(t); | 692 | sched_trace_task_release(t); |
631 | /* FIXME: race with pfair_time advancing */ | 693 | /* FIXME: race with pfair_time advancing */ |
632 | pfair_add_release(t); | 694 | pfair_add_release(cluster, t); |
633 | tsk_pfair(t)->sporadic_release = 0; | 695 | tsk_pfair(t)->sporadic_release = 0; |
634 | } | 696 | } |
635 | 697 | ||
636 | check_preempt(t); | 698 | check_preempt(t); |
637 | 699 | ||
638 | raw_spin_unlock_irqrestore(&pfair_lock, flags); | 700 | raw_spin_unlock_irqrestore(cluster_lock(cluster), flags); |
639 | TRACE_TASK(t, "wake up done at %llu\n", litmus_clock()); | 701 | TRACE_TASK(t, "wake up done at %llu\n", litmus_clock()); |
640 | } | 702 | } |
641 | 703 | ||
@@ -649,9 +711,12 @@ static void pfair_task_block(struct task_struct *t) | |||
649 | static void pfair_task_exit(struct task_struct * t) | 711 | static void pfair_task_exit(struct task_struct * t) |
650 | { | 712 | { |
651 | unsigned long flags; | 713 | unsigned long flags; |
714 | struct pfair_cluster *cluster; | ||
652 | 715 | ||
653 | BUG_ON(!is_realtime(t)); | 716 | BUG_ON(!is_realtime(t)); |
654 | 717 | ||
718 | cluster = tsk_pfair(t)->cluster; | ||
719 | |||
655 | /* Remote task from release or ready queue, and ensure | 720 | /* Remote task from release or ready queue, and ensure |
656 | * that it is not the scheduled task for ANY CPU. We | 721 | * that it is not the scheduled task for ANY CPU. We |
657 | * do this blanket check because occassionally when | 722 | * do this blanket check because occassionally when |
@@ -659,12 +724,12 @@ static void pfair_task_exit(struct task_struct * t) | |||
659 | * might not be the same as the CPU that the PFAIR scheduler | 724 | * might not be the same as the CPU that the PFAIR scheduler |
660 | * has chosen for it. | 725 | * has chosen for it. |
661 | */ | 726 | */ |
662 | raw_spin_lock_irqsave(&pfair_lock, flags); | 727 | raw_spin_lock_irqsave(cluster_lock(cluster), flags); |
663 | 728 | ||
664 | TRACE_TASK(t, "RIP, state:%d\n", t->state); | 729 | TRACE_TASK(t, "RIP, state:%d\n", t->state); |
665 | drop_all_references(t); | 730 | drop_all_references(t); |
666 | 731 | ||
667 | raw_spin_unlock_irqrestore(&pfair_lock, flags); | 732 | raw_spin_unlock_irqrestore(cluster_lock(cluster), flags); |
668 | 733 | ||
669 | kfree(t->rt_param.pfair); | 734 | kfree(t->rt_param.pfair); |
670 | t->rt_param.pfair = NULL; | 735 | t->rt_param.pfair = NULL; |
@@ -676,27 +741,32 @@ static void pfair_release_at(struct task_struct* task, lt_t start) | |||
676 | unsigned long flags; | 741 | unsigned long flags; |
677 | quanta_t release; | 742 | quanta_t release; |
678 | 743 | ||
744 | struct pfair_cluster *cluster; | ||
745 | |||
746 | cluster = tsk_pfair(task)->cluster; | ||
747 | |||
679 | BUG_ON(!is_realtime(task)); | 748 | BUG_ON(!is_realtime(task)); |
680 | 749 | ||
681 | raw_spin_lock_irqsave(&pfair_lock, flags); | 750 | raw_spin_lock_irqsave(cluster_lock(cluster), flags); |
682 | release_at(task, start); | 751 | release_at(task, start); |
683 | release = time2quanta(start, CEIL); | 752 | release = time2quanta(start, CEIL); |
684 | 753 | ||
685 | if (release - pfair_time >= PFAIR_MAX_PERIOD) | 754 | /* FIXME: support arbitrary offsets. */ |
686 | release = pfair_time + PFAIR_MAX_PERIOD; | 755 | if (release - cluster->pfair_time >= PFAIR_MAX_PERIOD) |
756 | release = cluster->pfair_time + PFAIR_MAX_PERIOD; | ||
687 | 757 | ||
688 | TRACE_TASK(task, "sys release at %lu\n", release); | 758 | TRACE_TASK(task, "sys release at %lu\n", release); |
689 | 759 | ||
690 | drop_all_references(task); | 760 | drop_all_references(task); |
691 | prepare_release(task, release); | 761 | prepare_release(task, release); |
692 | pfair_add_release(task); | 762 | pfair_add_release(cluster, task); |
693 | 763 | ||
694 | /* Clear sporadic release flag, since this release subsumes any | 764 | /* Clear sporadic release flag, since this release subsumes any |
695 | * sporadic release on wake. | 765 | * sporadic release on wake. |
696 | */ | 766 | */ |
697 | tsk_pfair(task)->sporadic_release = 0; | 767 | tsk_pfair(task)->sporadic_release = 0; |
698 | 768 | ||
699 | raw_spin_unlock_irqrestore(&pfair_lock, flags); | 769 | raw_spin_unlock_irqrestore(cluster_lock(cluster), flags); |
700 | } | 770 | } |
701 | 771 | ||
702 | static void init_subtask(struct subtask* sub, unsigned long i, | 772 | static void init_subtask(struct subtask* sub, unsigned long i, |
@@ -755,6 +825,11 @@ static long pfair_admit_task(struct task_struct* t) | |||
755 | struct pfair_param* param; | 825 | struct pfair_param* param; |
756 | unsigned long i; | 826 | unsigned long i; |
757 | 827 | ||
828 | /* first check that the task is in the right cluster */ | ||
829 | if (cpu_cluster(pstate[tsk_rt(t)->task_params.cpu]) != | ||
830 | cpu_cluster(pstate[task_cpu(t)])) | ||
831 | return -EINVAL; | ||
832 | |||
758 | /* Pfair is a tick-based method, so the time | 833 | /* Pfair is a tick-based method, so the time |
759 | * of interest is jiffies. Calculate tick-based | 834 | * of interest is jiffies. Calculate tick-based |
760 | * times for everything. | 835 | * times for everything. |
@@ -798,6 +873,8 @@ static long pfair_admit_task(struct task_struct* t) | |||
798 | param->release = 0; | 873 | param->release = 0; |
799 | param->period = period; | 874 | param->period = period; |
800 | 875 | ||
876 | param->cluster = cpu_cluster(pstate[tsk_rt(t)->task_params.cpu]); | ||
877 | |||
801 | for (i = 0; i < quanta; i++) | 878 | for (i = 0; i < quanta; i++) |
802 | init_subtask(param->subtasks + i, i, quanta, period); | 879 | init_subtask(param->subtasks + i, i, quanta, period); |
803 | 880 | ||
@@ -813,24 +890,88 @@ static long pfair_admit_task(struct task_struct* t) | |||
813 | return 0; | 890 | return 0; |
814 | } | 891 | } |
815 | 892 | ||
893 | static void pfair_init_cluster(struct pfair_cluster* cluster) | ||
894 | { | ||
895 | int i; | ||
896 | |||
897 | /* initialize release queue */ | ||
898 | for (i = 0; i < PFAIR_MAX_PERIOD; i++) | ||
899 | bheap_init(&cluster->release_queue[i]); | ||
900 | rt_domain_init(&cluster->pfair, pfair_ready_order, NULL, NULL); | ||
901 | INIT_LIST_HEAD(&cluster->topology.cpus); | ||
902 | } | ||
903 | |||
904 | static void cleanup_clusters(void) | ||
905 | { | ||
906 | int i; | ||
907 | |||
908 | if (num_pfair_clusters) | ||
909 | kfree(pfair_clusters); | ||
910 | pfair_clusters = NULL; | ||
911 | num_pfair_clusters = 0; | ||
912 | |||
913 | /* avoid stale pointers */ | ||
914 | for (i = 0; i < NR_CPUS; i++) | ||
915 | pstate[i]->topology.cluster = NULL; | ||
916 | } | ||
917 | |||
816 | static long pfair_activate_plugin(void) | 918 | static long pfair_activate_plugin(void) |
817 | { | 919 | { |
818 | int cpu; | 920 | int err, i; |
819 | struct pfair_state* state; | 921 | struct pfair_state* state; |
922 | struct pfair_cluster* cluster ; | ||
923 | quanta_t now; | ||
924 | int cluster_size; | ||
925 | struct cluster_cpu* cpus[NR_CPUS]; | ||
926 | struct scheduling_cluster* clust[NR_CPUS]; | ||
820 | 927 | ||
821 | state = &__get_cpu_var(pfair_state); | 928 | cluster_size = get_cluster_size(pfair_cluster_level); |
822 | pfair_time = current_quantum(state); | ||
823 | 929 | ||
824 | TRACE("Activating PFAIR at q=%lu\n", pfair_time); | 930 | if (cluster_size <= 0 || num_online_cpus() % cluster_size != 0) |
931 | return -EINVAL; | ||
825 | 932 | ||
826 | for (cpu = 0; cpu < num_online_cpus(); cpu++) { | 933 | num_pfair_clusters = num_online_cpus() / cluster_size; |
827 | state = &per_cpu(pfair_state, cpu); | 934 | |
828 | state->cur_tick = pfair_time; | 935 | pfair_clusters = kzalloc(num_pfair_clusters * sizeof(struct pfair_cluster), GFP_ATOMIC); |
829 | state->local_tick = pfair_time; | 936 | if (!pfair_clusters) { |
937 | num_pfair_clusters = 0; | ||
938 | printk(KERN_ERR "Could not allocate Pfair clusters!\n"); | ||
939 | return -ENOMEM; | ||
940 | } | ||
941 | |||
942 | state = &__get_cpu_var(pfair_state); | ||
943 | now = current_quantum(state); | ||
944 | TRACE("Activating PFAIR at q=%lu\n", now); | ||
945 | |||
946 | for (i = 0; i < num_pfair_clusters; i++) { | ||
947 | cluster = &pfair_clusters[i]; | ||
948 | pfair_init_cluster(cluster); | ||
949 | cluster->pfair_time = now; | ||
950 | clust[i] = &cluster->topology; | ||
951 | } | ||
952 | |||
953 | for (i = 0; i < num_online_cpus(); i++) { | ||
954 | state = &per_cpu(pfair_state, i); | ||
955 | state->cur_tick = now; | ||
956 | state->local_tick = now; | ||
830 | state->missed_quanta = 0; | 957 | state->missed_quanta = 0; |
831 | state->offset = cpu_stagger_offset(cpu); | 958 | state->offset = cpu_stagger_offset(i); |
959 | printk(KERN_ERR "cpus[%d] set; %d\n", i, num_online_cpus()); | ||
960 | cpus[i] = &state->topology; | ||
832 | } | 961 | } |
833 | 962 | ||
963 | err = assign_cpus_to_clusters(pfair_cluster_level, clust, num_pfair_clusters, | ||
964 | cpus, num_online_cpus()); | ||
965 | |||
966 | if (err < 0) | ||
967 | cleanup_clusters(); | ||
968 | |||
969 | return err; | ||
970 | } | ||
971 | |||
972 | static long pfair_deactivate_plugin(void) | ||
973 | { | ||
974 | cleanup_clusters(); | ||
834 | return 0; | 975 | return 0; |
835 | } | 976 | } |
836 | 977 | ||
@@ -847,30 +988,29 @@ static struct sched_plugin pfair_plugin __cacheline_aligned_in_smp = { | |||
847 | .release_at = pfair_release_at, | 988 | .release_at = pfair_release_at, |
848 | .complete_job = complete_job, | 989 | .complete_job = complete_job, |
849 | .activate_plugin = pfair_activate_plugin, | 990 | .activate_plugin = pfair_activate_plugin, |
991 | .deactivate_plugin = pfair_deactivate_plugin, | ||
850 | }; | 992 | }; |
851 | 993 | ||
994 | |||
995 | static struct proc_dir_entry *cluster_file = NULL, *pfair_dir = NULL; | ||
996 | |||
852 | static int __init init_pfair(void) | 997 | static int __init init_pfair(void) |
853 | { | 998 | { |
854 | int cpu, i; | 999 | int cpu, err, fs; |
855 | struct pfair_state *state; | 1000 | struct pfair_state *state; |
856 | 1001 | ||
857 | |||
858 | /* | 1002 | /* |
859 | * initialize short_cut for per-cpu pfair state; | 1003 | * initialize short_cut for per-cpu pfair state; |
860 | * there may be a problem here if someone removes a cpu | 1004 | * there may be a problem here if someone removes a cpu |
861 | * while we are doing this initialization... and if cpus | 1005 | * while we are doing this initialization... and if cpus |
862 | * are added / removed later... is it a _real_ problem? | 1006 | * are added / removed later... but we don't support CPU hotplug atm anyway. |
863 | */ | 1007 | */ |
864 | pstate = kmalloc(sizeof(struct pfair_state*) * num_online_cpus(), GFP_KERNEL); | 1008 | pstate = kmalloc(sizeof(struct pfair_state*) * num_online_cpus(), GFP_KERNEL); |
865 | 1009 | ||
866 | /* initialize release queue */ | ||
867 | for (i = 0; i < PFAIR_MAX_PERIOD; i++) | ||
868 | bheap_init(&release_queue[i]); | ||
869 | |||
870 | /* initialize CPU state */ | 1010 | /* initialize CPU state */ |
871 | for (cpu = 0; cpu < num_online_cpus(); cpu++) { | 1011 | for (cpu = 0; cpu < num_online_cpus(); cpu++) { |
872 | state = &per_cpu(pfair_state, cpu); | 1012 | state = &per_cpu(pfair_state, cpu); |
873 | state->cpu = cpu; | 1013 | state->topology.id = cpu; |
874 | state->cur_tick = 0; | 1014 | state->cur_tick = 0; |
875 | state->local_tick = 0; | 1015 | state->local_tick = 0; |
876 | state->linked = NULL; | 1016 | state->linked = NULL; |
@@ -881,13 +1021,29 @@ static int __init init_pfair(void) | |||
881 | pstate[cpu] = state; | 1021 | pstate[cpu] = state; |
882 | } | 1022 | } |
883 | 1023 | ||
884 | rt_domain_init(&pfair, pfair_ready_order, NULL, NULL); | 1024 | pfair_clusters = NULL; |
885 | return register_sched_plugin(&pfair_plugin); | 1025 | num_pfair_clusters = 0; |
1026 | |||
1027 | err = register_sched_plugin(&pfair_plugin); | ||
1028 | if (!err) { | ||
1029 | fs = make_plugin_proc_dir(&pfair_plugin, &pfair_dir); | ||
1030 | if (!fs) | ||
1031 | cluster_file = create_cluster_file(pfair_dir, &pfair_cluster_level); | ||
1032 | else | ||
1033 | printk(KERN_ERR "Could not allocate PFAIR procfs dir.\n"); | ||
1034 | } | ||
1035 | |||
1036 | return err; | ||
886 | } | 1037 | } |
887 | 1038 | ||
888 | static void __exit clean_pfair(void) | 1039 | static void __exit clean_pfair(void) |
889 | { | 1040 | { |
890 | kfree(pstate); | 1041 | kfree(pstate); |
1042 | |||
1043 | if (cluster_file) | ||
1044 | remove_proc_entry("cluster", pfair_dir); | ||
1045 | if (pfair_dir) | ||
1046 | remove_plugin_proc_dir(&pfair_plugin); | ||
891 | } | 1047 | } |
892 | 1048 | ||
893 | module_init(init_pfair); | 1049 | module_init(init_pfair); |