-rw-r--r--	litmus/sched_pfair.c	392
1 file changed, 274 insertions(+), 118 deletions(-)
diff --git a/litmus/sched_pfair.c b/litmus/sched_pfair.c
index c7d5cf7aa2b3..71ce993df5ca 100644
--- a/litmus/sched_pfair.c
+++ b/litmus/sched_pfair.c
@@ -23,6 +23,13 @@
 
 #include <litmus/bheap.h>
 
+/* to configure the cluster size */
+#include <litmus/litmus_proc.h>
+
+#include <litmus/clustered.h>
+
+static enum cache_level pfair_cluster_level = GLOBAL_CLUSTER;
+
 struct subtask {
 	/* measured in quanta relative to job release */
 	quanta_t release;
@@ -43,25 +50,28 @@ struct pfair_param {
 
 	unsigned int	sporadic_release; /* On wakeup, new sporadic release? */
 
+	struct pfair_cluster* cluster; /* where this task is scheduled */
+
 	struct subtask subtasks[0];   /* allocate together with pfair_param */
 };
 
 #define tsk_pfair(tsk) ((tsk)->rt_param.pfair)
 
 struct pfair_state {
-	int cpu;
+	struct cluster_cpu topology;
+
 	volatile quanta_t cur_tick;    /* updated by the CPU that is advancing
 				        * the time */
 	volatile quanta_t local_tick;  /* What tick is the local CPU currently
 				        * executing? Updated only by the local
 				        * CPU. In QEMU, this may lag behind the
 				        * current tick. In a real system, with
 				        * proper timers and aligned quanta,
-				        * that should only be the
-				        * case for a very short time after the
-				        * time advanced. With staggered quanta,
-				        * it will lag for the duration of the
+				        * that should only be the case for a
+				        * very short time after the time
+				        * advanced. With staggered quanta, it
+				        * will lag for the duration of the
 				        * offset.
 				        */
 
 	struct task_struct* linked;    /* the task that should be executing */
@@ -79,25 +89,56 @@ struct pfair_state {
  */
 #define PFAIR_MAX_PERIOD 2000
 
-/* This is the release queue wheel. It is indexed by pfair_time %
- * PFAIR_MAX_PERIOD. Each heap is ordered by PFAIR priority, so that it can be
- * merged with the ready queue.
- */
-static struct bheap release_queue[PFAIR_MAX_PERIOD];
+struct pfair_cluster {
+	struct scheduling_cluster topology;
 
-DEFINE_PER_CPU(struct pfair_state, pfair_state);
-struct pfair_state* *pstate; /* short cut */
+	/* The "global" time in this cluster. */
+	quanta_t pfair_time; /* the "official" PFAIR clock */
+	quanta_t merge_time; /* Updated after the release queue has been
+			      * merged. Used by drop_all_references().
+			      */
 
-static quanta_t pfair_time = 0; /* the "official" PFAIR clock */
-static quanta_t merge_time = 0; /* Updated after the release queue has been
-				 * merged. Used by drop_all_references().
-				 */
+	/* The ready queue for this cluster. */
+	rt_domain_t pfair;
 
-static rt_domain_t pfair;
+	/* This is the release queue wheel for this cluster. It is indexed by
+	 * pfair_time % PFAIR_MAX_PERIOD. Each heap is ordered by PFAIR
+	 * priority, so that it can be merged with the ready queue.
+	 */
+	struct bheap release_queue[PFAIR_MAX_PERIOD];
+};
 
-/* The pfair_lock is used to serialize all scheduling events.
- */
-#define pfair_lock pfair.ready_lock
+static inline struct pfair_cluster* cpu_cluster(struct pfair_state* state)
+{
+	return container_of(state->topology.cluster, struct pfair_cluster, topology);
+}
+
+static inline int cpu_id(struct pfair_state* state)
+{
+	return state->topology.id;
+}
+
+static inline struct pfair_state* from_cluster_list(struct list_head* pos)
+{
+	return list_entry(pos, struct pfair_state, topology.cluster_list);
+}
+
+static inline raw_spinlock_t* cluster_lock(struct pfair_cluster* cluster)
+{
+	/* The ready_lock is used to serialize all scheduling events. */
+	return &cluster->pfair.ready_lock;
+}
+
+static inline raw_spinlock_t* cpu_lock(struct pfair_state* state)
+{
+	return cluster_lock(cpu_cluster(state));
+}
+
+DEFINE_PER_CPU(struct pfair_state, pfair_state);
+struct pfair_state* *pstate; /* short cut */
+
+static struct pfair_cluster* pfair_clusters;
+static int num_pfair_clusters;
 
 /* Enable for lots of trace info.
  * #define PFAIR_DEBUG
@@ -197,9 +238,9 @@ int pfair_ready_order(struct bheap_node* a, struct bheap_node* b)
 }
 
 /* return the proper release queue for time t */
-static struct bheap* relq(quanta_t t)
+static struct bheap* relq(struct pfair_cluster* cluster, quanta_t t)
 {
-	struct bheap* rq = &release_queue[t % PFAIR_MAX_PERIOD];
+	struct bheap* rq = cluster->release_queue + (t % PFAIR_MAX_PERIOD);
 	return rq;
 }
 
@@ -215,17 +256,19 @@ static void __pfair_add_release(struct task_struct* t, struct bheap* queue)
 			    tsk_rt(t)->heap_node);
 }
 
-static void pfair_add_release(struct task_struct* t)
+static void pfair_add_release(struct pfair_cluster* cluster,
+			      struct task_struct* t)
 {
 	BUG_ON(bheap_node_in_heap(tsk_rt(t)->heap_node));
-	__pfair_add_release(t, relq(cur_release(t)));
+	__pfair_add_release(t, relq(cluster, cur_release(t)));
 }
 
 /* pull released tasks from the release queue */
-static void poll_releases(quanta_t time)
+static void poll_releases(struct pfair_cluster* cluster,
+			  quanta_t time)
 {
-	__merge_ready(&pfair, relq(time));
-	merge_time = time;
+	__merge_ready(&cluster->pfair, relq(cluster, time));
+	cluster->merge_time = time;
 }
 
 static void check_preempt(struct task_struct* t)
@@ -246,18 +289,20 @@ static void check_preempt(struct task_struct* t)
 	}
 }
 
-/* caller must hold pfair_lock */
+/* caller must hold pfair.ready_lock */
 static void drop_all_references(struct task_struct *t)
 {
 	int cpu;
 	struct pfair_state* s;
 	struct bheap* q;
+	struct pfair_cluster* cluster;
 	if (bheap_node_in_heap(tsk_rt(t)->heap_node)) {
 		/* figure out what queue the node is in */
-		if (time_before_eq(cur_release(t), merge_time))
-			q = &pfair.ready_queue;
+		cluster = tsk_pfair(t)->cluster;
+		if (time_before_eq(cur_release(t), cluster->merge_time))
+			q = &cluster->pfair.ready_queue;
 		else
-			q = relq(cur_release(t));
+			q = relq(cluster, cur_release(t));
 		bheap_delete(pfair_ready_order, q,
 			     tsk_rt(t)->heap_node);
 	}
@@ -301,22 +346,25 @@ static int advance_subtask(quanta_t time, struct task_struct* t, int cpu)
 	return to_relq;
 }
 
-static void advance_subtasks(quanta_t time)
+static void advance_subtasks(struct pfair_cluster *cluster, quanta_t time)
 {
-	int cpu, missed;
+	int missed;
 	struct task_struct* l;
 	struct pfair_param* p;
+	struct list_head* pos;
+	struct pfair_state* cpu;
 
-	for_each_online_cpu(cpu) {
-		l = pstate[cpu]->linked;
-		missed = pstate[cpu]->linked != pstate[cpu]->local;
+	list_for_each(pos, &cluster->topology.cpus) {
+		cpu = from_cluster_list(pos);
+		l = cpu->linked;
+		missed = cpu->linked != cpu->local;
 		if (l) {
 			p = tsk_pfair(l);
 			p->last_quantum = time;
-			p->last_cpu = cpu;
-			if (advance_subtask(time, l, cpu)) {
-				pstate[cpu]->linked = NULL;
-				pfair_add_release(l);
+			p->last_cpu = cpu_id(cpu);
+			if (advance_subtask(time, l, cpu_id(cpu))) {
+				cpu->linked = NULL;
+				pfair_add_release(cluster, l);
 			}
 		}
 	}
@@ -350,8 +398,10 @@ static int pfair_link(quanta_t time, int cpu,
 	int target = target_cpu(time, t, cpu);
 	struct task_struct* prev = pstate[cpu]->linked;
 	struct task_struct* other;
+	struct pfair_cluster* cluster = cpu_cluster(pstate[cpu]);
 
 	if (target != cpu) {
+		BUG_ON(pstate[target]->topology.cluster != pstate[cpu]->topology.cluster);
 		other = pstate[target]->linked;
 		pstate[target]->linked = t;
 		tsk_rt(t)->linked_on = target;
@@ -365,14 +415,14 @@ static int pfair_link(quanta_t time, int cpu,
 		if (prev) {
 			/* prev got pushed back into the ready queue */
 			tsk_rt(prev)->linked_on = NO_CPU;
-			__add_ready(&pfair, prev);
+			__add_ready(&cluster->pfair, prev);
 		}
 		/* we are done with this cpu */
 		return 0;
 	} else {
 		/* re-add other, it's original CPU was not considered yet */
 		tsk_rt(other)->linked_on = NO_CPU;
-		__add_ready(&pfair, other);
+		__add_ready(&cluster->pfair, other);
 		/* reschedule this CPU */
 		return 1;
 	}
@@ -382,71 +432,77 @@ static int pfair_link(quanta_t time, int cpu,
 	if (prev) {
 		/* prev got pushed back into the ready queue */
 		tsk_rt(prev)->linked_on = NO_CPU;
-		__add_ready(&pfair, prev);
+		__add_ready(&cluster->pfair, prev);
 	}
 	/* we are done with this CPU */
 	return 0;
 	}
 }
 
-static void schedule_subtasks(quanta_t time)
+static void schedule_subtasks(struct pfair_cluster *cluster, quanta_t time)
 {
-	int cpu, retry;
+	int retry;
+	struct list_head *pos;
+	struct pfair_state *cpu_state;
 
-	for_each_online_cpu(cpu) {
+	list_for_each(pos, &cluster->topology.cpus) {
+		cpu_state = from_cluster_list(pos);
 		retry = 1;
 		while (retry) {
-			if (pfair_higher_prio(__peek_ready(&pfair),
-					      pstate[cpu]->linked))
-				retry = pfair_link(time, cpu,
-						   __take_ready(&pfair));
+			if (pfair_higher_prio(__peek_ready(&cluster->pfair),
+					      cpu_state->linked))
+				retry = pfair_link(time, cpu_id(cpu_state),
+						   __take_ready(&cluster->pfair));
 			else
 				retry = 0;
 		}
 	}
 }
 
-static void schedule_next_quantum(quanta_t time)
+static void schedule_next_quantum(struct pfair_cluster *cluster, quanta_t time)
 {
-	int cpu;
+	struct pfair_state *cpu;
+	struct list_head* pos;
 
 	/* called with interrupts disabled */
 	PTRACE("--- Q %lu at %llu PRE-SPIN\n",
 	       time, litmus_clock());
-	raw_spin_lock(&pfair_lock);
+	raw_spin_lock(cluster_lock(cluster));
 	PTRACE("<<< Q %lu at %llu\n",
 	       time, litmus_clock());
 
 	sched_trace_quantum_boundary();
 
-	advance_subtasks(time);
-	poll_releases(time);
-	schedule_subtasks(time);
+	advance_subtasks(cluster, time);
+	poll_releases(cluster, time);
+	schedule_subtasks(cluster, time);
 
-	for (cpu = 0; cpu < num_online_cpus(); cpu++)
-		if (pstate[cpu]->linked)
+	list_for_each(pos, &cluster->topology.cpus) {
+		cpu = from_cluster_list(pos);
+		if (cpu->linked)
 			PTRACE_TASK(pstate[cpu]->linked,
-				    " linked on %d.\n", cpu);
+				    " linked on %d.\n", cpu_id(cpu));
 		else
-			PTRACE("(null) linked on %d.\n", cpu);
-
+			PTRACE("(null) linked on %d.\n", cpu_id(cpu));
+	}
 	/* We are done. Advance time. */
 	mb();
-	for (cpu = 0; cpu < num_online_cpus(); cpu++) {
-		if (pstate[cpu]->local_tick != pstate[cpu]->cur_tick) {
+	list_for_each(pos, &cluster->topology.cpus) {
+		cpu = from_cluster_list(pos);
+		if (cpu->local_tick != cpu->cur_tick) {
 			TRACE("BAD Quantum not acked on %d "
 			      "(l:%lu c:%lu p:%lu)\n",
-			      cpu,
-			      pstate[cpu]->local_tick,
-			      pstate[cpu]->cur_tick,
-			      pfair_time);
-			pstate[cpu]->missed_quanta++;
+			      cpu_id(cpu),
+			      cpu->local_tick,
+			      cpu->cur_tick,
+			      cluster->pfair_time);
+			cpu->missed_quanta++;
 		}
-		pstate[cpu]->cur_tick = time;
+		cpu->cur_tick = time;
 	}
 	PTRACE(">>> Q %lu at %llu\n",
 	       time, litmus_clock());
-	raw_spin_unlock(&pfair_lock);
+	raw_spin_unlock(cluster_lock(cluster));
 }
 
 static noinline void wait_for_quantum(quanta_t q, struct pfair_state* state)
@@ -479,12 +535,12 @@ static void catchup_quanta(quanta_t from, quanta_t target,
 	while (time_before(cur, target)) {
 		wait_for_quantum(cur, state);
 		cur++;
-		time = cmpxchg(&pfair_time,
+		time = cmpxchg(&cpu_cluster(state)->pfair_time,
 			       cur - 1,   /* expected */
 			       cur        /* next */
 			);
 		if (time == cur - 1)
-			schedule_next_quantum(cur);
+			schedule_next_quantum(cpu_cluster(state), cur);
 	}
 	TRACE("+++> catching up done\n");
 }
@@ -505,14 +561,14 @@ static void pfair_tick(struct task_struct* t)
 		/* Attempt to advance time. First CPU to get here
 		 * will prepare the next quantum.
 		 */
-		time = cmpxchg(&pfair_time,
+		time = cmpxchg(&cpu_cluster(state)->pfair_time,
 			       cur - 1,   /* expected */
 			       cur        /* next */
 			);
 		if (time == cur - 1) {
 			/* exchange succeeded */
 			wait_for_quantum(cur - 1, state);
-			schedule_next_quantum(cur);
+			schedule_next_quantum(cpu_cluster(state), cur);
 			retry = 0;
 		} else if (time_before(time, cur - 1)) {
 			/* the whole system missed a tick !? */
@@ -562,59 +618,65 @@ static struct task_struct* pfair_schedule(struct task_struct * prev)
 	int blocks;
 	struct task_struct* next = NULL;
 
-	raw_spin_lock(&pfair_lock);
+	raw_spin_lock(cpu_lock(state));
 
 	blocks = is_realtime(prev) && !is_running(prev);
 
-	if (state->local && safe_to_schedule(state->local, state->cpu))
+	if (state->local && safe_to_schedule(state->local, cpu_id(state)))
 		next = state->local;
 
 	if (prev != next) {
 		tsk_rt(prev)->scheduled_on = NO_CPU;
 		if (next)
-			tsk_rt(next)->scheduled_on = state->cpu;
+			tsk_rt(next)->scheduled_on = cpu_id(state);
 	}
 	sched_state_task_picked();
-	raw_spin_unlock(&pfair_lock);
+	raw_spin_unlock(cpu_lock(state));
 
 	if (next)
 		TRACE_TASK(next, "scheduled rel=%lu at %lu (%llu)\n",
-			   tsk_pfair(next)->release, pfair_time, litmus_clock());
+			   tsk_pfair(next)->release, cpu_cluster(state)->pfair_time, litmus_clock());
 	else if (is_realtime(prev))
-		TRACE("Becomes idle at %lu (%llu)\n", pfair_time, litmus_clock());
+		TRACE("Becomes idle at %lu (%llu)\n", cpu_cluster(state)->pfair_time, litmus_clock());
 
 	return next;
 }
 
 static void pfair_task_new(struct task_struct * t, int on_rq, int running)
 {
 	unsigned long flags;
+	struct pfair_cluster* cluster;
 
 	TRACE("pfair: task new %d state:%d\n", t->pid, t->state);
 
-	raw_spin_lock_irqsave(&pfair_lock, flags);
+	cluster = tsk_pfair(t)->cluster;
+
+	raw_spin_lock_irqsave(cluster_lock(cluster), flags);
 	if (running)
 		t->rt_param.scheduled_on = task_cpu(t);
 	else
 		t->rt_param.scheduled_on = NO_CPU;
 
-	prepare_release(t, pfair_time + 1);
+	prepare_release(t, cluster->pfair_time + 1);
 	tsk_pfair(t)->sporadic_release = 0;
-	pfair_add_release(t);
+	pfair_add_release(cluster, t);
 	check_preempt(t);
 
-	raw_spin_unlock_irqrestore(&pfair_lock, flags);
+	raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
 }
 
 static void pfair_task_wake_up(struct task_struct *t)
 {
 	unsigned long flags;
 	lt_t now;
+	struct pfair_cluster* cluster;
+
+	cluster = tsk_pfair(t)->cluster;
 
 	TRACE_TASK(t, "wakes at %llu, release=%lu, pfair_time:%lu\n",
-		   litmus_clock(), cur_release(t), pfair_time);
+		   litmus_clock(), cur_release(t), cluster->pfair_time);
 
-	raw_spin_lock_irqsave(&pfair_lock, flags);
+	raw_spin_lock_irqsave(cluster_lock(cluster), flags);
 
 	/* It is a little unclear how to deal with Pfair
 	 * tasks that block for a while and then wake. For now,
@@ -629,13 +691,13 @@ static void pfair_task_wake_up(struct task_struct *t)
 		prepare_release(t, time2quanta(now, CEIL));
 		sched_trace_task_release(t);
 		/* FIXME: race with pfair_time advancing */
-		pfair_add_release(t);
+		pfair_add_release(cluster, t);
 		tsk_pfair(t)->sporadic_release = 0;
 	}
 
 	check_preempt(t);
 
-	raw_spin_unlock_irqrestore(&pfair_lock, flags);
+	raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
 	TRACE_TASK(t, "wake up done at %llu\n", litmus_clock());
 }
 
@@ -649,9 +711,12 @@ static void pfair_task_block(struct task_struct *t)
 static void pfair_task_exit(struct task_struct * t)
 {
 	unsigned long flags;
+	struct pfair_cluster *cluster;
 
 	BUG_ON(!is_realtime(t));
 
+	cluster = tsk_pfair(t)->cluster;
+
 	/* Remote task from release or ready queue, and ensure
 	 * that it is not the scheduled task for ANY CPU. We
 	 * do this blanket check because occassionally when
@@ -659,12 +724,12 @@ static void pfair_task_exit(struct task_struct * t)
 	 * might not be the same as the CPU that the PFAIR scheduler
 	 * has chosen for it.
 	 */
-	raw_spin_lock_irqsave(&pfair_lock, flags);
+	raw_spin_lock_irqsave(cluster_lock(cluster), flags);
 
 	TRACE_TASK(t, "RIP, state:%d\n", t->state);
 	drop_all_references(t);
 
-	raw_spin_unlock_irqrestore(&pfair_lock, flags);
+	raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
 
 	kfree(t->rt_param.pfair);
 	t->rt_param.pfair = NULL;
@@ -676,27 +741,32 @@ static void pfair_release_at(struct task_struct* task, lt_t start)
 	unsigned long flags;
 	quanta_t release;
 
+	struct pfair_cluster *cluster;
+
+	cluster = tsk_pfair(task)->cluster;
+
 	BUG_ON(!is_realtime(task));
 
-	raw_spin_lock_irqsave(&pfair_lock, flags);
+	raw_spin_lock_irqsave(cluster_lock(cluster), flags);
 	release_at(task, start);
 	release = time2quanta(start, CEIL);
 
-	if (release - pfair_time >= PFAIR_MAX_PERIOD)
-		release = pfair_time + PFAIR_MAX_PERIOD;
+	/* FIXME: support arbitrary offsets. */
+	if (release - cluster->pfair_time >= PFAIR_MAX_PERIOD)
+		release = cluster->pfair_time + PFAIR_MAX_PERIOD;
 
 	TRACE_TASK(task, "sys release at %lu\n", release);
 
 	drop_all_references(task);
 	prepare_release(task, release);
-	pfair_add_release(task);
+	pfair_add_release(cluster, task);
 
 	/* Clear sporadic release flag, since this release subsumes any
 	 * sporadic release on wake.
 	 */
 	tsk_pfair(task)->sporadic_release = 0;
 
-	raw_spin_unlock_irqrestore(&pfair_lock, flags);
+	raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
 }
 
 static void init_subtask(struct subtask* sub, unsigned long i,
@@ -755,6 +825,11 @@ static long pfair_admit_task(struct task_struct* t)
 	struct pfair_param* param;
 	unsigned long i;
 
+	/* first check that the task is in the right cluster */
+	if (cpu_cluster(pstate[tsk_rt(t)->task_params.cpu]) !=
+	    cpu_cluster(pstate[task_cpu(t)]))
+		return -EINVAL;
+
 	/* Pfair is a tick-based method, so the time
 	 * of interest is jiffies. Calculate tick-based
 	 * times for everything.
@@ -798,6 +873,8 @@ static long pfair_admit_task(struct task_struct* t)
 	param->release = 0;
 	param->period  = period;
 
+	param->cluster = cpu_cluster(pstate[tsk_rt(t)->task_params.cpu]);
+
 	for (i = 0; i < quanta; i++)
 		init_subtask(param->subtasks + i, i, quanta, period);
 
@@ -813,24 +890,88 @@ static long pfair_admit_task(struct task_struct* t)
 	return 0;
 }
 
+static void pfair_init_cluster(struct pfair_cluster* cluster)
+{
+	int i;
+
+	/* initialize release queue */
+	for (i = 0; i < PFAIR_MAX_PERIOD; i++)
+		bheap_init(&cluster->release_queue[i]);
+	rt_domain_init(&cluster->pfair, pfair_ready_order, NULL, NULL);
+	INIT_LIST_HEAD(&cluster->topology.cpus);
+}
+
+static void cleanup_clusters(void)
+{
+	int i;
+
+	if (num_pfair_clusters)
+		kfree(pfair_clusters);
+	pfair_clusters = NULL;
+	num_pfair_clusters = 0;
+
+	/* avoid stale pointers */
+	for (i = 0; i < NR_CPUS; i++)
+		pstate[i]->topology.cluster = NULL;
+}
+
 static long pfair_activate_plugin(void)
 {
-	int cpu;
+	int err, i;
 	struct pfair_state* state;
+	struct pfair_cluster* cluster ;
+	quanta_t now;
+	int cluster_size;
+	struct cluster_cpu* cpus[NR_CPUS];
+	struct scheduling_cluster* clust[NR_CPUS];
 
-	state = &__get_cpu_var(pfair_state);
-	pfair_time = current_quantum(state);
+	cluster_size = get_cluster_size(pfair_cluster_level);
 
-	TRACE("Activating PFAIR at q=%lu\n", pfair_time);
+	if (cluster_size <= 0 || num_online_cpus() % cluster_size != 0)
+		return -EINVAL;
 
-	for (cpu = 0; cpu < num_online_cpus(); cpu++) {
-		state = &per_cpu(pfair_state, cpu);
-		state->cur_tick   = pfair_time;
-		state->local_tick = pfair_time;
+	num_pfair_clusters = num_online_cpus() / cluster_size;
+
+	pfair_clusters = kzalloc(num_pfair_clusters * sizeof(struct pfair_cluster), GFP_ATOMIC);
+	if (!pfair_clusters) {
+		num_pfair_clusters = 0;
+		printk(KERN_ERR "Could not allocate Pfair clusters!\n");
+		return -ENOMEM;
+	}
+
+	state = &__get_cpu_var(pfair_state);
+	now = current_quantum(state);
+	TRACE("Activating PFAIR at q=%lu\n", now);
+
+	for (i = 0; i < num_pfair_clusters; i++) {
+		cluster = &pfair_clusters[i];
+		pfair_init_cluster(cluster);
+		cluster->pfair_time = now;
+		clust[i] = &cluster->topology;
+	}
+
+	for (i = 0; i < num_online_cpus(); i++) {
+		state = &per_cpu(pfair_state, i);
+		state->cur_tick   = now;
+		state->local_tick = now;
 		state->missed_quanta = 0;
-		state->offset     = cpu_stagger_offset(cpu);
+		state->offset     = cpu_stagger_offset(i);
+		printk(KERN_ERR "cpus[%d] set; %d\n", i, num_online_cpus());
+		cpus[i] = &state->topology;
 	}
 
+	err = assign_cpus_to_clusters(pfair_cluster_level, clust, num_pfair_clusters,
+				      cpus, num_online_cpus());
+
+	if (err < 0)
+		cleanup_clusters();
+
+	return err;
+}
+
+static long pfair_deactivate_plugin(void)
+{
+	cleanup_clusters();
 	return 0;
 }
 
@@ -847,30 +988,29 @@ static struct sched_plugin pfair_plugin __cacheline_aligned_in_smp = {
 	.release_at		= pfair_release_at,
 	.complete_job		= complete_job,
 	.activate_plugin	= pfair_activate_plugin,
+	.deactivate_plugin	= pfair_deactivate_plugin,
 };
 
+
+static struct proc_dir_entry *cluster_file = NULL, *pfair_dir = NULL;
+
 static int __init init_pfair(void)
 {
-	int cpu, i;
+	int cpu, err, fs;
 	struct pfair_state *state;
 
-
 	/*
 	 * initialize short_cut for per-cpu pfair state;
 	 * there may be a problem here if someone removes a cpu
 	 * while we are doing this initialization... and if cpus
-	 * are added / removed later... is it a _real_ problem?
+	 * are added / removed later... but we don't support CPU hotplug atm anyway.
 	 */
 	pstate = kmalloc(sizeof(struct pfair_state*) * num_online_cpus(), GFP_KERNEL);
 
-	/* initialize release queue */
-	for (i = 0; i < PFAIR_MAX_PERIOD; i++)
-		bheap_init(&release_queue[i]);
-
 	/* initialize CPU state */
 	for (cpu = 0; cpu < num_online_cpus(); cpu++)  {
 		state = &per_cpu(pfair_state, cpu);
-		state->cpu 	  = cpu;
+		state->topology.id = cpu;
 		state->cur_tick   = 0;
 		state->local_tick = 0;
 		state->linked     = NULL;
@@ -881,13 +1021,29 @@ static int __init init_pfair(void)
 		pstate[cpu] = state;
 	}
 
-	rt_domain_init(&pfair, pfair_ready_order, NULL, NULL);
-	return register_sched_plugin(&pfair_plugin);
+	pfair_clusters = NULL;
+	num_pfair_clusters = 0;
+
+	err = register_sched_plugin(&pfair_plugin);
+	if (!err) {
+		fs = make_plugin_proc_dir(&pfair_plugin, &pfair_dir);
+		if (!fs)
+			cluster_file = create_cluster_file(pfair_dir, &pfair_cluster_level);
+		else
+			printk(KERN_ERR "Could not allocate PFAIR procfs dir.\n");
+	}
+
+	return err;
 }
 
 static void __exit clean_pfair(void)
 {
 	kfree(pstate);
+
+	if (cluster_file)
+		remove_proc_entry("cluster", pfair_dir);
+	if (pfair_dir)
+		remove_plugin_proc_dir(&pfair_plugin);
 }
 
 module_init(init_pfair);