author | Bjoern B. Brandenburg <bbb@cs.unc.edu> | 2008-05-04 18:07:35 -0400 |
---|---|---|
committer | Bjoern B. Brandenburg <bbb@cs.unc.edu> | 2008-05-04 18:07:35 -0400 |
commit | 6b06d1ce233787655eb21b624ed924806768b36c (patch) | |
tree | c0ffcd31b0831719615e83b63b2022d41faf6208 | |
parent | 9b4ccbc27ea5d1a35e79391ca5a500b32cd253a1 (diff) |
LITMUS: avoid using the same stack on two CPUs in global schedulers
This change fixes a race where a job could be executed on more than one
CPU, which led to random crashes.
-rw-r--r-- | include/litmus/rt_param.h | 10
-rw-r--r-- | kernel/sched.c | 6
-rw-r--r-- | litmus/sched_gsn_edf.c | 9
-rw-r--r-- | litmus/sched_litmus.c | 47
4 files changed, 60 insertions, 12 deletions
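In short, the patch introduces a per-task stack_in_use flag: the CPU that completes a context switch away from a task clears it in finish_task_switch(), and litmus_schedule() on another CPU spins until it is cleared before switching to that task, then claims the stack for itself. Below is a minimal user-space model of that handshake (illustrative only; the thread roles, the sleep, and the helper names are made up and are not LITMUS^RT code):

```c
/* User-space model of the stack_in_use handshake added by this patch.
 * "CPU 0" finishes the context switch away from the task and releases
 * its stack; "CPU 1" spins until the stack is free before it switches
 * to the task, mirroring the loop added to litmus_schedule(). */
#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

#define NO_CPU (-1)

struct task {
    volatile int stack_in_use;  /* CPU still running on this task's stack */
};

static struct task t = { .stack_in_use = 0 };   /* task last ran on CPU 0 */

static void *cpu0_finish_switch(void *arg)
{
    usleep(1000);               /* CPU 0 is still unwinding the old stack */
    __sync_synchronize();
    t.stack_in_use = NO_CPU;    /* finish_task_switch(): stack is now free */
    return NULL;
}

static void *cpu1_schedule(void *arg)
{
    /* litmus_schedule(): don't switch to the task while its stack is
     * still in use on another CPU. */
    while (t.stack_in_use != NO_CPU)
        __sync_synchronize();   /* stands in for cpu_relax(); mb(); */
    t.stack_in_use = 1;         /* claim the stack for CPU 1 */
    printf("CPU 1 can now safely switch to the task\n");
    return NULL;
}

int main(void)
{
    pthread_t a, b;
    pthread_create(&a, NULL, cpu0_finish_switch, NULL);
    pthread_create(&b, NULL, cpu1_schedule, NULL);
    pthread_join(a, NULL);
    pthread_join(b, NULL);
    return 0;
}
```

The mb()/cpu_relax() pair in the real patch plays the role of the barrier here: the waiting CPU must observe the store performed by finish_task_switch() on the other CPU before it reuses the stack.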
diff --git a/include/litmus/rt_param.h b/include/litmus/rt_param.h
index 76be2fe4be..a5e939daa5 100644
--- a/include/litmus/rt_param.h
+++ b/include/litmus/rt_param.h
@@ -111,6 +111,16 @@ struct rt_param { | |||
111 | */ | 111 | */ |
112 | volatile int scheduled_on; | 112 | volatile int scheduled_on; |
113 | 113 | ||
114 | /* Is the stack of the task currently in use? Currently, this | ||
115 | * is the responsibility of the plugin to update this field. | ||
116 | * Maybe become part of the LITMUS core some day. | ||
117 | * | ||
118 | * Used by GSN-EDF. | ||
119 | * | ||
120 | * Be careful to avoid deadlocks! | ||
121 | */ | ||
122 | volatile int stack_in_use; | ||
123 | |||
114 | /* This field can be used by plugins to store where the task | 124 | /* This field can be used by plugins to store where the task |
115 | * is currently linked. It is the responsibility of the plugin | 125 | * is currently linked. It is the responsibility of the plugin |
116 | * to avoid race conditions. | 126 | * to avoid race conditions. |
diff --git a/kernel/sched.c b/kernel/sched.c
index 441996e08c..d9e876fea8 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1897,6 +1897,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) | |||
1897 | finish_arch_switch(prev); | 1897 | finish_arch_switch(prev); |
1898 | litmus->finish_switch(prev); | 1898 | litmus->finish_switch(prev); |
1899 | finish_lock_switch(rq, prev); | 1899 | finish_lock_switch(rq, prev); |
1900 | prev->rt_param.stack_in_use = NO_CPU; | ||
1900 | fire_sched_in_preempt_notifiers(current); | 1901 | fire_sched_in_preempt_notifiers(current); |
1901 | if (mm) | 1902 | if (mm) |
1902 | mmdrop(mm); | 1903 | mmdrop(mm); |
@@ -3679,6 +3680,7 @@ need_resched_nonpreemptible: | |||
3679 | rq->curr = next; | 3680 | rq->curr = next; |
3680 | ++*switch_count; | 3681 | ++*switch_count; |
3681 | 3682 | ||
3683 | TRACE_TASK(next, "switched to\n"); | ||
3682 | context_switch(rq, prev, next); /* unlocks the rq */ | 3684 | context_switch(rq, prev, next); /* unlocks the rq */ |
3683 | } else | 3685 | } else |
3684 | spin_unlock_irq(&rq->lock); | 3686 | spin_unlock_irq(&rq->lock); |
@@ -4391,8 +4393,10 @@ recheck: | |||
4391 | oldprio = p->prio; | 4393 | oldprio = p->prio; |
4392 | __setscheduler(rq, p, policy, param->sched_priority); | 4394 | __setscheduler(rq, p, policy, param->sched_priority); |
4393 | 4395 | ||
4394 | if (policy == SCHED_LITMUS) | 4396 | if (policy == SCHED_LITMUS) { |
4397 | p->rt_param.stack_in_use = running ? rq->cpu : NO_CPU; | ||
4395 | litmus->task_new(p, on_rq, running); | 4398 | litmus->task_new(p, on_rq, running); |
4399 | } | ||
4396 | 4400 | ||
4397 | if (on_rq) { | 4401 | if (on_rq) { |
4398 | if (running) | 4402 | if (running) |
diff --git a/litmus/sched_gsn_edf.c b/litmus/sched_gsn_edf.c
index c988e91e6e..eb0f4c0b36 100644
--- a/litmus/sched_gsn_edf.c
+++ b/litmus/sched_gsn_edf.c
@@ -466,10 +466,8 @@ static struct task_struct* gsnedf_schedule(struct task_struct * prev) | |||
466 | TRACE_TASK(next, "scheduled at %llu\n", litmus_clock()); | 466 | TRACE_TASK(next, "scheduled at %llu\n", litmus_clock()); |
467 | else if (exists && !next) | 467 | else if (exists && !next) |
468 | TRACE("becomes idle at %llu.\n", litmus_clock()); | 468 | TRACE("becomes idle at %llu.\n", litmus_clock()); |
469 | /* don't race with a concurrent switch */ | 469 | |
470 | if (next && prev != next) | 470 | |
471 | while (next->rt_param.scheduled_on != NO_CPU) | ||
472 | cpu_relax(); | ||
473 | return next; | 471 | return next; |
474 | } | 472 | } |
475 | 473 | ||
@@ -481,9 +479,6 @@ static void gsnedf_finish_switch(struct task_struct *prev) | |||
481 | cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries); | 479 | cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries); |
482 | 480 | ||
483 | entry->scheduled = is_realtime(current) ? current : NULL; | 481 | entry->scheduled = is_realtime(current) ? current : NULL; |
484 | |||
485 | prev->rt_param.scheduled_on = NO_CPU; | ||
486 | current->rt_param.scheduled_on = smp_processor_id(); | ||
487 | } | 482 | } |
488 | 483 | ||
489 | 484 | ||
diff --git a/litmus/sched_litmus.c b/litmus/sched_litmus.c
index feb0159033..ab52ae9510 100644
--- a/litmus/sched_litmus.c
+++ b/litmus/sched_litmus.c
@@ -21,10 +21,13 @@ static void litmus_tick(struct rq *rq, struct task_struct *p) | |||
21 | litmus->tick(p); | 21 | litmus->tick(p); |
22 | } | 22 | } |
23 | 23 | ||
24 | #define NO_CPU -1 | ||
25 | |||
24 | static void litmus_schedule(struct rq *rq, struct task_struct *prev) | 26 | static void litmus_schedule(struct rq *rq, struct task_struct *prev) |
25 | { | 27 | { |
26 | struct rq* other_rq; | 28 | struct rq* other_rq; |
27 | long prev_state; | 29 | long prev_state; |
30 | lt_t _maybe_deadlock = 0; | ||
28 | /* WARNING: rq is _not_ locked! */ | 31 | /* WARNING: rq is _not_ locked! */ |
29 | if (is_realtime(prev)) | 32 | if (is_realtime(prev)) |
30 | update_time_litmus(rq, prev); | 33 | update_time_litmus(rq, prev); |
@@ -43,11 +46,43 @@ static void litmus_schedule(struct rq *rq, struct task_struct *prev) | |||
43 | */ | 46 | */ |
44 | prev_state = prev->state; | 47 | prev_state = prev->state; |
45 | spin_unlock(&rq->lock); | 48 | spin_unlock(&rq->lock); |
49 | |||
50 | /* Don't race with a concurrent switch. | ||
51 | * This could deadlock in the case of cross or circular migrations. | ||
52 | * It's the job of the plugin to make sure that doesn't happen. | ||
53 | */ | ||
54 | TRACE_TASK(rq->litmus_next, "stack_in_use=%d\n", | ||
55 | rq->litmus_next->rt_param.stack_in_use); | ||
56 | if (rq->litmus_next->rt_param.stack_in_use != NO_CPU) { | ||
57 | TRACE_TASK(rq->litmus_next, "waiting to deschedule\n"); | ||
58 | _maybe_deadlock = litmus_clock(); | ||
59 | } | ||
60 | while (rq->litmus_next->rt_param.stack_in_use != NO_CPU) { | ||
61 | cpu_relax(); | ||
62 | mb(); | ||
63 | if (rq->litmus_next->rt_param.stack_in_use == NO_CPU) | ||
64 | TRACE_TASK(rq->litmus_next, "descheduled. Proceeding.\n"); | ||
65 | if (lt_before(_maybe_deadlock + 10000000, litmus_clock())) { | ||
66 | /* We've been spinning for 10ms. | ||
67 | * Something can't be right! | ||
68 | * Let's abandon the task and bail out; at least | ||
69 | * we will have debug info instead of a hard deadlock. | ||
70 | */ | ||
71 | TRACE_TASK(rq->litmus_next, | ||
72 | "stack too long in use. Deadlock?\n"); | ||
73 | rq->litmus_next = NULL; | ||
74 | |||
75 | /* bail out */ | ||
76 | spin_lock(&rq->lock); | ||
77 | return; | ||
78 | } | ||
79 | } | ||
80 | |||
46 | double_rq_lock(rq, other_rq); | 81 | double_rq_lock(rq, other_rq); |
47 | if (prev->state != prev_state) { | 82 | if (prev->state != prev_state) { |
48 | TRACE_TASK(prev, | 83 | TRACE_TASK(prev, |
49 | "state changed while we dropped" | 84 | "state changed while we dropped" |
50 | " the lock: now=%d, old=%d", | 85 | " the lock: now=%d, old=%d\n", |
51 | prev->state, prev_state); | 86 | prev->state, prev_state); |
52 | if (prev_state && !prev->state) { | 87 | if (prev_state && !prev->state) { |
53 | /* prev task became unblocked | 88 | /* prev task became unblocked |
@@ -61,7 +96,7 @@ static void litmus_schedule(struct rq *rq, struct task_struct *prev) | |||
61 | 96 | ||
62 | set_task_cpu(rq->litmus_next, smp_processor_id()); | 97 | set_task_cpu(rq->litmus_next, smp_processor_id()); |
63 | 98 | ||
64 | /* now that we have the lock we need to make sure a | 99 | /* DEBUG: now that we have the lock we need to make sure a |
65 | * couple of things still hold: | 100 | * couple of things still hold: |
66 | * - it is still a real-time task | 101 | * - it is still a real-time task |
67 | * - it is still runnable (could have been stopped) | 102 | * - it is still runnable (could have been stopped) |
@@ -71,12 +106,16 @@ static void litmus_schedule(struct rq *rq, struct task_struct *prev) | |||
71 | /* BAD BAD BAD */ | 106 | /* BAD BAD BAD */ |
72 | TRACE_TASK(rq->litmus_next, | 107 | TRACE_TASK(rq->litmus_next, |
73 | "migration invariant FAILED: rt=%d running=%d\n", | 108 | "migration invariant FAILED: rt=%d running=%d\n", |
74 | is_realtime(rq->litmus_next), | 109 | is_realtime(rq->litmus_next), |
75 | is_running(rq->litmus_next)); | 110 | is_running(rq->litmus_next)); |
111 | /* drop the task */ | ||
112 | rq->litmus_next = NULL; | ||
76 | } | 113 | } |
77 | /* release the other CPU's runqueue, but keep ours */ | 114 | /* release the other CPU's runqueue, but keep ours */ |
78 | spin_unlock(&other_rq->lock); | 115 | spin_unlock(&other_rq->lock); |
79 | } | 116 | } |
117 | if (rq->litmus_next) | ||
118 | rq->litmus_next->rt_param.stack_in_use = rq->cpu; | ||
80 | } | 119 | } |
81 | 120 | ||
82 | static void enqueue_task_litmus(struct rq *rq, struct task_struct *p, int wakeup) | 121 | static void enqueue_task_litmus(struct rq *rq, struct task_struct *p, int wakeup) |
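The spin loop added to litmus_schedule() also guards against a hard deadlock in the cross- or circular-migration case: if the chosen task's stack stays in use for more than 10 ms, the pick is dropped (rq->litmus_next = NULL) and the scheduler bails out with a trace message instead of hanging. A rough user-space sketch of that bounded-spin pattern (hypothetical helpers; now_ns() and stack_released() stand in for litmus_clock() and the stack_in_use test):

```c
/* Sketch of the "spin, but give up after ~10 ms" guard used above.
 * This is a user-space approximation, not the kernel code itself. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>
#include <time.h>

static uint64_t now_ns(void)
{
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (uint64_t)ts.tv_sec * 1000000000ull + ts.tv_nsec;
}

/* Returns true once cond() holds, false if we abandon after 10 ms. */
static bool spin_with_deadlock_guard(bool (*cond)(void))
{
    uint64_t maybe_deadlock = now_ns();

    while (!cond()) {
        if (now_ns() - maybe_deadlock > 10000000ull) {  /* 10 ms, as in the patch */
            fprintf(stderr, "spun too long - possible deadlock, bailing out\n");
            return false;   /* caller drops the task, as the patch does */
        }
    }
    return true;
}

static bool stack_released(void) { return false; }  /* simulate a stuck stack */

int main(void)
{
    return spin_with_deadlock_guard(stack_released) ? 0 : 1;
}
```

Giving up after a bounded wait trades one lost scheduling decision for a system that stays alive and leaves trace output from which the migration cycle can be debugged.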