author     Peter Zijlstra <peterz@infradead.org>   2018-04-30 08:51:01 -0400
committer  Ingo Molnar <mingo@kernel.org>          2018-05-04 01:54:54 -0400
commit     b5bf9a90bbebffba888c9144c5a8a10317b04064 (patch)
tree       4c059f0785c26ca66df0009544a23b68f7228be3
parent     85f1abe0019fcb3ea10df7029056cf42702283a8 (diff)
sched/core: Introduce set_special_state()
Gaurav reported a perceived problem with TASK_PARKED, which turned out
to be a broken wait-loop pattern in __kthread_parkme(), but the
reported issue can (and does) in fact happen for states that do not do
condition-based sleeps.

When the 'current->state = TASK_RUNNING' store of a previous
(concurrent) try_to_wake_up() collides with the setting of a 'special'
sleep state, we can lose the sleep state.

Normal condition-based wait-loops are immune to this problem, but
sleep states that are not condition based are subject to it.

There already is a fix for TASK_DEAD. Abstract that and also apply it
to TASK_STOPPED and TASK_TRACED, both of which also lack a
condition-based wait-loop.
Reported-by: Gaurav Kohli <gkohli@codeaurora.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
-rw-r--r--   include/linux/sched.h          | 50
-rw-r--r--   include/linux/sched/signal.h   |  2
-rw-r--r--   kernel/sched/core.c            | 17
-rw-r--r--   kernel/signal.c                | 17
4 files changed, 62 insertions, 24 deletions
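To make the race described above concrete, here is a minimal userspace sketch (assuming C11 and pthreads, built with e.g. cc -pthread race.c): the waker's state check and its TASK_RUNNING store form one critical section under a lock standing in for ->pi_lock, and the special-state store takes the same lock, so a delayed wakeup can no longer overwrite the special state. Every name below (task_state, pi_lock, waker(), set_special_state_sketch()) is an illustrative stand-in for this sketch, not a kernel API.

/*
 * Illustrative userspace analogue only -- not kernel code.  The state values,
 * the pi_lock stand-in and the function names are made up for this sketch.
 */
#include <pthread.h>
#include <stdio.h>

enum { TASK_RUNNING, TASK_UNINTERRUPTIBLE, TASK_DEAD };

static int task_state = TASK_UNINTERRUPTIBLE;   /* task slept earlier; a wakeup is in flight */
static pthread_mutex_t pi_lock = PTHREAD_MUTEX_INITIALIZER;

/*
 * Waker: the check of the sleeper's state and the TASK_RUNNING store form one
 * critical section, mirroring how try_to_wake_up() operates under ->pi_lock.
 */
static void *waker(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&pi_lock);
	if (task_state == TASK_UNINTERRUPTIBLE)  /* only wake a normal sleep */
		task_state = TASK_RUNNING;
	pthread_mutex_unlock(&pi_lock);
	return NULL;
}

/*
 * Sketch of the set_special_state() idea: the store happens under the same
 * lock, so an in-flight waker either completes first or observes the special
 * state and leaves it alone.
 */
static void set_special_state_sketch(int state)
{
	pthread_mutex_lock(&pi_lock);
	task_state = state;
	pthread_mutex_unlock(&pi_lock);
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, waker, NULL);
	set_special_state_sketch(TASK_DEAD);
	pthread_join(t, NULL);

	/* Whichever thread ran first, TASK_DEAD is never overwritten. */
	printf("final state: %d (TASK_DEAD=%d)\n", task_state, TASK_DEAD);
	return 0;
}

With a plain unlocked store in set_special_state_sketch(), an in-flight waker could still write TASK_RUNNING after the TASK_DEAD store and the special state would be lost; that is the collision the patch below closes for TASK_DEAD, TASK_STOPPED and TASK_TRACED.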
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b3d697f3b573..c2413703f45d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -112,17 +112,36 @@ struct task_group;
 
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 
+/*
+ * Special states are those that do not use the normal wait-loop pattern. See
+ * the comment with set_special_state().
+ */
+#define is_special_task_state(state)				\
+	((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_DEAD))
+
 #define __set_current_state(state_value)			\
 	do {							\
+		WARN_ON_ONCE(is_special_task_state(state_value));\
 		current->task_state_change = _THIS_IP_;		\
 		current->state = (state_value);			\
 	} while (0)
+
 #define set_current_state(state_value)				\
 	do {							\
+		WARN_ON_ONCE(is_special_task_state(state_value));\
 		current->task_state_change = _THIS_IP_;		\
 		smp_store_mb(current->state, (state_value));	\
 	} while (0)
 
+#define set_special_state(state_value)					\
+	do {								\
+		unsigned long flags; /* may shadow */			\
+		WARN_ON_ONCE(!is_special_task_state(state_value));	\
+		raw_spin_lock_irqsave(&current->pi_lock, flags);	\
+		current->task_state_change = _THIS_IP_;			\
+		current->state = (state_value);				\
+		raw_spin_unlock_irqrestore(&current->pi_lock, flags);	\
+	} while (0)
 #else
 /*
  * set_current_state() includes a barrier so that the write of current->state
@@ -144,8 +163,8 @@ struct task_group;
  *
  *   The above is typically ordered against the wakeup, which does:
  *
 *   need_sleep = false;
 *   wake_up_state(p, TASK_UNINTERRUPTIBLE);
  *
  * Where wake_up_state() (and all other wakeup primitives) imply enough
  * barriers to order the store of the variable against wakeup.
@@ -154,12 +173,33 @@ struct task_group;
  * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a
  * TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING).
  *
- * This is obviously fine, since they both store the exact same value.
+ * However, with slightly different timing the wakeup TASK_RUNNING store can
+ * also collide with the TASK_UNINTERRUPTIBLE store. Loosing that store is not
+ * a problem either because that will result in one extra go around the loop
+ * and our @cond test will save the day.
  *
  * Also see the comments of try_to_wake_up().
  */
-#define __set_current_state(state_value) do { current->state = (state_value); } while (0)
-#define set_current_state(state_value) smp_store_mb(current->state, (state_value))
+#define __set_current_state(state_value)				\
+	current->state = (state_value)
+
+#define set_current_state(state_value)					\
+	smp_store_mb(current->state, (state_value))
+
+/*
+ * set_special_state() should be used for those states when the blocking task
+ * can not use the regular condition based wait-loop. In that case we must
+ * serialize against wakeups such that any possible in-flight TASK_RUNNING stores
+ * will not collide with our state change.
+ */
+#define set_special_state(state_value)					\
+	do {								\
+		unsigned long flags; /* may shadow */			\
+		raw_spin_lock_irqsave(&current->pi_lock, flags);	\
+		current->state = (state_value);				\
+		raw_spin_unlock_irqrestore(&current->pi_lock, flags);	\
+	} while (0)
+
 #endif
 
 /* Task command name length: */
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index a7ce74c74e49..113d1ad1ced7 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -280,7 +280,7 @@ static inline void kernel_signal_stop(void)
 {
 	spin_lock_irq(&current->sighand->siglock);
 	if (current->jobctl & JOBCTL_STOP_DEQUEUED)
-		__set_current_state(TASK_STOPPED);
+		set_special_state(TASK_STOPPED);
 	spin_unlock_irq(&current->sighand->siglock);
 
 	schedule();
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7ad60e00a6a8..ffde9eebc846 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3508,23 +3508,8 @@ static void __sched notrace __schedule(bool preempt)
 
 void __noreturn do_task_dead(void)
 {
-	/*
-	 * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
-	 * when the following two conditions become true.
-	 *   - There is race condition of mmap_sem (It is acquired by
-	 *     exit_mm()), and
-	 *   - SMI occurs before setting TASK_RUNINNG.
-	 *     (or hypervisor of virtual machine switches to other guest)
-	 * As a result, we may become TASK_RUNNING after becoming TASK_DEAD
-	 *
-	 * To avoid it, we have to wait for releasing tsk->pi_lock which
-	 * is held by try_to_wake_up()
-	 */
-	raw_spin_lock_irq(&current->pi_lock);
-	raw_spin_unlock_irq(&current->pi_lock);
-
 	/* Causes final put_task_struct in finish_task_switch(): */
-	__set_current_state(TASK_DEAD);
+	set_special_state(TASK_DEAD);
 
 	/* Tell freezer to ignore us: */
 	current->flags |= PF_NOFREEZE;
diff --git a/kernel/signal.c b/kernel/signal.c
index d4ccea599692..9c33163a6165 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1961,14 +1961,27 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
 		return;
 	}
 
+	set_special_state(TASK_TRACED);
+
 	/*
 	 * We're committing to trapping. TRACED should be visible before
 	 * TRAPPING is cleared; otherwise, the tracer might fail do_wait().
 	 * Also, transition to TRACED and updates to ->jobctl should be
 	 * atomic with respect to siglock and should be done after the arch
 	 * hook as siglock is released and regrabbed across it.
+	 *
+	 *     TRACER                           TRACEE
+	 *
+	 *     ptrace_attach()
+	 * [L]   wait_on_bit(JOBCTL_TRAPPING)   [S] set_special_state(TRACED)
+	 *     do_wait()
+	 *       set_current_state()                smp_wmb();
+	 *       ptrace_do_wait()
+	 *         wait_task_stopped()
+	 *           task_stopped_code()
+	 * [L]         task_is_traced()         [S] task_clear_jobctl_trapping();
 	 */
-	set_current_state(TASK_TRACED);
+	smp_wmb();
 
 	current->last_siginfo = info;
 	current->exit_code = exit_code;
@@ -2176,7 +2189,7 @@ static bool do_signal_stop(int signr)
 		if (task_participate_group_stop(current))
 			notify = CLD_STOPPED;
 
-		__set_current_state(TASK_STOPPED);
+		set_special_state(TASK_STOPPED);
 		spin_unlock_irq(&current->sighand->siglock);
 
 		/*