-rw-r--r--	include/linux/sched.h		50
-rw-r--r--	include/linux/sched/signal.h	 2
-rw-r--r--	kernel/sched/core.c		17
-rw-r--r--	kernel/signal.c			17
4 files changed, 62 insertions, 24 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index b3d697f3b573..c2413703f45d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -112,17 +112,36 @@ struct task_group;
 
 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
 
+/*
+ * Special states are those that do not use the normal wait-loop pattern. See
+ * the comment with set_special_state().
+ */
+#define is_special_task_state(state)				\
+	((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_DEAD))
+
 #define __set_current_state(state_value)			\
 	do {							\
+		WARN_ON_ONCE(is_special_task_state(state_value));\
 		current->task_state_change = _THIS_IP_;		\
 		current->state = (state_value);			\
 	} while (0)
+
 #define set_current_state(state_value)				\
 	do {							\
+		WARN_ON_ONCE(is_special_task_state(state_value));\
 		current->task_state_change = _THIS_IP_;	\
 		smp_store_mb(current->state, (state_value));	\
 	} while (0)
 
+#define set_special_state(state_value)					\
+	do {								\
+		unsigned long flags; /* may shadow */			\
+		WARN_ON_ONCE(!is_special_task_state(state_value));	\
+		raw_spin_lock_irqsave(&current->pi_lock, flags);	\
+		current->task_state_change = _THIS_IP_;			\
+		current->state = (state_value);				\
+		raw_spin_unlock_irqrestore(&current->pi_lock, flags);	\
+	} while (0)
 #else
 /*
  * set_current_state() includes a barrier so that the write of current->state
| @@ -144,8 +163,8 @@ struct task_group; | |||
| 144 | * | 163 | * |
| 145 | * The above is typically ordered against the wakeup, which does: | 164 | * The above is typically ordered against the wakeup, which does: |
| 146 | * | 165 | * |
| 147 | * need_sleep = false; | 166 | * need_sleep = false; |
| 148 | * wake_up_state(p, TASK_UNINTERRUPTIBLE); | 167 | * wake_up_state(p, TASK_UNINTERRUPTIBLE); |
| 149 | * | 168 | * |
| 150 | * Where wake_up_state() (and all other wakeup primitives) imply enough | 169 | * Where wake_up_state() (and all other wakeup primitives) imply enough |
| 151 | * barriers to order the store of the variable against wakeup. | 170 | * barriers to order the store of the variable against wakeup. |
| @@ -154,12 +173,33 @@ struct task_group; | |||
| 154 | * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a | 173 | * once it observes the TASK_UNINTERRUPTIBLE store the waking CPU can issue a |
| 155 | * TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING). | 174 | * TASK_RUNNING store which can collide with __set_current_state(TASK_RUNNING). |
| 156 | * | 175 | * |
| 157 | * This is obviously fine, since they both store the exact same value. | 176 | * However, with slightly different timing the wakeup TASK_RUNNING store can |
| 177 | * also collide with the TASK_UNINTERRUPTIBLE store. Loosing that store is not | ||
| 178 | * a problem either because that will result in one extra go around the loop | ||
| 179 | * and our @cond test will save the day. | ||
| 158 | * | 180 | * |
| 159 | * Also see the comments of try_to_wake_up(). | 181 | * Also see the comments of try_to_wake_up(). |
| 160 | */ | 182 | */ |
| 161 | #define __set_current_state(state_value) do { current->state = (state_value); } while (0) | 183 | #define __set_current_state(state_value) \ |
| 162 | #define set_current_state(state_value) smp_store_mb(current->state, (state_value)) | 184 | current->state = (state_value) |
| 185 | |||
| 186 | #define set_current_state(state_value) \ | ||
| 187 | smp_store_mb(current->state, (state_value)) | ||
| 188 | |||
| 189 | /* | ||
| 190 | * set_special_state() should be used for those states when the blocking task | ||
| 191 | * can not use the regular condition based wait-loop. In that case we must | ||
| 192 | * serialize against wakeups such that any possible in-flight TASK_RUNNING stores | ||
| 193 | * will not collide with our state change. | ||
| 194 | */ | ||
| 195 | #define set_special_state(state_value) \ | ||
| 196 | do { \ | ||
| 197 | unsigned long flags; /* may shadow */ \ | ||
| 198 | raw_spin_lock_irqsave(¤t->pi_lock, flags); \ | ||
| 199 | current->state = (state_value); \ | ||
| 200 | raw_spin_unlock_irqrestore(¤t->pi_lock, flags); \ | ||
| 201 | } while (0) | ||
| 202 | |||
| 163 | #endif | 203 | #endif |
| 164 | 204 | ||
| 165 | /* Task command name length: */ | 205 | /* Task command name length: */ |
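
For context, the sched.h comment left intact above describes the wait-loop pattern that the non-special states rely on. A rough sketch of that pattern, using the comment's own illustrative names need_sleep and p (not real kernel symbols):

	/*
	 * Sleep side: a lost TASK_RUNNING or TASK_UNINTERRUPTIBLE store is
	 * harmless here, because every loop iteration re-checks the condition
	 * and re-sets the task state before deciding to schedule().
	 */
	for (;;) {
		set_current_state(TASK_UNINTERRUPTIBLE);
		if (!need_sleep)
			break;
		schedule();
	}
	__set_current_state(TASK_RUNNING);

	/*
	 * Wake side: wake_up_state() implies enough barriers to order the
	 * condition store against the wakeup.
	 */
	need_sleep = false;
	wake_up_state(p, TASK_UNINTERRUPTIBLE);

TASK_STOPPED, TASK_TRACED and TASK_DEAD have no such retry loop around the state store, which is why the new set_special_state() serializes the store with ->pi_lock instead of relying on a re-check.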
diff --git a/include/linux/sched/signal.h b/include/linux/sched/signal.h
index a7ce74c74e49..113d1ad1ced7 100644
--- a/include/linux/sched/signal.h
+++ b/include/linux/sched/signal.h
@@ -280,7 +280,7 @@ static inline void kernel_signal_stop(void)
 {
 	spin_lock_irq(&current->sighand->siglock);
 	if (current->jobctl & JOBCTL_STOP_DEQUEUED)
-		__set_current_state(TASK_STOPPED);
+		set_special_state(TASK_STOPPED);
 	spin_unlock_irq(&current->sighand->siglock);
 
 	schedule();
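
With CONFIG_DEBUG_ATOMIC_SLEEP disabled, the changed line makes kernel_signal_stop() behave roughly like the sketch below (the new macro expanded by hand, not literal preprocessor output):

	static inline void kernel_signal_stop(void)
	{
		spin_lock_irq(&current->sighand->siglock);
		if (current->jobctl & JOBCTL_STOP_DEQUEUED) {
			unsigned long flags;

			/*
			 * set_special_state(TASK_STOPPED) expanded: the
			 * ->pi_lock section keeps an in-flight TASK_RUNNING
			 * store from a wakeup from landing on top of
			 * TASK_STOPPED, which a plain __set_current_state()
			 * could not prevent.
			 */
			raw_spin_lock_irqsave(&current->pi_lock, flags);
			current->state = TASK_STOPPED;
			raw_spin_unlock_irqrestore(&current->pi_lock, flags);
		}
		spin_unlock_irq(&current->sighand->siglock);

		schedule();
	}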
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7ad60e00a6a8..ffde9eebc846 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3508,23 +3508,8 @@ static void __sched notrace __schedule(bool preempt)
 
 void __noreturn do_task_dead(void)
 {
-	/*
-	 * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
-	 * when the following two conditions become true.
-	 *   - There is race condition of mmap_sem (It is acquired by
-	 *     exit_mm()), and
-	 *   - SMI occurs before setting TASK_RUNINNG.
-	 *     (or hypervisor of virtual machine switches to other guest)
-	 *  As a result, we may become TASK_RUNNING after becoming TASK_DEAD
-	 *
-	 * To avoid it, we have to wait for releasing tsk->pi_lock which
-	 * is held by try_to_wake_up()
-	 */
-	raw_spin_lock_irq(&current->pi_lock);
-	raw_spin_unlock_irq(&current->pi_lock);
-
 	/* Causes final put_task_struct in finish_task_switch(): */
-	__set_current_state(TASK_DEAD);
+	set_special_state(TASK_DEAD);
 
 	/* Tell freezer to ignore us: */
 	current->flags |= PF_NOFREEZE;
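
The open-coded ->pi_lock acquire/release that the deleted comment justified is now folded into set_special_state(). A sketch of the state-setting step of do_task_dead() after this change, with the macro again expanded by hand for the !CONFIG_DEBUG_ATOMIC_SLEEP case:

	void __noreturn do_task_dead(void)
	{
		unsigned long flags;

		/*
		 * Causes final put_task_struct in finish_task_switch().
		 *
		 * Holding ->pi_lock across the store both waits out a
		 * try_to_wake_up() that is still in flight for this task
		 * (the job of the removed lock/unlock pair) and prevents its
		 * delayed TASK_RUNNING store from overwriting TASK_DEAD.
		 */
		raw_spin_lock_irqsave(&current->pi_lock, flags);
		current->state = TASK_DEAD;
		raw_spin_unlock_irqrestore(&current->pi_lock, flags);

		/* Tell freezer to ignore us: */
		current->flags |= PF_NOFREEZE;

		/* The remainder of do_task_dead() is untouched by this patch. */
	}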
diff --git a/kernel/signal.c b/kernel/signal.c
index d4ccea599692..9c33163a6165 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1961,14 +1961,27 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info)
 		return;
 	}
 
+	set_special_state(TASK_TRACED);
+
 	/*
 	 * We're committing to trapping. TRACED should be visible before
 	 * TRAPPING is cleared; otherwise, the tracer might fail do_wait().
 	 * Also, transition to TRACED and updates to ->jobctl should be
 	 * atomic with respect to siglock and should be done after the arch
 	 * hook as siglock is released and regrabbed across it.
+	 *
+	 *     TRACER				    TRACEE
+	 *
+	 *     ptrace_attach()
+	 * [L]   wait_on_bit(JOBCTL_TRAPPING)	[S] set_special_state(TRACED)
+	 *     do_wait()
+	 *       set_current_state()                smp_wmb();
+	 *       ptrace_do_wait()
+	 *         wait_task_stopped()
+	 *           task_stopped_code()
+	 * [L]         task_is_traced()		[S] task_clear_jobctl_trapping();
 	 */
-	set_current_state(TASK_TRACED);
+	smp_wmb();
 
 	current->last_siginfo = info;
 	current->exit_code = exit_code;
@@ -2176,7 +2189,7 @@ static bool do_signal_stop(int signr)
 		if (task_participate_group_stop(current))
 			notify = CLD_STOPPED;
 
-		__set_current_state(TASK_STOPPED);
+		set_special_state(TASK_STOPPED);
 		spin_unlock_irq(&current->sighand->siglock);
 
 		/*
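
The TRACER/TRACEE diagram added to ptrace_stop() reduces to a store-store versus load-load pairing. Abstracted into plain barrier primitives (the variable names below are illustrative only; in the kernel the tracer path runs through wait_on_bit() and do_wait() as the diagram shows), the requirement is:

	/* Tracee: publish TASK_TRACED before JOBCTL_TRAPPING is cleared. */
	WRITE_ONCE(tracee_state, TASK_TRACED);	/* [S] set_special_state(TRACED)      */
	smp_wmb();				/* order the state store vs. the flag */
	WRITE_ONCE(jobctl_trapping, 0);		/* [S] task_clear_jobctl_trapping()   */

	/*
	 * Tracer: once TRAPPING is observed clear, TRACED must be visible
	 * too, otherwise do_wait() -> wait_task_stopped() fails to report
	 * the trap.
	 */
	while (READ_ONCE(jobctl_trapping))	/* [L] wait_on_bit(JOBCTL_TRAPPING)   */
		cpu_relax();
	smp_rmb();				/* pairs with the tracee's smp_wmb()  */
	WARN_ON(READ_ONCE(tracee_state) != TASK_TRACED);	/* [L] task_is_traced() */

Moving the TASK_TRACED store up to set_special_state() and leaving an explicit smp_wmb() in its old place preserves the store ordering that the old set_current_state()'s smp_store_mb() used to provide against the later ->jobctl updates.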
