author     Mathieu Desnoyers <mathieu.desnoyers@efficios.com>   2019-09-19 13:37:02 -0400
committer  Ingo Molnar <mingo@kernel.org>                       2019-09-25 11:42:30 -0400
commit     227a4aadc75ba22fcb6c4e1c078817b8cbaae4ce
tree       38eb00e930d17973a6f2e751e6ff87cac3acb6c5 /kernel/sched
parent     2840cf02fae627860156737e83326df354ee4ec6
sched/membarrier: Fix p->mm->membarrier_state racy load
The membarrier_state field is located within the mm_struct, which
is not guaranteed to exist when used from runqueue-lock-free iteration
on runqueues by the membarrier system call.

Copy the membarrier_state from the mm_struct into the scheduler runqueue
when the scheduler switches between mm.

When registering membarrier for mm, after setting the registration bit
in the mm membarrier state, issue a synchronize_rcu() to ensure the
scheduler observes the change. In order to take care of the case where a
runqueue keeps executing the target mm without swapping to other mm,
iterate over each runqueue and issue an IPI to copy the membarrier_state
from the mm_struct into each runqueue which have the same mm which state
has just been modified.

Move the mm membarrier_state field closer to pgd in mm_struct to use a
cache line already touched by the scheduler switch_mm.

The membarrier_execve() (now membarrier_exec_mmap) hook now needs to
clear the runqueue's membarrier state in addition to clear the mm
membarrier state, so move its implementation into the scheduler
membarrier code so it can access the runqueue structure.

Add memory barrier in membarrier_exec_mmap() prior to clearing the
membarrier state, ensuring memory accesses executed prior to exec are
not reordered with the stores clearing the membarrier state.

As suggested by Linus, move all membarrier.c RCU read-side locks outside
of the for each cpu loops.

Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Chris Metcalf <cmetcalf@ezchip.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Eric W. Biederman <ebiederm@xmission.com>
Cc: Kirill Tkhai <tkhai@yandex.ru>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Paul E. McKenney <paulmck@linux.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Russell King - ARM Linux admin <linux@armlinux.org.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: https://lkml.kernel.org/r/20190919173705.2181-5-mathieu.desnoyers@efficios.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel/sched')
-rw-r--r--   kernel/sched/core.c         |   4
-rw-r--r--   kernel/sched/membarrier.c   | 175
-rw-r--r--   kernel/sched/sched.h        |  34
3 files changed, 168 insertions(+), 45 deletions(-)
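The following is a condensed, userspace-compilable model of the idea this patch implements: keep a per-runqueue snapshot of the mm's membarrier state so the expedited membarrier paths never have to dereference another task's mm. It is an illustrative sketch only: types are stubbed out, C11 atomics stand in for the kernel's atomic_t, and the names merely mirror the patch. The authoritative code is in the hunks below.

#include <stdatomic.h>
#include <stdio.h>

/* Stubbed stand-ins for the kernel structures touched by this patch. */
struct mm_struct { atomic_int membarrier_state; };
struct rq { int membarrier_state; };	/* one instance per CPU */

/*
 * Model of the context-switch hook: when switching to a different mm,
 * snapshot that mm's membarrier state into the runqueue, so readers can
 * look at rq->membarrier_state instead of p->mm->membarrier_state.
 */
static void model_membarrier_switch_mm(struct rq *rq,
				       struct mm_struct *prev_mm,
				       struct mm_struct *next_mm)
{
	int state;

	if (prev_mm == next_mm)
		return;
	state = atomic_load(&next_mm->membarrier_state);
	if (rq->membarrier_state == state)
		return;
	rq->membarrier_state = state;
}

int main(void)
{
	struct mm_struct prev = { .membarrier_state = 0 };
	struct mm_struct next = { .membarrier_state = 1 };
	struct rq rq = { .membarrier_state = 0 };

	model_membarrier_switch_mm(&rq, &prev, &next);
	printf("rq->membarrier_state = %d\n", rq.membarrier_state);
	return 0;
}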
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 84c71160beb1..2d9a3947bef4 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3358,15 +3358,15 @@ context_switch(struct rq *rq, struct task_struct *prev,
 		else
 			prev->active_mm = NULL;
 	} else {				// to user
+		membarrier_switch_mm(rq, prev->active_mm, next->mm);
 		/*
 		 * sys_membarrier() requires an smp_mb() between setting
-		 * rq->curr and returning to userspace.
+		 * rq->curr / membarrier_switch_mm() and returning to userspace.
 		 *
 		 * The below provides this either through switch_mm(), or in
 		 * case 'prev->active_mm == next->mm' through
 		 * finish_task_switch()'s mmdrop().
 		 */
-
 		switch_mm_irqs_off(prev->active_mm, next->mm, next);
 
 		if (!prev->mm) {			// from kernel
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index 7ccbd0e19626..070cf433bb9a 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -30,6 +30,39 @@ static void ipi_mb(void *info)
 	smp_mb();	/* IPIs should be serializing but paranoid. */
 }
 
+static void ipi_sync_rq_state(void *info)
+{
+	struct mm_struct *mm = (struct mm_struct *) info;
+
+	if (current->mm != mm)
+		return;
+	this_cpu_write(runqueues.membarrier_state,
+		       atomic_read(&mm->membarrier_state));
+	/*
+	 * Issue a memory barrier after setting
+	 * MEMBARRIER_STATE_GLOBAL_EXPEDITED in the current runqueue to
+	 * guarantee that no memory access following registration is reordered
+	 * before registration.
+	 */
+	smp_mb();
+}
+
+void membarrier_exec_mmap(struct mm_struct *mm)
+{
+	/*
+	 * Issue a memory barrier before clearing membarrier_state to
+	 * guarantee that no memory access prior to exec is reordered after
+	 * clearing this state.
+	 */
+	smp_mb();
+	atomic_set(&mm->membarrier_state, 0);
+	/*
+	 * Keep the runqueue membarrier_state in sync with this mm
+	 * membarrier_state.
+	 */
+	this_cpu_write(runqueues.membarrier_state, 0);
+}
+
 static int membarrier_global_expedited(void)
 {
 	int cpu;
@@ -56,6 +89,7 @@ static int membarrier_global_expedited(void)
 	}
 
 	cpus_read_lock();
+	rcu_read_lock();
 	for_each_online_cpu(cpu) {
 		struct task_struct *p;
 
@@ -70,17 +104,25 @@ static int membarrier_global_expedited(void)
 		if (cpu == raw_smp_processor_id())
 			continue;
 
-		rcu_read_lock();
+		if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) &
+		    MEMBARRIER_STATE_GLOBAL_EXPEDITED))
+			continue;
+
+		/*
+		 * Skip the CPU if it runs a kernel thread. The scheduler
+		 * leaves the prior task mm in place as an optimization when
+		 * scheduling a kthread.
+		 */
 		p = rcu_dereference(cpu_rq(cpu)->curr);
-		if (p && p->mm && (atomic_read(&p->mm->membarrier_state) &
-				   MEMBARRIER_STATE_GLOBAL_EXPEDITED)) {
-			if (!fallback)
-				__cpumask_set_cpu(cpu, tmpmask);
-			else
-				smp_call_function_single(cpu, ipi_mb, NULL, 1);
-		}
-		rcu_read_unlock();
+		if (p->flags & PF_KTHREAD)
+			continue;
+
+		if (!fallback)
+			__cpumask_set_cpu(cpu, tmpmask);
+		else
+			smp_call_function_single(cpu, ipi_mb, NULL, 1);
 	}
+	rcu_read_unlock();
 	if (!fallback) {
 		preempt_disable();
 		smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
@@ -136,6 +178,7 @@ static int membarrier_private_expedited(int flags)
 	}
 
 	cpus_read_lock();
+	rcu_read_lock();
 	for_each_online_cpu(cpu) {
 		struct task_struct *p;
 
@@ -157,8 +200,8 @@ static int membarrier_private_expedited(int flags)
 			else
 				smp_call_function_single(cpu, ipi_mb, NULL, 1);
 		}
-		rcu_read_unlock();
 	}
+	rcu_read_unlock();
 	if (!fallback) {
 		preempt_disable();
 		smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
@@ -177,32 +220,78 @@ static int membarrier_private_expedited(int flags)
 	return 0;
 }
 
+static int sync_runqueues_membarrier_state(struct mm_struct *mm)
+{
+	int membarrier_state = atomic_read(&mm->membarrier_state);
+	cpumask_var_t tmpmask;
+	int cpu;
+
+	if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) {
+		this_cpu_write(runqueues.membarrier_state, membarrier_state);
+
+		/*
+		 * For single mm user, we can simply issue a memory barrier
+		 * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the
+		 * mm and in the current runqueue to guarantee that no memory
+		 * access following registration is reordered before
+		 * registration.
+		 */
+		smp_mb();
+		return 0;
+	}
+
+	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+		return -ENOMEM;
+
+	/*
+	 * For mm with multiple users, we need to ensure all future
+	 * scheduler executions will observe @mm's new membarrier
+	 * state.
+	 */
+	synchronize_rcu();
+
+	/*
+	 * For each cpu runqueue, if the task's mm match @mm, ensure that all
+	 * @mm's membarrier state set bits are also set in in the runqueue's
+	 * membarrier state. This ensures that a runqueue scheduling
+	 * between threads which are users of @mm has its membarrier state
+	 * updated.
+	 */
+	cpus_read_lock();
+	rcu_read_lock();
+	for_each_online_cpu(cpu) {
+		struct rq *rq = cpu_rq(cpu);
+		struct task_struct *p;
+
+		p = rcu_dereference(&rq->curr);
+		if (p && p->mm == mm)
+			__cpumask_set_cpu(cpu, tmpmask);
+	}
+	rcu_read_unlock();
+
+	preempt_disable();
+	smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1);
+	preempt_enable();
+
+	free_cpumask_var(tmpmask);
+	cpus_read_unlock();
+
+	return 0;
+}
+
 static int membarrier_register_global_expedited(void)
 {
 	struct task_struct *p = current;
 	struct mm_struct *mm = p->mm;
+	int ret;
 
 	if (atomic_read(&mm->membarrier_state) &
 	    MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY)
 		return 0;
 	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state);
-	if (atomic_read(&mm->mm_users) == 1) {
-		/*
-		 * For single mm user, single threaded process, we can
-		 * simply issue a memory barrier after setting
-		 * MEMBARRIER_STATE_GLOBAL_EXPEDITED to guarantee that
-		 * no memory access following registration is reordered
-		 * before registration.
-		 */
-		smp_mb();
-	} else {
-		/*
-		 * For multi-mm user threads, we need to ensure all
-		 * future scheduler executions will observe the new
-		 * thread flag state for this mm.
-		 */
-		synchronize_rcu();
-	}
+	ret = sync_runqueues_membarrier_state(mm);
+	if (ret)
+		return ret;
 	atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY,
 		  &mm->membarrier_state);
 
@@ -213,12 +302,15 @@ static int membarrier_register_private_expedited(int flags)
 {
 	struct task_struct *p = current;
 	struct mm_struct *mm = p->mm;
-	int state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY;
+	int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY,
+	    set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED,
+	    ret;
 
 	if (flags & MEMBARRIER_FLAG_SYNC_CORE) {
 		if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE))
 			return -EINVAL;
-		state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
+		ready_state =
+			MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY;
 	}
 
 	/*
@@ -226,20 +318,15 @@ static int membarrier_register_private_expedited(int flags)
 	 * groups, which use the same mm. (CLONE_VM but not
 	 * CLONE_THREAD).
 	 */
-	if ((atomic_read(&mm->membarrier_state) & state) == state)
+	if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state)
 		return 0;
-	atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state);
 	if (flags & MEMBARRIER_FLAG_SYNC_CORE)
-		atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE,
-			  &mm->membarrier_state);
-	if (atomic_read(&mm->mm_users) != 1) {
-		/*
-		 * Ensure all future scheduler executions will observe the
-		 * new thread flag state for this process.
-		 */
-		synchronize_rcu();
-	}
-	atomic_or(state, &mm->membarrier_state);
+		set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE;
+	atomic_or(set_state, &mm->membarrier_state);
+	ret = sync_runqueues_membarrier_state(mm);
+	if (ret)
+		return ret;
+	atomic_or(ready_state, &mm->membarrier_state);
 
 	return 0;
 }
@@ -253,8 +340,10 @@ static int membarrier_register_private_expedited(int flags)
  * command specified does not exist, not available on the running
  * kernel, or if the command argument is invalid, this system call
  * returns -EINVAL. For a given command, with flags argument set to 0,
- * this system call is guaranteed to always return the same value until
- * reboot.
+ * if this system call returns -ENOSYS or -EINVAL, it is guaranteed to
+ * always return the same value until reboot. In addition, it can return
+ * -ENOMEM if there is not enough memory available to perform the system
+ * call.
  *
  * All memory accesses performed in program order from each targeted thread
  * is guaranteed to be ordered with respect to sys_membarrier(). If we use
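A hypothetical userspace sketch (not part of this commit) that exercises the semantics documented in the comment above: register for private expedited membarrier, then issue one, reporting errno on failure, which per the updated wording may now include ENOMEM. It assumes kernel headers that provide __NR_membarrier and the MEMBARRIER_CMD_* constants.

#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

/* glibc provides no wrapper for membarrier(2); call it via syscall(2). */
static int sys_membarrier(int cmd, int flags)
{
	return syscall(__NR_membarrier, cmd, flags);
}

int main(void)
{
	/* Registration may fail with ENOMEM (cpumask allocation). */
	if (sys_membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0)) {
		perror("MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED");
		return 1;
	}
	/* Expedited barrier across all running threads of this process. */
	if (sys_membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0)) {
		perror("MEMBARRIER_CMD_PRIVATE_EXPEDITED");
		return 1;
	}
	printf("private expedited membarrier succeeded\n");
	return 0;
}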
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b3cb895d14a2..0db2c1b3361e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -911,6 +911,10 @@ struct rq {
 
 	atomic_t		nr_iowait;
 
+#ifdef CONFIG_MEMBARRIER
+	int membarrier_state;
+#endif
+
 #ifdef CONFIG_SMP
 	struct root_domain	*rd;
 	struct sched_domain __rcu	*sd;
@@ -2438,3 +2442,33 @@ static inline bool sched_energy_enabled(void)
 static inline bool sched_energy_enabled(void) { return false; }
 
 #endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */
+
+#ifdef CONFIG_MEMBARRIER
+/*
+ * The scheduler provides memory barriers required by membarrier between:
+ * - prior user-space memory accesses and store to rq->membarrier_state,
+ * - store to rq->membarrier_state and following user-space memory accesses.
+ * In the same way it provides those guarantees around store to rq->curr.
+ */
+static inline void membarrier_switch_mm(struct rq *rq,
+					struct mm_struct *prev_mm,
+					struct mm_struct *next_mm)
+{
+	int membarrier_state;
+
+	if (prev_mm == next_mm)
+		return;
+
+	membarrier_state = atomic_read(&next_mm->membarrier_state);
+	if (READ_ONCE(rq->membarrier_state) == membarrier_state)
+		return;
+
+	WRITE_ONCE(rq->membarrier_state, membarrier_state);
+}
+#else
+static inline void membarrier_switch_mm(struct rq *rq,
+					struct mm_struct *prev_mm,
+					struct mm_struct *next_mm)
+{
+}
+#endif