author     Linus Torvalds <torvalds@linux-foundation.org>    2019-09-28 15:39:07 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>    2019-09-28 15:39:07 -0400
commit     9c5efe9ae7df78600c0ee7bcce27516eb687fa6e (patch)
tree       158cfb9720d876e68a14a4cccaffeb58fb7baac5
parent     aefcf2f4b58155d27340ba5f9ddbe9513da8286d (diff)
parent     4892f51ad54ddff2883a60b6ad4323c1f632a9d6 (diff)
Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler fixes from Ingo Molnar:
- Apply a number of membarrier-related fixes and cleanups, which fix a
  use-after-free race in the membarrier code (an illustrative userspace
  sketch follows the diffstat below)
- Introduce proper RCU protection for tasks on the runqueue, to get rid
  of the subtle task_rcu_dereference() interface that was easy to get
  wrong (the new access pattern is sketched right after this list)
- Misc fixes, but also an EAS speedup
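As a rough illustration of the second item: with this series, rq->curr is assigned with RCU semantics (RCU_INIT_POINTER()/rcu_assign_pointer()) and task_struct freeing is deferred through the new rcu_users count, so a reader can use a plain rcu_dereference() under rcu_read_lock(). The fragment below is a condensed sketch of that pattern as it appears in the task_numa_compare() and membarrier changes further down; the helper name is invented for illustration, and it depends on the kernel-internal cpu_rq() accessor, so it is not a standalone program.

/*
 * Illustrative sketch (invented helper name): check whether the task
 * currently running on @cpu uses @mm, relying only on an RCU read-side
 * critical section instead of the removed task_rcu_dereference().
 */
static bool cpu_curr_uses_mm(int cpu, struct mm_struct *mm)
{
	struct task_struct *p;
	bool ret;

	rcu_read_lock();
	p = rcu_dereference(cpu_rq(cpu)->curr);
	ret = p && p->mm == mm;
	rcu_read_unlock();

	return ret;
}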
* 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
sched/fair: Avoid redundant EAS calculation
sched/core: Remove double update_max_interval() call on CPU startup
sched/core: Fix preempt_schedule() interrupt return comment
sched/fair: Fix -Wunused-but-set-variable warnings
sched/core: Fix migration to invalid CPU in __set_cpus_allowed_ptr()
sched/membarrier: Return -ENOMEM to userspace on memory allocation failure
sched/membarrier: Skip IPIs when mm->mm_users == 1
selftests, sched/membarrier: Add multi-threaded test
sched/membarrier: Fix p->mm->membarrier_state racy load
sched/membarrier: Call sync_core only before usermode for same mm
sched/membarrier: Remove redundant check
sched/membarrier: Fix private expedited registration check
tasks, sched/core: RCUify the assignment of rq->curr
tasks, sched/core: With a grace period after finish_task_switch(), remove unnecessary code
tasks, sched/core: Ensure tasks are available for a grace period after leaving the runqueue
tasks: Add a count of task RCU users
sched/core: Convert vcpu_is_preempted() from macro to an inline function
sched/fair: Remove unused cfs_rq_clock_task() function
-rw-r--r--  fs/exec.c                                                           |   2
-rw-r--r--  include/linux/mm_types.h                                            |  14
-rw-r--r--  include/linux/rcuwait.h                                             |  20
-rw-r--r--  include/linux/sched.h                                               |  10
-rw-r--r--  include/linux/sched/mm.h                                            |  10
-rw-r--r--  include/linux/sched/task.h                                          |   2
-rw-r--r--  kernel/exit.c                                                       |  74
-rw-r--r--  kernel/fork.c                                                       |   8
-rw-r--r--  kernel/sched/core.c                                                 |  28
-rw-r--r--  kernel/sched/fair.c                                                 |  39
-rw-r--r--  kernel/sched/membarrier.c                                           | 239
-rw-r--r--  kernel/sched/sched.h                                                |  34
-rw-r--r--  tools/testing/selftests/membarrier/.gitignore                       |   3
-rw-r--r--  tools/testing/selftests/membarrier/Makefile                         |   5
-rw-r--r--  tools/testing/selftests/membarrier/membarrier_test_impl.h (renamed from tools/testing/selftests/membarrier/membarrier_test.c) |  40
-rw-r--r--  tools/testing/selftests/membarrier/membarrier_test_multi_thread.c  |  73
-rw-r--r--  tools/testing/selftests/membarrier/membarrier_test_single_thread.c |  24
17 files changed, 375 insertions, 250 deletions
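Most of the churn in the diffstat above is in kernel/sched/membarrier.c, which now allocates its temporary cpumask with GFP_KERNEL and returns -ENOMEM to userspace instead of falling back to per-CPU IPIs on allocation failure. As referenced in the first item of the pull message, here is a minimal, self-contained userspace sketch (not part of the patch, and distinct from the kselftests added at the end): it queries support, registers, and then issues a private expedited membarrier, treating -ENOMEM as a now-documented possible error.

// Minimal usage sketch, not taken from this patch: exercise
// MEMBARRIER_CMD_PRIVATE_EXPEDITED and surface the -ENOMEM case that the
// series documents for expedited commands.
#define _GNU_SOURCE
#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>
#include <string.h>
#include <errno.h>

static int sys_membarrier(int cmd, int flags)
{
	return syscall(__NR_membarrier, cmd, flags);
}

int main(void)
{
	int cmds = sys_membarrier(MEMBARRIER_CMD_QUERY, 0);

	if (cmds < 0 || !(cmds & MEMBARRIER_CMD_PRIVATE_EXPEDITED)) {
		fprintf(stderr, "private expedited membarrier not supported\n");
		return 1;
	}
	if (sys_membarrier(MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED, 0)) {
		/* Can now fail with ENOMEM if the kernel's cpumask allocation fails. */
		fprintf(stderr, "register: %s\n", strerror(errno));
		return 1;
	}
	if (sys_membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0)) {
		fprintf(stderr, "barrier: %s\n", strerror(errno));
		return 1;
	}
	printf("expedited membarrier executed for all threads of this process\n");
	return 0;
}

The new selftests at the bottom of this patch exercise the same commands from both single-threaded and multi-threaded callers.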
diff --git a/fs/exec.c b/fs/exec.c
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1033,6 +1033,7 @@ static int exec_mmap(struct mm_struct *mm) | |||
1033 | } | 1033 | } |
1034 | task_lock(tsk); | 1034 | task_lock(tsk); |
1035 | active_mm = tsk->active_mm; | 1035 | active_mm = tsk->active_mm; |
1036 | membarrier_exec_mmap(mm); | ||
1036 | tsk->mm = mm; | 1037 | tsk->mm = mm; |
1037 | tsk->active_mm = mm; | 1038 | tsk->active_mm = mm; |
1038 | activate_mm(active_mm, mm); | 1039 | activate_mm(active_mm, mm); |
@@ -1825,7 +1826,6 @@ static int __do_execve_file(int fd, struct filename *filename, | |||
1825 | /* execve succeeded */ | 1826 | /* execve succeeded */ |
1826 | current->fs->in_exec = 0; | 1827 | current->fs->in_exec = 0; |
1827 | current->in_execve = 0; | 1828 | current->in_execve = 0; |
1828 | membarrier_execve(current); | ||
1829 | rseq_execve(current); | 1829 | rseq_execve(current); |
1830 | acct_update_integrals(current); | 1830 | acct_update_integrals(current); |
1831 | task_numa_free(current, false); | 1831 | task_numa_free(current, false); |
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 5183e0d77dfa..2222fa795284 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -383,6 +383,16 @@ struct mm_struct { | |||
383 | unsigned long highest_vm_end; /* highest vma end address */ | 383 | unsigned long highest_vm_end; /* highest vma end address */ |
384 | pgd_t * pgd; | 384 | pgd_t * pgd; |
385 | 385 | ||
386 | #ifdef CONFIG_MEMBARRIER | ||
387 | /** | ||
388 | * @membarrier_state: Flags controlling membarrier behavior. | ||
389 | * | ||
390 | * This field is close to @pgd to hopefully fit in the same | ||
391 | * cache-line, which needs to be touched by switch_mm(). | ||
392 | */ | ||
393 | atomic_t membarrier_state; | ||
394 | #endif | ||
395 | |||
386 | /** | 396 | /** |
387 | * @mm_users: The number of users including userspace. | 397 | * @mm_users: The number of users including userspace. |
388 | * | 398 | * |
@@ -452,9 +462,7 @@ struct mm_struct { | |||
452 | unsigned long flags; /* Must use atomic bitops to access */ | 462 | unsigned long flags; /* Must use atomic bitops to access */ |
453 | 463 | ||
454 | struct core_state *core_state; /* coredumping support */ | 464 | struct core_state *core_state; /* coredumping support */ |
455 | #ifdef CONFIG_MEMBARRIER | 465 | |
456 | atomic_t membarrier_state; | ||
457 | #endif | ||
458 | #ifdef CONFIG_AIO | 466 | #ifdef CONFIG_AIO |
459 | spinlock_t ioctx_lock; | 467 | spinlock_t ioctx_lock; |
460 | struct kioctx_table __rcu *ioctx_table; | 468 | struct kioctx_table __rcu *ioctx_table; |
diff --git a/include/linux/rcuwait.h b/include/linux/rcuwait.h
index 563290fc194f..75c97e4bbc57 100644
--- a/include/linux/rcuwait.h
+++ b/include/linux/rcuwait.h
@@ -6,16 +6,11 @@ | |||
6 | 6 | ||
7 | /* | 7 | /* |
8 | * rcuwait provides a way of blocking and waking up a single | 8 | * rcuwait provides a way of blocking and waking up a single |
9 | * task in an rcu-safe manner; where it is forbidden to use | 9 | * task in an rcu-safe manner. |
10 | * after exit_notify(). task_struct is not properly rcu protected, | ||
11 | * unless dealing with rcu-aware lists, ie: find_task_by_*(). | ||
12 | * | 10 | * |
13 | * Alternatively we have task_rcu_dereference(), but the return | 11 | * The only time @task is non-nil is when a user is blocked (or |
14 | * semantics have different implications which would break the | 12 | * checking if it needs to) on a condition, and reset as soon as we |
15 | * wakeup side. The only time @task is non-nil is when a user is | 13 | * know that the condition has succeeded and are awoken. |
16 | * blocked (or checking if it needs to) on a condition, and reset | ||
17 | * as soon as we know that the condition has succeeded and are | ||
18 | * awoken. | ||
19 | */ | 14 | */ |
20 | struct rcuwait { | 15 | struct rcuwait { |
21 | struct task_struct __rcu *task; | 16 | struct task_struct __rcu *task; |
@@ -37,13 +32,6 @@ extern void rcuwait_wake_up(struct rcuwait *w); | |||
37 | */ | 32 | */ |
38 | #define rcuwait_wait_event(w, condition) \ | 33 | #define rcuwait_wait_event(w, condition) \ |
39 | ({ \ | 34 | ({ \ |
40 | /* \ | ||
41 | * Complain if we are called after do_exit()/exit_notify(), \ | ||
42 | * as we cannot rely on the rcu critical region for the \ | ||
43 | * wakeup side. \ | ||
44 | */ \ | ||
45 | WARN_ON(current->exit_state); \ | ||
46 | \ | ||
47 | rcu_assign_pointer((w)->task, current); \ | 35 | rcu_assign_pointer((w)->task, current); \ |
48 | for (;;) { \ | 36 | for (;;) { \ |
49 | /* \ | 37 | /* \ |
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 70db597d6fd4..2c2e56bd8913 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1130,7 +1130,10 @@ struct task_struct { | |||
1130 | 1130 | ||
1131 | struct tlbflush_unmap_batch tlb_ubc; | 1131 | struct tlbflush_unmap_batch tlb_ubc; |
1132 | 1132 | ||
1133 | struct rcu_head rcu; | 1133 | union { |
1134 | refcount_t rcu_users; | ||
1135 | struct rcu_head rcu; | ||
1136 | }; | ||
1134 | 1137 | ||
1135 | /* Cache last used pipe for splice(): */ | 1138 | /* Cache last used pipe for splice(): */ |
1136 | struct pipe_inode_info *splice_pipe; | 1139 | struct pipe_inode_info *splice_pipe; |
@@ -1839,7 +1842,10 @@ static inline void set_task_cpu(struct task_struct *p, unsigned int cpu) | |||
1839 | * running or not. | 1842 | * running or not. |
1840 | */ | 1843 | */ |
1841 | #ifndef vcpu_is_preempted | 1844 | #ifndef vcpu_is_preempted |
1842 | # define vcpu_is_preempted(cpu) false | 1845 | static inline bool vcpu_is_preempted(int cpu) |
1846 | { | ||
1847 | return false; | ||
1848 | } | ||
1843 | #endif | 1849 | #endif |
1844 | 1850 | ||
1845 | extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask); | 1851 | extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask); |
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index 4a7944078cc3..e6770012db18 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -362,16 +362,16 @@ enum { | |||
362 | 362 | ||
363 | static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) | 363 | static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) |
364 | { | 364 | { |
365 | if (current->mm != mm) | ||
366 | return; | ||
365 | if (likely(!(atomic_read(&mm->membarrier_state) & | 367 | if (likely(!(atomic_read(&mm->membarrier_state) & |
366 | MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE))) | 368 | MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE))) |
367 | return; | 369 | return; |
368 | sync_core_before_usermode(); | 370 | sync_core_before_usermode(); |
369 | } | 371 | } |
370 | 372 | ||
371 | static inline void membarrier_execve(struct task_struct *t) | 373 | extern void membarrier_exec_mmap(struct mm_struct *mm); |
372 | { | 374 | |
373 | atomic_set(&t->mm->membarrier_state, 0); | ||
374 | } | ||
375 | #else | 375 | #else |
376 | #ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS | 376 | #ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS |
377 | static inline void membarrier_arch_switch_mm(struct mm_struct *prev, | 377 | static inline void membarrier_arch_switch_mm(struct mm_struct *prev, |
@@ -380,7 +380,7 @@ static inline void membarrier_arch_switch_mm(struct mm_struct *prev, | |||
380 | { | 380 | { |
381 | } | 381 | } |
382 | #endif | 382 | #endif |
383 | static inline void membarrier_execve(struct task_struct *t) | 383 | static inline void membarrier_exec_mmap(struct mm_struct *mm) |
384 | { | 384 | { |
385 | } | 385 | } |
386 | static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) | 386 | static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) |
diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h
index 3d90ed8f75f0..4b1c3b664f51 100644
--- a/include/linux/sched/task.h
+++ b/include/linux/sched/task.h
@@ -119,7 +119,7 @@ static inline void put_task_struct(struct task_struct *t) | |||
119 | __put_task_struct(t); | 119 | __put_task_struct(t); |
120 | } | 120 | } |
121 | 121 | ||
122 | struct task_struct *task_rcu_dereference(struct task_struct **ptask); | 122 | void put_task_struct_rcu_user(struct task_struct *task); |
123 | 123 | ||
124 | #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT | 124 | #ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT |
125 | extern int arch_task_struct_size __read_mostly; | 125 | extern int arch_task_struct_size __read_mostly; |
diff --git a/kernel/exit.c b/kernel/exit.c
index 22ab6a4bdc51..a46a50d67002 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -182,6 +182,11 @@ static void delayed_put_task_struct(struct rcu_head *rhp) | |||
182 | put_task_struct(tsk); | 182 | put_task_struct(tsk); |
183 | } | 183 | } |
184 | 184 | ||
185 | void put_task_struct_rcu_user(struct task_struct *task) | ||
186 | { | ||
187 | if (refcount_dec_and_test(&task->rcu_users)) | ||
188 | call_rcu(&task->rcu, delayed_put_task_struct); | ||
189 | } | ||
185 | 190 | ||
186 | void release_task(struct task_struct *p) | 191 | void release_task(struct task_struct *p) |
187 | { | 192 | { |
@@ -222,76 +227,13 @@ repeat: | |||
222 | 227 | ||
223 | write_unlock_irq(&tasklist_lock); | 228 | write_unlock_irq(&tasklist_lock); |
224 | release_thread(p); | 229 | release_thread(p); |
225 | call_rcu(&p->rcu, delayed_put_task_struct); | 230 | put_task_struct_rcu_user(p); |
226 | 231 | ||
227 | p = leader; | 232 | p = leader; |
228 | if (unlikely(zap_leader)) | 233 | if (unlikely(zap_leader)) |
229 | goto repeat; | 234 | goto repeat; |
230 | } | 235 | } |
231 | 236 | ||
232 | /* | ||
233 | * Note that if this function returns a valid task_struct pointer (!NULL) | ||
234 | * task->usage must remain >0 for the duration of the RCU critical section. | ||
235 | */ | ||
236 | struct task_struct *task_rcu_dereference(struct task_struct **ptask) | ||
237 | { | ||
238 | struct sighand_struct *sighand; | ||
239 | struct task_struct *task; | ||
240 | |||
241 | /* | ||
242 | * We need to verify that release_task() was not called and thus | ||
243 | * delayed_put_task_struct() can't run and drop the last reference | ||
244 | * before rcu_read_unlock(). We check task->sighand != NULL, | ||
245 | * but we can read the already freed and reused memory. | ||
246 | */ | ||
247 | retry: | ||
248 | task = rcu_dereference(*ptask); | ||
249 | if (!task) | ||
250 | return NULL; | ||
251 | |||
252 | probe_kernel_address(&task->sighand, sighand); | ||
253 | |||
254 | /* | ||
255 | * Pairs with atomic_dec_and_test() in put_task_struct(). If this task | ||
256 | * was already freed we can not miss the preceding update of this | ||
257 | * pointer. | ||
258 | */ | ||
259 | smp_rmb(); | ||
260 | if (unlikely(task != READ_ONCE(*ptask))) | ||
261 | goto retry; | ||
262 | |||
263 | /* | ||
264 | * We've re-checked that "task == *ptask", now we have two different | ||
265 | * cases: | ||
266 | * | ||
267 | * 1. This is actually the same task/task_struct. In this case | ||
268 | * sighand != NULL tells us it is still alive. | ||
269 | * | ||
270 | * 2. This is another task which got the same memory for task_struct. | ||
271 | * We can't know this of course, and we can not trust | ||
272 | * sighand != NULL. | ||
273 | * | ||
274 | * In this case we actually return a random value, but this is | ||
275 | * correct. | ||
276 | * | ||
277 | * If we return NULL - we can pretend that we actually noticed that | ||
278 | * *ptask was updated when the previous task has exited. Or pretend | ||
279 | * that probe_slab_address(&sighand) reads NULL. | ||
280 | * | ||
281 | * If we return the new task (because sighand is not NULL for any | ||
282 | * reason) - this is fine too. This (new) task can't go away before | ||
283 | * another gp pass. | ||
284 | * | ||
285 | * And note: We could even eliminate the false positive if re-read | ||
286 | * task->sighand once again to avoid the falsely NULL. But this case | ||
287 | * is very unlikely so we don't care. | ||
288 | */ | ||
289 | if (!sighand) | ||
290 | return NULL; | ||
291 | |||
292 | return task; | ||
293 | } | ||
294 | |||
295 | void rcuwait_wake_up(struct rcuwait *w) | 237 | void rcuwait_wake_up(struct rcuwait *w) |
296 | { | 238 | { |
297 | struct task_struct *task; | 239 | struct task_struct *task; |
@@ -311,10 +253,6 @@ void rcuwait_wake_up(struct rcuwait *w) | |||
311 | */ | 253 | */ |
312 | smp_mb(); /* (B) */ | 254 | smp_mb(); /* (B) */ |
313 | 255 | ||
314 | /* | ||
315 | * Avoid using task_rcu_dereference() magic as long as we are careful, | ||
316 | * see comment in rcuwait_wait_event() regarding ->exit_state. | ||
317 | */ | ||
318 | task = rcu_dereference(w->task); | 256 | task = rcu_dereference(w->task); |
319 | if (task) | 257 | if (task) |
320 | wake_up_process(task); | 258 | wake_up_process(task); |
diff --git a/kernel/fork.c b/kernel/fork.c
index 60763c043aa3..f9572f416126 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -915,10 +915,12 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) | |||
915 | tsk->cpus_ptr = &tsk->cpus_mask; | 915 | tsk->cpus_ptr = &tsk->cpus_mask; |
916 | 916 | ||
917 | /* | 917 | /* |
918 | * One for us, one for whoever does the "release_task()" (usually | 918 | * One for the user space visible state that goes away when reaped. |
919 | * parent) | 919 | * One for the scheduler. |
920 | */ | 920 | */ |
921 | refcount_set(&tsk->usage, 2); | 921 | refcount_set(&tsk->rcu_users, 2); |
922 | /* One for the rcu users */ | ||
923 | refcount_set(&tsk->usage, 1); | ||
922 | #ifdef CONFIG_BLK_DEV_IO_TRACE | 924 | #ifdef CONFIG_BLK_DEV_IO_TRACE |
923 | tsk->btrace_seq = 0; | 925 | tsk->btrace_seq = 0; |
924 | #endif | 926 | #endif |
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index f9a1346a5fa9..7880f4f64d0e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1656,7 +1656,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, | |||
1656 | if (cpumask_equal(p->cpus_ptr, new_mask)) | 1656 | if (cpumask_equal(p->cpus_ptr, new_mask)) |
1657 | goto out; | 1657 | goto out; |
1658 | 1658 | ||
1659 | if (!cpumask_intersects(new_mask, cpu_valid_mask)) { | 1659 | dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); |
1660 | if (dest_cpu >= nr_cpu_ids) { | ||
1660 | ret = -EINVAL; | 1661 | ret = -EINVAL; |
1661 | goto out; | 1662 | goto out; |
1662 | } | 1663 | } |
@@ -1677,7 +1678,6 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, | |||
1677 | if (cpumask_test_cpu(task_cpu(p), new_mask)) | 1678 | if (cpumask_test_cpu(task_cpu(p), new_mask)) |
1678 | goto out; | 1679 | goto out; |
1679 | 1680 | ||
1680 | dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); | ||
1681 | if (task_running(rq, p) || p->state == TASK_WAKING) { | 1681 | if (task_running(rq, p) || p->state == TASK_WAKING) { |
1682 | struct migration_arg arg = { p, dest_cpu }; | 1682 | struct migration_arg arg = { p, dest_cpu }; |
1683 | /* Need help from migration thread: drop lock and wait. */ | 1683 | /* Need help from migration thread: drop lock and wait. */ |
@@ -3254,7 +3254,7 @@ static struct rq *finish_task_switch(struct task_struct *prev) | |||
3254 | /* Task is done with its stack. */ | 3254 | /* Task is done with its stack. */ |
3255 | put_task_stack(prev); | 3255 | put_task_stack(prev); |
3256 | 3256 | ||
3257 | put_task_struct(prev); | 3257 | put_task_struct_rcu_user(prev); |
3258 | } | 3258 | } |
3259 | 3259 | ||
3260 | tick_nohz_task_switch(); | 3260 | tick_nohz_task_switch(); |
@@ -3358,15 +3358,15 @@ context_switch(struct rq *rq, struct task_struct *prev, | |||
3358 | else | 3358 | else |
3359 | prev->active_mm = NULL; | 3359 | prev->active_mm = NULL; |
3360 | } else { // to user | 3360 | } else { // to user |
3361 | membarrier_switch_mm(rq, prev->active_mm, next->mm); | ||
3361 | /* | 3362 | /* |
3362 | * sys_membarrier() requires an smp_mb() between setting | 3363 | * sys_membarrier() requires an smp_mb() between setting |
3363 | * rq->curr and returning to userspace. | 3364 | * rq->curr / membarrier_switch_mm() and returning to userspace. |
3364 | * | 3365 | * |
3365 | * The below provides this either through switch_mm(), or in | 3366 | * The below provides this either through switch_mm(), or in |
3366 | * case 'prev->active_mm == next->mm' through | 3367 | * case 'prev->active_mm == next->mm' through |
3367 | * finish_task_switch()'s mmdrop(). | 3368 | * finish_task_switch()'s mmdrop(). |
3368 | */ | 3369 | */ |
3369 | |||
3370 | switch_mm_irqs_off(prev->active_mm, next->mm, next); | 3370 | switch_mm_irqs_off(prev->active_mm, next->mm, next); |
3371 | 3371 | ||
3372 | if (!prev->mm) { // from kernel | 3372 | if (!prev->mm) { // from kernel |
@@ -4042,7 +4042,11 @@ static void __sched notrace __schedule(bool preempt) | |||
4042 | 4042 | ||
4043 | if (likely(prev != next)) { | 4043 | if (likely(prev != next)) { |
4044 | rq->nr_switches++; | 4044 | rq->nr_switches++; |
4045 | rq->curr = next; | 4045 | /* |
4046 | * RCU users of rcu_dereference(rq->curr) may not see | ||
4047 | * changes to task_struct made by pick_next_task(). | ||
4048 | */ | ||
4049 | RCU_INIT_POINTER(rq->curr, next); | ||
4046 | /* | 4050 | /* |
4047 | * The membarrier system call requires each architecture | 4051 | * The membarrier system call requires each architecture |
4048 | * to have a full memory barrier after updating | 4052 | * to have a full memory barrier after updating |
@@ -4223,9 +4227,8 @@ static void __sched notrace preempt_schedule_common(void) | |||
4223 | 4227 | ||
4224 | #ifdef CONFIG_PREEMPTION | 4228 | #ifdef CONFIG_PREEMPTION |
4225 | /* | 4229 | /* |
4226 | * this is the entry point to schedule() from in-kernel preemption | 4230 | * This is the entry point to schedule() from in-kernel preemption |
4227 | * off of preempt_enable. Kernel preemptions off return from interrupt | 4231 | * off of preempt_enable. |
4228 | * occur there and call schedule directly. | ||
4229 | */ | 4232 | */ |
4230 | asmlinkage __visible void __sched notrace preempt_schedule(void) | 4233 | asmlinkage __visible void __sched notrace preempt_schedule(void) |
4231 | { | 4234 | { |
@@ -4296,7 +4299,7 @@ EXPORT_SYMBOL_GPL(preempt_schedule_notrace); | |||
4296 | #endif /* CONFIG_PREEMPTION */ | 4299 | #endif /* CONFIG_PREEMPTION */ |
4297 | 4300 | ||
4298 | /* | 4301 | /* |
4299 | * this is the entry point to schedule() from kernel preemption | 4302 | * This is the entry point to schedule() from kernel preemption |
4300 | * off of irq context. | 4303 | * off of irq context. |
4301 | * Note, that this is called and return with irqs disabled. This will | 4304 | * Note, that this is called and return with irqs disabled. This will |
4302 | * protect us against recursive calling from irq. | 4305 | * protect us against recursive calling from irq. |
@@ -6069,7 +6072,8 @@ void init_idle(struct task_struct *idle, int cpu) | |||
6069 | __set_task_cpu(idle, cpu); | 6072 | __set_task_cpu(idle, cpu); |
6070 | rcu_read_unlock(); | 6073 | rcu_read_unlock(); |
6071 | 6074 | ||
6072 | rq->curr = rq->idle = idle; | 6075 | rq->idle = idle; |
6076 | rcu_assign_pointer(rq->curr, idle); | ||
6073 | idle->on_rq = TASK_ON_RQ_QUEUED; | 6077 | idle->on_rq = TASK_ON_RQ_QUEUED; |
6074 | #ifdef CONFIG_SMP | 6078 | #ifdef CONFIG_SMP |
6075 | idle->on_cpu = 1; | 6079 | idle->on_cpu = 1; |
@@ -6430,8 +6434,6 @@ int sched_cpu_activate(unsigned int cpu) | |||
6430 | } | 6434 | } |
6431 | rq_unlock_irqrestore(rq, &rf); | 6435 | rq_unlock_irqrestore(rq, &rf); |
6432 | 6436 | ||
6433 | update_max_interval(); | ||
6434 | |||
6435 | return 0; | 6437 | return 0; |
6436 | } | 6438 | } |
6437 | 6439 | ||
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d4bbf68c3161..83ab35e2374f 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -749,7 +749,6 @@ void init_entity_runnable_average(struct sched_entity *se) | |||
749 | /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ | 749 | /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */ |
750 | } | 750 | } |
751 | 751 | ||
752 | static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); | ||
753 | static void attach_entity_cfs_rq(struct sched_entity *se); | 752 | static void attach_entity_cfs_rq(struct sched_entity *se); |
754 | 753 | ||
755 | /* | 754 | /* |
@@ -1603,7 +1602,7 @@ static void task_numa_compare(struct task_numa_env *env, | |||
1603 | return; | 1602 | return; |
1604 | 1603 | ||
1605 | rcu_read_lock(); | 1604 | rcu_read_lock(); |
1606 | cur = task_rcu_dereference(&dst_rq->curr); | 1605 | cur = rcu_dereference(dst_rq->curr); |
1607 | if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur))) | 1606 | if (cur && ((cur->flags & PF_EXITING) || is_idle_task(cur))) |
1608 | cur = NULL; | 1607 | cur = NULL; |
1609 | 1608 | ||
@@ -4354,21 +4353,16 @@ static inline u64 sched_cfs_bandwidth_slice(void) | |||
4354 | } | 4353 | } |
4355 | 4354 | ||
4356 | /* | 4355 | /* |
4357 | * Replenish runtime according to assigned quota and update expiration time. | 4356 | * Replenish runtime according to assigned quota. We use sched_clock_cpu |
4358 | * We use sched_clock_cpu directly instead of rq->clock to avoid adding | 4357 | * directly instead of rq->clock to avoid adding additional synchronization |
4359 | * additional synchronization around rq->lock. | 4358 | * around rq->lock. |
4360 | * | 4359 | * |
4361 | * requires cfs_b->lock | 4360 | * requires cfs_b->lock |
4362 | */ | 4361 | */ |
4363 | void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) | 4362 | void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b) |
4364 | { | 4363 | { |
4365 | u64 now; | 4364 | if (cfs_b->quota != RUNTIME_INF) |
4366 | 4365 | cfs_b->runtime = cfs_b->quota; | |
4367 | if (cfs_b->quota == RUNTIME_INF) | ||
4368 | return; | ||
4369 | |||
4370 | now = sched_clock_cpu(smp_processor_id()); | ||
4371 | cfs_b->runtime = cfs_b->quota; | ||
4372 | } | 4366 | } |
4373 | 4367 | ||
4374 | static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) | 4368 | static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) |
@@ -4376,15 +4370,6 @@ static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg) | |||
4376 | return &tg->cfs_bandwidth; | 4370 | return &tg->cfs_bandwidth; |
4377 | } | 4371 | } |
4378 | 4372 | ||
4379 | /* rq->task_clock normalized against any time this cfs_rq has spent throttled */ | ||
4380 | static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) | ||
4381 | { | ||
4382 | if (unlikely(cfs_rq->throttle_count)) | ||
4383 | return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time; | ||
4384 | |||
4385 | return rq_clock_task(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time; | ||
4386 | } | ||
4387 | |||
4388 | /* returns 0 on failure to allocate runtime */ | 4373 | /* returns 0 on failure to allocate runtime */ |
4389 | static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) | 4374 | static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq) |
4390 | { | 4375 | { |
@@ -4476,7 +4461,6 @@ static int tg_unthrottle_up(struct task_group *tg, void *data) | |||
4476 | 4461 | ||
4477 | cfs_rq->throttle_count--; | 4462 | cfs_rq->throttle_count--; |
4478 | if (!cfs_rq->throttle_count) { | 4463 | if (!cfs_rq->throttle_count) { |
4479 | /* adjust cfs_rq_clock_task() */ | ||
4480 | cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - | 4464 | cfs_rq->throttled_clock_task_time += rq_clock_task(rq) - |
4481 | cfs_rq->throttled_clock_task; | 4465 | cfs_rq->throttled_clock_task; |
4482 | 4466 | ||
@@ -4994,15 +4978,13 @@ static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) | |||
4994 | 4978 | ||
4995 | void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) | 4979 | void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b) |
4996 | { | 4980 | { |
4997 | u64 overrun; | ||
4998 | |||
4999 | lockdep_assert_held(&cfs_b->lock); | 4981 | lockdep_assert_held(&cfs_b->lock); |
5000 | 4982 | ||
5001 | if (cfs_b->period_active) | 4983 | if (cfs_b->period_active) |
5002 | return; | 4984 | return; |
5003 | 4985 | ||
5004 | cfs_b->period_active = 1; | 4986 | cfs_b->period_active = 1; |
5005 | overrun = hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); | 4987 | hrtimer_forward_now(&cfs_b->period_timer, cfs_b->period); |
5006 | hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED); | 4988 | hrtimer_start_expires(&cfs_b->period_timer, HRTIMER_MODE_ABS_PINNED); |
5007 | } | 4989 | } |
5008 | 4990 | ||
@@ -5080,11 +5062,6 @@ static inline bool cfs_bandwidth_used(void) | |||
5080 | return false; | 5062 | return false; |
5081 | } | 5063 | } |
5082 | 5064 | ||
5083 | static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq) | ||
5084 | { | ||
5085 | return rq_clock_task(rq_of(cfs_rq)); | ||
5086 | } | ||
5087 | |||
5088 | static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} | 5065 | static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) {} |
5089 | static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } | 5066 | static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq) { return false; } |
5090 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} | 5067 | static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {} |
@@ -6412,7 +6389,7 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) | |||
6412 | } | 6389 | } |
6413 | 6390 | ||
6414 | /* Evaluate the energy impact of using this CPU. */ | 6391 | /* Evaluate the energy impact of using this CPU. */ |
6415 | if (max_spare_cap_cpu >= 0) { | 6392 | if (max_spare_cap_cpu >= 0 && max_spare_cap_cpu != prev_cpu) { |
6416 | cur_delta = compute_energy(p, max_spare_cap_cpu, pd); | 6393 | cur_delta = compute_energy(p, max_spare_cap_cpu, pd); |
6417 | cur_delta -= base_energy_pd; | 6394 | cur_delta -= base_energy_pd; |
6418 | if (cur_delta < best_delta) { | 6395 | if (cur_delta < best_delta) { |
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index aa8d75804108..a39bed2c784f 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -30,10 +30,42 @@ static void ipi_mb(void *info) | |||
30 | smp_mb(); /* IPIs should be serializing but paranoid. */ | 30 | smp_mb(); /* IPIs should be serializing but paranoid. */ |
31 | } | 31 | } |
32 | 32 | ||
33 | static void ipi_sync_rq_state(void *info) | ||
34 | { | ||
35 | struct mm_struct *mm = (struct mm_struct *) info; | ||
36 | |||
37 | if (current->mm != mm) | ||
38 | return; | ||
39 | this_cpu_write(runqueues.membarrier_state, | ||
40 | atomic_read(&mm->membarrier_state)); | ||
41 | /* | ||
42 | * Issue a memory barrier after setting | ||
43 | * MEMBARRIER_STATE_GLOBAL_EXPEDITED in the current runqueue to | ||
44 | * guarantee that no memory access following registration is reordered | ||
45 | * before registration. | ||
46 | */ | ||
47 | smp_mb(); | ||
48 | } | ||
49 | |||
50 | void membarrier_exec_mmap(struct mm_struct *mm) | ||
51 | { | ||
52 | /* | ||
53 | * Issue a memory barrier before clearing membarrier_state to | ||
54 | * guarantee that no memory access prior to exec is reordered after | ||
55 | * clearing this state. | ||
56 | */ | ||
57 | smp_mb(); | ||
58 | atomic_set(&mm->membarrier_state, 0); | ||
59 | /* | ||
60 | * Keep the runqueue membarrier_state in sync with this mm | ||
61 | * membarrier_state. | ||
62 | */ | ||
63 | this_cpu_write(runqueues.membarrier_state, 0); | ||
64 | } | ||
65 | |||
33 | static int membarrier_global_expedited(void) | 66 | static int membarrier_global_expedited(void) |
34 | { | 67 | { |
35 | int cpu; | 68 | int cpu; |
36 | bool fallback = false; | ||
37 | cpumask_var_t tmpmask; | 69 | cpumask_var_t tmpmask; |
38 | 70 | ||
39 | if (num_online_cpus() == 1) | 71 | if (num_online_cpus() == 1) |
@@ -45,17 +77,11 @@ static int membarrier_global_expedited(void) | |||
45 | */ | 77 | */ |
46 | smp_mb(); /* system call entry is not a mb. */ | 78 | smp_mb(); /* system call entry is not a mb. */ |
47 | 79 | ||
48 | /* | 80 | if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) |
49 | * Expedited membarrier commands guarantee that they won't | 81 | return -ENOMEM; |
50 | * block, hence the GFP_NOWAIT allocation flag and fallback | ||
51 | * implementation. | ||
52 | */ | ||
53 | if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) { | ||
54 | /* Fallback for OOM. */ | ||
55 | fallback = true; | ||
56 | } | ||
57 | 82 | ||
58 | cpus_read_lock(); | 83 | cpus_read_lock(); |
84 | rcu_read_lock(); | ||
59 | for_each_online_cpu(cpu) { | 85 | for_each_online_cpu(cpu) { |
60 | struct task_struct *p; | 86 | struct task_struct *p; |
61 | 87 | ||
@@ -70,23 +96,28 @@ static int membarrier_global_expedited(void) | |||
70 | if (cpu == raw_smp_processor_id()) | 96 | if (cpu == raw_smp_processor_id()) |
71 | continue; | 97 | continue; |
72 | 98 | ||
73 | rcu_read_lock(); | 99 | if (!(READ_ONCE(cpu_rq(cpu)->membarrier_state) & |
74 | p = task_rcu_dereference(&cpu_rq(cpu)->curr); | 100 | MEMBARRIER_STATE_GLOBAL_EXPEDITED)) |
75 | if (p && p->mm && (atomic_read(&p->mm->membarrier_state) & | 101 | continue; |
76 | MEMBARRIER_STATE_GLOBAL_EXPEDITED)) { | 102 | |
77 | if (!fallback) | 103 | /* |
78 | __cpumask_set_cpu(cpu, tmpmask); | 104 | * Skip the CPU if it runs a kernel thread. The scheduler |
79 | else | 105 | * leaves the prior task mm in place as an optimization when |
80 | smp_call_function_single(cpu, ipi_mb, NULL, 1); | 106 | * scheduling a kthread. |
81 | } | 107 | */ |
82 | rcu_read_unlock(); | 108 | p = rcu_dereference(cpu_rq(cpu)->curr); |
83 | } | 109 | if (p->flags & PF_KTHREAD) |
84 | if (!fallback) { | 110 | continue; |
85 | preempt_disable(); | 111 | |
86 | smp_call_function_many(tmpmask, ipi_mb, NULL, 1); | 112 | __cpumask_set_cpu(cpu, tmpmask); |
87 | preempt_enable(); | ||
88 | free_cpumask_var(tmpmask); | ||
89 | } | 113 | } |
114 | rcu_read_unlock(); | ||
115 | |||
116 | preempt_disable(); | ||
117 | smp_call_function_many(tmpmask, ipi_mb, NULL, 1); | ||
118 | preempt_enable(); | ||
119 | |||
120 | free_cpumask_var(tmpmask); | ||
90 | cpus_read_unlock(); | 121 | cpus_read_unlock(); |
91 | 122 | ||
92 | /* | 123 | /* |
@@ -101,22 +132,22 @@ static int membarrier_global_expedited(void) | |||
101 | static int membarrier_private_expedited(int flags) | 132 | static int membarrier_private_expedited(int flags) |
102 | { | 133 | { |
103 | int cpu; | 134 | int cpu; |
104 | bool fallback = false; | ||
105 | cpumask_var_t tmpmask; | 135 | cpumask_var_t tmpmask; |
136 | struct mm_struct *mm = current->mm; | ||
106 | 137 | ||
107 | if (flags & MEMBARRIER_FLAG_SYNC_CORE) { | 138 | if (flags & MEMBARRIER_FLAG_SYNC_CORE) { |
108 | if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) | 139 | if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) |
109 | return -EINVAL; | 140 | return -EINVAL; |
110 | if (!(atomic_read(¤t->mm->membarrier_state) & | 141 | if (!(atomic_read(&mm->membarrier_state) & |
111 | MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY)) | 142 | MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY)) |
112 | return -EPERM; | 143 | return -EPERM; |
113 | } else { | 144 | } else { |
114 | if (!(atomic_read(¤t->mm->membarrier_state) & | 145 | if (!(atomic_read(&mm->membarrier_state) & |
115 | MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)) | 146 | MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY)) |
116 | return -EPERM; | 147 | return -EPERM; |
117 | } | 148 | } |
118 | 149 | ||
119 | if (num_online_cpus() == 1) | 150 | if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) |
120 | return 0; | 151 | return 0; |
121 | 152 | ||
122 | /* | 153 | /* |
@@ -125,17 +156,11 @@ static int membarrier_private_expedited(int flags) | |||
125 | */ | 156 | */ |
126 | smp_mb(); /* system call entry is not a mb. */ | 157 | smp_mb(); /* system call entry is not a mb. */ |
127 | 158 | ||
128 | /* | 159 | if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) |
129 | * Expedited membarrier commands guarantee that they won't | 160 | return -ENOMEM; |
130 | * block, hence the GFP_NOWAIT allocation flag and fallback | ||
131 | * implementation. | ||
132 | */ | ||
133 | if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) { | ||
134 | /* Fallback for OOM. */ | ||
135 | fallback = true; | ||
136 | } | ||
137 | 161 | ||
138 | cpus_read_lock(); | 162 | cpus_read_lock(); |
163 | rcu_read_lock(); | ||
139 | for_each_online_cpu(cpu) { | 164 | for_each_online_cpu(cpu) { |
140 | struct task_struct *p; | 165 | struct task_struct *p; |
141 | 166 | ||
@@ -150,21 +175,17 @@ static int membarrier_private_expedited(int flags) | |||
150 | if (cpu == raw_smp_processor_id()) | 175 | if (cpu == raw_smp_processor_id()) |
151 | continue; | 176 | continue; |
152 | rcu_read_lock(); | 177 | rcu_read_lock(); |
153 | p = task_rcu_dereference(&cpu_rq(cpu)->curr); | 178 | p = rcu_dereference(cpu_rq(cpu)->curr); |
154 | if (p && p->mm == current->mm) { | 179 | if (p && p->mm == mm) |
155 | if (!fallback) | 180 | __cpumask_set_cpu(cpu, tmpmask); |
156 | __cpumask_set_cpu(cpu, tmpmask); | ||
157 | else | ||
158 | smp_call_function_single(cpu, ipi_mb, NULL, 1); | ||
159 | } | ||
160 | rcu_read_unlock(); | ||
161 | } | ||
162 | if (!fallback) { | ||
163 | preempt_disable(); | ||
164 | smp_call_function_many(tmpmask, ipi_mb, NULL, 1); | ||
165 | preempt_enable(); | ||
166 | free_cpumask_var(tmpmask); | ||
167 | } | 181 | } |
182 | rcu_read_unlock(); | ||
183 | |||
184 | preempt_disable(); | ||
185 | smp_call_function_many(tmpmask, ipi_mb, NULL, 1); | ||
186 | preempt_enable(); | ||
187 | |||
188 | free_cpumask_var(tmpmask); | ||
168 | cpus_read_unlock(); | 189 | cpus_read_unlock(); |
169 | 190 | ||
170 | /* | 191 | /* |
@@ -177,32 +198,78 @@ static int membarrier_private_expedited(int flags) | |||
177 | return 0; | 198 | return 0; |
178 | } | 199 | } |
179 | 200 | ||
201 | static int sync_runqueues_membarrier_state(struct mm_struct *mm) | ||
202 | { | ||
203 | int membarrier_state = atomic_read(&mm->membarrier_state); | ||
204 | cpumask_var_t tmpmask; | ||
205 | int cpu; | ||
206 | |||
207 | if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) { | ||
208 | this_cpu_write(runqueues.membarrier_state, membarrier_state); | ||
209 | |||
210 | /* | ||
211 | * For single mm user, we can simply issue a memory barrier | ||
212 | * after setting MEMBARRIER_STATE_GLOBAL_EXPEDITED in the | ||
213 | * mm and in the current runqueue to guarantee that no memory | ||
214 | * access following registration is reordered before | ||
215 | * registration. | ||
216 | */ | ||
217 | smp_mb(); | ||
218 | return 0; | ||
219 | } | ||
220 | |||
221 | if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) | ||
222 | return -ENOMEM; | ||
223 | |||
224 | /* | ||
225 | * For mm with multiple users, we need to ensure all future | ||
226 | * scheduler executions will observe @mm's new membarrier | ||
227 | * state. | ||
228 | */ | ||
229 | synchronize_rcu(); | ||
230 | |||
231 | /* | ||
232 | * For each cpu runqueue, if the task's mm match @mm, ensure that all | ||
233 | * @mm's membarrier state set bits are also set in the runqueue's | ||
234 | * membarrier state. This ensures that a runqueue scheduling | ||
235 | * between threads which are users of @mm has its membarrier state | ||
236 | * updated. | ||
237 | */ | ||
238 | cpus_read_lock(); | ||
239 | rcu_read_lock(); | ||
240 | for_each_online_cpu(cpu) { | ||
241 | struct rq *rq = cpu_rq(cpu); | ||
242 | struct task_struct *p; | ||
243 | |||
244 | p = rcu_dereference(rq->curr); | ||
245 | if (p && p->mm == mm) | ||
246 | __cpumask_set_cpu(cpu, tmpmask); | ||
247 | } | ||
248 | rcu_read_unlock(); | ||
249 | |||
250 | preempt_disable(); | ||
251 | smp_call_function_many(tmpmask, ipi_sync_rq_state, mm, 1); | ||
252 | preempt_enable(); | ||
253 | |||
254 | free_cpumask_var(tmpmask); | ||
255 | cpus_read_unlock(); | ||
256 | |||
257 | return 0; | ||
258 | } | ||
259 | |||
180 | static int membarrier_register_global_expedited(void) | 260 | static int membarrier_register_global_expedited(void) |
181 | { | 261 | { |
182 | struct task_struct *p = current; | 262 | struct task_struct *p = current; |
183 | struct mm_struct *mm = p->mm; | 263 | struct mm_struct *mm = p->mm; |
264 | int ret; | ||
184 | 265 | ||
185 | if (atomic_read(&mm->membarrier_state) & | 266 | if (atomic_read(&mm->membarrier_state) & |
186 | MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY) | 267 | MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY) |
187 | return 0; | 268 | return 0; |
188 | atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state); | 269 | atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED, &mm->membarrier_state); |
189 | if (atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1) { | 270 | ret = sync_runqueues_membarrier_state(mm); |
190 | /* | 271 | if (ret) |
191 | * For single mm user, single threaded process, we can | 272 | return ret; |
192 | * simply issue a memory barrier after setting | ||
193 | * MEMBARRIER_STATE_GLOBAL_EXPEDITED to guarantee that | ||
194 | * no memory access following registration is reordered | ||
195 | * before registration. | ||
196 | */ | ||
197 | smp_mb(); | ||
198 | } else { | ||
199 | /* | ||
200 | * For multi-mm user threads, we need to ensure all | ||
201 | * future scheduler executions will observe the new | ||
202 | * thread flag state for this mm. | ||
203 | */ | ||
204 | synchronize_rcu(); | ||
205 | } | ||
206 | atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY, | 273 | atomic_or(MEMBARRIER_STATE_GLOBAL_EXPEDITED_READY, |
207 | &mm->membarrier_state); | 274 | &mm->membarrier_state); |
208 | 275 | ||
@@ -213,12 +280,15 @@ static int membarrier_register_private_expedited(int flags) | |||
213 | { | 280 | { |
214 | struct task_struct *p = current; | 281 | struct task_struct *p = current; |
215 | struct mm_struct *mm = p->mm; | 282 | struct mm_struct *mm = p->mm; |
216 | int state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY; | 283 | int ready_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_READY, |
284 | set_state = MEMBARRIER_STATE_PRIVATE_EXPEDITED, | ||
285 | ret; | ||
217 | 286 | ||
218 | if (flags & MEMBARRIER_FLAG_SYNC_CORE) { | 287 | if (flags & MEMBARRIER_FLAG_SYNC_CORE) { |
219 | if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) | 288 | if (!IS_ENABLED(CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE)) |
220 | return -EINVAL; | 289 | return -EINVAL; |
221 | state = MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY; | 290 | ready_state = |
291 | MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY; | ||
222 | } | 292 | } |
223 | 293 | ||
224 | /* | 294 | /* |
@@ -226,20 +296,15 @@ static int membarrier_register_private_expedited(int flags) | |||
226 | * groups, which use the same mm. (CLONE_VM but not | 296 | * groups, which use the same mm. (CLONE_VM but not |
227 | * CLONE_THREAD). | 297 | * CLONE_THREAD). |
228 | */ | 298 | */ |
229 | if (atomic_read(&mm->membarrier_state) & state) | 299 | if ((atomic_read(&mm->membarrier_state) & ready_state) == ready_state) |
230 | return 0; | 300 | return 0; |
231 | atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED, &mm->membarrier_state); | ||
232 | if (flags & MEMBARRIER_FLAG_SYNC_CORE) | 301 | if (flags & MEMBARRIER_FLAG_SYNC_CORE) |
233 | atomic_or(MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE, | 302 | set_state |= MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE; |
234 | &mm->membarrier_state); | 303 | atomic_or(set_state, &mm->membarrier_state); |
235 | if (!(atomic_read(&mm->mm_users) == 1 && get_nr_threads(p) == 1)) { | 304 | ret = sync_runqueues_membarrier_state(mm); |
236 | /* | 305 | if (ret) |
237 | * Ensure all future scheduler executions will observe the | 306 | return ret; |
238 | * new thread flag state for this process. | 307 | atomic_or(ready_state, &mm->membarrier_state); |
239 | */ | ||
240 | synchronize_rcu(); | ||
241 | } | ||
242 | atomic_or(state, &mm->membarrier_state); | ||
243 | 308 | ||
244 | return 0; | 309 | return 0; |
245 | } | 310 | } |
@@ -253,8 +318,10 @@ static int membarrier_register_private_expedited(int flags) | |||
253 | * command specified does not exist, not available on the running | 318 | * command specified does not exist, not available on the running |
254 | * kernel, or if the command argument is invalid, this system call | 319 | * kernel, or if the command argument is invalid, this system call |
255 | * returns -EINVAL. For a given command, with flags argument set to 0, | 320 | * returns -EINVAL. For a given command, with flags argument set to 0, |
256 | * this system call is guaranteed to always return the same value until | 321 | * if this system call returns -ENOSYS or -EINVAL, it is guaranteed to |
257 | * reboot. | 322 | * always return the same value until reboot. In addition, it can return |
323 | * -ENOMEM if there is not enough memory available to perform the system | ||
324 | * call. | ||
258 | * | 325 | * |
259 | * All memory accesses performed in program order from each targeted thread | 326 | * All memory accesses performed in program order from each targeted thread |
260 | * is guaranteed to be ordered with respect to sys_membarrier(). If we use | 327 | * is guaranteed to be ordered with respect to sys_membarrier(). If we use |
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index b3cb895d14a2..0db2c1b3361e 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -911,6 +911,10 @@ struct rq { | |||
911 | 911 | ||
912 | atomic_t nr_iowait; | 912 | atomic_t nr_iowait; |
913 | 913 | ||
914 | #ifdef CONFIG_MEMBARRIER | ||
915 | int membarrier_state; | ||
916 | #endif | ||
917 | |||
914 | #ifdef CONFIG_SMP | 918 | #ifdef CONFIG_SMP |
915 | struct root_domain *rd; | 919 | struct root_domain *rd; |
916 | struct sched_domain __rcu *sd; | 920 | struct sched_domain __rcu *sd; |
@@ -2438,3 +2442,33 @@ static inline bool sched_energy_enabled(void) | |||
2438 | static inline bool sched_energy_enabled(void) { return false; } | 2442 | static inline bool sched_energy_enabled(void) { return false; } |
2439 | 2443 | ||
2440 | #endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ | 2444 | #endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL */ |
2445 | |||
2446 | #ifdef CONFIG_MEMBARRIER | ||
2447 | /* | ||
2448 | * The scheduler provides memory barriers required by membarrier between: | ||
2449 | * - prior user-space memory accesses and store to rq->membarrier_state, | ||
2450 | * - store to rq->membarrier_state and following user-space memory accesses. | ||
2451 | * In the same way it provides those guarantees around store to rq->curr. | ||
2452 | */ | ||
2453 | static inline void membarrier_switch_mm(struct rq *rq, | ||
2454 | struct mm_struct *prev_mm, | ||
2455 | struct mm_struct *next_mm) | ||
2456 | { | ||
2457 | int membarrier_state; | ||
2458 | |||
2459 | if (prev_mm == next_mm) | ||
2460 | return; | ||
2461 | |||
2462 | membarrier_state = atomic_read(&next_mm->membarrier_state); | ||
2463 | if (READ_ONCE(rq->membarrier_state) == membarrier_state) | ||
2464 | return; | ||
2465 | |||
2466 | WRITE_ONCE(rq->membarrier_state, membarrier_state); | ||
2467 | } | ||
2468 | #else | ||
2469 | static inline void membarrier_switch_mm(struct rq *rq, | ||
2470 | struct mm_struct *prev_mm, | ||
2471 | struct mm_struct *next_mm) | ||
2472 | { | ||
2473 | } | ||
2474 | #endif | ||
diff --git a/tools/testing/selftests/membarrier/.gitignore b/tools/testing/selftests/membarrier/.gitignore
index 020c44f49a9e..f2f7ec0a99b4 100644
--- a/tools/testing/selftests/membarrier/.gitignore
+++ b/tools/testing/selftests/membarrier/.gitignore
@@ -1 +1,2 @@ | |||
1 | membarrier_test | 1 | membarrier_test_multi_thread |
2 | membarrier_test_single_thread | ||
diff --git a/tools/testing/selftests/membarrier/Makefile b/tools/testing/selftests/membarrier/Makefile
index 97e3bdf3d1e9..34d1c81a2324 100644
--- a/tools/testing/selftests/membarrier/Makefile
+++ b/tools/testing/selftests/membarrier/Makefile
@@ -1,7 +1,8 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0-only | 1 | # SPDX-License-Identifier: GPL-2.0-only |
2 | CFLAGS += -g -I../../../../usr/include/ | 2 | CFLAGS += -g -I../../../../usr/include/ |
3 | LDLIBS += -lpthread | ||
3 | 4 | ||
4 | TEST_GEN_PROGS := membarrier_test | 5 | TEST_GEN_PROGS := membarrier_test_single_thread \ |
6 | membarrier_test_multi_thread | ||
5 | 7 | ||
6 | include ../lib.mk | 8 | include ../lib.mk |
7 | |||
diff --git a/tools/testing/selftests/membarrier/membarrier_test.c b/tools/testing/selftests/membarrier/membarrier_test_impl.h
index 70b4ddbf126b..186be69f0a59 100644
--- a/tools/testing/selftests/membarrier/membarrier_test.c
+++ b/tools/testing/selftests/membarrier/membarrier_test_impl.h
@@ -1,10 +1,11 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | 1 | /* SPDX-License-Identifier: GPL-2.0 */ |
2 | #define _GNU_SOURCE | 2 | #define _GNU_SOURCE |
3 | #include <linux/membarrier.h> | 3 | #include <linux/membarrier.h> |
4 | #include <syscall.h> | 4 | #include <syscall.h> |
5 | #include <stdio.h> | 5 | #include <stdio.h> |
6 | #include <errno.h> | 6 | #include <errno.h> |
7 | #include <string.h> | 7 | #include <string.h> |
8 | #include <pthread.h> | ||
8 | 9 | ||
9 | #include "../kselftest.h" | 10 | #include "../kselftest.h" |
10 | 11 | ||
@@ -223,7 +224,7 @@ static int test_membarrier_global_expedited_success(void) | |||
223 | return 0; | 224 | return 0; |
224 | } | 225 | } |
225 | 226 | ||
226 | static int test_membarrier(void) | 227 | static int test_membarrier_fail(void) |
227 | { | 228 | { |
228 | int status; | 229 | int status; |
229 | 230 | ||
@@ -233,10 +234,27 @@ static int test_membarrier(void) | |||
233 | status = test_membarrier_flags_fail(); | 234 | status = test_membarrier_flags_fail(); |
234 | if (status) | 235 | if (status) |
235 | return status; | 236 | return status; |
236 | status = test_membarrier_global_success(); | 237 | status = test_membarrier_private_expedited_fail(); |
237 | if (status) | 238 | if (status) |
238 | return status; | 239 | return status; |
239 | status = test_membarrier_private_expedited_fail(); | 240 | status = sys_membarrier(MEMBARRIER_CMD_QUERY, 0); |
241 | if (status < 0) { | ||
242 | ksft_test_result_fail("sys_membarrier() failed\n"); | ||
243 | return status; | ||
244 | } | ||
245 | if (status & MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE) { | ||
246 | status = test_membarrier_private_expedited_sync_core_fail(); | ||
247 | if (status) | ||
248 | return status; | ||
249 | } | ||
250 | return 0; | ||
251 | } | ||
252 | |||
253 | static int test_membarrier_success(void) | ||
254 | { | ||
255 | int status; | ||
256 | |||
257 | status = test_membarrier_global_success(); | ||
240 | if (status) | 258 | if (status) |
241 | return status; | 259 | return status; |
242 | status = test_membarrier_register_private_expedited_success(); | 260 | status = test_membarrier_register_private_expedited_success(); |
@@ -251,9 +269,6 @@ static int test_membarrier(void) | |||
251 | return status; | 269 | return status; |
252 | } | 270 | } |
253 | if (status & MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE) { | 271 | if (status & MEMBARRIER_CMD_PRIVATE_EXPEDITED_SYNC_CORE) { |
254 | status = test_membarrier_private_expedited_sync_core_fail(); | ||
255 | if (status) | ||
256 | return status; | ||
257 | status = test_membarrier_register_private_expedited_sync_core_success(); | 272 | status = test_membarrier_register_private_expedited_sync_core_success(); |
258 | if (status) | 273 | if (status) |
259 | return status; | 274 | return status; |
@@ -300,14 +315,3 @@ static int test_membarrier_query(void) | |||
300 | ksft_test_result_pass("sys_membarrier available\n"); | 315 | ksft_test_result_pass("sys_membarrier available\n"); |
301 | return 0; | 316 | return 0; |
302 | } | 317 | } |
303 | |||
304 | int main(int argc, char **argv) | ||
305 | { | ||
306 | ksft_print_header(); | ||
307 | ksft_set_plan(13); | ||
308 | |||
309 | test_membarrier_query(); | ||
310 | test_membarrier(); | ||
311 | |||
312 | return ksft_exit_pass(); | ||
313 | } | ||
diff --git a/tools/testing/selftests/membarrier/membarrier_test_multi_thread.c b/tools/testing/selftests/membarrier/membarrier_test_multi_thread.c
new file mode 100644
index 000000000000..ac5613e5b0eb
--- /dev/null
+++ b/tools/testing/selftests/membarrier/membarrier_test_multi_thread.c
@@ -0,0 +1,73 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | #define _GNU_SOURCE | ||
3 | #include <linux/membarrier.h> | ||
4 | #include <syscall.h> | ||
5 | #include <stdio.h> | ||
6 | #include <errno.h> | ||
7 | #include <string.h> | ||
8 | #include <pthread.h> | ||
9 | |||
10 | #include "membarrier_test_impl.h" | ||
11 | |||
12 | static int thread_ready, thread_quit; | ||
13 | static pthread_mutex_t test_membarrier_thread_mutex = | ||
14 | PTHREAD_MUTEX_INITIALIZER; | ||
15 | static pthread_cond_t test_membarrier_thread_cond = | ||
16 | PTHREAD_COND_INITIALIZER; | ||
17 | |||
18 | void *test_membarrier_thread(void *arg) | ||
19 | { | ||
20 | pthread_mutex_lock(&test_membarrier_thread_mutex); | ||
21 | thread_ready = 1; | ||
22 | pthread_cond_broadcast(&test_membarrier_thread_cond); | ||
23 | pthread_mutex_unlock(&test_membarrier_thread_mutex); | ||
24 | |||
25 | pthread_mutex_lock(&test_membarrier_thread_mutex); | ||
26 | while (!thread_quit) | ||
27 | pthread_cond_wait(&test_membarrier_thread_cond, | ||
28 | &test_membarrier_thread_mutex); | ||
29 | pthread_mutex_unlock(&test_membarrier_thread_mutex); | ||
30 | |||
31 | return NULL; | ||
32 | } | ||
33 | |||
34 | static int test_mt_membarrier(void) | ||
35 | { | ||
36 | int i; | ||
37 | pthread_t test_thread; | ||
38 | |||
39 | pthread_create(&test_thread, NULL, | ||
40 | test_membarrier_thread, NULL); | ||
41 | |||
42 | pthread_mutex_lock(&test_membarrier_thread_mutex); | ||
43 | while (!thread_ready) | ||
44 | pthread_cond_wait(&test_membarrier_thread_cond, | ||
45 | &test_membarrier_thread_mutex); | ||
46 | pthread_mutex_unlock(&test_membarrier_thread_mutex); | ||
47 | |||
48 | test_membarrier_fail(); | ||
49 | |||
50 | test_membarrier_success(); | ||
51 | |||
52 | pthread_mutex_lock(&test_membarrier_thread_mutex); | ||
53 | thread_quit = 1; | ||
54 | pthread_cond_broadcast(&test_membarrier_thread_cond); | ||
55 | pthread_mutex_unlock(&test_membarrier_thread_mutex); | ||
56 | |||
57 | pthread_join(test_thread, NULL); | ||
58 | |||
59 | return 0; | ||
60 | } | ||
61 | |||
62 | int main(int argc, char **argv) | ||
63 | { | ||
64 | ksft_print_header(); | ||
65 | ksft_set_plan(13); | ||
66 | |||
67 | test_membarrier_query(); | ||
68 | |||
69 | /* Multi-threaded */ | ||
70 | test_mt_membarrier(); | ||
71 | |||
72 | return ksft_exit_pass(); | ||
73 | } | ||
diff --git a/tools/testing/selftests/membarrier/membarrier_test_single_thread.c b/tools/testing/selftests/membarrier/membarrier_test_single_thread.c
new file mode 100644
index 000000000000..c1c963902854
--- /dev/null
+++ b/tools/testing/selftests/membarrier/membarrier_test_single_thread.c
@@ -0,0 +1,24 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0 | ||
2 | #define _GNU_SOURCE | ||
3 | #include <linux/membarrier.h> | ||
4 | #include <syscall.h> | ||
5 | #include <stdio.h> | ||
6 | #include <errno.h> | ||
7 | #include <string.h> | ||
8 | #include <pthread.h> | ||
9 | |||
10 | #include "membarrier_test_impl.h" | ||
11 | |||
12 | int main(int argc, char **argv) | ||
13 | { | ||
14 | ksft_print_header(); | ||
15 | ksft_set_plan(13); | ||
16 | |||
17 | test_membarrier_query(); | ||
18 | |||
19 | test_membarrier_fail(); | ||
20 | |||
21 | test_membarrier_success(); | ||
22 | |||
23 | return ksft_exit_pass(); | ||
24 | } | ||