Diffstat (limited to 'kernel')
-rw-r--r--  kernel/Makefile              |   1
-rw-r--r--  kernel/exit.c                |  10
-rw-r--r--  kernel/locking/qspinlock.c   | 117
-rw-r--r--  kernel/membarrier.c          |  70
-rw-r--r--  kernel/rcu/Kconfig           |   3
-rw-r--r--  kernel/rcu/rcu.h             | 128
-rw-r--r--  kernel/rcu/rcu_segcblist.c   | 108
-rw-r--r--  kernel/rcu/rcu_segcblist.h   |  28
-rw-r--r--  kernel/rcu/rcuperf.c         |  17
-rw-r--r--  kernel/rcu/rcutorture.c      |  83
-rw-r--r--  kernel/rcu/srcutiny.c        |   8
-rw-r--r--  kernel/rcu/srcutree.c        |  50
-rw-r--r--  kernel/rcu/tiny.c            |   2
-rw-r--r--  kernel/rcu/tiny_plugin.h     |  47
-rw-r--r--  kernel/rcu/tree.c            | 174
-rw-r--r--  kernel/rcu/tree.h            |  15
-rw-r--r--  kernel/rcu/tree_exp.h        |   2
-rw-r--r--  kernel/rcu/tree_plugin.h     | 238
-rw-r--r--  kernel/rcu/update.c          |  18
-rw-r--r--  kernel/sched/Makefile        |   1
-rw-r--r--  kernel/sched/completion.c    |  11
-rw-r--r--  kernel/sched/core.c          |  38
-rw-r--r--  kernel/sched/membarrier.c    | 152
-rw-r--r--  kernel/task_work.c           |   8
-rw-r--r--  kernel/torture.c             |   2
25 files changed, 544 insertions, 787 deletions
diff --git a/kernel/Makefile b/kernel/Makefile
index 4cb8e8b23c6e..9c323a6daa46 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -108,7 +108,6 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
108obj-$(CONFIG_JUMP_LABEL) += jump_label.o 108obj-$(CONFIG_JUMP_LABEL) += jump_label.o
109obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o 109obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o
110obj-$(CONFIG_TORTURE_TEST) += torture.o 110obj-$(CONFIG_TORTURE_TEST) += torture.o
111obj-$(CONFIG_MEMBARRIER) += membarrier.o
112 111
113obj-$(CONFIG_HAS_IOMEM) += memremap.o 112obj-$(CONFIG_HAS_IOMEM) += memremap.o
114 113
diff --git a/kernel/exit.c b/kernel/exit.c
index c5548faa9f37..f9ef3ecc78c1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -764,7 +764,6 @@ void __noreturn do_exit(long code)
764{ 764{
765 struct task_struct *tsk = current; 765 struct task_struct *tsk = current;
766 int group_dead; 766 int group_dead;
767 TASKS_RCU(int tasks_rcu_i);
768 767
769 profile_task_exit(tsk); 768 profile_task_exit(tsk);
770 kcov_task_exit(tsk); 769 kcov_task_exit(tsk);
@@ -819,7 +818,8 @@ void __noreturn do_exit(long code)
819 * Ensure that we must observe the pi_state in exit_mm() -> 818 * Ensure that we must observe the pi_state in exit_mm() ->
820 * mm_release() -> exit_pi_state_list(). 819 * mm_release() -> exit_pi_state_list().
821 */ 820 */
822 raw_spin_unlock_wait(&tsk->pi_lock); 821 raw_spin_lock_irq(&tsk->pi_lock);
822 raw_spin_unlock_irq(&tsk->pi_lock);
823 823
824 if (unlikely(in_atomic())) { 824 if (unlikely(in_atomic())) {
825 pr_info("note: %s[%d] exited with preempt_count %d\n", 825 pr_info("note: %s[%d] exited with preempt_count %d\n",
@@ -881,9 +881,7 @@ void __noreturn do_exit(long code)
881 */ 881 */
882 flush_ptrace_hw_breakpoint(tsk); 882 flush_ptrace_hw_breakpoint(tsk);
883 883
884 TASKS_RCU(preempt_disable()); 884 exit_tasks_rcu_start();
885 TASKS_RCU(tasks_rcu_i = __srcu_read_lock(&tasks_rcu_exit_srcu));
886 TASKS_RCU(preempt_enable());
887 exit_notify(tsk, group_dead); 885 exit_notify(tsk, group_dead);
888 proc_exit_connector(tsk); 886 proc_exit_connector(tsk);
889 mpol_put_task_policy(tsk); 887 mpol_put_task_policy(tsk);
@@ -918,7 +916,7 @@ void __noreturn do_exit(long code)
918 if (tsk->nr_dirtied) 916 if (tsk->nr_dirtied)
919 __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied); 917 __this_cpu_add(dirty_throttle_leaks, tsk->nr_dirtied);
920 exit_rcu(); 918 exit_rcu();
921 TASKS_RCU(__srcu_read_unlock(&tasks_rcu_exit_srcu, tasks_rcu_i)); 919 exit_tasks_rcu_finish();
922 920
923 do_task_dead(); 921 do_task_dead();
924} 922}
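
The two new helpers above replace the open-coded TASKS_RCU sequence that used the local tasks_rcu_i index. A minimal sketch of what they can look like (presumably in kernel/rcu/update.c, which this series also touches), assuming the SRCU index is stashed in the task structure; the field name rcu_tasks_idx below is an assumption, not something shown in this diff:

/* Hold off Tasks RCU across the late-exit window; sketch only. */
void exit_tasks_rcu_start(void)
{
        preempt_disable();
        current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu);
        preempt_enable();
}

void exit_tasks_rcu_finish(void)
{
        preempt_disable();
        __srcu_read_unlock(&tasks_rcu_exit_srcu, current->rcu_tasks_idx);
        preempt_enable();
}

For CONFIG_TASKS_RCU=n kernels both would be empty stubs, matching the old no-op TASKS_RCU() macro.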
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index fd24153e8a48..294294c71ba4 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -268,123 +268,6 @@ static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock,
268#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath 268#define queued_spin_lock_slowpath native_queued_spin_lock_slowpath
269#endif 269#endif
270 270
271/*
272 * Various notes on spin_is_locked() and spin_unlock_wait(), which are
273 * 'interesting' functions:
274 *
275 * PROBLEM: some architectures have an interesting issue with atomic ACQUIRE
276 * operations in that the ACQUIRE applies to the LOAD _not_ the STORE (ARM64,
277 * PPC). Also qspinlock has a similar issue per construction, the setting of
278 * the locked byte can be unordered acquiring the lock proper.
279 *
280 * This gets to be 'interesting' in the following cases, where the /should/s
281 * end up false because of this issue.
282 *
283 *
284 * CASE 1:
285 *
286 * So the spin_is_locked() correctness issue comes from something like:
287 *
288 * CPU0 CPU1
289 *
290 * global_lock(); local_lock(i)
291 * spin_lock(&G) spin_lock(&L[i])
292 * for (i) if (!spin_is_locked(&G)) {
293 * spin_unlock_wait(&L[i]); smp_acquire__after_ctrl_dep();
294 * return;
295 * }
296 * // deal with fail
297 *
298 * Where it is important CPU1 sees G locked or CPU0 sees L[i] locked such
299 * that there is exclusion between the two critical sections.
300 *
301 * The load from spin_is_locked(&G) /should/ be constrained by the ACQUIRE from
302 * spin_lock(&L[i]), and similarly the load(s) from spin_unlock_wait(&L[i])
303 * /should/ be constrained by the ACQUIRE from spin_lock(&G).
304 *
305 * Similarly, later stuff is constrained by the ACQUIRE from CTRL+RMB.
306 *
307 *
308 * CASE 2:
309 *
310 * For spin_unlock_wait() there is a second correctness issue, namely:
311 *
312 * CPU0 CPU1
313 *
314 * flag = set;
315 * smp_mb(); spin_lock(&l)
316 * spin_unlock_wait(&l); if (!flag)
317 * // add to lockless list
318 * spin_unlock(&l);
319 * // iterate lockless list
320 *
321 * Which wants to ensure that CPU1 will stop adding bits to the list and CPU0
322 * will observe the last entry on the list (if spin_unlock_wait() had ACQUIRE
323 * semantics etc..)
324 *
325 * Where flag /should/ be ordered against the locked store of l.
326 */
327
328/*
329 * queued_spin_lock_slowpath() can (load-)ACQUIRE the lock before
330 * issuing an _unordered_ store to set _Q_LOCKED_VAL.
331 *
332 * This means that the store can be delayed, but no later than the
333 * store-release from the unlock. This means that simply observing
334 * _Q_LOCKED_VAL is not sufficient to determine if the lock is acquired.
335 *
336 * There are two paths that can issue the unordered store:
337 *
338 * (1) clear_pending_set_locked(): *,1,0 -> *,0,1
339 *
340 * (2) set_locked(): t,0,0 -> t,0,1 ; t != 0
341 * atomic_cmpxchg_relaxed(): t,0,0 -> 0,0,1
342 *
343 * However, in both cases we have other !0 state we've set before to queue
344 * ourseves:
345 *
346 * For (1) we have the atomic_cmpxchg_acquire() that set _Q_PENDING_VAL, our
347 * load is constrained by that ACQUIRE to not pass before that, and thus must
348 * observe the store.
349 *
350 * For (2) we have a more intersting scenario. We enqueue ourselves using
351 * xchg_tail(), which ends up being a RELEASE. This in itself is not
352 * sufficient, however that is followed by an smp_cond_acquire() on the same
353 * word, giving a RELEASE->ACQUIRE ordering. This again constrains our load and
354 * guarantees we must observe that store.
355 *
356 * Therefore both cases have other !0 state that is observable before the
357 * unordered locked byte store comes through. This means we can use that to
358 * wait for the lock store, and then wait for an unlock.
359 */
360#ifndef queued_spin_unlock_wait
361void queued_spin_unlock_wait(struct qspinlock *lock)
362{
363 u32 val;
364
365 for (;;) {
366 val = atomic_read(&lock->val);
367
368 if (!val) /* not locked, we're done */
369 goto done;
370
371 if (val & _Q_LOCKED_MASK) /* locked, go wait for unlock */
372 break;
373
374 /* not locked, but pending, wait until we observe the lock */
375 cpu_relax();
376 }
377
378 /* any unlock is good */
379 while (atomic_read(&lock->val) & _Q_LOCKED_MASK)
380 cpu_relax();
381
382done:
383 smp_acquire__after_ctrl_dep();
384}
385EXPORT_SYMBOL(queued_spin_unlock_wait);
386#endif
387
388#endif /* _GEN_PV_LOCK_SLOWPATH */ 271#endif /* _GEN_PV_LOCK_SLOWPATH */
389 272
390/** 273/**
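
With queued_spin_unlock_wait() and its supporting commentary removed, any remaining caller that needed its ordering is expected to take and release the lock instead, as the kernel/exit.c hunk above does for ->pi_lock. The general replacement pattern, sketched on an arbitrary spinlock_t:

        /* Old: wait for any current holder to release the lock. */
        spin_unlock_wait(&lock);

        /*
         * New: a full acquire/release pair, which orders at least as
         * strongly against both prior and subsequent critical sections.
         */
        spin_lock(&lock);
        spin_unlock(&lock);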
diff --git a/kernel/membarrier.c b/kernel/membarrier.c
deleted file mode 100644
index 9f9284f37f8d..000000000000
--- a/kernel/membarrier.c
+++ /dev/null
@@ -1,70 +0,0 @@
1/*
2 * Copyright (C) 2010, 2015 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
3 *
4 * membarrier system call
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 */
16
17#include <linux/syscalls.h>
18#include <linux/membarrier.h>
19#include <linux/tick.h>
20
21/*
22 * Bitmask made from a "or" of all commands within enum membarrier_cmd,
23 * except MEMBARRIER_CMD_QUERY.
24 */
25#define MEMBARRIER_CMD_BITMASK (MEMBARRIER_CMD_SHARED)
26
27/**
28 * sys_membarrier - issue memory barriers on a set of threads
29 * @cmd: Takes command values defined in enum membarrier_cmd.
30 * @flags: Currently needs to be 0. For future extensions.
31 *
32 * If this system call is not implemented, -ENOSYS is returned. If the
33 * command specified does not exist, or if the command argument is invalid,
34 * this system call returns -EINVAL. For a given command, with flags argument
35 * set to 0, this system call is guaranteed to always return the same value
36 * until reboot.
37 *
38 * All memory accesses performed in program order from each targeted thread
39 * is guaranteed to be ordered with respect to sys_membarrier(). If we use
40 * the semantic "barrier()" to represent a compiler barrier forcing memory
41 * accesses to be performed in program order across the barrier, and
42 * smp_mb() to represent explicit memory barriers forcing full memory
43 * ordering across the barrier, we have the following ordering table for
44 * each pair of barrier(), sys_membarrier() and smp_mb():
45 *
46 * The pair ordering is detailed as (O: ordered, X: not ordered):
47 *
48 * barrier() smp_mb() sys_membarrier()
49 * barrier() X X O
50 * smp_mb() X O O
51 * sys_membarrier() O O O
52 */
53SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
54{
55 /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */
56 if (tick_nohz_full_enabled())
57 return -ENOSYS;
58 if (unlikely(flags))
59 return -EINVAL;
60 switch (cmd) {
61 case MEMBARRIER_CMD_QUERY:
62 return MEMBARRIER_CMD_BITMASK;
63 case MEMBARRIER_CMD_SHARED:
64 if (num_online_cpus() > 1)
65 synchronize_sched();
66 return 0;
67 default:
68 return -EINVAL;
69 }
70}
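
Per the diffstat, the implementation reappears as kernel/sched/membarrier.c (152 lines), so the code above is moved rather than dropped. The deleted header comment documents the user-facing contract; a minimal user-space sketch of the query-then-barrier pattern it describes, using only the two commands shown above:

#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

/* Raw syscall wrapper; no libc wrapper is assumed here. */
static int membarrier(int cmd, int flags)
{
        return syscall(__NR_membarrier, cmd, flags);
}

int main(void)
{
        int mask = membarrier(MEMBARRIER_CMD_QUERY, 0);

        if (mask < 0) {
                perror("membarrier");   /* e.g. ENOSYS on nohz_full kernels */
                return 1;
        }
        if (mask & MEMBARRIER_CMD_SHARED)
                membarrier(MEMBARRIER_CMD_SHARED, 0);   /* system-wide memory barrier */
        return 0;
}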
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index be90c945063f..9210379c0353 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -69,8 +69,7 @@ config TREE_SRCU
69 This option selects the full-fledged version of SRCU. 69 This option selects the full-fledged version of SRCU.
70 70
71config TASKS_RCU 71config TASKS_RCU
72 bool 72 def_bool PREEMPT
73 default n
74 select SRCU 73 select SRCU
75 help 74 help
76 This option enables a task-based RCU implementation that uses 75 This option enables a task-based RCU implementation that uses
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
index 808b8c85f626..e4b43fef89f5 100644
--- a/kernel/rcu/rcu.h
+++ b/kernel/rcu/rcu.h
@@ -356,22 +356,10 @@ do { \
356 356
357#ifdef CONFIG_TINY_RCU 357#ifdef CONFIG_TINY_RCU
358/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */ 358/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */
359static inline bool rcu_gp_is_normal(void) /* Internal RCU use. */ 359static inline bool rcu_gp_is_normal(void) { return true; }
360{ 360static inline bool rcu_gp_is_expedited(void) { return false; }
361 return true; 361static inline void rcu_expedite_gp(void) { }
362} 362static inline void rcu_unexpedite_gp(void) { }
363static inline bool rcu_gp_is_expedited(void) /* Internal RCU use. */
364{
365 return false;
366}
367
368static inline void rcu_expedite_gp(void)
369{
370}
371
372static inline void rcu_unexpedite_gp(void)
373{
374}
375#else /* #ifdef CONFIG_TINY_RCU */ 363#else /* #ifdef CONFIG_TINY_RCU */
376bool rcu_gp_is_normal(void); /* Internal RCU use. */ 364bool rcu_gp_is_normal(void); /* Internal RCU use. */
377bool rcu_gp_is_expedited(void); /* Internal RCU use. */ 365bool rcu_gp_is_expedited(void); /* Internal RCU use. */
@@ -419,12 +407,8 @@ static inline void rcutorture_get_gp_data(enum rcutorture_type test_type,
419 *gpnum = 0; 407 *gpnum = 0;
420 *completed = 0; 408 *completed = 0;
421} 409}
422static inline void rcutorture_record_test_transition(void) 410static inline void rcutorture_record_test_transition(void) { }
423{ 411static inline void rcutorture_record_progress(unsigned long vernum) { }
424}
425static inline void rcutorture_record_progress(unsigned long vernum)
426{
427}
428#ifdef CONFIG_RCU_TRACE 412#ifdef CONFIG_RCU_TRACE
429void do_trace_rcu_torture_read(const char *rcutorturename, 413void do_trace_rcu_torture_read(const char *rcutorturename,
430 struct rcu_head *rhp, 414 struct rcu_head *rhp,
@@ -460,92 +444,20 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type,
460#endif 444#endif
461 445
462#ifdef CONFIG_TINY_RCU 446#ifdef CONFIG_TINY_RCU
463 447static inline unsigned long rcu_batches_started(void) { return 0; }
464/* 448static inline unsigned long rcu_batches_started_bh(void) { return 0; }
465 * Return the number of grace periods started. 449static inline unsigned long rcu_batches_started_sched(void) { return 0; }
466 */ 450static inline unsigned long rcu_batches_completed(void) { return 0; }
467static inline unsigned long rcu_batches_started(void) 451static inline unsigned long rcu_batches_completed_bh(void) { return 0; }
468{ 452static inline unsigned long rcu_batches_completed_sched(void) { return 0; }
469 return 0; 453static inline unsigned long rcu_exp_batches_completed(void) { return 0; }
470} 454static inline unsigned long rcu_exp_batches_completed_sched(void) { return 0; }
471 455static inline unsigned long
472/* 456srcu_batches_completed(struct srcu_struct *sp) { return 0; }
473 * Return the number of bottom-half grace periods started. 457static inline void rcu_force_quiescent_state(void) { }
474 */ 458static inline void rcu_bh_force_quiescent_state(void) { }
475static inline unsigned long rcu_batches_started_bh(void) 459static inline void rcu_sched_force_quiescent_state(void) { }
476{ 460static inline void show_rcu_gp_kthreads(void) { }
477 return 0;
478}
479
480/*
481 * Return the number of sched grace periods started.
482 */
483static inline unsigned long rcu_batches_started_sched(void)
484{
485 return 0;
486}
487
488/*
489 * Return the number of grace periods completed.
490 */
491static inline unsigned long rcu_batches_completed(void)
492{
493 return 0;
494}
495
496/*
497 * Return the number of bottom-half grace periods completed.
498 */
499static inline unsigned long rcu_batches_completed_bh(void)
500{
501 return 0;
502}
503
504/*
505 * Return the number of sched grace periods completed.
506 */
507static inline unsigned long rcu_batches_completed_sched(void)
508{
509 return 0;
510}
511
512/*
513 * Return the number of expedited grace periods completed.
514 */
515static inline unsigned long rcu_exp_batches_completed(void)
516{
517 return 0;
518}
519
520/*
521 * Return the number of expedited sched grace periods completed.
522 */
523static inline unsigned long rcu_exp_batches_completed_sched(void)
524{
525 return 0;
526}
527
528static inline unsigned long srcu_batches_completed(struct srcu_struct *sp)
529{
530 return 0;
531}
532
533static inline void rcu_force_quiescent_state(void)
534{
535}
536
537static inline void rcu_bh_force_quiescent_state(void)
538{
539}
540
541static inline void rcu_sched_force_quiescent_state(void)
542{
543}
544
545static inline void show_rcu_gp_kthreads(void)
546{
547}
548
549#else /* #ifdef CONFIG_TINY_RCU */ 461#else /* #ifdef CONFIG_TINY_RCU */
550extern unsigned long rcutorture_testseq; 462extern unsigned long rcutorture_testseq;
551extern unsigned long rcutorture_vernum; 463extern unsigned long rcutorture_vernum;
diff --git a/kernel/rcu/rcu_segcblist.c b/kernel/rcu/rcu_segcblist.c
index 2b62a38b080f..7649fcd2c4c7 100644
--- a/kernel/rcu/rcu_segcblist.c
+++ b/kernel/rcu/rcu_segcblist.c
@@ -36,24 +36,6 @@ void rcu_cblist_init(struct rcu_cblist *rclp)
36} 36}
37 37
38/* 38/*
39 * Debug function to actually count the number of callbacks.
40 * If the number exceeds the limit specified, return -1.
41 */
42long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim)
43{
44 int cnt = 0;
45 struct rcu_head **rhpp = &rclp->head;
46
47 for (;;) {
48 if (!*rhpp)
49 return cnt;
50 if (++cnt > lim)
51 return -1;
52 rhpp = &(*rhpp)->next;
53 }
54}
55
56/*
57 * Dequeue the oldest rcu_head structure from the specified callback 39 * Dequeue the oldest rcu_head structure from the specified callback
58 * list. This function assumes that the callback is non-lazy, but 40 * list. This function assumes that the callback is non-lazy, but
59 * the caller can later invoke rcu_cblist_dequeued_lazy() if it 41 * the caller can later invoke rcu_cblist_dequeued_lazy() if it
@@ -103,17 +85,6 @@ void rcu_segcblist_disable(struct rcu_segcblist *rsclp)
103} 85}
104 86
105/* 87/*
106 * Is the specified segment of the specified rcu_segcblist structure
107 * empty of callbacks?
108 */
109bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg)
110{
111 if (seg == RCU_DONE_TAIL)
112 return &rsclp->head == rsclp->tails[RCU_DONE_TAIL];
113 return rsclp->tails[seg - 1] == rsclp->tails[seg];
114}
115
116/*
117 * Does the specified rcu_segcblist structure contain callbacks that 88 * Does the specified rcu_segcblist structure contain callbacks that
118 * are ready to be invoked? 89 * are ready to be invoked?
119 */ 90 */
@@ -134,50 +105,6 @@ bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp)
134} 105}
135 106
136/* 107/*
137 * Dequeue and return the first ready-to-invoke callback. If there
138 * are no ready-to-invoke callbacks, return NULL. Disables interrupts
139 * to avoid interference. Does not protect from interference from other
140 * CPUs or tasks.
141 */
142struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp)
143{
144 unsigned long flags;
145 int i;
146 struct rcu_head *rhp;
147
148 local_irq_save(flags);
149 if (!rcu_segcblist_ready_cbs(rsclp)) {
150 local_irq_restore(flags);
151 return NULL;
152 }
153 rhp = rsclp->head;
154 BUG_ON(!rhp);
155 rsclp->head = rhp->next;
156 for (i = RCU_DONE_TAIL; i < RCU_CBLIST_NSEGS; i++) {
157 if (rsclp->tails[i] != &rhp->next)
158 break;
159 rsclp->tails[i] = &rsclp->head;
160 }
161 smp_mb(); /* Dequeue before decrement for rcu_barrier(). */
162 WRITE_ONCE(rsclp->len, rsclp->len - 1);
163 local_irq_restore(flags);
164 return rhp;
165}
166
167/*
168 * Account for the fact that a previously dequeued callback turned out
169 * to be marked as lazy.
170 */
171void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp)
172{
173 unsigned long flags;
174
175 local_irq_save(flags);
176 rsclp->len_lazy--;
177 local_irq_restore(flags);
178}
179
180/*
181 * Return a pointer to the first callback in the specified rcu_segcblist 108 * Return a pointer to the first callback in the specified rcu_segcblist
182 * structure. This is useful for diagnostics. 109 * structure. This is useful for diagnostics.
183 */ 110 */
@@ -203,17 +130,6 @@ struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp)
203} 130}
204 131
205/* 132/*
206 * Does the specified rcu_segcblist structure contain callbacks that
207 * have not yet been processed beyond having been posted, that is,
208 * does it contain callbacks in its last segment?
209 */
210bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp)
211{
212 return rcu_segcblist_is_enabled(rsclp) &&
213 !rcu_segcblist_restempty(rsclp, RCU_NEXT_READY_TAIL);
214}
215
216/*
217 * Enqueue the specified callback onto the specified rcu_segcblist 133 * Enqueue the specified callback onto the specified rcu_segcblist
218 * structure, updating accounting as needed. Note that the ->len 134 * structure, updating accounting as needed. Note that the ->len
219 * field may be accessed locklessly, hence the WRITE_ONCE(). 135 * field may be accessed locklessly, hence the WRITE_ONCE().
@@ -503,3 +419,27 @@ bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp,
503 return true; 419 return true;
504 return false; 420 return false;
505} 421}
422
423/*
424 * Merge the source rcu_segcblist structure into the destination
425 * rcu_segcblist structure, then initialize the source. Any pending
426 * callbacks from the source get to start over. It is best to
427 * advance and accelerate both the destination and the source
428 * before merging.
429 */
430void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
431 struct rcu_segcblist *src_rsclp)
432{
433 struct rcu_cblist donecbs;
434 struct rcu_cblist pendcbs;
435
436 rcu_cblist_init(&donecbs);
437 rcu_cblist_init(&pendcbs);
438 rcu_segcblist_extract_count(src_rsclp, &donecbs);
439 rcu_segcblist_extract_done_cbs(src_rsclp, &donecbs);
440 rcu_segcblist_extract_pend_cbs(src_rsclp, &pendcbs);
441 rcu_segcblist_insert_count(dst_rsclp, &donecbs);
442 rcu_segcblist_insert_done_cbs(dst_rsclp, &donecbs);
443 rcu_segcblist_insert_pend_cbs(dst_rsclp, &pendcbs);
444 rcu_segcblist_init(src_rsclp);
445}
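
The requirement in the comment above (advance both lists before merging) is what the new CPU-hotplug migration path enforces; condensed from the rcu_migrate_callbacks() hunk in kernel/rcu/tree.c later in this diff, the caller sequence is:

        raw_spin_lock_rcu_node(rnp_root);               /* irqs already disabled */
        rcu_advance_cbs(rsp, rnp_root, rdp);            /* leverage recent GPs */
        rcu_advance_cbs(rsp, rnp_root, my_rdp);         /* assign GP to pending CBs */
        rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist);
        raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags);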
diff --git a/kernel/rcu/rcu_segcblist.h b/kernel/rcu/rcu_segcblist.h
index 6e36e36478cd..581c12b63544 100644
--- a/kernel/rcu/rcu_segcblist.h
+++ b/kernel/rcu/rcu_segcblist.h
@@ -31,29 +31,7 @@ static inline void rcu_cblist_dequeued_lazy(struct rcu_cblist *rclp)
31 rclp->len_lazy--; 31 rclp->len_lazy--;
32} 32}
33 33
34/*
35 * Interim function to return rcu_cblist head pointer. Longer term, the
36 * rcu_cblist will be used more pervasively, removing the need for this
37 * function.
38 */
39static inline struct rcu_head *rcu_cblist_head(struct rcu_cblist *rclp)
40{
41 return rclp->head;
42}
43
44/*
45 * Interim function to return rcu_cblist head pointer. Longer term, the
46 * rcu_cblist will be used more pervasively, removing the need for this
47 * function.
48 */
49static inline struct rcu_head **rcu_cblist_tail(struct rcu_cblist *rclp)
50{
51 WARN_ON_ONCE(!rclp->head);
52 return rclp->tail;
53}
54
55void rcu_cblist_init(struct rcu_cblist *rclp); 34void rcu_cblist_init(struct rcu_cblist *rclp);
56long rcu_cblist_count_cbs(struct rcu_cblist *rclp, long lim);
57struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp); 35struct rcu_head *rcu_cblist_dequeue(struct rcu_cblist *rclp);
58 36
59/* 37/*
@@ -134,14 +112,10 @@ static inline struct rcu_head **rcu_segcblist_tail(struct rcu_segcblist *rsclp)
134 112
135void rcu_segcblist_init(struct rcu_segcblist *rsclp); 113void rcu_segcblist_init(struct rcu_segcblist *rsclp);
136void rcu_segcblist_disable(struct rcu_segcblist *rsclp); 114void rcu_segcblist_disable(struct rcu_segcblist *rsclp);
137bool rcu_segcblist_segempty(struct rcu_segcblist *rsclp, int seg);
138bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp); 115bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp);
139bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp); 116bool rcu_segcblist_pend_cbs(struct rcu_segcblist *rsclp);
140struct rcu_head *rcu_segcblist_dequeue(struct rcu_segcblist *rsclp);
141void rcu_segcblist_dequeued_lazy(struct rcu_segcblist *rsclp);
142struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp); 117struct rcu_head *rcu_segcblist_first_cb(struct rcu_segcblist *rsclp);
143struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp); 118struct rcu_head *rcu_segcblist_first_pend_cb(struct rcu_segcblist *rsclp);
144bool rcu_segcblist_new_cbs(struct rcu_segcblist *rsclp);
145void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp, 119void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
146 struct rcu_head *rhp, bool lazy); 120 struct rcu_head *rhp, bool lazy);
147bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp, 121bool rcu_segcblist_entrain(struct rcu_segcblist *rsclp,
@@ -162,3 +136,5 @@ void rcu_segcblist_advance(struct rcu_segcblist *rsclp, unsigned long seq);
162bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq); 136bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq);
163bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp, 137bool rcu_segcblist_future_gp_needed(struct rcu_segcblist *rsclp,
164 unsigned long seq); 138 unsigned long seq);
139void rcu_segcblist_merge(struct rcu_segcblist *dst_rsclp,
140 struct rcu_segcblist *src_rsclp);
diff --git a/kernel/rcu/rcuperf.c b/kernel/rcu/rcuperf.c
index 3cc18110b612..1f87a02c3399 100644
--- a/kernel/rcu/rcuperf.c
+++ b/kernel/rcu/rcuperf.c
@@ -317,8 +317,6 @@ static struct rcu_perf_ops sched_ops = {
317 .name = "sched" 317 .name = "sched"
318}; 318};
319 319
320#ifdef CONFIG_TASKS_RCU
321
322/* 320/*
323 * Definitions for RCU-tasks perf testing. 321 * Definitions for RCU-tasks perf testing.
324 */ 322 */
@@ -346,24 +344,11 @@ static struct rcu_perf_ops tasks_ops = {
346 .name = "tasks" 344 .name = "tasks"
347}; 345};
348 346
349#define RCUPERF_TASKS_OPS &tasks_ops,
350
351static bool __maybe_unused torturing_tasks(void) 347static bool __maybe_unused torturing_tasks(void)
352{ 348{
353 return cur_ops == &tasks_ops; 349 return cur_ops == &tasks_ops;
354} 350}
355 351
356#else /* #ifdef CONFIG_TASKS_RCU */
357
358#define RCUPERF_TASKS_OPS
359
360static bool __maybe_unused torturing_tasks(void)
361{
362 return false;
363}
364
365#endif /* #else #ifdef CONFIG_TASKS_RCU */
366
367/* 352/*
368 * If performance tests complete, wait for shutdown to commence. 353 * If performance tests complete, wait for shutdown to commence.
369 */ 354 */
@@ -658,7 +643,7 @@ rcu_perf_init(void)
658 int firsterr = 0; 643 int firsterr = 0;
659 static struct rcu_perf_ops *perf_ops[] = { 644 static struct rcu_perf_ops *perf_ops[] = {
660 &rcu_ops, &rcu_bh_ops, &srcu_ops, &srcud_ops, &sched_ops, 645 &rcu_ops, &rcu_bh_ops, &srcu_ops, &srcud_ops, &sched_ops,
661 RCUPERF_TASKS_OPS 646 &tasks_ops,
662 }; 647 };
663 648
664 if (!torture_init_begin(perf_type, verbose, &perf_runnable)) 649 if (!torture_init_begin(perf_type, verbose, &perf_runnable))
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index b8f7f8ce8575..45f2ffbc1e78 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -199,7 +199,8 @@ MODULE_PARM_DESC(torture_runnable, "Start rcutorture at boot");
199static u64 notrace rcu_trace_clock_local(void) 199static u64 notrace rcu_trace_clock_local(void)
200{ 200{
201 u64 ts = trace_clock_local(); 201 u64 ts = trace_clock_local();
202 unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC); 202
203 (void)do_div(ts, NSEC_PER_USEC);
203 return ts; 204 return ts;
204} 205}
205#else /* #ifdef CONFIG_RCU_TRACE */ 206#else /* #ifdef CONFIG_RCU_TRACE */
@@ -496,7 +497,7 @@ static struct rcu_torture_ops rcu_busted_ops = {
496 .fqs = NULL, 497 .fqs = NULL,
497 .stats = NULL, 498 .stats = NULL,
498 .irq_capable = 1, 499 .irq_capable = 1,
499 .name = "rcu_busted" 500 .name = "busted"
500}; 501};
501 502
502/* 503/*
@@ -522,7 +523,7 @@ static void srcu_read_delay(struct torture_random_state *rrsp)
522 523
523 delay = torture_random(rrsp) % 524 delay = torture_random(rrsp) %
524 (nrealreaders * 2 * longdelay * uspertick); 525 (nrealreaders * 2 * longdelay * uspertick);
525 if (!delay) 526 if (!delay && in_task())
526 schedule_timeout_interruptible(longdelay); 527 schedule_timeout_interruptible(longdelay);
527 else 528 else
528 rcu_read_delay(rrsp); 529 rcu_read_delay(rrsp);
@@ -561,44 +562,7 @@ static void srcu_torture_barrier(void)
561 562
562static void srcu_torture_stats(void) 563static void srcu_torture_stats(void)
563{ 564{
564 int __maybe_unused cpu; 565 srcu_torture_stats_print(srcu_ctlp, torture_type, TORTURE_FLAG);
565 int idx;
566
567#ifdef CONFIG_TREE_SRCU
568 idx = srcu_ctlp->srcu_idx & 0x1;
569 pr_alert("%s%s Tree SRCU per-CPU(idx=%d):",
570 torture_type, TORTURE_FLAG, idx);
571 for_each_possible_cpu(cpu) {
572 unsigned long l0, l1;
573 unsigned long u0, u1;
574 long c0, c1;
575 struct srcu_data *counts;
576
577 counts = per_cpu_ptr(srcu_ctlp->sda, cpu);
578 u0 = counts->srcu_unlock_count[!idx];
579 u1 = counts->srcu_unlock_count[idx];
580
581 /*
582 * Make sure that a lock is always counted if the corresponding
583 * unlock is counted.
584 */
585 smp_rmb();
586
587 l0 = counts->srcu_lock_count[!idx];
588 l1 = counts->srcu_lock_count[idx];
589
590 c0 = l0 - u0;
591 c1 = l1 - u1;
592 pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
593 }
594 pr_cont("\n");
595#elif defined(CONFIG_TINY_SRCU)
596 idx = READ_ONCE(srcu_ctlp->srcu_idx) & 0x1;
597 pr_alert("%s%s Tiny SRCU per-CPU(idx=%d): (%hd,%hd)\n",
598 torture_type, TORTURE_FLAG, idx,
599 READ_ONCE(srcu_ctlp->srcu_lock_nesting[!idx]),
600 READ_ONCE(srcu_ctlp->srcu_lock_nesting[idx]));
601#endif
602} 566}
603 567
604static void srcu_torture_synchronize_expedited(void) 568static void srcu_torture_synchronize_expedited(void)
@@ -620,6 +584,7 @@ static struct rcu_torture_ops srcu_ops = {
620 .call = srcu_torture_call, 584 .call = srcu_torture_call,
621 .cb_barrier = srcu_torture_barrier, 585 .cb_barrier = srcu_torture_barrier,
622 .stats = srcu_torture_stats, 586 .stats = srcu_torture_stats,
587 .irq_capable = 1,
623 .name = "srcu" 588 .name = "srcu"
624}; 589};
625 590
@@ -652,6 +617,7 @@ static struct rcu_torture_ops srcud_ops = {
652 .call = srcu_torture_call, 617 .call = srcu_torture_call,
653 .cb_barrier = srcu_torture_barrier, 618 .cb_barrier = srcu_torture_barrier,
654 .stats = srcu_torture_stats, 619 .stats = srcu_torture_stats,
620 .irq_capable = 1,
655 .name = "srcud" 621 .name = "srcud"
656}; 622};
657 623
@@ -696,8 +662,6 @@ static struct rcu_torture_ops sched_ops = {
696 .name = "sched" 662 .name = "sched"
697}; 663};
698 664
699#ifdef CONFIG_TASKS_RCU
700
701/* 665/*
702 * Definitions for RCU-tasks torture testing. 666 * Definitions for RCU-tasks torture testing.
703 */ 667 */
@@ -735,24 +699,11 @@ static struct rcu_torture_ops tasks_ops = {
735 .name = "tasks" 699 .name = "tasks"
736}; 700};
737 701
738#define RCUTORTURE_TASKS_OPS &tasks_ops,
739
740static bool __maybe_unused torturing_tasks(void) 702static bool __maybe_unused torturing_tasks(void)
741{ 703{
742 return cur_ops == &tasks_ops; 704 return cur_ops == &tasks_ops;
743} 705}
744 706
745#else /* #ifdef CONFIG_TASKS_RCU */
746
747#define RCUTORTURE_TASKS_OPS
748
749static bool __maybe_unused torturing_tasks(void)
750{
751 return false;
752}
753
754#endif /* #else #ifdef CONFIG_TASKS_RCU */
755
756/* 707/*
757 * RCU torture priority-boost testing. Runs one real-time thread per 708 * RCU torture priority-boost testing. Runs one real-time thread per
758 * CPU for moderate bursts, repeatedly registering RCU callbacks and 709 * CPU for moderate bursts, repeatedly registering RCU callbacks and
@@ -1114,6 +1065,11 @@ rcu_torture_fakewriter(void *arg)
1114 return 0; 1065 return 0;
1115} 1066}
1116 1067
1068static void rcu_torture_timer_cb(struct rcu_head *rhp)
1069{
1070 kfree(rhp);
1071}
1072
1117/* 1073/*
1118 * RCU torture reader from timer handler. Dereferences rcu_torture_current, 1074 * RCU torture reader from timer handler. Dereferences rcu_torture_current,
1119 * incrementing the corresponding element of the pipeline array. The 1075 * incrementing the corresponding element of the pipeline array. The
@@ -1176,6 +1132,14 @@ static void rcu_torture_timer(unsigned long unused)
1176 __this_cpu_inc(rcu_torture_batch[completed]); 1132 __this_cpu_inc(rcu_torture_batch[completed]);
1177 preempt_enable(); 1133 preempt_enable();
1178 cur_ops->readunlock(idx); 1134 cur_ops->readunlock(idx);
1135
1136 /* Test call_rcu() invocation from interrupt handler. */
1137 if (cur_ops->call) {
1138 struct rcu_head *rhp = kmalloc(sizeof(*rhp), GFP_NOWAIT);
1139
1140 if (rhp)
1141 cur_ops->call(rhp, rcu_torture_timer_cb);
1142 }
1179} 1143}
1180 1144
1181/* 1145/*
@@ -1354,11 +1318,12 @@ rcu_torture_stats_print(void)
1354 srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, 1318 srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp,
1355 &flags, &gpnum, &completed); 1319 &flags, &gpnum, &completed);
1356 wtp = READ_ONCE(writer_task); 1320 wtp = READ_ONCE(writer_task);
1357 pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx\n", 1321 pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x ->state %#lx cpu %d\n",
1358 rcu_torture_writer_state_getname(), 1322 rcu_torture_writer_state_getname(),
1359 rcu_torture_writer_state, 1323 rcu_torture_writer_state,
1360 gpnum, completed, flags, 1324 gpnum, completed, flags,
1361 wtp == NULL ? ~0UL : wtp->state); 1325 wtp == NULL ? ~0UL : wtp->state,
1326 wtp == NULL ? -1 : (int)task_cpu(wtp));
1362 show_rcu_gp_kthreads(); 1327 show_rcu_gp_kthreads();
1363 rcu_ftrace_dump(DUMP_ALL); 1328 rcu_ftrace_dump(DUMP_ALL);
1364 } 1329 }
@@ -1749,7 +1714,7 @@ rcu_torture_init(void)
1749 int firsterr = 0; 1714 int firsterr = 0;
1750 static struct rcu_torture_ops *torture_ops[] = { 1715 static struct rcu_torture_ops *torture_ops[] = {
1751 &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops, 1716 &rcu_ops, &rcu_bh_ops, &rcu_busted_ops, &srcu_ops, &srcud_ops,
1752 &sched_ops, RCUTORTURE_TASKS_OPS 1717 &sched_ops, &tasks_ops,
1753 }; 1718 };
1754 1719
1755 if (!torture_init_begin(torture_type, verbose, &torture_runnable)) 1720 if (!torture_init_begin(torture_type, verbose, &torture_runnable))
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 1a1c1047d2ed..76ac5f50b2c7 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -33,6 +33,8 @@
33#include "rcu_segcblist.h" 33#include "rcu_segcblist.h"
34#include "rcu.h" 34#include "rcu.h"
35 35
36int rcu_scheduler_active __read_mostly;
37
36static int init_srcu_struct_fields(struct srcu_struct *sp) 38static int init_srcu_struct_fields(struct srcu_struct *sp)
37{ 39{
38 sp->srcu_lock_nesting[0] = 0; 40 sp->srcu_lock_nesting[0] = 0;
@@ -193,3 +195,9 @@ void synchronize_srcu(struct srcu_struct *sp)
193 destroy_rcu_head_on_stack(&rs.head); 195 destroy_rcu_head_on_stack(&rs.head);
194} 196}
195EXPORT_SYMBOL_GPL(synchronize_srcu); 197EXPORT_SYMBOL_GPL(synchronize_srcu);
198
199/* Lockdep diagnostics. */
200void __init rcu_scheduler_starting(void)
201{
202 rcu_scheduler_active = RCU_SCHEDULER_RUNNING;
203}
diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c
index d0ca524bf042..729a8706751d 100644
--- a/kernel/rcu/srcutree.c
+++ b/kernel/rcu/srcutree.c
@@ -51,6 +51,7 @@ module_param(counter_wrap_check, ulong, 0444);
51 51
52static void srcu_invoke_callbacks(struct work_struct *work); 52static void srcu_invoke_callbacks(struct work_struct *work);
53static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay); 53static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay);
54static void process_srcu(struct work_struct *work);
54 55
55/* 56/*
56 * Initialize SRCU combining tree. Note that statically allocated 57 * Initialize SRCU combining tree. Note that statically allocated
@@ -896,6 +897,15 @@ static void __synchronize_srcu(struct srcu_struct *sp, bool do_norm)
896 __call_srcu(sp, &rcu.head, wakeme_after_rcu, do_norm); 897 __call_srcu(sp, &rcu.head, wakeme_after_rcu, do_norm);
897 wait_for_completion(&rcu.completion); 898 wait_for_completion(&rcu.completion);
898 destroy_rcu_head_on_stack(&rcu.head); 899 destroy_rcu_head_on_stack(&rcu.head);
900
901 /*
902 * Make sure that later code is ordered after the SRCU grace
903 * period. This pairs with the raw_spin_lock_irq_rcu_node()
904 * in srcu_invoke_callbacks(). Unlike Tree RCU, this is needed
905 * because the current CPU might have been totally uninvolved with
906 * (and thus unordered against) that grace period.
907 */
908 smp_mb();
899} 909}
900 910
901/** 911/**
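
The guarantee this smp_mb() backs up can be stated as a litmus test in the same style as the spin_unlock_wait() commentary removed earlier, with x and y ordinary shared variables (sketch):

        CPU0 (updater)                      CPU1 (reader)

        WRITE_ONCE(x, 1);                   idx = srcu_read_lock(&sp);
        synchronize_srcu(&sp);              r1 = READ_ONCE(y);
        WRITE_ONCE(y, 1);                   r2 = READ_ONCE(x);
                                            srcu_read_unlock(&sp, idx);

The outcome r1 == 1 && r2 == 0 must be forbidden even when CPU0 never executed any of the grace-period machinery itself, which is exactly the case the new barrier covers.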
@@ -1194,7 +1204,7 @@ static void srcu_reschedule(struct srcu_struct *sp, unsigned long delay)
1194/* 1204/*
1195 * This is the work-queue function that handles SRCU grace periods. 1205 * This is the work-queue function that handles SRCU grace periods.
1196 */ 1206 */
1197void process_srcu(struct work_struct *work) 1207static void process_srcu(struct work_struct *work)
1198{ 1208{
1199 struct srcu_struct *sp; 1209 struct srcu_struct *sp;
1200 1210
@@ -1203,7 +1213,6 @@ void process_srcu(struct work_struct *work)
1203 srcu_advance_state(sp); 1213 srcu_advance_state(sp);
1204 srcu_reschedule(sp, srcu_get_delay(sp)); 1214 srcu_reschedule(sp, srcu_get_delay(sp));
1205} 1215}
1206EXPORT_SYMBOL_GPL(process_srcu);
1207 1216
1208void srcutorture_get_gp_data(enum rcutorture_type test_type, 1217void srcutorture_get_gp_data(enum rcutorture_type test_type,
1209 struct srcu_struct *sp, int *flags, 1218 struct srcu_struct *sp, int *flags,
@@ -1217,6 +1226,43 @@ void srcutorture_get_gp_data(enum rcutorture_type test_type,
1217} 1226}
1218EXPORT_SYMBOL_GPL(srcutorture_get_gp_data); 1227EXPORT_SYMBOL_GPL(srcutorture_get_gp_data);
1219 1228
1229void srcu_torture_stats_print(struct srcu_struct *sp, char *tt, char *tf)
1230{
1231 int cpu;
1232 int idx;
1233 unsigned long s0 = 0, s1 = 0;
1234
1235 idx = sp->srcu_idx & 0x1;
1236 pr_alert("%s%s Tree SRCU per-CPU(idx=%d):", tt, tf, idx);
1237 for_each_possible_cpu(cpu) {
1238 unsigned long l0, l1;
1239 unsigned long u0, u1;
1240 long c0, c1;
1241 struct srcu_data *counts;
1242
1243 counts = per_cpu_ptr(sp->sda, cpu);
1244 u0 = counts->srcu_unlock_count[!idx];
1245 u1 = counts->srcu_unlock_count[idx];
1246
1247 /*
1248 * Make sure that a lock is always counted if the corresponding
1249 * unlock is counted.
1250 */
1251 smp_rmb();
1252
1253 l0 = counts->srcu_lock_count[!idx];
1254 l1 = counts->srcu_lock_count[idx];
1255
1256 c0 = l0 - u0;
1257 c1 = l1 - u1;
1258 pr_cont(" %d(%ld,%ld)", cpu, c0, c1);
1259 s0 += c0;
1260 s1 += c1;
1261 }
1262 pr_cont(" T(%ld,%ld)\n", s0, s1);
1263}
1264EXPORT_SYMBOL_GPL(srcu_torture_stats_print);
1265
1220static int __init srcu_bootup_announce(void) 1266static int __init srcu_bootup_announce(void)
1221{ 1267{
1222 pr_info("Hierarchical SRCU implementation.\n"); 1268 pr_info("Hierarchical SRCU implementation.\n");
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index f8488965250f..a64eee0db39e 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -56,8 +56,6 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = {
56 .curtail = &rcu_bh_ctrlblk.rcucblist, 56 .curtail = &rcu_bh_ctrlblk.rcucblist,
57}; 57};
58 58
59#include "tiny_plugin.h"
60
61void rcu_barrier_bh(void) 59void rcu_barrier_bh(void)
62{ 60{
63 wait_rcu_gp(call_rcu_bh); 61 wait_rcu_gp(call_rcu_bh);
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
deleted file mode 100644
index f0a01b2a3062..000000000000
--- a/kernel/rcu/tiny_plugin.h
+++ /dev/null
@@ -1,47 +0,0 @@
1/*
2 * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition
3 * Internal non-public definitions that provide either classic
4 * or preemptible semantics.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, you can access it online at
18 * http://www.gnu.org/licenses/gpl-2.0.html.
19 *
20 * Copyright (c) 2010 Linaro
21 *
22 * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
23 */
24
25#if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU)
26#include <linux/kernel_stat.h>
27
28int rcu_scheduler_active __read_mostly;
29EXPORT_SYMBOL_GPL(rcu_scheduler_active);
30
31/*
32 * During boot, we forgive RCU lockdep issues. After this function is
33 * invoked, we start taking RCU lockdep issues seriously. Note that unlike
34 * Tree RCU, Tiny RCU transitions directly from RCU_SCHEDULER_INACTIVE
35 * to RCU_SCHEDULER_RUNNING, skipping the RCU_SCHEDULER_INIT stage.
36 * The reason for this is that Tiny RCU does not need kthreads, so does
37 * not have to care about the fact that the scheduler is half-initialized
38 * at a certain phase of the boot process. Unless SRCU is in the mix.
39 */
40void __init rcu_scheduler_starting(void)
41{
42 WARN_ON(nr_context_switches() > 0);
43 rcu_scheduler_active = IS_ENABLED(CONFIG_SRCU)
44 ? RCU_SCHEDULER_INIT : RCU_SCHEDULER_RUNNING;
45}
46
47#endif /* #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_SRCU) */
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 9bb5dff50815..84fe96641b2e 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -97,9 +97,6 @@ struct rcu_state sname##_state = { \
97 .gp_state = RCU_GP_IDLE, \ 97 .gp_state = RCU_GP_IDLE, \
98 .gpnum = 0UL - 300UL, \ 98 .gpnum = 0UL - 300UL, \
99 .completed = 0UL - 300UL, \ 99 .completed = 0UL - 300UL, \
100 .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \
101 .orphan_pend = RCU_CBLIST_INITIALIZER(sname##_state.orphan_pend), \
102 .orphan_done = RCU_CBLIST_INITIALIZER(sname##_state.orphan_done), \
103 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ 100 .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
104 .name = RCU_STATE_NAME(sname), \ 101 .name = RCU_STATE_NAME(sname), \
105 .abbr = sabbr, \ 102 .abbr = sabbr, \
@@ -843,13 +840,9 @@ static void rcu_eqs_enter(bool user)
843 */ 840 */
844void rcu_idle_enter(void) 841void rcu_idle_enter(void)
845{ 842{
846 unsigned long flags; 843 RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_idle_enter() invoked with irqs enabled!!!");
847
848 local_irq_save(flags);
849 rcu_eqs_enter(false); 844 rcu_eqs_enter(false);
850 local_irq_restore(flags);
851} 845}
852EXPORT_SYMBOL_GPL(rcu_idle_enter);
853 846
854#ifdef CONFIG_NO_HZ_FULL 847#ifdef CONFIG_NO_HZ_FULL
855/** 848/**
@@ -862,7 +855,8 @@ EXPORT_SYMBOL_GPL(rcu_idle_enter);
862 */ 855 */
863void rcu_user_enter(void) 856void rcu_user_enter(void)
864{ 857{
865 rcu_eqs_enter(1); 858 RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_user_enter() invoked with irqs enabled!!!");
859 rcu_eqs_enter(true);
866} 860}
867#endif /* CONFIG_NO_HZ_FULL */ 861#endif /* CONFIG_NO_HZ_FULL */
868 862
@@ -955,8 +949,10 @@ static void rcu_eqs_exit(bool user)
955 if (oldval & DYNTICK_TASK_NEST_MASK) { 949 if (oldval & DYNTICK_TASK_NEST_MASK) {
956 rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; 950 rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE;
957 } else { 951 } else {
952 __this_cpu_inc(disable_rcu_irq_enter);
958 rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; 953 rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
959 rcu_eqs_exit_common(oldval, user); 954 rcu_eqs_exit_common(oldval, user);
955 __this_cpu_dec(disable_rcu_irq_enter);
960 } 956 }
961} 957}
962 958
@@ -979,7 +975,6 @@ void rcu_idle_exit(void)
979 rcu_eqs_exit(false); 975 rcu_eqs_exit(false);
980 local_irq_restore(flags); 976 local_irq_restore(flags);
981} 977}
982EXPORT_SYMBOL_GPL(rcu_idle_exit);
983 978
984#ifdef CONFIG_NO_HZ_FULL 979#ifdef CONFIG_NO_HZ_FULL
985/** 980/**
@@ -1358,12 +1353,13 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp)
1358 j = jiffies; 1353 j = jiffies;
1359 gpa = READ_ONCE(rsp->gp_activity); 1354 gpa = READ_ONCE(rsp->gp_activity);
1360 if (j - gpa > 2 * HZ) { 1355 if (j - gpa > 2 * HZ) {
1361 pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx\n", 1356 pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx ->cpu=%d\n",
1362 rsp->name, j - gpa, 1357 rsp->name, j - gpa,
1363 rsp->gpnum, rsp->completed, 1358 rsp->gpnum, rsp->completed,
1364 rsp->gp_flags, 1359 rsp->gp_flags,
1365 gp_state_getname(rsp->gp_state), rsp->gp_state, 1360 gp_state_getname(rsp->gp_state), rsp->gp_state,
1366 rsp->gp_kthread ? rsp->gp_kthread->state : ~0); 1361 rsp->gp_kthread ? rsp->gp_kthread->state : ~0,
1362 rsp->gp_kthread ? task_cpu(rsp->gp_kthread) : -1);
1367 if (rsp->gp_kthread) { 1363 if (rsp->gp_kthread) {
1368 sched_show_task(rsp->gp_kthread); 1364 sched_show_task(rsp->gp_kthread);
1369 wake_up_process(rsp->gp_kthread); 1365 wake_up_process(rsp->gp_kthread);
@@ -2067,8 +2063,8 @@ static bool rcu_gp_init(struct rcu_state *rsp)
2067} 2063}
2068 2064
2069/* 2065/*
2070 * Helper function for wait_event_interruptible_timeout() wakeup 2066 * Helper function for swait_event_idle() wakeup at force-quiescent-state
2071 * at force-quiescent-state time. 2067 * time.
2072 */ 2068 */
2073static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp) 2069static bool rcu_gp_fqs_check_wake(struct rcu_state *rsp, int *gfp)
2074{ 2070{
@@ -2206,9 +2202,8 @@ static int __noreturn rcu_gp_kthread(void *arg)
2206 READ_ONCE(rsp->gpnum), 2202 READ_ONCE(rsp->gpnum),
2207 TPS("reqwait")); 2203 TPS("reqwait"));
2208 rsp->gp_state = RCU_GP_WAIT_GPS; 2204 rsp->gp_state = RCU_GP_WAIT_GPS;
2209 swait_event_interruptible(rsp->gp_wq, 2205 swait_event_idle(rsp->gp_wq, READ_ONCE(rsp->gp_flags) &
2210 READ_ONCE(rsp->gp_flags) & 2206 RCU_GP_FLAG_INIT);
2211 RCU_GP_FLAG_INIT);
2212 rsp->gp_state = RCU_GP_DONE_GPS; 2207 rsp->gp_state = RCU_GP_DONE_GPS;
2213 /* Locking provides needed memory barrier. */ 2208 /* Locking provides needed memory barrier. */
2214 if (rcu_gp_init(rsp)) 2209 if (rcu_gp_init(rsp))
@@ -2239,7 +2234,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
2239 READ_ONCE(rsp->gpnum), 2234 READ_ONCE(rsp->gpnum),
2240 TPS("fqswait")); 2235 TPS("fqswait"));
2241 rsp->gp_state = RCU_GP_WAIT_FQS; 2236 rsp->gp_state = RCU_GP_WAIT_FQS;
2242 ret = swait_event_interruptible_timeout(rsp->gp_wq, 2237 ret = swait_event_idle_timeout(rsp->gp_wq,
2243 rcu_gp_fqs_check_wake(rsp, &gf), j); 2238 rcu_gp_fqs_check_wake(rsp, &gf), j);
2244 rsp->gp_state = RCU_GP_DOING_FQS; 2239 rsp->gp_state = RCU_GP_DOING_FQS;
2245 /* Locking provides needed memory barriers. */ 2240 /* Locking provides needed memory barriers. */
@@ -2409,6 +2404,8 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
2409 return; 2404 return;
2410 } 2405 }
2411 WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */ 2406 WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */
2407 WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1 &&
2408 rcu_preempt_blocked_readers_cgp(rnp));
2412 rnp->qsmask &= ~mask; 2409 rnp->qsmask &= ~mask;
2413 trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum, 2410 trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum,
2414 mask, rnp->qsmask, rnp->level, 2411 mask, rnp->qsmask, rnp->level,
@@ -3476,10 +3473,11 @@ static void rcu_barrier_callback(struct rcu_head *rhp)
3476 struct rcu_state *rsp = rdp->rsp; 3473 struct rcu_state *rsp = rdp->rsp;
3477 3474
3478 if (atomic_dec_and_test(&rsp->barrier_cpu_count)) { 3475 if (atomic_dec_and_test(&rsp->barrier_cpu_count)) {
3479 _rcu_barrier_trace(rsp, "LastCB", -1, rsp->barrier_sequence); 3476 _rcu_barrier_trace(rsp, TPS("LastCB"), -1,
3477 rsp->barrier_sequence);
3480 complete(&rsp->barrier_completion); 3478 complete(&rsp->barrier_completion);
3481 } else { 3479 } else {
3482 _rcu_barrier_trace(rsp, "CB", -1, rsp->barrier_sequence); 3480 _rcu_barrier_trace(rsp, TPS("CB"), -1, rsp->barrier_sequence);
3483 } 3481 }
3484} 3482}
3485 3483
@@ -3491,14 +3489,15 @@ static void rcu_barrier_func(void *type)
3491 struct rcu_state *rsp = type; 3489 struct rcu_state *rsp = type;
3492 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda); 3490 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
3493 3491
3494 _rcu_barrier_trace(rsp, "IRQ", -1, rsp->barrier_sequence); 3492 _rcu_barrier_trace(rsp, TPS("IRQ"), -1, rsp->barrier_sequence);
3495 rdp->barrier_head.func = rcu_barrier_callback; 3493 rdp->barrier_head.func = rcu_barrier_callback;
3496 debug_rcu_head_queue(&rdp->barrier_head); 3494 debug_rcu_head_queue(&rdp->barrier_head);
3497 if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) { 3495 if (rcu_segcblist_entrain(&rdp->cblist, &rdp->barrier_head, 0)) {
3498 atomic_inc(&rsp->barrier_cpu_count); 3496 atomic_inc(&rsp->barrier_cpu_count);
3499 } else { 3497 } else {
3500 debug_rcu_head_unqueue(&rdp->barrier_head); 3498 debug_rcu_head_unqueue(&rdp->barrier_head);
3501 _rcu_barrier_trace(rsp, "IRQNQ", -1, rsp->barrier_sequence); 3499 _rcu_barrier_trace(rsp, TPS("IRQNQ"), -1,
3500 rsp->barrier_sequence);
3502 } 3501 }
3503} 3502}
3504 3503
@@ -3512,14 +3511,15 @@ static void _rcu_barrier(struct rcu_state *rsp)
3512 struct rcu_data *rdp; 3511 struct rcu_data *rdp;
3513 unsigned long s = rcu_seq_snap(&rsp->barrier_sequence); 3512 unsigned long s = rcu_seq_snap(&rsp->barrier_sequence);
3514 3513
3515 _rcu_barrier_trace(rsp, "Begin", -1, s); 3514 _rcu_barrier_trace(rsp, TPS("Begin"), -1, s);
3516 3515
3517 /* Take mutex to serialize concurrent rcu_barrier() requests. */ 3516 /* Take mutex to serialize concurrent rcu_barrier() requests. */
3518 mutex_lock(&rsp->barrier_mutex); 3517 mutex_lock(&rsp->barrier_mutex);
3519 3518
3520 /* Did someone else do our work for us? */ 3519 /* Did someone else do our work for us? */
3521 if (rcu_seq_done(&rsp->barrier_sequence, s)) { 3520 if (rcu_seq_done(&rsp->barrier_sequence, s)) {
3522 _rcu_barrier_trace(rsp, "EarlyExit", -1, rsp->barrier_sequence); 3521 _rcu_barrier_trace(rsp, TPS("EarlyExit"), -1,
3522 rsp->barrier_sequence);
3523 smp_mb(); /* caller's subsequent code after above check. */ 3523 smp_mb(); /* caller's subsequent code after above check. */
3524 mutex_unlock(&rsp->barrier_mutex); 3524 mutex_unlock(&rsp->barrier_mutex);
3525 return; 3525 return;
@@ -3527,7 +3527,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
3527 3527
3528 /* Mark the start of the barrier operation. */ 3528 /* Mark the start of the barrier operation. */
3529 rcu_seq_start(&rsp->barrier_sequence); 3529 rcu_seq_start(&rsp->barrier_sequence);
3530 _rcu_barrier_trace(rsp, "Inc1", -1, rsp->barrier_sequence); 3530 _rcu_barrier_trace(rsp, TPS("Inc1"), -1, rsp->barrier_sequence);
3531 3531
3532 /* 3532 /*
3533 * Initialize the count to one rather than to zero in order to 3533 * Initialize the count to one rather than to zero in order to
@@ -3550,10 +3550,10 @@ static void _rcu_barrier(struct rcu_state *rsp)
3550 rdp = per_cpu_ptr(rsp->rda, cpu); 3550 rdp = per_cpu_ptr(rsp->rda, cpu);
3551 if (rcu_is_nocb_cpu(cpu)) { 3551 if (rcu_is_nocb_cpu(cpu)) {
3552 if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) { 3552 if (!rcu_nocb_cpu_needs_barrier(rsp, cpu)) {
3553 _rcu_barrier_trace(rsp, "OfflineNoCB", cpu, 3553 _rcu_barrier_trace(rsp, TPS("OfflineNoCB"), cpu,
3554 rsp->barrier_sequence); 3554 rsp->barrier_sequence);
3555 } else { 3555 } else {
3556 _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, 3556 _rcu_barrier_trace(rsp, TPS("OnlineNoCB"), cpu,
3557 rsp->barrier_sequence); 3557 rsp->barrier_sequence);
3558 smp_mb__before_atomic(); 3558 smp_mb__before_atomic();
3559 atomic_inc(&rsp->barrier_cpu_count); 3559 atomic_inc(&rsp->barrier_cpu_count);
@@ -3561,11 +3561,11 @@ static void _rcu_barrier(struct rcu_state *rsp)
3561 rcu_barrier_callback, rsp, cpu, 0); 3561 rcu_barrier_callback, rsp, cpu, 0);
3562 } 3562 }
3563 } else if (rcu_segcblist_n_cbs(&rdp->cblist)) { 3563 } else if (rcu_segcblist_n_cbs(&rdp->cblist)) {
3564 _rcu_barrier_trace(rsp, "OnlineQ", cpu, 3564 _rcu_barrier_trace(rsp, TPS("OnlineQ"), cpu,
3565 rsp->barrier_sequence); 3565 rsp->barrier_sequence);
3566 smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); 3566 smp_call_function_single(cpu, rcu_barrier_func, rsp, 1);
3567 } else { 3567 } else {
3568 _rcu_barrier_trace(rsp, "OnlineNQ", cpu, 3568 _rcu_barrier_trace(rsp, TPS("OnlineNQ"), cpu,
3569 rsp->barrier_sequence); 3569 rsp->barrier_sequence);
3570 } 3570 }
3571 } 3571 }
@@ -3582,7 +3582,7 @@ static void _rcu_barrier(struct rcu_state *rsp)
3582 wait_for_completion(&rsp->barrier_completion); 3582 wait_for_completion(&rsp->barrier_completion);
3583 3583
3584 /* Mark the end of the barrier operation. */ 3584 /* Mark the end of the barrier operation. */
3585 _rcu_barrier_trace(rsp, "Inc2", -1, rsp->barrier_sequence); 3585 _rcu_barrier_trace(rsp, TPS("Inc2"), -1, rsp->barrier_sequence);
3586 rcu_seq_end(&rsp->barrier_sequence); 3586 rcu_seq_end(&rsp->barrier_sequence);
3587 3587
3588 /* Other rcu_barrier() invocations can now safely proceed. */ 3588 /* Other rcu_barrier() invocations can now safely proceed. */
@@ -3684,8 +3684,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
3684 */ 3684 */
3685 rnp = rdp->mynode; 3685 rnp = rdp->mynode;
3686 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ 3686 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
3687 if (!rdp->beenonline)
3688 WRITE_ONCE(rsp->ncpus, READ_ONCE(rsp->ncpus) + 1);
3689 rdp->beenonline = true; /* We have now been online. */ 3687 rdp->beenonline = true; /* We have now been online. */
3690 rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */ 3688 rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */
3691 rdp->completed = rnp->completed; 3689 rdp->completed = rnp->completed;
@@ -3789,6 +3787,8 @@ void rcu_cpu_starting(unsigned int cpu)
3789{ 3787{
3790 unsigned long flags; 3788 unsigned long flags;
3791 unsigned long mask; 3789 unsigned long mask;
3790 int nbits;
3791 unsigned long oldmask;
3792 struct rcu_data *rdp; 3792 struct rcu_data *rdp;
3793 struct rcu_node *rnp; 3793 struct rcu_node *rnp;
3794 struct rcu_state *rsp; 3794 struct rcu_state *rsp;
@@ -3799,9 +3799,15 @@ void rcu_cpu_starting(unsigned int cpu)
3799 mask = rdp->grpmask; 3799 mask = rdp->grpmask;
3800 raw_spin_lock_irqsave_rcu_node(rnp, flags); 3800 raw_spin_lock_irqsave_rcu_node(rnp, flags);
3801 rnp->qsmaskinitnext |= mask; 3801 rnp->qsmaskinitnext |= mask;
3802 oldmask = rnp->expmaskinitnext;
3802 rnp->expmaskinitnext |= mask; 3803 rnp->expmaskinitnext |= mask;
3804 oldmask ^= rnp->expmaskinitnext;
3805 nbits = bitmap_weight(&oldmask, BITS_PER_LONG);
3806 /* Allow lockless access for expedited grace periods. */
3807 smp_store_release(&rsp->ncpus, rsp->ncpus + nbits); /* ^^^ */
3803 raw_spin_unlock_irqrestore_rcu_node(rnp, flags); 3808 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
3804 } 3809 }
3810 smp_mb(); /* Ensure RCU read-side usage follows above initialization. */
3805} 3811}
3806 3812
3807#ifdef CONFIG_HOTPLUG_CPU 3813#ifdef CONFIG_HOTPLUG_CPU
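
The smp_store_release() of ->ncpus above is meant to pair with an acquire load on the expedited-grace-period side; the two-line kernel/rcu/tree_exp.h change in the diffstat is presumably that reader, roughly (sketch, not shown in this view):

        /* Expedited-GP side: snapshot the CPU count without holding the lock. */
        ncpus = smp_load_acquire(&rsp->ncpus);  /* Pairs with smp_store_release() in rcu_cpu_starting(). */

so that whenever the expedited code observes the incremented count, the ->expmaskinitnext bit set under the rcu_node lock above is guaranteed to be visible as well.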
@@ -3845,96 +3851,30 @@ void rcu_report_dead(unsigned int cpu)
3845 rcu_cleanup_dying_idle_cpu(cpu, rsp); 3851 rcu_cleanup_dying_idle_cpu(cpu, rsp);
3846} 3852}
3847 3853
3848/* 3854/* Migrate the dead CPU's callbacks to the current CPU. */
3849 * Send the specified CPU's RCU callbacks to the orphanage. The
3850 * specified CPU must be offline, and the caller must hold the
3851 * ->orphan_lock.
3852 */
3853static void
3854rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
3855 struct rcu_node *rnp, struct rcu_data *rdp)
3856{
3857 lockdep_assert_held(&rsp->orphan_lock);
3858
3859 /* No-CBs CPUs do not have orphanable callbacks. */
3860 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) || rcu_is_nocb_cpu(rdp->cpu))
3861 return;
3862
3863 /*
3864 * Orphan the callbacks. First adjust the counts. This is safe
3865 * because _rcu_barrier() excludes CPU-hotplug operations, so it
3866 * cannot be running now. Thus no memory barrier is required.
3867 */
3868 rdp->n_cbs_orphaned += rcu_segcblist_n_cbs(&rdp->cblist);
3869 rcu_segcblist_extract_count(&rdp->cblist, &rsp->orphan_done);
3870
3871 /*
3872 * Next, move those callbacks still needing a grace period to
3873 * the orphanage, where some other CPU will pick them up.
3874 * Some of the callbacks might have gone partway through a grace
3875 * period, but that is too bad. They get to start over because we
3876 * cannot assume that grace periods are synchronized across CPUs.
3877 */
3878 rcu_segcblist_extract_pend_cbs(&rdp->cblist, &rsp->orphan_pend);
3879
3880 /*
3881 * Then move the ready-to-invoke callbacks to the orphanage,
3882 * where some other CPU will pick them up. These will not be
3883 * required to pass though another grace period: They are done.
3884 */
3885 rcu_segcblist_extract_done_cbs(&rdp->cblist, &rsp->orphan_done);
3886
3887 /* Finally, disallow further callbacks on this CPU. */
3888 rcu_segcblist_disable(&rdp->cblist);
3889}
3890
3891/*
3892 * Adopt the RCU callbacks from the specified rcu_state structure's
3893 * orphanage. The caller must hold the ->orphan_lock.
3894 */
3895static void rcu_adopt_orphan_cbs(struct rcu_state *rsp, unsigned long flags)
3896{
3897 struct rcu_data *rdp = raw_cpu_ptr(rsp->rda);
3898
3899 lockdep_assert_held(&rsp->orphan_lock);
3900
3901 /* No-CBs CPUs are handled specially. */
3902 if (!IS_ENABLED(CONFIG_HOTPLUG_CPU) ||
3903 rcu_nocb_adopt_orphan_cbs(rsp, rdp, flags))
3904 return;
3905
3906 /* Do the accounting first. */
3907 rdp->n_cbs_adopted += rsp->orphan_done.len;
3908 if (rsp->orphan_done.len_lazy != rsp->orphan_done.len)
3909 rcu_idle_count_callbacks_posted();
3910 rcu_segcblist_insert_count(&rdp->cblist, &rsp->orphan_done);
3911
3912 /*
3913 * We do not need a memory barrier here because the only way we
3914 * can get here if there is an rcu_barrier() in flight is if
3915 * we are the task doing the rcu_barrier().
3916 */
3917
3918 /* First adopt the ready-to-invoke callbacks, then the done ones. */
3919 rcu_segcblist_insert_done_cbs(&rdp->cblist, &rsp->orphan_done);
3920 WARN_ON_ONCE(rsp->orphan_done.head);
3921 rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rsp->orphan_pend);
3922 WARN_ON_ONCE(rsp->orphan_pend.head);
3923 WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) !=
3924 !rcu_segcblist_n_cbs(&rdp->cblist));
3925}
3926
3927/* Orphan the dead CPU's callbacks, and then adopt them. */
3928static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp) 3855static void rcu_migrate_callbacks(int cpu, struct rcu_state *rsp)
3929{ 3856{
3930 unsigned long flags; 3857 unsigned long flags;
3858 struct rcu_data *my_rdp;
3931 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); 3859 struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
3932 struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ 3860 struct rcu_node *rnp_root = rcu_get_root(rdp->rsp);
3933 3861
3934 raw_spin_lock_irqsave(&rsp->orphan_lock, flags); 3862 if (rcu_is_nocb_cpu(cpu) || rcu_segcblist_empty(&rdp->cblist))
3935 rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); 3863 return; /* No callbacks to migrate. */
3936 rcu_adopt_orphan_cbs(rsp, flags); 3864
3937 raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags); 3865 local_irq_save(flags);
3866 my_rdp = this_cpu_ptr(rsp->rda);
3867 if (rcu_nocb_adopt_orphan_cbs(my_rdp, rdp, flags)) {
3868 local_irq_restore(flags);
3869 return;
3870 }
3871 raw_spin_lock_rcu_node(rnp_root); /* irqs already disabled. */
3872 rcu_advance_cbs(rsp, rnp_root, rdp); /* Leverage recent GPs. */
3873 rcu_advance_cbs(rsp, rnp_root, my_rdp); /* Assign GP to pending CBs. */
3874 rcu_segcblist_merge(&my_rdp->cblist, &rdp->cblist);
3875 WARN_ON_ONCE(rcu_segcblist_empty(&my_rdp->cblist) !=
3876 !rcu_segcblist_n_cbs(&my_rdp->cblist));
3877 raw_spin_unlock_irqrestore_rcu_node(rnp_root, flags);
3938 WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 || 3878 WARN_ONCE(rcu_segcblist_n_cbs(&rdp->cblist) != 0 ||
3939 !rcu_segcblist_empty(&rdp->cblist), 3879 !rcu_segcblist_empty(&rdp->cblist),
3940 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n", 3880 "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, 1stCB=%p\n",
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 9af0f31d6847..8e1f285f0a70 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -219,8 +219,6 @@ struct rcu_data {
219 /* qlen at last check for QS forcing */ 219 /* qlen at last check for QS forcing */
220 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ 220 unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */
221 unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */ 221 unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */
222 unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */
223 unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */
224 unsigned long n_force_qs_snap; 222 unsigned long n_force_qs_snap;
225 /* did other CPU force QS recently? */ 223 /* did other CPU force QS recently? */
226 long blimit; /* Upper limit on a processed batch */ 224 long blimit; /* Upper limit on a processed batch */
@@ -268,7 +266,9 @@ struct rcu_data {
268 struct rcu_head **nocb_follower_tail; 266 struct rcu_head **nocb_follower_tail;
269 struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */ 267 struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */
270 struct task_struct *nocb_kthread; 268 struct task_struct *nocb_kthread;
269 raw_spinlock_t nocb_lock; /* Guard following pair of fields. */
271 int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ 270 int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */
271 struct timer_list nocb_timer; /* Enforce finite deferral. */
272 272
273 /* The following fields are used by the leader, hence own cacheline. */ 273 /* The following fields are used by the leader, hence own cacheline. */
274 struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp; 274 struct rcu_head *nocb_gp_head ____cacheline_internodealigned_in_smp;
@@ -350,15 +350,6 @@ struct rcu_state {
350 350
351 /* End of fields guarded by root rcu_node's lock. */ 351 /* End of fields guarded by root rcu_node's lock. */
352 352
353 raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp;
354 /* Protect following fields. */
355 struct rcu_cblist orphan_pend; /* Orphaned callbacks that */
356 /* need a grace period. */
357 struct rcu_cblist orphan_done; /* Orphaned callbacks that */
358 /* are ready to invoke. */
359 /* (Contains counts.) */
360 /* End of fields guarded by orphan_lock. */
361
362 struct mutex barrier_mutex; /* Guards barrier fields. */ 353 struct mutex barrier_mutex; /* Guards barrier fields. */
363 atomic_t barrier_cpu_count; /* # CPUs waiting on. */ 354 atomic_t barrier_cpu_count; /* # CPUs waiting on. */
364 struct completion barrier_completion; /* Wake at barrier end. */ 355 struct completion barrier_completion; /* Wake at barrier end. */
@@ -495,7 +486,7 @@ static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq);
495static void rcu_init_one_nocb(struct rcu_node *rnp); 486static void rcu_init_one_nocb(struct rcu_node *rnp);
496static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, 487static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
497 bool lazy, unsigned long flags); 488 bool lazy, unsigned long flags);
498static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, 489static bool rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,
499 struct rcu_data *rdp, 490 struct rcu_data *rdp,
500 unsigned long flags); 491 unsigned long flags);
501static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp); 492static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp);
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index dd21ca47e4b4..46d61b597731 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -73,7 +73,7 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp)
73 unsigned long flags; 73 unsigned long flags;
74 unsigned long mask; 74 unsigned long mask;
75 unsigned long oldmask; 75 unsigned long oldmask;
76 int ncpus = READ_ONCE(rsp->ncpus); 76 int ncpus = smp_load_acquire(&rsp->ncpus); /* Order against locking. */
77 struct rcu_node *rnp; 77 struct rcu_node *rnp;
78 struct rcu_node *rnp_up; 78 struct rcu_node *rnp_up;
79 79
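
The smp_load_acquire() above pairs with the smp_store_release() added to rcu_cpu_starting(): a reader that observes the incremented ->ncpus is also guaranteed to observe the rcu_node initialization performed before the store. In C11 userspace terms the pairing looks roughly like this (a sketch with invented names, not kernel code):

#include <stdatomic.h>
#include <stdio.h>

static int cpu_data[64];                 /* initialized before being published */
static atomic_int ncpus;                 /* plays the role of rsp->ncpus */

/* Writer: initialize, then publish the new count with release semantics. */
static void bring_cpu_online(int cpu)
{
        cpu_data[cpu] = 1;
        atomic_store_explicit(&ncpus, cpu + 1, memory_order_release);
}

/* Reader: an acquire load of the count orders all later data reads after it. */
static void scan_cpus(void)
{
        int n = atomic_load_explicit(&ncpus, memory_order_acquire);

        for (int i = 0; i < n; i++)
                printf("cpu %d init flag: %d\n", i, cpu_data[i]);
}

int main(void)
{
        bring_cpu_online(0);
        scan_cpus();
        return 0;
}
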
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 908b309d60d7..55bde94b9572 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -180,6 +180,8 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
180 struct task_struct *t = current; 180 struct task_struct *t = current;
181 181
182 lockdep_assert_held(&rnp->lock); 182 lockdep_assert_held(&rnp->lock);
183 WARN_ON_ONCE(rdp->mynode != rnp);
184 WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1);
183 185
184 /* 186 /*
185 * Decide where to queue the newly blocked task. In theory, 187 * Decide where to queue the newly blocked task. In theory,
@@ -261,6 +263,10 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp)
261 rnp->gp_tasks = &t->rcu_node_entry; 263 rnp->gp_tasks = &t->rcu_node_entry;
262 if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) 264 if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD))
263 rnp->exp_tasks = &t->rcu_node_entry; 265 rnp->exp_tasks = &t->rcu_node_entry;
266 WARN_ON_ONCE(!(blkd_state & RCU_GP_BLKD) !=
267 !(rnp->qsmask & rdp->grpmask));
268 WARN_ON_ONCE(!(blkd_state & RCU_EXP_BLKD) !=
269 !(rnp->expmask & rdp->grpmask));
264 raw_spin_unlock_rcu_node(rnp); /* interrupts remain disabled. */ 270 raw_spin_unlock_rcu_node(rnp); /* interrupts remain disabled. */
265 271
266 /* 272 /*
@@ -482,6 +488,7 @@ void rcu_read_unlock_special(struct task_struct *t)
482 rnp = t->rcu_blocked_node; 488 rnp = t->rcu_blocked_node;
483 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ 489 raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
484 WARN_ON_ONCE(rnp != t->rcu_blocked_node); 490 WARN_ON_ONCE(rnp != t->rcu_blocked_node);
491 WARN_ON_ONCE(rnp->level != rcu_num_lvls - 1);
485 empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); 492 empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
486 empty_exp = sync_rcu_preempt_exp_done(rnp); 493 empty_exp = sync_rcu_preempt_exp_done(rnp);
487 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ 494 smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
@@ -495,10 +502,10 @@ void rcu_read_unlock_special(struct task_struct *t)
495 if (&t->rcu_node_entry == rnp->exp_tasks) 502 if (&t->rcu_node_entry == rnp->exp_tasks)
496 rnp->exp_tasks = np; 503 rnp->exp_tasks = np;
497 if (IS_ENABLED(CONFIG_RCU_BOOST)) { 504 if (IS_ENABLED(CONFIG_RCU_BOOST)) {
498 if (&t->rcu_node_entry == rnp->boost_tasks)
499 rnp->boost_tasks = np;
500 /* Snapshot ->boost_mtx ownership w/rnp->lock held. */ 505 /* Snapshot ->boost_mtx ownership w/rnp->lock held. */
501 drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t; 506 drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
507 if (&t->rcu_node_entry == rnp->boost_tasks)
508 rnp->boost_tasks = np;
502 } 509 }
503 510
504 /* 511 /*
@@ -636,10 +643,17 @@ static int rcu_print_task_exp_stall(struct rcu_node *rnp)
636 */ 643 */
637static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) 644static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
638{ 645{
646 struct task_struct *t;
647
639 RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n"); 648 RCU_LOCKDEP_WARN(preemptible(), "rcu_preempt_check_blocked_tasks() invoked with preemption enabled!!!\n");
640 WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); 649 WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
641 if (rcu_preempt_has_tasks(rnp)) 650 if (rcu_preempt_has_tasks(rnp)) {
642 rnp->gp_tasks = rnp->blkd_tasks.next; 651 rnp->gp_tasks = rnp->blkd_tasks.next;
652 t = container_of(rnp->gp_tasks, struct task_struct,
653 rcu_node_entry);
654 trace_rcu_unlock_preempted_task(TPS("rcu_preempt-GPS"),
655 rnp->gpnum, t->pid);
656 }
643 WARN_ON_ONCE(rnp->qsmask); 657 WARN_ON_ONCE(rnp->qsmask);
644} 658}
645 659
@@ -1788,23 +1802,62 @@ bool rcu_is_nocb_cpu(int cpu)
1788} 1802}
1789 1803
1790/* 1804/*
1791 * Kick the leader kthread for this NOCB group. 1805 * Kick the leader kthread for this NOCB group. Caller holds ->nocb_lock
1806 * and this function releases it.
1792 */ 1807 */
1793static void wake_nocb_leader(struct rcu_data *rdp, bool force) 1808static void __wake_nocb_leader(struct rcu_data *rdp, bool force,
1809 unsigned long flags)
1810 __releases(rdp->nocb_lock)
1794{ 1811{
1795 struct rcu_data *rdp_leader = rdp->nocb_leader; 1812 struct rcu_data *rdp_leader = rdp->nocb_leader;
1796 1813
1797 if (!READ_ONCE(rdp_leader->nocb_kthread)) 1814 lockdep_assert_held(&rdp->nocb_lock);
1815 if (!READ_ONCE(rdp_leader->nocb_kthread)) {
1816 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
1798 return; 1817 return;
1799 if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) { 1818 }
1819 if (rdp_leader->nocb_leader_sleep || force) {
1800 /* Prior smp_mb__after_atomic() orders against prior enqueue. */ 1820 /* Prior smp_mb__after_atomic() orders against prior enqueue. */
1801 WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); 1821 WRITE_ONCE(rdp_leader->nocb_leader_sleep, false);
1822 del_timer(&rdp->nocb_timer);
1823 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
1802 smp_mb(); /* ->nocb_leader_sleep before swake_up(). */ 1824 smp_mb(); /* ->nocb_leader_sleep before swake_up(). */
1803 swake_up(&rdp_leader->nocb_wq); 1825 swake_up(&rdp_leader->nocb_wq);
1826 } else {
1827 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
1804 } 1828 }
1805} 1829}
1806 1830
1807/* 1831/*
1832 * Kick the leader kthread for this NOCB group, but caller has not
1833 * acquired locks.
1834 */
1835static void wake_nocb_leader(struct rcu_data *rdp, bool force)
1836{
1837 unsigned long flags;
1838
1839 raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
1840 __wake_nocb_leader(rdp, force, flags);
1841}
1842
1843/*
1844 * Arrange to wake the leader kthread for this NOCB group at some
1845 * future time when it is safe to do so.
1846 */
1847static void wake_nocb_leader_defer(struct rcu_data *rdp, int waketype,
1848 const char *reason)
1849{
1850 unsigned long flags;
1851
1852 raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
1853 if (rdp->nocb_defer_wakeup == RCU_NOCB_WAKE_NOT)
1854 mod_timer(&rdp->nocb_timer, jiffies + 1);
1855 WRITE_ONCE(rdp->nocb_defer_wakeup, waketype);
1856 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, reason);
1857 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
1858}
1859
1860/*
1808 * Does the specified CPU need an RCU callback for the specified flavor 1861 * Does the specified CPU need an RCU callback for the specified flavor
1809 * of rcu_barrier()? 1862 * of rcu_barrier()?
1810 */ 1863 */
@@ -1891,11 +1944,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
1891 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 1944 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
1892 TPS("WakeEmpty")); 1945 TPS("WakeEmpty"));
1893 } else { 1946 } else {
1894 WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE); 1947 wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE,
1895 /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ 1948 TPS("WakeEmptyIsDeferred"));
1896 smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
1897 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
1898 TPS("WakeEmptyIsDeferred"));
1899 } 1949 }
1900 rdp->qlen_last_fqs_check = 0; 1950 rdp->qlen_last_fqs_check = 0;
1901 } else if (len > rdp->qlen_last_fqs_check + qhimark) { 1951 } else if (len > rdp->qlen_last_fqs_check + qhimark) {
@@ -1905,11 +1955,8 @@ static void __call_rcu_nocb_enqueue(struct rcu_data *rdp,
1905 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 1955 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
1906 TPS("WakeOvf")); 1956 TPS("WakeOvf"));
1907 } else { 1957 } else {
1908 WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_FORCE); 1958 wake_nocb_leader_defer(rdp, RCU_NOCB_WAKE,
1909 /* Store ->nocb_defer_wakeup before ->rcu_urgent_qs. */ 1959 TPS("WakeOvfIsDeferred"));
1910 smp_store_release(this_cpu_ptr(&rcu_dynticks.rcu_urgent_qs), true);
1911 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
1912 TPS("WakeOvfIsDeferred"));
1913 } 1960 }
1914 rdp->qlen_last_fqs_check = LONG_MAX / 2; 1961 rdp->qlen_last_fqs_check = LONG_MAX / 2;
1915 } else { 1962 } else {
@@ -1961,30 +2008,19 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
1961 * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is 2008 * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is
1962 * not a no-CBs CPU. 2009 * not a no-CBs CPU.
1963 */ 2010 */
1964static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, 2011static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,
1965 struct rcu_data *rdp, 2012 struct rcu_data *rdp,
1966 unsigned long flags) 2013 unsigned long flags)
1967{ 2014{
1968 long ql = rsp->orphan_done.len; 2015 RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_nocb_adopt_orphan_cbs() invoked with irqs enabled!!!");
1969 long qll = rsp->orphan_done.len_lazy;
1970
1971 /* If this is not a no-CBs CPU, tell the caller to do it the old way. */
1972 if (!rcu_is_nocb_cpu(smp_processor_id())) 2016 if (!rcu_is_nocb_cpu(smp_processor_id()))
1973 return false; 2017 return false; /* Not NOCBs CPU, caller must migrate CBs. */
1974 2018 __call_rcu_nocb_enqueue(my_rdp, rcu_segcblist_head(&rdp->cblist),
1975 /* First, enqueue the donelist, if any. This preserves CB ordering. */ 2019 rcu_segcblist_tail(&rdp->cblist),
1976 if (rsp->orphan_done.head) { 2020 rcu_segcblist_n_cbs(&rdp->cblist),
1977 __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_done), 2021 rcu_segcblist_n_lazy_cbs(&rdp->cblist), flags);
1978 rcu_cblist_tail(&rsp->orphan_done), 2022 rcu_segcblist_init(&rdp->cblist);
1979 ql, qll, flags); 2023 rcu_segcblist_disable(&rdp->cblist);
1980 }
1981 if (rsp->orphan_pend.head) {
1982 __call_rcu_nocb_enqueue(rdp, rcu_cblist_head(&rsp->orphan_pend),
1983 rcu_cblist_tail(&rsp->orphan_pend),
1984 ql, qll, flags);
1985 }
1986 rcu_cblist_init(&rsp->orphan_done);
1987 rcu_cblist_init(&rsp->orphan_pend);
1988 return true; 2024 return true;
1989} 2025}
1990 2026
@@ -2031,6 +2067,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp)
2031static void nocb_leader_wait(struct rcu_data *my_rdp) 2067static void nocb_leader_wait(struct rcu_data *my_rdp)
2032{ 2068{
2033 bool firsttime = true; 2069 bool firsttime = true;
2070 unsigned long flags;
2034 bool gotcbs; 2071 bool gotcbs;
2035 struct rcu_data *rdp; 2072 struct rcu_data *rdp;
2036 struct rcu_head **tail; 2073 struct rcu_head **tail;
@@ -2039,13 +2076,17 @@ wait_again:
2039 2076
2040 /* Wait for callbacks to appear. */ 2077 /* Wait for callbacks to appear. */
2041 if (!rcu_nocb_poll) { 2078 if (!rcu_nocb_poll) {
2042 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); 2079 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Sleep"));
2043 swait_event_interruptible(my_rdp->nocb_wq, 2080 swait_event_interruptible(my_rdp->nocb_wq,
2044 !READ_ONCE(my_rdp->nocb_leader_sleep)); 2081 !READ_ONCE(my_rdp->nocb_leader_sleep));
2045 /* Memory barrier handled by smp_mb() calls below and repoll. */ 2082 raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags);
2083 my_rdp->nocb_leader_sleep = true;
2084 WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
2085 del_timer(&my_rdp->nocb_timer);
2086 raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags);
2046 } else if (firsttime) { 2087 } else if (firsttime) {
2047 firsttime = false; /* Don't drown trace log with "Poll"! */ 2088 firsttime = false; /* Don't drown trace log with "Poll"! */
2048 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Poll"); 2089 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, TPS("Poll"));
2049 } 2090 }
2050 2091
2051 /* 2092 /*
@@ -2054,7 +2095,7 @@ wait_again:
2054 * nocb_gp_head, where they await a grace period. 2095 * nocb_gp_head, where they await a grace period.
2055 */ 2096 */
2056 gotcbs = false; 2097 gotcbs = false;
2057 smp_mb(); /* wakeup before ->nocb_head reads. */ 2098 smp_mb(); /* wakeup and _sleep before ->nocb_head reads. */
2058 for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { 2099 for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
2059 rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head); 2100 rdp->nocb_gp_head = READ_ONCE(rdp->nocb_head);
2060 if (!rdp->nocb_gp_head) 2101 if (!rdp->nocb_gp_head)
@@ -2066,56 +2107,41 @@ wait_again:
2066 gotcbs = true; 2107 gotcbs = true;
2067 } 2108 }
2068 2109
2069 /* 2110 /* No callbacks? Sleep a bit if polling, and go retry. */
2070 * If there were no callbacks, sleep a bit, rescan after a
2071 * memory barrier, and go retry.
2072 */
2073 if (unlikely(!gotcbs)) { 2111 if (unlikely(!gotcbs)) {
2074 if (!rcu_nocb_poll)
2075 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu,
2076 "WokeEmpty");
2077 WARN_ON(signal_pending(current)); 2112 WARN_ON(signal_pending(current));
2078 schedule_timeout_interruptible(1); 2113 if (rcu_nocb_poll) {
2079 2114 schedule_timeout_interruptible(1);
2080 /* Rescan in case we were a victim of memory ordering. */ 2115 } else {
2081 my_rdp->nocb_leader_sleep = true; 2116 trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu,
2082 smp_mb(); /* Ensure _sleep true before scan. */ 2117 TPS("WokeEmpty"));
2083 for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) 2118 }
2084 if (READ_ONCE(rdp->nocb_head)) {
2085 /* Found CB, so short-circuit next wait. */
2086 my_rdp->nocb_leader_sleep = false;
2087 break;
2088 }
2089 goto wait_again; 2119 goto wait_again;
2090 } 2120 }
2091 2121
2092 /* Wait for one grace period. */ 2122 /* Wait for one grace period. */
2093 rcu_nocb_wait_gp(my_rdp); 2123 rcu_nocb_wait_gp(my_rdp);
2094 2124
2095 /*
2096 * We left ->nocb_leader_sleep unset to reduce cache thrashing.
2097 * We set it now, but recheck for new callbacks while
2098 * traversing our follower list.
2099 */
2100 my_rdp->nocb_leader_sleep = true;
2101 smp_mb(); /* Ensure _sleep true before scan of ->nocb_head. */
2102
2103 /* Each pass through the following loop wakes a follower, if needed. */ 2125 /* Each pass through the following loop wakes a follower, if needed. */
2104 for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) { 2126 for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_follower) {
2105 if (READ_ONCE(rdp->nocb_head)) 2127 if (!rcu_nocb_poll &&
2128 READ_ONCE(rdp->nocb_head) &&
2129 READ_ONCE(my_rdp->nocb_leader_sleep)) {
2130 raw_spin_lock_irqsave(&my_rdp->nocb_lock, flags);
2106 my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/ 2131 my_rdp->nocb_leader_sleep = false;/* No need to sleep.*/
2132 raw_spin_unlock_irqrestore(&my_rdp->nocb_lock, flags);
2133 }
2107 if (!rdp->nocb_gp_head) 2134 if (!rdp->nocb_gp_head)
2108 continue; /* No CBs, so no need to wake follower. */ 2135 continue; /* No CBs, so no need to wake follower. */
2109 2136
2110 /* Append callbacks to follower's "done" list. */ 2137 /* Append callbacks to follower's "done" list. */
2111 tail = xchg(&rdp->nocb_follower_tail, rdp->nocb_gp_tail); 2138 raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
2139 tail = rdp->nocb_follower_tail;
2140 rdp->nocb_follower_tail = rdp->nocb_gp_tail;
2112 *tail = rdp->nocb_gp_head; 2141 *tail = rdp->nocb_gp_head;
2113 smp_mb__after_atomic(); /* Store *tail before wakeup. */ 2142 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
2114 if (rdp != my_rdp && tail == &rdp->nocb_follower_head) { 2143 if (rdp != my_rdp && tail == &rdp->nocb_follower_head) {
2115 /* 2144 /* List was empty, so wake up the follower. */
2116 * List was empty, wake up the follower.
2117 * Memory barriers supplied by atomic_long_add().
2118 */
2119 swake_up(&rdp->nocb_wq); 2145 swake_up(&rdp->nocb_wq);
2120 } 2146 }
2121 } 2147 }
@@ -2131,28 +2157,16 @@ wait_again:
2131 */ 2157 */
2132static void nocb_follower_wait(struct rcu_data *rdp) 2158static void nocb_follower_wait(struct rcu_data *rdp)
2133{ 2159{
2134 bool firsttime = true;
2135
2136 for (;;) { 2160 for (;;) {
2137 if (!rcu_nocb_poll) { 2161 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("FollowerSleep"));
2138 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, 2162 swait_event_interruptible(rdp->nocb_wq,
2139 "FollowerSleep"); 2163 READ_ONCE(rdp->nocb_follower_head));
2140 swait_event_interruptible(rdp->nocb_wq,
2141 READ_ONCE(rdp->nocb_follower_head));
2142 } else if (firsttime) {
2143 /* Don't drown trace log with "Poll"! */
2144 firsttime = false;
2145 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "Poll");
2146 }
2147 if (smp_load_acquire(&rdp->nocb_follower_head)) { 2164 if (smp_load_acquire(&rdp->nocb_follower_head)) {
2148 /* ^^^ Ensure CB invocation follows _head test. */ 2165 /* ^^^ Ensure CB invocation follows _head test. */
2149 return; 2166 return;
2150 } 2167 }
2151 if (!rcu_nocb_poll)
2152 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu,
2153 "WokeEmpty");
2154 WARN_ON(signal_pending(current)); 2168 WARN_ON(signal_pending(current));
2155 schedule_timeout_interruptible(1); 2169 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WokeEmpty"));
2156 } 2170 }
2157} 2171}
2158 2172
@@ -2165,6 +2179,7 @@ static void nocb_follower_wait(struct rcu_data *rdp)
2165static int rcu_nocb_kthread(void *arg) 2179static int rcu_nocb_kthread(void *arg)
2166{ 2180{
2167 int c, cl; 2181 int c, cl;
2182 unsigned long flags;
2168 struct rcu_head *list; 2183 struct rcu_head *list;
2169 struct rcu_head *next; 2184 struct rcu_head *next;
2170 struct rcu_head **tail; 2185 struct rcu_head **tail;
@@ -2179,11 +2194,14 @@ static int rcu_nocb_kthread(void *arg)
2179 nocb_follower_wait(rdp); 2194 nocb_follower_wait(rdp);
2180 2195
2181 /* Pull the ready-to-invoke callbacks onto local list. */ 2196 /* Pull the ready-to-invoke callbacks onto local list. */
2182 list = READ_ONCE(rdp->nocb_follower_head); 2197 raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
2198 list = rdp->nocb_follower_head;
2199 rdp->nocb_follower_head = NULL;
2200 tail = rdp->nocb_follower_tail;
2201 rdp->nocb_follower_tail = &rdp->nocb_follower_head;
2202 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
2183 BUG_ON(!list); 2203 BUG_ON(!list);
2184 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "WokeNonEmpty"); 2204 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WokeNonEmpty"));
2185 WRITE_ONCE(rdp->nocb_follower_head, NULL);
2186 tail = xchg(&rdp->nocb_follower_tail, &rdp->nocb_follower_head);
2187 2205
2188 /* Each pass through the following loop invokes a callback. */ 2206 /* Each pass through the following loop invokes a callback. */
2189 trace_rcu_batch_start(rdp->rsp->name, 2207 trace_rcu_batch_start(rdp->rsp->name,
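
The kthread now detaches the whole ->nocb_follower_head list while holding ->nocb_lock instead of using READ_ONCE()/xchg(). A minimal pthread sketch of that detach-under-lock pattern (userspace analogue; all names invented):

#include <pthread.h>
#include <stdio.h>

struct cb { struct cb *next; int id; };

static pthread_mutex_t nocb_lock = PTHREAD_MUTEX_INITIALIZER;
static struct cb *follower_head;
static struct cb **follower_tail = &follower_head;

static void enqueue(struct cb *c)
{
        pthread_mutex_lock(&nocb_lock);
        c->next = NULL;
        *follower_tail = c;
        follower_tail = &c->next;
        pthread_mutex_unlock(&nocb_lock);
}

/* Detach the entire pending list in one short critical section. */
static struct cb *detach_all(void)
{
        struct cb *list;

        pthread_mutex_lock(&nocb_lock);
        list = follower_head;
        follower_head = NULL;
        follower_tail = &follower_head;
        pthread_mutex_unlock(&nocb_lock);
        return list;                     /* callbacks then invoked locklessly */
}

int main(void)
{
        struct cb a = { .id = 1 }, b = { .id = 2 };

        enqueue(&a);
        enqueue(&b);
        for (struct cb *c = detach_all(); c; c = c->next)
                printf("invoking callback %d\n", c->id);
        return 0;
}
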
@@ -2226,18 +2244,39 @@ static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp)
2226} 2244}
2227 2245
2228/* Do a deferred wakeup of rcu_nocb_kthread(). */ 2246/* Do a deferred wakeup of rcu_nocb_kthread(). */
2229static void do_nocb_deferred_wakeup(struct rcu_data *rdp) 2247static void do_nocb_deferred_wakeup_common(struct rcu_data *rdp)
2230{ 2248{
2249 unsigned long flags;
2231 int ndw; 2250 int ndw;
2232 2251
2233 if (!rcu_nocb_need_deferred_wakeup(rdp)) 2252 raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
2253 if (!rcu_nocb_need_deferred_wakeup(rdp)) {
2254 raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
2234 return; 2255 return;
2256 }
2235 ndw = READ_ONCE(rdp->nocb_defer_wakeup); 2257 ndw = READ_ONCE(rdp->nocb_defer_wakeup);
2236 WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); 2258 WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
2237 wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE); 2259 __wake_nocb_leader(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
2238 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake")); 2260 trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("DeferredWake"));
2239} 2261}
2240 2262
2263/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */
2264static void do_nocb_deferred_wakeup_timer(unsigned long x)
2265{
2266 do_nocb_deferred_wakeup_common((struct rcu_data *)x);
2267}
2268
2269/*
2270 * Do a deferred wakeup of rcu_nocb_kthread() from fastpath.
2271 * This means we do an inexact common-case check. Note that if
2272 * we miss, ->nocb_timer will eventually clean things up.
2273 */
2274static void do_nocb_deferred_wakeup(struct rcu_data *rdp)
2275{
2276 if (rcu_nocb_need_deferred_wakeup(rdp))
2277 do_nocb_deferred_wakeup_common(rdp);
2278}
2279
2241void __init rcu_init_nohz(void) 2280void __init rcu_init_nohz(void)
2242{ 2281{
2243 int cpu; 2282 int cpu;
@@ -2287,6 +2326,9 @@ static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
2287 rdp->nocb_tail = &rdp->nocb_head; 2326 rdp->nocb_tail = &rdp->nocb_head;
2288 init_swait_queue_head(&rdp->nocb_wq); 2327 init_swait_queue_head(&rdp->nocb_wq);
2289 rdp->nocb_follower_tail = &rdp->nocb_follower_head; 2328 rdp->nocb_follower_tail = &rdp->nocb_follower_head;
2329 raw_spin_lock_init(&rdp->nocb_lock);
2330 setup_timer(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer,
2331 (unsigned long)rdp);
2290} 2332}
2291 2333
2292/* 2334/*
@@ -2459,7 +2501,7 @@ static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp,
2459 return false; 2501 return false;
2460} 2502}
2461 2503
2462static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, 2504static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_data *my_rdp,
2463 struct rcu_data *rdp, 2505 struct rcu_data *rdp,
2464 unsigned long flags) 2506 unsigned long flags)
2465{ 2507{
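
The new ->nocb_timer gives deferred leader wakeups a backstop: the fast path records the deferral under ->nocb_lock, and if nothing else issues the wakeup, the one-jiffy timer does. A rough userspace analogue using a mutex, a condition variable and a short-sleeping thread in place of the kernel timer (a sketch of the idea only; all names are invented):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t nocb_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t leader_wq = PTHREAD_COND_INITIALIZER;
static bool deferred_wake;               /* roughly ->nocb_defer_wakeup */

/* Fast path could not wake the leader; record that and rely on the backstop. */
static void wake_leader_defer(void)
{
        pthread_mutex_lock(&nocb_lock);
        deferred_wake = true;
        pthread_mutex_unlock(&nocb_lock);
}

/* Backstop "timer": if a wakeup is still pending, deliver it now. */
static void *backstop_timer(void *arg)
{
        (void)arg;
        usleep(1000);                    /* roughly mod_timer(..., jiffies + 1) */
        pthread_mutex_lock(&nocb_lock);
        if (deferred_wake) {
                deferred_wake = false;
                pthread_cond_signal(&leader_wq);        /* roughly swake_up() */
        }
        pthread_mutex_unlock(&nocb_lock);
        return NULL;
}

int main(void)
{
        pthread_t t;

        wake_leader_defer();
        pthread_create(&t, NULL, backstop_timer, NULL);

        pthread_mutex_lock(&nocb_lock);          /* the "leader" sleeps here */
        while (deferred_wake)
                pthread_cond_wait(&leader_wq, &nocb_lock);
        pthread_mutex_unlock(&nocb_lock);

        pthread_join(t, NULL);
        printf("deferred wakeup delivered by the backstop timer\n");
        return 0;
}
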
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 00e77c470017..5033b66d2753 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -568,7 +568,7 @@ static DECLARE_WAIT_QUEUE_HEAD(rcu_tasks_cbs_wq);
568static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock); 568static DEFINE_RAW_SPINLOCK(rcu_tasks_cbs_lock);
569 569
570/* Track exiting tasks in order to allow them to be waited for. */ 570/* Track exiting tasks in order to allow them to be waited for. */
571DEFINE_SRCU(tasks_rcu_exit_srcu); 571DEFINE_STATIC_SRCU(tasks_rcu_exit_srcu);
572 572
573/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */ 573/* Control stall timeouts. Disable with <= 0, otherwise jiffies till stall. */
574#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10) 574#define RCU_TASK_STALL_TIMEOUT (HZ * 60 * 10)
@@ -875,6 +875,22 @@ static void rcu_spawn_tasks_kthread(void)
875 mutex_unlock(&rcu_tasks_kthread_mutex); 875 mutex_unlock(&rcu_tasks_kthread_mutex);
876} 876}
877 877
878/* Do the srcu_read_lock() for the above synchronize_srcu(). */
879void exit_tasks_rcu_start(void)
880{
881 preempt_disable();
882 current->rcu_tasks_idx = __srcu_read_lock(&tasks_rcu_exit_srcu);
883 preempt_enable();
884}
885
886/* Do the srcu_read_unlock() for the above synchronize_srcu(). */
887void exit_tasks_rcu_finish(void)
888{
889 preempt_disable();
890 __srcu_read_unlock(&tasks_rcu_exit_srcu, current->rcu_tasks_idx);
891 preempt_enable();
892}
893
878#endif /* #ifdef CONFIG_TASKS_RCU */ 894#endif /* #ifdef CONFIG_TASKS_RCU */
879 895
880#ifndef CONFIG_TINY_RCU 896#ifndef CONFIG_TINY_RCU
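
exit_tasks_rcu_start() and exit_tasks_rcu_finish() bracket the late exit path in an SRCU read-side critical section so that synchronize_srcu(&tasks_rcu_exit_srcu) can wait out exiting tasks. Reduced to a single atomic reader count, the bracketing has roughly this shape (real SRCU keeps per-CPU counter pairs and sleeps rather than spins; everything below is an invented userspace sketch):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int exiting_readers;       /* crude stand-in for SRCU state */

/* Called early in the exit path (cf. exit_tasks_rcu_start()). */
static void exit_tasks_start(void)
{
        atomic_fetch_add_explicit(&exiting_readers, 1, memory_order_acq_rel);
}

/* Called late in the exit path (cf. exit_tasks_rcu_finish()). */
static void exit_tasks_finish(void)
{
        atomic_fetch_sub_explicit(&exiting_readers, 1, memory_order_acq_rel);
}

/* The tasks-RCU grace period waits until no task is inside the exit window. */
static void wait_for_exiting_tasks(void)
{
        while (atomic_load_explicit(&exiting_readers, memory_order_acquire))
                ;                        /* real SRCU sleeps, never spins */
}

int main(void)
{
        exit_tasks_start();
        /* ... exit_notify() and friends would run here ... */
        exit_tasks_finish();
        wait_for_exiting_tasks();
        printf("no tasks in the exit window\n");
        return 0;
}
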
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index 53f0164ed362..78f54932ea1d 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -25,3 +25,4 @@ obj-$(CONFIG_SCHED_DEBUG) += debug.o
25obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o 25obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
26obj-$(CONFIG_CPU_FREQ) += cpufreq.o 26obj-$(CONFIG_CPU_FREQ) += cpufreq.o
27obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o 27obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
28obj-$(CONFIG_MEMBARRIER) += membarrier.o
diff --git a/kernel/sched/completion.c b/kernel/sched/completion.c
index 13fc5ae9bf2f..c9524d2d9316 100644
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@@ -300,6 +300,8 @@ EXPORT_SYMBOL(try_wait_for_completion);
300 */ 300 */
301bool completion_done(struct completion *x) 301bool completion_done(struct completion *x)
302{ 302{
303 unsigned long flags;
304
303 if (!READ_ONCE(x->done)) 305 if (!READ_ONCE(x->done))
304 return false; 306 return false;
305 307
@@ -307,14 +309,9 @@ bool completion_done(struct completion *x)
307 * If ->done, we need to wait for complete() to release ->wait.lock 309 * If ->done, we need to wait for complete() to release ->wait.lock
308 * otherwise we can end up freeing the completion before complete() 310 * otherwise we can end up freeing the completion before complete()
309 * is done referencing it. 311 * is done referencing it.
310 *
311 * The RMB pairs with complete()'s RELEASE of ->wait.lock and orders
312 * the loads of ->done and ->wait.lock such that we cannot observe
313 * the lock before complete() acquires it while observing the ->done
314 * after it's acquired the lock.
315 */ 312 */
316 smp_rmb(); 313 spin_lock_irqsave(&x->wait.lock, flags);
317 spin_unlock_wait(&x->wait.lock); 314 spin_unlock_irqrestore(&x->wait.lock, flags);
318 return true; 315 return true;
319} 316}
320EXPORT_SYMBOL(completion_done); 317EXPORT_SYMBOL(completion_done);
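
With spin_unlock_wait() gone, completion_done() takes and immediately drops ->wait.lock, which guarantees that a complete() still holding the lock has finished with the structure before the caller may free it. A pthread sketch of the same handshake (userspace analogue; the kernel reads ->done with READ_ONCE() and uses a spinlock):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

struct completion {
        pthread_mutex_t wait_lock;
        bool done;
};

/* Completer: set ->done while holding the lock (cf. complete()). */
static void complete(struct completion *x)
{
        pthread_mutex_lock(&x->wait_lock);
        x->done = true;
        pthread_mutex_unlock(&x->wait_lock);
}

/*
 * Is the completion done, with no completer still inside it?  Taking and
 * dropping the lock ensures complete() has released it before we return
 * true, so the caller may then safely free the structure.
 */
static bool completion_done(struct completion *x)
{
        if (!x->done)                    /* kernel uses READ_ONCE() here */
                return false;
        pthread_mutex_lock(&x->wait_lock);
        pthread_mutex_unlock(&x->wait_lock);
        return true;
}

int main(void)
{
        struct completion c = { PTHREAD_MUTEX_INITIALIZER, false };

        printf("done? %d\n", completion_done(&c));      /* 0 */
        complete(&c);
        printf("done? %d\n", completion_done(&c));      /* 1 */
        return 0;
}
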
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0869b20fba81..e053c31d96da 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -951,8 +951,13 @@ struct migration_arg {
951static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, 951static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
952 struct task_struct *p, int dest_cpu) 952 struct task_struct *p, int dest_cpu)
953{ 953{
954 if (unlikely(!cpu_active(dest_cpu))) 954 if (p->flags & PF_KTHREAD) {
955 return rq; 955 if (unlikely(!cpu_online(dest_cpu)))
956 return rq;
957 } else {
958 if (unlikely(!cpu_active(dest_cpu)))
959 return rq;
960 }
956 961
957 /* Affinity changed (again). */ 962 /* Affinity changed (again). */
958 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed)) 963 if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
@@ -2635,6 +2640,16 @@ static struct rq *finish_task_switch(struct task_struct *prev)
2635 prev_state = prev->state; 2640 prev_state = prev->state;
2636 vtime_task_switch(prev); 2641 vtime_task_switch(prev);
2637 perf_event_task_sched_in(prev, current); 2642 perf_event_task_sched_in(prev, current);
2643 /*
2644 * The membarrier system call requires a full memory barrier
2645 * after storing to rq->curr, before going back to user-space.
2646 *
2647 * TODO: This smp_mb__after_unlock_lock can go away if PPC end
2648 * up adding a full barrier to switch_mm(), or we should figure
2649 * out if a smp_mb__after_unlock_lock is really the proper API
2650 * to use.
2651 */
2652 smp_mb__after_unlock_lock();
2638 finish_lock_switch(rq, prev); 2653 finish_lock_switch(rq, prev);
2639 finish_arch_post_lock_switch(); 2654 finish_arch_post_lock_switch();
2640 2655
@@ -3324,6 +3339,21 @@ static void __sched notrace __schedule(bool preempt)
3324 if (likely(prev != next)) { 3339 if (likely(prev != next)) {
3325 rq->nr_switches++; 3340 rq->nr_switches++;
3326 rq->curr = next; 3341 rq->curr = next;
3342 /*
3343 * The membarrier system call requires each architecture
3344 * to have a full memory barrier after updating
3345 * rq->curr, before returning to user-space. For TSO
3346 * (e.g. x86), the architecture must provide its own
3347 * barrier in switch_mm(). For weakly ordered machines
3348 * for which spin_unlock() acts as a full memory
3349 * barrier, finish_lock_switch() in common code takes
3350 * care of this barrier. For weakly ordered machines for
3351 * which spin_unlock() acts as a RELEASE barrier (only
3352 * arm64 and PowerPC), arm64 has a full barrier in
3353 * switch_to(), and PowerPC has
3354 * smp_mb__after_unlock_lock() before
3355 * finish_lock_switch().
3356 */
3327 ++*switch_count; 3357 ++*switch_count;
3328 3358
3329 trace_sched_switch(preempt, prev, next); 3359 trace_sched_switch(preempt, prev, next);
@@ -3352,8 +3382,8 @@ void __noreturn do_task_dead(void)
3352 * To avoid it, we have to wait for releasing tsk->pi_lock which 3382 * To avoid it, we have to wait for releasing tsk->pi_lock which
3353 * is held by try_to_wake_up() 3383 * is held by try_to_wake_up()
3354 */ 3384 */
3355 smp_mb(); 3385 raw_spin_lock_irq(&current->pi_lock);
3356 raw_spin_unlock_wait(&current->pi_lock); 3386 raw_spin_unlock_irq(&current->pi_lock);
3357 3387
3358 /* Causes final put_task_struct in finish_task_switch(): */ 3388 /* Causes final put_task_struct in finish_task_switch(): */
3359 __set_current_state(TASK_DEAD); 3389 __set_current_state(TASK_DEAD);
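
The comments added above pair a full barrier after the rq->curr update with the smp_mb() calls in membarrier_private_expedited(): a CPU that is skipped because it appears to be running another mm must nevertheless observe everything the target thread did before the context switch. In C11 terms the two-sided pairing looks roughly like this (a structural sketch with made-up names, not the scheduler's real data structures):

#include <stdatomic.h>
#include <stdio.h>

struct task { int mm_id; };

static struct task tA = { .mm_id = 1 }, tB = { .mm_id = 2 };
static _Atomic(struct task *) rq_curr = &tA;     /* plays the role of rq->curr */

/* Scheduler side: publish the new current task, then a full barrier. */
static void context_switch_to(struct task *next)
{
        atomic_store_explicit(&rq_curr, next, memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);  /* cf. smp_mb__after_unlock_lock() */
        /* ... return to user space ... */
}

/* membarrier side: full barrier, then decide whether this CPU needs an IPI. */
static int cpu_needs_ipi(int my_mm_id)
{
        struct task *p;

        atomic_thread_fence(memory_order_seq_cst);  /* smp_mb() on syscall entry */
        p = atomic_load_explicit(&rq_curr, memory_order_relaxed);
        return p && p->mm_id == my_mm_id;
}

int main(void)
{
        context_switch_to(&tB);
        printf("IPI needed for mm 2? %d\n", cpu_needs_ipi(2));  /* 1 */
        printf("IPI needed for mm 1? %d\n", cpu_needs_ipi(1));  /* 0 */
        return 0;
}
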
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
new file mode 100644
index 000000000000..a92fddc22747
--- /dev/null
+++ b/kernel/sched/membarrier.c
@@ -0,0 +1,152 @@
1/*
2 * Copyright (C) 2010-2017 Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
3 *
4 * membarrier system call
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 */
16
17#include <linux/syscalls.h>
18#include <linux/membarrier.h>
19#include <linux/tick.h>
20#include <linux/cpumask.h>
21
22#include "sched.h" /* for cpu_rq(). */
23
24/*
25 * Bitmask made from a "or" of all commands within enum membarrier_cmd,
26 * except MEMBARRIER_CMD_QUERY.
27 */
28#define MEMBARRIER_CMD_BITMASK \
29 (MEMBARRIER_CMD_SHARED | MEMBARRIER_CMD_PRIVATE_EXPEDITED)
30
31static void ipi_mb(void *info)
32{
33 smp_mb(); /* IPIs should be serializing but paranoid. */
34}
35
36static void membarrier_private_expedited(void)
37{
38 int cpu;
39 bool fallback = false;
40 cpumask_var_t tmpmask;
41
42 if (num_online_cpus() == 1)
43 return;
44
45 /*
46 * Matches memory barriers around rq->curr modification in
47 * scheduler.
48 */
49 smp_mb(); /* system call entry is not a mb. */
50
51 /*
52 * Expedited membarrier commands guarantee that they won't
53 * block, hence the GFP_NOWAIT allocation flag and fallback
54 * implementation.
55 */
56 if (!zalloc_cpumask_var(&tmpmask, GFP_NOWAIT)) {
57 /* Fallback for OOM. */
58 fallback = true;
59 }
60
61 cpus_read_lock();
62 for_each_online_cpu(cpu) {
63 struct task_struct *p;
64
65 /*
65 * Skipping the current CPU is OK even though we can be
67 * migrated at any point. The current CPU, at the point
68 * where we read raw_smp_processor_id(), is ensured to
69 * be in program order with respect to the caller
70 * thread. Therefore, we can skip this CPU from the
71 * iteration.
72 */
73 if (cpu == raw_smp_processor_id())
74 continue;
75 rcu_read_lock();
76 p = task_rcu_dereference(&cpu_rq(cpu)->curr);
77 if (p && p->mm == current->mm) {
78 if (!fallback)
79 __cpumask_set_cpu(cpu, tmpmask);
80 else
81 smp_call_function_single(cpu, ipi_mb, NULL, 1);
82 }
83 rcu_read_unlock();
84 }
85 if (!fallback) {
86 smp_call_function_many(tmpmask, ipi_mb, NULL, 1);
87 free_cpumask_var(tmpmask);
88 }
89 cpus_read_unlock();
90
91 /*
92 * Memory barrier on the caller thread _after_ we finished
93 * waiting for the last IPI. Matches memory barriers around
94 * rq->curr modification in scheduler.
95 */
96 smp_mb(); /* exit from system call is not a mb */
97}
98
99/**
100 * sys_membarrier - issue memory barriers on a set of threads
101 * @cmd: Takes command values defined in enum membarrier_cmd.
102 * @flags: Currently needs to be 0. For future extensions.
103 *
104 * If this system call is not implemented, -ENOSYS is returned. If the
105 * command specified does not exist, is not available on the running
106 * kernel, or if the command argument is invalid, this system call
107 * returns -EINVAL. For a given command, with flags argument set to 0,
108 * this system call is guaranteed to always return the same value until
109 * reboot.
110 *
111 * All memory accesses performed in program order from each targeted thread
112 * are guaranteed to be ordered with respect to sys_membarrier(). If we use
113 * the semantic "barrier()" to represent a compiler barrier forcing memory
114 * accesses to be performed in program order across the barrier, and
115 * smp_mb() to represent explicit memory barriers forcing full memory
116 * ordering across the barrier, we have the following ordering table for
117 * each pair of barrier(), sys_membarrier() and smp_mb():
118 *
119 * The pair ordering is detailed as (O: ordered, X: not ordered):
120 *
121 * barrier() smp_mb() sys_membarrier()
122 * barrier() X X O
123 * smp_mb() X O O
124 * sys_membarrier() O O O
125 */
126SYSCALL_DEFINE2(membarrier, int, cmd, int, flags)
127{
128 if (unlikely(flags))
129 return -EINVAL;
130 switch (cmd) {
131 case MEMBARRIER_CMD_QUERY:
132 {
133 int cmd_mask = MEMBARRIER_CMD_BITMASK;
134
135 if (tick_nohz_full_enabled())
136 cmd_mask &= ~MEMBARRIER_CMD_SHARED;
137 return cmd_mask;
138 }
139 case MEMBARRIER_CMD_SHARED:
140 /* MEMBARRIER_CMD_SHARED is not compatible with nohz_full. */
141 if (tick_nohz_full_enabled())
142 return -EINVAL;
143 if (num_online_cpus() > 1)
144 synchronize_sched();
145 return 0;
146 case MEMBARRIER_CMD_PRIVATE_EXPEDITED:
147 membarrier_private_expedited();
148 return 0;
149 default:
150 return -EINVAL;
151 }
152}
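
User space invokes the new file's system call directly; no glibc wrapper is assumed here. The sketch below queries the supported commands and then issues a private expedited barrier; it assumes the installed uapi <linux/membarrier.h> exports MEMBARRIER_CMD_PRIVATE_EXPEDITED (that header change is outside this kernel/ diffstat):

#include <linux/membarrier.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <stdio.h>

/* Raw system call; membarrier() has historically had no libc wrapper. */
static int membarrier(int cmd, int flags)
{
        return syscall(__NR_membarrier, cmd, flags);
}

int main(void)
{
        int mask = membarrier(MEMBARRIER_CMD_QUERY, 0);

        if (mask < 0) {
                perror("membarrier");
                return 1;
        }
        if (mask & MEMBARRIER_CMD_PRIVATE_EXPEDITED) {
                /* Expedited barrier across this process's running threads. */
                if (membarrier(MEMBARRIER_CMD_PRIVATE_EXPEDITED, 0))
                        perror("MEMBARRIER_CMD_PRIVATE_EXPEDITED");
                else
                        printf("private expedited membarrier issued\n");
        }
        return 0;
}

Note that later kernels additionally require a MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED step before the expedited command; that registration is not part of this patch.
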
diff --git a/kernel/task_work.c b/kernel/task_work.c
index d513051fcca2..836a72a66fba 100644
--- a/kernel/task_work.c
+++ b/kernel/task_work.c
@@ -96,20 +96,16 @@ void task_work_run(void)
96 * work->func() can do task_work_add(), do not set 96 * work->func() can do task_work_add(), do not set
97 * work_exited unless the list is empty. 97 * work_exited unless the list is empty.
98 */ 98 */
99 raw_spin_lock_irq(&task->pi_lock);
99 do { 100 do {
100 work = READ_ONCE(task->task_works); 101 work = READ_ONCE(task->task_works);
101 head = !work && (task->flags & PF_EXITING) ? 102 head = !work && (task->flags & PF_EXITING) ?
102 &work_exited : NULL; 103 &work_exited : NULL;
103 } while (cmpxchg(&task->task_works, work, head) != work); 104 } while (cmpxchg(&task->task_works, work, head) != work);
105 raw_spin_unlock_irq(&task->pi_lock);
104 106
105 if (!work) 107 if (!work)
106 break; 108 break;
107 /*
108 * Synchronize with task_work_cancel(). It can't remove
109 * the first entry == work, cmpxchg(task_works) should
110 * fail, but it can play with *work and other entries.
111 */
112 raw_spin_unlock_wait(&task->pi_lock);
113 109
114 do { 110 do {
115 next = work->next; 111 next = work->next;
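
The cmpxchg() loop above atomically takes ownership of the entire ->task_works list (substituting work_exited when the task is exiting); ->pi_lock is now held across it only to synchronize with task_work_cancel(). The list-claiming part, reduced to C11 atomics, looks roughly like this (a standalone sketch, not the kernel implementation):

#include <stdatomic.h>
#include <stdio.h>

struct work { struct work *next; int id; };

static _Atomic(struct work *) task_works;        /* LIFO list of pending work */

/* Push one item (cf. task_work_add()). */
static void work_add(struct work *w)
{
        struct work *head = atomic_load_explicit(&task_works,
                                                 memory_order_relaxed);
        do {
                w->next = head;
        } while (!atomic_compare_exchange_weak(&task_works, &head, w));
}

/* Claim the whole list atomically (cf. the cmpxchg() loop in task_work_run()). */
static struct work *work_claim_all(void)
{
        struct work *head = atomic_load_explicit(&task_works,
                                                 memory_order_relaxed);
        while (!atomic_compare_exchange_weak(&task_works, &head, NULL))
                ;                        /* head is reloaded on failure */
        return head;
}

int main(void)
{
        struct work a = { .id = 1 }, b = { .id = 2 };

        work_add(&a);
        work_add(&b);
        for (struct work *w = work_claim_all(); w; w = w->next)
                printf("running work %d\n", w->id);      /* 2 then 1 */
        return 0;
}
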
diff --git a/kernel/torture.c b/kernel/torture.c
index 55de96529287..637e172835d8 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -117,7 +117,7 @@ bool torture_offline(int cpu, long *n_offl_attempts, long *n_offl_successes,
117 torture_type, cpu); 117 torture_type, cpu);
118 (*n_offl_successes)++; 118 (*n_offl_successes)++;
119 delta = jiffies - starttime; 119 delta = jiffies - starttime;
120 sum_offl += delta; 120 *sum_offl += delta;
121 if (*min_offl < 0) { 121 if (*min_offl < 0) {
122 *min_offl = delta; 122 *min_offl = delta;
123 *max_offl = delta; 123 *max_offl = delta;