Diffstat (limited to 'kernel/rcu')
-rw-r--r--  kernel/rcu/Makefile          6
-rw-r--r--  kernel/rcu/rcu.h           132
-rw-r--r--  kernel/rcu/srcu.c          651
-rw-r--r--  kernel/rcu/tiny.c          388
-rw-r--r--  kernel/rcu/tiny_plugin.h   174
-rw-r--r--  kernel/rcu/torture.c      2145
-rw-r--r--  kernel/rcu/tree.c         3416
-rw-r--r--  kernel/rcu/tree.h          585
-rw-r--r--  kernel/rcu/tree_plugin.h  2831
-rw-r--r--  kernel/rcu/tree_trace.c    500
-rw-r--r--  kernel/rcu/update.c        347
11 files changed, 11175 insertions, 0 deletions
diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile
new file mode 100644
index 000000000000..01e9ec37a3e3
--- /dev/null
+++ b/kernel/rcu/Makefile
@@ -0,0 +1,6 @@
| 1 | obj-y += update.o srcu.o | ||
| 2 | obj-$(CONFIG_RCU_TORTURE_TEST) += torture.o | ||
| 3 | obj-$(CONFIG_TREE_RCU) += tree.o | ||
| 4 | obj-$(CONFIG_TREE_PREEMPT_RCU) += tree.o | ||
| 5 | obj-$(CONFIG_TREE_RCU_TRACE) += tree_trace.o | ||
| 6 | obj-$(CONFIG_TINY_RCU) += tiny.o | ||
diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h
new file mode 100644
index 000000000000..7859a0a3951e
--- /dev/null
+++ b/kernel/rcu/rcu.h
@@ -0,0 +1,132 @@
| 1 | /* | ||
| 2 | * Read-Copy Update definitions shared among RCU implementations. | ||
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or modify | ||
| 5 | * it under the terms of the GNU General Public License as published by | ||
| 6 | * the Free Software Foundation; either version 2 of the License, or | ||
| 7 | * (at your option) any later version. | ||
| 8 | * | ||
| 9 | * This program is distributed in the hope that it will be useful, | ||
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 12 | * GNU General Public License for more details. | ||
| 13 | * | ||
| 14 | * You should have received a copy of the GNU General Public License | ||
| 15 | * along with this program; if not, write to the Free Software | ||
| 16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
| 17 | * | ||
| 18 | * Copyright IBM Corporation, 2011 | ||
| 19 | * | ||
| 20 | * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> | ||
| 21 | */ | ||
| 22 | |||
| 23 | #ifndef __LINUX_RCU_H | ||
| 24 | #define __LINUX_RCU_H | ||
| 25 | |||
| 26 | #ifdef CONFIG_RCU_TRACE | ||
| 27 | #define RCU_TRACE(stmt) stmt | ||
| 28 | #else /* #ifdef CONFIG_RCU_TRACE */ | ||
| 29 | #define RCU_TRACE(stmt) | ||
| 30 | #endif /* #else #ifdef CONFIG_RCU_TRACE */ | ||
| 31 | |||
| 32 | /* | ||
| 33 | * Process-level increment to ->dynticks_nesting field. This allows for | ||
| 34 | * architectures that use half-interrupts and half-exceptions from | ||
| 35 | * process context. | ||
| 36 | * | ||
| 37 | * DYNTICK_TASK_NEST_MASK defines a field of width DYNTICK_TASK_NEST_WIDTH | ||
| 38 | * that counts the number of process-based reasons why RCU cannot | ||
| 39 | * consider the corresponding CPU to be idle, and DYNTICK_TASK_NEST_VALUE | ||
| 40 | * is the value used to increment or decrement this field. | ||
| 41 | * | ||
| 42 | * The rest of the bits could in principle be used to count interrupts, | ||
| 43 | * but this would mean that a negative-one value in the interrupt | ||
| 44 | * field could incorrectly zero out the DYNTICK_TASK_NEST_MASK field. | ||
| 45 | * We therefore provide a two-bit guard field defined by DYNTICK_TASK_MASK | ||
| 46 | * that is set to DYNTICK_TASK_FLAG upon initial exit from idle. | ||
| 47 | * The DYNTICK_TASK_EXIT_IDLE value is thus the combined value used upon | ||
| 48 | * initial exit from idle. | ||
| 49 | */ | ||
| 50 | #define DYNTICK_TASK_NEST_WIDTH 7 | ||
| 51 | #define DYNTICK_TASK_NEST_VALUE ((LLONG_MAX >> DYNTICK_TASK_NEST_WIDTH) + 1) | ||
| 52 | #define DYNTICK_TASK_NEST_MASK (LLONG_MAX - DYNTICK_TASK_NEST_VALUE + 1) | ||
| 53 | #define DYNTICK_TASK_FLAG ((DYNTICK_TASK_NEST_VALUE / 8) * 2) | ||
| 54 | #define DYNTICK_TASK_MASK ((DYNTICK_TASK_NEST_VALUE / 8) * 3) | ||
| 55 | #define DYNTICK_TASK_EXIT_IDLE (DYNTICK_TASK_NEST_VALUE + \ | ||
| 56 | DYNTICK_TASK_FLAG) | ||
| 57 | |||
| 58 | /* | ||
| 59 | * debug_rcu_head_queue()/debug_rcu_head_unqueue() are used internally | ||
| 60 | * by call_rcu() and rcu callback execution, and are therefore not part of the | ||
| 61 | * RCU API. Leaving in rcupdate.h because they are used by all RCU flavors. | ||
| 62 | */ | ||
| 63 | |||
| 64 | #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD | ||
| 65 | # define STATE_RCU_HEAD_READY 0 | ||
| 66 | # define STATE_RCU_HEAD_QUEUED 1 | ||
| 67 | |||
| 68 | extern struct debug_obj_descr rcuhead_debug_descr; | ||
| 69 | |||
| 70 | static inline int debug_rcu_head_queue(struct rcu_head *head) | ||
| 71 | { | ||
| 72 | int r1; | ||
| 73 | |||
| 74 | r1 = debug_object_activate(head, &rcuhead_debug_descr); | ||
| 75 | debug_object_active_state(head, &rcuhead_debug_descr, | ||
| 76 | STATE_RCU_HEAD_READY, | ||
| 77 | STATE_RCU_HEAD_QUEUED); | ||
| 78 | return r1; | ||
| 79 | } | ||
| 80 | |||
| 81 | static inline void debug_rcu_head_unqueue(struct rcu_head *head) | ||
| 82 | { | ||
| 83 | debug_object_active_state(head, &rcuhead_debug_descr, | ||
| 84 | STATE_RCU_HEAD_QUEUED, | ||
| 85 | STATE_RCU_HEAD_READY); | ||
| 86 | debug_object_deactivate(head, &rcuhead_debug_descr); | ||
| 87 | } | ||
| 88 | #else /* !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | ||
| 89 | static inline int debug_rcu_head_queue(struct rcu_head *head) | ||
| 90 | { | ||
| 91 | return 0; | ||
| 92 | } | ||
| 93 | |||
| 94 | static inline void debug_rcu_head_unqueue(struct rcu_head *head) | ||
| 95 | { | ||
| 96 | } | ||
| 97 | #endif /* #else !CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | ||
| 98 | |||
| 99 | extern void kfree(const void *); | ||
| 100 | |||
| 101 | static inline bool __rcu_reclaim(const char *rn, struct rcu_head *head) | ||
| 102 | { | ||
| 103 | unsigned long offset = (unsigned long)head->func; | ||
| 104 | |||
| 105 | if (__is_kfree_rcu_offset(offset)) { | ||
| 106 | RCU_TRACE(trace_rcu_invoke_kfree_callback(rn, head, offset)); | ||
| 107 | kfree((void *)head - offset); | ||
| 108 | return 1; | ||
| 109 | } else { | ||
| 110 | RCU_TRACE(trace_rcu_invoke_callback(rn, head)); | ||
| 111 | head->func(head); | ||
| 112 | return 0; | ||
| 113 | } | ||
| 114 | } | ||
| 115 | |||
| 116 | extern int rcu_expedited; | ||
| 117 | |||
| 118 | #ifdef CONFIG_RCU_STALL_COMMON | ||
| 119 | |||
| 120 | extern int rcu_cpu_stall_suppress; | ||
| 121 | int rcu_jiffies_till_stall_check(void); | ||
| 122 | |||
| 123 | #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ | ||
| 124 | |||
| 125 | /* | ||
| 126 | * Strings used in tracepoints need to be exported via the | ||
| 127 | * tracing system such that tools like perf and trace-cmd can | ||
| 128 | * translate the string address pointers to actual text. | ||
| 129 | */ | ||
| 130 | #define TPS(x) tracepoint_string(x) | ||
| 131 | |||
| 132 | #endif /* __LINUX_RCU_H */ | ||
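
The __rcu_reclaim() helper above depends on a convention set up in include/linux/rcupdate.h: kfree_rcu() stores the byte offset of the rcu_head within its enclosing structure in place of a real callback pointer, and __is_kfree_rcu_offset() simply checks for a small value. The stand-alone user-space sketch below is not part of this patch; the 4096-byte limit, is_kfree_offset() and struct foo are invented stand-ins for the real kernel definitions, but they show why "kfree((void *)head - offset)" recovers the start of the enclosing object.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

/* Minimal stand-ins for the kernel types used by __rcu_reclaim(). */
struct rcu_head {
	struct rcu_head *next;
	void (*func)(struct rcu_head *head);
};

struct foo {
	long data;
	struct rcu_head rh;		/* embedded callback header */
};

/* Offsets smaller than this are "kfree the enclosing object" requests. */
#define KFREE_OFFSET_LIMIT	4096UL

static int is_kfree_offset(unsigned long offset)
{
	return offset < KFREE_OFFSET_LIMIT;
}

/* Simplified reclaim step mirroring the logic of __rcu_reclaim(). */
static void reclaim(struct rcu_head *head)
{
	unsigned long offset = (unsigned long)head->func;

	if (is_kfree_offset(offset))
		free((char *)head - offset);	/* kfree() in the kernel */
	else
		head->func(head);		/* ordinary RCU callback */
}

int main(void)
{
	struct foo *p = malloc(sizeof(*p));

	if (!p)
		return 1;
	/* kfree_rcu() stores the offset of ->rh in place of a function. */
	p->rh.func = (void (*)(struct rcu_head *))(uintptr_t)offsetof(struct foo, rh);
	reclaim(&p->rh);			/* frees the whole struct foo */
	printf("object reclaimed via encoded offset\n");
	return 0;
}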
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
new file mode 100644
index 000000000000..01d5ccb8bfe3
--- /dev/null
+++ b/kernel/rcu/srcu.c
@@ -0,0 +1,651 @@
| 1 | /* | ||
| 2 | * Sleepable Read-Copy Update mechanism for mutual exclusion. | ||
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or modify | ||
| 5 | * it under the terms of the GNU General Public License as published by | ||
| 6 | * the Free Software Foundation; either version 2 of the License, or | ||
| 7 | * (at your option) any later version. | ||
| 8 | * | ||
| 9 | * This program is distributed in the hope that it will be useful, | ||
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 12 | * GNU General Public License for more details. | ||
| 13 | * | ||
| 14 | * You should have received a copy of the GNU General Public License | ||
| 15 | * along with this program; if not, write to the Free Software | ||
| 16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
| 17 | * | ||
| 18 | * Copyright (C) IBM Corporation, 2006 | ||
| 19 | * Copyright (C) Fujitsu, 2012 | ||
| 20 | * | ||
| 21 | * Author: Paul McKenney <paulmck@us.ibm.com> | ||
| 22 | * Lai Jiangshan <laijs@cn.fujitsu.com> | ||
| 23 | * | ||
| 24 | * For detailed explanation of Read-Copy Update mechanism see - | ||
| 25 | * Documentation/RCU/ *.txt | ||
| 26 | * | ||
| 27 | */ | ||
| 28 | |||
| 29 | #include <linux/export.h> | ||
| 30 | #include <linux/mutex.h> | ||
| 31 | #include <linux/percpu.h> | ||
| 32 | #include <linux/preempt.h> | ||
| 33 | #include <linux/rcupdate.h> | ||
| 34 | #include <linux/sched.h> | ||
| 35 | #include <linux/smp.h> | ||
| 36 | #include <linux/delay.h> | ||
| 37 | #include <linux/srcu.h> | ||
| 38 | |||
| 39 | #include <trace/events/rcu.h> | ||
| 40 | |||
| 41 | #include "rcu.h" | ||
| 42 | |||
| 43 | /* | ||
| 44 | * Initialize an rcu_batch structure to empty. | ||
| 45 | */ | ||
| 46 | static inline void rcu_batch_init(struct rcu_batch *b) | ||
| 47 | { | ||
| 48 | b->head = NULL; | ||
| 49 | b->tail = &b->head; | ||
| 50 | } | ||
| 51 | |||
| 52 | /* | ||
| 53 | * Enqueue a callback onto the tail of the specified rcu_batch structure. | ||
| 54 | */ | ||
| 55 | static inline void rcu_batch_queue(struct rcu_batch *b, struct rcu_head *head) | ||
| 56 | { | ||
| 57 | *b->tail = head; | ||
| 58 | b->tail = &head->next; | ||
| 59 | } | ||
| 60 | |||
| 61 | /* | ||
| 62 | * Is the specified rcu_batch structure empty? | ||
| 63 | */ | ||
| 64 | static inline bool rcu_batch_empty(struct rcu_batch *b) | ||
| 65 | { | ||
| 66 | return b->tail == &b->head; | ||
| 67 | } | ||
| 68 | |||
| 69 | /* | ||
| 70 | * Remove the callback at the head of the specified rcu_batch structure | ||
| 71 | * and return a pointer to it, or return NULL if the structure is empty. | ||
| 72 | */ | ||
| 73 | static inline struct rcu_head *rcu_batch_dequeue(struct rcu_batch *b) | ||
| 74 | { | ||
| 75 | struct rcu_head *head; | ||
| 76 | |||
| 77 | if (rcu_batch_empty(b)) | ||
| 78 | return NULL; | ||
| 79 | |||
| 80 | head = b->head; | ||
| 81 | b->head = head->next; | ||
| 82 | if (b->tail == &head->next) | ||
| 83 | rcu_batch_init(b); | ||
| 84 | |||
| 85 | return head; | ||
| 86 | } | ||
| 87 | |||
| 88 | /* | ||
| 89 | * Move all callbacks from the rcu_batch structure specified by "from" to | ||
| 90 | * the structure specified by "to". | ||
| 91 | */ | ||
| 92 | static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from) | ||
| 93 | { | ||
| 94 | if (!rcu_batch_empty(from)) { | ||
| 95 | *to->tail = from->head; | ||
| 96 | to->tail = from->tail; | ||
| 97 | rcu_batch_init(from); | ||
| 98 | } | ||
| 99 | } | ||
| 100 | |||
| 101 | static int init_srcu_struct_fields(struct srcu_struct *sp) | ||
| 102 | { | ||
| 103 | sp->completed = 0; | ||
| 104 | spin_lock_init(&sp->queue_lock); | ||
| 105 | sp->running = false; | ||
| 106 | rcu_batch_init(&sp->batch_queue); | ||
| 107 | rcu_batch_init(&sp->batch_check0); | ||
| 108 | rcu_batch_init(&sp->batch_check1); | ||
| 109 | rcu_batch_init(&sp->batch_done); | ||
| 110 | INIT_DELAYED_WORK(&sp->work, process_srcu); | ||
| 111 | sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); | ||
| 112 | return sp->per_cpu_ref ? 0 : -ENOMEM; | ||
| 113 | } | ||
| 114 | |||
| 115 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
| 116 | |||
| 117 | int __init_srcu_struct(struct srcu_struct *sp, const char *name, | ||
| 118 | struct lock_class_key *key) | ||
| 119 | { | ||
| 120 | /* Don't re-initialize a lock while it is held. */ | ||
| 121 | debug_check_no_locks_freed((void *)sp, sizeof(*sp)); | ||
| 122 | lockdep_init_map(&sp->dep_map, name, key, 0); | ||
| 123 | return init_srcu_struct_fields(sp); | ||
| 124 | } | ||
| 125 | EXPORT_SYMBOL_GPL(__init_srcu_struct); | ||
| 126 | |||
| 127 | #else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
| 128 | |||
| 129 | /** | ||
| 130 | * init_srcu_struct - initialize a sleep-RCU structure | ||
| 131 | * @sp: structure to initialize. | ||
| 132 | * | ||
| 133 | * Must invoke this on a given srcu_struct before passing that srcu_struct | ||
| 134 | * to any other function. Each srcu_struct represents a separate domain | ||
| 135 | * of SRCU protection. | ||
| 136 | */ | ||
| 137 | int init_srcu_struct(struct srcu_struct *sp) | ||
| 138 | { | ||
| 139 | return init_srcu_struct_fields(sp); | ||
| 140 | } | ||
| 141 | EXPORT_SYMBOL_GPL(init_srcu_struct); | ||
| 142 | |||
| 143 | #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
| 144 | |||
| 145 | /* | ||
| 146 | * Returns approximate total of the readers' ->seq[] values for the | ||
| 147 | * rank of per-CPU counters specified by idx. | ||
| 148 | */ | ||
| 149 | static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx) | ||
| 150 | { | ||
| 151 | int cpu; | ||
| 152 | unsigned long sum = 0; | ||
| 153 | unsigned long t; | ||
| 154 | |||
| 155 | for_each_possible_cpu(cpu) { | ||
| 156 | t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]); | ||
| 157 | sum += t; | ||
| 158 | } | ||
| 159 | return sum; | ||
| 160 | } | ||
| 161 | |||
| 162 | /* | ||
| 163 | * Returns approximate number of readers active on the specified rank | ||
| 164 | * of the per-CPU ->c[] counters. | ||
| 165 | */ | ||
| 166 | static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx) | ||
| 167 | { | ||
| 168 | int cpu; | ||
| 169 | unsigned long sum = 0; | ||
| 170 | unsigned long t; | ||
| 171 | |||
| 172 | for_each_possible_cpu(cpu) { | ||
| 173 | t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]); | ||
| 174 | sum += t; | ||
| 175 | } | ||
| 176 | return sum; | ||
| 177 | } | ||
| 178 | |||
| 179 | /* | ||
| 180 | * Return true if the number of pre-existing readers is determined to | ||
| 181 | * be stably zero. An example unstable zero can occur if the call | ||
| 182 | * to srcu_readers_active_idx() misses an __srcu_read_lock() increment, | ||
| 183 | * but due to task migration, sees the corresponding __srcu_read_unlock() | ||
| 184 | * decrement. This can happen because srcu_readers_active_idx() takes | ||
| 185 | * time to sum the array, and might in fact be interrupted or preempted | ||
| 186 | * partway through the summation. | ||
| 187 | */ | ||
| 188 | static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) | ||
| 189 | { | ||
| 190 | unsigned long seq; | ||
| 191 | |||
| 192 | seq = srcu_readers_seq_idx(sp, idx); | ||
| 193 | |||
| 194 | /* | ||
| 195 | * The following smp_mb() A pairs with the smp_mb() B located in | ||
| 196 | * __srcu_read_lock(). This pairing ensures that if an | ||
| 197 | * __srcu_read_lock() increments its counter after the summation | ||
| 198 | * in srcu_readers_active_idx(), then the corresponding SRCU read-side | ||
| 199 | * critical section will see any changes made prior to the start | ||
| 200 | * of the current SRCU grace period. | ||
| 201 | * | ||
| 202 | * Also, if the above call to srcu_readers_seq_idx() saw the | ||
| 203 | * increment of ->seq[], then the call to srcu_readers_active_idx() | ||
| 204 | * must see the increment of ->c[]. | ||
| 205 | */ | ||
| 206 | smp_mb(); /* A */ | ||
| 207 | |||
| 208 | /* | ||
| 209 | * Note that srcu_readers_active_idx() can incorrectly return | ||
| 210 | * zero even though there is a pre-existing reader throughout. | ||
| 211 | * To see this, suppose that task A is in a very long SRCU | ||
| 212 | * read-side critical section that started on CPU 0, and that | ||
| 213 | * no other reader exists, so that the sum of the counters | ||
| 214 | * is equal to one. Then suppose that task B starts executing | ||
| 215 | * srcu_readers_active_idx(), summing up to CPU 1, and then that | ||
| 216 | * task C starts reading on CPU 0, so that its increment is not | ||
| 217 | * summed, but finishes reading on CPU 2, so that its decrement | ||
| 218 | * -is- summed. Then when task B completes its sum, it will | ||
| 219 | * incorrectly get zero, despite the fact that task A has been | ||
| 220 | * in its SRCU read-side critical section the whole time. | ||
| 221 | * | ||
| 222 | * We therefore do a validation step should srcu_readers_active_idx() | ||
| 223 | * return zero. | ||
| 224 | */ | ||
| 225 | if (srcu_readers_active_idx(sp, idx) != 0) | ||
| 226 | return false; | ||
| 227 | |||
| 228 | /* | ||
| 229 | * The remainder of this function is the validation step. | ||
| 230 | * The following smp_mb() D pairs with the smp_mb() C in | ||
| 231 | * __srcu_read_unlock(). If the __srcu_read_unlock() was seen | ||
| 232 | * by srcu_readers_active_idx() above, then any destructive | ||
| 233 | * operation performed after the grace period will happen after | ||
| 234 | * the corresponding SRCU read-side critical section. | ||
| 235 | * | ||
| 236 | * Note that there can be at most NR_CPUS worth of readers using | ||
| 237 | * the old index, which is not enough to overflow even a 32-bit | ||
| 238 | * integer. (Yes, this does mean that systems having more than | ||
| 239 | * a billion or so CPUs need to be 64-bit systems.) Therefore, | ||
| 240 | * the sum of the ->seq[] counters cannot possibly overflow. | ||
| 241 | * Therefore, the only way that the return values of the two | ||
| 242 | * calls to srcu_readers_seq_idx() can be equal is if there were | ||
| 243 | * no increments of the corresponding rank of ->seq[] counts | ||
| 244 | * in the interim. But the missed-increment scenario laid out | ||
| 245 | * above includes an increment of the ->seq[] counter by | ||
| 246 | * the corresponding __srcu_read_lock(). Therefore, if this | ||
| 247 | * scenario occurs, the return values from the two calls to | ||
| 248 | * srcu_readers_seq_idx() will differ, and thus the validation | ||
| 249 | * step below suffices. | ||
| 250 | */ | ||
| 251 | smp_mb(); /* D */ | ||
| 252 | |||
| 253 | return srcu_readers_seq_idx(sp, idx) == seq; | ||
| 254 | } | ||
| 255 | |||
| 256 | /** | ||
| 257 | * srcu_readers_active - returns approximate number of readers. | ||
| 258 | * @sp: which srcu_struct to count active readers (holding srcu_read_lock). | ||
| 259 | * | ||
| 260 | * Note that this is not an atomic primitive, and can therefore suffer | ||
| 261 | * severe errors when invoked on an active srcu_struct. That said, it | ||
| 262 | * can be useful as an error check at cleanup time. | ||
| 263 | */ | ||
| 264 | static int srcu_readers_active(struct srcu_struct *sp) | ||
| 265 | { | ||
| 266 | int cpu; | ||
| 267 | unsigned long sum = 0; | ||
| 268 | |||
| 269 | for_each_possible_cpu(cpu) { | ||
| 270 | sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]); | ||
| 271 | sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]); | ||
| 272 | } | ||
| 273 | return sum; | ||
| 274 | } | ||
| 275 | |||
| 276 | /** | ||
| 277 | * cleanup_srcu_struct - deconstruct a sleep-RCU structure | ||
| 278 | * @sp: structure to clean up. | ||
| 279 | * | ||
| 280 | * Must invoke this after you are finished using a given srcu_struct that | ||
| 281 | * was initialized via init_srcu_struct(), else you leak memory. | ||
| 282 | */ | ||
| 283 | void cleanup_srcu_struct(struct srcu_struct *sp) | ||
| 284 | { | ||
| 285 | if (WARN_ON(srcu_readers_active(sp))) | ||
| 286 | return; /* Leakage unless caller handles error. */ | ||
| 287 | free_percpu(sp->per_cpu_ref); | ||
| 288 | sp->per_cpu_ref = NULL; | ||
| 289 | } | ||
| 290 | EXPORT_SYMBOL_GPL(cleanup_srcu_struct); | ||
| 291 | |||
| 292 | /* | ||
| 293 | * Counts the new reader in the appropriate per-CPU element of the | ||
| 294 | * srcu_struct. Must be called from process context. | ||
| 295 | * Returns an index that must be passed to the matching srcu_read_unlock(). | ||
| 296 | */ | ||
| 297 | int __srcu_read_lock(struct srcu_struct *sp) | ||
| 298 | { | ||
| 299 | int idx; | ||
| 300 | |||
| 301 | idx = ACCESS_ONCE(sp->completed) & 0x1; | ||
| 302 | preempt_disable(); | ||
| 303 | ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1; | ||
| 304 | smp_mb(); /* B */ /* Avoid leaking the critical section. */ | ||
| 305 | ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1; | ||
| 306 | preempt_enable(); | ||
| 307 | return idx; | ||
| 308 | } | ||
| 309 | EXPORT_SYMBOL_GPL(__srcu_read_lock); | ||
| 310 | |||
| 311 | /* | ||
| 312 | * Removes the count for the old reader from the appropriate per-CPU | ||
| 313 | * element of the srcu_struct. Note that this may well be a different | ||
| 314 | * CPU than that which was incremented by the corresponding srcu_read_lock(). | ||
| 315 | * Must be called from process context. | ||
| 316 | */ | ||
| 317 | void __srcu_read_unlock(struct srcu_struct *sp, int idx) | ||
| 318 | { | ||
| 319 | smp_mb(); /* C */ /* Avoid leaking the critical section. */ | ||
| 320 | this_cpu_dec(sp->per_cpu_ref->c[idx]); | ||
| 321 | } | ||
| 322 | EXPORT_SYMBOL_GPL(__srcu_read_unlock); | ||
| 323 | |||
| 324 | /* | ||
| 325 | * We use an adaptive strategy for synchronize_srcu() and especially for | ||
| 326 | * synchronize_srcu_expedited(). We spin for a fixed time period | ||
| 327 | * (defined below) to allow SRCU readers to exit their read-side critical | ||
| 328 | * sections. If there are still some readers after 10 microseconds, | ||
| 329 | * we repeatedly block for 1-millisecond time periods. This approach | ||
| 330 | * has done well in testing, so there is no need for a config parameter. | ||
| 331 | */ | ||
| 332 | #define SRCU_RETRY_CHECK_DELAY 5 | ||
| 333 | #define SYNCHRONIZE_SRCU_TRYCOUNT 2 | ||
| 334 | #define SYNCHRONIZE_SRCU_EXP_TRYCOUNT 12 | ||
| 335 | |||
| 336 | /* | ||
| 337 | * Wait until all pre-existing readers complete. Such readers | ||
| 338 | * will have used the index specified by "idx". The caller should | ||
| 339 | * ensure that ->completed does not change while this check runs and | ||
| 340 | * that idx == (->completed & 1) ^ 1. | ||
| 341 | */ | ||
| 342 | static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) | ||
| 343 | { | ||
| 344 | for (;;) { | ||
| 345 | if (srcu_readers_active_idx_check(sp, idx)) | ||
| 346 | return true; | ||
| 347 | if (--trycount <= 0) | ||
| 348 | return false; | ||
| 349 | udelay(SRCU_RETRY_CHECK_DELAY); | ||
| 350 | } | ||
| 351 | } | ||
| 352 | |||
| 353 | /* | ||
| 354 | * Increment the ->completed counter so that future SRCU readers will | ||
| 355 | * use the other rank of the ->c[] and ->seq[] arrays. This allows | ||
| 356 | * us to wait for pre-existing readers in a starvation-free manner. | ||
| 357 | */ | ||
| 358 | static void srcu_flip(struct srcu_struct *sp) | ||
| 359 | { | ||
| 360 | sp->completed++; | ||
| 361 | } | ||
| 362 | |||
| 363 | /* | ||
| 364 | * Enqueue an SRCU callback on the specified srcu_struct structure, | ||
| 365 | * initiating grace-period processing if it is not already running. | ||
| 366 | */ | ||
| 367 | void call_srcu(struct srcu_struct *sp, struct rcu_head *head, | ||
| 368 | void (*func)(struct rcu_head *head)) | ||
| 369 | { | ||
| 370 | unsigned long flags; | ||
| 371 | |||
| 372 | head->next = NULL; | ||
| 373 | head->func = func; | ||
| 374 | spin_lock_irqsave(&sp->queue_lock, flags); | ||
| 375 | rcu_batch_queue(&sp->batch_queue, head); | ||
| 376 | if (!sp->running) { | ||
| 377 | sp->running = true; | ||
| 378 | schedule_delayed_work(&sp->work, 0); | ||
| 379 | } | ||
| 380 | spin_unlock_irqrestore(&sp->queue_lock, flags); | ||
| 381 | } | ||
| 382 | EXPORT_SYMBOL_GPL(call_srcu); | ||
| 383 | |||
| 384 | struct rcu_synchronize { | ||
| 385 | struct rcu_head head; | ||
| 386 | struct completion completion; | ||
| 387 | }; | ||
| 388 | |||
| 389 | /* | ||
| 390 | * Awaken the corresponding synchronize_srcu() instance now that a | ||
| 391 | * grace period has elapsed. | ||
| 392 | */ | ||
| 393 | static void wakeme_after_rcu(struct rcu_head *head) | ||
| 394 | { | ||
| 395 | struct rcu_synchronize *rcu; | ||
| 396 | |||
| 397 | rcu = container_of(head, struct rcu_synchronize, head); | ||
| 398 | complete(&rcu->completion); | ||
| 399 | } | ||
| 400 | |||
| 401 | static void srcu_advance_batches(struct srcu_struct *sp, int trycount); | ||
| 402 | static void srcu_reschedule(struct srcu_struct *sp); | ||
| 403 | |||
| 404 | /* | ||
| 405 | * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). | ||
| 406 | */ | ||
| 407 | static void __synchronize_srcu(struct srcu_struct *sp, int trycount) | ||
| 408 | { | ||
| 409 | struct rcu_synchronize rcu; | ||
| 410 | struct rcu_head *head = &rcu.head; | ||
| 411 | bool done = false; | ||
| 412 | |||
| 413 | rcu_lockdep_assert(!lock_is_held(&sp->dep_map) && | ||
| 414 | !lock_is_held(&rcu_bh_lock_map) && | ||
| 415 | !lock_is_held(&rcu_lock_map) && | ||
| 416 | !lock_is_held(&rcu_sched_lock_map), | ||
| 417 | "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); | ||
| 418 | |||
| 419 | might_sleep(); | ||
| 420 | init_completion(&rcu.completion); | ||
| 421 | |||
| 422 | head->next = NULL; | ||
| 423 | head->func = wakeme_after_rcu; | ||
| 424 | spin_lock_irq(&sp->queue_lock); | ||
| 425 | if (!sp->running) { | ||
| 426 | /* steal the processing owner */ | ||
| 427 | sp->running = true; | ||
| 428 | rcu_batch_queue(&sp->batch_check0, head); | ||
| 429 | spin_unlock_irq(&sp->queue_lock); | ||
| 430 | |||
| 431 | srcu_advance_batches(sp, trycount); | ||
| 432 | if (!rcu_batch_empty(&sp->batch_done)) { | ||
| 433 | BUG_ON(sp->batch_done.head != head); | ||
| 434 | rcu_batch_dequeue(&sp->batch_done); | ||
| 435 | done = true; | ||
| 436 | } | ||
| 437 | /* give the processing owner to work_struct */ | ||
| 438 | srcu_reschedule(sp); | ||
| 439 | } else { | ||
| 440 | rcu_batch_queue(&sp->batch_queue, head); | ||
| 441 | spin_unlock_irq(&sp->queue_lock); | ||
| 442 | } | ||
| 443 | |||
| 444 | if (!done) | ||
| 445 | wait_for_completion(&rcu.completion); | ||
| 446 | } | ||
| 447 | |||
| 448 | /** | ||
| 449 | * synchronize_srcu - wait for prior SRCU read-side critical-section completion | ||
| 450 | * @sp: srcu_struct with which to synchronize. | ||
| 451 | * | ||
| 452 | * Wait for the counts of both indexes to drain to zero. To avoid | ||
| 453 | * possible starvation of synchronize_srcu(), it first waits for the | ||
| 454 | * count of index ((->completed & 1) ^ 1) to drain to zero, then | ||
| 455 | * flips ->completed and waits for the count of the other index. | ||
| 456 | * | ||
| 457 | * Can block; must be called from process context. | ||
| 458 | * | ||
| 459 | * Note that it is illegal to call synchronize_srcu() from the corresponding | ||
| 460 | * SRCU read-side critical section; doing so will result in deadlock. | ||
| 461 | * However, it is perfectly legal to call synchronize_srcu() on one | ||
| 462 | * srcu_struct from some other srcu_struct's read-side critical section. | ||
| 463 | */ | ||
| 464 | void synchronize_srcu(struct srcu_struct *sp) | ||
| 465 | { | ||
| 466 | __synchronize_srcu(sp, rcu_expedited | ||
| 467 | ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT | ||
| 468 | : SYNCHRONIZE_SRCU_TRYCOUNT); | ||
| 469 | } | ||
| 470 | EXPORT_SYMBOL_GPL(synchronize_srcu); | ||
| 471 | |||
| 472 | /** | ||
| 473 | * synchronize_srcu_expedited - Brute-force SRCU grace period | ||
| 474 | * @sp: srcu_struct with which to synchronize. | ||
| 475 | * | ||
| 476 | * Wait for an SRCU grace period to elapse, but be more aggressive about | ||
| 477 | * spinning rather than blocking when waiting. | ||
| 478 | * | ||
| 479 | * Note that it is also illegal to call synchronize_srcu_expedited() | ||
| 480 | * from the corresponding SRCU read-side critical section; | ||
| 481 | * doing so will result in deadlock. However, it is perfectly legal | ||
| 482 | * to call synchronize_srcu_expedited() on one srcu_struct from some | ||
| 483 | * other srcu_struct's read-side critical section, as long as | ||
| 484 | * the resulting graph of srcu_structs is acyclic. | ||
| 485 | */ | ||
| 486 | void synchronize_srcu_expedited(struct srcu_struct *sp) | ||
| 487 | { | ||
| 488 | __synchronize_srcu(sp, SYNCHRONIZE_SRCU_EXP_TRYCOUNT); | ||
| 489 | } | ||
| 490 | EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); | ||
| 491 | |||
| 492 | /** | ||
| 493 | * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. | ||
| 494 | */ | ||
| 495 | void srcu_barrier(struct srcu_struct *sp) | ||
| 496 | { | ||
| 497 | synchronize_srcu(sp); | ||
| 498 | } | ||
| 499 | EXPORT_SYMBOL_GPL(srcu_barrier); | ||
| 500 | |||
| 501 | /** | ||
| 502 | * srcu_batches_completed - return batches completed. | ||
| 503 | * @sp: srcu_struct on which to report batch completion. | ||
| 504 | * | ||
| 505 | * Report the number of batches, correlated with, but not necessarily | ||
| 506 | * precisely the same as, the number of grace periods that have elapsed. | ||
| 507 | */ | ||
| 508 | long srcu_batches_completed(struct srcu_struct *sp) | ||
| 509 | { | ||
| 510 | return sp->completed; | ||
| 511 | } | ||
| 512 | EXPORT_SYMBOL_GPL(srcu_batches_completed); | ||
| 513 | |||
| 514 | #define SRCU_CALLBACK_BATCH 10 | ||
| 515 | #define SRCU_INTERVAL 1 | ||
| 516 | |||
| 517 | /* | ||
| 518 | * Move any new SRCU callbacks to the first stage of the SRCU grace | ||
| 519 | * period pipeline. | ||
| 520 | */ | ||
| 521 | static void srcu_collect_new(struct srcu_struct *sp) | ||
| 522 | { | ||
| 523 | if (!rcu_batch_empty(&sp->batch_queue)) { | ||
| 524 | spin_lock_irq(&sp->queue_lock); | ||
| 525 | rcu_batch_move(&sp->batch_check0, &sp->batch_queue); | ||
| 526 | spin_unlock_irq(&sp->queue_lock); | ||
| 527 | } | ||
| 528 | } | ||
| 529 | |||
| 530 | /* | ||
| 531 | * Core SRCU state machine. Advance callbacks from ->batch_check0 to | ||
| 532 | * ->batch_check1 and then to ->batch_done as readers drain. | ||
| 533 | */ | ||
| 534 | static void srcu_advance_batches(struct srcu_struct *sp, int trycount) | ||
| 535 | { | ||
| 536 | int idx = 1 ^ (sp->completed & 1); | ||
| 537 | |||
| 538 | /* | ||
| 539 | * Because readers might be delayed for an extended period after | ||
| 540 | * fetching ->completed for their index, at any point in time there | ||
| 541 | * might well be readers using both idx=0 and idx=1. We therefore | ||
| 542 | * need to wait for readers to clear from both index values before | ||
| 543 | * invoking a callback. | ||
| 544 | */ | ||
| 545 | |||
| 546 | if (rcu_batch_empty(&sp->batch_check0) && | ||
| 547 | rcu_batch_empty(&sp->batch_check1)) | ||
| 548 | return; /* no callbacks need to be advanced */ | ||
| 549 | |||
| 550 | if (!try_check_zero(sp, idx, trycount)) | ||
| 551 | return; /* failed to advance, will try after SRCU_INTERVAL */ | ||
| 552 | |||
| 553 | /* | ||
| 554 | * The callbacks in ->batch_check1 already completed their first | ||
| 555 | * zero check and flip back when they were enqueued on | ||
| 556 | * ->batch_check0 in a previous invocation of srcu_advance_batches(). | ||
| 557 | * (Presumably try_check_zero() returned false during that | ||
| 558 | * invocation, leaving the callbacks stranded on ->batch_check1.) | ||
| 559 | * They are therefore ready to invoke, so move them to ->batch_done. | ||
| 560 | */ | ||
| 561 | rcu_batch_move(&sp->batch_done, &sp->batch_check1); | ||
| 562 | |||
| 563 | if (rcu_batch_empty(&sp->batch_check0)) | ||
| 564 | return; /* no callbacks need to be advanced */ | ||
| 565 | srcu_flip(sp); | ||
| 566 | |||
| 567 | /* | ||
| 568 | * The callbacks in ->batch_check0 just finished their | ||
| 569 | * first zero check and flip, so move them to ->batch_check1 | ||
| 570 | * for future checking on the other idx. | ||
| 571 | */ | ||
| 572 | rcu_batch_move(&sp->batch_check1, &sp->batch_check0); | ||
| 573 | |||
| 574 | /* | ||
| 575 | * SRCU read-side critical sections are normally short, so check | ||
| 576 | * at least twice in quick succession after a flip. | ||
| 577 | */ | ||
| 578 | trycount = trycount < 2 ? 2 : trycount; | ||
| 579 | if (!try_check_zero(sp, idx^1, trycount)) | ||
| 580 | return; /* failed to advance, will try after SRCU_INTERVAL */ | ||
| 581 | |||
| 582 | /* | ||
| 583 | * The callbacks in ->batch_check1 have now waited for all | ||
| 584 | * pre-existing readers using both idx values. They are therefore | ||
| 585 | * ready to invoke, so move them to ->batch_done. | ||
| 586 | */ | ||
| 587 | rcu_batch_move(&sp->batch_done, &sp->batch_check1); | ||
| 588 | } | ||
| 589 | |||
| 590 | /* | ||
| 591 | * Invoke a limited number of SRCU callbacks that have passed through | ||
| 592 | * their grace period. If there are more to do, SRCU will reschedule | ||
| 593 | * the workqueue. | ||
| 594 | */ | ||
| 595 | static void srcu_invoke_callbacks(struct srcu_struct *sp) | ||
| 596 | { | ||
| 597 | int i; | ||
| 598 | struct rcu_head *head; | ||
| 599 | |||
| 600 | for (i = 0; i < SRCU_CALLBACK_BATCH; i++) { | ||
| 601 | head = rcu_batch_dequeue(&sp->batch_done); | ||
| 602 | if (!head) | ||
| 603 | break; | ||
| 604 | local_bh_disable(); | ||
| 605 | head->func(head); | ||
| 606 | local_bh_enable(); | ||
| 607 | } | ||
| 608 | } | ||
| 609 | |||
| 610 | /* | ||
| 611 | * Finished one round of SRCU grace period. Start another if there are | ||
| 612 | * more SRCU callbacks queued, otherwise put SRCU into not-running state. | ||
| 613 | */ | ||
| 614 | static void srcu_reschedule(struct srcu_struct *sp) | ||
| 615 | { | ||
| 616 | bool pending = true; | ||
| 617 | |||
| 618 | if (rcu_batch_empty(&sp->batch_done) && | ||
| 619 | rcu_batch_empty(&sp->batch_check1) && | ||
| 620 | rcu_batch_empty(&sp->batch_check0) && | ||
| 621 | rcu_batch_empty(&sp->batch_queue)) { | ||
| 622 | spin_lock_irq(&sp->queue_lock); | ||
| 623 | if (rcu_batch_empty(&sp->batch_done) && | ||
| 624 | rcu_batch_empty(&sp->batch_check1) && | ||
| 625 | rcu_batch_empty(&sp->batch_check0) && | ||
| 626 | rcu_batch_empty(&sp->batch_queue)) { | ||
| 627 | sp->running = false; | ||
| 628 | pending = false; | ||
| 629 | } | ||
| 630 | spin_unlock_irq(&sp->queue_lock); | ||
| 631 | } | ||
| 632 | |||
| 633 | if (pending) | ||
| 634 | schedule_delayed_work(&sp->work, SRCU_INTERVAL); | ||
| 635 | } | ||
| 636 | |||
| 637 | /* | ||
| 638 | * This is the work-queue function that handles SRCU grace periods. | ||
| 639 | */ | ||
| 640 | void process_srcu(struct work_struct *work) | ||
| 641 | { | ||
| 642 | struct srcu_struct *sp; | ||
| 643 | |||
| 644 | sp = container_of(work, struct srcu_struct, work.work); | ||
| 645 | |||
| 646 | srcu_collect_new(sp); | ||
| 647 | srcu_advance_batches(sp, 1); | ||
| 648 | srcu_invoke_callbacks(sp); | ||
| 649 | srcu_reschedule(sp); | ||
| 650 | } | ||
| 651 | EXPORT_SYMBOL_GPL(process_srcu); | ||
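
For context, here is a minimal sketch of how a client might use the SRCU API implemented above: sleepable readers plus an updater that the caller serializes. It is not part of this patch; struct my_config, curr_config, my_srcu and the helper names are invented for illustration, while the calls themselves (srcu_read_lock(), srcu_dereference(), call_srcu(), and friends) are the ones declared in include/linux/srcu.h and defined here.

#include <linux/errno.h>
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/srcu.h>

struct my_config {
	int threshold;
	struct rcu_head rh;
};

static struct my_config __rcu *curr_config;
static struct srcu_struct my_srcu;	/* init_srcu_struct(&my_srcu) at init time */

/* Reader: may sleep inside the SRCU read-side critical section. */
static int read_threshold(void)
{
	int idx, val;
	struct my_config *cfg;

	idx = srcu_read_lock(&my_srcu);
	cfg = srcu_dereference(curr_config, &my_srcu);
	val = cfg ? cfg->threshold : -1;
	srcu_read_unlock(&my_srcu, idx);
	return val;
}

static void free_config_cb(struct rcu_head *rh)
{
	kfree(container_of(rh, struct my_config, rh));
}

/* Updater (serialized by the caller): publish new config, defer the free. */
static int update_threshold(int threshold)
{
	struct my_config *newc, *oldc;

	newc = kmalloc(sizeof(*newc), GFP_KERNEL);
	if (!newc)
		return -ENOMEM;
	newc->threshold = threshold;
	oldc = rcu_dereference_protected(curr_config, 1);
	rcu_assign_pointer(curr_config, newc);
	if (oldc)
		call_srcu(&my_srcu, &oldc->rh, free_config_cb);
	return 0;
}

An updater that can tolerate blocking could instead call synchronize_srcu() followed by kfree(); call_srcu() is the non-blocking variant that hands the work to the workqueue machinery shown in process_srcu() above.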
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
new file mode 100644
index 000000000000..0c9a934cfec1
--- /dev/null
+++ b/kernel/rcu/tiny.c
@@ -0,0 +1,388 @@
| 1 | /* | ||
| 2 | * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition. | ||
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or modify | ||
| 5 | * it under the terms of the GNU General Public License as published by | ||
| 6 | * the Free Software Foundation; either version 2 of the License, or | ||
| 7 | * (at your option) any later version. | ||
| 8 | * | ||
| 9 | * This program is distributed in the hope that it will be useful, | ||
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 12 | * GNU General Public License for more details. | ||
| 13 | * | ||
| 14 | * You should have received a copy of the GNU General Public License | ||
| 15 | * along with this program; if not, write to the Free Software | ||
| 16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
| 17 | * | ||
| 18 | * Copyright IBM Corporation, 2008 | ||
| 19 | * | ||
| 20 | * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> | ||
| 21 | * | ||
| 22 | * For detailed explanation of Read-Copy Update mechanism see - | ||
| 23 | * Documentation/RCU | ||
| 24 | */ | ||
| 25 | #include <linux/completion.h> | ||
| 26 | #include <linux/interrupt.h> | ||
| 27 | #include <linux/notifier.h> | ||
| 28 | #include <linux/rcupdate.h> | ||
| 29 | #include <linux/kernel.h> | ||
| 30 | #include <linux/export.h> | ||
| 31 | #include <linux/mutex.h> | ||
| 32 | #include <linux/sched.h> | ||
| 33 | #include <linux/types.h> | ||
| 34 | #include <linux/init.h> | ||
| 35 | #include <linux/time.h> | ||
| 36 | #include <linux/cpu.h> | ||
| 37 | #include <linux/prefetch.h> | ||
| 38 | #include <linux/ftrace_event.h> | ||
| 39 | |||
| 40 | #ifdef CONFIG_RCU_TRACE | ||
| 41 | #include <trace/events/rcu.h> | ||
| 42 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
| 43 | |||
| 44 | #include "rcu.h" | ||
| 45 | |||
| 46 | /* Forward declarations for tiny_plugin.h. */ | ||
| 47 | struct rcu_ctrlblk; | ||
| 48 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp); | ||
| 49 | static void rcu_process_callbacks(struct softirq_action *unused); | ||
| 50 | static void __call_rcu(struct rcu_head *head, | ||
| 51 | void (*func)(struct rcu_head *rcu), | ||
| 52 | struct rcu_ctrlblk *rcp); | ||
| 53 | |||
| 54 | static long long rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; | ||
| 55 | |||
| 56 | #include "tiny_plugin.h" | ||
| 57 | |||
| 58 | /* Common code for rcu_idle_enter() and rcu_irq_exit(), see kernel/rcu/tree.c. */ | ||
| 59 | static void rcu_idle_enter_common(long long newval) | ||
| 60 | { | ||
| 61 | if (newval) { | ||
| 62 | RCU_TRACE(trace_rcu_dyntick(TPS("--="), | ||
| 63 | rcu_dynticks_nesting, newval)); | ||
| 64 | rcu_dynticks_nesting = newval; | ||
| 65 | return; | ||
| 66 | } | ||
| 67 | RCU_TRACE(trace_rcu_dyntick(TPS("Start"), | ||
| 68 | rcu_dynticks_nesting, newval)); | ||
| 69 | if (!is_idle_task(current)) { | ||
| 70 | struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); | ||
| 71 | |||
| 72 | RCU_TRACE(trace_rcu_dyntick(TPS("Entry error: not idle task"), | ||
| 73 | rcu_dynticks_nesting, newval)); | ||
| 74 | ftrace_dump(DUMP_ALL); | ||
| 75 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", | ||
| 76 | current->pid, current->comm, | ||
| 77 | idle->pid, idle->comm); /* must be idle task! */ | ||
| 78 | } | ||
| 79 | rcu_sched_qs(0); /* implies rcu_bh_qs(0) */ | ||
| 80 | barrier(); | ||
| 81 | rcu_dynticks_nesting = newval; | ||
| 82 | } | ||
| 83 | |||
| 84 | /* | ||
| 85 | * Enter idle, which is an extended quiescent state if we have fully | ||
| 86 | * entered that mode (i.e., if the new value of dynticks_nesting is zero). | ||
| 87 | */ | ||
| 88 | void rcu_idle_enter(void) | ||
| 89 | { | ||
| 90 | unsigned long flags; | ||
| 91 | long long newval; | ||
| 92 | |||
| 93 | local_irq_save(flags); | ||
| 94 | WARN_ON_ONCE((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == 0); | ||
| 95 | if ((rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) == | ||
| 96 | DYNTICK_TASK_NEST_VALUE) | ||
| 97 | newval = 0; | ||
| 98 | else | ||
| 99 | newval = rcu_dynticks_nesting - DYNTICK_TASK_NEST_VALUE; | ||
| 100 | rcu_idle_enter_common(newval); | ||
| 101 | local_irq_restore(flags); | ||
| 102 | } | ||
| 103 | EXPORT_SYMBOL_GPL(rcu_idle_enter); | ||
| 104 | |||
| 105 | /* | ||
| 106 | * Exit an interrupt handler towards idle. | ||
| 107 | */ | ||
| 108 | void rcu_irq_exit(void) | ||
| 109 | { | ||
| 110 | unsigned long flags; | ||
| 111 | long long newval; | ||
| 112 | |||
| 113 | local_irq_save(flags); | ||
| 114 | newval = rcu_dynticks_nesting - 1; | ||
| 115 | WARN_ON_ONCE(newval < 0); | ||
| 116 | rcu_idle_enter_common(newval); | ||
| 117 | local_irq_restore(flags); | ||
| 118 | } | ||
| 119 | EXPORT_SYMBOL_GPL(rcu_irq_exit); | ||
| 120 | |||
| 121 | /* Common code for rcu_idle_exit() and rcu_irq_enter(), see kernel/rcu/tree.c. */ | ||
| 122 | static void rcu_idle_exit_common(long long oldval) | ||
| 123 | { | ||
| 124 | if (oldval) { | ||
| 125 | RCU_TRACE(trace_rcu_dyntick(TPS("++="), | ||
| 126 | oldval, rcu_dynticks_nesting)); | ||
| 127 | return; | ||
| 128 | } | ||
| 129 | RCU_TRACE(trace_rcu_dyntick(TPS("End"), oldval, rcu_dynticks_nesting)); | ||
| 130 | if (!is_idle_task(current)) { | ||
| 131 | struct task_struct *idle __maybe_unused = idle_task(smp_processor_id()); | ||
| 132 | |||
| 133 | RCU_TRACE(trace_rcu_dyntick(TPS("Exit error: not idle task"), | ||
| 134 | oldval, rcu_dynticks_nesting)); | ||
| 135 | ftrace_dump(DUMP_ALL); | ||
| 136 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", | ||
| 137 | current->pid, current->comm, | ||
| 138 | idle->pid, idle->comm); /* must be idle task! */ | ||
| 139 | } | ||
| 140 | } | ||
| 141 | |||
| 142 | /* | ||
| 143 | * Exit idle, so that we are no longer in an extended quiescent state. | ||
| 144 | */ | ||
| 145 | void rcu_idle_exit(void) | ||
| 146 | { | ||
| 147 | unsigned long flags; | ||
| 148 | long long oldval; | ||
| 149 | |||
| 150 | local_irq_save(flags); | ||
| 151 | oldval = rcu_dynticks_nesting; | ||
| 152 | WARN_ON_ONCE(rcu_dynticks_nesting < 0); | ||
| 153 | if (rcu_dynticks_nesting & DYNTICK_TASK_NEST_MASK) | ||
| 154 | rcu_dynticks_nesting += DYNTICK_TASK_NEST_VALUE; | ||
| 155 | else | ||
| 156 | rcu_dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; | ||
| 157 | rcu_idle_exit_common(oldval); | ||
| 158 | local_irq_restore(flags); | ||
| 159 | } | ||
| 160 | EXPORT_SYMBOL_GPL(rcu_idle_exit); | ||
| 161 | |||
| 162 | /* | ||
| 163 | * Enter an interrupt handler, moving away from idle. | ||
| 164 | */ | ||
| 165 | void rcu_irq_enter(void) | ||
| 166 | { | ||
| 167 | unsigned long flags; | ||
| 168 | long long oldval; | ||
| 169 | |||
| 170 | local_irq_save(flags); | ||
| 171 | oldval = rcu_dynticks_nesting; | ||
| 172 | rcu_dynticks_nesting++; | ||
| 173 | WARN_ON_ONCE(rcu_dynticks_nesting == 0); | ||
| 174 | rcu_idle_exit_common(oldval); | ||
| 175 | local_irq_restore(flags); | ||
| 176 | } | ||
| 177 | EXPORT_SYMBOL_GPL(rcu_irq_enter); | ||
| 178 | |||
| 179 | #if defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) | ||
| 180 | |||
| 181 | /* | ||
| 182 | * Test whether RCU thinks that the current CPU is non-idle, i.e., whether RCU is watching it. | ||
| 183 | */ | ||
| 184 | bool __rcu_is_watching(void) | ||
| 185 | { | ||
| 186 | return rcu_dynticks_nesting; | ||
| 187 | } | ||
| 188 | EXPORT_SYMBOL(__rcu_is_watching); | ||
| 189 | |||
| 190 | #endif /* defined(CONFIG_DEBUG_LOCK_ALLOC) || defined(CONFIG_RCU_TRACE) */ | ||
| 191 | |||
| 192 | /* | ||
| 193 | * Test whether the current CPU was interrupted from idle. Nested | ||
| 194 | * interrupts don't count; we must be running at the first interrupt | ||
| 195 | * level. | ||
| 196 | */ | ||
| 197 | static int rcu_is_cpu_rrupt_from_idle(void) | ||
| 198 | { | ||
| 199 | return rcu_dynticks_nesting <= 1; | ||
| 200 | } | ||
| 201 | |||
| 202 | /* | ||
| 203 | * Helper function for rcu_sched_qs() and rcu_bh_qs(). | ||
| 204 | * Also irqs are disabled to avoid confusion due to interrupt handlers | ||
| 205 | * invoking call_rcu(). | ||
| 206 | */ | ||
| 207 | static int rcu_qsctr_help(struct rcu_ctrlblk *rcp) | ||
| 208 | { | ||
| 209 | RCU_TRACE(reset_cpu_stall_ticks(rcp)); | ||
| 210 | if (rcp->rcucblist != NULL && | ||
| 211 | rcp->donetail != rcp->curtail) { | ||
| 212 | rcp->donetail = rcp->curtail; | ||
| 213 | return 1; | ||
| 214 | } | ||
| 215 | |||
| 216 | return 0; | ||
| 217 | } | ||
| 218 | |||
| 219 | /* | ||
| 220 | * Record an rcu quiescent state. And an rcu_bh quiescent state while we | ||
| 221 | * are at it, given that any rcu quiescent state is also an rcu_bh | ||
| 222 | * quiescent state. Use "+" instead of "||" to defeat short circuiting. | ||
| 223 | */ | ||
| 224 | void rcu_sched_qs(int cpu) | ||
| 225 | { | ||
| 226 | unsigned long flags; | ||
| 227 | |||
| 228 | local_irq_save(flags); | ||
| 229 | if (rcu_qsctr_help(&rcu_sched_ctrlblk) + | ||
| 230 | rcu_qsctr_help(&rcu_bh_ctrlblk)) | ||
| 231 | raise_softirq(RCU_SOFTIRQ); | ||
| 232 | local_irq_restore(flags); | ||
| 233 | } | ||
| 234 | |||
| 235 | /* | ||
| 236 | * Record an rcu_bh quiescent state. | ||
| 237 | */ | ||
| 238 | void rcu_bh_qs(int cpu) | ||
| 239 | { | ||
| 240 | unsigned long flags; | ||
| 241 | |||
| 242 | local_irq_save(flags); | ||
| 243 | if (rcu_qsctr_help(&rcu_bh_ctrlblk)) | ||
| 244 | raise_softirq(RCU_SOFTIRQ); | ||
| 245 | local_irq_restore(flags); | ||
| 246 | } | ||
| 247 | |||
| 248 | /* | ||
| 249 | * Check to see if the scheduling-clock interrupt came from an extended | ||
| 250 | * quiescent state, and, if so, tell RCU about it. This function must | ||
| 251 | * be called from hardirq context. It is normally called from the | ||
| 252 | * scheduling-clock interrupt. | ||
| 253 | */ | ||
| 254 | void rcu_check_callbacks(int cpu, int user) | ||
| 255 | { | ||
| 256 | RCU_TRACE(check_cpu_stalls()); | ||
| 257 | if (user || rcu_is_cpu_rrupt_from_idle()) | ||
| 258 | rcu_sched_qs(cpu); | ||
| 259 | else if (!in_softirq()) | ||
| 260 | rcu_bh_qs(cpu); | ||
| 261 | } | ||
| 262 | |||
| 263 | /* | ||
| 264 | * Invoke those RCU callbacks on the specified rcu_ctrlblk structure | ||
| 265 | * whose grace period has elapsed. | ||
| 266 | */ | ||
| 267 | static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) | ||
| 268 | { | ||
| 269 | const char *rn = NULL; | ||
| 270 | struct rcu_head *next, *list; | ||
| 271 | unsigned long flags; | ||
| 272 | RCU_TRACE(int cb_count = 0); | ||
| 273 | |||
| 274 | /* If no RCU callbacks ready to invoke, just return. */ | ||
| 275 | if (&rcp->rcucblist == rcp->donetail) { | ||
| 276 | RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1)); | ||
| 277 | RCU_TRACE(trace_rcu_batch_end(rcp->name, 0, | ||
| 278 | !!ACCESS_ONCE(rcp->rcucblist), | ||
| 279 | need_resched(), | ||
| 280 | is_idle_task(current), | ||
| 281 | false)); | ||
| 282 | return; | ||
| 283 | } | ||
| 284 | |||
| 285 | /* Move the ready-to-invoke callbacks to a local list. */ | ||
| 286 | local_irq_save(flags); | ||
| 287 | RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1)); | ||
| 288 | list = rcp->rcucblist; | ||
| 289 | rcp->rcucblist = *rcp->donetail; | ||
| 290 | *rcp->donetail = NULL; | ||
| 291 | if (rcp->curtail == rcp->donetail) | ||
| 292 | rcp->curtail = &rcp->rcucblist; | ||
| 293 | rcp->donetail = &rcp->rcucblist; | ||
| 294 | local_irq_restore(flags); | ||
| 295 | |||
| 296 | /* Invoke the callbacks on the local list. */ | ||
| 297 | RCU_TRACE(rn = rcp->name); | ||
| 298 | while (list) { | ||
| 299 | next = list->next; | ||
| 300 | prefetch(next); | ||
| 301 | debug_rcu_head_unqueue(list); | ||
| 302 | local_bh_disable(); | ||
| 303 | __rcu_reclaim(rn, list); | ||
| 304 | local_bh_enable(); | ||
| 305 | list = next; | ||
| 306 | RCU_TRACE(cb_count++); | ||
| 307 | } | ||
| 308 | RCU_TRACE(rcu_trace_sub_qlen(rcp, cb_count)); | ||
| 309 | RCU_TRACE(trace_rcu_batch_end(rcp->name, | ||
| 310 | cb_count, 0, need_resched(), | ||
| 311 | is_idle_task(current), | ||
| 312 | false)); | ||
| 313 | } | ||
| 314 | |||
| 315 | static void rcu_process_callbacks(struct softirq_action *unused) | ||
| 316 | { | ||
| 317 | __rcu_process_callbacks(&rcu_sched_ctrlblk); | ||
| 318 | __rcu_process_callbacks(&rcu_bh_ctrlblk); | ||
| 319 | } | ||
| 320 | |||
| 321 | /* | ||
| 322 | * Wait for a grace period to elapse. But it is illegal to invoke | ||
| 323 | * synchronize_sched() from within an RCU read-side critical section. | ||
| 324 | * Therefore, any legal call to synchronize_sched() is a quiescent | ||
| 325 | * state, and so on a UP system, synchronize_sched() need do nothing. | ||
| 326 | * Ditto for synchronize_rcu_bh(). (But Lai Jiangshan points out the | ||
| 327 | * benefits of doing might_sleep() to reduce latency.) | ||
| 328 | * | ||
| 329 | * Cool, huh? (Due to Josh Triplett.) | ||
| 330 | * | ||
| 331 | * But we want to make this a static inline later. The cond_resched() | ||
| 332 | * currently makes this problematic. | ||
| 333 | */ | ||
| 334 | void synchronize_sched(void) | ||
| 335 | { | ||
| 336 | rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && | ||
| 337 | !lock_is_held(&rcu_lock_map) && | ||
| 338 | !lock_is_held(&rcu_sched_lock_map), | ||
| 339 | "Illegal synchronize_sched() in RCU read-side critical section"); | ||
| 340 | cond_resched(); | ||
| 341 | } | ||
| 342 | EXPORT_SYMBOL_GPL(synchronize_sched); | ||
| 343 | |||
| 344 | /* | ||
| 345 | * Helper function for call_rcu() and call_rcu_bh(). | ||
| 346 | */ | ||
| 347 | static void __call_rcu(struct rcu_head *head, | ||
| 348 | void (*func)(struct rcu_head *rcu), | ||
| 349 | struct rcu_ctrlblk *rcp) | ||
| 350 | { | ||
| 351 | unsigned long flags; | ||
| 352 | |||
| 353 | debug_rcu_head_queue(head); | ||
| 354 | head->func = func; | ||
| 355 | head->next = NULL; | ||
| 356 | |||
| 357 | local_irq_save(flags); | ||
| 358 | *rcp->curtail = head; | ||
| 359 | rcp->curtail = &head->next; | ||
| 360 | RCU_TRACE(rcp->qlen++); | ||
| 361 | local_irq_restore(flags); | ||
| 362 | } | ||
| 363 | |||
| 364 | /* | ||
| 365 | * Post an RCU callback to be invoked after the end of an RCU-sched grace | ||
| 366 | * period. But since we have but one CPU, that would be after any | ||
| 367 | * quiescent state. | ||
| 368 | */ | ||
| 369 | void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | ||
| 370 | { | ||
| 371 | __call_rcu(head, func, &rcu_sched_ctrlblk); | ||
| 372 | } | ||
| 373 | EXPORT_SYMBOL_GPL(call_rcu_sched); | ||
| 374 | |||
| 375 | /* | ||
| 376 | * Post an RCU bottom-half callback to be invoked after any subsequent | ||
| 377 | * quiescent state. | ||
| 378 | */ | ||
| 379 | void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | ||
| 380 | { | ||
| 381 | __call_rcu(head, func, &rcu_bh_ctrlblk); | ||
| 382 | } | ||
| 383 | EXPORT_SYMBOL_GPL(call_rcu_bh); | ||
| 384 | |||
| 385 | void rcu_init(void) | ||
| 386 | { | ||
| 387 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); | ||
| 388 | } | ||
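
A hypothetical caller of the Tiny RCU interface above might look like the sketch below: an element is unlinked from a reader-visible list under a lock, and its memory is reclaimed only after a grace period via call_rcu_sched(). It is not part of this patch; struct my_item, my_list and my_list_lock are invented names, and on a !PREEMPT TINY_RCU kernel the read-side primitives compile down to almost nothing, with the quiescent states reported by rcu_sched_qs()/rcu_bh_qs() above.

#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>

struct my_item {
	struct list_head node;
	int key;
	struct rcu_head rh;
};

static LIST_HEAD(my_list);
static DEFINE_SPINLOCK(my_list_lock);

static void my_item_free_cb(struct rcu_head *rh)
{
	kfree(container_of(rh, struct my_item, rh));
}

/* Reader side: no locks, no sleeping; matches the rcu_sched flavor above. */
static bool my_list_contains(int key)
{
	struct my_item *p;
	bool found = false;

	rcu_read_lock();
	list_for_each_entry_rcu(p, &my_list, node) {
		if (p->key == key) {
			found = true;
			break;
		}
	}
	rcu_read_unlock();
	return found;
}

/* Updater side: unlink under the lock, defer the kfree() past a grace period. */
static void my_list_del(struct my_item *p)
{
	spin_lock(&my_list_lock);
	list_del_rcu(&p->node);
	spin_unlock(&my_list_lock);
	call_rcu_sched(&p->rh, my_item_free_cb);
}

If the readers ran in softirq context instead, the updater would use call_rcu_bh() so that the rcu_bh_ctrlblk state machine tracked the grace period.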
diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h
new file mode 100644
index 000000000000..280d06cae352
--- /dev/null
+++ b/kernel/rcu/tiny_plugin.h
@@ -0,0 +1,174 @@
| 1 | /* | ||
| 2 | * Read-Copy Update mechanism for mutual exclusion, the Bloatwatch edition | ||
| 3 | * Internal non-public definitions that provide either classic | ||
| 4 | * or preemptible semantics. | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify | ||
| 7 | * it under the terms of the GNU General Public License as published by | ||
| 8 | * the Free Software Foundation; either version 2 of the License, or | ||
| 9 | * (at your option) any later version. | ||
| 10 | * | ||
| 11 | * This program is distributed in the hope that it will be useful, | ||
| 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 14 | * GNU General Public License for more details. | ||
| 15 | * | ||
| 16 | * You should have received a copy of the GNU General Public License | ||
| 17 | * along with this program; if not, write to the Free Software | ||
| 18 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
| 19 | * | ||
| 20 | * Copyright (c) 2010 Linaro | ||
| 21 | * | ||
| 22 | * Author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> | ||
| 23 | */ | ||
| 24 | |||
| 25 | #include <linux/kthread.h> | ||
| 26 | #include <linux/module.h> | ||
| 27 | #include <linux/debugfs.h> | ||
| 28 | #include <linux/seq_file.h> | ||
| 29 | |||
| 30 | /* Global control variables for rcupdate callback mechanism. */ | ||
| 31 | struct rcu_ctrlblk { | ||
| 32 | struct rcu_head *rcucblist; /* List of pending callbacks (CBs). */ | ||
| 33 | struct rcu_head **donetail; /* ->next pointer of last "done" CB. */ | ||
| 34 | struct rcu_head **curtail; /* ->next pointer of last CB. */ | ||
| 35 | RCU_TRACE(long qlen); /* Number of pending CBs. */ | ||
| 36 | RCU_TRACE(unsigned long gp_start); /* Start time for stalls. */ | ||
| 37 | RCU_TRACE(unsigned long ticks_this_gp); /* Statistic for stalls. */ | ||
| 38 | RCU_TRACE(unsigned long jiffies_stall); /* Jiffies at next stall. */ | ||
| 39 | RCU_TRACE(const char *name); /* Name of RCU type. */ | ||
| 40 | }; | ||
| 41 | |||
| 42 | /* Definition for rcupdate control block. */ | ||
| 43 | static struct rcu_ctrlblk rcu_sched_ctrlblk = { | ||
| 44 | .donetail = &rcu_sched_ctrlblk.rcucblist, | ||
| 45 | .curtail = &rcu_sched_ctrlblk.rcucblist, | ||
| 46 | RCU_TRACE(.name = "rcu_sched") | ||
| 47 | }; | ||
| 48 | |||
| 49 | static struct rcu_ctrlblk rcu_bh_ctrlblk = { | ||
| 50 | .donetail = &rcu_bh_ctrlblk.rcucblist, | ||
| 51 | .curtail = &rcu_bh_ctrlblk.rcucblist, | ||
| 52 | RCU_TRACE(.name = "rcu_bh") | ||
| 53 | }; | ||
| 54 | |||
| 55 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
| 56 | #include <linux/kernel_stat.h> | ||
| 57 | |||
| 58 | int rcu_scheduler_active __read_mostly; | ||
| 59 | EXPORT_SYMBOL_GPL(rcu_scheduler_active); | ||
| 60 | |||
| 61 | /* | ||
| 62 | * During boot, we forgive RCU lockdep issues. After this function is | ||
| 63 | * invoked, we start taking RCU lockdep issues seriously. | ||
| 64 | */ | ||
| 65 | void __init rcu_scheduler_starting(void) | ||
| 66 | { | ||
| 67 | WARN_ON(nr_context_switches() > 0); | ||
| 68 | rcu_scheduler_active = 1; | ||
| 69 | } | ||
| 70 | |||
| 71 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
| 72 | |||
| 73 | #ifdef CONFIG_RCU_TRACE | ||
| 74 | |||
| 75 | static void rcu_trace_sub_qlen(struct rcu_ctrlblk *rcp, int n) | ||
| 76 | { | ||
| 77 | unsigned long flags; | ||
| 78 | |||
| 79 | local_irq_save(flags); | ||
| 80 | rcp->qlen -= n; | ||
| 81 | local_irq_restore(flags); | ||
| 82 | } | ||
| 83 | |||
| 84 | /* | ||
| 85 | * Dump statistics for TINY_RCU, such as they are. | ||
| 86 | */ | ||
| 87 | static int show_tiny_stats(struct seq_file *m, void *unused) | ||
| 88 | { | ||
| 89 | seq_printf(m, "rcu_sched: qlen: %ld\n", rcu_sched_ctrlblk.qlen); | ||
| 90 | seq_printf(m, "rcu_bh: qlen: %ld\n", rcu_bh_ctrlblk.qlen); | ||
| 91 | return 0; | ||
| 92 | } | ||
| 93 | |||
| 94 | static int show_tiny_stats_open(struct inode *inode, struct file *file) | ||
| 95 | { | ||
| 96 | return single_open(file, show_tiny_stats, NULL); | ||
| 97 | } | ||
| 98 | |||
| 99 | static const struct file_operations show_tiny_stats_fops = { | ||
| 100 | .owner = THIS_MODULE, | ||
| 101 | .open = show_tiny_stats_open, | ||
| 102 | .read = seq_read, | ||
| 103 | .llseek = seq_lseek, | ||
| 104 | .release = single_release, | ||
| 105 | }; | ||
| 106 | |||
| 107 | static struct dentry *rcudir; | ||
| 108 | |||
| 109 | static int __init rcutiny_trace_init(void) | ||
| 110 | { | ||
| 111 | struct dentry *retval; | ||
| 112 | |||
| 113 | rcudir = debugfs_create_dir("rcu", NULL); | ||
| 114 | if (!rcudir) | ||
| 115 | goto free_out; | ||
| 116 | retval = debugfs_create_file("rcudata", 0444, rcudir, | ||
| 117 | NULL, &show_tiny_stats_fops); | ||
| 118 | if (!retval) | ||
| 119 | goto free_out; | ||
| 120 | return 0; | ||
| 121 | free_out: | ||
| 122 | debugfs_remove_recursive(rcudir); | ||
| 123 | return 1; | ||
| 124 | } | ||
| 125 | |||
| 126 | static void __exit rcutiny_trace_cleanup(void) | ||
| 127 | { | ||
| 128 | debugfs_remove_recursive(rcudir); | ||
| 129 | } | ||
| 130 | |||
| 131 | module_init(rcutiny_trace_init); | ||
| 132 | module_exit(rcutiny_trace_cleanup); | ||
| 133 | |||
| 134 | MODULE_AUTHOR("Paul E. McKenney"); | ||
| 135 | MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation"); | ||
| 136 | MODULE_LICENSE("GPL"); | ||
| 137 | |||
| 138 | static void check_cpu_stall(struct rcu_ctrlblk *rcp) | ||
| 139 | { | ||
| 140 | unsigned long j; | ||
| 141 | unsigned long js; | ||
| 142 | |||
| 143 | if (rcu_cpu_stall_suppress) | ||
| 144 | return; | ||
| 145 | rcp->ticks_this_gp++; | ||
| 146 | j = jiffies; | ||
| 147 | js = rcp->jiffies_stall; | ||
| 148 | if (*rcp->curtail && ULONG_CMP_GE(j, js)) { | ||
| 149 | pr_err("INFO: %s stall on CPU (%lu ticks this GP) idle=%llx (t=%lu jiffies q=%ld)\n", | ||
| 150 | rcp->name, rcp->ticks_this_gp, rcu_dynticks_nesting, | ||
| 151 | jiffies - rcp->gp_start, rcp->qlen); | ||
| 152 | dump_stack(); | ||
| 153 | } | ||
| 154 | if (*rcp->curtail && ULONG_CMP_GE(j, js)) | ||
| 155 | rcp->jiffies_stall = jiffies + | ||
| 156 | 3 * rcu_jiffies_till_stall_check() + 3; | ||
| 157 | else if (ULONG_CMP_GE(j, js)) | ||
| 158 | rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); | ||
| 159 | } | ||
| 160 | |||
| 161 | static void reset_cpu_stall_ticks(struct rcu_ctrlblk *rcp) | ||
| 162 | { | ||
| 163 | rcp->ticks_this_gp = 0; | ||
| 164 | rcp->gp_start = jiffies; | ||
| 165 | rcp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); | ||
| 166 | } | ||
| 167 | |||
| 168 | static void check_cpu_stalls(void) | ||
| 169 | { | ||
| 170 | RCU_TRACE(check_cpu_stall(&rcu_bh_ctrlblk)); | ||
| 171 | RCU_TRACE(check_cpu_stall(&rcu_sched_ctrlblk)); | ||
| 172 | } | ||
| 173 | |||
| 174 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
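The stall check above compares jiffies against rcp->jiffies_stall with ULONG_CMP_GE() rather than a plain ">=" so the test keeps working after the jiffies counter wraps. Below is a minimal userspace sketch of that half-range idiom (not kernel code; the macro shown mirrors the common form and is only an assumption about the exact in-kernel definition):

#include <limits.h>
#include <stdio.h>

/* Mirrors the usual half-range idiom; the in-kernel macro may differ in detail. */
#define ULONG_CMP_GE(a, b)	(ULONG_MAX / 2 >= (a) - (b))

int main(void)
{
	unsigned long jiffies_stall = ULONG_MAX - 5;	/* deadline set just before the wrap */
	unsigned long now = 10;				/* the counter has since wrapped */

	/* Prints 1: the deadline is seen as expired despite the wrap. */
	printf("stall deadline expired: %d\n", ULONG_CMP_GE(now, jiffies_stall));
	return 0;
}

Because the subtraction is done in unsigned arithmetic, a deadline armed shortly before the wrap is still recognized as expired afterwards, which is exactly what the stall-warning rearm logic relies on.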
diff --git a/kernel/rcu/torture.c b/kernel/rcu/torture.c new file mode 100644 index 000000000000..3929cd451511 --- /dev/null +++ b/kernel/rcu/torture.c | |||
| @@ -0,0 +1,2145 @@ | |||
| 1 | /* | ||
| 2 | * Read-Copy Update module-based torture test facility | ||
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or modify | ||
| 5 | * it under the terms of the GNU General Public License as published by | ||
| 6 | * the Free Software Foundation; either version 2 of the License, or | ||
| 7 | * (at your option) any later version. | ||
| 8 | * | ||
| 9 | * This program is distributed in the hope that it will be useful, | ||
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 12 | * GNU General Public License for more details. | ||
| 13 | * | ||
| 14 | * You should have received a copy of the GNU General Public License | ||
| 15 | * along with this program; if not, write to the Free Software | ||
| 16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
| 17 | * | ||
| 18 | * Copyright (C) IBM Corporation, 2005, 2006 | ||
| 19 | * | ||
| 20 | * Authors: Paul E. McKenney <paulmck@us.ibm.com> | ||
| 21 | * Josh Triplett <josh@freedesktop.org> | ||
| 22 | * | ||
| 23 | * See also: Documentation/RCU/torture.txt | ||
| 24 | */ | ||
| 25 | #include <linux/types.h> | ||
| 26 | #include <linux/kernel.h> | ||
| 27 | #include <linux/init.h> | ||
| 28 | #include <linux/module.h> | ||
| 29 | #include <linux/kthread.h> | ||
| 30 | #include <linux/err.h> | ||
| 31 | #include <linux/spinlock.h> | ||
| 32 | #include <linux/smp.h> | ||
| 33 | #include <linux/rcupdate.h> | ||
| 34 | #include <linux/interrupt.h> | ||
| 35 | #include <linux/sched.h> | ||
| 36 | #include <linux/atomic.h> | ||
| 37 | #include <linux/bitops.h> | ||
| 38 | #include <linux/completion.h> | ||
| 39 | #include <linux/moduleparam.h> | ||
| 40 | #include <linux/percpu.h> | ||
| 41 | #include <linux/notifier.h> | ||
| 42 | #include <linux/reboot.h> | ||
| 43 | #include <linux/freezer.h> | ||
| 44 | #include <linux/cpu.h> | ||
| 45 | #include <linux/delay.h> | ||
| 46 | #include <linux/stat.h> | ||
| 47 | #include <linux/srcu.h> | ||
| 48 | #include <linux/slab.h> | ||
| 49 | #include <linux/trace_clock.h> | ||
| 50 | #include <asm/byteorder.h> | ||
| 51 | |||
| 52 | MODULE_LICENSE("GPL"); | ||
| 53 | MODULE_AUTHOR("Paul E. McKenney <paulmck@us.ibm.com> and Josh Triplett <josh@freedesktop.org>"); | ||
| 54 | |||
| 55 | MODULE_ALIAS("rcutorture"); | ||
| 56 | #ifdef MODULE_PARAM_PREFIX | ||
| 57 | #undef MODULE_PARAM_PREFIX | ||
| 58 | #endif | ||
| 59 | #define MODULE_PARAM_PREFIX "rcutorture." | ||
| 60 | |||
| 61 | static int fqs_duration; | ||
| 62 | module_param(fqs_duration, int, 0444); | ||
| 63 | MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us), 0 to disable"); | ||
| 64 | static int fqs_holdoff; | ||
| 65 | module_param(fqs_holdoff, int, 0444); | ||
| 66 | MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); | ||
| 67 | static int fqs_stutter = 3; | ||
| 68 | module_param(fqs_stutter, int, 0444); | ||
| 69 | MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); | ||
| 70 | static bool gp_exp; | ||
| 71 | module_param(gp_exp, bool, 0444); | ||
| 72 | MODULE_PARM_DESC(gp_exp, "Use expedited GP wait primitives"); | ||
| 73 | static bool gp_normal; | ||
| 74 | module_param(gp_normal, bool, 0444); | ||
| 75 | MODULE_PARM_DESC(gp_normal, "Use normal (non-expedited) GP wait primitives"); | ||
| 76 | static int irqreader = 1; | ||
| 77 | module_param(irqreader, int, 0444); | ||
| 78 | MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); | ||
| 79 | static int n_barrier_cbs; | ||
| 80 | module_param(n_barrier_cbs, int, 0444); | ||
| 81 | MODULE_PARM_DESC(n_barrier_cbs, "# of callbacks/kthreads for barrier testing"); | ||
| 82 | static int nfakewriters = 4; | ||
| 83 | module_param(nfakewriters, int, 0444); | ||
| 84 | MODULE_PARM_DESC(nfakewriters, "Number of RCU fake writer threads"); | ||
| 85 | static int nreaders = -1; | ||
| 86 | module_param(nreaders, int, 0444); | ||
| 87 | MODULE_PARM_DESC(nreaders, "Number of RCU reader threads"); | ||
| 88 | static int object_debug; | ||
| 89 | module_param(object_debug, int, 0444); | ||
| 90 | MODULE_PARM_DESC(object_debug, "Enable debug-object double call_rcu() testing"); | ||
| 91 | static int onoff_holdoff; | ||
| 92 | module_param(onoff_holdoff, int, 0444); | ||
| 93 | MODULE_PARM_DESC(onoff_holdoff, "Time after boot before CPU hotplugs (s)"); | ||
| 94 | static int onoff_interval; | ||
| 95 | module_param(onoff_interval, int, 0444); | ||
| 96 | MODULE_PARM_DESC(onoff_interval, "Time between CPU hotplugs (s), 0=disable"); | ||
| 97 | static int shuffle_interval = 3; | ||
| 98 | module_param(shuffle_interval, int, 0444); | ||
| 99 | MODULE_PARM_DESC(shuffle_interval, "Number of seconds between shuffles"); | ||
| 100 | static int shutdown_secs; | ||
| 101 | module_param(shutdown_secs, int, 0444); | ||
| 102 | MODULE_PARM_DESC(shutdown_secs, "Shutdown time (s), <= zero to disable."); | ||
| 103 | static int stall_cpu; | ||
| 104 | module_param(stall_cpu, int, 0444); | ||
| 105 | MODULE_PARM_DESC(stall_cpu, "Stall duration (s), zero to disable."); | ||
| 106 | static int stall_cpu_holdoff = 10; | ||
| 107 | module_param(stall_cpu_holdoff, int, 0444); | ||
| 108 | MODULE_PARM_DESC(stall_cpu_holdoff, "Time to wait before starting stall (s)."); | ||
| 109 | static int stat_interval = 60; | ||
| 110 | module_param(stat_interval, int, 0644); | ||
| 111 | MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s"); | ||
| 112 | static int stutter = 5; | ||
| 113 | module_param(stutter, int, 0444); | ||
| 114 | MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); | ||
| 115 | static int test_boost = 1; | ||
| 116 | module_param(test_boost, int, 0444); | ||
| 117 | MODULE_PARM_DESC(test_boost, "Test RCU prio boost: 0=no, 1=maybe, 2=yes."); | ||
| 118 | static int test_boost_duration = 4; | ||
| 119 | module_param(test_boost_duration, int, 0444); | ||
| 120 | MODULE_PARM_DESC(test_boost_duration, "Duration of each boost test, seconds."); | ||
| 121 | static int test_boost_interval = 7; | ||
| 122 | module_param(test_boost_interval, int, 0444); | ||
| 123 | MODULE_PARM_DESC(test_boost_interval, "Interval between boost tests, seconds."); | ||
| 124 | static bool test_no_idle_hz = true; | ||
| 125 | module_param(test_no_idle_hz, bool, 0444); | ||
| 126 | MODULE_PARM_DESC(test_no_idle_hz, "Test support for tickless idle CPUs"); | ||
| 127 | static char *torture_type = "rcu"; | ||
| 128 | module_param(torture_type, charp, 0444); | ||
| 129 | MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, ...)"); | ||
| 130 | static bool verbose; | ||
| 131 | module_param(verbose, bool, 0444); | ||
| 132 | MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s"); | ||
| 133 | |||
| 134 | #define TORTURE_FLAG "-torture:" | ||
| 135 | #define PRINTK_STRING(s) \ | ||
| 136 | do { pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) | ||
| 137 | #define VERBOSE_PRINTK_STRING(s) \ | ||
| 138 | do { if (verbose) pr_alert("%s" TORTURE_FLAG s "\n", torture_type); } while (0) | ||
| 139 | #define VERBOSE_PRINTK_ERRSTRING(s) \ | ||
| 140 | do { if (verbose) pr_alert("%s" TORTURE_FLAG "!!! " s "\n", torture_type); } while (0) | ||
| 141 | |||
| 142 | static char printk_buf[4096]; | ||
| 143 | |||
| 144 | static int nrealreaders; | ||
| 145 | static struct task_struct *writer_task; | ||
| 146 | static struct task_struct **fakewriter_tasks; | ||
| 147 | static struct task_struct **reader_tasks; | ||
| 148 | static struct task_struct *stats_task; | ||
| 149 | static struct task_struct *shuffler_task; | ||
| 150 | static struct task_struct *stutter_task; | ||
| 151 | static struct task_struct *fqs_task; | ||
| 152 | static struct task_struct *boost_tasks[NR_CPUS]; | ||
| 153 | static struct task_struct *shutdown_task; | ||
| 154 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 155 | static struct task_struct *onoff_task; | ||
| 156 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
| 157 | static struct task_struct *stall_task; | ||
| 158 | static struct task_struct **barrier_cbs_tasks; | ||
| 159 | static struct task_struct *barrier_task; | ||
| 160 | |||
| 161 | #define RCU_TORTURE_PIPE_LEN 10 | ||
| 162 | |||
| 163 | struct rcu_torture { | ||
| 164 | struct rcu_head rtort_rcu; | ||
| 165 | int rtort_pipe_count; | ||
| 166 | struct list_head rtort_free; | ||
| 167 | int rtort_mbtest; | ||
| 168 | }; | ||
| 169 | |||
| 170 | static LIST_HEAD(rcu_torture_freelist); | ||
| 171 | static struct rcu_torture __rcu *rcu_torture_current; | ||
| 172 | static unsigned long rcu_torture_current_version; | ||
| 173 | static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; | ||
| 174 | static DEFINE_SPINLOCK(rcu_torture_lock); | ||
| 175 | static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = | ||
| 176 | { 0 }; | ||
| 177 | static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = | ||
| 178 | { 0 }; | ||
| 179 | static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; | ||
| 180 | static atomic_t n_rcu_torture_alloc; | ||
| 181 | static atomic_t n_rcu_torture_alloc_fail; | ||
| 182 | static atomic_t n_rcu_torture_free; | ||
| 183 | static atomic_t n_rcu_torture_mberror; | ||
| 184 | static atomic_t n_rcu_torture_error; | ||
| 185 | static long n_rcu_torture_barrier_error; | ||
| 186 | static long n_rcu_torture_boost_ktrerror; | ||
| 187 | static long n_rcu_torture_boost_rterror; | ||
| 188 | static long n_rcu_torture_boost_failure; | ||
| 189 | static long n_rcu_torture_boosts; | ||
| 190 | static long n_rcu_torture_timers; | ||
| 191 | static long n_offline_attempts; | ||
| 192 | static long n_offline_successes; | ||
| 193 | static unsigned long sum_offline; | ||
| 194 | static int min_offline = -1; | ||
| 195 | static int max_offline; | ||
| 196 | static long n_online_attempts; | ||
| 197 | static long n_online_successes; | ||
| 198 | static unsigned long sum_online; | ||
| 199 | static int min_online = -1; | ||
| 200 | static int max_online; | ||
| 201 | static long n_barrier_attempts; | ||
| 202 | static long n_barrier_successes; | ||
| 203 | static struct list_head rcu_torture_removed; | ||
| 204 | static cpumask_var_t shuffle_tmp_mask; | ||
| 205 | |||
| 206 | static int stutter_pause_test; | ||
| 207 | |||
| 208 | #if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) | ||
| 209 | #define RCUTORTURE_RUNNABLE_INIT 1 | ||
| 210 | #else | ||
| 211 | #define RCUTORTURE_RUNNABLE_INIT 0 | ||
| 212 | #endif | ||
| 213 | int rcutorture_runnable = RCUTORTURE_RUNNABLE_INIT; | ||
| 214 | module_param(rcutorture_runnable, int, 0444); | ||
| 215 | MODULE_PARM_DESC(rcutorture_runnable, "Start rcutorture at boot"); | ||
| 216 | |||
| 217 | #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) | ||
| 218 | #define rcu_can_boost() 1 | ||
| 219 | #else /* #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ | ||
| 220 | #define rcu_can_boost() 0 | ||
| 221 | #endif /* #else #if defined(CONFIG_RCU_BOOST) && !defined(CONFIG_HOTPLUG_CPU) */ | ||
| 222 | |||
| 223 | #ifdef CONFIG_RCU_TRACE | ||
| 224 | static u64 notrace rcu_trace_clock_local(void) | ||
| 225 | { | ||
| 226 | u64 ts = trace_clock_local(); | ||
| 227 | unsigned long __maybe_unused ts_rem = do_div(ts, NSEC_PER_USEC); | ||
| 228 | return ts; | ||
| 229 | } | ||
| 230 | #else /* #ifdef CONFIG_RCU_TRACE */ | ||
| 231 | static u64 notrace rcu_trace_clock_local(void) | ||
| 232 | { | ||
| 233 | return 0ULL; | ||
| 234 | } | ||
| 235 | #endif /* #else #ifdef CONFIG_RCU_TRACE */ | ||
| 236 | |||
| 237 | static unsigned long shutdown_time; /* jiffies to system shutdown. */ | ||
| 238 | static unsigned long boost_starttime; /* jiffies of next boost test start. */ | ||
| 239 | DEFINE_MUTEX(boost_mutex); /* protect setting boost_starttime */ | ||
| 240 | /* and boost task create/destroy. */ | ||
| 241 | static atomic_t barrier_cbs_count; /* Barrier callbacks registered. */ | ||
| 242 | static bool barrier_phase; /* Test phase. */ | ||
| 243 | static atomic_t barrier_cbs_invoked; /* Barrier callbacks invoked. */ | ||
| 244 | static wait_queue_head_t *barrier_cbs_wq; /* Coordinate barrier testing. */ | ||
| 245 | static DECLARE_WAIT_QUEUE_HEAD(barrier_wq); | ||
| 246 | |||
| 247 | /* Mediate rmmod and system shutdown. Concurrent rmmod & shutdown illegal! */ | ||
| 248 | |||
| 249 | #define FULLSTOP_DONTSTOP 0 /* Normal operation. */ | ||
| 250 | #define FULLSTOP_SHUTDOWN 1 /* System shutdown with rcutorture running. */ | ||
| 251 | #define FULLSTOP_RMMOD 2 /* Normal rmmod of rcutorture. */ | ||
| 252 | static int fullstop = FULLSTOP_RMMOD; | ||
| 253 | /* | ||
| 254 | * Protect fullstop transitions and spawning of kthreads. | ||
| 255 | */ | ||
| 256 | static DEFINE_MUTEX(fullstop_mutex); | ||
| 257 | |||
| 258 | /* Forward reference. */ | ||
| 259 | static void rcu_torture_cleanup(void); | ||
| 260 | |||
| 261 | /* | ||
| 262 | * Detect and respond to a system shutdown. | ||
| 263 | */ | ||
| 264 | static int | ||
| 265 | rcutorture_shutdown_notify(struct notifier_block *unused1, | ||
| 266 | unsigned long unused2, void *unused3) | ||
| 267 | { | ||
| 268 | mutex_lock(&fullstop_mutex); | ||
| 269 | if (fullstop == FULLSTOP_DONTSTOP) | ||
| 270 | fullstop = FULLSTOP_SHUTDOWN; | ||
| 271 | else | ||
| 272 | pr_warn(/* but going down anyway, so... */ | ||
| 273 | "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); | ||
| 274 | mutex_unlock(&fullstop_mutex); | ||
| 275 | return NOTIFY_DONE; | ||
| 276 | } | ||
| 277 | |||
| 278 | /* | ||
| 279 | * Absorb kthreads into a kernel function that won't return, so that | ||
| 280 | * they won't ever access module text or data again. | ||
| 281 | */ | ||
| 282 | static void rcutorture_shutdown_absorb(const char *title) | ||
| 283 | { | ||
| 284 | if (ACCESS_ONCE(fullstop) == FULLSTOP_SHUTDOWN) { | ||
| 285 | pr_notice( | ||
| 286 | "rcutorture thread %s parking due to system shutdown\n", | ||
| 287 | title); | ||
| 288 | schedule_timeout_uninterruptible(MAX_SCHEDULE_TIMEOUT); | ||
| 289 | } | ||
| 290 | } | ||
| 291 | |||
| 292 | /* | ||
| 293 | * Allocate an element from the rcu_tortures pool. | ||
| 294 | */ | ||
| 295 | static struct rcu_torture * | ||
| 296 | rcu_torture_alloc(void) | ||
| 297 | { | ||
| 298 | struct list_head *p; | ||
| 299 | |||
| 300 | spin_lock_bh(&rcu_torture_lock); | ||
| 301 | if (list_empty(&rcu_torture_freelist)) { | ||
| 302 | atomic_inc(&n_rcu_torture_alloc_fail); | ||
| 303 | spin_unlock_bh(&rcu_torture_lock); | ||
| 304 | return NULL; | ||
| 305 | } | ||
| 306 | atomic_inc(&n_rcu_torture_alloc); | ||
| 307 | p = rcu_torture_freelist.next; | ||
| 308 | list_del_init(p); | ||
| 309 | spin_unlock_bh(&rcu_torture_lock); | ||
| 310 | return container_of(p, struct rcu_torture, rtort_free); | ||
| 311 | } | ||
| 312 | |||
| 313 | /* | ||
| 314 | * Free an element to the rcu_tortures pool. | ||
| 315 | */ | ||
| 316 | static void | ||
| 317 | rcu_torture_free(struct rcu_torture *p) | ||
| 318 | { | ||
| 319 | atomic_inc(&n_rcu_torture_free); | ||
| 320 | spin_lock_bh(&rcu_torture_lock); | ||
| 321 | list_add_tail(&p->rtort_free, &rcu_torture_freelist); | ||
| 322 | spin_unlock_bh(&rcu_torture_lock); | ||
| 323 | } | ||
| 324 | |||
| 325 | struct rcu_random_state { | ||
| 326 | unsigned long rrs_state; | ||
| 327 | long rrs_count; | ||
| 328 | }; | ||
| 329 | |||
| 330 | #define RCU_RANDOM_MULT 39916801 /* prime */ | ||
| 331 | #define RCU_RANDOM_ADD 479001701 /* prime */ | ||
| 332 | #define RCU_RANDOM_REFRESH 10000 | ||
| 333 | |||
| 334 | #define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 } | ||
| 335 | |||
| 336 | /* | ||
| 337 | * Crude but fast random-number generator. Uses a linear congruential | ||
| 338 | * generator, with occasional help from cpu_clock(). | ||
| 339 | */ | ||
| 340 | static unsigned long | ||
| 341 | rcu_random(struct rcu_random_state *rrsp) | ||
| 342 | { | ||
| 343 | if (--rrsp->rrs_count < 0) { | ||
| 344 | rrsp->rrs_state += (unsigned long)local_clock(); | ||
| 345 | rrsp->rrs_count = RCU_RANDOM_REFRESH; | ||
| 346 | } | ||
| 347 | rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD; | ||
| 348 | return swahw32(rrsp->rrs_state); | ||
| 349 | } | ||
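The generator above is intentionally crude: a linear congruential step whose result is post-processed by swahw32(), presumably so that the better-behaved high-order bits of the low word end up where callers' modulus operations consume them. A standalone sketch of the same recurrence (userspace C, not kernel code; the halfword swap stands in for swahw32(), and the periodic reseed from local_clock() is omitted):

#include <stdint.h>
#include <stdio.h>

#define RCU_RANDOM_MULT	39916801	/* prime, as above */
#define RCU_RANDOM_ADD	479001701	/* prime, as above */

/* Stand-in for swahw32(): swap the 16-bit halves of a 32-bit word. */
static uint32_t swap_halfwords(uint32_t x)
{
	return (x << 16) | (x >> 16);
}

int main(void)
{
	unsigned long state = 42;	/* plays the role of ->rrs_state */
	int i;

	for (i = 0; i < 4; i++) {
		state = state * RCU_RANDOM_MULT + RCU_RANDOM_ADD;
		printf("%u\n", swap_halfwords((uint32_t)state));
	}
	return 0;
}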
| 350 | |||
| 351 | static void | ||
| 352 | rcu_stutter_wait(const char *title) | ||
| 353 | { | ||
| 354 | while (stutter_pause_test || !rcutorture_runnable) { | ||
| 355 | if (rcutorture_runnable) | ||
| 356 | schedule_timeout_interruptible(1); | ||
| 357 | else | ||
| 358 | schedule_timeout_interruptible(round_jiffies_relative(HZ)); | ||
| 359 | rcutorture_shutdown_absorb(title); | ||
| 360 | } | ||
| 361 | } | ||
| 362 | |||
| 363 | /* | ||
| 364 | * Operations vector for selecting different types of tests. | ||
| 365 | */ | ||
| 366 | |||
| 367 | struct rcu_torture_ops { | ||
| 368 | void (*init)(void); | ||
| 369 | int (*readlock)(void); | ||
| 370 | void (*read_delay)(struct rcu_random_state *rrsp); | ||
| 371 | void (*readunlock)(int idx); | ||
| 372 | int (*completed)(void); | ||
| 373 | void (*deferred_free)(struct rcu_torture *p); | ||
| 374 | void (*sync)(void); | ||
| 375 | void (*exp_sync)(void); | ||
| 376 | void (*call)(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); | ||
| 377 | void (*cb_barrier)(void); | ||
| 378 | void (*fqs)(void); | ||
| 379 | int (*stats)(char *page); | ||
| 380 | int irq_capable; | ||
| 381 | int can_boost; | ||
| 382 | const char *name; | ||
| 383 | }; | ||
| 384 | |||
| 385 | static struct rcu_torture_ops *cur_ops; | ||
| 386 | |||
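The operations vector lets one test engine drive any RCU flavor: each flavor fills in a struct rcu_torture_ops, and the rest of the module calls only through cur_ops. An illustrative standalone sketch of that dispatch pattern (not taken from the kernel; the names below are invented for the example):

#include <stdio.h>

/* Invented example types; not the kernel's struct rcu_torture_ops. */
struct flavor_ops {
	int  (*readlock)(void);
	void (*readunlock)(int idx);
	const char *name;
};

static int demo_readlock(void)
{
	printf("enter read-side critical section\n");
	return 0;
}

static void demo_readunlock(int idx)
{
	printf("exit read-side critical section (idx %d)\n", idx);
}

static struct flavor_ops demo_ops = {
	.readlock	= demo_readlock,
	.readunlock	= demo_readunlock,
	.name		= "demo",
};

/* Selected once, e.g. from a "torture_type" string, then used everywhere. */
static struct flavor_ops *cur_ops = &demo_ops;

int main(void)
{
	int idx = cur_ops->readlock();	/* flavor-independent call site */

	cur_ops->readunlock(idx);
	return 0;
}

Operations a flavor does not support are left NULL, and the call sites check for that before invoking them (as the .stats and .cb_barrier uses further down do).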
| 387 | /* | ||
| 388 | * Definitions for rcu torture testing. | ||
| 389 | */ | ||
| 390 | |||
| 391 | static int rcu_torture_read_lock(void) __acquires(RCU) | ||
| 392 | { | ||
| 393 | rcu_read_lock(); | ||
| 394 | return 0; | ||
| 395 | } | ||
| 396 | |||
| 397 | static void rcu_read_delay(struct rcu_random_state *rrsp) | ||
| 398 | { | ||
| 399 | const unsigned long shortdelay_us = 200; | ||
| 400 | const unsigned long longdelay_ms = 50; | ||
| 401 | |||
| 402 | /* We want a short delay sometimes to make a reader delay the grace | ||
| 403 | * period, and we want a long delay occasionally to trigger | ||
| 404 | * force_quiescent_state. */ | ||
| 405 | |||
| 406 | if (!(rcu_random(rrsp) % (nrealreaders * 2000 * longdelay_ms))) | ||
| 407 | mdelay(longdelay_ms); | ||
| 408 | if (!(rcu_random(rrsp) % (nrealreaders * 2 * shortdelay_us))) | ||
| 409 | udelay(shortdelay_us); | ||
| 410 | #ifdef CONFIG_PREEMPT | ||
| 411 | if (!preempt_count() && !(rcu_random(rrsp) % (nrealreaders * 20000))) | ||
| 412 | preempt_schedule(); /* No QS if preempt_disable() in effect */ | ||
| 413 | #endif | ||
| 414 | } | ||
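The divisors in the two modulus tests above set how rarely each delay fires: a "% N" test on a roughly uniform random value succeeds about once every N calls, so the long 50 ms delay is far rarer than the 200 us one. A small sketch of that arithmetic (userspace C, not kernel code; the nrealreaders value is only an example):

#include <stdio.h>

int main(void)
{
	long nrealreaders = 4;		/* example value only */
	long longdelay_ms = 50;
	long shortdelay_us = 200;

	/* "x % N == 0" on a roughly uniform x succeeds about once per N calls. */
	printf("50 ms delay:  roughly 1 in %ld reads\n",
	       nrealreaders * 2000 * longdelay_ms);
	printf("200 us delay: roughly 1 in %ld reads\n",
	       nrealreaders * 2 * shortdelay_us);
	return 0;
}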
| 415 | |||
| 416 | static void rcu_torture_read_unlock(int idx) __releases(RCU) | ||
| 417 | { | ||
| 418 | rcu_read_unlock(); | ||
| 419 | } | ||
| 420 | |||
| 421 | static int rcu_torture_completed(void) | ||
| 422 | { | ||
| 423 | return rcu_batches_completed(); | ||
| 424 | } | ||
| 425 | |||
| 426 | static void | ||
| 427 | rcu_torture_cb(struct rcu_head *p) | ||
| 428 | { | ||
| 429 | int i; | ||
| 430 | struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu); | ||
| 431 | |||
| 432 | if (fullstop != FULLSTOP_DONTSTOP) { | ||
| 433 | /* Test is ending, just drop callbacks on the floor. */ | ||
| 434 | /* The next initialization will pick up the pieces. */ | ||
| 435 | return; | ||
| 436 | } | ||
| 437 | i = rp->rtort_pipe_count; | ||
| 438 | if (i > RCU_TORTURE_PIPE_LEN) | ||
| 439 | i = RCU_TORTURE_PIPE_LEN; | ||
| 440 | atomic_inc(&rcu_torture_wcount[i]); | ||
| 441 | if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { | ||
| 442 | rp->rtort_mbtest = 0; | ||
| 443 | rcu_torture_free(rp); | ||
| 444 | } else { | ||
| 445 | cur_ops->deferred_free(rp); | ||
| 446 | } | ||
| 447 | } | ||
| 448 | |||
| 449 | static int rcu_no_completed(void) | ||
| 450 | { | ||
| 451 | return 0; | ||
| 452 | } | ||
| 453 | |||
| 454 | static void rcu_torture_deferred_free(struct rcu_torture *p) | ||
| 455 | { | ||
| 456 | call_rcu(&p->rtort_rcu, rcu_torture_cb); | ||
| 457 | } | ||
| 458 | |||
| 459 | static void rcu_sync_torture_init(void) | ||
| 460 | { | ||
| 461 | INIT_LIST_HEAD(&rcu_torture_removed); | ||
| 462 | } | ||
| 463 | |||
| 464 | static struct rcu_torture_ops rcu_ops = { | ||
| 465 | .init = rcu_sync_torture_init, | ||
| 466 | .readlock = rcu_torture_read_lock, | ||
| 467 | .read_delay = rcu_read_delay, | ||
| 468 | .readunlock = rcu_torture_read_unlock, | ||
| 469 | .completed = rcu_torture_completed, | ||
| 470 | .deferred_free = rcu_torture_deferred_free, | ||
| 471 | .sync = synchronize_rcu, | ||
| 472 | .exp_sync = synchronize_rcu_expedited, | ||
| 473 | .call = call_rcu, | ||
| 474 | .cb_barrier = rcu_barrier, | ||
| 475 | .fqs = rcu_force_quiescent_state, | ||
| 476 | .stats = NULL, | ||
| 477 | .irq_capable = 1, | ||
| 478 | .can_boost = rcu_can_boost(), | ||
| 479 | .name = "rcu" | ||
| 480 | }; | ||
| 481 | |||
| 482 | /* | ||
| 483 | * Definitions for rcu_bh torture testing. | ||
| 484 | */ | ||
| 485 | |||
| 486 | static int rcu_bh_torture_read_lock(void) __acquires(RCU_BH) | ||
| 487 | { | ||
| 488 | rcu_read_lock_bh(); | ||
| 489 | return 0; | ||
| 490 | } | ||
| 491 | |||
| 492 | static void rcu_bh_torture_read_unlock(int idx) __releases(RCU_BH) | ||
| 493 | { | ||
| 494 | rcu_read_unlock_bh(); | ||
| 495 | } | ||
| 496 | |||
| 497 | static int rcu_bh_torture_completed(void) | ||
| 498 | { | ||
| 499 | return rcu_batches_completed_bh(); | ||
| 500 | } | ||
| 501 | |||
| 502 | static void rcu_bh_torture_deferred_free(struct rcu_torture *p) | ||
| 503 | { | ||
| 504 | call_rcu_bh(&p->rtort_rcu, rcu_torture_cb); | ||
| 505 | } | ||
| 506 | |||
| 507 | static struct rcu_torture_ops rcu_bh_ops = { | ||
| 508 | .init = rcu_sync_torture_init, | ||
| 509 | .readlock = rcu_bh_torture_read_lock, | ||
| 510 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | ||
| 511 | .readunlock = rcu_bh_torture_read_unlock, | ||
| 512 | .completed = rcu_bh_torture_completed, | ||
| 513 | .deferred_free = rcu_bh_torture_deferred_free, | ||
| 514 | .sync = synchronize_rcu_bh, | ||
| 515 | .exp_sync = synchronize_rcu_bh_expedited, | ||
| 516 | .call = call_rcu_bh, | ||
| 517 | .cb_barrier = rcu_barrier_bh, | ||
| 518 | .fqs = rcu_bh_force_quiescent_state, | ||
| 519 | .stats = NULL, | ||
| 520 | .irq_capable = 1, | ||
| 521 | .name = "rcu_bh" | ||
| 522 | }; | ||
| 523 | |||
| 524 | /* | ||
| 525 | * Definitions for srcu torture testing. | ||
| 526 | */ | ||
| 527 | |||
| 528 | DEFINE_STATIC_SRCU(srcu_ctl); | ||
| 529 | |||
| 530 | static int srcu_torture_read_lock(void) __acquires(&srcu_ctl) | ||
| 531 | { | ||
| 532 | return srcu_read_lock(&srcu_ctl); | ||
| 533 | } | ||
| 534 | |||
| 535 | static void srcu_read_delay(struct rcu_random_state *rrsp) | ||
| 536 | { | ||
| 537 | long delay; | ||
| 538 | const long uspertick = 1000000 / HZ; | ||
| 539 | const long longdelay = 10; | ||
| 540 | |||
| 541 | /* We want there to be long-running readers, but not all the time. */ | ||
| 542 | |||
| 543 | delay = rcu_random(rrsp) % (nrealreaders * 2 * longdelay * uspertick); | ||
| 544 | if (!delay) | ||
| 545 | schedule_timeout_interruptible(longdelay); | ||
| 546 | else | ||
| 547 | rcu_read_delay(rrsp); | ||
| 548 | } | ||
| 549 | |||
| 550 | static void srcu_torture_read_unlock(int idx) __releases(&srcu_ctl) | ||
| 551 | { | ||
| 552 | srcu_read_unlock(&srcu_ctl, idx); | ||
| 553 | } | ||
| 554 | |||
| 555 | static int srcu_torture_completed(void) | ||
| 556 | { | ||
| 557 | return srcu_batches_completed(&srcu_ctl); | ||
| 558 | } | ||
| 559 | |||
| 560 | static void srcu_torture_deferred_free(struct rcu_torture *rp) | ||
| 561 | { | ||
| 562 | call_srcu(&srcu_ctl, &rp->rtort_rcu, rcu_torture_cb); | ||
| 563 | } | ||
| 564 | |||
| 565 | static void srcu_torture_synchronize(void) | ||
| 566 | { | ||
| 567 | synchronize_srcu(&srcu_ctl); | ||
| 568 | } | ||
| 569 | |||
| 570 | static void srcu_torture_call(struct rcu_head *head, | ||
| 571 | void (*func)(struct rcu_head *head)) | ||
| 572 | { | ||
| 573 | call_srcu(&srcu_ctl, head, func); | ||
| 574 | } | ||
| 575 | |||
| 576 | static void srcu_torture_barrier(void) | ||
| 577 | { | ||
| 578 | srcu_barrier(&srcu_ctl); | ||
| 579 | } | ||
| 580 | |||
| 581 | static int srcu_torture_stats(char *page) | ||
| 582 | { | ||
| 583 | int cnt = 0; | ||
| 584 | int cpu; | ||
| 585 | int idx = srcu_ctl.completed & 0x1; | ||
| 586 | |||
| 587 | cnt += sprintf(&page[cnt], "%s%s per-CPU(idx=%d):", | ||
| 588 | torture_type, TORTURE_FLAG, idx); | ||
| 589 | for_each_possible_cpu(cpu) { | ||
| 590 | cnt += sprintf(&page[cnt], " %d(%lu,%lu)", cpu, | ||
| 591 | per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[!idx], | ||
| 592 | per_cpu_ptr(srcu_ctl.per_cpu_ref, cpu)->c[idx]); | ||
| 593 | } | ||
| 594 | cnt += sprintf(&page[cnt], "\n"); | ||
| 595 | return cnt; | ||
| 596 | } | ||
| 597 | |||
| 598 | static void srcu_torture_synchronize_expedited(void) | ||
| 599 | { | ||
| 600 | synchronize_srcu_expedited(&srcu_ctl); | ||
| 601 | } | ||
| 602 | |||
| 603 | static struct rcu_torture_ops srcu_ops = { | ||
| 604 | .init = rcu_sync_torture_init, | ||
| 605 | .readlock = srcu_torture_read_lock, | ||
| 606 | .read_delay = srcu_read_delay, | ||
| 607 | .readunlock = srcu_torture_read_unlock, | ||
| 608 | .completed = srcu_torture_completed, | ||
| 609 | .deferred_free = srcu_torture_deferred_free, | ||
| 610 | .sync = srcu_torture_synchronize, | ||
| 611 | .exp_sync = srcu_torture_synchronize_expedited, | ||
| 612 | .call = srcu_torture_call, | ||
| 613 | .cb_barrier = srcu_torture_barrier, | ||
| 614 | .stats = srcu_torture_stats, | ||
| 615 | .name = "srcu" | ||
| 616 | }; | ||
| 617 | |||
| 618 | /* | ||
| 619 | * Definitions for sched torture testing. | ||
| 620 | */ | ||
| 621 | |||
| 622 | static int sched_torture_read_lock(void) | ||
| 623 | { | ||
| 624 | preempt_disable(); | ||
| 625 | return 0; | ||
| 626 | } | ||
| 627 | |||
| 628 | static void sched_torture_read_unlock(int idx) | ||
| 629 | { | ||
| 630 | preempt_enable(); | ||
| 631 | } | ||
| 632 | |||
| 633 | static void rcu_sched_torture_deferred_free(struct rcu_torture *p) | ||
| 634 | { | ||
| 635 | call_rcu_sched(&p->rtort_rcu, rcu_torture_cb); | ||
| 636 | } | ||
| 637 | |||
| 638 | static struct rcu_torture_ops sched_ops = { | ||
| 639 | .init = rcu_sync_torture_init, | ||
| 640 | .readlock = sched_torture_read_lock, | ||
| 641 | .read_delay = rcu_read_delay, /* just reuse rcu's version. */ | ||
| 642 | .readunlock = sched_torture_read_unlock, | ||
| 643 | .completed = rcu_no_completed, | ||
| 644 | .deferred_free = rcu_sched_torture_deferred_free, | ||
| 645 | .sync = synchronize_sched, | ||
| 646 | .exp_sync = synchronize_sched_expedited, | ||
| 647 | .call = call_rcu_sched, | ||
| 648 | .cb_barrier = rcu_barrier_sched, | ||
| 649 | .fqs = rcu_sched_force_quiescent_state, | ||
| 650 | .stats = NULL, | ||
| 651 | .irq_capable = 1, | ||
| 652 | .name = "sched" | ||
| 653 | }; | ||
| 654 | |||
| 655 | /* | ||
| 656 | * RCU torture priority-boost testing. Runs one real-time thread per | ||
| 657 | * CPU for moderate bursts, repeatedly registering RCU callbacks and | ||
| 658 | * spinning waiting for them to be invoked. If a given callback takes | ||
| 659 | * too long to be invoked, we assume that priority inversion has occurred. | ||
| 660 | */ | ||
| 661 | |||
| 662 | struct rcu_boost_inflight { | ||
| 663 | struct rcu_head rcu; | ||
| 664 | int inflight; | ||
| 665 | }; | ||
| 666 | |||
| 667 | static void rcu_torture_boost_cb(struct rcu_head *head) | ||
| 668 | { | ||
| 669 | struct rcu_boost_inflight *rbip = | ||
| 670 | container_of(head, struct rcu_boost_inflight, rcu); | ||
| 671 | |||
| 672 | smp_mb(); /* Ensure RCU-core accesses precede clearing ->inflight */ | ||
| 673 | rbip->inflight = 0; | ||
| 674 | } | ||
| 675 | |||
| 676 | static int rcu_torture_boost(void *arg) | ||
| 677 | { | ||
| 678 | unsigned long call_rcu_time; | ||
| 679 | unsigned long endtime; | ||
| 680 | unsigned long oldstarttime; | ||
| 681 | struct rcu_boost_inflight rbi = { .inflight = 0 }; | ||
| 682 | struct sched_param sp; | ||
| 683 | |||
| 684 | VERBOSE_PRINTK_STRING("rcu_torture_boost started"); | ||
| 685 | |||
| 686 | /* Set real-time priority. */ | ||
| 687 | sp.sched_priority = 1; | ||
| 688 | if (sched_setscheduler(current, SCHED_FIFO, &sp) < 0) { | ||
| 689 | VERBOSE_PRINTK_STRING("rcu_torture_boost RT prio failed!"); | ||
| 690 | n_rcu_torture_boost_rterror++; | ||
| 691 | } | ||
| 692 | |||
| 693 | init_rcu_head_on_stack(&rbi.rcu); | ||
| 694 | /* Each pass through the following loop does one boost-test cycle. */ | ||
| 695 | do { | ||
| 696 | /* Wait for the next test interval. */ | ||
| 697 | oldstarttime = boost_starttime; | ||
| 698 | while (ULONG_CMP_LT(jiffies, oldstarttime)) { | ||
| 699 | schedule_timeout_interruptible(oldstarttime - jiffies); | ||
| 700 | rcu_stutter_wait("rcu_torture_boost"); | ||
| 701 | if (kthread_should_stop() || | ||
| 702 | fullstop != FULLSTOP_DONTSTOP) | ||
| 703 | goto checkwait; | ||
| 704 | } | ||
| 705 | |||
| 706 | /* Do one boost-test interval. */ | ||
| 707 | endtime = oldstarttime + test_boost_duration * HZ; | ||
| 708 | call_rcu_time = jiffies; | ||
| 709 | while (ULONG_CMP_LT(jiffies, endtime)) { | ||
| 710 | /* If we don't have a callback in flight, post one. */ | ||
| 711 | if (!rbi.inflight) { | ||
| 712 | smp_mb(); /* RCU core before ->inflight = 1. */ | ||
| 713 | rbi.inflight = 1; | ||
| 714 | call_rcu(&rbi.rcu, rcu_torture_boost_cb); | ||
| 715 | if (jiffies - call_rcu_time > | ||
| 716 | test_boost_duration * HZ - HZ / 2) { | ||
| 717 | VERBOSE_PRINTK_STRING("rcu_torture_boost boosting failed"); | ||
| 718 | n_rcu_torture_boost_failure++; | ||
| 719 | } | ||
| 720 | call_rcu_time = jiffies; | ||
| 721 | } | ||
| 722 | cond_resched(); | ||
| 723 | rcu_stutter_wait("rcu_torture_boost"); | ||
| 724 | if (kthread_should_stop() || | ||
| 725 | fullstop != FULLSTOP_DONTSTOP) | ||
| 726 | goto checkwait; | ||
| 727 | } | ||
| 728 | |||
| 729 | /* | ||
| 730 | * Set the start time of the next test interval. | ||
| 731 | * Yes, this is vulnerable to long delays, but such | ||
| 732 | * delays simply cause a false negative for the next | ||
| 733 | * interval. Besides, we are running at RT priority, | ||
| 734 | * so delays should be relatively rare. | ||
| 735 | */ | ||
| 736 | while (oldstarttime == boost_starttime && | ||
| 737 | !kthread_should_stop()) { | ||
| 738 | if (mutex_trylock(&boost_mutex)) { | ||
| 739 | boost_starttime = jiffies + | ||
| 740 | test_boost_interval * HZ; | ||
| 741 | n_rcu_torture_boosts++; | ||
| 742 | mutex_unlock(&boost_mutex); | ||
| 743 | break; | ||
| 744 | } | ||
| 745 | schedule_timeout_uninterruptible(1); | ||
| 746 | } | ||
| 747 | |||
| 748 | /* Go do the stutter. */ | ||
| 749 | checkwait: rcu_stutter_wait("rcu_torture_boost"); | ||
| 750 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | ||
| 751 | |||
| 752 | /* Clean up and exit. */ | ||
| 753 | VERBOSE_PRINTK_STRING("rcu_torture_boost task stopping"); | ||
| 754 | rcutorture_shutdown_absorb("rcu_torture_boost"); | ||
| 755 | while (!kthread_should_stop() || rbi.inflight) | ||
| 756 | schedule_timeout_uninterruptible(1); | ||
| 757 | smp_mb(); /* order accesses to ->inflight before stack-frame death. */ | ||
| 758 | destroy_rcu_head_on_stack(&rbi.rcu); | ||
| 759 | return 0; | ||
| 760 | } | ||
| 761 | |||
| 762 | /* | ||
| 763 | * RCU torture force-quiescent-state kthread. Repeatedly induces | ||
| 764 | * bursts of calls to force_quiescent_state(), increasing the probability | ||
| 765 | * of occurrence of some important types of race conditions. | ||
| 766 | */ | ||
| 767 | static int | ||
| 768 | rcu_torture_fqs(void *arg) | ||
| 769 | { | ||
| 770 | unsigned long fqs_resume_time; | ||
| 771 | int fqs_burst_remaining; | ||
| 772 | |||
| 773 | VERBOSE_PRINTK_STRING("rcu_torture_fqs task started"); | ||
| 774 | do { | ||
| 775 | fqs_resume_time = jiffies + fqs_stutter * HZ; | ||
| 776 | while (ULONG_CMP_LT(jiffies, fqs_resume_time) && | ||
| 777 | !kthread_should_stop()) { | ||
| 778 | schedule_timeout_interruptible(1); | ||
| 779 | } | ||
| 780 | fqs_burst_remaining = fqs_duration; | ||
| 781 | while (fqs_burst_remaining > 0 && | ||
| 782 | !kthread_should_stop()) { | ||
| 783 | cur_ops->fqs(); | ||
| 784 | udelay(fqs_holdoff); | ||
| 785 | fqs_burst_remaining -= fqs_holdoff; | ||
| 786 | } | ||
| 787 | rcu_stutter_wait("rcu_torture_fqs"); | ||
| 788 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | ||
| 789 | VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping"); | ||
| 790 | rcutorture_shutdown_absorb("rcu_torture_fqs"); | ||
| 791 | while (!kthread_should_stop()) | ||
| 792 | schedule_timeout_uninterruptible(1); | ||
| 793 | return 0; | ||
| 794 | } | ||
| 795 | |||
| 796 | /* | ||
| 797 | * RCU torture writer kthread. Repeatedly substitutes a new structure | ||
| 798 | * for that pointed to by rcu_torture_current, freeing the old structure | ||
| 799 | * after a series of grace periods (the "pipeline"). | ||
| 800 | */ | ||
| 801 | static int | ||
| 802 | rcu_torture_writer(void *arg) | ||
| 803 | { | ||
| 804 | bool exp; | ||
| 805 | int i; | ||
| 806 | struct rcu_torture *rp; | ||
| 807 | struct rcu_torture *rp1; | ||
| 808 | struct rcu_torture *old_rp; | ||
| 809 | static DEFINE_RCU_RANDOM(rand); | ||
| 810 | |||
| 811 | VERBOSE_PRINTK_STRING("rcu_torture_writer task started"); | ||
| 812 | set_user_nice(current, 19); | ||
| 813 | |||
| 814 | do { | ||
| 815 | schedule_timeout_uninterruptible(1); | ||
| 816 | rp = rcu_torture_alloc(); | ||
| 817 | if (rp == NULL) | ||
| 818 | continue; | ||
| 819 | rp->rtort_pipe_count = 0; | ||
| 820 | udelay(rcu_random(&rand) & 0x3ff); | ||
| 821 | old_rp = rcu_dereference_check(rcu_torture_current, | ||
| 822 | current == writer_task); | ||
| 823 | rp->rtort_mbtest = 1; | ||
| 824 | rcu_assign_pointer(rcu_torture_current, rp); | ||
| 825 | smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */ | ||
| 826 | if (old_rp) { | ||
| 827 | i = old_rp->rtort_pipe_count; | ||
| 828 | if (i > RCU_TORTURE_PIPE_LEN) | ||
| 829 | i = RCU_TORTURE_PIPE_LEN; | ||
| 830 | atomic_inc(&rcu_torture_wcount[i]); | ||
| 831 | old_rp->rtort_pipe_count++; | ||
| 832 | if (gp_normal == gp_exp) | ||
| 833 | exp = !!(rcu_random(&rand) & 0x80); | ||
| 834 | else | ||
| 835 | exp = gp_exp; | ||
| 836 | if (!exp) { | ||
| 837 | cur_ops->deferred_free(old_rp); | ||
| 838 | } else { | ||
| 839 | cur_ops->exp_sync(); | ||
| 840 | list_add(&old_rp->rtort_free, | ||
| 841 | &rcu_torture_removed); | ||
| 842 | list_for_each_entry_safe(rp, rp1, | ||
| 843 | &rcu_torture_removed, | ||
| 844 | rtort_free) { | ||
| 845 | i = rp->rtort_pipe_count; | ||
| 846 | if (i > RCU_TORTURE_PIPE_LEN) | ||
| 847 | i = RCU_TORTURE_PIPE_LEN; | ||
| 848 | atomic_inc(&rcu_torture_wcount[i]); | ||
| 849 | if (++rp->rtort_pipe_count >= | ||
| 850 | RCU_TORTURE_PIPE_LEN) { | ||
| 851 | rp->rtort_mbtest = 0; | ||
| 852 | list_del(&rp->rtort_free); | ||
| 853 | rcu_torture_free(rp); | ||
| 854 | } | ||
| 855 | } | ||
| 856 | } | ||
| 857 | } | ||
| 858 | rcutorture_record_progress(++rcu_torture_current_version); | ||
| 859 | rcu_stutter_wait("rcu_torture_writer"); | ||
| 860 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | ||
| 861 | VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping"); | ||
| 862 | rcutorture_shutdown_absorb("rcu_torture_writer"); | ||
| 863 | while (!kthread_should_stop()) | ||
| 864 | schedule_timeout_uninterruptible(1); | ||
| 865 | return 0; | ||
| 866 | } | ||
| 867 | |||
| 868 | /* | ||
| 869 | * RCU torture fake writer kthread. Repeatedly calls sync, with a random | ||
| 870 | * delay between calls. | ||
| 871 | */ | ||
| 872 | static int | ||
| 873 | rcu_torture_fakewriter(void *arg) | ||
| 874 | { | ||
| 875 | DEFINE_RCU_RANDOM(rand); | ||
| 876 | |||
| 877 | VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task started"); | ||
| 878 | set_user_nice(current, 19); | ||
| 879 | |||
| 880 | do { | ||
| 881 | schedule_timeout_uninterruptible(1 + rcu_random(&rand)%10); | ||
| 882 | udelay(rcu_random(&rand) & 0x3ff); | ||
| 883 | if (cur_ops->cb_barrier != NULL && | ||
| 884 | rcu_random(&rand) % (nfakewriters * 8) == 0) { | ||
| 885 | cur_ops->cb_barrier(); | ||
| 886 | } else if (gp_normal == gp_exp) { | ||
| 887 | if (rcu_random(&rand) & 0x80) | ||
| 888 | cur_ops->sync(); | ||
| 889 | else | ||
| 890 | cur_ops->exp_sync(); | ||
| 891 | } else if (gp_normal) { | ||
| 892 | cur_ops->sync(); | ||
| 893 | } else { | ||
| 894 | cur_ops->exp_sync(); | ||
| 895 | } | ||
| 896 | rcu_stutter_wait("rcu_torture_fakewriter"); | ||
| 897 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | ||
| 898 | |||
| 899 | VERBOSE_PRINTK_STRING("rcu_torture_fakewriter task stopping"); | ||
| 900 | rcutorture_shutdown_absorb("rcu_torture_fakewriter"); | ||
| 901 | while (!kthread_should_stop()) | ||
| 902 | schedule_timeout_uninterruptible(1); | ||
| 903 | return 0; | ||
| 904 | } | ||
| 905 | |||
| 906 | void rcutorture_trace_dump(void) | ||
| 907 | { | ||
| 908 | static atomic_t beenhere = ATOMIC_INIT(0); | ||
| 909 | |||
| 910 | if (atomic_read(&beenhere)) | ||
| 911 | return; | ||
| 912 | if (atomic_xchg(&beenhere, 1) != 0) | ||
| 913 | return; | ||
| 914 | ftrace_dump(DUMP_ALL); | ||
| 915 | } | ||
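The double test above is a run-once guard: the plain atomic_read() filters repeat callers cheaply, and the atomic_xchg() guarantees that exactly one task performs the expensive ftrace dump even if several race past the first check. A userspace sketch of the same pattern using C11 atomics (not kernel code):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int beenhere;	/* statically zero-initialized */

static void dump_once(void)
{
	if (atomic_load(&beenhere))
		return;				/* fast path: already done */
	if (atomic_exchange(&beenhere, 1) != 0)
		return;				/* lost the race to another caller */
	printf("dumping trace buffer (exactly once)\n");
}

int main(void)
{
	dump_once();
	dump_once();	/* second and later calls are no-ops */
	return 0;
}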
| 916 | |||
| 917 | /* | ||
| 918 | * RCU torture reader from timer handler. Dereferences rcu_torture_current, | ||
| 919 | * incrementing the corresponding element of the pipeline array. The | ||
| 920 | * counter in the element should never be greater than 1; otherwise, the | ||
| 921 | * RCU implementation is broken. | ||
| 922 | */ | ||
| 923 | static void rcu_torture_timer(unsigned long unused) | ||
| 924 | { | ||
| 925 | int idx; | ||
| 926 | int completed; | ||
| 927 | int completed_end; | ||
| 928 | static DEFINE_RCU_RANDOM(rand); | ||
| 929 | static DEFINE_SPINLOCK(rand_lock); | ||
| 930 | struct rcu_torture *p; | ||
| 931 | int pipe_count; | ||
| 932 | unsigned long long ts; | ||
| 933 | |||
| 934 | idx = cur_ops->readlock(); | ||
| 935 | completed = cur_ops->completed(); | ||
| 936 | ts = rcu_trace_clock_local(); | ||
| 937 | p = rcu_dereference_check(rcu_torture_current, | ||
| 938 | rcu_read_lock_bh_held() || | ||
| 939 | rcu_read_lock_sched_held() || | ||
| 940 | srcu_read_lock_held(&srcu_ctl)); | ||
| 941 | if (p == NULL) { | ||
| 942 | /* Leave because rcu_torture_writer is not yet underway */ | ||
| 943 | cur_ops->readunlock(idx); | ||
| 944 | return; | ||
| 945 | } | ||
| 946 | if (p->rtort_mbtest == 0) | ||
| 947 | atomic_inc(&n_rcu_torture_mberror); | ||
| 948 | spin_lock(&rand_lock); | ||
| 949 | cur_ops->read_delay(&rand); | ||
| 950 | n_rcu_torture_timers++; | ||
| 951 | spin_unlock(&rand_lock); | ||
| 952 | preempt_disable(); | ||
| 953 | pipe_count = p->rtort_pipe_count; | ||
| 954 | if (pipe_count > RCU_TORTURE_PIPE_LEN) { | ||
| 955 | /* Should not happen, but... */ | ||
| 956 | pipe_count = RCU_TORTURE_PIPE_LEN; | ||
| 957 | } | ||
| 958 | completed_end = cur_ops->completed(); | ||
| 959 | if (pipe_count > 1) { | ||
| 960 | do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, ts, | ||
| 961 | completed, completed_end); | ||
| 962 | rcutorture_trace_dump(); | ||
| 963 | } | ||
| 964 | __this_cpu_inc(rcu_torture_count[pipe_count]); | ||
| 965 | completed = completed_end - completed; | ||
| 966 | if (completed > RCU_TORTURE_PIPE_LEN) { | ||
| 967 | /* Should not happen, but... */ | ||
| 968 | completed = RCU_TORTURE_PIPE_LEN; | ||
| 969 | } | ||
| 970 | __this_cpu_inc(rcu_torture_batch[completed]); | ||
| 971 | preempt_enable(); | ||
| 972 | cur_ops->readunlock(idx); | ||
| 973 | } | ||
| 974 | |||
| 975 | /* | ||
| 976 | * RCU torture reader kthread. Repeatedly dereferences rcu_torture_current, | ||
| 977 | * incrementing the corresponding element of the pipeline array. The | ||
| 978 | * counter in the element should never be greater than 1; otherwise, the | ||
| 979 | * RCU implementation is broken. | ||
| 980 | */ | ||
| 981 | static int | ||
| 982 | rcu_torture_reader(void *arg) | ||
| 983 | { | ||
| 984 | int completed; | ||
| 985 | int completed_end; | ||
| 986 | int idx; | ||
| 987 | DEFINE_RCU_RANDOM(rand); | ||
| 988 | struct rcu_torture *p; | ||
| 989 | int pipe_count; | ||
| 990 | struct timer_list t; | ||
| 991 | unsigned long long ts; | ||
| 992 | |||
| 993 | VERBOSE_PRINTK_STRING("rcu_torture_reader task started"); | ||
| 994 | set_user_nice(current, 19); | ||
| 995 | if (irqreader && cur_ops->irq_capable) | ||
| 996 | setup_timer_on_stack(&t, rcu_torture_timer, 0); | ||
| 997 | |||
| 998 | do { | ||
| 999 | if (irqreader && cur_ops->irq_capable) { | ||
| 1000 | if (!timer_pending(&t)) | ||
| 1001 | mod_timer(&t, jiffies + 1); | ||
| 1002 | } | ||
| 1003 | idx = cur_ops->readlock(); | ||
| 1004 | completed = cur_ops->completed(); | ||
| 1005 | ts = rcu_trace_clock_local(); | ||
| 1006 | p = rcu_dereference_check(rcu_torture_current, | ||
| 1007 | rcu_read_lock_bh_held() || | ||
| 1008 | rcu_read_lock_sched_held() || | ||
| 1009 | srcu_read_lock_held(&srcu_ctl)); | ||
| 1010 | if (p == NULL) { | ||
| 1011 | /* Wait for rcu_torture_writer to get underway */ | ||
| 1012 | cur_ops->readunlock(idx); | ||
| 1013 | schedule_timeout_interruptible(HZ); | ||
| 1014 | continue; | ||
| 1015 | } | ||
| 1016 | if (p->rtort_mbtest == 0) | ||
| 1017 | atomic_inc(&n_rcu_torture_mberror); | ||
| 1018 | cur_ops->read_delay(&rand); | ||
| 1019 | preempt_disable(); | ||
| 1020 | pipe_count = p->rtort_pipe_count; | ||
| 1021 | if (pipe_count > RCU_TORTURE_PIPE_LEN) { | ||
| 1022 | /* Should not happen, but... */ | ||
| 1023 | pipe_count = RCU_TORTURE_PIPE_LEN; | ||
| 1024 | } | ||
| 1025 | completed_end = cur_ops->completed(); | ||
| 1026 | if (pipe_count > 1) { | ||
| 1027 | do_trace_rcu_torture_read(cur_ops->name, &p->rtort_rcu, | ||
| 1028 | ts, completed, completed_end); | ||
| 1029 | rcutorture_trace_dump(); | ||
| 1030 | } | ||
| 1031 | __this_cpu_inc(rcu_torture_count[pipe_count]); | ||
| 1032 | completed = completed_end - completed; | ||
| 1033 | if (completed > RCU_TORTURE_PIPE_LEN) { | ||
| 1034 | /* Should not happen, but... */ | ||
| 1035 | completed = RCU_TORTURE_PIPE_LEN; | ||
| 1036 | } | ||
| 1037 | __this_cpu_inc(rcu_torture_batch[completed]); | ||
| 1038 | preempt_enable(); | ||
| 1039 | cur_ops->readunlock(idx); | ||
| 1040 | schedule(); | ||
| 1041 | rcu_stutter_wait("rcu_torture_reader"); | ||
| 1042 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | ||
| 1043 | VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping"); | ||
| 1044 | rcutorture_shutdown_absorb("rcu_torture_reader"); | ||
| 1045 | if (irqreader && cur_ops->irq_capable) | ||
| 1046 | del_timer_sync(&t); | ||
| 1047 | while (!kthread_should_stop()) | ||
| 1048 | schedule_timeout_uninterruptible(1); | ||
| 1049 | return 0; | ||
| 1050 | } | ||
| 1051 | |||
| 1052 | /* | ||
| 1053 | * Create an RCU-torture statistics message in the specified buffer. | ||
| 1054 | */ | ||
| 1055 | static int | ||
| 1056 | rcu_torture_printk(char *page) | ||
| 1057 | { | ||
| 1058 | int cnt = 0; | ||
| 1059 | int cpu; | ||
| 1060 | int i; | ||
| 1061 | long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; | ||
| 1062 | long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 }; | ||
| 1063 | |||
| 1064 | for_each_possible_cpu(cpu) { | ||
| 1065 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { | ||
| 1066 | pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i]; | ||
| 1067 | batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i]; | ||
| 1068 | } | ||
| 1069 | } | ||
| 1070 | for (i = RCU_TORTURE_PIPE_LEN - 1; i >= 0; i--) { | ||
| 1071 | if (pipesummary[i] != 0) | ||
| 1072 | break; | ||
| 1073 | } | ||
| 1074 | cnt += sprintf(&page[cnt], "%s%s ", torture_type, TORTURE_FLAG); | ||
| 1075 | cnt += sprintf(&page[cnt], | ||
| 1076 | "rtc: %p ver: %lu tfle: %d rta: %d rtaf: %d rtf: %d ", | ||
| 1077 | rcu_torture_current, | ||
| 1078 | rcu_torture_current_version, | ||
| 1079 | list_empty(&rcu_torture_freelist), | ||
| 1080 | atomic_read(&n_rcu_torture_alloc), | ||
| 1081 | atomic_read(&n_rcu_torture_alloc_fail), | ||
| 1082 | atomic_read(&n_rcu_torture_free)); | ||
| 1083 | cnt += sprintf(&page[cnt], "rtmbe: %d rtbke: %ld rtbre: %ld ", | ||
| 1084 | atomic_read(&n_rcu_torture_mberror), | ||
| 1085 | n_rcu_torture_boost_ktrerror, | ||
| 1086 | n_rcu_torture_boost_rterror); | ||
| 1087 | cnt += sprintf(&page[cnt], "rtbf: %ld rtb: %ld nt: %ld ", | ||
| 1088 | n_rcu_torture_boost_failure, | ||
| 1089 | n_rcu_torture_boosts, | ||
| 1090 | n_rcu_torture_timers); | ||
| 1091 | cnt += sprintf(&page[cnt], | ||
| 1092 | "onoff: %ld/%ld:%ld/%ld %d,%d:%d,%d %lu:%lu (HZ=%d) ", | ||
| 1093 | n_online_successes, n_online_attempts, | ||
| 1094 | n_offline_successes, n_offline_attempts, | ||
| 1095 | min_online, max_online, | ||
| 1096 | min_offline, max_offline, | ||
| 1097 | sum_online, sum_offline, HZ); | ||
| 1098 | cnt += sprintf(&page[cnt], "barrier: %ld/%ld:%ld", | ||
| 1099 | n_barrier_successes, | ||
| 1100 | n_barrier_attempts, | ||
| 1101 | n_rcu_torture_barrier_error); | ||
| 1102 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); | ||
| 1103 | if (atomic_read(&n_rcu_torture_mberror) != 0 || | ||
| 1104 | n_rcu_torture_barrier_error != 0 || | ||
| 1105 | n_rcu_torture_boost_ktrerror != 0 || | ||
| 1106 | n_rcu_torture_boost_rterror != 0 || | ||
| 1107 | n_rcu_torture_boost_failure != 0 || | ||
| 1108 | i > 1) { | ||
| 1109 | cnt += sprintf(&page[cnt], "!!! "); | ||
| 1110 | atomic_inc(&n_rcu_torture_error); | ||
| 1111 | WARN_ON_ONCE(1); | ||
| 1112 | } | ||
| 1113 | cnt += sprintf(&page[cnt], "Reader Pipe: "); | ||
| 1114 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) | ||
| 1115 | cnt += sprintf(&page[cnt], " %ld", pipesummary[i]); | ||
| 1116 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); | ||
| 1117 | cnt += sprintf(&page[cnt], "Reader Batch: "); | ||
| 1118 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) | ||
| 1119 | cnt += sprintf(&page[cnt], " %ld", batchsummary[i]); | ||
| 1120 | cnt += sprintf(&page[cnt], "\n%s%s ", torture_type, TORTURE_FLAG); | ||
| 1121 | cnt += sprintf(&page[cnt], "Free-Block Circulation: "); | ||
| 1122 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { | ||
| 1123 | cnt += sprintf(&page[cnt], " %d", | ||
| 1124 | atomic_read(&rcu_torture_wcount[i])); | ||
| 1125 | } | ||
| 1126 | cnt += sprintf(&page[cnt], "\n"); | ||
| 1127 | if (cur_ops->stats) | ||
| 1128 | cnt += cur_ops->stats(&page[cnt]); | ||
| 1129 | return cnt; | ||
| 1130 | } | ||
| 1131 | |||
| 1132 | /* | ||
| 1133 | * Print torture statistics. Caller must ensure that there is only | ||
| 1134 | * one call to this function at a given time!!! This is normally | ||
| 1135 | * accomplished by relying on the module system to only have one copy | ||
| 1136 | * of the module loaded, and then by giving the rcu_torture_stats | ||
| 1137 | * kthread full control (or the init/cleanup functions when rcu_torture_stats | ||
| 1138 | * thread is not running). | ||
| 1139 | */ | ||
| 1140 | static void | ||
| 1141 | rcu_torture_stats_print(void) | ||
| 1142 | { | ||
| 1143 | int cnt; | ||
| 1144 | |||
| 1145 | cnt = rcu_torture_printk(printk_buf); | ||
| 1146 | pr_alert("%s", printk_buf); | ||
| 1147 | } | ||
| 1148 | |||
| 1149 | /* | ||
| 1150 | * Periodically prints torture statistics, if periodic statistics printing | ||
| 1151 | * was specified via the stat_interval module parameter. | ||
| 1152 | * | ||
| 1153 | * No need to worry about fullstop here, since this one doesn't reference | ||
| 1154 | * volatile state or register callbacks. | ||
| 1155 | */ | ||
| 1156 | static int | ||
| 1157 | rcu_torture_stats(void *arg) | ||
| 1158 | { | ||
| 1159 | VERBOSE_PRINTK_STRING("rcu_torture_stats task started"); | ||
| 1160 | do { | ||
| 1161 | schedule_timeout_interruptible(stat_interval * HZ); | ||
| 1162 | rcu_torture_stats_print(); | ||
| 1163 | rcutorture_shutdown_absorb("rcu_torture_stats"); | ||
| 1164 | } while (!kthread_should_stop()); | ||
| 1165 | VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping"); | ||
| 1166 | return 0; | ||
| 1167 | } | ||
| 1168 | |||
| 1169 | static int rcu_idle_cpu; /* Force all torture tasks off this CPU */ | ||
| 1170 | |||
| 1171 | /* Shuffle tasks so that the CPU named by @rcu_idle_cpu can become idle. As a | ||
| 1172 | * special case, @rcu_idle_cpu == -1 allows the tasks to run on all CPUs. | ||
| 1173 | */ | ||
| 1174 | static void rcu_torture_shuffle_tasks(void) | ||
| 1175 | { | ||
| 1176 | int i; | ||
| 1177 | |||
| 1178 | cpumask_setall(shuffle_tmp_mask); | ||
| 1179 | get_online_cpus(); | ||
| 1180 | |||
| 1181 | /* No point in shuffling if there is only one online CPU (ex: UP) */ | ||
| 1182 | if (num_online_cpus() == 1) { | ||
| 1183 | put_online_cpus(); | ||
| 1184 | return; | ||
| 1185 | } | ||
| 1186 | |||
| 1187 | if (rcu_idle_cpu != -1) | ||
| 1188 | cpumask_clear_cpu(rcu_idle_cpu, shuffle_tmp_mask); | ||
| 1189 | |||
| 1190 | set_cpus_allowed_ptr(current, shuffle_tmp_mask); | ||
| 1191 | |||
| 1192 | if (reader_tasks) { | ||
| 1193 | for (i = 0; i < nrealreaders; i++) | ||
| 1194 | if (reader_tasks[i]) | ||
| 1195 | set_cpus_allowed_ptr(reader_tasks[i], | ||
| 1196 | shuffle_tmp_mask); | ||
| 1197 | } | ||
| 1198 | if (fakewriter_tasks) { | ||
| 1199 | for (i = 0; i < nfakewriters; i++) | ||
| 1200 | if (fakewriter_tasks[i]) | ||
| 1201 | set_cpus_allowed_ptr(fakewriter_tasks[i], | ||
| 1202 | shuffle_tmp_mask); | ||
| 1203 | } | ||
| 1204 | if (writer_task) | ||
| 1205 | set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask); | ||
| 1206 | if (stats_task) | ||
| 1207 | set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask); | ||
| 1208 | if (stutter_task) | ||
| 1209 | set_cpus_allowed_ptr(stutter_task, shuffle_tmp_mask); | ||
| 1210 | if (fqs_task) | ||
| 1211 | set_cpus_allowed_ptr(fqs_task, shuffle_tmp_mask); | ||
| 1212 | if (shutdown_task) | ||
| 1213 | set_cpus_allowed_ptr(shutdown_task, shuffle_tmp_mask); | ||
| 1214 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 1215 | if (onoff_task) | ||
| 1216 | set_cpus_allowed_ptr(onoff_task, shuffle_tmp_mask); | ||
| 1217 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
| 1218 | if (stall_task) | ||
| 1219 | set_cpus_allowed_ptr(stall_task, shuffle_tmp_mask); | ||
| 1220 | if (barrier_cbs_tasks) | ||
| 1221 | for (i = 0; i < n_barrier_cbs; i++) | ||
| 1222 | if (barrier_cbs_tasks[i]) | ||
| 1223 | set_cpus_allowed_ptr(barrier_cbs_tasks[i], | ||
| 1224 | shuffle_tmp_mask); | ||
| 1225 | if (barrier_task) | ||
| 1226 | set_cpus_allowed_ptr(barrier_task, shuffle_tmp_mask); | ||
| 1227 | |||
| 1228 | if (rcu_idle_cpu == -1) | ||
| 1229 | rcu_idle_cpu = num_online_cpus() - 1; | ||
| 1230 | else | ||
| 1231 | rcu_idle_cpu--; | ||
| 1232 | |||
| 1233 | put_online_cpus(); | ||
| 1234 | } | ||
| 1235 | |||
| 1236 | /* Shuffle tasks across CPUs, with the intent of allowing each CPU in the | ||
| 1237 | * system to become idle in turn and shut off its timer tick. This is meant | ||
| 1238 | * to test RCU's support for such tickless idle CPUs. | ||
| 1239 | */ | ||
| 1240 | static int | ||
| 1241 | rcu_torture_shuffle(void *arg) | ||
| 1242 | { | ||
| 1243 | VERBOSE_PRINTK_STRING("rcu_torture_shuffle task started"); | ||
| 1244 | do { | ||
| 1245 | schedule_timeout_interruptible(shuffle_interval * HZ); | ||
| 1246 | rcu_torture_shuffle_tasks(); | ||
| 1247 | rcutorture_shutdown_absorb("rcu_torture_shuffle"); | ||
| 1248 | } while (!kthread_should_stop()); | ||
| 1249 | VERBOSE_PRINTK_STRING("rcu_torture_shuffle task stopping"); | ||
| 1250 | return 0; | ||
| 1251 | } | ||
| 1252 | |||
| 1253 | /* Cause the rcutorture test to "stutter", starting and stopping all | ||
| 1254 | * threads periodically. | ||
| 1255 | */ | ||
| 1256 | static int | ||
| 1257 | rcu_torture_stutter(void *arg) | ||
| 1258 | { | ||
| 1259 | VERBOSE_PRINTK_STRING("rcu_torture_stutter task started"); | ||
| 1260 | do { | ||
| 1261 | schedule_timeout_interruptible(stutter * HZ); | ||
| 1262 | stutter_pause_test = 1; | ||
| 1263 | if (!kthread_should_stop()) | ||
| 1264 | schedule_timeout_interruptible(stutter * HZ); | ||
| 1265 | stutter_pause_test = 0; | ||
| 1266 | rcutorture_shutdown_absorb("rcu_torture_stutter"); | ||
| 1267 | } while (!kthread_should_stop()); | ||
| 1268 | VERBOSE_PRINTK_STRING("rcu_torture_stutter task stopping"); | ||
| 1269 | return 0; | ||
| 1270 | } | ||
| 1271 | |||
| 1272 | static inline void | ||
| 1273 | rcu_torture_print_module_parms(struct rcu_torture_ops *cur_ops, const char *tag) | ||
| 1274 | { | ||
| 1275 | pr_alert("%s" TORTURE_FLAG | ||
| 1276 | "--- %s: nreaders=%d nfakewriters=%d " | ||
| 1277 | "stat_interval=%d verbose=%d test_no_idle_hz=%d " | ||
| 1278 | "shuffle_interval=%d stutter=%d irqreader=%d " | ||
| 1279 | "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d " | ||
| 1280 | "test_boost=%d/%d test_boost_interval=%d " | ||
| 1281 | "test_boost_duration=%d shutdown_secs=%d " | ||
| 1282 | "stall_cpu=%d stall_cpu_holdoff=%d " | ||
| 1283 | "n_barrier_cbs=%d " | ||
| 1284 | "onoff_interval=%d onoff_holdoff=%d\n", | ||
| 1285 | torture_type, tag, nrealreaders, nfakewriters, | ||
| 1286 | stat_interval, verbose, test_no_idle_hz, shuffle_interval, | ||
| 1287 | stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter, | ||
| 1288 | test_boost, cur_ops->can_boost, | ||
| 1289 | test_boost_interval, test_boost_duration, shutdown_secs, | ||
| 1290 | stall_cpu, stall_cpu_holdoff, | ||
| 1291 | n_barrier_cbs, | ||
| 1292 | onoff_interval, onoff_holdoff); | ||
| 1293 | } | ||
| 1294 | |||
| 1295 | static struct notifier_block rcutorture_shutdown_nb = { | ||
| 1296 | .notifier_call = rcutorture_shutdown_notify, | ||
| 1297 | }; | ||
| 1298 | |||
| 1299 | static void rcutorture_booster_cleanup(int cpu) | ||
| 1300 | { | ||
| 1301 | struct task_struct *t; | ||
| 1302 | |||
| 1303 | if (boost_tasks[cpu] == NULL) | ||
| 1304 | return; | ||
| 1305 | mutex_lock(&boost_mutex); | ||
| 1306 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_boost task"); | ||
| 1307 | t = boost_tasks[cpu]; | ||
| 1308 | boost_tasks[cpu] = NULL; | ||
| 1309 | mutex_unlock(&boost_mutex); | ||
| 1310 | |||
| 1311 | /* This must be outside of the mutex, otherwise deadlock! */ | ||
| 1312 | kthread_stop(t); | ||
| 1313 | boost_tasks[cpu] = NULL; | ||
| 1314 | } | ||
| 1315 | |||
| 1316 | static int rcutorture_booster_init(int cpu) | ||
| 1317 | { | ||
| 1318 | int retval; | ||
| 1319 | |||
| 1320 | if (boost_tasks[cpu] != NULL) | ||
| 1321 | return 0; /* Already created, nothing more to do. */ | ||
| 1322 | |||
| 1323 | /* Don't allow time recalculation while creating a new task. */ | ||
| 1324 | mutex_lock(&boost_mutex); | ||
| 1325 | VERBOSE_PRINTK_STRING("Creating rcu_torture_boost task"); | ||
| 1326 | boost_tasks[cpu] = kthread_create_on_node(rcu_torture_boost, NULL, | ||
| 1327 | cpu_to_node(cpu), | ||
| 1328 | "rcu_torture_boost"); | ||
| 1329 | if (IS_ERR(boost_tasks[cpu])) { | ||
| 1330 | retval = PTR_ERR(boost_tasks[cpu]); | ||
| 1331 | VERBOSE_PRINTK_STRING("rcu_torture_boost task create failed"); | ||
| 1332 | n_rcu_torture_boost_ktrerror++; | ||
| 1333 | boost_tasks[cpu] = NULL; | ||
| 1334 | mutex_unlock(&boost_mutex); | ||
| 1335 | return retval; | ||
| 1336 | } | ||
| 1337 | kthread_bind(boost_tasks[cpu], cpu); | ||
| 1338 | wake_up_process(boost_tasks[cpu]); | ||
| 1339 | mutex_unlock(&boost_mutex); | ||
| 1340 | return 0; | ||
| 1341 | } | ||
| 1342 | |||
| 1343 | /* | ||
| 1344 | * Cause the rcutorture test to shutdown the system after the test has | ||
| 1345 | * run for the time specified by the shutdown_secs module parameter. | ||
| 1346 | */ | ||
| 1347 | static int | ||
| 1348 | rcu_torture_shutdown(void *arg) | ||
| 1349 | { | ||
| 1350 | long delta; | ||
| 1351 | unsigned long jiffies_snap; | ||
| 1352 | |||
| 1353 | VERBOSE_PRINTK_STRING("rcu_torture_shutdown task started"); | ||
| 1354 | jiffies_snap = ACCESS_ONCE(jiffies); | ||
| 1355 | while (ULONG_CMP_LT(jiffies_snap, shutdown_time) && | ||
| 1356 | !kthread_should_stop()) { | ||
| 1357 | delta = shutdown_time - jiffies_snap; | ||
| 1358 | if (verbose) | ||
| 1359 | pr_alert("%s" TORTURE_FLAG | ||
| 1360 | "rcu_torture_shutdown task: %lu jiffies remaining\n", | ||
| 1361 | torture_type, delta); | ||
| 1362 | schedule_timeout_interruptible(delta); | ||
| 1363 | jiffies_snap = ACCESS_ONCE(jiffies); | ||
| 1364 | } | ||
| 1365 | if (kthread_should_stop()) { | ||
| 1366 | VERBOSE_PRINTK_STRING("rcu_torture_shutdown task stopping"); | ||
| 1367 | return 0; | ||
| 1368 | } | ||
| 1369 | |||
| 1370 | /* OK, shut down the system. */ | ||
| 1371 | |||
| 1372 | VERBOSE_PRINTK_STRING("rcu_torture_shutdown task shutting down system"); | ||
| 1373 | shutdown_task = NULL; /* Avoid self-kill deadlock. */ | ||
| 1374 | rcu_torture_cleanup(); /* Get the success/failure message. */ | ||
| 1375 | kernel_power_off(); /* Shut down the system. */ | ||
| 1376 | return 0; | ||
| 1377 | } | ||
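The shutdown loop above relies on ULONG_CMP_LT() so the jiffies comparison stays correct even if the counter wraps. A stand-alone user-space sketch of that idea; ULONG_CMP_LT_DEMO is an illustrative equivalent invented for this example, not the kernel's exact definition:

#include <stdio.h>

/* Wrap-safe "a is before b": interpret the unsigned difference as signed. */
#define ULONG_CMP_LT_DEMO(a, b)	((long)((a) - (b)) < 0)

int main(void)
{
	unsigned long now = (unsigned long)-10;	/* counter about to wrap */
	unsigned long deadline = now + 100;	/* wraps past zero */

	/* A naive "now < deadline" is false here; the wrap-safe form is true. */
	printf("naive=%d wrap-safe=%d\n",
	       (int)(now < deadline), ULONG_CMP_LT_DEMO(now, deadline));
	return 0;
}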
| 1378 | |||
| 1379 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 1380 | |||
| 1381 | /* | ||
| 1382 | * Execute random CPU-hotplug operations at the interval specified | ||
| 1383 | * by the onoff_interval. | ||
| 1384 | */ | ||
| 1385 | static int | ||
| 1386 | rcu_torture_onoff(void *arg) | ||
| 1387 | { | ||
| 1388 | int cpu; | ||
| 1389 | unsigned long delta; | ||
| 1390 | int maxcpu = -1; | ||
| 1391 | DEFINE_RCU_RANDOM(rand); | ||
| 1392 | int ret; | ||
| 1393 | unsigned long starttime; | ||
| 1394 | |||
| 1395 | VERBOSE_PRINTK_STRING("rcu_torture_onoff task started"); | ||
| 1396 | for_each_online_cpu(cpu) | ||
| 1397 | maxcpu = cpu; | ||
| 1398 | WARN_ON(maxcpu < 0); | ||
| 1399 | if (onoff_holdoff > 0) { | ||
| 1400 | VERBOSE_PRINTK_STRING("rcu_torture_onoff begin holdoff"); | ||
| 1401 | schedule_timeout_interruptible(onoff_holdoff * HZ); | ||
| 1402 | VERBOSE_PRINTK_STRING("rcu_torture_onoff end holdoff"); | ||
| 1403 | } | ||
| 1404 | while (!kthread_should_stop()) { | ||
| 1405 | cpu = (rcu_random(&rand) >> 4) % (maxcpu + 1); | ||
| 1406 | if (cpu_online(cpu) && cpu_is_hotpluggable(cpu)) { | ||
| 1407 | if (verbose) | ||
| 1408 | pr_alert("%s" TORTURE_FLAG | ||
| 1409 | "rcu_torture_onoff task: offlining %d\n", | ||
| 1410 | torture_type, cpu); | ||
| 1411 | starttime = jiffies; | ||
| 1412 | n_offline_attempts++; | ||
| 1413 | ret = cpu_down(cpu); | ||
| 1414 | if (ret) { | ||
| 1415 | if (verbose) | ||
| 1416 | pr_alert("%s" TORTURE_FLAG | ||
| 1417 | "rcu_torture_onoff task: offline %d failed: errno %d\n", | ||
| 1418 | torture_type, cpu, ret); | ||
| 1419 | } else { | ||
| 1420 | if (verbose) | ||
| 1421 | pr_alert("%s" TORTURE_FLAG | ||
| 1422 | "rcu_torture_onoff task: offlined %d\n", | ||
| 1423 | torture_type, cpu); | ||
| 1424 | n_offline_successes++; | ||
| 1425 | delta = jiffies - starttime; | ||
| 1426 | sum_offline += delta; | ||
| 1427 | if (min_offline < 0) { | ||
| 1428 | min_offline = delta; | ||
| 1429 | max_offline = delta; | ||
| 1430 | } | ||
| 1431 | if (min_offline > delta) | ||
| 1432 | min_offline = delta; | ||
| 1433 | if (max_offline < delta) | ||
| 1434 | max_offline = delta; | ||
| 1435 | } | ||
| 1436 | } else if (cpu_is_hotpluggable(cpu)) { | ||
| 1437 | if (verbose) | ||
| 1438 | pr_alert("%s" TORTURE_FLAG | ||
| 1439 | "rcu_torture_onoff task: onlining %d\n", | ||
| 1440 | torture_type, cpu); | ||
| 1441 | starttime = jiffies; | ||
| 1442 | n_online_attempts++; | ||
| 1443 | ret = cpu_up(cpu); | ||
| 1444 | if (ret) { | ||
| 1445 | if (verbose) | ||
| 1446 | pr_alert("%s" TORTURE_FLAG | ||
| 1447 | "rcu_torture_onoff task: online %d failed: errno %d\n", | ||
| 1448 | torture_type, cpu, ret); | ||
| 1449 | } else { | ||
| 1450 | if (verbose) | ||
| 1451 | pr_alert("%s" TORTURE_FLAG | ||
| 1452 | "rcu_torture_onoff task: onlined %d\n", | ||
| 1453 | torture_type, cpu); | ||
| 1454 | n_online_successes++; | ||
| 1455 | delta = jiffies - starttime; | ||
| 1456 | sum_online += delta; | ||
| 1457 | if (min_online < 0) { | ||
| 1458 | min_online = delta; | ||
| 1459 | max_online = delta; | ||
| 1460 | } | ||
| 1461 | if (min_online > delta) | ||
| 1462 | min_online = delta; | ||
| 1463 | if (max_online < delta) | ||
| 1464 | max_online = delta; | ||
| 1465 | } | ||
| 1466 | } | ||
| 1467 | schedule_timeout_interruptible(onoff_interval * HZ); | ||
| 1468 | } | ||
| 1469 | VERBOSE_PRINTK_STRING("rcu_torture_onoff task stopping"); | ||
| 1470 | return 0; | ||
| 1471 | } | ||
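The hotplug loop above keeps sum/min/max statistics for each online and offline operation, seeding min and max from the first sample via the "min < 0" check. The same accumulation pattern as a self-contained user-space sketch (the sample values are made up):

#include <stdio.h>

int main(void)
{
	long samples[] = { 30, 12, 57, 41 };	/* made-up durations */
	long sum = 0, min = -1, max = -1;	/* -1 means "no sample yet" */

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
		long delta = samples[i];

		sum += delta;
		if (min < 0) {			/* first sample seeds both bounds */
			min = delta;
			max = delta;
		}
		if (min > delta)
			min = delta;
		if (max < delta)
			max = delta;
	}
	printf("sum=%ld min=%ld max=%ld\n", sum, min, max);
	return 0;
}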
| 1472 | |||
| 1473 | static int | ||
| 1474 | rcu_torture_onoff_init(void) | ||
| 1475 | { | ||
| 1476 | int ret; | ||
| 1477 | |||
| 1478 | if (onoff_interval <= 0) | ||
| 1479 | return 0; | ||
| 1480 | onoff_task = kthread_run(rcu_torture_onoff, NULL, "rcu_torture_onoff"); | ||
| 1481 | if (IS_ERR(onoff_task)) { | ||
| 1482 | ret = PTR_ERR(onoff_task); | ||
| 1483 | onoff_task = NULL; | ||
| 1484 | return ret; | ||
| 1485 | } | ||
| 1486 | return 0; | ||
| 1487 | } | ||
| 1488 | |||
| 1489 | static void rcu_torture_onoff_cleanup(void) | ||
| 1490 | { | ||
| 1491 | if (onoff_task == NULL) | ||
| 1492 | return; | ||
| 1493 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_onoff task"); | ||
| 1494 | kthread_stop(onoff_task); | ||
| 1495 | onoff_task = NULL; | ||
| 1496 | } | ||
| 1497 | |||
| 1498 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
| 1499 | |||
| 1500 | static int | ||
| 1501 | rcu_torture_onoff_init(void) | ||
| 1502 | { | ||
| 1503 | return 0; | ||
| 1504 | } | ||
| 1505 | |||
| 1506 | static void rcu_torture_onoff_cleanup(void) | ||
| 1507 | { | ||
| 1508 | } | ||
| 1509 | |||
| 1510 | #endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ | ||
| 1511 | |||
| 1512 | /* | ||
| 1513 | * CPU-stall kthread. It waits as specified by stall_cpu_holdoff, then | ||
| 1514 | * induces a CPU stall for the time specified by stall_cpu. | ||
| 1515 | */ | ||
| 1516 | static int rcu_torture_stall(void *args) | ||
| 1517 | { | ||
| 1518 | unsigned long stop_at; | ||
| 1519 | |||
| 1520 | VERBOSE_PRINTK_STRING("rcu_torture_stall task started"); | ||
| 1521 | if (stall_cpu_holdoff > 0) { | ||
| 1522 | VERBOSE_PRINTK_STRING("rcu_torture_stall begin holdoff"); | ||
| 1523 | schedule_timeout_interruptible(stall_cpu_holdoff * HZ); | ||
| 1524 | VERBOSE_PRINTK_STRING("rcu_torture_stall end holdoff"); | ||
| 1525 | } | ||
| 1526 | if (!kthread_should_stop()) { | ||
| 1527 | stop_at = get_seconds() + stall_cpu; | ||
| 1528 | /* RCU CPU stall is expected behavior in following code. */ | ||
| 1529 | pr_alert("rcu_torture_stall start.\n"); | ||
| 1530 | rcu_read_lock(); | ||
| 1531 | preempt_disable(); | ||
| 1532 | while (ULONG_CMP_LT(get_seconds(), stop_at)) | ||
| 1533 | continue; /* Induce RCU CPU stall warning. */ | ||
| 1534 | preempt_enable(); | ||
| 1535 | rcu_read_unlock(); | ||
| 1536 | pr_alert("rcu_torture_stall end.\n"); | ||
| 1537 | } | ||
| 1538 | rcutorture_shutdown_absorb("rcu_torture_stall"); | ||
| 1539 | while (!kthread_should_stop()) | ||
| 1540 | schedule_timeout_interruptible(10 * HZ); | ||
| 1541 | return 0; | ||
| 1542 | } | ||
| 1543 | |||
| 1544 | /* Spawn CPU-stall kthread, if stall_cpu specified. */ | ||
| 1545 | static int __init rcu_torture_stall_init(void) | ||
| 1546 | { | ||
| 1547 | int ret; | ||
| 1548 | |||
| 1549 | if (stall_cpu <= 0) | ||
| 1550 | return 0; | ||
| 1551 | stall_task = kthread_run(rcu_torture_stall, NULL, "rcu_torture_stall"); | ||
| 1552 | if (IS_ERR(stall_task)) { | ||
| 1553 | ret = PTR_ERR(stall_task); | ||
| 1554 | stall_task = NULL; | ||
| 1555 | return ret; | ||
| 1556 | } | ||
| 1557 | return 0; | ||
| 1558 | } | ||
| 1559 | |||
| 1560 | /* Clean up after the CPU-stall kthread, if one was spawned. */ | ||
| 1561 | static void rcu_torture_stall_cleanup(void) | ||
| 1562 | { | ||
| 1563 | if (stall_task == NULL) | ||
| 1564 | return; | ||
| 1565 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stall_task."); | ||
| 1566 | kthread_stop(stall_task); | ||
| 1567 | stall_task = NULL; | ||
| 1568 | } | ||
| 1569 | |||
| 1570 | /* Callback function for RCU barrier testing. */ | ||
| 1571 | void rcu_torture_barrier_cbf(struct rcu_head *rcu) | ||
| 1572 | { | ||
| 1573 | atomic_inc(&barrier_cbs_invoked); | ||
| 1574 | } | ||
| 1575 | |||
| 1576 | /* kthread function to register callbacks used to test RCU barriers. */ | ||
| 1577 | static int rcu_torture_barrier_cbs(void *arg) | ||
| 1578 | { | ||
| 1579 | long myid = (long)arg; | ||
| 1580 | bool lastphase = false; | ||
| 1581 | struct rcu_head rcu; | ||
| 1582 | |||
| 1583 | init_rcu_head_on_stack(&rcu); | ||
| 1584 | VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task started"); | ||
| 1585 | set_user_nice(current, 19); | ||
| 1586 | do { | ||
| 1587 | wait_event(barrier_cbs_wq[myid], | ||
| 1588 | barrier_phase != lastphase || | ||
| 1589 | kthread_should_stop() || | ||
| 1590 | fullstop != FULLSTOP_DONTSTOP); | ||
| 1591 | lastphase = barrier_phase; | ||
| 1592 | smp_mb(); /* ensure barrier_phase load before ->call(). */ | ||
| 1593 | if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) | ||
| 1594 | break; | ||
| 1595 | cur_ops->call(&rcu, rcu_torture_barrier_cbf); | ||
| 1596 | if (atomic_dec_and_test(&barrier_cbs_count)) | ||
| 1597 | wake_up(&barrier_wq); | ||
| 1598 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | ||
| 1599 | VERBOSE_PRINTK_STRING("rcu_torture_barrier_cbs task stopping"); | ||
| 1600 | rcutorture_shutdown_absorb("rcu_torture_barrier_cbs"); | ||
| 1601 | while (!kthread_should_stop()) | ||
| 1602 | schedule_timeout_interruptible(1); | ||
| 1603 | cur_ops->cb_barrier(); | ||
| 1604 | destroy_rcu_head_on_stack(&rcu); | ||
| 1605 | return 0; | ||
| 1606 | } | ||
| 1607 | |||
| 1608 | /* kthread function to drive and coordinate RCU barrier testing. */ | ||
| 1609 | static int rcu_torture_barrier(void *arg) | ||
| 1610 | { | ||
| 1611 | int i; | ||
| 1612 | |||
| 1613 | VERBOSE_PRINTK_STRING("rcu_torture_barrier task starting"); | ||
| 1614 | do { | ||
| 1615 | atomic_set(&barrier_cbs_invoked, 0); | ||
| 1616 | atomic_set(&barrier_cbs_count, n_barrier_cbs); | ||
| 1617 | smp_mb(); /* Ensure barrier_phase after prior assignments. */ | ||
| 1618 | barrier_phase = !barrier_phase; | ||
| 1619 | for (i = 0; i < n_barrier_cbs; i++) | ||
| 1620 | wake_up(&barrier_cbs_wq[i]); | ||
| 1621 | wait_event(barrier_wq, | ||
| 1622 | atomic_read(&barrier_cbs_count) == 0 || | ||
| 1623 | kthread_should_stop() || | ||
| 1624 | fullstop != FULLSTOP_DONTSTOP); | ||
| 1625 | if (kthread_should_stop() || fullstop != FULLSTOP_DONTSTOP) | ||
| 1626 | break; | ||
| 1627 | n_barrier_attempts++; | ||
| 1628 | cur_ops->cb_barrier(); | ||
| 1629 | if (atomic_read(&barrier_cbs_invoked) != n_barrier_cbs) { | ||
| 1630 | n_rcu_torture_barrier_error++; | ||
| 1631 | WARN_ON_ONCE(1); | ||
| 1632 | } | ||
| 1633 | n_barrier_successes++; | ||
| 1634 | schedule_timeout_interruptible(HZ / 10); | ||
| 1635 | } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); | ||
| 1636 | VERBOSE_PRINTK_STRING("rcu_torture_barrier task stopping"); | ||
| 1637 | rcutorture_shutdown_absorb("rcu_torture_barrier"); | ||
| 1638 | while (!kthread_should_stop()) | ||
| 1639 | schedule_timeout_interruptible(1); | ||
| 1640 | return 0; | ||
| 1641 | } | ||
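rcu_torture_barrier() and rcu_torture_barrier_cbs() cooperate through an atomic countdown: the coordinator sets barrier_cbs_count to the number of posters, toggles the phase, and waits for the count to reach zero before invoking cb_barrier() and checking barrier_cbs_invoked. Below is a user-space sketch of that countdown handshake using C11 atomics and pthreads; NWORKERS, worker(), and the variable names are invented for the example, and the kernel code sleeps on waitqueues rather than spinning:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NWORKERS 4

static atomic_int cbs_count;	/* workers still to report in */
static atomic_int cbs_invoked;	/* stand-in for callbacks actually invoked */

static void *worker(void *arg)
{
	(void)arg;
	/* Stand-in for posting one callback that later bumps cbs_invoked. */
	atomic_fetch_add(&cbs_invoked, 1);
	atomic_fetch_sub(&cbs_count, 1);	/* tell the coordinator we are done */
	return NULL;
}

int main(void)
{
	pthread_t tid[NWORKERS];

	atomic_store(&cbs_invoked, 0);
	atomic_store(&cbs_count, NWORKERS);
	for (int i = 0; i < NWORKERS; i++)
		pthread_create(&tid[i], NULL, worker, NULL);

	/* The kernel coordinator sleeps on a waitqueue; spinning suffices here. */
	while (atomic_load(&cbs_count) != 0)
		;
	for (int i = 0; i < NWORKERS; i++)
		pthread_join(tid[i], NULL);

	printf("invoked %d of %d callbacks\n", atomic_load(&cbs_invoked), NWORKERS);
	return 0;
}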
| 1642 | |||
| 1643 | /* Initialize RCU barrier testing. */ | ||
| 1644 | static int rcu_torture_barrier_init(void) | ||
| 1645 | { | ||
| 1646 | int i; | ||
| 1647 | int ret; | ||
| 1648 | |||
| 1649 | if (n_barrier_cbs == 0) | ||
| 1650 | return 0; | ||
| 1651 | if (cur_ops->call == NULL || cur_ops->cb_barrier == NULL) { | ||
| 1652 | pr_alert("%s" TORTURE_FLAG | ||
| 1653 | " Call or barrier ops missing for %s,\n", | ||
| 1654 | torture_type, cur_ops->name); | ||
| 1655 | pr_alert("%s" TORTURE_FLAG | ||
| 1656 | " RCU barrier testing omitted from run.\n", | ||
| 1657 | torture_type); | ||
| 1658 | return 0; | ||
| 1659 | } | ||
| 1660 | atomic_set(&barrier_cbs_count, 0); | ||
| 1661 | atomic_set(&barrier_cbs_invoked, 0); | ||
| 1662 | barrier_cbs_tasks = | ||
| 1663 | kzalloc(n_barrier_cbs * sizeof(barrier_cbs_tasks[0]), | ||
| 1664 | GFP_KERNEL); | ||
| 1665 | barrier_cbs_wq = | ||
| 1666 | kzalloc(n_barrier_cbs * sizeof(barrier_cbs_wq[0]), | ||
| 1667 | GFP_KERNEL); | ||
| 1668 | if (barrier_cbs_tasks == NULL || !barrier_cbs_wq) | ||
| 1669 | return -ENOMEM; | ||
| 1670 | for (i = 0; i < n_barrier_cbs; i++) { | ||
| 1671 | init_waitqueue_head(&barrier_cbs_wq[i]); | ||
| 1672 | barrier_cbs_tasks[i] = kthread_run(rcu_torture_barrier_cbs, | ||
| 1673 | (void *)(long)i, | ||
| 1674 | "rcu_torture_barrier_cbs"); | ||
| 1675 | if (IS_ERR(barrier_cbs_tasks[i])) { | ||
| 1676 | ret = PTR_ERR(barrier_cbs_tasks[i]); | ||
| 1677 | VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier_cbs"); | ||
| 1678 | barrier_cbs_tasks[i] = NULL; | ||
| 1679 | return ret; | ||
| 1680 | } | ||
| 1681 | } | ||
| 1682 | barrier_task = kthread_run(rcu_torture_barrier, NULL, | ||
| 1683 | "rcu_torture_barrier"); | ||
| 1684 | if (IS_ERR(barrier_task)) { | ||
| 1685 | ret = PTR_ERR(barrier_task); | ||
| 1686 | VERBOSE_PRINTK_ERRSTRING("Failed to create rcu_torture_barrier"); | ||
| 1687 | barrier_task = NULL; | ||
| 1688 | } | ||
| 1689 | return 0; | ||
| 1690 | } | ||
| 1691 | |||
| 1692 | /* Clean up after RCU barrier testing. */ | ||
| 1693 | static void rcu_torture_barrier_cleanup(void) | ||
| 1694 | { | ||
| 1695 | int i; | ||
| 1696 | |||
| 1697 | if (barrier_task != NULL) { | ||
| 1698 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier task"); | ||
| 1699 | kthread_stop(barrier_task); | ||
| 1700 | barrier_task = NULL; | ||
| 1701 | } | ||
| 1702 | if (barrier_cbs_tasks != NULL) { | ||
| 1703 | for (i = 0; i < n_barrier_cbs; i++) { | ||
| 1704 | if (barrier_cbs_tasks[i] != NULL) { | ||
| 1705 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_barrier_cbs task"); | ||
| 1706 | kthread_stop(barrier_cbs_tasks[i]); | ||
| 1707 | barrier_cbs_tasks[i] = NULL; | ||
| 1708 | } | ||
| 1709 | } | ||
| 1710 | kfree(barrier_cbs_tasks); | ||
| 1711 | barrier_cbs_tasks = NULL; | ||
| 1712 | } | ||
| 1713 | if (barrier_cbs_wq != NULL) { | ||
| 1714 | kfree(barrier_cbs_wq); | ||
| 1715 | barrier_cbs_wq = NULL; | ||
| 1716 | } | ||
| 1717 | } | ||
| 1718 | |||
| 1719 | static int rcutorture_cpu_notify(struct notifier_block *self, | ||
| 1720 | unsigned long action, void *hcpu) | ||
| 1721 | { | ||
| 1722 | long cpu = (long)hcpu; | ||
| 1723 | |||
| 1724 | switch (action) { | ||
| 1725 | case CPU_ONLINE: | ||
| 1726 | case CPU_DOWN_FAILED: | ||
| 1727 | (void)rcutorture_booster_init(cpu); | ||
| 1728 | break; | ||
| 1729 | case CPU_DOWN_PREPARE: | ||
| 1730 | rcutorture_booster_cleanup(cpu); | ||
| 1731 | break; | ||
| 1732 | default: | ||
| 1733 | break; | ||
| 1734 | } | ||
| 1735 | return NOTIFY_OK; | ||
| 1736 | } | ||
| 1737 | |||
| 1738 | static struct notifier_block rcutorture_cpu_nb = { | ||
| 1739 | .notifier_call = rcutorture_cpu_notify, | ||
| 1740 | }; | ||
| 1741 | |||
| 1742 | static void | ||
| 1743 | rcu_torture_cleanup(void) | ||
| 1744 | { | ||
| 1745 | int i; | ||
| 1746 | |||
| 1747 | mutex_lock(&fullstop_mutex); | ||
| 1748 | rcutorture_record_test_transition(); | ||
| 1749 | if (fullstop == FULLSTOP_SHUTDOWN) { | ||
| 1750 | pr_warn(/* but going down anyway, so... */ | ||
| 1751 | "Concurrent 'rmmod rcutorture' and shutdown illegal!\n"); | ||
| 1752 | mutex_unlock(&fullstop_mutex); | ||
| 1753 | schedule_timeout_uninterruptible(10); | ||
| 1754 | if (cur_ops->cb_barrier != NULL) | ||
| 1755 | cur_ops->cb_barrier(); | ||
| 1756 | return; | ||
| 1757 | } | ||
| 1758 | fullstop = FULLSTOP_RMMOD; | ||
| 1759 | mutex_unlock(&fullstop_mutex); | ||
| 1760 | unregister_reboot_notifier(&rcutorture_shutdown_nb); | ||
| 1761 | rcu_torture_barrier_cleanup(); | ||
| 1762 | rcu_torture_stall_cleanup(); | ||
| 1763 | if (stutter_task) { | ||
| 1764 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stutter task"); | ||
| 1765 | kthread_stop(stutter_task); | ||
| 1766 | } | ||
| 1767 | stutter_task = NULL; | ||
| 1768 | if (shuffler_task) { | ||
| 1769 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task"); | ||
| 1770 | kthread_stop(shuffler_task); | ||
| 1771 | free_cpumask_var(shuffle_tmp_mask); | ||
| 1772 | } | ||
| 1773 | shuffler_task = NULL; | ||
| 1774 | |||
| 1775 | if (writer_task) { | ||
| 1776 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task"); | ||
| 1777 | kthread_stop(writer_task); | ||
| 1778 | } | ||
| 1779 | writer_task = NULL; | ||
| 1780 | |||
| 1781 | if (reader_tasks) { | ||
| 1782 | for (i = 0; i < nrealreaders; i++) { | ||
| 1783 | if (reader_tasks[i]) { | ||
| 1784 | VERBOSE_PRINTK_STRING( | ||
| 1785 | "Stopping rcu_torture_reader task"); | ||
| 1786 | kthread_stop(reader_tasks[i]); | ||
| 1787 | } | ||
| 1788 | reader_tasks[i] = NULL; | ||
| 1789 | } | ||
| 1790 | kfree(reader_tasks); | ||
| 1791 | reader_tasks = NULL; | ||
| 1792 | } | ||
| 1793 | rcu_torture_current = NULL; | ||
| 1794 | |||
| 1795 | if (fakewriter_tasks) { | ||
| 1796 | for (i = 0; i < nfakewriters; i++) { | ||
| 1797 | if (fakewriter_tasks[i]) { | ||
| 1798 | VERBOSE_PRINTK_STRING( | ||
| 1799 | "Stopping rcu_torture_fakewriter task"); | ||
| 1800 | kthread_stop(fakewriter_tasks[i]); | ||
| 1801 | } | ||
| 1802 | fakewriter_tasks[i] = NULL; | ||
| 1803 | } | ||
| 1804 | kfree(fakewriter_tasks); | ||
| 1805 | fakewriter_tasks = NULL; | ||
| 1806 | } | ||
| 1807 | |||
| 1808 | if (stats_task) { | ||
| 1809 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task"); | ||
| 1810 | kthread_stop(stats_task); | ||
| 1811 | } | ||
| 1812 | stats_task = NULL; | ||
| 1813 | |||
| 1814 | if (fqs_task) { | ||
| 1815 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task"); | ||
| 1816 | kthread_stop(fqs_task); | ||
| 1817 | } | ||
| 1818 | fqs_task = NULL; | ||
| 1819 | if ((test_boost == 1 && cur_ops->can_boost) || | ||
| 1820 | test_boost == 2) { | ||
| 1821 | unregister_cpu_notifier(&rcutorture_cpu_nb); | ||
| 1822 | for_each_possible_cpu(i) | ||
| 1823 | rcutorture_booster_cleanup(i); | ||
| 1824 | } | ||
| 1825 | if (shutdown_task != NULL) { | ||
| 1826 | VERBOSE_PRINTK_STRING("Stopping rcu_torture_shutdown task"); | ||
| 1827 | kthread_stop(shutdown_task); | ||
| 1828 | } | ||
| 1829 | shutdown_task = NULL; | ||
| 1830 | rcu_torture_onoff_cleanup(); | ||
| 1831 | |||
| 1832 | /* Wait for all RCU callbacks to fire. */ | ||
| 1833 | |||
| 1834 | if (cur_ops->cb_barrier != NULL) | ||
| 1835 | cur_ops->cb_barrier(); | ||
| 1836 | |||
| 1837 | rcu_torture_stats_print(); /* -After- the stats thread is stopped! */ | ||
| 1838 | |||
| 1839 | if (atomic_read(&n_rcu_torture_error) || n_rcu_torture_barrier_error) | ||
| 1840 | rcu_torture_print_module_parms(cur_ops, "End of test: FAILURE"); | ||
| 1841 | else if (n_online_successes != n_online_attempts || | ||
| 1842 | n_offline_successes != n_offline_attempts) | ||
| 1843 | rcu_torture_print_module_parms(cur_ops, | ||
| 1844 | "End of test: RCU_HOTPLUG"); | ||
| 1845 | else | ||
| 1846 | rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); | ||
| 1847 | } | ||
| 1848 | |||
| 1849 | #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD | ||
| 1850 | static void rcu_torture_leak_cb(struct rcu_head *rhp) | ||
| 1851 | { | ||
| 1852 | } | ||
| 1853 | |||
| 1854 | static void rcu_torture_err_cb(struct rcu_head *rhp) | ||
| 1855 | { | ||
| 1856 | /* | ||
| 1857 | * This -might- happen due to race conditions, but is unlikely. | ||
| 1858 | * The scenario that leads to this happening is that the | ||
| 1859 | * first of the pair of duplicate callbacks is queued, | ||
| 1860 | * someone else starts a grace period that includes that | ||
| 1861 | * callback, then the second of the pair must wait for the | ||
| 1862 | * next grace period. Unlikely, but can happen. If it | ||
| 1863 | * does happen, the debug-objects subsystem won't have splatted. | ||
| 1864 | */ | ||
| 1865 | pr_alert("rcutorture: duplicated callback was invoked.\n"); | ||
| 1866 | } | ||
| 1867 | #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | ||
| 1868 | |||
| 1869 | /* | ||
| 1870 | * Verify that double-free causes debug-objects to complain, but only | ||
| 1871 | * if CONFIG_DEBUG_OBJECTS_RCU_HEAD=y. Otherwise, say that the test | ||
| 1872 | * cannot be carried out. | ||
| 1873 | */ | ||
| 1874 | static void rcu_test_debug_objects(void) | ||
| 1875 | { | ||
| 1876 | #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD | ||
| 1877 | struct rcu_head rh1; | ||
| 1878 | struct rcu_head rh2; | ||
| 1879 | |||
| 1880 | init_rcu_head_on_stack(&rh1); | ||
| 1881 | init_rcu_head_on_stack(&rh2); | ||
| 1882 | pr_alert("rcutorture: WARN: Duplicate call_rcu() test starting.\n"); | ||
| 1883 | |||
| 1884 | /* Try to queue the rh2 pair of callbacks for the same grace period. */ | ||
| 1885 | preempt_disable(); /* Prevent preemption from interrupting test. */ | ||
| 1886 | rcu_read_lock(); /* Make it impossible to finish a grace period. */ | ||
| 1887 | call_rcu(&rh1, rcu_torture_leak_cb); /* Start grace period. */ | ||
| 1888 | local_irq_disable(); /* Make it harder to start a new grace period. */ | ||
| 1889 | call_rcu(&rh2, rcu_torture_leak_cb); | ||
| 1890 | call_rcu(&rh2, rcu_torture_err_cb); /* Duplicate callback. */ | ||
| 1891 | local_irq_enable(); | ||
| 1892 | rcu_read_unlock(); | ||
| 1893 | preempt_enable(); | ||
| 1894 | |||
| 1895 | /* Wait for them all to get done so we can safely return. */ | ||
| 1896 | rcu_barrier(); | ||
| 1897 | pr_alert("rcutorture: WARN: Duplicate call_rcu() test complete.\n"); | ||
| 1898 | destroy_rcu_head_on_stack(&rh1); | ||
| 1899 | destroy_rcu_head_on_stack(&rh2); | ||
| 1900 | #else /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | ||
| 1901 | pr_alert("rcutorture: !CONFIG_DEBUG_OBJECTS_RCU_HEAD, not testing duplicate call_rcu()\n"); | ||
| 1902 | #endif /* #else #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | ||
| 1903 | } | ||
| 1904 | |||
| 1905 | static int __init | ||
| 1906 | rcu_torture_init(void) | ||
| 1907 | { | ||
| 1908 | int i; | ||
| 1909 | int cpu; | ||
| 1910 | int firsterr = 0; | ||
| 1911 | int retval; | ||
| 1912 | static struct rcu_torture_ops *torture_ops[] = { | ||
| 1913 | &rcu_ops, &rcu_bh_ops, &srcu_ops, &sched_ops, | ||
| 1914 | }; | ||
| 1915 | |||
| 1916 | mutex_lock(&fullstop_mutex); | ||
| 1917 | |||
| 1918 | /* Process args and tell the world that the torturer is on the job. */ | ||
| 1919 | for (i = 0; i < ARRAY_SIZE(torture_ops); i++) { | ||
| 1920 | cur_ops = torture_ops[i]; | ||
| 1921 | if (strcmp(torture_type, cur_ops->name) == 0) | ||
| 1922 | break; | ||
| 1923 | } | ||
| 1924 | if (i == ARRAY_SIZE(torture_ops)) { | ||
| 1925 | pr_alert("rcu-torture: invalid torture type: \"%s\"\n", | ||
| 1926 | torture_type); | ||
| 1927 | pr_alert("rcu-torture types:"); | ||
| 1928 | for (i = 0; i < ARRAY_SIZE(torture_ops); i++) | ||
| 1929 | pr_alert(" %s", torture_ops[i]->name); | ||
| 1930 | pr_alert("\n"); | ||
| 1931 | mutex_unlock(&fullstop_mutex); | ||
| 1932 | return -EINVAL; | ||
| 1933 | } | ||
| 1934 | if (cur_ops->fqs == NULL && fqs_duration != 0) { | ||
| 1935 | pr_alert("rcu-torture: ->fqs NULL and non-zero fqs_duration, fqs disabled.\n"); | ||
| 1936 | fqs_duration = 0; | ||
| 1937 | } | ||
| 1938 | if (cur_ops->init) | ||
| 1939 | cur_ops->init(); /* no "goto unwind" prior to this point!!! */ | ||
| 1940 | |||
| 1941 | if (nreaders >= 0) | ||
| 1942 | nrealreaders = nreaders; | ||
| 1943 | else | ||
| 1944 | nrealreaders = 2 * num_online_cpus(); | ||
| 1945 | rcu_torture_print_module_parms(cur_ops, "Start of test"); | ||
| 1946 | fullstop = FULLSTOP_DONTSTOP; | ||
| 1947 | |||
| 1948 | /* Set up the freelist. */ | ||
| 1949 | |||
| 1950 | INIT_LIST_HEAD(&rcu_torture_freelist); | ||
| 1951 | for (i = 0; i < ARRAY_SIZE(rcu_tortures); i++) { | ||
| 1952 | rcu_tortures[i].rtort_mbtest = 0; | ||
| 1953 | list_add_tail(&rcu_tortures[i].rtort_free, | ||
| 1954 | &rcu_torture_freelist); | ||
| 1955 | } | ||
| 1956 | |||
| 1957 | /* Initialize the statistics so that each run gets its own numbers. */ | ||
| 1958 | |||
| 1959 | rcu_torture_current = NULL; | ||
| 1960 | rcu_torture_current_version = 0; | ||
| 1961 | atomic_set(&n_rcu_torture_alloc, 0); | ||
| 1962 | atomic_set(&n_rcu_torture_alloc_fail, 0); | ||
| 1963 | atomic_set(&n_rcu_torture_free, 0); | ||
| 1964 | atomic_set(&n_rcu_torture_mberror, 0); | ||
| 1965 | atomic_set(&n_rcu_torture_error, 0); | ||
| 1966 | n_rcu_torture_barrier_error = 0; | ||
| 1967 | n_rcu_torture_boost_ktrerror = 0; | ||
| 1968 | n_rcu_torture_boost_rterror = 0; | ||
| 1969 | n_rcu_torture_boost_failure = 0; | ||
| 1970 | n_rcu_torture_boosts = 0; | ||
| 1971 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) | ||
| 1972 | atomic_set(&rcu_torture_wcount[i], 0); | ||
| 1973 | for_each_possible_cpu(cpu) { | ||
| 1974 | for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) { | ||
| 1975 | per_cpu(rcu_torture_count, cpu)[i] = 0; | ||
| 1976 | per_cpu(rcu_torture_batch, cpu)[i] = 0; | ||
| 1977 | } | ||
| 1978 | } | ||
| 1979 | |||
| 1980 | /* Start up the kthreads. */ | ||
| 1981 | |||
| 1982 | VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task"); | ||
| 1983 | writer_task = kthread_create(rcu_torture_writer, NULL, | ||
| 1984 | "rcu_torture_writer"); | ||
| 1985 | if (IS_ERR(writer_task)) { | ||
| 1986 | firsterr = PTR_ERR(writer_task); | ||
| 1987 | VERBOSE_PRINTK_ERRSTRING("Failed to create writer"); | ||
| 1988 | writer_task = NULL; | ||
| 1989 | goto unwind; | ||
| 1990 | } | ||
| 1991 | wake_up_process(writer_task); | ||
| 1992 | fakewriter_tasks = kzalloc(nfakewriters * sizeof(fakewriter_tasks[0]), | ||
| 1993 | GFP_KERNEL); | ||
| 1994 | if (fakewriter_tasks == NULL) { | ||
| 1995 | VERBOSE_PRINTK_ERRSTRING("out of memory"); | ||
| 1996 | firsterr = -ENOMEM; | ||
| 1997 | goto unwind; | ||
| 1998 | } | ||
| 1999 | for (i = 0; i < nfakewriters; i++) { | ||
| 2000 | VERBOSE_PRINTK_STRING("Creating rcu_torture_fakewriter task"); | ||
| 2001 | fakewriter_tasks[i] = kthread_run(rcu_torture_fakewriter, NULL, | ||
| 2002 | "rcu_torture_fakewriter"); | ||
| 2003 | if (IS_ERR(fakewriter_tasks[i])) { | ||
| 2004 | firsterr = PTR_ERR(fakewriter_tasks[i]); | ||
| 2005 | VERBOSE_PRINTK_ERRSTRING("Failed to create fakewriter"); | ||
| 2006 | fakewriter_tasks[i] = NULL; | ||
| 2007 | goto unwind; | ||
| 2008 | } | ||
| 2009 | } | ||
| 2010 | reader_tasks = kzalloc(nrealreaders * sizeof(reader_tasks[0]), | ||
| 2011 | GFP_KERNEL); | ||
| 2012 | if (reader_tasks == NULL) { | ||
| 2013 | VERBOSE_PRINTK_ERRSTRING("out of memory"); | ||
| 2014 | firsterr = -ENOMEM; | ||
| 2015 | goto unwind; | ||
| 2016 | } | ||
| 2017 | for (i = 0; i < nrealreaders; i++) { | ||
| 2018 | VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task"); | ||
| 2019 | reader_tasks[i] = kthread_run(rcu_torture_reader, NULL, | ||
| 2020 | "rcu_torture_reader"); | ||
| 2021 | if (IS_ERR(reader_tasks[i])) { | ||
| 2022 | firsterr = PTR_ERR(reader_tasks[i]); | ||
| 2023 | VERBOSE_PRINTK_ERRSTRING("Failed to create reader"); | ||
| 2024 | reader_tasks[i] = NULL; | ||
| 2025 | goto unwind; | ||
| 2026 | } | ||
| 2027 | } | ||
| 2028 | if (stat_interval > 0) { | ||
| 2029 | VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task"); | ||
| 2030 | stats_task = kthread_run(rcu_torture_stats, NULL, | ||
| 2031 | "rcu_torture_stats"); | ||
| 2032 | if (IS_ERR(stats_task)) { | ||
| 2033 | firsterr = PTR_ERR(stats_task); | ||
| 2034 | VERBOSE_PRINTK_ERRSTRING("Failed to create stats"); | ||
| 2035 | stats_task = NULL; | ||
| 2036 | goto unwind; | ||
| 2037 | } | ||
| 2038 | } | ||
| 2039 | if (test_no_idle_hz) { | ||
| 2040 | rcu_idle_cpu = num_online_cpus() - 1; | ||
| 2041 | |||
| 2042 | if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) { | ||
| 2043 | firsterr = -ENOMEM; | ||
| 2044 | VERBOSE_PRINTK_ERRSTRING("Failed to alloc mask"); | ||
| 2045 | goto unwind; | ||
| 2046 | } | ||
| 2047 | |||
| 2048 | /* Create the shuffler thread */ | ||
| 2049 | shuffler_task = kthread_run(rcu_torture_shuffle, NULL, | ||
| 2050 | "rcu_torture_shuffle"); | ||
| 2051 | if (IS_ERR(shuffler_task)) { | ||
| 2052 | free_cpumask_var(shuffle_tmp_mask); | ||
| 2053 | firsterr = PTR_ERR(shuffler_task); | ||
| 2054 | VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler"); | ||
| 2055 | shuffler_task = NULL; | ||
| 2056 | goto unwind; | ||
| 2057 | } | ||
| 2058 | } | ||
| 2059 | if (stutter < 0) | ||
| 2060 | stutter = 0; | ||
| 2061 | if (stutter) { | ||
| 2062 | /* Create the stutter thread */ | ||
| 2063 | stutter_task = kthread_run(rcu_torture_stutter, NULL, | ||
| 2064 | "rcu_torture_stutter"); | ||
| 2065 | if (IS_ERR(stutter_task)) { | ||
| 2066 | firsterr = PTR_ERR(stutter_task); | ||
| 2067 | VERBOSE_PRINTK_ERRSTRING("Failed to create stutter"); | ||
| 2068 | stutter_task = NULL; | ||
| 2069 | goto unwind; | ||
| 2070 | } | ||
| 2071 | } | ||
| 2072 | if (fqs_duration < 0) | ||
| 2073 | fqs_duration = 0; | ||
| 2074 | if (fqs_duration) { | ||
| 2075 | /* Create the fqs thread */ | ||
| 2076 | fqs_task = kthread_run(rcu_torture_fqs, NULL, | ||
| 2077 | "rcu_torture_fqs"); | ||
| 2078 | if (IS_ERR(fqs_task)) { | ||
| 2079 | firsterr = PTR_ERR(fqs_task); | ||
| 2080 | VERBOSE_PRINTK_ERRSTRING("Failed to create fqs"); | ||
| 2081 | fqs_task = NULL; | ||
| 2082 | goto unwind; | ||
| 2083 | } | ||
| 2084 | } | ||
| 2085 | if (test_boost_interval < 1) | ||
| 2086 | test_boost_interval = 1; | ||
| 2087 | if (test_boost_duration < 2) | ||
| 2088 | test_boost_duration = 2; | ||
| 2089 | if ((test_boost == 1 && cur_ops->can_boost) || | ||
| 2090 | test_boost == 2) { | ||
| 2091 | |||
| 2092 | boost_starttime = jiffies + test_boost_interval * HZ; | ||
| 2093 | register_cpu_notifier(&rcutorture_cpu_nb); | ||
| 2094 | for_each_possible_cpu(i) { | ||
| 2095 | if (cpu_is_offline(i)) | ||
| 2096 | continue; /* Heuristic: CPU can go offline. */ | ||
| 2097 | retval = rcutorture_booster_init(i); | ||
| 2098 | if (retval < 0) { | ||
| 2099 | firsterr = retval; | ||
| 2100 | goto unwind; | ||
| 2101 | } | ||
| 2102 | } | ||
| 2103 | } | ||
| 2104 | if (shutdown_secs > 0) { | ||
| 2105 | shutdown_time = jiffies + shutdown_secs * HZ; | ||
| 2106 | shutdown_task = kthread_create(rcu_torture_shutdown, NULL, | ||
| 2107 | "rcu_torture_shutdown"); | ||
| 2108 | if (IS_ERR(shutdown_task)) { | ||
| 2109 | firsterr = PTR_ERR(shutdown_task); | ||
| 2110 | VERBOSE_PRINTK_ERRSTRING("Failed to create shutdown"); | ||
| 2111 | shutdown_task = NULL; | ||
| 2112 | goto unwind; | ||
| 2113 | } | ||
| 2114 | wake_up_process(shutdown_task); | ||
| 2115 | } | ||
| 2116 | i = rcu_torture_onoff_init(); | ||
| 2117 | if (i != 0) { | ||
| 2118 | firsterr = i; | ||
| 2119 | goto unwind; | ||
| 2120 | } | ||
| 2121 | register_reboot_notifier(&rcutorture_shutdown_nb); | ||
| 2122 | i = rcu_torture_stall_init(); | ||
| 2123 | if (i != 0) { | ||
| 2124 | firsterr = i; | ||
| 2125 | goto unwind; | ||
| 2126 | } | ||
| 2127 | retval = rcu_torture_barrier_init(); | ||
| 2128 | if (retval != 0) { | ||
| 2129 | firsterr = retval; | ||
| 2130 | goto unwind; | ||
| 2131 | } | ||
| 2132 | if (object_debug) | ||
| 2133 | rcu_test_debug_objects(); | ||
| 2134 | rcutorture_record_test_transition(); | ||
| 2135 | mutex_unlock(&fullstop_mutex); | ||
| 2136 | return 0; | ||
| 2137 | |||
| 2138 | unwind: | ||
| 2139 | mutex_unlock(&fullstop_mutex); | ||
| 2140 | rcu_torture_cleanup(); | ||
| 2141 | return firsterr; | ||
| 2142 | } | ||
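rcu_torture_init() records only the first error and funnels every failure through the single "unwind" label, relying on rcu_torture_cleanup() to cope with partially-initialized state. A minimal user-space sketch of that record-first-error/goto-unwind shape; demo_init(), step_a, and step_b are invented names:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

static int *step_a;
static int *step_b;

static int demo_init(void)
{
	int firsterr = 0;

	step_a = malloc(sizeof(*step_a));
	if (step_a == NULL) {
		firsterr = -ENOMEM;
		goto unwind;
	}
	step_b = malloc(sizeof(*step_b));
	if (step_b == NULL) {
		firsterr = -ENOMEM;
		goto unwind;
	}
	return 0;	/* fully initialized */

unwind:
	/* Cleanup must tolerate partially-initialized state. */
	free(step_b);
	free(step_a);
	return firsterr;
}

int main(void)
{
	printf("demo_init() = %d\n", demo_init());
	return 0;
}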
| 2143 | |||
| 2144 | module_init(rcu_torture_init); | ||
| 2145 | module_exit(rcu_torture_cleanup); | ||
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c new file mode 100644 index 000000000000..4c06ddfea7cd --- /dev/null +++ b/kernel/rcu/tree.c | |||
| @@ -0,0 +1,3416 @@ | |||
| 1 | /* | ||
| 2 | * Read-Copy Update mechanism for mutual exclusion | ||
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or modify | ||
| 5 | * it under the terms of the GNU General Public License as published by | ||
| 6 | * the Free Software Foundation; either version 2 of the License, or | ||
| 7 | * (at your option) any later version. | ||
| 8 | * | ||
| 9 | * This program is distributed in the hope that it will be useful, | ||
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 12 | * GNU General Public License for more details. | ||
| 13 | * | ||
| 14 | * You should have received a copy of the GNU General Public License | ||
| 15 | * along with this program; if not, write to the Free Software | ||
| 16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
| 17 | * | ||
| 18 | * Copyright IBM Corporation, 2008 | ||
| 19 | * | ||
| 20 | * Authors: Dipankar Sarma <dipankar@in.ibm.com> | ||
| 21 | * Manfred Spraul <manfred@colorfullife.com> | ||
| 22 | * Paul E. McKenney <paulmck@linux.vnet.ibm.com> Hierarchical version | ||
| 23 | * | ||
| 24 | * Based on the original work by Paul McKenney <paulmck@us.ibm.com> | ||
| 25 | * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. | ||
| 26 | * | ||
| 27 | * For detailed explanation of Read-Copy Update mechanism see - | ||
| 28 | * Documentation/RCU | ||
| 29 | */ | ||
| 30 | #include <linux/types.h> | ||
| 31 | #include <linux/kernel.h> | ||
| 32 | #include <linux/init.h> | ||
| 33 | #include <linux/spinlock.h> | ||
| 34 | #include <linux/smp.h> | ||
| 35 | #include <linux/rcupdate.h> | ||
| 36 | #include <linux/interrupt.h> | ||
| 37 | #include <linux/sched.h> | ||
| 38 | #include <linux/nmi.h> | ||
| 39 | #include <linux/atomic.h> | ||
| 40 | #include <linux/bitops.h> | ||
| 41 | #include <linux/export.h> | ||
| 42 | #include <linux/completion.h> | ||
| 43 | #include <linux/moduleparam.h> | ||
| 44 | #include <linux/module.h> | ||
| 45 | #include <linux/percpu.h> | ||
| 46 | #include <linux/notifier.h> | ||
| 47 | #include <linux/cpu.h> | ||
| 48 | #include <linux/mutex.h> | ||
| 49 | #include <linux/time.h> | ||
| 50 | #include <linux/kernel_stat.h> | ||
| 51 | #include <linux/wait.h> | ||
| 52 | #include <linux/kthread.h> | ||
| 53 | #include <linux/prefetch.h> | ||
| 54 | #include <linux/delay.h> | ||
| 55 | #include <linux/stop_machine.h> | ||
| 56 | #include <linux/random.h> | ||
| 57 | #include <linux/ftrace_event.h> | ||
| 58 | #include <linux/suspend.h> | ||
| 59 | |||
| 60 | #include "tree.h" | ||
| 61 | #include <trace/events/rcu.h> | ||
| 62 | |||
| 63 | #include "rcu.h" | ||
| 64 | |||
| 65 | MODULE_ALIAS("rcutree"); | ||
| 66 | #ifdef MODULE_PARAM_PREFIX | ||
| 67 | #undef MODULE_PARAM_PREFIX | ||
| 68 | #endif | ||
| 69 | #define MODULE_PARAM_PREFIX "rcutree." | ||
| 70 | |||
| 71 | /* Data structures. */ | ||
| 72 | |||
| 73 | static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; | ||
| 74 | static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; | ||
| 75 | |||
| 76 | /* | ||
| 77 | * In order to export the rcu_state name to the tracing tools, it | ||
| 78 | * needs to be added in the __tracepoint_string section. | ||
| 79 | * This requires defining a separate variable tp_<sname>_varname | ||
| 80 | * that points to the string being used, and this allows the | ||
| 81 | * userspace tracing tools to map the string address back to the | ||
| 82 | * matching string. | ||
| 83 | */ | ||
| 84 | #define RCU_STATE_INITIALIZER(sname, sabbr, cr) \ | ||
| 85 | static char sname##_varname[] = #sname; \ | ||
| 86 | static const char *tp_##sname##_varname __used __tracepoint_string = sname##_varname; \ | ||
| 87 | struct rcu_state sname##_state = { \ | ||
| 88 | .level = { &sname##_state.node[0] }, \ | ||
| 89 | .call = cr, \ | ||
| 90 | .fqs_state = RCU_GP_IDLE, \ | ||
| 91 | .gpnum = 0UL - 300UL, \ | ||
| 92 | .completed = 0UL - 300UL, \ | ||
| 93 | .orphan_lock = __RAW_SPIN_LOCK_UNLOCKED(&sname##_state.orphan_lock), \ | ||
| 94 | .orphan_nxttail = &sname##_state.orphan_nxtlist, \ | ||
| 95 | .orphan_donetail = &sname##_state.orphan_donelist, \ | ||
| 96 | .barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \ | ||
| 97 | .onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \ | ||
| 98 | .name = sname##_varname, \ | ||
| 99 | .abbr = sabbr, \ | ||
| 100 | }; \ | ||
| 101 | DEFINE_PER_CPU(struct rcu_data, sname##_data) | ||
| 102 | |||
| 103 | RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); | ||
| 104 | RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); | ||
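RCU_STATE_INITIALIZER() leans on the preprocessor's stringize (#sname) and token-paste (sname##...) operators to generate both the flavor's name string and per-flavor symbols from a single argument. A tiny stand-alone illustration of those two operators; DECLARE_FLAVOR and rcu_demo are made-up names:

#include <stdio.h>

#define DECLARE_FLAVOR(sname) \
	static char sname##_varname[] = #sname; \
	static int sname##_counter

DECLARE_FLAVOR(rcu_demo);	/* defines rcu_demo_varname and rcu_demo_counter */

int main(void)
{
	rcu_demo_counter = 3;
	printf("%s: %d\n", rcu_demo_varname, rcu_demo_counter);
	return 0;
}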
| 105 | |||
| 106 | static struct rcu_state *rcu_state; | ||
| 107 | LIST_HEAD(rcu_struct_flavors); | ||
| 108 | |||
| 109 | /* Increase (but not decrease) the CONFIG_RCU_FANOUT_LEAF at boot time. */ | ||
| 110 | static int rcu_fanout_leaf = CONFIG_RCU_FANOUT_LEAF; | ||
| 111 | module_param(rcu_fanout_leaf, int, 0444); | ||
| 112 | int rcu_num_lvls __read_mostly = RCU_NUM_LVLS; | ||
| 113 | static int num_rcu_lvl[] = { /* Number of rcu_nodes at specified level. */ | ||
| 114 | NUM_RCU_LVL_0, | ||
| 115 | NUM_RCU_LVL_1, | ||
| 116 | NUM_RCU_LVL_2, | ||
| 117 | NUM_RCU_LVL_3, | ||
| 118 | NUM_RCU_LVL_4, | ||
| 119 | }; | ||
| 120 | int rcu_num_nodes __read_mostly = NUM_RCU_NODES; /* Total # rcu_nodes in use. */ | ||
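The num_rcu_lvl[] table describes how many rcu_node structures sit at each level of the combining tree, which follows from the leaf fanout and the CPU count. A simplified user-space sketch of that geometry calculation; it only counts levels and ignores per-level node counts and kernel configuration limits:

#include <stdio.h>

int main(void)
{
	int nr_cpus = 4096, fanout = 64;
	int levels = 1;
	long capacity = fanout;

	while (capacity < nr_cpus) {	/* add a level until all CPUs fit */
		capacity *= fanout;
		levels++;
	}
	printf("%d CPUs with fanout %d need %d tree level(s)\n",
	       nr_cpus, fanout, levels);
	return 0;
}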
| 121 | |||
| 122 | /* | ||
| 123 | * The rcu_scheduler_active variable transitions from zero to one just | ||
| 124 | * before the first task is spawned. So when this variable is zero, RCU | ||
| 125 | * can assume that there is but one task, allowing RCU to (for example) | ||
| 126 | * optimize synchronize_sched() to a simple barrier(). When this variable | ||
| 127 | * is one, RCU must actually do all the hard work required to detect real | ||
| 128 | * grace periods. This variable is also used to suppress boot-time false | ||
| 129 | * positives from lockdep-RCU error checking. | ||
| 130 | */ | ||
| 131 | int rcu_scheduler_active __read_mostly; | ||
| 132 | EXPORT_SYMBOL_GPL(rcu_scheduler_active); | ||
| 133 | |||
| 134 | /* | ||
| 135 | * The rcu_scheduler_fully_active variable transitions from zero to one | ||
| 136 | * during the early_initcall() processing, which is after the scheduler | ||
| 137 | * is capable of creating new tasks. So RCU processing (for example, | ||
| 138 | * creating tasks for RCU priority boosting) must be delayed until after | ||
| 139 | * rcu_scheduler_fully_active transitions from zero to one. We also | ||
| 140 | * currently delay invocation of any RCU callbacks until after this point. | ||
| 141 | * | ||
| 142 | * It might later prove better for people registering RCU callbacks during | ||
| 143 | * early boot to take responsibility for these callbacks, but one step at | ||
| 144 | * a time. | ||
| 145 | */ | ||
| 146 | static int rcu_scheduler_fully_active __read_mostly; | ||
| 147 | |||
| 148 | #ifdef CONFIG_RCU_BOOST | ||
| 149 | |||
| 150 | /* | ||
| 151 | * Control variables for per-CPU and per-rcu_node kthreads. These | ||
| 152 | * handle all flavors of RCU. | ||
| 153 | */ | ||
| 154 | static DEFINE_PER_CPU(struct task_struct *, rcu_cpu_kthread_task); | ||
| 155 | DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_status); | ||
| 156 | DEFINE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); | ||
| 157 | DEFINE_PER_CPU(char, rcu_cpu_has_work); | ||
| 158 | |||
| 159 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 160 | |||
| 161 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu); | ||
| 162 | static void invoke_rcu_core(void); | ||
| 163 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp); | ||
| 164 | |||
| 165 | /* | ||
| 166 | * Track the rcutorture test sequence number and the update version | ||
| 167 | * number within a given test. The rcutorture_testseq is incremented | ||
| 168 | * on every rcutorture module load and unload, so has an odd value | ||
| 169 | * when a test is running. The rcutorture_vernum is set to zero | ||
| 170 | * when rcutorture starts and is incremented on each rcutorture update. | ||
| 171 | * These variables enable correlating rcutorture output with the | ||
| 172 | * RCU tracing information. | ||
| 173 | */ | ||
| 174 | unsigned long rcutorture_testseq; | ||
| 175 | unsigned long rcutorture_vernum; | ||
| 176 | |||
| 177 | /* | ||
| 178 | * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s | ||
| 179 | * permit this function to be invoked without holding the root rcu_node | ||
| 180 | * structure's ->lock, but of course results can be subject to change. | ||
| 181 | */ | ||
| 182 | static int rcu_gp_in_progress(struct rcu_state *rsp) | ||
| 183 | { | ||
| 184 | return ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum); | ||
| 185 | } | ||
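A grace period is considered in progress exactly while ->completed lags behind ->gpnum; both counters free-run and are only ever compared. A stand-alone sketch of that two-counter scheme (gp_in_progress() is an invented stand-in for the function above):

#include <stdio.h>

static unsigned long gpnum;	/* number of the most recently started GP */
static unsigned long completed;	/* number of the most recently finished GP */

static int gp_in_progress(void)
{
	return completed != gpnum;
}

int main(void)
{
	printf("in progress? %d\n", gp_in_progress());	/* 0: counters equal */
	gpnum++;					/* start a grace period */
	printf("in progress? %d\n", gp_in_progress());	/* 1 */
	completed = gpnum;				/* grace period ends */
	printf("in progress? %d\n", gp_in_progress());	/* 0 again */
	return 0;
}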
| 186 | |||
| 187 | /* | ||
| 188 | * Note a quiescent state. Because we do not need to know | ||
| 189 | * how many quiescent states passed, only whether there was at least | ||
| 190 | * one since the start of the grace period, this just sets a flag. | ||
| 191 | * The caller must have disabled preemption. | ||
| 192 | */ | ||
| 193 | void rcu_sched_qs(int cpu) | ||
| 194 | { | ||
| 195 | struct rcu_data *rdp = &per_cpu(rcu_sched_data, cpu); | ||
| 196 | |||
| 197 | if (rdp->passed_quiesce == 0) | ||
| 198 | trace_rcu_grace_period(TPS("rcu_sched"), rdp->gpnum, TPS("cpuqs")); | ||
| 199 | rdp->passed_quiesce = 1; | ||
| 200 | } | ||
| 201 | |||
| 202 | void rcu_bh_qs(int cpu) | ||
| 203 | { | ||
| 204 | struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); | ||
| 205 | |||
| 206 | if (rdp->passed_quiesce == 0) | ||
| 207 | trace_rcu_grace_period(TPS("rcu_bh"), rdp->gpnum, TPS("cpuqs")); | ||
| 208 | rdp->passed_quiesce = 1; | ||
| 209 | } | ||
| 210 | |||
| 211 | /* | ||
| 212 | * Note a context switch. This is a quiescent state for RCU-sched, | ||
| 213 | * and requires special handling for preemptible RCU. | ||
| 214 | * The caller must have disabled preemption. | ||
| 215 | */ | ||
| 216 | void rcu_note_context_switch(int cpu) | ||
| 217 | { | ||
| 218 | trace_rcu_utilization(TPS("Start context switch")); | ||
| 219 | rcu_sched_qs(cpu); | ||
| 220 | rcu_preempt_note_context_switch(cpu); | ||
| 221 | trace_rcu_utilization(TPS("End context switch")); | ||
| 222 | } | ||
| 223 | EXPORT_SYMBOL_GPL(rcu_note_context_switch); | ||
| 224 | |||
| 225 | static DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = { | ||
| 226 | .dynticks_nesting = DYNTICK_TASK_EXIT_IDLE, | ||
| 227 | .dynticks = ATOMIC_INIT(1), | ||
| 228 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE | ||
| 229 | .dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE, | ||
| 230 | .dynticks_idle = ATOMIC_INIT(1), | ||
| 231 | #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | ||
| 232 | }; | ||
| 233 | |||
| 234 | static long blimit = 10; /* Maximum callbacks per rcu_do_batch. */ | ||
| 235 | static long qhimark = 10000; /* If this many pending, ignore blimit. */ | ||
| 236 | static long qlowmark = 100; /* Once only this many pending, use blimit. */ | ||
| 237 | |||
| 238 | module_param(blimit, long, 0444); | ||
| 239 | module_param(qhimark, long, 0444); | ||
| 240 | module_param(qlowmark, long, 0444); | ||
| 241 | |||
| 242 | static ulong jiffies_till_first_fqs = ULONG_MAX; | ||
| 243 | static ulong jiffies_till_next_fqs = ULONG_MAX; | ||
| 244 | |||
| 245 | module_param(jiffies_till_first_fqs, ulong, 0644); | ||
| 246 | module_param(jiffies_till_next_fqs, ulong, 0644); | ||
| 247 | |||
| 248 | static void rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, | ||
| 249 | struct rcu_data *rdp); | ||
| 250 | static void force_qs_rnp(struct rcu_state *rsp, | ||
| 251 | int (*f)(struct rcu_data *rsp, bool *isidle, | ||
| 252 | unsigned long *maxj), | ||
| 253 | bool *isidle, unsigned long *maxj); | ||
| 254 | static void force_quiescent_state(struct rcu_state *rsp); | ||
| 255 | static int rcu_pending(int cpu); | ||
| 256 | |||
| 257 | /* | ||
| 258 | * Return the number of RCU-sched batches processed thus far for debug & stats. | ||
| 259 | */ | ||
| 260 | long rcu_batches_completed_sched(void) | ||
| 261 | { | ||
| 262 | return rcu_sched_state.completed; | ||
| 263 | } | ||
| 264 | EXPORT_SYMBOL_GPL(rcu_batches_completed_sched); | ||
| 265 | |||
| 266 | /* | ||
| 267 | * Return the number of RCU BH batches processed thus far for debug & stats. | ||
| 268 | */ | ||
| 269 | long rcu_batches_completed_bh(void) | ||
| 270 | { | ||
| 271 | return rcu_bh_state.completed; | ||
| 272 | } | ||
| 273 | EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); | ||
| 274 | |||
| 275 | /* | ||
| 276 | * Force a quiescent state for RCU BH. | ||
| 277 | */ | ||
| 278 | void rcu_bh_force_quiescent_state(void) | ||
| 279 | { | ||
| 280 | force_quiescent_state(&rcu_bh_state); | ||
| 281 | } | ||
| 282 | EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); | ||
| 283 | |||
| 284 | /* | ||
| 285 | * Record the number of times rcutorture tests have been initiated and | ||
| 286 | * terminated. This information allows the debugfs tracing stats to be | ||
| 287 | * correlated to the rcutorture messages, even when the rcutorture module | ||
| 288 | * is being repeatedly loaded and unloaded. In other words, we cannot | ||
| 289 | * store this state in rcutorture itself. | ||
| 290 | */ | ||
| 291 | void rcutorture_record_test_transition(void) | ||
| 292 | { | ||
| 293 | rcutorture_testseq++; | ||
| 294 | rcutorture_vernum = 0; | ||
| 295 | } | ||
| 296 | EXPORT_SYMBOL_GPL(rcutorture_record_test_transition); | ||
| 297 | |||
| 298 | /* | ||
| 299 | * Record the number of writer passes through the current rcutorture test. | ||
| 300 | * This is also used to correlate debugfs tracing stats with the rcutorture | ||
| 301 | * messages. | ||
| 302 | */ | ||
| 303 | void rcutorture_record_progress(unsigned long vernum) | ||
| 304 | { | ||
| 305 | rcutorture_vernum++; | ||
| 306 | } | ||
| 307 | EXPORT_SYMBOL_GPL(rcutorture_record_progress); | ||
| 308 | |||
| 309 | /* | ||
| 310 | * Force a quiescent state for RCU-sched. | ||
| 311 | */ | ||
| 312 | void rcu_sched_force_quiescent_state(void) | ||
| 313 | { | ||
| 314 | force_quiescent_state(&rcu_sched_state); | ||
| 315 | } | ||
| 316 | EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); | ||
| 317 | |||
| 318 | /* | ||
| 319 | * Does the CPU have callbacks ready to be invoked? | ||
| 320 | */ | ||
| 321 | static int | ||
| 322 | cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp) | ||
| 323 | { | ||
| 324 | return &rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL] && | ||
| 325 | rdp->nxttail[RCU_DONE_TAIL] != NULL; | ||
| 326 | } | ||
| 327 | |||
| 328 | /* | ||
| 329 | * Does the current CPU require a not-yet-started grace period? | ||
| 330 | * The caller must have disabled interrupts to prevent races with | ||
| 331 | * normal callback registry. | ||
| 332 | */ | ||
| 333 | static int | ||
| 334 | cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) | ||
| 335 | { | ||
| 336 | int i; | ||
| 337 | |||
| 338 | if (rcu_gp_in_progress(rsp)) | ||
| 339 | return 0; /* No, a grace period is already in progress. */ | ||
| 340 | if (rcu_nocb_needs_gp(rsp)) | ||
| 341 | return 1; /* Yes, a no-CBs CPU needs one. */ | ||
| 342 | if (!rdp->nxttail[RCU_NEXT_TAIL]) | ||
| 343 | return 0; /* No, this is a no-CBs (or offline) CPU. */ | ||
| 344 | if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) | ||
| 345 | return 1; /* Yes, this CPU has newly registered callbacks. */ | ||
| 346 | for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) | ||
| 347 | if (rdp->nxttail[i - 1] != rdp->nxttail[i] && | ||
| 348 | ULONG_CMP_LT(ACCESS_ONCE(rsp->completed), | ||
| 349 | rdp->nxtcompleted[i])) | ||
| 350 | return 1; /* Yes, CBs for future grace period. */ | ||
| 351 | return 0; /* No grace period needed. */ | ||
| 352 | } | ||
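The ->nxttail[] entries are pointers to pointers into a single callback list, which lets code append to (or split) a segment without walking it. A self-contained sketch of the underlying pointer-to-pointer append idiom; struct cb and its field names are invented for the example:

#include <stdio.h>
#include <stdlib.h>

struct cb {
	struct cb *next;
	int id;
};

int main(void)
{
	struct cb *head = NULL;
	struct cb **tail = &head;	/* points at the link to fill in next */

	for (int i = 0; i < 3; i++) {
		struct cb *node = malloc(sizeof(*node));

		if (!node)
			break;
		node->next = NULL;
		node->id = i;
		*tail = node;		/* O(1) append, no list walk */
		tail = &node->next;
	}
	while (head) {			/* drain and free the list */
		struct cb *p = head;

		head = head->next;
		printf("cb %d\n", p->id);
		free(p);
	}
	return 0;
}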
| 353 | |||
| 354 | /* | ||
| 355 | * Return the root node of the specified rcu_state structure. | ||
| 356 | */ | ||
| 357 | static struct rcu_node *rcu_get_root(struct rcu_state *rsp) | ||
| 358 | { | ||
| 359 | return &rsp->node[0]; | ||
| 360 | } | ||
| 361 | |||
| 362 | /* | ||
| 363 | * rcu_eqs_enter_common - current CPU is moving towards extended quiescent state | ||
| 364 | * | ||
| 365 | * If the new value of the ->dynticks_nesting counter now is zero, | ||
| 366 | * we really have entered idle, and must do the appropriate accounting. | ||
| 367 | * The caller must have disabled interrupts. | ||
| 368 | */ | ||
| 369 | static void rcu_eqs_enter_common(struct rcu_dynticks *rdtp, long long oldval, | ||
| 370 | bool user) | ||
| 371 | { | ||
| 372 | trace_rcu_dyntick(TPS("Start"), oldval, rdtp->dynticks_nesting); | ||
| 373 | if (!user && !is_idle_task(current)) { | ||
| 374 | struct task_struct *idle __maybe_unused = | ||
| 375 | idle_task(smp_processor_id()); | ||
| 376 | |||
| 377 | trace_rcu_dyntick(TPS("Error on entry: not idle task"), oldval, 0); | ||
| 378 | ftrace_dump(DUMP_ORIG); | ||
| 379 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", | ||
| 380 | current->pid, current->comm, | ||
| 381 | idle->pid, idle->comm); /* must be idle task! */ | ||
| 382 | } | ||
| 383 | rcu_prepare_for_idle(smp_processor_id()); | ||
| 384 | /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ | ||
| 385 | smp_mb__before_atomic_inc(); /* See above. */ | ||
| 386 | atomic_inc(&rdtp->dynticks); | ||
| 387 | smp_mb__after_atomic_inc(); /* Force ordering with next sojourn. */ | ||
| 388 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); | ||
| 389 | |||
| 390 | /* | ||
| 391 | * It is illegal to enter an extended quiescent state while | ||
| 392 | * in an RCU read-side critical section. | ||
| 393 | */ | ||
| 394 | rcu_lockdep_assert(!lock_is_held(&rcu_lock_map), | ||
| 395 | "Illegal idle entry in RCU read-side critical section."); | ||
| 396 | rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map), | ||
| 397 | "Illegal idle entry in RCU-bh read-side critical section."); | ||
| 398 | rcu_lockdep_assert(!lock_is_held(&rcu_sched_lock_map), | ||
| 399 | "Illegal idle entry in RCU-sched read-side critical section."); | ||
| 400 | } | ||
| 401 | |||
| 402 | /* | ||
| 403 | * Enter an RCU extended quiescent state, which can be either the | ||
| 404 | * idle loop or adaptive-tickless usermode execution. | ||
| 405 | */ | ||
| 406 | static void rcu_eqs_enter(bool user) | ||
| 407 | { | ||
| 408 | long long oldval; | ||
| 409 | struct rcu_dynticks *rdtp; | ||
| 410 | |||
| 411 | rdtp = this_cpu_ptr(&rcu_dynticks); | ||
| 412 | oldval = rdtp->dynticks_nesting; | ||
| 413 | WARN_ON_ONCE((oldval & DYNTICK_TASK_NEST_MASK) == 0); | ||
| 414 | if ((oldval & DYNTICK_TASK_NEST_MASK) == DYNTICK_TASK_NEST_VALUE) | ||
| 415 | rdtp->dynticks_nesting = 0; | ||
| 416 | else | ||
| 417 | rdtp->dynticks_nesting -= DYNTICK_TASK_NEST_VALUE; | ||
| 418 | rcu_eqs_enter_common(rdtp, oldval, user); | ||
| 419 | } | ||
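rcu_eqs_enter() and rcu_eqs_exit() maintain ->dynticks_nesting so that only the outermost transition actually flips the CPU between RCU-idle and RCU-non-idle; the real code additionally uses the DYNTICK_TASK_NEST_VALUE/MASK guard bits, which this conceptual sketch omits (eqs_enter()/eqs_exit() below are invented names):

#include <stdio.h>

static long nesting;	/* 0 means "idle as far as RCU is concerned" */

static void eqs_exit(void)	/* leaving idle, e.g. entering the kernel */
{
	if (nesting++ == 0)
		printf("CPU becomes non-idle for RCU\n");
}

static void eqs_enter(void)	/* heading back towards idle */
{
	if (--nesting == 0)
		printf("CPU becomes idle for RCU\n");
}

int main(void)
{
	eqs_exit();	/* process context */
	eqs_exit();	/* a nested reason to stay non-idle */
	eqs_enter();	/* one reason remains: nothing happens */
	eqs_enter();	/* outermost exit: really idle now */
	return 0;
}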
| 420 | |||
| 421 | /** | ||
| 422 | * rcu_idle_enter - inform RCU that current CPU is entering idle | ||
| 423 | * | ||
| 424 | * Enter idle mode, in other words, -leave- the mode in which RCU | ||
| 425 | * read-side critical sections can occur. (Though RCU read-side | ||
| 426 | * critical sections can occur in irq handlers in idle, a possibility | ||
| 427 | * handled by irq_enter() and irq_exit().) | ||
| 428 | * | ||
| 429 | * We crowbar the ->dynticks_nesting field to zero to allow for | ||
| 430 | * the possibility of usermode upcalls having messed up our count | ||
| 431 | * of interrupt nesting level during the prior busy period. | ||
| 432 | */ | ||
| 433 | void rcu_idle_enter(void) | ||
| 434 | { | ||
| 435 | unsigned long flags; | ||
| 436 | |||
| 437 | local_irq_save(flags); | ||
| 438 | rcu_eqs_enter(false); | ||
| 439 | rcu_sysidle_enter(this_cpu_ptr(&rcu_dynticks), 0); | ||
| 440 | local_irq_restore(flags); | ||
| 441 | } | ||
| 442 | EXPORT_SYMBOL_GPL(rcu_idle_enter); | ||
| 443 | |||
| 444 | #ifdef CONFIG_RCU_USER_QS | ||
| 445 | /** | ||
| 446 | * rcu_user_enter - inform RCU that we are resuming userspace. | ||
| 447 | * | ||
| 448 | * Enter RCU idle mode right before resuming userspace. No use of RCU | ||
| 449 | * is permitted between this call and rcu_user_exit(). This way the | ||
| 450 | * CPU doesn't need to maintain the tick for RCU maintenance purposes | ||
| 451 | * when the CPU runs in userspace. | ||
| 452 | */ | ||
| 453 | void rcu_user_enter(void) | ||
| 454 | { | ||
| 455 | rcu_eqs_enter(true); | ||
| 456 | } | ||
| 457 | #endif /* CONFIG_RCU_USER_QS */ | ||
| 458 | |||
| 459 | /** | ||
| 460 | * rcu_irq_exit - inform RCU that current CPU is exiting irq towards idle | ||
| 461 | * | ||
| 462 | * Exit from an interrupt handler, which might possibly result in entering | ||
| 463 | * idle mode, in other words, leaving the mode in which read-side critical | ||
| 464 | * sections can occur. | ||
| 465 | * | ||
| 466 | * This code assumes that the idle loop never does anything that might | ||
| 467 | * result in unbalanced calls to irq_enter() and irq_exit(). If your | ||
| 468 | * architecture violates this assumption, RCU will give you what you | ||
| 469 | * deserve, good and hard. But very infrequently and irreproducibly. | ||
| 470 | * | ||
| 471 | * Use things like work queues to work around this limitation. | ||
| 472 | * | ||
| 473 | * You have been warned. | ||
| 474 | */ | ||
| 475 | void rcu_irq_exit(void) | ||
| 476 | { | ||
| 477 | unsigned long flags; | ||
| 478 | long long oldval; | ||
| 479 | struct rcu_dynticks *rdtp; | ||
| 480 | |||
| 481 | local_irq_save(flags); | ||
| 482 | rdtp = this_cpu_ptr(&rcu_dynticks); | ||
| 483 | oldval = rdtp->dynticks_nesting; | ||
| 484 | rdtp->dynticks_nesting--; | ||
| 485 | WARN_ON_ONCE(rdtp->dynticks_nesting < 0); | ||
| 486 | if (rdtp->dynticks_nesting) | ||
| 487 | trace_rcu_dyntick(TPS("--="), oldval, rdtp->dynticks_nesting); | ||
| 488 | else | ||
| 489 | rcu_eqs_enter_common(rdtp, oldval, true); | ||
| 490 | rcu_sysidle_enter(rdtp, 1); | ||
| 491 | local_irq_restore(flags); | ||
| 492 | } | ||
| 493 | |||
| 494 | /* | ||
| 495 | * rcu_eqs_exit_common - current CPU moving away from extended quiescent state | ||
| 496 | * | ||
| 497 | * If the new value of the ->dynticks_nesting counter was previously zero, | ||
| 498 | * we really have exited idle, and must do the appropriate accounting. | ||
| 499 | * The caller must have disabled interrupts. | ||
| 500 | */ | ||
| 501 | static void rcu_eqs_exit_common(struct rcu_dynticks *rdtp, long long oldval, | ||
| 502 | int user) | ||
| 503 | { | ||
| 504 | smp_mb__before_atomic_inc(); /* Force ordering w/previous sojourn. */ | ||
| 505 | atomic_inc(&rdtp->dynticks); | ||
| 506 | /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ | ||
| 507 | smp_mb__after_atomic_inc(); /* See above. */ | ||
| 508 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); | ||
| 509 | rcu_cleanup_after_idle(smp_processor_id()); | ||
| 510 | trace_rcu_dyntick(TPS("End"), oldval, rdtp->dynticks_nesting); | ||
| 511 | if (!user && !is_idle_task(current)) { | ||
| 512 | struct task_struct *idle __maybe_unused = | ||
| 513 | idle_task(smp_processor_id()); | ||
| 514 | |||
| 515 | trace_rcu_dyntick(TPS("Error on exit: not idle task"), | ||
| 516 | oldval, rdtp->dynticks_nesting); | ||
| 517 | ftrace_dump(DUMP_ORIG); | ||
| 518 | WARN_ONCE(1, "Current pid: %d comm: %s / Idle pid: %d comm: %s", | ||
| 519 | current->pid, current->comm, | ||
| 520 | idle->pid, idle->comm); /* must be idle task! */ | ||
| 521 | } | ||
| 522 | } | ||
| 523 | |||
| 524 | /* | ||
| 525 | * Exit an RCU extended quiescent state, which can be either the | ||
| 526 | * idle loop or adaptive-tickless usermode execution. | ||
| 527 | */ | ||
| 528 | static void rcu_eqs_exit(bool user) | ||
| 529 | { | ||
| 530 | struct rcu_dynticks *rdtp; | ||
| 531 | long long oldval; | ||
| 532 | |||
| 533 | rdtp = this_cpu_ptr(&rcu_dynticks); | ||
| 534 | oldval = rdtp->dynticks_nesting; | ||
| 535 | WARN_ON_ONCE(oldval < 0); | ||
| 536 | if (oldval & DYNTICK_TASK_NEST_MASK) | ||
| 537 | rdtp->dynticks_nesting += DYNTICK_TASK_NEST_VALUE; | ||
| 538 | else | ||
| 539 | rdtp->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; | ||
| 540 | rcu_eqs_exit_common(rdtp, oldval, user); | ||
| 541 | } | ||
| 542 | |||
| 543 | /** | ||
| 544 | * rcu_idle_exit - inform RCU that current CPU is leaving idle | ||
| 545 | * | ||
| 546 | * Exit idle mode, in other words, -enter- the mode in which RCU | ||
| 547 | * read-side critical sections can occur. | ||
| 548 | * | ||
| 549 | * We crowbar the ->dynticks_nesting field to DYNTICK_TASK_EXIT_IDLE to | ||
| 550 | * allow for the possibility of usermode upcalls messing up our count | ||
| 551 | * of interrupt nesting level during the busy period that is just | ||
| 552 | * now starting. | ||
| 553 | */ | ||
| 554 | void rcu_idle_exit(void) | ||
| 555 | { | ||
| 556 | unsigned long flags; | ||
| 557 | |||
| 558 | local_irq_save(flags); | ||
| 559 | rcu_eqs_exit(false); | ||
| 560 | rcu_sysidle_exit(this_cpu_ptr(&rcu_dynticks), 0); | ||
| 561 | local_irq_restore(flags); | ||
| 562 | } | ||
| 563 | EXPORT_SYMBOL_GPL(rcu_idle_exit); | ||
| 564 | |||
| 565 | #ifdef CONFIG_RCU_USER_QS | ||
| 566 | /** | ||
| 567 | * rcu_user_exit - inform RCU that we are exiting userspace. | ||
| 568 | * | ||
| 569 | * Exit RCU idle mode while entering the kernel because it can | ||
| 570 | * run a RCU read side critical section anytime. | ||
| 571 | */ | ||
| 572 | void rcu_user_exit(void) | ||
| 573 | { | ||
| 574 | rcu_eqs_exit(true); | ||
| 575 | } | ||
| 576 | #endif /* CONFIG_RCU_USER_QS */ | ||
| 577 | |||
| 578 | /** | ||
| 579 | * rcu_irq_enter - inform RCU that current CPU is entering irq away from idle | ||
| 580 | * | ||
| 581 | * Enter an interrupt handler, which might possibly result in exiting | ||
| 582 | * idle mode, in other words, entering the mode in which read-side critical | ||
| 583 | * sections can occur. | ||
| 584 | * | ||
| 585 | * Note that the Linux kernel is fully capable of entering an interrupt | ||
| 586 | * handler that it never exits, for example when doing upcalls to | ||
| 587 | * user mode! This code assumes that the idle loop never does upcalls to | ||
| 588 | * user mode. If your architecture does do upcalls from the idle loop (or | ||
| 589 | * does anything else that results in unbalanced calls to the irq_enter() | ||
| 590 | * and irq_exit() functions), RCU will give you what you deserve, good | ||
| 591 | * and hard. But very infrequently and irreproducibly. | ||
| 592 | * | ||
| 593 | * Use things like work queues to work around this limitation. | ||
| 594 | * | ||
| 595 | * You have been warned. | ||
| 596 | */ | ||
| 597 | void rcu_irq_enter(void) | ||
| 598 | { | ||
| 599 | unsigned long flags; | ||
| 600 | struct rcu_dynticks *rdtp; | ||
| 601 | long long oldval; | ||
| 602 | |||
| 603 | local_irq_save(flags); | ||
| 604 | rdtp = this_cpu_ptr(&rcu_dynticks); | ||
| 605 | oldval = rdtp->dynticks_nesting; | ||
| 606 | rdtp->dynticks_nesting++; | ||
| 607 | WARN_ON_ONCE(rdtp->dynticks_nesting == 0); | ||
| 608 | if (oldval) | ||
| 609 | trace_rcu_dyntick(TPS("++="), oldval, rdtp->dynticks_nesting); | ||
| 610 | else | ||
| 611 | rcu_eqs_exit_common(rdtp, oldval, true); | ||
| 612 | rcu_sysidle_exit(rdtp, 1); | ||
| 613 | local_irq_restore(flags); | ||
| 614 | } | ||
| 615 | |||
| 616 | /** | ||
| 617 | * rcu_nmi_enter - inform RCU of entry to NMI context | ||
| 618 | * | ||
| 619 | * If the CPU was idle with dynamic ticks active, and there is no | ||
| 620 | * irq handler running, this updates rdtp->dynticks and ->dynticks_nmi_nesting | ||
| 621 | * to let the RCU grace-period handling know that the CPU is active. | ||
| 622 | */ | ||
| 623 | void rcu_nmi_enter(void) | ||
| 624 | { | ||
| 625 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); | ||
| 626 | |||
| 627 | if (rdtp->dynticks_nmi_nesting == 0 && | ||
| 628 | (atomic_read(&rdtp->dynticks) & 0x1)) | ||
| 629 | return; | ||
| 630 | rdtp->dynticks_nmi_nesting++; | ||
| 631 | smp_mb__before_atomic_inc(); /* Force delay from prior write. */ | ||
| 632 | atomic_inc(&rdtp->dynticks); | ||
| 633 | /* CPUs seeing atomic_inc() must see later RCU read-side crit sects */ | ||
| 634 | smp_mb__after_atomic_inc(); /* See above. */ | ||
| 635 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks) & 0x1)); | ||
| 636 | } | ||
| 637 | |||
| 638 | /** | ||
| 639 | * rcu_nmi_exit - inform RCU of exit from NMI context | ||
| 640 | * | ||
| 641 | * If the CPU was idle with dynamic ticks active, and there is no | ||
| 642 | * irq handler running, this updates rdtp->dynticks and ->dynticks_nmi_nesting | ||
| 643 | * to let the RCU grace-period handling know that the CPU is no longer active. | ||
| 644 | */ | ||
| 645 | void rcu_nmi_exit(void) | ||
| 646 | { | ||
| 647 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); | ||
| 648 | |||
| 649 | if (rdtp->dynticks_nmi_nesting == 0 || | ||
| 650 | --rdtp->dynticks_nmi_nesting != 0) | ||
| 651 | return; | ||
| 652 | /* CPUs seeing atomic_inc() must see prior RCU read-side crit sects */ | ||
| 653 | smp_mb__before_atomic_inc(); /* See above. */ | ||
| 654 | atomic_inc(&rdtp->dynticks); | ||
| 655 | smp_mb__after_atomic_inc(); /* Force delay to next write. */ | ||
| 656 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks) & 0x1); | ||
| 657 | } | ||
| 658 | |||
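The idle, usermode, irq, and NMI paths above all rely on the same ->dynticks protocol: every transition into or out of an extended quiescent state is one atomic increment, so an even counter value means RCU is not watching the CPU and an odd value means it is. Below is a minimal userspace sketch of that parity rule, using C11 atomics and hypothetical model_*() names rather than the kernel's primitives; it is an illustration, not kernel code.

#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>

static atomic_long dynticks = 1;		/* Odd: the CPU starts out non-idle. */

static bool model_rcu_is_watching(void)
{
	return atomic_load(&dynticks) & 0x1;	/* Odd means RCU is watching. */
}

static void model_eqs_enter(void)		/* Parity flip done on EQS entry. */
{
	atomic_fetch_add(&dynticks, 1);		/* Counter becomes even: idle. */
	assert(!model_rcu_is_watching());
}

static void model_eqs_exit(void)		/* Parity flip done by rcu_eqs_exit_common() and rcu_nmi_enter(). */
{
	atomic_fetch_add(&dynticks, 1);		/* Counter becomes odd: watching again. */
	assert(model_rcu_is_watching());
}

int main(void)
{
	assert(model_rcu_is_watching());	/* Task running. */
	model_eqs_enter();			/* Idle loop entered. */
	model_eqs_exit();			/* Interrupt or NMI taken from idle. */
	return 0;
}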
| 659 | /** | ||
| 660 | * __rcu_is_watching - are RCU read-side critical sections safe? | ||
| 661 | * | ||
| 662 | * Return true if RCU is watching the running CPU, which means that | ||
| 663 | * this CPU can safely enter RCU read-side critical sections. Unlike | ||
| 664 | * rcu_is_watching(), the caller of __rcu_is_watching() must have at | ||
| 665 | * least disabled preemption. | ||
| 666 | */ | ||
| 667 | bool __rcu_is_watching(void) | ||
| 668 | { | ||
| 669 | return atomic_read(this_cpu_ptr(&rcu_dynticks.dynticks)) & 0x1; | ||
| 670 | } | ||
| 671 | |||
| 672 | /** | ||
| 673 | * rcu_is_watching - see if RCU thinks that the current CPU is not idle | ||
| 674 | * | ||
| 675 | * If the current CPU is in its idle loop and is neither in an interrupt | ||
| 676 | * nor in an NMI handler, return false; otherwise return true. | ||
| 677 | */ | ||
| 678 | bool rcu_is_watching(void) | ||
| 679 | { | ||
| 680 | int ret; | ||
| 681 | |||
| 682 | preempt_disable(); | ||
| 683 | ret = __rcu_is_watching(); | ||
| 684 | preempt_enable(); | ||
| 685 | return ret; | ||
| 686 | } | ||
| 687 | EXPORT_SYMBOL_GPL(rcu_is_watching); | ||
| 688 | |||
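As a usage note, code that can be reached from the idle loop (for example tracing or debugging hooks) typically checks rcu_is_watching() before relying on RCU readers. The sketch below is hypothetical kernel-context code: only rcu_is_watching(), rcu_read_lock(), and rcu_read_unlock() are real interfaces here, and the hook itself is invented for illustration.

#include <linux/rcupdate.h>

/* Hypothetical hook that might be invoked from the idle loop. */
static void example_idle_safe_hook(void)
{
	if (!rcu_is_watching())
		return;		/* Extended quiescent state: RCU readers unsafe. */
	rcu_read_lock();
	/* ... dereference RCU-protected data here ... */
	rcu_read_unlock();
}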
| 689 | #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) | ||
| 690 | |||
| 691 | /* | ||
| 692 | * Is the current CPU online? Disable preemption to avoid false positives | ||
| 693 | * that could otherwise happen due to the current CPU number being sampled, | ||
| 694 | * this task being preempted, its old CPU being taken offline, resuming | ||
| 695 | * on some other CPU, then determining that its old CPU is now offline. | ||
| 696 | * It is OK to use RCU on an offline processor during initial boot, hence | ||
| 697 | * the check for rcu_scheduler_fully_active. Note also that it is OK | ||
| 698 | * for a CPU coming online to use RCU for one jiffy prior to marking itself | ||
| 699 | * online in the cpu_online_mask. Similarly, it is OK for a CPU going | ||
| 700 | * offline to continue to use RCU for one jiffy after marking itself | ||
| 701 | * offline in the cpu_online_mask. This leniency is necessary given the | ||
| 702 | * non-atomic nature of the online and offline processing, for example, | ||
| 703 | * the fact that a CPU enters the scheduler after completing the CPU_DYING | ||
| 704 | * notifiers. | ||
| 705 | * | ||
| 706 | * This is also why RCU internally marks CPUs online during the | ||
| 707 | * CPU_UP_PREPARE phase and offline during the CPU_DEAD phase. | ||
| 708 | * | ||
| 709 | * Disable checking if in an NMI handler because we cannot safely report | ||
| 710 | * errors from NMI handlers anyway. | ||
| 711 | */ | ||
| 712 | bool rcu_lockdep_current_cpu_online(void) | ||
| 713 | { | ||
| 714 | struct rcu_data *rdp; | ||
| 715 | struct rcu_node *rnp; | ||
| 716 | bool ret; | ||
| 717 | |||
| 718 | if (in_nmi()) | ||
| 719 | return true; | ||
| 720 | preempt_disable(); | ||
| 721 | rdp = this_cpu_ptr(&rcu_sched_data); | ||
| 722 | rnp = rdp->mynode; | ||
| 723 | ret = (rdp->grpmask & rnp->qsmaskinit) || | ||
| 724 | !rcu_scheduler_fully_active; | ||
| 725 | preempt_enable(); | ||
| 726 | return ret; | ||
| 727 | } | ||
| 728 | EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online); | ||
| 729 | |||
| 730 | #endif /* #if defined(CONFIG_PROVE_RCU) && defined(CONFIG_HOTPLUG_CPU) */ | ||
| 731 | |||
| 732 | /** | ||
| 733 | * rcu_is_cpu_rrupt_from_idle - see if idle or immediately interrupted from idle | ||
| 734 | * | ||
| 735 | * If the current CPU is idle or running at a first-level (not nested) | ||
| 736 | * interrupt from idle, return true. The caller must have at least | ||
| 737 | * disabled preemption. | ||
| 738 | */ | ||
| 739 | static int rcu_is_cpu_rrupt_from_idle(void) | ||
| 740 | { | ||
| 741 | return __this_cpu_read(rcu_dynticks.dynticks_nesting) <= 1; | ||
| 742 | } | ||
| 743 | |||
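The "<= 1" test works because ->dynticks_nesting is zero while the CPU is idle, is incremented once by each irq_enter() (as in rcu_irq_enter() above), and carries a large process-level bias whenever a task is running. A small userspace model of just that rule, with hypothetical model_*() names:

#include <assert.h>
#include <stdbool.h>

static long long nesting;			/* 0: the CPU is idle. */

static bool model_rrupt_from_idle(void)
{
	return nesting <= 1;
}

int main(void)
{
	assert(model_rrupt_from_idle());	/* In the idle loop itself. */
	nesting++;				/* First-level interrupt from idle. */
	assert(model_rrupt_from_idle());
	nesting++;				/* Nested interrupt: no longer "from idle". */
	assert(!model_rrupt_from_idle());
	return 0;
}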
| 744 | /* | ||
| 745 | * Snapshot the specified CPU's dynticks counter so that we can later | ||
| 746 | * credit them with an implicit quiescent state. Return 1 if this CPU | ||
| 747 | * is in dynticks idle mode, which is an extended quiescent state. | ||
| 748 | */ | ||
| 749 | static int dyntick_save_progress_counter(struct rcu_data *rdp, | ||
| 750 | bool *isidle, unsigned long *maxj) | ||
| 751 | { | ||
| 752 | rdp->dynticks_snap = atomic_add_return(0, &rdp->dynticks->dynticks); | ||
| 753 | rcu_sysidle_check_cpu(rdp, isidle, maxj); | ||
| 754 | return (rdp->dynticks_snap & 0x1) == 0; | ||
| 755 | } | ||
| 756 | |||
| 757 | /* | ||
| 758 | * Return true if the specified CPU has passed through a quiescent | ||
| 759 | * state by virtue of being in or having passed through a dynticks | ||
| 760 | * idle state since the last call to dyntick_save_progress_counter() | ||
| 761 | * for this same CPU, or by virtue of having been offline. | ||
| 762 | */ | ||
| 763 | static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, | ||
| 764 | bool *isidle, unsigned long *maxj) | ||
| 765 | { | ||
| 766 | unsigned int curr; | ||
| 767 | unsigned int snap; | ||
| 768 | |||
| 769 | curr = (unsigned int)atomic_add_return(0, &rdp->dynticks->dynticks); | ||
| 770 | snap = (unsigned int)rdp->dynticks_snap; | ||
| 771 | |||
| 772 | /* | ||
| 773 | * If the CPU passed through or entered a dynticks idle phase with | ||
| 774 | * no active irq/NMI handlers, then we can safely pretend that the CPU | ||
| 775 | * already acknowledged the request to pass through a quiescent | ||
| 776 | * state. Either way, that CPU cannot possibly be in an RCU | ||
| 777 | * read-side critical section that started before the beginning | ||
| 778 | * of the current RCU grace period. | ||
| 779 | */ | ||
| 780 | if ((curr & 0x1) == 0 || UINT_CMP_GE(curr, snap + 2)) { | ||
| 781 | trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); | ||
| 782 | rdp->dynticks_fqs++; | ||
| 783 | return 1; | ||
| 784 | } | ||
| 785 | |||
| 786 | /* | ||
| 787 | * Check for the CPU being offline, but only if the grace period | ||
| 788 | * is old enough. We don't need to worry about the CPU changing | ||
| 789 | * state: If we see it offline even once, it has been through a | ||
| 790 | * quiescent state. | ||
| 791 | * | ||
| 792 | * The reason for insisting that the grace period be at least | ||
| 793 | * one jiffy old is that CPUs that are not quite online or that | ||
| 794 | * have just gone offline can still execute RCU read-side critical | ||
| 795 | * sections. | ||
| 796 | */ | ||
| 797 | if (ULONG_CMP_GE(rdp->rsp->gp_start + 2, jiffies)) | ||
| 798 | return 0; /* Grace period is not old enough. */ | ||
| 799 | barrier(); | ||
| 800 | if (cpu_is_offline(rdp->cpu)) { | ||
| 801 | trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("ofl")); | ||
| 802 | rdp->offline_fqs++; | ||
| 803 | return 1; | ||
| 804 | } | ||
| 805 | |||
| 806 | /* | ||
| 807 | * There is a possibility that a CPU in adaptive-ticks state | ||
| 808 | * might run in the kernel with the scheduling-clock tick disabled | ||
| 809 | * for an extended time period. Invoke rcu_kick_nohz_cpu() to | ||
| 810 | * force the CPU to restart the scheduling-clock tick if this | ||
| 811 | * CPU is in this state. | ||
| 812 | */ | ||
| 813 | rcu_kick_nohz_cpu(rdp->cpu); | ||
| 814 | |||
| 815 | return 0; | ||
| 816 | } | ||
| 817 | |||
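To make the quiescent-state test above concrete: the CPU is credited either because its ->dynticks counter is currently even (it is idle right now) or because the counter advanced by at least two since the snapshot (it passed through idle at least once in between). The sketch below re-creates the wraparound-safe comparison as MODEL_UINT_CMP_GE(), an assumed stand-in for the kernel's equivalent macro.

#include <assert.h>
#include <stdbool.h>

#define MODEL_UINT_CMP_GE(a, b)	((int)((a) - (b)) >= 0)	/* Wraparound-safe >= */

static bool model_dynticks_qs(unsigned int curr, unsigned int snap)
{
	return (curr & 0x1) == 0 || MODEL_UINT_CMP_GE(curr, snap + 2);
}

int main(void)
{
	assert(model_dynticks_qs(8, 7));	/* Currently idle (even counter). */
	assert(model_dynticks_qs(9, 7));	/* Odd now, but advanced by two. */
	assert(!model_dynticks_qs(7, 7));	/* Still odd, no progress: no QS. */
	assert(model_dynticks_qs(1, 0xfffffffeU)); /* Works across counter wrap. */
	return 0;
}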
| 818 | static void record_gp_stall_check_time(struct rcu_state *rsp) | ||
| 819 | { | ||
| 820 | unsigned long j = ACCESS_ONCE(jiffies); | ||
| 821 | |||
| 822 | rsp->gp_start = j; | ||
| 823 | smp_wmb(); /* Record start time before stall time. */ | ||
| 824 | rsp->jiffies_stall = j + rcu_jiffies_till_stall_check(); | ||
| 825 | } | ||
| 826 | |||
| 827 | /* | ||
| 828 | * Dump stacks of all tasks running on stalled CPUs. This is a fallback | ||
| 829 | * for architectures that do not implement trigger_all_cpu_backtrace(). | ||
| 830 | * The NMI-triggered stack traces are more accurate because they are | ||
| 831 | * printed by the target CPU. | ||
| 832 | */ | ||
| 833 | static void rcu_dump_cpu_stacks(struct rcu_state *rsp) | ||
| 834 | { | ||
| 835 | int cpu; | ||
| 836 | unsigned long flags; | ||
| 837 | struct rcu_node *rnp; | ||
| 838 | |||
| 839 | rcu_for_each_leaf_node(rsp, rnp) { | ||
| 840 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
| 841 | if (rnp->qsmask != 0) { | ||
| 842 | for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) | ||
| 843 | if (rnp->qsmask & (1UL << cpu)) | ||
| 844 | dump_cpu_task(rnp->grplo + cpu); | ||
| 845 | } | ||
| 846 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 847 | } | ||
| 848 | } | ||
| 849 | |||
| 850 | static void print_other_cpu_stall(struct rcu_state *rsp) | ||
| 851 | { | ||
| 852 | int cpu; | ||
| 853 | long delta; | ||
| 854 | unsigned long flags; | ||
| 855 | int ndetected = 0; | ||
| 856 | struct rcu_node *rnp = rcu_get_root(rsp); | ||
| 857 | long totqlen = 0; | ||
| 858 | |||
| 859 | /* Only let one CPU complain about others per time interval. */ | ||
| 860 | |||
| 861 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
| 862 | delta = jiffies - rsp->jiffies_stall; | ||
| 863 | if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { | ||
| 864 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 865 | return; | ||
| 866 | } | ||
| 867 | rsp->jiffies_stall = jiffies + 3 * rcu_jiffies_till_stall_check() + 3; | ||
| 868 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 869 | |||
| 870 | /* | ||
| 871 | * OK, time to rat on our buddy... | ||
| 872 | * See Documentation/RCU/stallwarn.txt for info on how to debug | ||
| 873 | * RCU CPU stall warnings. | ||
| 874 | */ | ||
| 875 | pr_err("INFO: %s detected stalls on CPUs/tasks:", | ||
| 876 | rsp->name); | ||
| 877 | print_cpu_stall_info_begin(); | ||
| 878 | rcu_for_each_leaf_node(rsp, rnp) { | ||
| 879 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
| 880 | ndetected += rcu_print_task_stall(rnp); | ||
| 881 | if (rnp->qsmask != 0) { | ||
| 882 | for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) | ||
| 883 | if (rnp->qsmask & (1UL << cpu)) { | ||
| 884 | print_cpu_stall_info(rsp, | ||
| 885 | rnp->grplo + cpu); | ||
| 886 | ndetected++; | ||
| 887 | } | ||
| 888 | } | ||
| 889 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 890 | } | ||
| 891 | |||
| 892 | /* | ||
| 893 | * Now rat on any tasks that got kicked up to the root rcu_node | ||
| 894 | * due to CPU offlining. | ||
| 895 | */ | ||
| 896 | rnp = rcu_get_root(rsp); | ||
| 897 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
| 898 | ndetected += rcu_print_task_stall(rnp); | ||
| 899 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 900 | |||
| 901 | print_cpu_stall_info_end(); | ||
| 902 | for_each_possible_cpu(cpu) | ||
| 903 | totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; | ||
| 904 | pr_cont("(detected by %d, t=%ld jiffies, g=%lu, c=%lu, q=%lu)\n", | ||
| 905 | smp_processor_id(), (long)(jiffies - rsp->gp_start), | ||
| 906 | rsp->gpnum, rsp->completed, totqlen); | ||
| 907 | if (ndetected == 0) | ||
| 908 | pr_err("INFO: Stall ended before state dump start\n"); | ||
| 909 | else if (!trigger_all_cpu_backtrace()) | ||
| 910 | rcu_dump_cpu_stacks(rsp); | ||
| 911 | |||
| 912 | /* Complain about tasks blocking the grace period. */ | ||
| 913 | |||
| 914 | rcu_print_detail_task_stall(rsp); | ||
| 915 | |||
| 916 | force_quiescent_state(rsp); /* Kick them all. */ | ||
| 917 | } | ||
| 918 | |||
| 919 | /* | ||
| 920 | * This function really isn't for public consumption, but RCU is special in | ||
| 921 | * that context switches can allow the state machine to make progress. | ||
| 922 | */ | ||
| 923 | extern void resched_cpu(int cpu); | ||
| 924 | |||
| 925 | static void print_cpu_stall(struct rcu_state *rsp) | ||
| 926 | { | ||
| 927 | int cpu; | ||
| 928 | unsigned long flags; | ||
| 929 | struct rcu_node *rnp = rcu_get_root(rsp); | ||
| 930 | long totqlen = 0; | ||
| 931 | |||
| 932 | /* | ||
| 933 | * OK, time to rat on ourselves... | ||
| 934 | * See Documentation/RCU/stallwarn.txt for info on how to debug | ||
| 935 | * RCU CPU stall warnings. | ||
| 936 | */ | ||
| 937 | pr_err("INFO: %s self-detected stall on CPU", rsp->name); | ||
| 938 | print_cpu_stall_info_begin(); | ||
| 939 | print_cpu_stall_info(rsp, smp_processor_id()); | ||
| 940 | print_cpu_stall_info_end(); | ||
| 941 | for_each_possible_cpu(cpu) | ||
| 942 | totqlen += per_cpu_ptr(rsp->rda, cpu)->qlen; | ||
| 943 | pr_cont(" (t=%lu jiffies g=%lu c=%lu q=%lu)\n", | ||
| 944 | jiffies - rsp->gp_start, rsp->gpnum, rsp->completed, totqlen); | ||
| 945 | if (!trigger_all_cpu_backtrace()) | ||
| 946 | dump_stack(); | ||
| 947 | |||
| 948 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
| 949 | if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) | ||
| 950 | rsp->jiffies_stall = jiffies + | ||
| 951 | 3 * rcu_jiffies_till_stall_check() + 3; | ||
| 952 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 953 | |||
| 954 | /* | ||
| 955 | * Attempt to revive the RCU machinery by forcing a context switch. | ||
| 956 | * | ||
| 957 | * A context switch would normally allow the RCU state machine to make | ||
| 958 | * progress and it could be we're stuck in kernel space without context | ||
| 959 | * switches for an entirely unreasonable amount of time. | ||
| 960 | */ | ||
| 961 | resched_cpu(smp_processor_id()); | ||
| 962 | } | ||
| 963 | |||
| 964 | static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) | ||
| 965 | { | ||
| 966 | unsigned long completed; | ||
| 967 | unsigned long gpnum; | ||
| 968 | unsigned long gps; | ||
| 969 | unsigned long j; | ||
| 970 | unsigned long js; | ||
| 971 | struct rcu_node *rnp; | ||
| 972 | |||
| 973 | if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp)) | ||
| 974 | return; | ||
| 975 | j = ACCESS_ONCE(jiffies); | ||
| 976 | |||
| 977 | /* | ||
| 978 | * Lots of memory barriers to reject false positives. | ||
| 979 | * | ||
| 980 | * The idea is to pick up rsp->gpnum, then rsp->jiffies_stall, | ||
| 981 | * then rsp->gp_start, and finally rsp->completed. These values | ||
| 982 | * are updated in the opposite order with memory barriers (or | ||
| 983 | * equivalent) during grace-period initialization and cleanup. | ||
| 984 | * Now, a false positive can occur if we get a new value of | ||
| 985 | * rsp->gp_start and an old value of rsp->jiffies_stall. But given | ||
| 986 | * the memory barriers, the only way that this can happen is if one | ||
| 987 | * grace period ends and another starts between these two fetches. | ||
| 988 | * Detect this by comparing rsp->completed with the previous fetch | ||
| 989 | * from rsp->gpnum. | ||
| 990 | * | ||
| 991 | * Given this check, comparisons of jiffies, rsp->jiffies_stall, | ||
| 992 | * and rsp->gp_start suffice to forestall false positives. | ||
| 993 | */ | ||
| 994 | gpnum = ACCESS_ONCE(rsp->gpnum); | ||
| 995 | smp_rmb(); /* Pick up ->gpnum first... */ | ||
| 996 | js = ACCESS_ONCE(rsp->jiffies_stall); | ||
| 997 | smp_rmb(); /* ...then ->jiffies_stall before the rest... */ | ||
| 998 | gps = ACCESS_ONCE(rsp->gp_start); | ||
| 999 | smp_rmb(); /* ...and finally ->gp_start before ->completed. */ | ||
| 1000 | completed = ACCESS_ONCE(rsp->completed); | ||
| 1001 | if (ULONG_CMP_GE(completed, gpnum) || | ||
| 1002 | ULONG_CMP_LT(j, js) || | ||
| 1003 | ULONG_CMP_GE(gps, js)) | ||
| 1004 | return; /* No stall or GP completed since entering function. */ | ||
| 1005 | rnp = rdp->mynode; | ||
| 1006 | if (rcu_gp_in_progress(rsp) && | ||
| 1007 | (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask)) { | ||
| 1008 | |||
| 1009 | /* We haven't checked in, so go dump stack. */ | ||
| 1010 | print_cpu_stall(rsp); | ||
| 1011 | |||
| 1012 | } else if (rcu_gp_in_progress(rsp) && | ||
| 1013 | ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) { | ||
| 1014 | |||
| 1015 | /* They had a few time units to dump stack, so complain. */ | ||
| 1016 | print_other_cpu_stall(rsp); | ||
| 1017 | } | ||
| 1018 | } | ||
| 1019 | |||
| 1020 | /** | ||
| 1021 | * rcu_cpu_stall_reset - prevent further stall warnings in current grace period | ||
| 1022 | * | ||
| 1023 | * Set the stall-warning timeout way off into the future, thus preventing | ||
| 1024 | * any RCU CPU stall-warning messages from appearing in the current set of | ||
| 1025 | * RCU grace periods. | ||
| 1026 | * | ||
| 1027 | * The caller must disable hard irqs. | ||
| 1028 | */ | ||
| 1029 | void rcu_cpu_stall_reset(void) | ||
| 1030 | { | ||
| 1031 | struct rcu_state *rsp; | ||
| 1032 | |||
| 1033 | for_each_rcu_flavor(rsp) | ||
| 1034 | rsp->jiffies_stall = jiffies + ULONG_MAX / 2; | ||
| 1035 | } | ||
| 1036 | |||
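The ULONG_MAX / 2 offset works together with the signed, wraparound-safe time comparisons used by check_cpu_stall(): a deadline that far ahead remains "in the future" for roughly half the counter space, which is effectively forever. A userspace sketch, with MODEL_ULONG_CMP_GE() standing in for the kernel's comparison macro:

#include <assert.h>
#include <limits.h>

#define MODEL_ULONG_CMP_GE(a, b)	((long)((a) - (b)) >= 0)

int main(void)
{
	unsigned long now = ULONG_MAX - 10;		/* Counter about to wrap. */
	unsigned long deadline = now + ULONG_MAX / 2;	/* "Way off into the future". */

	/* The deadline has not been reached, so no stall warning would fire. */
	assert(!MODEL_ULONG_CMP_GE(now, deadline));

	/* Still not reached even well past the wraparound point. */
	now += 1000000;
	assert(!MODEL_ULONG_CMP_GE(now, deadline));
	return 0;
}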
| 1037 | /* | ||
| 1038 | * Initialize the specified rcu_data structure's callback list to empty. | ||
| 1039 | */ | ||
| 1040 | static void init_callback_list(struct rcu_data *rdp) | ||
| 1041 | { | ||
| 1042 | int i; | ||
| 1043 | |||
| 1044 | if (init_nocb_callback_list(rdp)) | ||
| 1045 | return; | ||
| 1046 | rdp->nxtlist = NULL; | ||
| 1047 | for (i = 0; i < RCU_NEXT_SIZE; i++) | ||
| 1048 | rdp->nxttail[i] = &rdp->nxtlist; | ||
| 1049 | } | ||
| 1050 | |||
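The callback list initialized here is a single linked list carved into segments by an array of tail pointers; when the list is empty, every tail pointer refers back to the list head, and enqueuing appends through the last tail pointer. The userspace sketch below uses simplified structures and hypothetical model_*() names to show that invariant.

#include <assert.h>
#include <stddef.h>

enum { MODEL_DONE_TAIL, MODEL_WAIT_TAIL, MODEL_NEXT_READY_TAIL,
       MODEL_NEXT_TAIL, MODEL_NEXT_SIZE };

struct model_cb { struct model_cb *next; };

struct model_rdp {
	struct model_cb *nxtlist;			/* Head of the whole list. */
	struct model_cb **nxttail[MODEL_NEXT_SIZE];	/* One tail per segment. */
};

static void model_init_callback_list(struct model_rdp *rdp)
{
	int i;

	rdp->nxtlist = NULL;
	for (i = 0; i < MODEL_NEXT_SIZE; i++)
		rdp->nxttail[i] = &rdp->nxtlist;	/* Empty: all tails at the head. */
}

static void model_enqueue(struct model_rdp *rdp, struct model_cb *cb)
{
	cb->next = NULL;
	*rdp->nxttail[MODEL_NEXT_TAIL] = cb;		/* Append to the last segment. */
	rdp->nxttail[MODEL_NEXT_TAIL] = &cb->next;
}

int main(void)
{
	struct model_rdp rdp;
	struct model_cb cb1, cb2;

	model_init_callback_list(&rdp);
	model_enqueue(&rdp, &cb1);
	model_enqueue(&rdp, &cb2);
	assert(rdp.nxtlist == &cb1 && cb1.next == &cb2);
	/* Earlier segments are still empty: their tails still sit at the head. */
	assert(rdp.nxttail[MODEL_DONE_TAIL] == &rdp.nxtlist);
	return 0;
}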
| 1051 | /* | ||
| 1052 | * Determine the value that ->completed will have at the end of the | ||
| 1053 | * next subsequent grace period. This is used to tag callbacks so that | ||
| 1054 | * a CPU can invoke callbacks in a timely fashion even if that CPU has | ||
| 1055 | * been dyntick-idle for an extended period with callbacks under the | ||
| 1056 | * influence of RCU_FAST_NO_HZ. | ||
| 1057 | * | ||
| 1058 | * The caller must hold rnp->lock with interrupts disabled. | ||
| 1059 | */ | ||
| 1060 | static unsigned long rcu_cbs_completed(struct rcu_state *rsp, | ||
| 1061 | struct rcu_node *rnp) | ||
| 1062 | { | ||
| 1063 | /* | ||
| 1064 | * If RCU is idle, we just wait for the next grace period. | ||
| 1065 | * But we can only be sure that RCU is idle if we are looking | ||
| 1066 | * at the root rcu_node structure -- otherwise, a new grace | ||
| 1067 | * period might have started, but just not yet gotten around | ||
| 1068 | * to initializing the current non-root rcu_node structure. | ||
| 1069 | */ | ||
| 1070 | if (rcu_get_root(rsp) == rnp && rnp->gpnum == rnp->completed) | ||
| 1071 | return rnp->completed + 1; | ||
| 1072 | |||
| 1073 | /* | ||
| 1074 | * Otherwise, wait for a possible partial grace period and | ||
| 1075 | * then the subsequent full grace period. | ||
| 1076 | */ | ||
| 1077 | return rnp->completed + 2; | ||
| 1078 | } | ||
| 1079 | |||
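A worked example of the rule above, expressed as a hypothetical helper: callbacks registered while RCU is known to be idle (as seen from the root rcu_node structure) need only the next grace period, while callbacks registered at any other time must also wait out a possibly in-flight grace period.

#include <assert.h>
#include <stdbool.h>

/* Hypothetical model of rcu_cbs_completed()'s +1 / +2 rule. */
static unsigned long model_cbs_completed(bool looking_at_root, bool gp_idle,
					 unsigned long completed)
{
	return (looking_at_root && gp_idle) ? completed + 1 : completed + 2;
}

int main(void)
{
	assert(model_cbs_completed(true, true, 100) == 101);	/* RCU idle at root. */
	assert(model_cbs_completed(true, false, 100) == 102);	/* GP in progress. */
	assert(model_cbs_completed(false, true, 100) == 102);	/* Non-root node. */
	return 0;
}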
| 1080 | /* | ||
| 1081 | * Trace-event helper function for rcu_start_future_gp() and | ||
| 1082 | * rcu_nocb_wait_gp(). | ||
| 1083 | */ | ||
| 1084 | static void trace_rcu_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, | ||
| 1085 | unsigned long c, const char *s) | ||
| 1086 | { | ||
| 1087 | trace_rcu_future_grace_period(rdp->rsp->name, rnp->gpnum, | ||
| 1088 | rnp->completed, c, rnp->level, | ||
| 1089 | rnp->grplo, rnp->grphi, s); | ||
| 1090 | } | ||
| 1091 | |||
| 1092 | /* | ||
| 1093 | * Start some future grace period, as needed to handle newly arrived | ||
| 1094 | * callbacks. The required future grace periods are recorded in each | ||
| 1095 | * rcu_node structure's ->need_future_gp field. | ||
| 1096 | * | ||
| 1097 | * The caller must hold the specified rcu_node structure's ->lock. | ||
| 1098 | */ | ||
| 1099 | static unsigned long __maybe_unused | ||
| 1100 | rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp) | ||
| 1101 | { | ||
| 1102 | unsigned long c; | ||
| 1103 | int i; | ||
| 1104 | struct rcu_node *rnp_root = rcu_get_root(rdp->rsp); | ||
| 1105 | |||
| 1106 | /* | ||
| 1107 | * Pick up grace-period number for new callbacks. If this | ||
| 1108 | * grace period is already marked as needed, return to the caller. | ||
| 1109 | */ | ||
| 1110 | c = rcu_cbs_completed(rdp->rsp, rnp); | ||
| 1111 | trace_rcu_future_gp(rnp, rdp, c, TPS("Startleaf")); | ||
| 1112 | if (rnp->need_future_gp[c & 0x1]) { | ||
| 1113 | trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartleaf")); | ||
| 1114 | return c; | ||
| 1115 | } | ||
| 1116 | |||
| 1117 | /* | ||
| 1118 | * If either this rcu_node structure or the root rcu_node structure | ||
| 1119 | * believe that a grace period is in progress, then we must wait | ||
| 1120 | * for the one following, which is in "c". Because our request | ||
| 1121 | * will be noticed at the end of the current grace period, we don't | ||
| 1122 | * need to explicitly start one. | ||
| 1123 | */ | ||
| 1124 | if (rnp->gpnum != rnp->completed || | ||
| 1125 | ACCESS_ONCE(rnp_root->gpnum) != ACCESS_ONCE(rnp_root->completed)) { | ||
| 1126 | rnp->need_future_gp[c & 0x1]++; | ||
| 1127 | trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleaf")); | ||
| 1128 | return c; | ||
| 1129 | } | ||
| 1130 | |||
| 1131 | /* | ||
| 1132 | * There might be no grace period in progress. If we don't already | ||
| 1133 | * hold it, acquire the root rcu_node structure's lock in order to | ||
| 1134 | * start one (if needed). | ||
| 1135 | */ | ||
| 1136 | if (rnp != rnp_root) | ||
| 1137 | raw_spin_lock(&rnp_root->lock); | ||
| 1138 | |||
| 1139 | /* | ||
| 1140 | * Get a new grace-period number. If there really is no grace | ||
| 1141 | * period in progress, it will be smaller than the one we obtained | ||
| 1142 | * earlier. Adjust callbacks as needed. Note that even no-CBs | ||
| 1143 | * CPUs have a ->nxtcompleted[] array, so no no-CBs checks needed. | ||
| 1144 | */ | ||
| 1145 | c = rcu_cbs_completed(rdp->rsp, rnp_root); | ||
| 1146 | for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++) | ||
| 1147 | if (ULONG_CMP_LT(c, rdp->nxtcompleted[i])) | ||
| 1148 | rdp->nxtcompleted[i] = c; | ||
| 1149 | |||
| 1150 | /* | ||
| 1151 | * If the need for the required grace period is already | ||
| 1152 | * recorded, trace and leave. | ||
| 1153 | */ | ||
| 1154 | if (rnp_root->need_future_gp[c & 0x1]) { | ||
| 1155 | trace_rcu_future_gp(rnp, rdp, c, TPS("Prestartedroot")); | ||
| 1156 | goto unlock_out; | ||
| 1157 | } | ||
| 1158 | |||
| 1159 | /* Record the need for the future grace period. */ | ||
| 1160 | rnp_root->need_future_gp[c & 0x1]++; | ||
| 1161 | |||
| 1162 | /* If a grace period is not already in progress, start one. */ | ||
| 1163 | if (rnp_root->gpnum != rnp_root->completed) { | ||
| 1164 | trace_rcu_future_gp(rnp, rdp, c, TPS("Startedleafroot")); | ||
| 1165 | } else { | ||
| 1166 | trace_rcu_future_gp(rnp, rdp, c, TPS("Startedroot")); | ||
| 1167 | rcu_start_gp_advanced(rdp->rsp, rnp_root, rdp); | ||
| 1168 | } | ||
| 1169 | unlock_out: | ||
| 1170 | if (rnp != rnp_root) | ||
| 1171 | raw_spin_unlock(&rnp_root->lock); | ||
| 1172 | return c; | ||
| 1173 | } | ||
| 1174 | |||
| 1175 | /* | ||
| 1176 | * Clean up any old requests for the just-ended grace period. Also return | ||
| 1177 | * whether any additional grace periods have been requested, and invoke | ||
| 1178 | * rcu_nocb_gp_cleanup() in order to wake up any no-callbacks kthreads | ||
| 1179 | * waiting for this grace period to complete. | ||
| 1180 | */ | ||
| 1181 | static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) | ||
| 1182 | { | ||
| 1183 | int c = rnp->completed; | ||
| 1184 | int needmore; | ||
| 1185 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); | ||
| 1186 | |||
| 1187 | rcu_nocb_gp_cleanup(rsp, rnp); | ||
| 1188 | rnp->need_future_gp[c & 0x1] = 0; | ||
| 1189 | needmore = rnp->need_future_gp[(c + 1) & 0x1]; | ||
| 1190 | trace_rcu_future_gp(rnp, rdp, c, | ||
| 1191 | needmore ? TPS("CleanupMore") : TPS("Cleanup")); | ||
| 1192 | return needmore; | ||
| 1193 | } | ||
| 1194 | |||
| 1195 | /* | ||
| 1196 | * If there is room, assign a ->completed number to any callbacks on | ||
| 1197 | * this CPU that have not already been assigned. Also accelerate any | ||
| 1198 | * callbacks that were previously assigned a ->completed number that has | ||
| 1199 | * since proven to be too conservative, which can happen if callbacks get | ||
| 1200 | * assigned a ->completed number while RCU is idle, but with reference to | ||
| 1201 | * a non-root rcu_node structure. This function is idempotent, so it does | ||
| 1202 | * not hurt to call it repeatedly. | ||
| 1203 | * | ||
| 1204 | * The caller must hold rnp->lock with interrupts disabled. | ||
| 1205 | */ | ||
| 1206 | static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp, | ||
| 1207 | struct rcu_data *rdp) | ||
| 1208 | { | ||
| 1209 | unsigned long c; | ||
| 1210 | int i; | ||
| 1211 | |||
| 1212 | /* If the CPU has no callbacks, nothing to do. */ | ||
| 1213 | if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) | ||
| 1214 | return; | ||
| 1215 | |||
| 1216 | /* | ||
| 1217 | * Starting from the sublist containing the callbacks most | ||
| 1218 | * recently assigned a ->completed number and working down, find the | ||
| 1219 | * first sublist that is not assignable to an upcoming grace period. | ||
| 1220 | * Such a sublist has something in it (first two tests) and has | ||
| 1221 | * a ->completed number assigned that will complete sooner than | ||
| 1222 | * the ->completed number for newly arrived callbacks (last test). | ||
| 1223 | * | ||
| 1224 | * The key point is that any later sublist can be assigned the | ||
| 1225 | * same ->completed number as the newly arrived callbacks, which | ||
| 1226 | * means that the callbacks in any of these later sublists can be | ||
| 1227 | * grouped into a single sublist, whether or not they have already | ||
| 1228 | * been assigned a ->completed number. | ||
| 1229 | */ | ||
| 1230 | c = rcu_cbs_completed(rsp, rnp); | ||
| 1231 | for (i = RCU_NEXT_TAIL - 1; i > RCU_DONE_TAIL; i--) | ||
| 1232 | if (rdp->nxttail[i] != rdp->nxttail[i - 1] && | ||
| 1233 | !ULONG_CMP_GE(rdp->nxtcompleted[i], c)) | ||
| 1234 | break; | ||
| 1235 | |||
| 1236 | /* | ||
| 1237 | * If there is no sublist for unassigned callbacks, leave. | ||
| 1238 | * At the same time, advance "i" one sublist, so that "i" will | ||
| 1239 | * index the sublist into which all the remaining callbacks should | ||
| 1240 | * be grouped. | ||
| 1241 | */ | ||
| 1242 | if (++i >= RCU_NEXT_TAIL) | ||
| 1243 | return; | ||
| 1244 | |||
| 1245 | /* | ||
| 1246 | * Assign all subsequent callbacks' ->completed number to the next | ||
| 1247 | * full grace period and group them all in the sublist initially | ||
| 1248 | * indexed by "i". | ||
| 1249 | */ | ||
| 1250 | for (; i <= RCU_NEXT_TAIL; i++) { | ||
| 1251 | rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL]; | ||
| 1252 | rdp->nxtcompleted[i] = c; | ||
| 1253 | } | ||
| 1254 | /* Record any needed additional grace periods. */ | ||
| 1255 | rcu_start_future_gp(rnp, rdp); | ||
| 1256 | |||
| 1257 | /* Trace depending on how much we were able to accelerate. */ | ||
| 1258 | if (!*rdp->nxttail[RCU_WAIT_TAIL]) | ||
| 1259 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccWaitCB")); | ||
| 1260 | else | ||
| 1261 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("AccReadyCB")); | ||
| 1262 | } | ||
| 1263 | |||
| 1264 | /* | ||
| 1265 | * Move any callbacks whose grace period has completed to the | ||
| 1266 | * RCU_DONE_TAIL sublist, then compact the remaining sublists and | ||
| 1267 | * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL | ||
| 1268 | * sublist. This function is idempotent, so it does not hurt to | ||
| 1269 | * invoke it repeatedly. As long as it is not invoked -too- often... | ||
| 1270 | * | ||
| 1271 | * The caller must hold rnp->lock with interrupts disabled. | ||
| 1272 | */ | ||
| 1273 | static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp, | ||
| 1274 | struct rcu_data *rdp) | ||
| 1275 | { | ||
| 1276 | int i, j; | ||
| 1277 | |||
| 1278 | /* If the CPU has no callbacks, nothing to do. */ | ||
| 1279 | if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL]) | ||
| 1280 | return; | ||
| 1281 | |||
| 1282 | /* | ||
| 1283 | * Find all callbacks whose ->completed numbers indicate that they | ||
| 1284 | * are ready to invoke, and put them into the RCU_DONE_TAIL sublist. | ||
| 1285 | */ | ||
| 1286 | for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) { | ||
| 1287 | if (ULONG_CMP_LT(rnp->completed, rdp->nxtcompleted[i])) | ||
| 1288 | break; | ||
| 1289 | rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[i]; | ||
| 1290 | } | ||
| 1291 | /* Clean up any sublist tail pointers that were misordered above. */ | ||
| 1292 | for (j = RCU_WAIT_TAIL; j < i; j++) | ||
| 1293 | rdp->nxttail[j] = rdp->nxttail[RCU_DONE_TAIL]; | ||
| 1294 | |||
| 1295 | /* Copy down callbacks to fill in empty sublists. */ | ||
| 1296 | for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) { | ||
| 1297 | if (rdp->nxttail[j] == rdp->nxttail[RCU_NEXT_TAIL]) | ||
| 1298 | break; | ||
| 1299 | rdp->nxttail[j] = rdp->nxttail[i]; | ||
| 1300 | rdp->nxtcompleted[j] = rdp->nxtcompleted[i]; | ||
| 1301 | } | ||
| 1302 | |||
| 1303 | /* Classify any remaining callbacks. */ | ||
| 1304 | rcu_accelerate_cbs(rsp, rnp, rdp); | ||
| 1305 | } | ||
| 1306 | |||
| 1307 | /* | ||
| 1308 | * Update CPU-local rcu_data state to record the beginnings and ends of | ||
| 1309 | * grace periods. The caller must hold the ->lock of the leaf rcu_node | ||
| 1310 | * structure corresponding to the current CPU, and must have irqs disabled. | ||
| 1311 | */ | ||
| 1312 | static void __note_gp_changes(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp) | ||
| 1313 | { | ||
| 1314 | /* Handle the ends of any preceding grace periods first. */ | ||
| 1315 | if (rdp->completed == rnp->completed) { | ||
| 1316 | |||
| 1317 | /* No grace period end, so just accelerate recent callbacks. */ | ||
| 1318 | rcu_accelerate_cbs(rsp, rnp, rdp); | ||
| 1319 | |||
| 1320 | } else { | ||
| 1321 | |||
| 1322 | /* Advance callbacks. */ | ||
| 1323 | rcu_advance_cbs(rsp, rnp, rdp); | ||
| 1324 | |||
| 1325 | /* Remember that we saw this grace-period completion. */ | ||
| 1326 | rdp->completed = rnp->completed; | ||
| 1327 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuend")); | ||
| 1328 | } | ||
| 1329 | |||
| 1330 | if (rdp->gpnum != rnp->gpnum) { | ||
| 1331 | /* | ||
| 1332 | * If the current grace period is waiting for this CPU, | ||
| 1333 | * set up to detect a quiescent state, otherwise don't | ||
| 1334 | * go looking for one. | ||
| 1335 | */ | ||
| 1336 | rdp->gpnum = rnp->gpnum; | ||
| 1337 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpustart")); | ||
| 1338 | rdp->passed_quiesce = 0; | ||
| 1339 | rdp->qs_pending = !!(rnp->qsmask & rdp->grpmask); | ||
| 1340 | zero_cpu_stall_ticks(rdp); | ||
| 1341 | } | ||
| 1342 | } | ||
| 1343 | |||
| 1344 | static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) | ||
| 1345 | { | ||
| 1346 | unsigned long flags; | ||
| 1347 | struct rcu_node *rnp; | ||
| 1348 | |||
| 1349 | local_irq_save(flags); | ||
| 1350 | rnp = rdp->mynode; | ||
| 1351 | if ((rdp->gpnum == ACCESS_ONCE(rnp->gpnum) && | ||
| 1352 | rdp->completed == ACCESS_ONCE(rnp->completed)) || /* w/out lock. */ | ||
| 1353 | !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ | ||
| 1354 | local_irq_restore(flags); | ||
| 1355 | return; | ||
| 1356 | } | ||
| 1357 | __note_gp_changes(rsp, rnp, rdp); | ||
| 1358 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 1359 | } | ||
| 1360 | |||
| 1361 | /* | ||
| 1362 | * Initialize a new grace period. Return 0 if no grace period required. | ||
| 1363 | */ | ||
| 1364 | static int rcu_gp_init(struct rcu_state *rsp) | ||
| 1365 | { | ||
| 1366 | struct rcu_data *rdp; | ||
| 1367 | struct rcu_node *rnp = rcu_get_root(rsp); | ||
| 1368 | |||
| 1369 | rcu_bind_gp_kthread(); | ||
| 1370 | raw_spin_lock_irq(&rnp->lock); | ||
| 1371 | if (rsp->gp_flags == 0) { | ||
| 1372 | /* Spurious wakeup, tell caller to go back to sleep. */ | ||
| 1373 | raw_spin_unlock_irq(&rnp->lock); | ||
| 1374 | return 0; | ||
| 1375 | } | ||
| 1376 | rsp->gp_flags = 0; /* Clear all flags: New grace period. */ | ||
| 1377 | |||
| 1378 | if (WARN_ON_ONCE(rcu_gp_in_progress(rsp))) { | ||
| 1379 | /* | ||
| 1380 | * Grace period already in progress, don't start another. | ||
| 1381 | * Not supposed to be able to happen. | ||
| 1382 | */ | ||
| 1383 | raw_spin_unlock_irq(&rnp->lock); | ||
| 1384 | return 0; | ||
| 1385 | } | ||
| 1386 | |||
| 1387 | /* Advance to a new grace period and initialize state. */ | ||
| 1388 | record_gp_stall_check_time(rsp); | ||
| 1389 | smp_wmb(); /* Record GP times before starting GP. */ | ||
| 1390 | rsp->gpnum++; | ||
| 1391 | trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); | ||
| 1392 | raw_spin_unlock_irq(&rnp->lock); | ||
| 1393 | |||
| 1394 | /* Exclude any concurrent CPU-hotplug operations. */ | ||
| 1395 | mutex_lock(&rsp->onoff_mutex); | ||
| 1396 | |||
| 1397 | /* | ||
| 1398 | * Set the quiescent-state-needed bits in all the rcu_node | ||
| 1399 | * structures for all currently online CPUs in breadth-first order, | ||
| 1400 | * starting from the root rcu_node structure, relying on the layout | ||
| 1401 | * of the tree within the rsp->node[] array. Note that other CPUs | ||
| 1402 | * will access only the leaves of the hierarchy, thus seeing that no | ||
| 1403 | * grace period is in progress, at least until the corresponding | ||
| 1404 | * leaf node has been initialized. In addition, we have excluded | ||
| 1405 | * CPU-hotplug operations. | ||
| 1406 | * | ||
| 1407 | * The grace period cannot complete until the initialization | ||
| 1408 | * process finishes, because this kthread handles both. | ||
| 1409 | */ | ||
| 1410 | rcu_for_each_node_breadth_first(rsp, rnp) { | ||
| 1411 | raw_spin_lock_irq(&rnp->lock); | ||
| 1412 | rdp = this_cpu_ptr(rsp->rda); | ||
| 1413 | rcu_preempt_check_blocked_tasks(rnp); | ||
| 1414 | rnp->qsmask = rnp->qsmaskinit; | ||
| 1415 | ACCESS_ONCE(rnp->gpnum) = rsp->gpnum; | ||
| 1416 | WARN_ON_ONCE(rnp->completed != rsp->completed); | ||
| 1417 | ACCESS_ONCE(rnp->completed) = rsp->completed; | ||
| 1418 | if (rnp == rdp->mynode) | ||
| 1419 | __note_gp_changes(rsp, rnp, rdp); | ||
| 1420 | rcu_preempt_boost_start_gp(rnp); | ||
| 1421 | trace_rcu_grace_period_init(rsp->name, rnp->gpnum, | ||
| 1422 | rnp->level, rnp->grplo, | ||
| 1423 | rnp->grphi, rnp->qsmask); | ||
| 1424 | raw_spin_unlock_irq(&rnp->lock); | ||
| 1425 | #ifdef CONFIG_PROVE_RCU_DELAY | ||
| 1426 | if ((prandom_u32() % (rcu_num_nodes + 1)) == 0 && | ||
| 1427 | system_state == SYSTEM_RUNNING) | ||
| 1428 | udelay(200); | ||
| 1429 | #endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ | ||
| 1430 | cond_resched(); | ||
| 1431 | } | ||
| 1432 | |||
| 1433 | mutex_unlock(&rsp->onoff_mutex); | ||
| 1434 | return 1; | ||
| 1435 | } | ||
| 1436 | |||
| 1437 | /* | ||
| 1438 | * Do one round of quiescent-state forcing. | ||
| 1439 | */ | ||
| 1440 | static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) | ||
| 1441 | { | ||
| 1442 | int fqs_state = fqs_state_in; | ||
| 1443 | bool isidle = false; | ||
| 1444 | unsigned long maxj; | ||
| 1445 | struct rcu_node *rnp = rcu_get_root(rsp); | ||
| 1446 | |||
| 1447 | rsp->n_force_qs++; | ||
| 1448 | if (fqs_state == RCU_SAVE_DYNTICK) { | ||
| 1449 | /* Collect dyntick-idle snapshots. */ | ||
| 1450 | if (is_sysidle_rcu_state(rsp)) { | ||
| 1451 | isidle = 1; | ||
| 1452 | maxj = jiffies - ULONG_MAX / 4; | ||
| 1453 | } | ||
| 1454 | force_qs_rnp(rsp, dyntick_save_progress_counter, | ||
| 1455 | &isidle, &maxj); | ||
| 1456 | rcu_sysidle_report_gp(rsp, isidle, maxj); | ||
| 1457 | fqs_state = RCU_FORCE_QS; | ||
| 1458 | } else { | ||
| 1459 | /* Handle dyntick-idle and offline CPUs. */ | ||
| 1460 | isidle = 0; | ||
| 1461 | force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj); | ||
| 1462 | } | ||
| 1463 | /* Clear flag to prevent immediate re-entry. */ | ||
| 1464 | if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { | ||
| 1465 | raw_spin_lock_irq(&rnp->lock); | ||
| 1466 | rsp->gp_flags &= ~RCU_GP_FLAG_FQS; | ||
| 1467 | raw_spin_unlock_irq(&rnp->lock); | ||
| 1468 | } | ||
| 1469 | return fqs_state; | ||
| 1470 | } | ||
| 1471 | |||
| 1472 | /* | ||
| 1473 | * Clean up after the old grace period. | ||
| 1474 | */ | ||
| 1475 | static void rcu_gp_cleanup(struct rcu_state *rsp) | ||
| 1476 | { | ||
| 1477 | unsigned long gp_duration; | ||
| 1478 | int nocb = 0; | ||
| 1479 | struct rcu_data *rdp; | ||
| 1480 | struct rcu_node *rnp = rcu_get_root(rsp); | ||
| 1481 | |||
| 1482 | raw_spin_lock_irq(&rnp->lock); | ||
| 1483 | gp_duration = jiffies - rsp->gp_start; | ||
| 1484 | if (gp_duration > rsp->gp_max) | ||
| 1485 | rsp->gp_max = gp_duration; | ||
| 1486 | |||
| 1487 | /* | ||
| 1488 | * We know the grace period is complete, but to everyone else | ||
| 1489 | * it appears to still be ongoing. But it is also the case | ||
| 1490 | * that to everyone else it looks like there is nothing that | ||
| 1491 | * they can do to advance the grace period. It is therefore | ||
| 1492 | * safe for us to drop the lock in order to mark the grace | ||
| 1493 | * period as completed in all of the rcu_node structures. | ||
| 1494 | */ | ||
| 1495 | raw_spin_unlock_irq(&rnp->lock); | ||
| 1496 | |||
| 1497 | /* | ||
| 1498 | * Propagate new ->completed value to rcu_node structures so | ||
| 1499 | * that other CPUs don't have to wait until the start of the next | ||
| 1500 | * grace period to process their callbacks. This also avoids | ||
| 1501 | * some nasty RCU grace-period initialization races by forcing | ||
| 1502 | * the end of the current grace period to be completely recorded in | ||
| 1503 | * all of the rcu_node structures before the beginning of the next | ||
| 1504 | * grace period is recorded in any of the rcu_node structures. | ||
| 1505 | */ | ||
| 1506 | rcu_for_each_node_breadth_first(rsp, rnp) { | ||
| 1507 | raw_spin_lock_irq(&rnp->lock); | ||
| 1508 | ACCESS_ONCE(rnp->completed) = rsp->gpnum; | ||
| 1509 | rdp = this_cpu_ptr(rsp->rda); | ||
| 1510 | if (rnp == rdp->mynode) | ||
| 1511 | __note_gp_changes(rsp, rnp, rdp); | ||
| 1512 | nocb += rcu_future_gp_cleanup(rsp, rnp); | ||
| 1513 | raw_spin_unlock_irq(&rnp->lock); | ||
| 1514 | cond_resched(); | ||
| 1515 | } | ||
| 1516 | rnp = rcu_get_root(rsp); | ||
| 1517 | raw_spin_lock_irq(&rnp->lock); | ||
| 1518 | rcu_nocb_gp_set(rnp, nocb); | ||
| 1519 | |||
| 1520 | rsp->completed = rsp->gpnum; /* Declare grace period done. */ | ||
| 1521 | trace_rcu_grace_period(rsp->name, rsp->completed, TPS("end")); | ||
| 1522 | rsp->fqs_state = RCU_GP_IDLE; | ||
| 1523 | rdp = this_cpu_ptr(rsp->rda); | ||
| 1524 | rcu_advance_cbs(rsp, rnp, rdp); /* Reduce false positives below. */ | ||
| 1525 | if (cpu_needs_another_gp(rsp, rdp)) { | ||
| 1526 | rsp->gp_flags = RCU_GP_FLAG_INIT; | ||
| 1527 | trace_rcu_grace_period(rsp->name, | ||
| 1528 | ACCESS_ONCE(rsp->gpnum), | ||
| 1529 | TPS("newreq")); | ||
| 1530 | } | ||
| 1531 | raw_spin_unlock_irq(&rnp->lock); | ||
| 1532 | } | ||
| 1533 | |||
| 1534 | /* | ||
| 1535 | * Body of kthread that handles grace periods. | ||
| 1536 | */ | ||
| 1537 | static int __noreturn rcu_gp_kthread(void *arg) | ||
| 1538 | { | ||
| 1539 | int fqs_state; | ||
| 1540 | int gf; | ||
| 1541 | unsigned long j; | ||
| 1542 | int ret; | ||
| 1543 | struct rcu_state *rsp = arg; | ||
| 1544 | struct rcu_node *rnp = rcu_get_root(rsp); | ||
| 1545 | |||
| 1546 | for (;;) { | ||
| 1547 | |||
| 1548 | /* Handle grace-period start. */ | ||
| 1549 | for (;;) { | ||
| 1550 | trace_rcu_grace_period(rsp->name, | ||
| 1551 | ACCESS_ONCE(rsp->gpnum), | ||
| 1552 | TPS("reqwait")); | ||
| 1553 | wait_event_interruptible(rsp->gp_wq, | ||
| 1554 | ACCESS_ONCE(rsp->gp_flags) & | ||
| 1555 | RCU_GP_FLAG_INIT); | ||
| 1556 | if (rcu_gp_init(rsp)) | ||
| 1557 | break; | ||
| 1558 | cond_resched(); | ||
| 1559 | flush_signals(current); | ||
| 1560 | trace_rcu_grace_period(rsp->name, | ||
| 1561 | ACCESS_ONCE(rsp->gpnum), | ||
| 1562 | TPS("reqwaitsig")); | ||
| 1563 | } | ||
| 1564 | |||
| 1565 | /* Handle quiescent-state forcing. */ | ||
| 1566 | fqs_state = RCU_SAVE_DYNTICK; | ||
| 1567 | j = jiffies_till_first_fqs; | ||
| 1568 | if (j > HZ) { | ||
| 1569 | j = HZ; | ||
| 1570 | jiffies_till_first_fqs = HZ; | ||
| 1571 | } | ||
| 1572 | ret = 0; | ||
| 1573 | for (;;) { | ||
| 1574 | if (!ret) | ||
| 1575 | rsp->jiffies_force_qs = jiffies + j; | ||
| 1576 | trace_rcu_grace_period(rsp->name, | ||
| 1577 | ACCESS_ONCE(rsp->gpnum), | ||
| 1578 | TPS("fqswait")); | ||
| 1579 | ret = wait_event_interruptible_timeout(rsp->gp_wq, | ||
| 1580 | ((gf = ACCESS_ONCE(rsp->gp_flags)) & | ||
| 1581 | RCU_GP_FLAG_FQS) || | ||
| 1582 | (!ACCESS_ONCE(rnp->qsmask) && | ||
| 1583 | !rcu_preempt_blocked_readers_cgp(rnp)), | ||
| 1584 | j); | ||
| 1585 | /* If grace period done, leave loop. */ | ||
| 1586 | if (!ACCESS_ONCE(rnp->qsmask) && | ||
| 1587 | !rcu_preempt_blocked_readers_cgp(rnp)) | ||
| 1588 | break; | ||
| 1589 | /* If time for quiescent-state forcing, do it. */ | ||
| 1590 | if (ULONG_CMP_GE(jiffies, rsp->jiffies_force_qs) || | ||
| 1591 | (gf & RCU_GP_FLAG_FQS)) { | ||
| 1592 | trace_rcu_grace_period(rsp->name, | ||
| 1593 | ACCESS_ONCE(rsp->gpnum), | ||
| 1594 | TPS("fqsstart")); | ||
| 1595 | fqs_state = rcu_gp_fqs(rsp, fqs_state); | ||
| 1596 | trace_rcu_grace_period(rsp->name, | ||
| 1597 | ACCESS_ONCE(rsp->gpnum), | ||
| 1598 | TPS("fqsend")); | ||
| 1599 | cond_resched(); | ||
| 1600 | } else { | ||
| 1601 | /* Deal with stray signal. */ | ||
| 1602 | cond_resched(); | ||
| 1603 | flush_signals(current); | ||
| 1604 | trace_rcu_grace_period(rsp->name, | ||
| 1605 | ACCESS_ONCE(rsp->gpnum), | ||
| 1606 | TPS("fqswaitsig")); | ||
| 1607 | } | ||
| 1608 | j = jiffies_till_next_fqs; | ||
| 1609 | if (j > HZ) { | ||
| 1610 | j = HZ; | ||
| 1611 | jiffies_till_next_fqs = HZ; | ||
| 1612 | } else if (j < 1) { | ||
| 1613 | j = 1; | ||
| 1614 | jiffies_till_next_fqs = 1; | ||
| 1615 | } | ||
| 1616 | } | ||
| 1617 | |||
| 1618 | /* Handle grace-period end. */ | ||
| 1619 | rcu_gp_cleanup(rsp); | ||
| 1620 | } | ||
| 1621 | } | ||
| 1622 | |||
| 1623 | static void rsp_wakeup(struct irq_work *work) | ||
| 1624 | { | ||
| 1625 | struct rcu_state *rsp = container_of(work, struct rcu_state, wakeup_work); | ||
| 1626 | |||
| 1627 | /* Wake up rcu_gp_kthread() to start the grace period. */ | ||
| 1628 | wake_up(&rsp->gp_wq); | ||
| 1629 | } | ||
| 1630 | |||
| 1631 | /* | ||
| 1632 | * Start a new RCU grace period if warranted, re-initializing the hierarchy | ||
| 1633 | * in preparation for detecting the next grace period. The caller must hold | ||
| 1634 | * the root node's ->lock and hard irqs must be disabled. | ||
| 1635 | * | ||
| 1636 | * Note that it is legal for a dying CPU (which is marked as offline) to | ||
| 1637 | * invoke this function. This can happen when the dying CPU reports its | ||
| 1638 | * quiescent state. | ||
| 1639 | */ | ||
| 1640 | static void | ||
| 1641 | rcu_start_gp_advanced(struct rcu_state *rsp, struct rcu_node *rnp, | ||
| 1642 | struct rcu_data *rdp) | ||
| 1643 | { | ||
| 1644 | if (!rsp->gp_kthread || !cpu_needs_another_gp(rsp, rdp)) { | ||
| 1645 | /* | ||
| 1646 | * Either we have not yet spawned the grace-period | ||
| 1647 | * task, this CPU does not need another grace period, | ||
| 1648 | * or a grace period is already in progress. | ||
| 1649 | * Either way, don't start a new grace period. | ||
| 1650 | */ | ||
| 1651 | return; | ||
| 1652 | } | ||
| 1653 | rsp->gp_flags = RCU_GP_FLAG_INIT; | ||
| 1654 | trace_rcu_grace_period(rsp->name, ACCESS_ONCE(rsp->gpnum), | ||
| 1655 | TPS("newreq")); | ||
| 1656 | |||
| 1657 | /* | ||
| 1658 | * We can't do wakeups while holding the rnp->lock, as that | ||
| 1659 | * could cause possible deadlocks with the rq->lock. Defer | ||
| 1660 | * the wakeup to interrupt context. And don't bother waking | ||
| 1661 | * up the running kthread. | ||
| 1662 | */ | ||
| 1663 | if (current != rsp->gp_kthread) | ||
| 1664 | irq_work_queue(&rsp->wakeup_work); | ||
| 1665 | } | ||
| 1666 | |||
| 1667 | /* | ||
| 1668 | * Similar to rcu_start_gp_advanced(), but also advance the calling CPU's | ||
| 1669 | * callbacks. Note that rcu_start_gp_advanced() cannot do this because it | ||
| 1670 | * is invoked indirectly from rcu_advance_cbs(), which would result in | ||
| 1671 | * endless recursion -- or would do so if it wasn't for the self-deadlock | ||
| 1672 | * that is encountered beforehand. | ||
| 1673 | */ | ||
| 1674 | static void | ||
| 1675 | rcu_start_gp(struct rcu_state *rsp) | ||
| 1676 | { | ||
| 1677 | struct rcu_data *rdp = this_cpu_ptr(rsp->rda); | ||
| 1678 | struct rcu_node *rnp = rcu_get_root(rsp); | ||
| 1679 | |||
| 1680 | /* | ||
| 1681 | * If there is no grace period in progress right now, any | ||
| 1682 | * callbacks we have up to this point will be satisfied by the | ||
| 1683 | * next grace period. Also, advancing the callbacks reduces the | ||
| 1684 | * probability of false positives from cpu_needs_another_gp() | ||
| 1685 | * resulting in pointless grace periods. So, advance callbacks | ||
| 1686 | * then start the grace period! | ||
| 1687 | */ | ||
| 1688 | rcu_advance_cbs(rsp, rnp, rdp); | ||
| 1689 | rcu_start_gp_advanced(rsp, rnp, rdp); | ||
| 1690 | } | ||
| 1691 | |||
| 1692 | /* | ||
| 1693 | * Report a full set of quiescent states to the specified rcu_state | ||
| 1694 | * data structure. This involves cleaning up after the prior grace | ||
| 1695 | * period and letting rcu_start_gp() start up the next grace period | ||
| 1696 | * if one is needed. Note that the caller must hold rnp->lock, which | ||
| 1697 | * is released before return. | ||
| 1698 | */ | ||
| 1699 | static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) | ||
| 1700 | __releases(rcu_get_root(rsp)->lock) | ||
| 1701 | { | ||
| 1702 | WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); | ||
| 1703 | raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); | ||
| 1704 | wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ | ||
| 1705 | } | ||
| 1706 | |||
| 1707 | /* | ||
| 1708 | * Similar to rcu_report_qs_rdp(), for which it is a helper function. | ||
| 1709 | * Allows quiescent states for a group of CPUs to be reported at one go | ||
| 1710 | * to the specified rcu_node structure, though all the CPUs in the group | ||
| 1711 | * must be represented by the same rcu_node structure (which need not be | ||
| 1712 | * a leaf rcu_node structure, though it often will be). That structure's | ||
| 1713 | * lock must be held upon entry, and it is released before return. | ||
| 1714 | */ | ||
| 1715 | static void | ||
| 1716 | rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, | ||
| 1717 | struct rcu_node *rnp, unsigned long flags) | ||
| 1718 | __releases(rnp->lock) | ||
| 1719 | { | ||
| 1720 | struct rcu_node *rnp_c; | ||
| 1721 | |||
| 1722 | /* Walk up the rcu_node hierarchy. */ | ||
| 1723 | for (;;) { | ||
| 1724 | if (!(rnp->qsmask & mask)) { | ||
| 1725 | |||
| 1726 | /* Our bit has already been cleared, so done. */ | ||
| 1727 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 1728 | return; | ||
| 1729 | } | ||
| 1730 | rnp->qsmask &= ~mask; | ||
| 1731 | trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum, | ||
| 1732 | mask, rnp->qsmask, rnp->level, | ||
| 1733 | rnp->grplo, rnp->grphi, | ||
| 1734 | !!rnp->gp_tasks); | ||
| 1735 | if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { | ||
| 1736 | |||
| 1737 | /* Other bits still set at this level, so done. */ | ||
| 1738 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 1739 | return; | ||
| 1740 | } | ||
| 1741 | mask = rnp->grpmask; | ||
| 1742 | if (rnp->parent == NULL) { | ||
| 1743 | |||
| 1744 | /* No more levels. Exit loop holding root lock. */ | ||
| 1745 | |||
| 1746 | break; | ||
| 1747 | } | ||
| 1748 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 1749 | rnp_c = rnp; | ||
| 1750 | rnp = rnp->parent; | ||
| 1751 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
| 1752 | WARN_ON_ONCE(rnp_c->qsmask); | ||
| 1753 | } | ||
| 1754 | |||
| 1755 | /* | ||
| 1756 | * Get here if we are the last CPU to pass through a quiescent | ||
| 1757 | * state for this grace period. Invoke rcu_report_qs_rsp() | ||
| 1758 | * to clean up and start the next grace period if one is needed. | ||
| 1759 | */ | ||
| 1760 | rcu_report_qs_rsp(rsp, flags); /* releases rnp->lock. */ | ||
| 1761 | } | ||
| 1762 | |||
| 1763 | /* | ||
| 1764 | * Record a quiescent state for the specified CPU to that CPU's rcu_data | ||
| 1765 | * structure. This must be either called from the specified CPU, or | ||
| 1766 | * called when the specified CPU is known to be offline (and when it is | ||
| 1767 | * also known that no other CPU is concurrently trying to help the offline | ||
| 1768 | * CPU). The lastcomp argument is used to make sure we are still in the | ||
| 1769 | * grace period of interest. We don't want to end the current grace period | ||
| 1770 | * based on quiescent states detected in an earlier grace period! | ||
| 1771 | */ | ||
| 1772 | static void | ||
| 1773 | rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) | ||
| 1774 | { | ||
| 1775 | unsigned long flags; | ||
| 1776 | unsigned long mask; | ||
| 1777 | struct rcu_node *rnp; | ||
| 1778 | |||
| 1779 | rnp = rdp->mynode; | ||
| 1780 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
| 1781 | if (rdp->passed_quiesce == 0 || rdp->gpnum != rnp->gpnum || | ||
| 1782 | rnp->completed == rnp->gpnum) { | ||
| 1783 | |||
| 1784 | /* | ||
| 1785 | * The grace period in which this quiescent state was | ||
| 1786 | * recorded has ended, so don't report it upwards. | ||
| 1787 | * We will instead need a new quiescent state that lies | ||
| 1788 | * within the current grace period. | ||
| 1789 | */ | ||
| 1790 | rdp->passed_quiesce = 0; /* need qs for new gp. */ | ||
| 1791 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 1792 | return; | ||
| 1793 | } | ||
| 1794 | mask = rdp->grpmask; | ||
| 1795 | if ((rnp->qsmask & mask) == 0) { | ||
| 1796 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 1797 | } else { | ||
| 1798 | rdp->qs_pending = 0; | ||
| 1799 | |||
| 1800 | /* | ||
| 1801 | * This GP can't end until this CPU checks in, so all of our | ||
| 1802 | * callbacks can be processed during the next GP. | ||
| 1803 | */ | ||
| 1804 | rcu_accelerate_cbs(rsp, rnp, rdp); | ||
| 1805 | |||
| 1806 | rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */ | ||
| 1807 | } | ||
| 1808 | } | ||
| 1809 | |||
| 1810 | /* | ||
| 1811 | * Check to see if there is a new grace period of which this CPU | ||
| 1812 | * is not yet aware, and if so, set up local rcu_data state for it. | ||
| 1813 | * Otherwise, see if this CPU has just passed through its first | ||
| 1814 | * quiescent state for this grace period, and record that fact if so. | ||
| 1815 | */ | ||
| 1816 | static void | ||
| 1817 | rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp) | ||
| 1818 | { | ||
| 1819 | /* Check for grace-period ends and beginnings. */ | ||
| 1820 | note_gp_changes(rsp, rdp); | ||
| 1821 | |||
| 1822 | /* | ||
| 1823 | * Does this CPU still need to do its part for current grace period? | ||
| 1824 | * If no, return and let the other CPUs do their part as well. | ||
| 1825 | */ | ||
| 1826 | if (!rdp->qs_pending) | ||
| 1827 | return; | ||
| 1828 | |||
| 1829 | /* | ||
| 1830 | * Was there a quiescent state since the beginning of the grace | ||
| 1831 | * period? If no, then exit and wait for the next call. | ||
| 1832 | */ | ||
| 1833 | if (!rdp->passed_quiesce) | ||
| 1834 | return; | ||
| 1835 | |||
| 1836 | /* | ||
| 1837 | * Tell RCU we are done (but rcu_report_qs_rdp() will be the | ||
| 1838 | * judge of that). | ||
| 1839 | */ | ||
| 1840 | rcu_report_qs_rdp(rdp->cpu, rsp, rdp); | ||
| 1841 | } | ||
| 1842 | |||
| 1843 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 1844 | |||
| 1845 | /* | ||
| 1846 | * Send the specified CPU's RCU callbacks to the orphanage. The | ||
| 1847 | * specified CPU must be offline, and the caller must hold the | ||
| 1848 | * ->orphan_lock. | ||
| 1849 | */ | ||
| 1850 | static void | ||
| 1851 | rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp, | ||
| 1852 | struct rcu_node *rnp, struct rcu_data *rdp) | ||
| 1853 | { | ||
| 1854 | /* No-CBs CPUs do not have orphanable callbacks. */ | ||
| 1855 | if (rcu_is_nocb_cpu(rdp->cpu)) | ||
| 1856 | return; | ||
| 1857 | |||
| 1858 | /* | ||
| 1859 | * Orphan the callbacks. First adjust the counts. This is safe | ||
| 1860 | * because _rcu_barrier() excludes CPU-hotplug operations, so it | ||
| 1861 | * cannot be running now. Thus no memory barrier is required. | ||
| 1862 | */ | ||
| 1863 | if (rdp->nxtlist != NULL) { | ||
| 1864 | rsp->qlen_lazy += rdp->qlen_lazy; | ||
| 1865 | rsp->qlen += rdp->qlen; | ||
| 1866 | rdp->n_cbs_orphaned += rdp->qlen; | ||
| 1867 | rdp->qlen_lazy = 0; | ||
| 1868 | ACCESS_ONCE(rdp->qlen) = 0; | ||
| 1869 | } | ||
| 1870 | |||
| 1871 | /* | ||
| 1872 | * Next, move those callbacks still needing a grace period to | ||
| 1873 | * the orphanage, where some other CPU will pick them up. | ||
| 1874 | * Some of the callbacks might have gone partway through a grace | ||
| 1875 | * period, but that is too bad. They get to start over because we | ||
| 1876 | * cannot assume that grace periods are synchronized across CPUs. | ||
| 1877 | * We don't bother updating the ->nxttail[] array yet, instead | ||
| 1878 | * we just reset the whole thing later on. | ||
| 1879 | */ | ||
| 1880 | if (*rdp->nxttail[RCU_DONE_TAIL] != NULL) { | ||
| 1881 | *rsp->orphan_nxttail = *rdp->nxttail[RCU_DONE_TAIL]; | ||
| 1882 | rsp->orphan_nxttail = rdp->nxttail[RCU_NEXT_TAIL]; | ||
| 1883 | *rdp->nxttail[RCU_DONE_TAIL] = NULL; | ||
| 1884 | } | ||
| 1885 | |||
| 1886 | /* | ||
| 1887 | * Then move the ready-to-invoke callbacks to the orphanage, | ||
| 1888 | * where some other CPU will pick them up. These will not be | ||
| 1889 | * required to pass through another grace period: They are done. | ||
| 1890 | */ | ||
| 1891 | if (rdp->nxtlist != NULL) { | ||
| 1892 | *rsp->orphan_donetail = rdp->nxtlist; | ||
| 1893 | rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL]; | ||
| 1894 | } | ||
| 1895 | |||
| 1896 | /* Finally, initialize the rcu_data structure's list to empty. */ | ||
| 1897 | init_callback_list(rdp); | ||
| 1898 | } | ||
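The hand-off above never walks the callback lists: whole sublists are spliced in O(1) by redirecting tail pointers-to-pointers. A stand-alone sketch of the same idiom, using made-up types rather than the kernel's rcu_head lists:

#include <stddef.h>

struct cb {
	struct cb *next;
};

struct cblist {
	struct cb *head;
	struct cb **tail;	/* points at the trailing NULL link */
};

static void cblist_init(struct cblist *l)
{
	l->head = NULL;
	l->tail = &l->head;
}

/* Move every element of @src onto the end of @dst in O(1), emptying @src. */
static void cblist_splice(struct cblist *dst, struct cblist *src)
{
	if (src->head == NULL)
		return;			/* nothing to move */
	*dst->tail = src->head;		/* hook source list after destination */
	dst->tail = src->tail;		/* destination now ends where source did */
	cblist_init(src);		/* source is empty again */
}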
| 1899 | |||
| 1900 | /* | ||
| 1901 | * Adopt the RCU callbacks from the specified rcu_state structure's | ||
| 1902 | * orphanage. The caller must hold the ->orphan_lock. | ||
| 1903 | */ | ||
| 1904 | static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) | ||
| 1905 | { | ||
| 1906 | int i; | ||
| 1907 | struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); | ||
| 1908 | |||
| 1909 | /* No-CBs CPUs are handled specially. */ | ||
| 1910 | if (rcu_nocb_adopt_orphan_cbs(rsp, rdp)) | ||
| 1911 | return; | ||
| 1912 | |||
| 1913 | /* Do the accounting first. */ | ||
| 1914 | rdp->qlen_lazy += rsp->qlen_lazy; | ||
| 1915 | rdp->qlen += rsp->qlen; | ||
| 1916 | rdp->n_cbs_adopted += rsp->qlen; | ||
| 1917 | if (rsp->qlen_lazy != rsp->qlen) | ||
| 1918 | rcu_idle_count_callbacks_posted(); | ||
| 1919 | rsp->qlen_lazy = 0; | ||
| 1920 | rsp->qlen = 0; | ||
| 1921 | |||
| 1922 | /* | ||
| 1923 | * We do not need a memory barrier here because the only way we | ||
| 1924 | * can get here, if there is an rcu_barrier() in flight, is if | ||
| 1925 | * we are the task doing the rcu_barrier(). | ||
| 1926 | */ | ||
| 1927 | |||
| 1928 | /* First adopt the ready-to-invoke callbacks. */ | ||
| 1929 | if (rsp->orphan_donelist != NULL) { | ||
| 1930 | *rsp->orphan_donetail = *rdp->nxttail[RCU_DONE_TAIL]; | ||
| 1931 | *rdp->nxttail[RCU_DONE_TAIL] = rsp->orphan_donelist; | ||
| 1932 | for (i = RCU_NEXT_SIZE - 1; i >= RCU_DONE_TAIL; i--) | ||
| 1933 | if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) | ||
| 1934 | rdp->nxttail[i] = rsp->orphan_donetail; | ||
| 1935 | rsp->orphan_donelist = NULL; | ||
| 1936 | rsp->orphan_donetail = &rsp->orphan_donelist; | ||
| 1937 | } | ||
| 1938 | |||
| 1939 | /* And then adopt the callbacks that still need a grace period. */ | ||
| 1940 | if (rsp->orphan_nxtlist != NULL) { | ||
| 1941 | *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxtlist; | ||
| 1942 | rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_nxttail; | ||
| 1943 | rsp->orphan_nxtlist = NULL; | ||
| 1944 | rsp->orphan_nxttail = &rsp->orphan_nxtlist; | ||
| 1945 | } | ||
| 1946 | } | ||
| 1947 | |||
| 1948 | /* | ||
| 1949 | * Trace the fact that this CPU is going offline. | ||
| 1950 | */ | ||
| 1951 | static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) | ||
| 1952 | { | ||
| 1953 | RCU_TRACE(unsigned long mask); | ||
| 1954 | RCU_TRACE(struct rcu_data *rdp = this_cpu_ptr(rsp->rda)); | ||
| 1955 | RCU_TRACE(struct rcu_node *rnp = rdp->mynode); | ||
| 1956 | |||
| 1957 | RCU_TRACE(mask = rdp->grpmask); | ||
| 1958 | trace_rcu_grace_period(rsp->name, | ||
| 1959 | rnp->gpnum + 1 - !!(rnp->qsmask & mask), | ||
| 1960 | TPS("cpuofl")); | ||
| 1961 | } | ||
| 1962 | |||
| 1963 | /* | ||
| 1964 | * The CPU has been completely removed, and some other CPU is reporting | ||
| 1965 | * this fact from process context. Do the remainder of the cleanup, | ||
| 1966 | * including orphaning the outgoing CPU's RCU callbacks, and also | ||
| 1967 | * adopting them. There can only be one CPU hotplug operation at a time, | ||
| 1968 | * so no other CPU can be attempting to update rcu_cpu_kthread_task. | ||
| 1969 | */ | ||
| 1970 | static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | ||
| 1971 | { | ||
| 1972 | unsigned long flags; | ||
| 1973 | unsigned long mask; | ||
| 1974 | int need_report = 0; | ||
| 1975 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | ||
| 1976 | struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ | ||
| 1977 | |||
| 1978 | /* Adjust any no-longer-needed kthreads. */ | ||
| 1979 | rcu_boost_kthread_setaffinity(rnp, -1); | ||
| 1980 | |||
| 1981 | /* Remove the dead CPU from the bitmasks in the rcu_node hierarchy. */ | ||
| 1982 | |||
| 1983 | /* Exclude any attempts to start a new grace period. */ | ||
| 1984 | mutex_lock(&rsp->onoff_mutex); | ||
| 1985 | raw_spin_lock_irqsave(&rsp->orphan_lock, flags); | ||
| 1986 | |||
| 1987 | /* Orphan the dead CPU's callbacks, and adopt them if appropriate. */ | ||
| 1988 | rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp); | ||
| 1989 | rcu_adopt_orphan_cbs(rsp); | ||
| 1990 | |||
| 1991 | /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ | ||
| 1992 | mask = rdp->grpmask; /* rnp->grplo is constant. */ | ||
| 1993 | do { | ||
| 1994 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | ||
| 1995 | rnp->qsmaskinit &= ~mask; | ||
| 1996 | if (rnp->qsmaskinit != 0) { | ||
| 1997 | if (rnp != rdp->mynode) | ||
| 1998 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | ||
| 1999 | break; | ||
| 2000 | } | ||
| 2001 | if (rnp == rdp->mynode) | ||
| 2002 | need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); | ||
| 2003 | else | ||
| 2004 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | ||
| 2005 | mask = rnp->grpmask; | ||
| 2006 | rnp = rnp->parent; | ||
| 2007 | } while (rnp != NULL); | ||
| 2008 | |||
| 2009 | /* | ||
| 2010 | * We still hold the leaf rcu_node structure lock here, and | ||
| 2011 | * irqs are still disabled. The reason for this subterfuge is | ||
| 2012 | * that invoking rcu_report_unblock_qs_rnp() with ->orphan_lock | ||
| 2013 | * held leads to deadlock. | ||
| 2014 | */ | ||
| 2015 | raw_spin_unlock(&rsp->orphan_lock); /* irqs remain disabled. */ | ||
| 2016 | rnp = rdp->mynode; | ||
| 2017 | if (need_report & RCU_OFL_TASKS_NORM_GP) | ||
| 2018 | rcu_report_unblock_qs_rnp(rnp, flags); | ||
| 2019 | else | ||
| 2020 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 2021 | if (need_report & RCU_OFL_TASKS_EXP_GP) | ||
| 2022 | rcu_report_exp_rnp(rsp, rnp, true); | ||
| 2023 | WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL, | ||
| 2024 | "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n", | ||
| 2025 | cpu, rdp->qlen, rdp->nxtlist); | ||
| 2026 | init_callback_list(rdp); | ||
| 2027 | /* Disallow further callbacks on this CPU. */ | ||
| 2028 | rdp->nxttail[RCU_NEXT_TAIL] = NULL; | ||
| 2029 | mutex_unlock(&rsp->onoff_mutex); | ||
| 2030 | } | ||
| 2031 | |||
| 2032 | #else /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
| 2033 | |||
| 2034 | static void rcu_cleanup_dying_cpu(struct rcu_state *rsp) | ||
| 2035 | { | ||
| 2036 | } | ||
| 2037 | |||
| 2038 | static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp) | ||
| 2039 | { | ||
| 2040 | } | ||
| 2041 | |||
| 2042 | #endif /* #else #ifdef CONFIG_HOTPLUG_CPU */ | ||
| 2043 | |||
| 2044 | /* | ||
| 2045 | * Invoke any RCU callbacks that have made it to the end of their grace | ||
| 2046 | * period. Throttle as specified by rdp->blimit. | ||
| 2047 | */ | ||
| 2048 | static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp) | ||
| 2049 | { | ||
| 2050 | unsigned long flags; | ||
| 2051 | struct rcu_head *next, *list, **tail; | ||
| 2052 | long bl, count, count_lazy; | ||
| 2053 | int i; | ||
| 2054 | |||
| 2055 | /* If no callbacks are ready, just return. */ | ||
| 2056 | if (!cpu_has_callbacks_ready_to_invoke(rdp)) { | ||
| 2057 | trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0); | ||
| 2058 | trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist), | ||
| 2059 | need_resched(), is_idle_task(current), | ||
| 2060 | rcu_is_callbacks_kthread()); | ||
| 2061 | return; | ||
| 2062 | } | ||
| 2063 | |||
| 2064 | /* | ||
| 2065 | * Extract the list of ready callbacks, disabling interrupts to prevent | ||
| 2066 | * races with call_rcu() from interrupt handlers. | ||
| 2067 | */ | ||
| 2068 | local_irq_save(flags); | ||
| 2069 | WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); | ||
| 2070 | bl = rdp->blimit; | ||
| 2071 | trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, bl); | ||
| 2072 | list = rdp->nxtlist; | ||
| 2073 | rdp->nxtlist = *rdp->nxttail[RCU_DONE_TAIL]; | ||
| 2074 | *rdp->nxttail[RCU_DONE_TAIL] = NULL; | ||
| 2075 | tail = rdp->nxttail[RCU_DONE_TAIL]; | ||
| 2076 | for (i = RCU_NEXT_SIZE - 1; i >= 0; i--) | ||
| 2077 | if (rdp->nxttail[i] == rdp->nxttail[RCU_DONE_TAIL]) | ||
| 2078 | rdp->nxttail[i] = &rdp->nxtlist; | ||
| 2079 | local_irq_restore(flags); | ||
| 2080 | |||
| 2081 | /* Invoke callbacks. */ | ||
| 2082 | count = count_lazy = 0; | ||
| 2083 | while (list) { | ||
| 2084 | next = list->next; | ||
| 2085 | prefetch(next); | ||
| 2086 | debug_rcu_head_unqueue(list); | ||
| 2087 | if (__rcu_reclaim(rsp->name, list)) | ||
| 2088 | count_lazy++; | ||
| 2089 | list = next; | ||
| 2090 | /* Stop only if limit reached and CPU has something to do. */ | ||
| 2091 | if (++count >= bl && | ||
| 2092 | (need_resched() || | ||
| 2093 | (!is_idle_task(current) && !rcu_is_callbacks_kthread()))) | ||
| 2094 | break; | ||
| 2095 | } | ||
| 2096 | |||
| 2097 | local_irq_save(flags); | ||
| 2098 | trace_rcu_batch_end(rsp->name, count, !!list, need_resched(), | ||
| 2099 | is_idle_task(current), | ||
| 2100 | rcu_is_callbacks_kthread()); | ||
| 2101 | |||
| 2102 | /* Update count, and requeue any remaining callbacks. */ | ||
| 2103 | if (list != NULL) { | ||
| 2104 | *tail = rdp->nxtlist; | ||
| 2105 | rdp->nxtlist = list; | ||
| 2106 | for (i = 0; i < RCU_NEXT_SIZE; i++) | ||
| 2107 | if (&rdp->nxtlist == rdp->nxttail[i]) | ||
| 2108 | rdp->nxttail[i] = tail; | ||
| 2109 | else | ||
| 2110 | break; | ||
| 2111 | } | ||
| 2112 | smp_mb(); /* List handling before counting for rcu_barrier(). */ | ||
| 2113 | rdp->qlen_lazy -= count_lazy; | ||
| 2114 | ACCESS_ONCE(rdp->qlen) -= count; | ||
| 2115 | rdp->n_cbs_invoked += count; | ||
| 2116 | |||
| 2117 | /* Reinstate batch limit if we have worked down the excess. */ | ||
| 2118 | if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark) | ||
| 2119 | rdp->blimit = blimit; | ||
| 2120 | |||
| 2121 | /* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */ | ||
| 2122 | if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) { | ||
| 2123 | rdp->qlen_last_fqs_check = 0; | ||
| 2124 | rdp->n_force_qs_snap = rsp->n_force_qs; | ||
| 2125 | } else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark) | ||
| 2126 | rdp->qlen_last_fqs_check = rdp->qlen; | ||
| 2127 | WARN_ON_ONCE((rdp->nxtlist == NULL) != (rdp->qlen == 0)); | ||
| 2128 | |||
| 2129 | local_irq_restore(flags); | ||
| 2130 | |||
| 2131 | /* Re-invoke RCU core processing if there are callbacks remaining. */ | ||
| 2132 | if (cpu_has_callbacks_ready_to_invoke(rdp)) | ||
| 2133 | invoke_rcu_core(); | ||
| 2134 | } | ||
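rcu_do_batch() detaches only the ready sublist and stops after ->blimit invocations so that a long backlog cannot monopolize the CPU; whatever is left gets requeued. A minimal user-space sketch of that throttled loop, with hypothetical types rather than the kernel's:

#include <stddef.h>

struct batch_cb {
	struct batch_cb *next;
	void (*func)(struct batch_cb *);
};

/* Invoke at most @blimit callbacks from @list; return the unprocessed rest. */
static struct batch_cb *invoke_batch(struct batch_cb *list, long blimit)
{
	long count = 0;

	while (list != NULL) {
		struct batch_cb *next = list->next;	/* func() may free list */

		list->func(list);
		list = next;
		if (++count >= blimit)
			break;		/* requeue the remainder for later */
	}
	return list;
}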
| 2135 | |||
| 2136 | /* | ||
| 2137 | * Check to see if this CPU is in a non-context-switch quiescent state | ||
| 2138 | * (user mode or idle loop for rcu, non-softirq execution for rcu_bh). | ||
| 2139 | * Also schedule RCU core processing. | ||
| 2140 | * | ||
| 2141 | * This function must be called from hardirq context. It is normally | ||
| 2142 | * invoked from the scheduling-clock interrupt. If rcu_pending returns | ||
| 2143 | * false, there is no point in invoking rcu_check_callbacks(). | ||
| 2144 | */ | ||
| 2145 | void rcu_check_callbacks(int cpu, int user) | ||
| 2146 | { | ||
| 2147 | trace_rcu_utilization(TPS("Start scheduler-tick")); | ||
| 2148 | increment_cpu_stall_ticks(); | ||
| 2149 | if (user || rcu_is_cpu_rrupt_from_idle()) { | ||
| 2150 | |||
| 2151 | /* | ||
| 2152 | * Get here if this CPU took its interrupt from user | ||
| 2153 | * mode or from the idle loop, and if this is not a | ||
| 2154 | * nested interrupt. In this case, the CPU is in | ||
| 2155 | * a quiescent state, so note it. | ||
| 2156 | * | ||
| 2157 | * No memory barrier is required here because both | ||
| 2158 | * rcu_sched_qs() and rcu_bh_qs() reference only CPU-local | ||
| 2159 | * variables that other CPUs neither access nor modify, | ||
| 2160 | * at least not while the corresponding CPU is online. | ||
| 2161 | */ | ||
| 2162 | |||
| 2163 | rcu_sched_qs(cpu); | ||
| 2164 | rcu_bh_qs(cpu); | ||
| 2165 | |||
| 2166 | } else if (!in_softirq()) { | ||
| 2167 | |||
| 2168 | /* | ||
| 2169 | * Get here if this CPU did not take its interrupt from | ||
| 2170 | * softirq, in other words, if it is not interrupting | ||
| 2171 | * an rcu_bh read-side critical section. This is therefore | ||
| 2172 | * an rcu_bh quiescent state, so note it. | ||
| 2173 | */ | ||
| 2174 | |||
| 2175 | rcu_bh_qs(cpu); | ||
| 2176 | } | ||
| 2177 | rcu_preempt_check_callbacks(cpu); | ||
| 2178 | if (rcu_pending(cpu)) | ||
| 2179 | invoke_rcu_core(); | ||
| 2180 | trace_rcu_utilization(TPS("End scheduler-tick")); | ||
| 2181 | } | ||
| 2182 | |||
| 2183 | /* | ||
| 2184 | * Scan the leaf rcu_node structures, processing dyntick state for any that | ||
| 2185 | * have not yet encountered a quiescent state, using the function specified. | ||
| 2186 | * Also initiate boosting for any threads blocked on the root rcu_node. | ||
| 2187 | * | ||
| 2188 | * The caller must have suppressed start of new grace periods. | ||
| 2189 | */ | ||
| 2190 | static void force_qs_rnp(struct rcu_state *rsp, | ||
| 2191 | int (*f)(struct rcu_data *rsp, bool *isidle, | ||
| 2192 | unsigned long *maxj), | ||
| 2193 | bool *isidle, unsigned long *maxj) | ||
| 2194 | { | ||
| 2195 | unsigned long bit; | ||
| 2196 | int cpu; | ||
| 2197 | unsigned long flags; | ||
| 2198 | unsigned long mask; | ||
| 2199 | struct rcu_node *rnp; | ||
| 2200 | |||
| 2201 | rcu_for_each_leaf_node(rsp, rnp) { | ||
| 2202 | cond_resched(); | ||
| 2203 | mask = 0; | ||
| 2204 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
| 2205 | if (!rcu_gp_in_progress(rsp)) { | ||
| 2206 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 2207 | return; | ||
| 2208 | } | ||
| 2209 | if (rnp->qsmask == 0) { | ||
| 2210 | rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ | ||
| 2211 | continue; | ||
| 2212 | } | ||
| 2213 | cpu = rnp->grplo; | ||
| 2214 | bit = 1; | ||
| 2215 | for (; cpu <= rnp->grphi; cpu++, bit <<= 1) { | ||
| 2216 | if ((rnp->qsmask & bit) != 0) { | ||
| 2217 | if ((rnp->qsmaskinit & bit) != 0) | ||
| 2218 | *isidle = 0; | ||
| 2219 | if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj)) | ||
| 2220 | mask |= bit; | ||
| 2221 | } | ||
| 2222 | } | ||
| 2223 | if (mask != 0) { | ||
| 2224 | |||
| 2225 | /* rcu_report_qs_rnp() releases rnp->lock. */ | ||
| 2226 | rcu_report_qs_rnp(mask, rsp, rnp, flags); | ||
| 2227 | continue; | ||
| 2228 | } | ||
| 2229 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 2230 | } | ||
| 2231 | rnp = rcu_get_root(rsp); | ||
| 2232 | if (rnp->qsmask == 0) { | ||
| 2233 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
| 2234 | rcu_initiate_boost(rnp, flags); /* releases rnp->lock. */ | ||
| 2235 | } | ||
| 2236 | } | ||
| 2237 | |||
| 2238 | /* | ||
| 2239 | * Force quiescent states on reluctant CPUs, and also detect which | ||
| 2240 | * CPUs are in dyntick-idle mode. | ||
| 2241 | */ | ||
| 2242 | static void force_quiescent_state(struct rcu_state *rsp) | ||
| 2243 | { | ||
| 2244 | unsigned long flags; | ||
| 2245 | bool ret; | ||
| 2246 | struct rcu_node *rnp; | ||
| 2247 | struct rcu_node *rnp_old = NULL; | ||
| 2248 | |||
| 2249 | /* Funnel through hierarchy to reduce memory contention. */ | ||
| 2250 | rnp = per_cpu_ptr(rsp->rda, raw_smp_processor_id())->mynode; | ||
| 2251 | for (; rnp != NULL; rnp = rnp->parent) { | ||
| 2252 | ret = (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) || | ||
| 2253 | !raw_spin_trylock(&rnp->fqslock); | ||
| 2254 | if (rnp_old != NULL) | ||
| 2255 | raw_spin_unlock(&rnp_old->fqslock); | ||
| 2256 | if (ret) { | ||
| 2257 | rsp->n_force_qs_lh++; | ||
| 2258 | return; | ||
| 2259 | } | ||
| 2260 | rnp_old = rnp; | ||
| 2261 | } | ||
| 2262 | /* rnp_old == rcu_get_root(rsp), rnp == NULL. */ | ||
| 2263 | |||
| 2264 | /* Reached the root of the rcu_node tree, acquire lock. */ | ||
| 2265 | raw_spin_lock_irqsave(&rnp_old->lock, flags); | ||
| 2266 | raw_spin_unlock(&rnp_old->fqslock); | ||
| 2267 | if (ACCESS_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { | ||
| 2268 | rsp->n_force_qs_lh++; | ||
| 2269 | raw_spin_unlock_irqrestore(&rnp_old->lock, flags); | ||
| 2270 | return; /* Someone beat us to it. */ | ||
| 2271 | } | ||
| 2272 | rsp->gp_flags |= RCU_GP_FLAG_FQS; | ||
| 2273 | raw_spin_unlock_irqrestore(&rnp_old->lock, flags); | ||
| 2274 | wake_up(&rsp->gp_wq); /* Memory barrier implied by wake_up() path. */ | ||
| 2275 | } | ||
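force_quiescent_state() funnels through the tree: each caller try-locks its leaf's ->fqslock, then the parent's, dropping the lock one level down, so at most one caller per subtree survives to the root and everyone else backs off cheaply. A rough pthreads sketch of the pattern, with a hypothetical node layout rather than the kernel's rcu_node tree:

#include <pthread.h>
#include <stdbool.h>
#include <stddef.h>

struct node {
	pthread_mutex_t lock;
	struct node *parent;		/* NULL at the root */
};

/*
 * Climb from @leaf toward the root, holding at most one per-level lock at
 * a time.  Return true if this caller reached (and still holds) the root
 * lock, false if another caller already owns some level on the way up.
 */
static bool funnel_to_root(struct node *leaf)
{
	struct node *held = NULL;
	struct node *np;

	for (np = leaf; np != NULL; np = np->parent) {
		bool lost = pthread_mutex_trylock(&np->lock) != 0;

		if (held != NULL)
			pthread_mutex_unlock(&held->lock);
		if (lost)
			return false;	/* back off; someone else will do it */
		held = np;
	}
	return true;			/* caller now holds the root's lock */
}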
| 2276 | |||
| 2277 | /* | ||
| 2278 | * This does the RCU core processing work for the specified rcu_state | ||
| 2279 | * and rcu_data structures. This may be called only from the CPU to | ||
| 2280 | * whom the rdp belongs. | ||
| 2281 | */ | ||
| 2282 | static void | ||
| 2283 | __rcu_process_callbacks(struct rcu_state *rsp) | ||
| 2284 | { | ||
| 2285 | unsigned long flags; | ||
| 2286 | struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); | ||
| 2287 | |||
| 2288 | WARN_ON_ONCE(rdp->beenonline == 0); | ||
| 2289 | |||
| 2290 | /* Update RCU state based on any recent quiescent states. */ | ||
| 2291 | rcu_check_quiescent_state(rsp, rdp); | ||
| 2292 | |||
| 2293 | /* Does this CPU require a not-yet-started grace period? */ | ||
| 2294 | local_irq_save(flags); | ||
| 2295 | if (cpu_needs_another_gp(rsp, rdp)) { | ||
| 2296 | raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */ | ||
| 2297 | rcu_start_gp(rsp); | ||
| 2298 | raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); | ||
| 2299 | } else { | ||
| 2300 | local_irq_restore(flags); | ||
| 2301 | } | ||
| 2302 | |||
| 2303 | /* If there are callbacks ready, invoke them. */ | ||
| 2304 | if (cpu_has_callbacks_ready_to_invoke(rdp)) | ||
| 2305 | invoke_rcu_callbacks(rsp, rdp); | ||
| 2306 | } | ||
| 2307 | |||
| 2308 | /* | ||
| 2309 | * Do RCU core processing for the current CPU. | ||
| 2310 | */ | ||
| 2311 | static void rcu_process_callbacks(struct softirq_action *unused) | ||
| 2312 | { | ||
| 2313 | struct rcu_state *rsp; | ||
| 2314 | |||
| 2315 | if (cpu_is_offline(smp_processor_id())) | ||
| 2316 | return; | ||
| 2317 | trace_rcu_utilization(TPS("Start RCU core")); | ||
| 2318 | for_each_rcu_flavor(rsp) | ||
| 2319 | __rcu_process_callbacks(rsp); | ||
| 2320 | trace_rcu_utilization(TPS("End RCU core")); | ||
| 2321 | } | ||
| 2322 | |||
| 2323 | /* | ||
| 2324 | * Schedule RCU callback invocation. If the specified type of RCU | ||
| 2325 | * does not support RCU priority boosting, just do a direct call, | ||
| 2326 | * otherwise wake up the per-CPU kernel kthread. Note that because we | ||
| 2327 | * are running on the current CPU with interrupts disabled, the | ||
| 2328 | * rcu_cpu_kthread_task cannot disappear out from under us. | ||
| 2329 | */ | ||
| 2330 | static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) | ||
| 2331 | { | ||
| 2332 | if (unlikely(!ACCESS_ONCE(rcu_scheduler_fully_active))) | ||
| 2333 | return; | ||
| 2334 | if (likely(!rsp->boost)) { | ||
| 2335 | rcu_do_batch(rsp, rdp); | ||
| 2336 | return; | ||
| 2337 | } | ||
| 2338 | invoke_rcu_callbacks_kthread(); | ||
| 2339 | } | ||
| 2340 | |||
| 2341 | static void invoke_rcu_core(void) | ||
| 2342 | { | ||
| 2343 | if (cpu_online(smp_processor_id())) | ||
| 2344 | raise_softirq(RCU_SOFTIRQ); | ||
| 2345 | } | ||
| 2346 | |||
| 2347 | /* | ||
| 2348 | * Handle any core-RCU processing required by a call_rcu() invocation. | ||
| 2349 | */ | ||
| 2350 | static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, | ||
| 2351 | struct rcu_head *head, unsigned long flags) | ||
| 2352 | { | ||
| 2353 | /* | ||
| 2354 | * If called from an extended quiescent state, invoke the RCU | ||
| 2355 | * core in order to force a re-evaluation of RCU's idleness. | ||
| 2356 | */ | ||
| 2357 | if (!rcu_is_watching() && cpu_online(smp_processor_id())) | ||
| 2358 | invoke_rcu_core(); | ||
| 2359 | |||
| 2360 | /* If interrupts were disabled or CPU offline, don't invoke RCU core. */ | ||
| 2361 | if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id())) | ||
| 2362 | return; | ||
| 2363 | |||
| 2364 | /* | ||
| 2365 | * Force the grace period if too many callbacks or too long waiting. | ||
| 2366 | * Enforce hysteresis, and don't invoke force_quiescent_state() | ||
| 2367 | * if some other CPU has recently done so. Also, don't bother | ||
| 2368 | * invoking force_quiescent_state() if the newly enqueued callback | ||
| 2369 | * is the only one waiting for a grace period to complete. | ||
| 2370 | */ | ||
| 2371 | if (unlikely(rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) { | ||
| 2372 | |||
| 2373 | /* Are we ignoring a completed grace period? */ | ||
| 2374 | note_gp_changes(rsp, rdp); | ||
| 2375 | |||
| 2376 | /* Start a new grace period if one not already started. */ | ||
| 2377 | if (!rcu_gp_in_progress(rsp)) { | ||
| 2378 | struct rcu_node *rnp_root = rcu_get_root(rsp); | ||
| 2379 | |||
| 2380 | raw_spin_lock(&rnp_root->lock); | ||
| 2381 | rcu_start_gp(rsp); | ||
| 2382 | raw_spin_unlock(&rnp_root->lock); | ||
| 2383 | } else { | ||
| 2384 | /* Give the grace period a kick. */ | ||
| 2385 | rdp->blimit = LONG_MAX; | ||
| 2386 | if (rsp->n_force_qs == rdp->n_force_qs_snap && | ||
| 2387 | *rdp->nxttail[RCU_DONE_TAIL] != head) | ||
| 2388 | force_quiescent_state(rsp); | ||
| 2389 | rdp->n_force_qs_snap = rsp->n_force_qs; | ||
| 2390 | rdp->qlen_last_fqs_check = rdp->qlen; | ||
| 2391 | } | ||
| 2392 | } | ||
| 2393 | } | ||
| 2394 | |||
| 2395 | /* | ||
| 2396 | * RCU callback function to leak a callback. | ||
| 2397 | */ | ||
| 2398 | static void rcu_leak_callback(struct rcu_head *rhp) | ||
| 2399 | { | ||
| 2400 | } | ||
| 2401 | |||
| 2402 | /* | ||
| 2403 | * Helper function for call_rcu() and friends. The cpu argument will | ||
| 2404 | * normally be -1, indicating "currently running CPU". It may specify | ||
| 2405 | * a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier() | ||
| 2406 | * is expected to specify a CPU. | ||
| 2407 | */ | ||
| 2408 | static void | ||
| 2409 | __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), | ||
| 2410 | struct rcu_state *rsp, int cpu, bool lazy) | ||
| 2411 | { | ||
| 2412 | unsigned long flags; | ||
| 2413 | struct rcu_data *rdp; | ||
| 2414 | |||
| 2415 | WARN_ON_ONCE((unsigned long)head & 0x3); /* Misaligned rcu_head! */ | ||
| 2416 | if (debug_rcu_head_queue(head)) { | ||
| 2417 | /* Probable double call_rcu(), so leak the callback. */ | ||
| 2418 | ACCESS_ONCE(head->func) = rcu_leak_callback; | ||
| 2419 | WARN_ONCE(1, "__call_rcu(): Leaked duplicate callback\n"); | ||
| 2420 | return; | ||
| 2421 | } | ||
| 2422 | head->func = func; | ||
| 2423 | head->next = NULL; | ||
| 2424 | |||
| 2425 | /* | ||
| 2426 | * Opportunistically note grace-period endings and beginnings. | ||
| 2427 | * Note that we might see a beginning right after we see an | ||
| 2428 | * end, but never vice versa, since this CPU has to pass through | ||
| 2429 | * a quiescent state betweentimes. | ||
| 2430 | */ | ||
| 2431 | local_irq_save(flags); | ||
| 2432 | rdp = this_cpu_ptr(rsp->rda); | ||
| 2433 | |||
| 2434 | /* Add the callback to our list. */ | ||
| 2435 | if (unlikely(rdp->nxttail[RCU_NEXT_TAIL] == NULL) || cpu != -1) { | ||
| 2436 | int offline; | ||
| 2437 | |||
| 2438 | if (cpu != -1) | ||
| 2439 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
| 2440 | offline = !__call_rcu_nocb(rdp, head, lazy); | ||
| 2441 | WARN_ON_ONCE(offline); | ||
| 2442 | /* _call_rcu() is illegal on offline CPU; leak the callback. */ | ||
| 2443 | local_irq_restore(flags); | ||
| 2444 | return; | ||
| 2445 | } | ||
| 2446 | ACCESS_ONCE(rdp->qlen)++; | ||
| 2447 | if (lazy) | ||
| 2448 | rdp->qlen_lazy++; | ||
| 2449 | else | ||
| 2450 | rcu_idle_count_callbacks_posted(); | ||
| 2451 | smp_mb(); /* Count before adding callback for rcu_barrier(). */ | ||
| 2452 | *rdp->nxttail[RCU_NEXT_TAIL] = head; | ||
| 2453 | rdp->nxttail[RCU_NEXT_TAIL] = &head->next; | ||
| 2454 | |||
| 2455 | if (__is_kfree_rcu_offset((unsigned long)func)) | ||
| 2456 | trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func, | ||
| 2457 | rdp->qlen_lazy, rdp->qlen); | ||
| 2458 | else | ||
| 2459 | trace_rcu_callback(rsp->name, head, rdp->qlen_lazy, rdp->qlen); | ||
| 2460 | |||
| 2461 | /* Go handle any RCU core processing required. */ | ||
| 2462 | __call_rcu_core(rsp, rdp, head, flags); | ||
| 2463 | local_irq_restore(flags); | ||
| 2464 | } | ||
| 2465 | |||
| 2466 | /* | ||
| 2467 | * Queue an RCU-sched callback for invocation after a grace period. | ||
| 2468 | */ | ||
| 2469 | void call_rcu_sched(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | ||
| 2470 | { | ||
| 2471 | __call_rcu(head, func, &rcu_sched_state, -1, 0); | ||
| 2472 | } | ||
| 2473 | EXPORT_SYMBOL_GPL(call_rcu_sched); | ||
| 2474 | |||
| 2475 | /* | ||
| 2476 | * Queue an RCU callback for invocation after a quicker grace period. | ||
| 2477 | */ | ||
| 2478 | void call_rcu_bh(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | ||
| 2479 | { | ||
| 2480 | __call_rcu(head, func, &rcu_bh_state, -1, 0); | ||
| 2481 | } | ||
| 2482 | EXPORT_SYMBOL_GPL(call_rcu_bh); | ||
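For reference, a typical call_rcu_sched() caller embeds the rcu_head in its own structure and frees that structure from the callback once a grace period has elapsed. The structure and function names below are hypothetical, and the usual kernel headers (<linux/rcupdate.h>, <linux/slab.h>) are assumed:

struct foo {
	int data;
	struct rcu_head rh;
};

static void foo_reclaim(struct rcu_head *rcu)
{
	struct foo *fp = container_of(rcu, struct foo, rh);

	kfree(fp);	/* safe: all pre-existing readers have finished */
}

static void foo_retire(struct foo *fp)
{
	/* Readers may still hold references, so defer the free. */
	call_rcu_sched(&fp->rh, foo_reclaim);
}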
| 2483 | |||
| 2484 | /* | ||
| 2485 | * Because a context switch is a grace period for RCU-sched and RCU-bh, | ||
| 2486 | * any blocking grace-period wait automatically implies a grace period | ||
| 2487 | * if there is only one CPU online at any point in time during execution | ||
| 2488 | * of either synchronize_sched() or synchronize_rcu_bh(). It is OK to | ||
| 2489 | * occasionally incorrectly indicate that there are multiple CPUs online | ||
| 2490 | * when there was in fact only one the whole time, as this just adds | ||
| 2491 | * some overhead: RCU still operates correctly. | ||
| 2492 | */ | ||
| 2493 | static inline int rcu_blocking_is_gp(void) | ||
| 2494 | { | ||
| 2495 | int ret; | ||
| 2496 | |||
| 2497 | might_sleep(); /* Check for RCU read-side critical section. */ | ||
| 2498 | preempt_disable(); | ||
| 2499 | ret = num_online_cpus() <= 1; | ||
| 2500 | preempt_enable(); | ||
| 2501 | return ret; | ||
| 2502 | } | ||
| 2503 | |||
| 2504 | /** | ||
| 2505 | * synchronize_sched - wait until an rcu-sched grace period has elapsed. | ||
| 2506 | * | ||
| 2507 | * Control will return to the caller some time after a full rcu-sched | ||
| 2508 | * grace period has elapsed, in other words after all currently executing | ||
| 2509 | * rcu-sched read-side critical sections have completed. These read-side | ||
| 2510 | * critical sections are delimited by rcu_read_lock_sched() and | ||
| 2511 | * rcu_read_unlock_sched(), and may be nested. Note that preempt_disable(), | ||
| 2512 | * local_irq_disable(), and so on may be used in place of | ||
| 2513 | * rcu_read_lock_sched(). | ||
| 2514 | * | ||
| 2515 | * This means that all preempt_disable code sequences, including NMI and | ||
| 2516 | * non-threaded hardware-interrupt handlers, in progress on entry will | ||
| 2517 | * have completed before this primitive returns. However, this does not | ||
| 2518 | * guarantee that softirq handlers will have completed, since in some | ||
| 2519 | * kernels, these handlers can run in process context, and can block. | ||
| 2520 | * | ||
| 2521 | * Note that this guarantee implies further memory-ordering guarantees. | ||
| 2522 | * On systems with more than one CPU, when synchronize_sched() returns, | ||
| 2523 | * each CPU is guaranteed to have executed a full memory barrier since the | ||
| 2524 | * end of its last RCU-sched read-side critical section whose beginning | ||
| 2525 | * preceded the call to synchronize_sched(). In addition, each CPU having | ||
| 2526 | * an RCU read-side critical section that extends beyond the return from | ||
| 2527 | * synchronize_sched() is guaranteed to have executed a full memory barrier | ||
| 2528 | * after the beginning of synchronize_sched() and before the beginning of | ||
| 2529 | * that RCU read-side critical section. Note that these guarantees include | ||
| 2530 | * CPUs that are offline, idle, or executing in user mode, as well as CPUs | ||
| 2531 | * that are executing in the kernel. | ||
| 2532 | * | ||
| 2533 | * Furthermore, if CPU A invoked synchronize_sched(), which returned | ||
| 2534 | * to its caller on CPU B, then both CPU A and CPU B are guaranteed | ||
| 2535 | * to have executed a full memory barrier during the execution of | ||
| 2536 | * synchronize_sched() -- even if CPU A and CPU B are the same CPU (but | ||
| 2537 | * again only if the system has more than one CPU). | ||
| 2538 | * | ||
| 2539 | * This primitive provides the guarantees made by the (now removed) | ||
| 2540 | * synchronize_kernel() API. In contrast, synchronize_rcu() only | ||
| 2541 | * guarantees that rcu_read_lock() sections will have completed. | ||
| 2542 | * In "classic RCU", these two guarantees happen to be one and | ||
| 2543 | * the same, but can differ in realtime RCU implementations. | ||
| 2544 | */ | ||
| 2545 | void synchronize_sched(void) | ||
| 2546 | { | ||
| 2547 | rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && | ||
| 2548 | !lock_is_held(&rcu_lock_map) && | ||
| 2549 | !lock_is_held(&rcu_sched_lock_map), | ||
| 2550 | "Illegal synchronize_sched() in RCU-sched read-side critical section"); | ||
| 2551 | if (rcu_blocking_is_gp()) | ||
| 2552 | return; | ||
| 2553 | if (rcu_expedited) | ||
| 2554 | synchronize_sched_expedited(); | ||
| 2555 | else | ||
| 2556 | wait_rcu_gp(call_rcu_sched); | ||
| 2557 | } | ||
| 2558 | EXPORT_SYMBOL_GPL(synchronize_sched); | ||
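The blocking form suits updaters that prefer to wait in place: replace (or unpublish) the pointer, call synchronize_sched(), then free the old version. The global pointer, mutex, and function below are hypothetical, with readers assumed to use rcu_read_lock_sched() or preemption disabling:

struct bar {
	int val;
};

static struct bar __rcu *global_bar;
static DEFINE_MUTEX(bar_mutex);		/* serializes updaters */

static void bar_replace(struct bar *newb)
{
	struct bar *oldb;

	mutex_lock(&bar_mutex);
	oldb = rcu_dereference_protected(global_bar,
					 lockdep_is_held(&bar_mutex));
	rcu_assign_pointer(global_bar, newb);	/* publish the new version */
	mutex_unlock(&bar_mutex);

	synchronize_sched();	/* wait out all pre-existing readers */
	kfree(oldb);		/* no reader can still see the old version */
}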
| 2559 | |||
| 2560 | /** | ||
| 2561 | * synchronize_rcu_bh - wait until an rcu_bh grace period has elapsed. | ||
| 2562 | * | ||
| 2563 | * Control will return to the caller some time after a full rcu_bh grace | ||
| 2564 | * period has elapsed, in other words after all currently executing rcu_bh | ||
| 2565 | * read-side critical sections have completed. RCU read-side critical | ||
| 2566 | * sections are delimited by rcu_read_lock_bh() and rcu_read_unlock_bh(), | ||
| 2567 | * and may be nested. | ||
| 2568 | * | ||
| 2569 | * See the description of synchronize_sched() for more detailed information | ||
| 2570 | * on memory ordering guarantees. | ||
| 2571 | */ | ||
| 2572 | void synchronize_rcu_bh(void) | ||
| 2573 | { | ||
| 2574 | rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && | ||
| 2575 | !lock_is_held(&rcu_lock_map) && | ||
| 2576 | !lock_is_held(&rcu_sched_lock_map), | ||
| 2577 | "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section"); | ||
| 2578 | if (rcu_blocking_is_gp()) | ||
| 2579 | return; | ||
| 2580 | if (rcu_expedited) | ||
| 2581 | synchronize_rcu_bh_expedited(); | ||
| 2582 | else | ||
| 2583 | wait_rcu_gp(call_rcu_bh); | ||
| 2584 | } | ||
| 2585 | EXPORT_SYMBOL_GPL(synchronize_rcu_bh); | ||
| 2586 | |||
| 2587 | static int synchronize_sched_expedited_cpu_stop(void *data) | ||
| 2588 | { | ||
| 2589 | /* | ||
| 2590 | * There must be a full memory barrier on each affected CPU | ||
| 2591 | * between the time that try_stop_cpus() is called and the | ||
| 2592 | * time that it returns. | ||
| 2593 | * | ||
| 2594 | * In the current initial implementation of cpu_stop, the | ||
| 2595 | * above condition is already met when control reaches | ||
| 2596 | * this point and the following smp_mb() is not strictly | ||
| 2597 | * necessary. Do smp_mb() anyway for documentation and | ||
| 2598 | * robustness against future implementation changes. | ||
| 2599 | */ | ||
| 2600 | smp_mb(); /* See above comment block. */ | ||
| 2601 | return 0; | ||
| 2602 | } | ||
| 2603 | |||
| 2604 | /** | ||
| 2605 | * synchronize_sched_expedited - Brute-force RCU-sched grace period | ||
| 2606 | * | ||
| 2607 | * Wait for an RCU-sched grace period to elapse, but use a "big hammer" | ||
| 2608 | * approach to force the grace period to end quickly. This consumes | ||
| 2609 | * significant time on all CPUs and is unfriendly to real-time workloads, | ||
| 2610 | * so is thus not recommended for any sort of common-case code. In fact, | ||
| 2611 | * if you are using synchronize_sched_expedited() in a loop, please | ||
| 2612 | * restructure your code to batch your updates, and then use a single | ||
| 2613 | * synchronize_sched() instead. | ||
| 2614 | * | ||
| 2615 | * Note that it is illegal to call this function while holding any lock | ||
| 2616 | * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal | ||
| 2617 | * to call this function from a CPU-hotplug notifier. Failing to observe | ||
| 2618 | * these restrictions will result in deadlock. | ||
| 2619 | * | ||
| 2620 | * This implementation can be thought of as an application of ticket | ||
| 2621 | * locking to RCU, with the ->expedited_start and | ||
| 2622 | * ->expedited_done counters taking on the roles of the halves | ||
| 2623 | * of the ticket-lock word. Each task atomically increments | ||
| 2624 | * ->expedited_start upon entry, snapshotting the old value, | ||
| 2625 | * then attempts to stop all the CPUs. If this succeeds, then each | ||
| 2626 | * CPU will have executed a context switch, resulting in an RCU-sched | ||
| 2627 | * grace period. We are then done, so we use atomic_cmpxchg() to | ||
| 2628 | * update ->expedited_done to match our snapshot -- but | ||
| 2629 | * only if someone else has not already advanced past our snapshot. | ||
| 2630 | * | ||
| 2631 | * On the other hand, if try_stop_cpus() fails, we check the value | ||
| 2632 | * of ->expedited_done. If it has advanced past our | ||
| 2633 | * initial snapshot, then someone else must have forced a grace period | ||
| 2634 | * some time after we took our snapshot. In this case, our work is | ||
| 2635 | * done for us, and we can simply return. Otherwise, we try again, | ||
| 2636 | * but keep our initial snapshot for purposes of checking for someone | ||
| 2637 | * doing our work for us. | ||
| 2638 | * | ||
| 2639 | * If we fail too many times in a row, we fall back to synchronize_sched(). | ||
| 2640 | */ | ||
| 2641 | void synchronize_sched_expedited(void) | ||
| 2642 | { | ||
| 2643 | long firstsnap, s, snap; | ||
| 2644 | int trycount = 0; | ||
| 2645 | struct rcu_state *rsp = &rcu_sched_state; | ||
| 2646 | |||
| 2647 | /* | ||
| 2648 | * If we are in danger of counter wrap, just do synchronize_sched(). | ||
| 2649 | * By allowing ->expedited_start to advance no more than | ||
| 2650 | * ULONG_MAX/8 ahead of ->expedited_done, we are ensuring | ||
| 2651 | * that more than 3.5 billion CPUs would be required to force a | ||
| 2652 | * counter wrap on a 32-bit system. Quite a few more CPUs would of | ||
| 2653 | * course be required on a 64-bit system. | ||
| 2654 | */ | ||
| 2655 | if (ULONG_CMP_GE((ulong)atomic_long_read(&rsp->expedited_start), | ||
| 2656 | (ulong)atomic_long_read(&rsp->expedited_done) + | ||
| 2657 | ULONG_MAX / 8)) { | ||
| 2658 | synchronize_sched(); | ||
| 2659 | atomic_long_inc(&rsp->expedited_wrap); | ||
| 2660 | return; | ||
| 2661 | } | ||
| 2662 | |||
| 2663 | /* | ||
| 2664 | * Take a ticket. Note that atomic_long_inc_return() implies a | ||
| 2665 | * full memory barrier. | ||
| 2666 | */ | ||
| 2667 | snap = atomic_long_inc_return(&rsp->expedited_start); | ||
| 2668 | firstsnap = snap; | ||
| 2669 | get_online_cpus(); | ||
| 2670 | WARN_ON_ONCE(cpu_is_offline(raw_smp_processor_id())); | ||
| 2671 | |||
| 2672 | /* | ||
| 2673 | * Each pass through the following loop attempts to force a | ||
| 2674 | * context switch on each CPU. | ||
| 2675 | */ | ||
| 2676 | while (try_stop_cpus(cpu_online_mask, | ||
| 2677 | synchronize_sched_expedited_cpu_stop, | ||
| 2678 | NULL) == -EAGAIN) { | ||
| 2679 | put_online_cpus(); | ||
| 2680 | atomic_long_inc(&rsp->expedited_tryfail); | ||
| 2681 | |||
| 2682 | /* Check to see if someone else did our work for us. */ | ||
| 2683 | s = atomic_long_read(&rsp->expedited_done); | ||
| 2684 | if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { | ||
| 2685 | /* ensure test happens before caller kfree */ | ||
| 2686 | smp_mb__before_atomic_inc(); /* ^^^ */ | ||
| 2687 | atomic_long_inc(&rsp->expedited_workdone1); | ||
| 2688 | return; | ||
| 2689 | } | ||
| 2690 | |||
| 2691 | /* No joy, try again later. Or just synchronize_sched(). */ | ||
| 2692 | if (trycount++ < 10) { | ||
| 2693 | udelay(trycount * num_online_cpus()); | ||
| 2694 | } else { | ||
| 2695 | wait_rcu_gp(call_rcu_sched); | ||
| 2696 | atomic_long_inc(&rsp->expedited_normal); | ||
| 2697 | return; | ||
| 2698 | } | ||
| 2699 | |||
| 2700 | /* Recheck to see if someone else did our work for us. */ | ||
| 2701 | s = atomic_long_read(&rsp->expedited_done); | ||
| 2702 | if (ULONG_CMP_GE((ulong)s, (ulong)firstsnap)) { | ||
| 2703 | /* ensure test happens before caller kfree */ | ||
| 2704 | smp_mb__before_atomic_inc(); /* ^^^ */ | ||
| 2705 | atomic_long_inc(&rsp->expedited_workdone2); | ||
| 2706 | return; | ||
| 2707 | } | ||
| 2708 | |||
| 2709 | /* | ||
| 2710 | * Refetching ->expedited_start allows later | ||
| 2711 | * callers to piggyback on our grace period. We retry | ||
| 2712 | * after they started, so our grace period works for them, | ||
| 2713 | * and they started after our first try, so their grace | ||
| 2714 | * period works for us. | ||
| 2715 | */ | ||
| 2716 | get_online_cpus(); | ||
| 2717 | snap = atomic_long_read(&rsp->expedited_start); | ||
| 2718 | smp_mb(); /* ensure read is before try_stop_cpus(). */ | ||
| 2719 | } | ||
| 2720 | atomic_long_inc(&rsp->expedited_stoppedcpus); | ||
| 2721 | |||
| 2722 | /* | ||
| 2723 | * Everyone up to our most recent fetch is covered by our grace | ||
| 2724 | * period. Update the counter, but only if our work is still | ||
| 2725 | * relevant -- which it won't be if someone who started later | ||
| 2726 | * than we did already did their update. | ||
| 2727 | */ | ||
| 2728 | do { | ||
| 2729 | atomic_long_inc(&rsp->expedited_done_tries); | ||
| 2730 | s = atomic_long_read(&rsp->expedited_done); | ||
| 2731 | if (ULONG_CMP_GE((ulong)s, (ulong)snap)) { | ||
| 2732 | /* ensure test happens before caller kfree */ | ||
| 2733 | smp_mb__before_atomic_inc(); /* ^^^ */ | ||
| 2734 | atomic_long_inc(&rsp->expedited_done_lost); | ||
| 2735 | break; | ||
| 2736 | } | ||
| 2737 | } while (atomic_long_cmpxchg(&rsp->expedited_done, s, snap) != s); | ||
| 2738 | atomic_long_inc(&rsp->expedited_done_exit); | ||
| 2739 | |||
| 2740 | put_online_cpus(); | ||
| 2741 | } | ||
| 2742 | EXPORT_SYMBOL_GPL(synchronize_sched_expedited); | ||
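The ticket scheme described above can be illustrated with two C11 atomic counters: snapshot the start ticket, skip the heavy work if the done ticket has already passed the snapshot, otherwise do the work and advance the done ticket so later callers can piggyback. This is a simplified analogue (wraparound handling omitted; the kernel uses ULONG_CMP_GE() for that), not the function above:

#include <stdatomic.h>

static atomic_long start_ticket;
static atomic_long done_ticket;

static void heavy_global_operation(void)
{
	/* Stand-in for the expensive try_stop_cpus() phase. */
}

static void expedited(void)
{
	long snap = atomic_fetch_add(&start_ticket, 1) + 1;

	/* Anyone who finished after we took our ticket did our work for us. */
	if (atomic_load(&done_ticket) >= snap)
		return;

	heavy_global_operation();

	/* Advance done_ticket to our snapshot, never moving it backwards. */
	for (;;) {
		long s = atomic_load(&done_ticket);

		if (s >= snap)
			break;	/* a later caller already advanced it */
		if (atomic_compare_exchange_weak(&done_ticket, &s, snap))
			break;
	}
}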
| 2743 | |||
| 2744 | /* | ||
| 2745 | * Check to see if there is any immediate RCU-related work to be done | ||
| 2746 | * by the current CPU, for the specified type of RCU, returning 1 if so. | ||
| 2747 | * The checks are in order of increasing expense: checks that can be | ||
| 2748 | * carried out against CPU-local state are performed first. However, | ||
| 2749 | * we must check for CPU stalls first, else we might not get a chance. | ||
| 2750 | */ | ||
| 2751 | static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) | ||
| 2752 | { | ||
| 2753 | struct rcu_node *rnp = rdp->mynode; | ||
| 2754 | |||
| 2755 | rdp->n_rcu_pending++; | ||
| 2756 | |||
| 2757 | /* Check for CPU stalls, if enabled. */ | ||
| 2758 | check_cpu_stall(rsp, rdp); | ||
| 2759 | |||
| 2760 | /* Is the RCU core waiting for a quiescent state from this CPU? */ | ||
| 2761 | if (rcu_scheduler_fully_active && | ||
| 2762 | rdp->qs_pending && !rdp->passed_quiesce) { | ||
| 2763 | rdp->n_rp_qs_pending++; | ||
| 2764 | } else if (rdp->qs_pending && rdp->passed_quiesce) { | ||
| 2765 | rdp->n_rp_report_qs++; | ||
| 2766 | return 1; | ||
| 2767 | } | ||
| 2768 | |||
| 2769 | /* Does this CPU have callbacks ready to invoke? */ | ||
| 2770 | if (cpu_has_callbacks_ready_to_invoke(rdp)) { | ||
| 2771 | rdp->n_rp_cb_ready++; | ||
| 2772 | return 1; | ||
| 2773 | } | ||
| 2774 | |||
| 2775 | /* Has RCU gone idle with this CPU needing another grace period? */ | ||
| 2776 | if (cpu_needs_another_gp(rsp, rdp)) { | ||
| 2777 | rdp->n_rp_cpu_needs_gp++; | ||
| 2778 | return 1; | ||
| 2779 | } | ||
| 2780 | |||
| 2781 | /* Has another RCU grace period completed? */ | ||
| 2782 | if (ACCESS_ONCE(rnp->completed) != rdp->completed) { /* outside lock */ | ||
| 2783 | rdp->n_rp_gp_completed++; | ||
| 2784 | return 1; | ||
| 2785 | } | ||
| 2786 | |||
| 2787 | /* Has a new RCU grace period started? */ | ||
| 2788 | if (ACCESS_ONCE(rnp->gpnum) != rdp->gpnum) { /* outside lock */ | ||
| 2789 | rdp->n_rp_gp_started++; | ||
| 2790 | return 1; | ||
| 2791 | } | ||
| 2792 | |||
| 2793 | /* nothing to do */ | ||
| 2794 | rdp->n_rp_need_nothing++; | ||
| 2795 | return 0; | ||
| 2796 | } | ||
| 2797 | |||
| 2798 | /* | ||
| 2799 | * Check to see if there is any immediate RCU-related work to be done | ||
| 2800 | * by the current CPU, returning 1 if so. This function is part of the | ||
| 2801 | * RCU implementation; it is -not- an exported member of the RCU API. | ||
| 2802 | */ | ||
| 2803 | static int rcu_pending(int cpu) | ||
| 2804 | { | ||
| 2805 | struct rcu_state *rsp; | ||
| 2806 | |||
| 2807 | for_each_rcu_flavor(rsp) | ||
| 2808 | if (__rcu_pending(rsp, per_cpu_ptr(rsp->rda, cpu))) | ||
| 2809 | return 1; | ||
| 2810 | return 0; | ||
| 2811 | } | ||
| 2812 | |||
| 2813 | /* | ||
| 2814 | * Return true if the specified CPU has any callback. If all_lazy is | ||
| 2815 | * non-NULL, store an indication of whether all callbacks are lazy. | ||
| 2816 | * (If there are no callbacks, all of them are deemed to be lazy.) | ||
| 2817 | */ | ||
| 2818 | static int rcu_cpu_has_callbacks(int cpu, bool *all_lazy) | ||
| 2819 | { | ||
| 2820 | bool al = true; | ||
| 2821 | bool hc = false; | ||
| 2822 | struct rcu_data *rdp; | ||
| 2823 | struct rcu_state *rsp; | ||
| 2824 | |||
| 2825 | for_each_rcu_flavor(rsp) { | ||
| 2826 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
| 2827 | if (!rdp->nxtlist) | ||
| 2828 | continue; | ||
| 2829 | hc = true; | ||
| 2830 | if (rdp->qlen != rdp->qlen_lazy || !all_lazy) { | ||
| 2831 | al = false; | ||
| 2832 | break; | ||
| 2833 | } | ||
| 2834 | } | ||
| 2835 | if (all_lazy) | ||
| 2836 | *all_lazy = al; | ||
| 2837 | return hc; | ||
| 2838 | } | ||
| 2839 | |||
| 2840 | /* | ||
| 2841 | * Helper function for _rcu_barrier() tracing. If tracing is disabled, | ||
| 2842 | * the compiler is expected to optimize this away. | ||
| 2843 | */ | ||
| 2844 | static void _rcu_barrier_trace(struct rcu_state *rsp, const char *s, | ||
| 2845 | int cpu, unsigned long done) | ||
| 2846 | { | ||
| 2847 | trace_rcu_barrier(rsp->name, s, cpu, | ||
| 2848 | atomic_read(&rsp->barrier_cpu_count), done); | ||
| 2849 | } | ||
| 2850 | |||
| 2851 | /* | ||
| 2852 | * RCU callback function for _rcu_barrier(). If we are last, wake | ||
| 2853 | * up the task executing _rcu_barrier(). | ||
| 2854 | */ | ||
| 2855 | static void rcu_barrier_callback(struct rcu_head *rhp) | ||
| 2856 | { | ||
| 2857 | struct rcu_data *rdp = container_of(rhp, struct rcu_data, barrier_head); | ||
| 2858 | struct rcu_state *rsp = rdp->rsp; | ||
| 2859 | |||
| 2860 | if (atomic_dec_and_test(&rsp->barrier_cpu_count)) { | ||
| 2861 | _rcu_barrier_trace(rsp, "LastCB", -1, rsp->n_barrier_done); | ||
| 2862 | complete(&rsp->barrier_completion); | ||
| 2863 | } else { | ||
| 2864 | _rcu_barrier_trace(rsp, "CB", -1, rsp->n_barrier_done); | ||
| 2865 | } | ||
| 2866 | } | ||
| 2867 | |||
| 2868 | /* | ||
| 2869 | * Called with preemption disabled, and from cross-cpu IRQ context. | ||
| 2870 | */ | ||
| 2871 | static void rcu_barrier_func(void *type) | ||
| 2872 | { | ||
| 2873 | struct rcu_state *rsp = type; | ||
| 2874 | struct rcu_data *rdp = __this_cpu_ptr(rsp->rda); | ||
| 2875 | |||
| 2876 | _rcu_barrier_trace(rsp, "IRQ", -1, rsp->n_barrier_done); | ||
| 2877 | atomic_inc(&rsp->barrier_cpu_count); | ||
| 2878 | rsp->call(&rdp->barrier_head, rcu_barrier_callback); | ||
| 2879 | } | ||
| 2880 | |||
| 2881 | /* | ||
| 2882 | * Orchestrate the specified type of RCU barrier, waiting for all | ||
| 2883 | * RCU callbacks of the specified type to complete. | ||
| 2884 | */ | ||
| 2885 | static void _rcu_barrier(struct rcu_state *rsp) | ||
| 2886 | { | ||
| 2887 | int cpu; | ||
| 2888 | struct rcu_data *rdp; | ||
| 2889 | unsigned long snap = ACCESS_ONCE(rsp->n_barrier_done); | ||
| 2890 | unsigned long snap_done; | ||
| 2891 | |||
| 2892 | _rcu_barrier_trace(rsp, "Begin", -1, snap); | ||
| 2893 | |||
| 2894 | /* Take mutex to serialize concurrent rcu_barrier() requests. */ | ||
| 2895 | mutex_lock(&rsp->barrier_mutex); | ||
| 2896 | |||
| 2897 | /* | ||
| 2898 | * Ensure that all prior references, including to ->n_barrier_done, | ||
| 2899 | * are ordered before the _rcu_barrier() machinery. | ||
| 2900 | */ | ||
| 2901 | smp_mb(); /* See above block comment. */ | ||
| 2902 | |||
| 2903 | /* | ||
| 2904 | * Recheck ->n_barrier_done to see if others did our work for us. | ||
| 2905 | * This means checking ->n_barrier_done for an even-to-odd-to-even | ||
| 2906 | * transition. The "if" expression below therefore rounds the old | ||
| 2907 | * value up to the next even number and adds two before comparing. | ||
| 2908 | */ | ||
| 2909 | snap_done = rsp->n_barrier_done; | ||
| 2910 | _rcu_barrier_trace(rsp, "Check", -1, snap_done); | ||
| 2911 | |||
| 2912 | /* | ||
| 2913 | * If the value in snap is odd, we needed to wait for the current | ||
| 2914 | * rcu_barrier() to complete, then wait for the next one, in other | ||
| 2915 | * words, we need the value of snap_done to be three larger than | ||
| 2916 | * the value of snap. On the other hand, if the value in snap is | ||
| 2917 | * even, we only had to wait for the next rcu_barrier() to complete, | ||
| 2918 | * in other words, we need the value of snap_done to be only two | ||
| 2919 | * greater than the value of snap. The "(snap + 3) & ~0x1" computes | ||
| 2920 | * this for us (thank you, Linus!). | ||
| 2921 | */ | ||
| 2922 | if (ULONG_CMP_GE(snap_done, (snap + 3) & ~0x1)) { | ||
| 2923 | _rcu_barrier_trace(rsp, "EarlyExit", -1, snap_done); | ||
| 2924 | smp_mb(); /* caller's subsequent code after above check. */ | ||
| 2925 | mutex_unlock(&rsp->barrier_mutex); | ||
| 2926 | return; | ||
| 2927 | } | ||
| 2928 | |||
| 2929 | /* | ||
| 2930 | * Increment ->n_barrier_done to avoid duplicate work. Use | ||
| 2931 | * ACCESS_ONCE() to prevent the compiler from speculating | ||
| 2932 | * the increment to precede the early-exit check. | ||
| 2933 | */ | ||
| 2934 | ACCESS_ONCE(rsp->n_barrier_done)++; | ||
| 2935 | WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 1); | ||
| 2936 | _rcu_barrier_trace(rsp, "Inc1", -1, rsp->n_barrier_done); | ||
| 2937 | smp_mb(); /* Order ->n_barrier_done increment with below mechanism. */ | ||
| 2938 | |||
| 2939 | /* | ||
| 2940 | * Initialize the count to one rather than to zero in order to | ||
| 2941 | * avoid a too-soon return to zero in case of a short grace period | ||
| 2942 | * (or preemption of this task). Exclude CPU-hotplug operations | ||
| 2943 | * to ensure that no offline CPU has callbacks queued. | ||
| 2944 | */ | ||
| 2945 | init_completion(&rsp->barrier_completion); | ||
| 2946 | atomic_set(&rsp->barrier_cpu_count, 1); | ||
| 2947 | get_online_cpus(); | ||
| 2948 | |||
| 2949 | /* | ||
| 2950 | * Force each CPU with callbacks to register a new callback. | ||
| 2951 | * When that callback is invoked, we will know that all of the | ||
| 2952 | * corresponding CPU's preceding callbacks have been invoked. | ||
| 2953 | */ | ||
| 2954 | for_each_possible_cpu(cpu) { | ||
| 2955 | if (!cpu_online(cpu) && !rcu_is_nocb_cpu(cpu)) | ||
| 2956 | continue; | ||
| 2957 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
| 2958 | if (rcu_is_nocb_cpu(cpu)) { | ||
| 2959 | _rcu_barrier_trace(rsp, "OnlineNoCB", cpu, | ||
| 2960 | rsp->n_barrier_done); | ||
| 2961 | atomic_inc(&rsp->barrier_cpu_count); | ||
| 2962 | __call_rcu(&rdp->barrier_head, rcu_barrier_callback, | ||
| 2963 | rsp, cpu, 0); | ||
| 2964 | } else if (ACCESS_ONCE(rdp->qlen)) { | ||
| 2965 | _rcu_barrier_trace(rsp, "OnlineQ", cpu, | ||
| 2966 | rsp->n_barrier_done); | ||
| 2967 | smp_call_function_single(cpu, rcu_barrier_func, rsp, 1); | ||
| 2968 | } else { | ||
| 2969 | _rcu_barrier_trace(rsp, "OnlineNQ", cpu, | ||
| 2970 | rsp->n_barrier_done); | ||
| 2971 | } | ||
| 2972 | } | ||
| 2973 | put_online_cpus(); | ||
| 2974 | |||
| 2975 | /* | ||
| 2976 | * Now that we have an rcu_barrier_callback() callback on each | ||
| 2977 | * CPU, and thus each counted, remove the initial count. | ||
| 2978 | */ | ||
| 2979 | if (atomic_dec_and_test(&rsp->barrier_cpu_count)) | ||
| 2980 | complete(&rsp->barrier_completion); | ||
| 2981 | |||
| 2982 | /* Increment ->n_barrier_done to prevent duplicate work. */ | ||
| 2983 | smp_mb(); /* Keep increment after above mechanism. */ | ||
| 2984 | ACCESS_ONCE(rsp->n_barrier_done)++; | ||
| 2985 | WARN_ON_ONCE((rsp->n_barrier_done & 0x1) != 0); | ||
| 2986 | _rcu_barrier_trace(rsp, "Inc2", -1, rsp->n_barrier_done); | ||
| 2987 | smp_mb(); /* Keep increment before caller's subsequent code. */ | ||
| 2988 | |||
| 2989 | /* Wait for all rcu_barrier_callback() callbacks to be invoked. */ | ||
| 2990 | wait_for_completion(&rsp->barrier_completion); | ||
| 2991 | |||
| 2992 | /* Other rcu_barrier() invocations can now safely proceed. */ | ||
| 2993 | mutex_unlock(&rsp->barrier_mutex); | ||
| 2994 | } | ||
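The early-exit test on ->n_barrier_done is easiest to see with concrete numbers: an even snapshot needs only the next barrier to finish (snap + 2), while an odd snapshot needs the in-flight barrier plus one complete later barrier (snap + 3), and "(snap + 3) & ~0x1" produces both thresholds. A tiny illustrative check:

#include <assert.h>

/* Smallest ->n_barrier_done value letting a caller with @snap exit early. */
static unsigned long barrier_exit_threshold(unsigned long snap)
{
	return (snap + 3) & ~0x1UL;
}

int main(void)
{
	/* snap even (4): no barrier in flight; the next one ending at 6 suffices. */
	assert(barrier_exit_threshold(4) == 6);
	/* snap odd (5): barrier in flight; need it (6) plus a later one (7, 8). */
	assert(barrier_exit_threshold(5) == 8);
	return 0;
}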
| 2995 | |||
| 2996 | /** | ||
| 2997 | * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete. | ||
| 2998 | */ | ||
| 2999 | void rcu_barrier_bh(void) | ||
| 3000 | { | ||
| 3001 | _rcu_barrier(&rcu_bh_state); | ||
| 3002 | } | ||
| 3003 | EXPORT_SYMBOL_GPL(rcu_barrier_bh); | ||
| 3004 | |||
| 3005 | /** | ||
| 3006 | * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks. | ||
| 3007 | */ | ||
| 3008 | void rcu_barrier_sched(void) | ||
| 3009 | { | ||
| 3010 | _rcu_barrier(&rcu_sched_state); | ||
| 3011 | } | ||
| 3012 | EXPORT_SYMBOL_GPL(rcu_barrier_sched); | ||
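A common use of rcu_barrier_sched() is module teardown: once the module has stopped posting callbacks, the barrier guarantees that every already-posted callback has run before the module's code and data go away. A hypothetical exit-function skeleton (example_unregister_everything() is made up):

static void __exit example_module_exit(void)
{
	/* Stop queueing new call_rcu_sched() callbacks first ... */
	example_unregister_everything();

	/* ... then wait for all previously queued callbacks to be invoked. */
	rcu_barrier_sched();

	/* No callback can still run our code or touch our data. */
}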
| 3013 | |||
| 3014 | /* | ||
| 3015 | * Do boot-time initialization of a CPU's per-CPU RCU data. | ||
| 3016 | */ | ||
| 3017 | static void __init | ||
| 3018 | rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) | ||
| 3019 | { | ||
| 3020 | unsigned long flags; | ||
| 3021 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | ||
| 3022 | struct rcu_node *rnp = rcu_get_root(rsp); | ||
| 3023 | |||
| 3024 | /* Set up local state, ensuring consistent view of global state. */ | ||
| 3025 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
| 3026 | rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); | ||
| 3027 | init_callback_list(rdp); | ||
| 3028 | rdp->qlen_lazy = 0; | ||
| 3029 | ACCESS_ONCE(rdp->qlen) = 0; | ||
| 3030 | rdp->dynticks = &per_cpu(rcu_dynticks, cpu); | ||
| 3031 | WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); | ||
| 3032 | WARN_ON_ONCE(atomic_read(&rdp->dynticks->dynticks) != 1); | ||
| 3033 | rdp->cpu = cpu; | ||
| 3034 | rdp->rsp = rsp; | ||
| 3035 | rcu_boot_init_nocb_percpu_data(rdp); | ||
| 3036 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 3037 | } | ||
| 3038 | |||
| 3039 | /* | ||
| 3040 | * Initialize a CPU's per-CPU RCU data. Note that only one online or | ||
| 3041 | * offline event can be happening at a given time. Note also that we | ||
| 3042 | * can accept some slop in the rsp->completed access due to the fact | ||
| 3043 | * that this CPU cannot possibly have any RCU callbacks in flight yet. | ||
| 3044 | */ | ||
| 3045 | static void | ||
| 3046 | rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) | ||
| 3047 | { | ||
| 3048 | unsigned long flags; | ||
| 3049 | unsigned long mask; | ||
| 3050 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | ||
| 3051 | struct rcu_node *rnp = rcu_get_root(rsp); | ||
| 3052 | |||
| 3053 | /* Exclude new grace periods. */ | ||
| 3054 | mutex_lock(&rsp->onoff_mutex); | ||
| 3055 | |||
| 3056 | /* Set up local state, ensuring consistent view of global state. */ | ||
| 3057 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
| 3058 | rdp->beenonline = 1; /* We have now been online. */ | ||
| 3059 | rdp->preemptible = preemptible; | ||
| 3060 | rdp->qlen_last_fqs_check = 0; | ||
| 3061 | rdp->n_force_qs_snap = rsp->n_force_qs; | ||
| 3062 | rdp->blimit = blimit; | ||
| 3063 | init_callback_list(rdp); /* Re-enable callbacks on this CPU. */ | ||
| 3064 | rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE; | ||
| 3065 | rcu_sysidle_init_percpu_data(rdp->dynticks); | ||
| 3066 | atomic_set(&rdp->dynticks->dynticks, | ||
| 3067 | (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); | ||
| 3068 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | ||
| 3069 | |||
| 3070 | /* Add CPU to rcu_node bitmasks. */ | ||
| 3071 | rnp = rdp->mynode; | ||
| 3072 | mask = rdp->grpmask; | ||
| 3073 | do { | ||
| 3074 | /* Exclude any attempts to start a new GP on small systems. */ | ||
| 3075 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | ||
| 3076 | rnp->qsmaskinit |= mask; | ||
| 3077 | mask = rnp->grpmask; | ||
| 3078 | if (rnp == rdp->mynode) { | ||
| 3079 | /* | ||
| 3080 | * If there is a grace period in progress, we will | ||
| 3081 | * set up to wait for it next time we run the | ||
| 3082 | * RCU core code. | ||
| 3083 | */ | ||
| 3084 | rdp->gpnum = rnp->completed; | ||
| 3085 | rdp->completed = rnp->completed; | ||
| 3086 | rdp->passed_quiesce = 0; | ||
| 3087 | rdp->qs_pending = 0; | ||
| 3088 | trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); | ||
| 3089 | } | ||
| 3090 | raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ | ||
| 3091 | rnp = rnp->parent; | ||
| 3092 | } while (rnp != NULL && !(rnp->qsmaskinit & mask)); | ||
| 3093 | local_irq_restore(flags); | ||
| 3094 | |||
| 3095 | mutex_unlock(&rsp->onoff_mutex); | ||
| 3096 | } | ||
| 3097 | |||
| 3098 | static void rcu_prepare_cpu(int cpu) | ||
| 3099 | { | ||
| 3100 | struct rcu_state *rsp; | ||
| 3101 | |||
| 3102 | for_each_rcu_flavor(rsp) | ||
| 3103 | rcu_init_percpu_data(cpu, rsp, | ||
| 3104 | strcmp(rsp->name, "rcu_preempt") == 0); | ||
| 3105 | } | ||
| 3106 | |||
| 3107 | /* | ||
| 3108 | * Handle CPU online/offline notification events. | ||
| 3109 | */ | ||
| 3110 | static int rcu_cpu_notify(struct notifier_block *self, | ||
| 3111 | unsigned long action, void *hcpu) | ||
| 3112 | { | ||
| 3113 | long cpu = (long)hcpu; | ||
| 3114 | struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); | ||
| 3115 | struct rcu_node *rnp = rdp->mynode; | ||
| 3116 | struct rcu_state *rsp; | ||
| 3117 | |||
| 3118 | trace_rcu_utilization(TPS("Start CPU hotplug")); | ||
| 3119 | switch (action) { | ||
| 3120 | case CPU_UP_PREPARE: | ||
| 3121 | case CPU_UP_PREPARE_FROZEN: | ||
| 3122 | rcu_prepare_cpu(cpu); | ||
| 3123 | rcu_prepare_kthreads(cpu); | ||
| 3124 | break; | ||
| 3125 | case CPU_ONLINE: | ||
| 3126 | case CPU_DOWN_FAILED: | ||
| 3127 | rcu_boost_kthread_setaffinity(rnp, -1); | ||
| 3128 | break; | ||
| 3129 | case CPU_DOWN_PREPARE: | ||
| 3130 | rcu_boost_kthread_setaffinity(rnp, cpu); | ||
| 3131 | break; | ||
| 3132 | case CPU_DYING: | ||
| 3133 | case CPU_DYING_FROZEN: | ||
| 3134 | for_each_rcu_flavor(rsp) | ||
| 3135 | rcu_cleanup_dying_cpu(rsp); | ||
| 3136 | break; | ||
| 3137 | case CPU_DEAD: | ||
| 3138 | case CPU_DEAD_FROZEN: | ||
| 3139 | case CPU_UP_CANCELED: | ||
| 3140 | case CPU_UP_CANCELED_FROZEN: | ||
| 3141 | for_each_rcu_flavor(rsp) | ||
| 3142 | rcu_cleanup_dead_cpu(cpu, rsp); | ||
| 3143 | break; | ||
| 3144 | default: | ||
| 3145 | break; | ||
| 3146 | } | ||
| 3147 | trace_rcu_utilization(TPS("End CPU hotplug")); | ||
| 3148 | return NOTIFY_OK; | ||
| 3149 | } | ||
| 3150 | |||
| 3151 | static int rcu_pm_notify(struct notifier_block *self, | ||
| 3152 | unsigned long action, void *hcpu) | ||
| 3153 | { | ||
| 3154 | switch (action) { | ||
| 3155 | case PM_HIBERNATION_PREPARE: | ||
| 3156 | case PM_SUSPEND_PREPARE: | ||
| 3157 | if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */ | ||
| 3158 | rcu_expedited = 1; | ||
| 3159 | break; | ||
| 3160 | case PM_POST_HIBERNATION: | ||
| 3161 | case PM_POST_SUSPEND: | ||
| 3162 | rcu_expedited = 0; | ||
| 3163 | break; | ||
| 3164 | default: | ||
| 3165 | break; | ||
| 3166 | } | ||
| 3167 | return NOTIFY_OK; | ||
| 3168 | } | ||
| 3169 | |||
| 3170 | /* | ||
| 3171 | * Spawn the kthread that handles this RCU flavor's grace periods. | ||
| 3172 | */ | ||
| 3173 | static int __init rcu_spawn_gp_kthread(void) | ||
| 3174 | { | ||
| 3175 | unsigned long flags; | ||
| 3176 | struct rcu_node *rnp; | ||
| 3177 | struct rcu_state *rsp; | ||
| 3178 | struct task_struct *t; | ||
| 3179 | |||
| 3180 | for_each_rcu_flavor(rsp) { | ||
| 3181 | t = kthread_run(rcu_gp_kthread, rsp, "%s", rsp->name); | ||
| 3182 | BUG_ON(IS_ERR(t)); | ||
| 3183 | rnp = rcu_get_root(rsp); | ||
| 3184 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
| 3185 | rsp->gp_kthread = t; | ||
| 3186 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 3187 | rcu_spawn_nocb_kthreads(rsp); | ||
| 3188 | } | ||
| 3189 | return 0; | ||
| 3190 | } | ||
| 3191 | early_initcall(rcu_spawn_gp_kthread); | ||
| 3192 | |||
| 3193 | /* | ||
| 3194 | * This function is invoked towards the end of the scheduler's initialization | ||
| 3195 | * process. Before this is called, the idle task might contain | ||
| 3196 | * RCU read-side critical sections (during which time, this idle | ||
| 3197 | * task is booting the system). After this function is called, the | ||
| 3198 | * idle tasks are prohibited from containing RCU read-side critical | ||
| 3199 | * sections. This function also enables RCU lockdep checking. | ||
| 3200 | */ | ||
| 3201 | void rcu_scheduler_starting(void) | ||
| 3202 | { | ||
| 3203 | WARN_ON(num_online_cpus() != 1); | ||
| 3204 | WARN_ON(nr_context_switches() > 0); | ||
| 3205 | rcu_scheduler_active = 1; | ||
| 3206 | } | ||
| 3207 | |||
| 3208 | /* | ||
| 3209 | * Compute the per-level fanout, either using the exact fanout specified | ||
| 3210 | * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT. | ||
| 3211 | */ | ||
| 3212 | #ifdef CONFIG_RCU_FANOUT_EXACT | ||
| 3213 | static void __init rcu_init_levelspread(struct rcu_state *rsp) | ||
| 3214 | { | ||
| 3215 | int i; | ||
| 3216 | |||
| 3217 | for (i = rcu_num_lvls - 1; i > 0; i--) | ||
| 3218 | rsp->levelspread[i] = CONFIG_RCU_FANOUT; | ||
| 3219 | rsp->levelspread[0] = rcu_fanout_leaf; | ||
| 3220 | } | ||
| 3221 | #else /* #ifdef CONFIG_RCU_FANOUT_EXACT */ | ||
| 3222 | static void __init rcu_init_levelspread(struct rcu_state *rsp) | ||
| 3223 | { | ||
| 3224 | int ccur; | ||
| 3225 | int cprv; | ||
| 3226 | int i; | ||
| 3227 | |||
| 3228 | cprv = nr_cpu_ids; | ||
| 3229 | for (i = rcu_num_lvls - 1; i >= 0; i--) { | ||
| 3230 | ccur = rsp->levelcnt[i]; | ||
| 3231 | rsp->levelspread[i] = (cprv + ccur - 1) / ccur; | ||
| 3232 | cprv = ccur; | ||
| 3233 | } | ||
| 3234 | } | ||
| 3235 | #endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */ | ||
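When CONFIG_RCU_FANOUT_EXACT is not set, the variant above balances the tree by repeated ceiling division, so each leaf ends up with roughly the same number of CPUs instead of most leaves being packed to the configured fanout. A minimal userspace sketch of that arithmetic, under assumed values (20 possible CPUs, a 1-root/2-leaf tree); this is illustration, not kernel code:

#include <stdio.h>

int main(void)
{
    int nr_cpu_ids = 20;          /* assumed CPU count */
    int levelcnt[] = { 1, 2 };    /* nodes per level, root first */
    int nlvls = 2;
    int levelspread[2];
    int cprv = nr_cpu_ids, ccur, i;

    /* Same bottom-up ceiling division as the loop above. */
    for (i = nlvls - 1; i >= 0; i--) {
        ccur = levelcnt[i];
        levelspread[i] = (cprv + ccur - 1) / ccur;
        cprv = ccur;
    }
    for (i = 0; i < nlvls; i++)
        printf("level %d: %d children per node\n", i, levelspread[i]);
    return 0;    /* prints 2 for the root, 10 for each leaf */
}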
| 3236 | |||
| 3237 | /* | ||
| 3238 | * Helper function for rcu_init() that initializes one rcu_state structure. | ||
| 3239 | */ | ||
| 3240 | static void __init rcu_init_one(struct rcu_state *rsp, | ||
| 3241 | struct rcu_data __percpu *rda) | ||
| 3242 | { | ||
| 3243 | static char *buf[] = { "rcu_node_0", | ||
| 3244 | "rcu_node_1", | ||
| 3245 | "rcu_node_2", | ||
| 3246 | "rcu_node_3" }; /* Match MAX_RCU_LVLS */ | ||
| 3247 | static char *fqs[] = { "rcu_node_fqs_0", | ||
| 3248 | "rcu_node_fqs_1", | ||
| 3249 | "rcu_node_fqs_2", | ||
| 3250 | "rcu_node_fqs_3" }; /* Match MAX_RCU_LVLS */ | ||
| 3251 | int cpustride = 1; | ||
| 3252 | int i; | ||
| 3253 | int j; | ||
| 3254 | struct rcu_node *rnp; | ||
| 3255 | |||
| 3256 | BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */ | ||
| 3257 | |||
| 3258 | /* Silence gcc 4.8 warning about array index out of range. */ | ||
| 3259 | if (rcu_num_lvls > RCU_NUM_LVLS) | ||
| 3260 | panic("rcu_init_one: rcu_num_lvls overflow"); | ||
| 3261 | |||
| 3262 | /* Initialize the level-tracking arrays. */ | ||
| 3263 | |||
| 3264 | for (i = 0; i < rcu_num_lvls; i++) | ||
| 3265 | rsp->levelcnt[i] = num_rcu_lvl[i]; | ||
| 3266 | for (i = 1; i < rcu_num_lvls; i++) | ||
| 3267 | rsp->level[i] = rsp->level[i - 1] + rsp->levelcnt[i - 1]; | ||
| 3268 | rcu_init_levelspread(rsp); | ||
| 3269 | |||
| 3270 | /* Initialize the elements themselves, starting from the leaves. */ | ||
| 3271 | |||
| 3272 | for (i = rcu_num_lvls - 1; i >= 0; i--) { | ||
| 3273 | cpustride *= rsp->levelspread[i]; | ||
| 3274 | rnp = rsp->level[i]; | ||
| 3275 | for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { | ||
| 3276 | raw_spin_lock_init(&rnp->lock); | ||
| 3277 | lockdep_set_class_and_name(&rnp->lock, | ||
| 3278 | &rcu_node_class[i], buf[i]); | ||
| 3279 | raw_spin_lock_init(&rnp->fqslock); | ||
| 3280 | lockdep_set_class_and_name(&rnp->fqslock, | ||
| 3281 | &rcu_fqs_class[i], fqs[i]); | ||
| 3282 | rnp->gpnum = rsp->gpnum; | ||
| 3283 | rnp->completed = rsp->completed; | ||
| 3284 | rnp->qsmask = 0; | ||
| 3285 | rnp->qsmaskinit = 0; | ||
| 3286 | rnp->grplo = j * cpustride; | ||
| 3287 | rnp->grphi = (j + 1) * cpustride - 1; | ||
| 3288 | if (rnp->grphi >= NR_CPUS) | ||
| 3289 | rnp->grphi = NR_CPUS - 1; | ||
| 3290 | if (i == 0) { | ||
| 3291 | rnp->grpnum = 0; | ||
| 3292 | rnp->grpmask = 0; | ||
| 3293 | rnp->parent = NULL; | ||
| 3294 | } else { | ||
| 3295 | rnp->grpnum = j % rsp->levelspread[i - 1]; | ||
| 3296 | rnp->grpmask = 1UL << rnp->grpnum; | ||
| 3297 | rnp->parent = rsp->level[i - 1] + | ||
| 3298 | j / rsp->levelspread[i - 1]; | ||
| 3299 | } | ||
| 3300 | rnp->level = i; | ||
| 3301 | INIT_LIST_HEAD(&rnp->blkd_tasks); | ||
| 3302 | rcu_init_one_nocb(rnp); | ||
| 3303 | } | ||
| 3304 | } | ||
| 3305 | |||
| 3306 | rsp->rda = rda; | ||
| 3307 | init_waitqueue_head(&rsp->gp_wq); | ||
| 3308 | init_irq_work(&rsp->wakeup_work, rsp_wakeup); | ||
| 3309 | rnp = rsp->level[rcu_num_lvls - 1]; | ||
| 3310 | for_each_possible_cpu(i) { | ||
| 3311 | while (i > rnp->grphi) | ||
| 3312 | rnp++; | ||
| 3313 | per_cpu_ptr(rsp->rda, i)->mynode = rnp; | ||
| 3314 | rcu_boot_init_percpu_data(i, rsp); | ||
| 3315 | } | ||
| 3316 | list_add(&rsp->flavors, &rcu_struct_flavors); | ||
| 3317 | } | ||
| 3318 | |||
| 3319 | /* | ||
| 3320 | * Compute the rcu_node tree geometry from kernel parameters. This cannot | ||
| 3321 | * replace the definitions in tree.h because those are needed to size | ||
| 3322 | * the ->node array in the rcu_state structure. | ||
| 3323 | */ | ||
| 3324 | static void __init rcu_init_geometry(void) | ||
| 3325 | { | ||
| 3326 | ulong d; | ||
| 3327 | int i; | ||
| 3328 | int j; | ||
| 3329 | int n = nr_cpu_ids; | ||
| 3330 | int rcu_capacity[MAX_RCU_LVLS + 1]; | ||
| 3331 | |||
| 3332 | /* | ||
| 3333 | * Initialize any unspecified boot parameters. | ||
| 3334 | * The default values of jiffies_till_first_fqs and | ||
| 3335 | * jiffies_till_next_fqs are set to the RCU_JIFFIES_TILL_FORCE_QS | ||
| 3336 | * value, which is a function of HZ, plus one for each | ||
| 3337 | * RCU_JIFFIES_FQS_DIV CPUs that might be on the system. | ||
| 3338 | */ | ||
| 3339 | d = RCU_JIFFIES_TILL_FORCE_QS + nr_cpu_ids / RCU_JIFFIES_FQS_DIV; | ||
| 3340 | if (jiffies_till_first_fqs == ULONG_MAX) | ||
| 3341 | jiffies_till_first_fqs = d; | ||
| 3342 | if (jiffies_till_next_fqs == ULONG_MAX) | ||
| 3343 | jiffies_till_next_fqs = d; | ||
| 3344 | |||
| 3345 | /* If the compile-time values are accurate, just leave. */ | ||
| 3346 | if (rcu_fanout_leaf == CONFIG_RCU_FANOUT_LEAF && | ||
| 3347 | nr_cpu_ids == NR_CPUS) | ||
| 3348 | return; | ||
| 3349 | |||
| 3350 | /* | ||
| 3351 | * Compute number of nodes that can be handled by an rcu_node tree | ||
| 3352 | * with the given number of levels. Setting rcu_capacity[0] makes | ||
| 3353 | * some of the arithmetic easier. | ||
| 3354 | */ | ||
| 3355 | rcu_capacity[0] = 1; | ||
| 3356 | rcu_capacity[1] = rcu_fanout_leaf; | ||
| 3357 | for (i = 2; i <= MAX_RCU_LVLS; i++) | ||
| 3358 | rcu_capacity[i] = rcu_capacity[i - 1] * CONFIG_RCU_FANOUT; | ||
| 3359 | |||
| 3360 | /* | ||
| 3361 | * The boot-time rcu_fanout_leaf parameter is only permitted | ||
| 3362 | * to increase the leaf-level fanout, not decrease it. Of course, | ||
| 3363 | * the leaf-level fanout cannot exceed the number of bits in | ||
| 3364 | * the rcu_node masks. Finally, the tree must be able to accommodate | ||
| 3365 | * the configured number of CPUs. Complain and fall back to the | ||
| 3366 | * compile-time values if these limits are exceeded. | ||
| 3367 | */ | ||
| 3368 | if (rcu_fanout_leaf < CONFIG_RCU_FANOUT_LEAF || | ||
| 3369 | rcu_fanout_leaf > sizeof(unsigned long) * 8 || | ||
| 3370 | n > rcu_capacity[MAX_RCU_LVLS]) { | ||
| 3371 | WARN_ON(1); | ||
| 3372 | return; | ||
| 3373 | } | ||
| 3374 | |||
| 3375 | /* Calculate the number of rcu_nodes at each level of the tree. */ | ||
| 3376 | for (i = 1; i <= MAX_RCU_LVLS; i++) | ||
| 3377 | if (n <= rcu_capacity[i]) { | ||
| 3378 | for (j = 0; j <= i; j++) | ||
| 3379 | num_rcu_lvl[j] = | ||
| 3380 | DIV_ROUND_UP(n, rcu_capacity[i - j]); | ||
| 3381 | rcu_num_lvls = i; | ||
| 3382 | for (j = i + 1; j <= MAX_RCU_LVLS; j++) | ||
| 3383 | num_rcu_lvl[j] = 0; | ||
| 3384 | break; | ||
| 3385 | } | ||
| 3386 | |||
| 3387 | /* Calculate the total number of rcu_node structures. */ | ||
| 3388 | rcu_num_nodes = 0; | ||
| 3389 | for (i = 0; i <= MAX_RCU_LVLS; i++) | ||
| 3390 | rcu_num_nodes += num_rcu_lvl[i]; | ||
| 3391 | rcu_num_nodes -= n; | ||
| 3392 | } | ||
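rcu_init_geometry() above sizes the tree bottom-up: per-level capacities, then the smallest number of levels covering nr_cpu_ids, then ceiling-divided counts per level, with the final level counting CPUs rather than rcu_node structures. A standalone sketch of the same arithmetic under assumed values (100 CPUs, leaf fanout 16, interior fanout 64):

#include <stdio.h>

#define DIV_ROUND_UP(n, d)  (((n) + (d) - 1) / (d))
#define MAX_LVLS            4

int main(void)
{
    int n = 100, fanout_leaf = 16, fanout = 64;   /* assumed values */
    int capacity[MAX_LVLS + 1];
    int nodes[MAX_LVLS + 1] = { 0 };
    int i, j, levels = 0, total = 0;

    capacity[0] = 1;
    capacity[1] = fanout_leaf;
    for (i = 2; i <= MAX_LVLS; i++)
        capacity[i] = capacity[i - 1] * fanout;

    for (i = 1; i <= MAX_LVLS; i++)
        if (n <= capacity[i]) {
            for (j = 0; j <= i; j++)
                nodes[j] = DIV_ROUND_UP(n, capacity[i - j]);
            levels = i;
            break;
        }

    for (i = 0; i <= levels; i++)
        total += nodes[i];
    /* The last level counts CPUs, so subtract them back out. */
    printf("levels=%d, rcu_node structures=%d\n", levels, total - n);
    return 0;    /* prints levels=2, rcu_node structures=8 */
}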
| 3393 | |||
| 3394 | void __init rcu_init(void) | ||
| 3395 | { | ||
| 3396 | int cpu; | ||
| 3397 | |||
| 3398 | rcu_bootup_announce(); | ||
| 3399 | rcu_init_geometry(); | ||
| 3400 | rcu_init_one(&rcu_bh_state, &rcu_bh_data); | ||
| 3401 | rcu_init_one(&rcu_sched_state, &rcu_sched_data); | ||
| 3402 | __rcu_init_preempt(); | ||
| 3403 | open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); | ||
| 3404 | |||
| 3405 | /* | ||
| 3406 | * We don't need protection against CPU-hotplug here because | ||
| 3407 | * this is called early in boot, before either interrupts | ||
| 3408 | * or the scheduler is operational. | ||
| 3409 | */ | ||
| 3410 | cpu_notifier(rcu_cpu_notify, 0); | ||
| 3411 | pm_notifier(rcu_pm_notify, 0); | ||
| 3412 | for_each_online_cpu(cpu) | ||
| 3413 | rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); | ||
| 3414 | } | ||
| 3415 | |||
| 3416 | #include "tree_plugin.h" | ||
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h new file mode 100644 index 000000000000..52be957c9fe2 --- /dev/null +++ b/kernel/rcu/tree.h | |||
| @@ -0,0 +1,585 @@ | |||
| 1 | /* | ||
| 2 | * Read-Copy Update mechanism for mutual exclusion (tree-based version) | ||
| 3 | * Internal non-public definitions. | ||
| 4 | * | ||
| 5 | * This program is free software; you can redistribute it and/or modify | ||
| 6 | * it under the terms of the GNU General Public License as published by | ||
| 7 | * the Free Software Foundation; either version 2 of the License, or | ||
| 8 | * (at your option) any later version. | ||
| 9 | * | ||
| 10 | * This program is distributed in the hope that it will be useful, | ||
| 11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 13 | * GNU General Public License for more details. | ||
| 14 | * | ||
| 15 | * You should have received a copy of the GNU General Public License | ||
| 16 | * along with this program; if not, write to the Free Software | ||
| 17 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
| 18 | * | ||
| 19 | * Copyright IBM Corporation, 2008 | ||
| 20 | * | ||
| 21 | * Author: Ingo Molnar <mingo@elte.hu> | ||
| 22 | * Paul E. McKenney <paulmck@linux.vnet.ibm.com> | ||
| 23 | */ | ||
| 24 | |||
| 25 | #include <linux/cache.h> | ||
| 26 | #include <linux/spinlock.h> | ||
| 27 | #include <linux/threads.h> | ||
| 28 | #include <linux/cpumask.h> | ||
| 29 | #include <linux/seqlock.h> | ||
| 30 | #include <linux/irq_work.h> | ||
| 31 | |||
| 32 | /* | ||
| 33 | * Define shape of hierarchy based on NR_CPUS, CONFIG_RCU_FANOUT, and | ||
| 34 | * CONFIG_RCU_FANOUT_LEAF. | ||
| 35 | * In theory, it should be possible to add more levels straightforwardly. | ||
| 36 | * In practice, this did work well going from three levels to four. | ||
| 37 | * Of course, your mileage may vary. | ||
| 38 | */ | ||
| 39 | #define MAX_RCU_LVLS 4 | ||
| 40 | #define RCU_FANOUT_1 (CONFIG_RCU_FANOUT_LEAF) | ||
| 41 | #define RCU_FANOUT_2 (RCU_FANOUT_1 * CONFIG_RCU_FANOUT) | ||
| 42 | #define RCU_FANOUT_3 (RCU_FANOUT_2 * CONFIG_RCU_FANOUT) | ||
| 43 | #define RCU_FANOUT_4 (RCU_FANOUT_3 * CONFIG_RCU_FANOUT) | ||
| 44 | |||
| 45 | #if NR_CPUS <= RCU_FANOUT_1 | ||
| 46 | # define RCU_NUM_LVLS 1 | ||
| 47 | # define NUM_RCU_LVL_0 1 | ||
| 48 | # define NUM_RCU_LVL_1 (NR_CPUS) | ||
| 49 | # define NUM_RCU_LVL_2 0 | ||
| 50 | # define NUM_RCU_LVL_3 0 | ||
| 51 | # define NUM_RCU_LVL_4 0 | ||
| 52 | #elif NR_CPUS <= RCU_FANOUT_2 | ||
| 53 | # define RCU_NUM_LVLS 2 | ||
| 54 | # define NUM_RCU_LVL_0 1 | ||
| 55 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) | ||
| 56 | # define NUM_RCU_LVL_2 (NR_CPUS) | ||
| 57 | # define NUM_RCU_LVL_3 0 | ||
| 58 | # define NUM_RCU_LVL_4 0 | ||
| 59 | #elif NR_CPUS <= RCU_FANOUT_3 | ||
| 60 | # define RCU_NUM_LVLS 3 | ||
| 61 | # define NUM_RCU_LVL_0 1 | ||
| 62 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) | ||
| 63 | # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) | ||
| 64 | # define NUM_RCU_LVL_3 (NR_CPUS) | ||
| 65 | # define NUM_RCU_LVL_4 0 | ||
| 66 | #elif NR_CPUS <= RCU_FANOUT_4 | ||
| 67 | # define RCU_NUM_LVLS 4 | ||
| 68 | # define NUM_RCU_LVL_0 1 | ||
| 69 | # define NUM_RCU_LVL_1 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_3) | ||
| 70 | # define NUM_RCU_LVL_2 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_2) | ||
| 71 | # define NUM_RCU_LVL_3 DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_1) | ||
| 72 | # define NUM_RCU_LVL_4 (NR_CPUS) | ||
| 73 | #else | ||
| 74 | # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" | ||
| 75 | #endif /* #if (NR_CPUS) <= RCU_FANOUT_1 */ | ||
| 76 | |||
| 77 | #define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3 + NUM_RCU_LVL_4) | ||
| 78 | #define NUM_RCU_NODES (RCU_SUM - NR_CPUS) | ||
| 79 | |||
| 80 | extern int rcu_num_lvls; | ||
| 81 | extern int rcu_num_nodes; | ||
| 82 | |||
| 83 | /* | ||
| 84 | * Dynticks per-CPU state. | ||
| 85 | */ | ||
| 86 | struct rcu_dynticks { | ||
| 87 | long long dynticks_nesting; /* Track irq/process nesting level. */ | ||
| 88 | /* Process level is worth LLONG_MAX/2. */ | ||
| 89 | int dynticks_nmi_nesting; /* Track NMI nesting level. */ | ||
| 90 | atomic_t dynticks; /* Even value for idle, else odd. */ | ||
| 91 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE | ||
| 92 | long long dynticks_idle_nesting; | ||
| 93 | /* irq/process nesting level from idle. */ | ||
| 94 | atomic_t dynticks_idle; /* Even value for idle, else odd. */ | ||
| 95 | /* "Idle" excludes userspace execution. */ | ||
| 96 | unsigned long dynticks_idle_jiffies; | ||
| 97 | /* End of last non-NMI non-idle period. */ | ||
| 98 | #endif /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | ||
| 99 | #ifdef CONFIG_RCU_FAST_NO_HZ | ||
| 100 | bool all_lazy; /* Are all CPU's CBs lazy? */ | ||
| 101 | unsigned long nonlazy_posted; | ||
| 102 | /* # times non-lazy CBs posted to CPU. */ | ||
| 103 | unsigned long nonlazy_posted_snap; | ||
| 104 | /* idle-period nonlazy_posted snapshot. */ | ||
| 105 | unsigned long last_accelerate; | ||
| 106 | /* Last jiffy CBs were accelerated. */ | ||
| 107 | unsigned long last_advance_all; | ||
| 108 | /* Last jiffy CBs were all advanced. */ | ||
| 109 | int tick_nohz_enabled_snap; /* Previously seen value from sysfs. */ | ||
| 110 | #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ | ||
| 111 | }; | ||
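The ->dynticks counter above carries the even/odd convention: it is even only while the CPU is in dyntick-idle, so a remote sampler can infer a quiescent state either from an even sample or from the counter having advanced since its snapshot. A userspace sketch of roughly that check (plain ints stand in for the kernel's atomic_t and memory barriers):

#include <stdio.h>

static unsigned int dynticks = 1;    /* odd: CPU currently non-idle */

static void idle_enter(void) { dynticks++; }    /* counter becomes even */
static void idle_exit(void)  { dynticks++; }    /* counter becomes odd  */

/* Was the CPU idle at the snapshot, or has it been idle since? */
static int saw_quiescent_state(unsigned int snap, unsigned int curr)
{
    return (curr & 0x1) == 0 || (curr - snap) >= 2;
}

int main(void)
{
    unsigned int snap = dynticks;    /* sampled at grace-period start */

    idle_enter();                    /* CPU dips into idle... */
    idle_exit();                     /* ...and comes back out */
    printf("%d\n", saw_quiescent_state(snap, dynticks));    /* prints 1 */
    return 0;
}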
| 112 | |||
| 113 | /* RCU's kthread states for tracing. */ | ||
| 114 | #define RCU_KTHREAD_STOPPED 0 | ||
| 115 | #define RCU_KTHREAD_RUNNING 1 | ||
| 116 | #define RCU_KTHREAD_WAITING 2 | ||
| 117 | #define RCU_KTHREAD_OFFCPU 3 | ||
| 118 | #define RCU_KTHREAD_YIELDING 4 | ||
| 119 | #define RCU_KTHREAD_MAX 4 | ||
| 120 | |||
| 121 | /* | ||
| 122 | * Definition for node within the RCU grace-period-detection hierarchy. | ||
| 123 | */ | ||
| 124 | struct rcu_node { | ||
| 125 | raw_spinlock_t lock; /* Root rcu_node's lock protects some */ | ||
| 126 | /* rcu_state fields as well as following. */ | ||
| 127 | unsigned long gpnum; /* Current grace period for this node. */ | ||
| 128 | /* This will either be equal to or one */ | ||
| 129 | /* behind the root rcu_node's gpnum. */ | ||
| 130 | unsigned long completed; /* Last GP completed for this node. */ | ||
| 131 | /* This will either be equal to or one */ | ||
| 132 | /* behind the root rcu_node's gpnum. */ | ||
| 133 | unsigned long qsmask; /* CPUs or groups that need to switch in */ | ||
| 134 | /* order for current grace period to proceed.*/ | ||
| 135 | /* In leaf rcu_node, each bit corresponds to */ | ||
| 136 | /* an rcu_data structure, otherwise, each */ | ||
| 137 | /* bit corresponds to a child rcu_node */ | ||
| 138 | /* structure. */ | ||
| 139 | unsigned long expmask; /* Groups that have ->blkd_tasks */ | ||
| 140 | /* elements that need to drain to allow the */ | ||
| 141 | /* current expedited grace period to */ | ||
| 142 | /* complete (only for TREE_PREEMPT_RCU). */ | ||
| 143 | unsigned long qsmaskinit; | ||
| 144 | /* Per-GP initial value for qsmask & expmask. */ | ||
| 145 | unsigned long grpmask; /* Mask to apply to parent qsmask. */ | ||
| 146 | /* Only one bit will be set in this mask. */ | ||
| 147 | int grplo; /* lowest-numbered CPU or group here. */ | ||
| 148 | int grphi; /* highest-numbered CPU or group here. */ | ||
| 149 | u8 grpnum; /* CPU/group number for next level up. */ | ||
| 150 | u8 level; /* root is at level 0. */ | ||
| 151 | struct rcu_node *parent; | ||
| 152 | struct list_head blkd_tasks; | ||
| 153 | /* Tasks blocked in RCU read-side critical */ | ||
| 154 | /* section. Tasks are placed at the head */ | ||
| 155 | /* of this list and age towards the tail. */ | ||
| 156 | struct list_head *gp_tasks; | ||
| 157 | /* Pointer to the first task blocking the */ | ||
| 158 | /* current grace period, or NULL if there */ | ||
| 159 | /* is no such task. */ | ||
| 160 | struct list_head *exp_tasks; | ||
| 161 | /* Pointer to the first task blocking the */ | ||
| 162 | /* current expedited grace period, or NULL */ | ||
| 163 | /* if there is no such task. If there */ | ||
| 164 | /* is no current expedited grace period, */ | ||
| 165 | /* then there cannot be any such task. */ | ||
| 166 | #ifdef CONFIG_RCU_BOOST | ||
| 167 | struct list_head *boost_tasks; | ||
| 168 | /* Pointer to first task that needs to be */ | ||
| 169 | /* priority boosted, or NULL if no priority */ | ||
| 170 | /* boosting is needed for this rcu_node */ | ||
| 171 | /* structure. If there are no tasks */ | ||
| 172 | /* queued on this rcu_node structure that */ | ||
| 173 | /* are blocking the current grace period, */ | ||
| 174 | /* there can be no such task. */ | ||
| 175 | unsigned long boost_time; | ||
| 176 | /* When to start boosting (jiffies). */ | ||
| 177 | struct task_struct *boost_kthread_task; | ||
| 178 | /* kthread that takes care of priority */ | ||
| 179 | /* boosting for this rcu_node structure. */ | ||
| 180 | unsigned int boost_kthread_status; | ||
| 181 | /* State of boost_kthread_task for tracing. */ | ||
| 182 | unsigned long n_tasks_boosted; | ||
| 183 | /* Total number of tasks boosted. */ | ||
| 184 | unsigned long n_exp_boosts; | ||
| 185 | /* Number of tasks boosted for expedited GP. */ | ||
| 186 | unsigned long n_normal_boosts; | ||
| 187 | /* Number of tasks boosted for normal GP. */ | ||
| 188 | unsigned long n_balk_blkd_tasks; | ||
| 189 | /* Refused to boost: no blocked tasks. */ | ||
| 190 | unsigned long n_balk_exp_gp_tasks; | ||
| 191 | /* Refused to boost: nothing blocking GP. */ | ||
| 192 | unsigned long n_balk_boost_tasks; | ||
| 193 | /* Refused to boost: already boosting. */ | ||
| 194 | unsigned long n_balk_notblocked; | ||
| 195 | /* Refused to boost: RCU RS CS still running. */ | ||
| 196 | unsigned long n_balk_notyet; | ||
| 197 | /* Refused to boost: not yet time. */ | ||
| 198 | unsigned long n_balk_nos; | ||
| 199 | /* Refused to boost: not sure why, though. */ | ||
| 200 | /* This can happen due to race conditions. */ | ||
| 201 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 202 | #ifdef CONFIG_RCU_NOCB_CPU | ||
| 203 | wait_queue_head_t nocb_gp_wq[2]; | ||
| 204 | /* Place for rcu_nocb_kthread() to wait GP. */ | ||
| 205 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ | ||
| 206 | int need_future_gp[2]; | ||
| 207 | /* Counts of upcoming no-CB GP requests. */ | ||
| 208 | raw_spinlock_t fqslock ____cacheline_internodealigned_in_smp; | ||
| 209 | } ____cacheline_internodealigned_in_smp; | ||
| 210 | |||
| 211 | /* | ||
| 212 | * Do a full breadth-first scan of the rcu_node structures for the | ||
| 213 | * specified rcu_state structure. | ||
| 214 | */ | ||
| 215 | #define rcu_for_each_node_breadth_first(rsp, rnp) \ | ||
| 216 | for ((rnp) = &(rsp)->node[0]; \ | ||
| 217 | (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) | ||
| 218 | |||
| 219 | /* | ||
| 220 | * Do a breadth-first scan of the non-leaf rcu_node structures for the | ||
| 221 | * specified rcu_state structure. Note that if there is a singleton | ||
| 222 | * rcu_node tree with but one rcu_node structure, this loop is a no-op. | ||
| 223 | */ | ||
| 224 | #define rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) \ | ||
| 225 | for ((rnp) = &(rsp)->node[0]; \ | ||
| 226 | (rnp) < (rsp)->level[rcu_num_lvls - 1]; (rnp)++) | ||
| 227 | |||
| 228 | /* | ||
| 229 | * Scan the leaves of the rcu_node hierarchy for the specified rcu_state | ||
| 230 | * structure. Note that if there is a singleton rcu_node tree with but | ||
| 231 | * one rcu_node structure, this loop -will- visit the rcu_node structure. | ||
| 232 | * It is still a leaf node, even if it is also the root node. | ||
| 233 | */ | ||
| 234 | #define rcu_for_each_leaf_node(rsp, rnp) \ | ||
| 235 | for ((rnp) = (rsp)->level[rcu_num_lvls - 1]; \ | ||
| 236 | (rnp) < &(rsp)->node[rcu_num_nodes]; (rnp)++) | ||
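These macros depend on the dense array layout: each level's rcu_node structures are contiguous, ->level[] points at the first node of every level, and the leaves run from ->level[rcu_num_lvls - 1] to the end of ->node[]. A standalone sketch with a hypothetical 1-root/7-leaf geometry (simplified struct, no kernel types):

#include <stdio.h>

struct node { int level; };

int main(void)
{
    struct node node[8];                              /* dense "heap form" array */
    struct node *level[2] = { &node[0], &node[1] };   /* root level, leaf level */
    int nlvls = 2, nnodes = 8;
    struct node *np;

    /* Breadth-first scan: just walk the dense array in order. */
    for (np = &node[0]; np < &node[nnodes]; np++)
        np->level = (np < level[1]) ? 0 : 1;

    /* Leaf-only scan, as rcu_for_each_leaf_node() does. */
    for (np = level[nlvls - 1]; np < &node[nnodes]; np++)
        printf("leaf at index %ld, level %d\n", (long)(np - node), np->level);
    return 0;
}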
| 237 | |||
| 238 | /* Index values for nxttail array in struct rcu_data. */ | ||
| 239 | #define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ | ||
| 240 | #define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ | ||
| 241 | #define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */ | ||
| 242 | #define RCU_NEXT_TAIL 3 | ||
| 243 | #define RCU_NEXT_SIZE 4 | ||
| 244 | |||
| 245 | /* Per-CPU data for read-copy update. */ | ||
| 246 | struct rcu_data { | ||
| 247 | /* 1) quiescent-state and grace-period handling : */ | ||
| 248 | unsigned long completed; /* Track rsp->completed gp number */ | ||
| 249 | /* in order to detect GP end. */ | ||
| 250 | unsigned long gpnum; /* Highest gp number that this CPU */ | ||
| 251 | /* is aware of having started. */ | ||
| 252 | bool passed_quiesce; /* User-mode/idle loop etc. */ | ||
| 253 | bool qs_pending; /* Core waits for quiesc state. */ | ||
| 254 | bool beenonline; /* CPU online at least once. */ | ||
| 255 | bool preemptible; /* Preemptible RCU? */ | ||
| 256 | struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ | ||
| 257 | unsigned long grpmask; /* Mask to apply to leaf qsmask. */ | ||
| 258 | #ifdef CONFIG_RCU_CPU_STALL_INFO | ||
| 259 | unsigned long ticks_this_gp; /* The number of scheduling-clock */ | ||
| 260 | /* ticks this CPU has handled */ | ||
| 261 | /* during and after the last grace */ | ||
| 262 | /* period it is aware of. */ | ||
| 263 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ | ||
| 264 | |||
| 265 | /* 2) batch handling */ | ||
| 266 | /* | ||
| 267 | * If nxtlist is not NULL, it is partitioned as follows. | ||
| 268 | * Any of the partitions might be empty, in which case the | ||
| 269 | * pointer to that partition will be equal to the pointer for | ||
| 270 | * the following partition. When the list is empty, all of | ||
| 271 | * the nxttail elements point to the ->nxtlist pointer itself, | ||
| 272 | * which in that case is NULL. | ||
| 273 | * | ||
| 274 | * [nxtlist, *nxttail[RCU_DONE_TAIL]): | ||
| 275 | * Entries whose batch # <= ->completed | ||
| 276 | * The grace period for these entries has completed, and | ||
| 277 | * the other grace-period-completed entries may be moved | ||
| 278 | * here temporarily in rcu_process_callbacks(). | ||
| 279 | * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]): | ||
| 280 | * Entries whose batch # <= ->completed - 1: waiting for current GP | ||
| 281 | * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]): | ||
| 282 | * Entries known to have arrived before current GP ended | ||
| 283 | * [*nxttail[RCU_NEXT_READY_TAIL], *nxttail[RCU_NEXT_TAIL]): | ||
| 284 | * Entries that might have arrived after current GP ended | ||
| 285 | * Note that the value of *nxttail[RCU_NEXT_TAIL] will | ||
| 286 | * always be NULL, as this is the end of the list. | ||
| 287 | */ | ||
| 288 | struct rcu_head *nxtlist; | ||
| 289 | struct rcu_head **nxttail[RCU_NEXT_SIZE]; | ||
| 290 | unsigned long nxtcompleted[RCU_NEXT_SIZE]; | ||
| 291 | /* grace periods for sublists. */ | ||
| 292 | long qlen_lazy; /* # of lazy queued callbacks */ | ||
| 293 | long qlen; /* # of queued callbacks, incl lazy */ | ||
| 294 | long qlen_last_fqs_check; | ||
| 295 | /* qlen at last check for QS forcing */ | ||
| 296 | unsigned long n_cbs_invoked; /* count of RCU cbs invoked. */ | ||
| 297 | unsigned long n_nocbs_invoked; /* count of no-CBs RCU cbs invoked. */ | ||
| 298 | unsigned long n_cbs_orphaned; /* RCU cbs orphaned by dying CPU */ | ||
| 299 | unsigned long n_cbs_adopted; /* RCU cbs adopted from dying CPU */ | ||
| 300 | unsigned long n_force_qs_snap; | ||
| 301 | /* did other CPU force QS recently? */ | ||
| 302 | long blimit; /* Upper limit on a processed batch */ | ||
| 303 | |||
| 304 | /* 3) dynticks interface. */ | ||
| 305 | struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ | ||
| 306 | int dynticks_snap; /* Per-GP tracking for dynticks. */ | ||
| 307 | |||
| 308 | /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ | ||
| 309 | unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ | ||
| 310 | unsigned long offline_fqs; /* Kicked due to being offline. */ | ||
| 311 | |||
| 312 | /* 5) __rcu_pending() statistics. */ | ||
| 313 | unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ | ||
| 314 | unsigned long n_rp_qs_pending; | ||
| 315 | unsigned long n_rp_report_qs; | ||
| 316 | unsigned long n_rp_cb_ready; | ||
| 317 | unsigned long n_rp_cpu_needs_gp; | ||
| 318 | unsigned long n_rp_gp_completed; | ||
| 319 | unsigned long n_rp_gp_started; | ||
| 320 | unsigned long n_rp_need_nothing; | ||
| 321 | |||
| 322 | /* 6) _rcu_barrier() and OOM callbacks. */ | ||
| 323 | struct rcu_head barrier_head; | ||
| 324 | #ifdef CONFIG_RCU_FAST_NO_HZ | ||
| 325 | struct rcu_head oom_head; | ||
| 326 | #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ | ||
| 327 | |||
| 328 | /* 7) Callback offloading. */ | ||
| 329 | #ifdef CONFIG_RCU_NOCB_CPU | ||
| 330 | struct rcu_head *nocb_head; /* CBs waiting for kthread. */ | ||
| 331 | struct rcu_head **nocb_tail; | ||
| 332 | atomic_long_t nocb_q_count; /* # CBs waiting for kthread */ | ||
| 333 | atomic_long_t nocb_q_count_lazy; /* (approximate). */ | ||
| 334 | int nocb_p_count; /* # CBs being invoked by kthread */ | ||
| 335 | int nocb_p_count_lazy; /* (approximate). */ | ||
| 336 | wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ | ||
| 337 | struct task_struct *nocb_kthread; | ||
| 338 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ | ||
| 339 | |||
| 340 | /* 8) RCU CPU stall data. */ | ||
| 341 | #ifdef CONFIG_RCU_CPU_STALL_INFO | ||
| 342 | unsigned int softirq_snap; /* Snapshot of softirq activity. */ | ||
| 343 | #endif /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ | ||
| 344 | |||
| 345 | int cpu; | ||
| 346 | struct rcu_state *rsp; | ||
| 347 | }; | ||
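The segmented list described in the comment above is one singly linked list plus four tail pointers: enqueuing a callback touches only RCU_NEXT_TAIL, and segments advance by copying tail pointers rather than by moving elements. A userspace sketch (simplified types, no locking or per-grace-period bookkeeping):

#include <stdio.h>

struct cb { struct cb *next; int id; };

#define RCU_DONE_TAIL       0
#define RCU_WAIT_TAIL       1
#define RCU_NEXT_READY_TAIL 2
#define RCU_NEXT_TAIL       3
#define RCU_NEXT_SIZE       4

static struct cb *nxtlist;                      /* head of the single list */
static struct cb **nxttail[RCU_NEXT_SIZE];      /* segment boundaries */

static void enqueue(struct cb *p)               /* touches only RCU_NEXT_TAIL */
{
    p->next = NULL;
    *nxttail[RCU_NEXT_TAIL] = p;
    nxttail[RCU_NEXT_TAIL] = &p->next;
}

int main(void)
{
    struct cb a = { .id = 1 }, b = { .id = 2 };
    struct cb *p;
    int i;

    for (i = 0; i < RCU_NEXT_SIZE; i++)         /* empty list: every tail     */
        nxttail[i] = &nxtlist;                  /* points at ->nxtlist itself */

    enqueue(&a);
    enqueue(&b);

    /* Pretend the needed grace periods have elapsed: advance the earlier */
    /* boundaries by copying the RCU_NEXT_TAIL pointer backward.          */
    for (i = RCU_DONE_TAIL; i < RCU_NEXT_TAIL; i++)
        nxttail[i] = nxttail[RCU_NEXT_TAIL];

    /* Invoke the done segment: [nxtlist, *nxttail[RCU_DONE_TAIL]). */
    for (p = nxtlist; p != *nxttail[RCU_DONE_TAIL]; p = p->next)
        printf("invoke callback %d\n", p->id);
    return 0;
}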
| 348 | |||
| 349 | /* Values for fqs_state field in struct rcu_state. */ | ||
| 350 | #define RCU_GP_IDLE 0 /* No grace period in progress. */ | ||
| 351 | #define RCU_GP_INIT 1 /* Grace period being initialized. */ | ||
| 352 | #define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ | ||
| 353 | #define RCU_FORCE_QS 3 /* Need to force quiescent state. */ | ||
| 354 | #define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK | ||
| 355 | |||
| 356 | #define RCU_JIFFIES_TILL_FORCE_QS (1 + (HZ > 250) + (HZ > 500)) | ||
| 357 | /* For jiffies_till_first_fqs and */ | ||
| 358 | /* jiffies_till_next_fqs. */ | ||
| 359 | |||
| 360 | #define RCU_JIFFIES_FQS_DIV 256 /* Very large systems need more */ | ||
| 361 | /* delay between bouts of */ | ||
| 362 | /* quiescent-state forcing. */ | ||
| 363 | |||
| 364 | #define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time to take */ | ||
| 365 | /* at least one scheduling clock */ | ||
| 366 | /* irq before ratting on them. */ | ||
| 367 | |||
| 368 | #define rcu_wait(cond) \ | ||
| 369 | do { \ | ||
| 370 | for (;;) { \ | ||
| 371 | set_current_state(TASK_INTERRUPTIBLE); \ | ||
| 372 | if (cond) \ | ||
| 373 | break; \ | ||
| 374 | schedule(); \ | ||
| 375 | } \ | ||
| 376 | __set_current_state(TASK_RUNNING); \ | ||
| 377 | } while (0) | ||
| 378 | |||
| 379 | /* | ||
| 380 | * RCU global state, including node hierarchy. This hierarchy is | ||
| 381 | * represented in "heap" form in a dense array. The root (first level) | ||
| 382 | * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second | ||
| 383 | * level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]), | ||
| 384 | * and the third level in ->node[m+1] and following (->node[m+1] referenced | ||
| 385 | * by ->level[2]). The number of levels is determined by the number of | ||
| 386 | * CPUs and by CONFIG_RCU_FANOUT. Small systems will have a "hierarchy" | ||
| 387 | * consisting of a single rcu_node. | ||
| 388 | */ | ||
| 389 | struct rcu_state { | ||
| 390 | struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */ | ||
| 391 | struct rcu_node *level[RCU_NUM_LVLS]; /* Hierarchy levels. */ | ||
| 392 | u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ | ||
| 393 | u8 levelspread[RCU_NUM_LVLS]; /* kids/node in each level. */ | ||
| 394 | struct rcu_data __percpu *rda; /* pointer to per-CPU rcu_data. */ | ||
| 395 | void (*call)(struct rcu_head *head, /* call_rcu() flavor. */ | ||
| 396 | void (*func)(struct rcu_head *head)); | ||
| 397 | |||
| 398 | /* The following fields are guarded by the root rcu_node's lock. */ | ||
| 399 | |||
| 400 | u8 fqs_state ____cacheline_internodealigned_in_smp; | ||
| 401 | /* Force QS state. */ | ||
| 402 | u8 boost; /* Subject to priority boost. */ | ||
| 403 | unsigned long gpnum; /* Current gp number. */ | ||
| 404 | unsigned long completed; /* # of last completed gp. */ | ||
| 405 | struct task_struct *gp_kthread; /* Task for grace periods. */ | ||
| 406 | wait_queue_head_t gp_wq; /* Where GP task waits. */ | ||
| 407 | int gp_flags; /* Commands for GP task. */ | ||
| 408 | |||
| 409 | /* End of fields guarded by root rcu_node's lock. */ | ||
| 410 | |||
| 411 | raw_spinlock_t orphan_lock ____cacheline_internodealigned_in_smp; | ||
| 412 | /* Protect following fields. */ | ||
| 413 | struct rcu_head *orphan_nxtlist; /* Orphaned callbacks that */ | ||
| 414 | /* need a grace period. */ | ||
| 415 | struct rcu_head **orphan_nxttail; /* Tail of above. */ | ||
| 416 | struct rcu_head *orphan_donelist; /* Orphaned callbacks that */ | ||
| 417 | /* are ready to invoke. */ | ||
| 418 | struct rcu_head **orphan_donetail; /* Tail of above. */ | ||
| 419 | long qlen_lazy; /* Number of lazy callbacks. */ | ||
| 420 | long qlen; /* Total number of callbacks. */ | ||
| 421 | /* End of fields guarded by orphan_lock. */ | ||
| 422 | |||
| 423 | struct mutex onoff_mutex; /* Coordinate hotplug & GPs. */ | ||
| 424 | |||
| 425 | struct mutex barrier_mutex; /* Guards barrier fields. */ | ||
| 426 | atomic_t barrier_cpu_count; /* # CPUs waiting on. */ | ||
| 427 | struct completion barrier_completion; /* Wake at barrier end. */ | ||
| 428 | unsigned long n_barrier_done; /* ++ at start and end of */ | ||
| 429 | /* _rcu_barrier(). */ | ||
| 430 | /* End of fields guarded by barrier_mutex. */ | ||
| 431 | |||
| 432 | atomic_long_t expedited_start; /* Starting ticket. */ | ||
| 433 | atomic_long_t expedited_done; /* Done ticket. */ | ||
| 434 | atomic_long_t expedited_wrap; /* # near-wrap incidents. */ | ||
| 435 | atomic_long_t expedited_tryfail; /* # acquisition failures. */ | ||
| 436 | atomic_long_t expedited_workdone1; /* # done by others #1. */ | ||
| 437 | atomic_long_t expedited_workdone2; /* # done by others #2. */ | ||
| 438 | atomic_long_t expedited_normal; /* # fallbacks to normal. */ | ||
| 439 | atomic_long_t expedited_stoppedcpus; /* # successful stop_cpus. */ | ||
| 440 | atomic_long_t expedited_done_tries; /* # tries to update _done. */ | ||
| 441 | atomic_long_t expedited_done_lost; /* # times beaten to _done. */ | ||
| 442 | atomic_long_t expedited_done_exit; /* # times exited _done loop. */ | ||
| 443 | |||
| 444 | unsigned long jiffies_force_qs; /* Time at which to invoke */ | ||
| 445 | /* force_quiescent_state(). */ | ||
| 446 | unsigned long n_force_qs; /* Number of calls to */ | ||
| 447 | /* force_quiescent_state(). */ | ||
| 448 | unsigned long n_force_qs_lh; /* ~Number of calls leaving */ | ||
| 449 | /* due to lock unavailable. */ | ||
| 450 | unsigned long n_force_qs_ngp; /* Number of calls leaving */ | ||
| 451 | /* due to no GP active. */ | ||
| 452 | unsigned long gp_start; /* Time at which GP started, */ | ||
| 453 | /* in jiffies. */ | ||
| 454 | unsigned long jiffies_stall; /* Time at which to check */ | ||
| 455 | /* for CPU stalls. */ | ||
| 456 | unsigned long gp_max; /* Maximum GP duration in */ | ||
| 457 | /* jiffies. */ | ||
| 458 | const char *name; /* Name of structure. */ | ||
| 459 | char abbr; /* Abbreviated name. */ | ||
| 460 | struct list_head flavors; /* List of RCU flavors. */ | ||
| 461 | struct irq_work wakeup_work; /* Postponed wakeups */ | ||
| 462 | }; | ||
| 463 | |||
| 464 | /* Values for rcu_state structure's gp_flags field. */ | ||
| 465 | #define RCU_GP_FLAG_INIT 0x1 /* Need grace-period initialization. */ | ||
| 466 | #define RCU_GP_FLAG_FQS 0x2 /* Need grace-period quiescent-state forcing. */ | ||
| 467 | |||
| 468 | extern struct list_head rcu_struct_flavors; | ||
| 469 | |||
| 470 | /* Sequence through rcu_state structures for each RCU flavor. */ | ||
| 471 | #define for_each_rcu_flavor(rsp) \ | ||
| 472 | list_for_each_entry((rsp), &rcu_struct_flavors, flavors) | ||
| 473 | |||
| 474 | /* Return values for rcu_preempt_offline_tasks(). */ | ||
| 475 | |||
| 476 | #define RCU_OFL_TASKS_NORM_GP 0x1 /* Tasks blocking normal */ | ||
| 477 | /* GP were moved to root. */ | ||
| 478 | #define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */ | ||
| 479 | /* GP were moved to root. */ | ||
| 480 | |||
| 481 | /* | ||
| 482 | * RCU implementation internal declarations: | ||
| 483 | */ | ||
| 484 | extern struct rcu_state rcu_sched_state; | ||
| 485 | DECLARE_PER_CPU(struct rcu_data, rcu_sched_data); | ||
| 486 | |||
| 487 | extern struct rcu_state rcu_bh_state; | ||
| 488 | DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); | ||
| 489 | |||
| 490 | #ifdef CONFIG_TREE_PREEMPT_RCU | ||
| 491 | extern struct rcu_state rcu_preempt_state; | ||
| 492 | DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); | ||
| 493 | #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | ||
| 494 | |||
| 495 | #ifdef CONFIG_RCU_BOOST | ||
| 496 | DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_status); | ||
| 497 | DECLARE_PER_CPU(int, rcu_cpu_kthread_cpu); | ||
| 498 | DECLARE_PER_CPU(unsigned int, rcu_cpu_kthread_loops); | ||
| 499 | DECLARE_PER_CPU(char, rcu_cpu_has_work); | ||
| 500 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 501 | |||
| 502 | #ifndef RCU_TREE_NONCORE | ||
| 503 | |||
| 504 | /* Forward declarations for rcutree_plugin.h */ | ||
| 505 | static void rcu_bootup_announce(void); | ||
| 506 | long rcu_batches_completed(void); | ||
| 507 | static void rcu_preempt_note_context_switch(int cpu); | ||
| 508 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp); | ||
| 509 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 510 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, | ||
| 511 | unsigned long flags); | ||
| 512 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
| 513 | static void rcu_print_detail_task_stall(struct rcu_state *rsp); | ||
| 514 | static int rcu_print_task_stall(struct rcu_node *rnp); | ||
| 515 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); | ||
| 516 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 517 | static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | ||
| 518 | struct rcu_node *rnp, | ||
| 519 | struct rcu_data *rdp); | ||
| 520 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
| 521 | static void rcu_preempt_check_callbacks(int cpu); | ||
| 522 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)); | ||
| 523 | #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) | ||
| 524 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | ||
| 525 | bool wake); | ||
| 526 | #endif /* #if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_TREE_PREEMPT_RCU) */ | ||
| 527 | static void __init __rcu_init_preempt(void); | ||
| 528 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags); | ||
| 529 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp); | ||
| 530 | static void invoke_rcu_callbacks_kthread(void); | ||
| 531 | static bool rcu_is_callbacks_kthread(void); | ||
| 532 | #ifdef CONFIG_RCU_BOOST | ||
| 533 | static void rcu_preempt_do_callbacks(void); | ||
| 534 | static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, | ||
| 535 | struct rcu_node *rnp); | ||
| 536 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 537 | static void rcu_prepare_kthreads(int cpu); | ||
| 538 | static void rcu_cleanup_after_idle(int cpu); | ||
| 539 | static void rcu_prepare_for_idle(int cpu); | ||
| 540 | static void rcu_idle_count_callbacks_posted(void); | ||
| 541 | static void print_cpu_stall_info_begin(void); | ||
| 542 | static void print_cpu_stall_info(struct rcu_state *rsp, int cpu); | ||
| 543 | static void print_cpu_stall_info_end(void); | ||
| 544 | static void zero_cpu_stall_ticks(struct rcu_data *rdp); | ||
| 545 | static void increment_cpu_stall_ticks(void); | ||
| 546 | static int rcu_nocb_needs_gp(struct rcu_state *rsp); | ||
| 547 | static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); | ||
| 548 | static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); | ||
| 549 | static void rcu_init_one_nocb(struct rcu_node *rnp); | ||
| 550 | static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, | ||
| 551 | bool lazy); | ||
| 552 | static bool rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, | ||
| 553 | struct rcu_data *rdp); | ||
| 554 | static void rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp); | ||
| 555 | static void rcu_spawn_nocb_kthreads(struct rcu_state *rsp); | ||
| 556 | static void rcu_kick_nohz_cpu(int cpu); | ||
| 557 | static bool init_nocb_callback_list(struct rcu_data *rdp); | ||
| 558 | static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq); | ||
| 559 | static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq); | ||
| 560 | static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, | ||
| 561 | unsigned long *maxj); | ||
| 562 | static bool is_sysidle_rcu_state(struct rcu_state *rsp); | ||
| 563 | static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, | ||
| 564 | unsigned long maxj); | ||
| 565 | static void rcu_bind_gp_kthread(void); | ||
| 566 | static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp); | ||
| 567 | |||
| 568 | #endif /* #ifndef RCU_TREE_NONCORE */ | ||
| 569 | |||
| 570 | #ifdef CONFIG_RCU_TRACE | ||
| 571 | #ifdef CONFIG_RCU_NOCB_CPU | ||
| 572 | /* Sum up queue lengths for tracing. */ | ||
| 573 | static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) | ||
| 574 | { | ||
| 575 | *ql = atomic_long_read(&rdp->nocb_q_count) + rdp->nocb_p_count; | ||
| 576 | *qll = atomic_long_read(&rdp->nocb_q_count_lazy) + rdp->nocb_p_count_lazy; | ||
| 577 | } | ||
| 578 | #else /* #ifdef CONFIG_RCU_NOCB_CPU */ | ||
| 579 | static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) | ||
| 580 | { | ||
| 581 | *ql = 0; | ||
| 582 | *qll = 0; | ||
| 583 | } | ||
| 584 | #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ | ||
| 585 | #endif /* #ifdef CONFIG_RCU_TRACE */ | ||
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h new file mode 100644 index 000000000000..3822ac0c4b27 --- /dev/null +++ b/kernel/rcu/tree_plugin.h | |||
| @@ -0,0 +1,2831 @@ | |||
| 1 | /* | ||
| 2 | * Read-Copy Update mechanism for mutual exclusion (tree-based version) | ||
| 3 | * Internal non-public definitions that provide either classic | ||
| 4 | * or preemptible semantics. | ||
| 5 | * | ||
| 6 | * This program is free software; you can redistribute it and/or modify | ||
| 7 | * it under the terms of the GNU General Public License as published by | ||
| 8 | * the Free Software Foundation; either version 2 of the License, or | ||
| 9 | * (at your option) any later version. | ||
| 10 | * | ||
| 11 | * This program is distributed in the hope that it will be useful, | ||
| 12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 14 | * GNU General Public License for more details. | ||
| 15 | * | ||
| 16 | * You should have received a copy of the GNU General Public License | ||
| 17 | * along with this program; if not, write to the Free Software | ||
| 18 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
| 19 | * | ||
| 20 | * Copyright Red Hat, 2009 | ||
| 21 | * Copyright IBM Corporation, 2009 | ||
| 22 | * | ||
| 23 | * Author: Ingo Molnar <mingo@elte.hu> | ||
| 24 | * Paul E. McKenney <paulmck@linux.vnet.ibm.com> | ||
| 25 | */ | ||
| 26 | |||
| 27 | #include <linux/delay.h> | ||
| 28 | #include <linux/gfp.h> | ||
| 29 | #include <linux/oom.h> | ||
| 30 | #include <linux/smpboot.h> | ||
| 31 | #include "../time/tick-internal.h" | ||
| 32 | |||
| 33 | #define RCU_KTHREAD_PRIO 1 | ||
| 34 | |||
| 35 | #ifdef CONFIG_RCU_BOOST | ||
| 36 | #define RCU_BOOST_PRIO CONFIG_RCU_BOOST_PRIO | ||
| 37 | #else | ||
| 38 | #define RCU_BOOST_PRIO RCU_KTHREAD_PRIO | ||
| 39 | #endif | ||
| 40 | |||
| 41 | #ifdef CONFIG_RCU_NOCB_CPU | ||
| 42 | static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */ | ||
| 43 | static bool have_rcu_nocb_mask; /* Was rcu_nocb_mask allocated? */ | ||
| 44 | static bool __read_mostly rcu_nocb_poll; /* Offload kthreads are to poll. */ | ||
| 45 | static char __initdata nocb_buf[NR_CPUS * 5]; | ||
| 46 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ | ||
| 47 | |||
| 48 | /* | ||
| 49 | * Check the RCU kernel configuration parameters and print informative | ||
| 50 | * messages about anything out of the ordinary. If you like #ifdef, you | ||
| 51 | * will love this function. | ||
| 52 | */ | ||
| 53 | static void __init rcu_bootup_announce_oddness(void) | ||
| 54 | { | ||
| 55 | #ifdef CONFIG_RCU_TRACE | ||
| 56 | pr_info("\tRCU debugfs-based tracing is enabled.\n"); | ||
| 57 | #endif | ||
| 58 | #if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32) | ||
| 59 | pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n", | ||
| 60 | CONFIG_RCU_FANOUT); | ||
| 61 | #endif | ||
| 62 | #ifdef CONFIG_RCU_FANOUT_EXACT | ||
| 63 | pr_info("\tHierarchical RCU autobalancing is disabled.\n"); | ||
| 64 | #endif | ||
| 65 | #ifdef CONFIG_RCU_FAST_NO_HZ | ||
| 66 | pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n"); | ||
| 67 | #endif | ||
| 68 | #ifdef CONFIG_PROVE_RCU | ||
| 69 | pr_info("\tRCU lockdep checking is enabled.\n"); | ||
| 70 | #endif | ||
| 71 | #ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE | ||
| 72 | pr_info("\tRCU torture testing starts during boot.\n"); | ||
| 73 | #endif | ||
| 74 | #if defined(CONFIG_TREE_PREEMPT_RCU) && !defined(CONFIG_RCU_CPU_STALL_VERBOSE) | ||
| 75 | pr_info("\tDump stacks of tasks blocking RCU-preempt GP.\n"); | ||
| 76 | #endif | ||
| 77 | #if defined(CONFIG_RCU_CPU_STALL_INFO) | ||
| 78 | pr_info("\tAdditional per-CPU info printed with stalls.\n"); | ||
| 79 | #endif | ||
| 80 | #if NUM_RCU_LVL_4 != 0 | ||
| 81 | pr_info("\tFour-level hierarchy is enabled.\n"); | ||
| 82 | #endif | ||
| 83 | if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF) | ||
| 84 | pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf); | ||
| 85 | if (nr_cpu_ids != NR_CPUS) | ||
| 86 | pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids); | ||
| 87 | #ifdef CONFIG_RCU_NOCB_CPU | ||
| 88 | #ifndef CONFIG_RCU_NOCB_CPU_NONE | ||
| 89 | if (!have_rcu_nocb_mask) { | ||
| 90 | zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL); | ||
| 91 | have_rcu_nocb_mask = true; | ||
| 92 | } | ||
| 93 | #ifdef CONFIG_RCU_NOCB_CPU_ZERO | ||
| 94 | pr_info("\tOffload RCU callbacks from CPU 0\n"); | ||
| 95 | cpumask_set_cpu(0, rcu_nocb_mask); | ||
| 96 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ZERO */ | ||
| 97 | #ifdef CONFIG_RCU_NOCB_CPU_ALL | ||
| 98 | pr_info("\tOffload RCU callbacks from all CPUs\n"); | ||
| 99 | cpumask_copy(rcu_nocb_mask, cpu_possible_mask); | ||
| 100 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU_ALL */ | ||
| 101 | #endif /* #ifndef CONFIG_RCU_NOCB_CPU_NONE */ | ||
| 102 | if (have_rcu_nocb_mask) { | ||
| 103 | if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) { | ||
| 104 | pr_info("\tNote: kernel parameter 'rcu_nocbs=' contains nonexistent CPUs.\n"); | ||
| 105 | cpumask_and(rcu_nocb_mask, cpu_possible_mask, | ||
| 106 | rcu_nocb_mask); | ||
| 107 | } | ||
| 108 | cpulist_scnprintf(nocb_buf, sizeof(nocb_buf), rcu_nocb_mask); | ||
| 109 | pr_info("\tOffload RCU callbacks from CPUs: %s.\n", nocb_buf); | ||
| 110 | if (rcu_nocb_poll) | ||
| 111 | pr_info("\tPoll for callbacks from no-CBs CPUs.\n"); | ||
| 112 | } | ||
| 113 | #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ | ||
| 114 | } | ||
| 115 | |||
| 116 | #ifdef CONFIG_TREE_PREEMPT_RCU | ||
| 117 | |||
| 118 | RCU_STATE_INITIALIZER(rcu_preempt, 'p', call_rcu); | ||
| 119 | static struct rcu_state *rcu_state = &rcu_preempt_state; | ||
| 120 | |||
| 121 | static int rcu_preempted_readers_exp(struct rcu_node *rnp); | ||
| 122 | |||
| 123 | /* | ||
| 124 | * Tell them what RCU they are running. | ||
| 125 | */ | ||
| 126 | static void __init rcu_bootup_announce(void) | ||
| 127 | { | ||
| 128 | pr_info("Preemptible hierarchical RCU implementation.\n"); | ||
| 129 | rcu_bootup_announce_oddness(); | ||
| 130 | } | ||
| 131 | |||
| 132 | /* | ||
| 133 | * Return the number of RCU-preempt batches processed thus far | ||
| 134 | * for debug and statistics. | ||
| 135 | */ | ||
| 136 | long rcu_batches_completed_preempt(void) | ||
| 137 | { | ||
| 138 | return rcu_preempt_state.completed; | ||
| 139 | } | ||
| 140 | EXPORT_SYMBOL_GPL(rcu_batches_completed_preempt); | ||
| 141 | |||
| 142 | /* | ||
| 143 | * Return the number of RCU batches processed thus far for debug & stats. | ||
| 144 | */ | ||
| 145 | long rcu_batches_completed(void) | ||
| 146 | { | ||
| 147 | return rcu_batches_completed_preempt(); | ||
| 148 | } | ||
| 149 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | ||
| 150 | |||
| 151 | /* | ||
| 152 | * Force a quiescent state for preemptible RCU. | ||
| 153 | */ | ||
| 154 | void rcu_force_quiescent_state(void) | ||
| 155 | { | ||
| 156 | force_quiescent_state(&rcu_preempt_state); | ||
| 157 | } | ||
| 158 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); | ||
| 159 | |||
| 160 | /* | ||
| 161 | * Record a preemptible-RCU quiescent state for the specified CPU. Note | ||
| 162 | * that this just means that the task currently running on the CPU is | ||
| 163 | * not in a quiescent state. There might be any number of tasks blocked | ||
| 164 | * while in an RCU read-side critical section. | ||
| 165 | * | ||
| 166 | * Unlike the other rcu_*_qs() functions, callers to this function | ||
| 167 | * must disable irqs in order to protect the assignment to | ||
| 168 | * ->rcu_read_unlock_special. | ||
| 169 | */ | ||
| 170 | static void rcu_preempt_qs(int cpu) | ||
| 171 | { | ||
| 172 | struct rcu_data *rdp = &per_cpu(rcu_preempt_data, cpu); | ||
| 173 | |||
| 174 | if (rdp->passed_quiesce == 0) | ||
| 175 | trace_rcu_grace_period(TPS("rcu_preempt"), rdp->gpnum, TPS("cpuqs")); | ||
| 176 | rdp->passed_quiesce = 1; | ||
| 177 | current->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_NEED_QS; | ||
| 178 | } | ||
| 179 | |||
| 180 | /* | ||
| 181 | * We have entered the scheduler, and the current task might soon be | ||
| 182 | * context-switched away from. If this task is in an RCU read-side | ||
| 183 | * critical section, we will no longer be able to rely on the CPU to | ||
| 184 | * record that fact, so we enqueue the task on the blkd_tasks list. | ||
| 185 | * The task will dequeue itself when it exits the outermost enclosing | ||
| 186 | * RCU read-side critical section. Therefore, the current grace period | ||
| 187 | * cannot be permitted to complete until the blkd_tasks list entries | ||
| 188 | * predating the current grace period drain, in other words, until | ||
| 189 | * rnp->gp_tasks becomes NULL. | ||
| 190 | * | ||
| 191 | * Caller must disable preemption. | ||
| 192 | */ | ||
| 193 | static void rcu_preempt_note_context_switch(int cpu) | ||
| 194 | { | ||
| 195 | struct task_struct *t = current; | ||
| 196 | unsigned long flags; | ||
| 197 | struct rcu_data *rdp; | ||
| 198 | struct rcu_node *rnp; | ||
| 199 | |||
| 200 | if (t->rcu_read_lock_nesting > 0 && | ||
| 201 | (t->rcu_read_unlock_special & RCU_READ_UNLOCK_BLOCKED) == 0) { | ||
| 202 | |||
| 203 | /* Possibly blocking in an RCU read-side critical section. */ | ||
| 204 | rdp = per_cpu_ptr(rcu_preempt_state.rda, cpu); | ||
| 205 | rnp = rdp->mynode; | ||
| 206 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
| 207 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; | ||
| 208 | t->rcu_blocked_node = rnp; | ||
| 209 | |||
| 210 | /* | ||
| 211 | * If this CPU has already checked in, then this task | ||
| 212 | * will hold up the next grace period rather than the | ||
| 213 | * current grace period. Queue the task accordingly. | ||
| 214 | * If the task is queued for the current grace period | ||
| 215 | * (i.e., this CPU has not yet passed through a quiescent | ||
| 216 | * state for the current grace period), then as long | ||
| 217 | * as that task remains queued, the current grace period | ||
| 218 | * cannot end. Note that there is some uncertainty as | ||
| 219 | * to exactly when the current grace period started. | ||
| 220 | * We take a conservative approach, which can result | ||
| 221 | * in unnecessarily waiting on tasks that started very | ||
| 222 | * slightly after the current grace period began. C'est | ||
| 223 | * la vie!!! | ||
| 224 | * | ||
| 225 | * But first, note that the current CPU must still be | ||
| 226 | * on line! | ||
| 227 | */ | ||
| 228 | WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0); | ||
| 229 | WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); | ||
| 230 | if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) { | ||
| 231 | list_add(&t->rcu_node_entry, rnp->gp_tasks->prev); | ||
| 232 | rnp->gp_tasks = &t->rcu_node_entry; | ||
| 233 | #ifdef CONFIG_RCU_BOOST | ||
| 234 | if (rnp->boost_tasks != NULL) | ||
| 235 | rnp->boost_tasks = rnp->gp_tasks; | ||
| 236 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 237 | } else { | ||
| 238 | list_add(&t->rcu_node_entry, &rnp->blkd_tasks); | ||
| 239 | if (rnp->qsmask & rdp->grpmask) | ||
| 240 | rnp->gp_tasks = &t->rcu_node_entry; | ||
| 241 | } | ||
| 242 | trace_rcu_preempt_task(rdp->rsp->name, | ||
| 243 | t->pid, | ||
| 244 | (rnp->qsmask & rdp->grpmask) | ||
| 245 | ? rnp->gpnum | ||
| 246 | : rnp->gpnum + 1); | ||
| 247 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 248 | } else if (t->rcu_read_lock_nesting < 0 && | ||
| 249 | t->rcu_read_unlock_special) { | ||
| 250 | |||
| 251 | /* | ||
| 252 | * Complete exit from RCU read-side critical section on | ||
| 253 | * behalf of preempted instance of __rcu_read_unlock(). | ||
| 254 | */ | ||
| 255 | rcu_read_unlock_special(t); | ||
| 256 | } | ||
| 257 | |||
| 258 | /* | ||
| 259 | * Either we were not in an RCU read-side critical section to | ||
| 260 | * begin with, or we have now recorded that critical section | ||
| 261 | * globally. Either way, we can now note a quiescent state | ||
| 262 | * for this CPU. Again, if we were in an RCU read-side critical | ||
| 263 | * section, and if that critical section was blocking the current | ||
| 264 | * grace period, then the fact that the task has been enqueued | ||
| 265 | * means that we continue to block the current grace period. | ||
| 266 | */ | ||
| 267 | local_irq_save(flags); | ||
| 268 | rcu_preempt_qs(cpu); | ||
| 269 | local_irq_restore(flags); | ||
| 270 | } | ||
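The queuing above keeps every task blocking the current grace period in the segment from *gp_tasks to the tail of ->blkd_tasks: a new current-GP blocker is linked in just before the old *gp_tasks and becomes the new *gp_tasks, while other tasks go to the head and age toward the tail. A userspace sketch with simplified, hypothetical types (the kernel uses list_head and holds the rcu_node ->lock):

#include <stdio.h>

struct entry { struct entry *prev, *next; const char *name; };

static struct entry blkd = { &blkd, &blkd, "head" };    /* circular list head */
static struct entry *gp_tasks;                          /* first current-GP blocker */

static void insert_after(struct entry *e, struct entry *pos)
{
    e->next = pos->next;
    e->prev = pos;
    pos->next->prev = e;
    pos->next = e;
}

static void note_blocked(struct entry *t, int blocks_current_gp)
{
    if (blocks_current_gp && gp_tasks) {
        insert_after(t, gp_tasks->prev);    /* just before old *gp_tasks */
        gp_tasks = t;
    } else {
        insert_after(t, &blkd);             /* at the head of the list */
        if (blocks_current_gp)
            gp_tasks = t;
    }
}

int main(void)
{
    struct entry a = { .name = "A" }, b = { .name = "B" }, c = { .name = "C" };
    struct entry *p;

    note_blocked(&a, 1);    /* blocks the current GP */
    note_blocked(&b, 0);    /* will only block the next GP */
    note_blocked(&c, 1);    /* blocks the current GP */

    for (p = blkd.next; p != &blkd; p = p->next)
        printf("%s%s\n", p->name, p == gp_tasks ? "  <- gp_tasks" : "");
    return 0;    /* prints B, then C (gp_tasks), then A */
}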
| 271 | |||
| 272 | /* | ||
| 273 | * Check for preempted RCU readers blocking the current grace period | ||
| 274 | * for the specified rcu_node structure. If the caller needs a reliable | ||
| 275 | * answer, it must hold the rcu_node's ->lock. | ||
| 276 | */ | ||
| 277 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) | ||
| 278 | { | ||
| 279 | return rnp->gp_tasks != NULL; | ||
| 280 | } | ||
| 281 | |||
| 282 | /* | ||
| 283 | * Record a quiescent state for all tasks that were previously queued | ||
| 284 | * on the specified rcu_node structure and that were blocking the current | ||
| 285 | * RCU grace period. The caller must hold the specified rnp->lock with | ||
| 286 | * irqs disabled, and this lock is released upon return, but irqs remain | ||
| 287 | * disabled. | ||
| 288 | */ | ||
| 289 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) | ||
| 290 | __releases(rnp->lock) | ||
| 291 | { | ||
| 292 | unsigned long mask; | ||
| 293 | struct rcu_node *rnp_p; | ||
| 294 | |||
| 295 | if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { | ||
| 296 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 297 | return; /* Still need more quiescent states! */ | ||
| 298 | } | ||
| 299 | |||
| 300 | rnp_p = rnp->parent; | ||
| 301 | if (rnp_p == NULL) { | ||
| 302 | /* | ||
| 303 | * Either there is only one rcu_node in the tree, | ||
| 304 | * or tasks were kicked up to root rcu_node due to | ||
| 305 | * CPUs going offline. | ||
| 306 | */ | ||
| 307 | rcu_report_qs_rsp(&rcu_preempt_state, flags); | ||
| 308 | return; | ||
| 309 | } | ||
| 310 | |||
| 311 | /* Report up the rest of the hierarchy. */ | ||
| 312 | mask = rnp->grpmask; | ||
| 313 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | ||
| 314 | raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ | ||
| 315 | rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); | ||
| 316 | } | ||
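rcu_report_qs_rnp(), invoked just above on the parent, then climbs the tree: each node clears its ->grpmask bit in its parent's ->qsmask and keeps going only while the mask it cleared drops to zero. A standalone sketch of that propagation with a hypothetical two-level tree (the real code also handles blocked readers and drops ->lock at every step):

#include <stdio.h>

struct node {
    unsigned long qsmask;    /* children still owing a report */
    unsigned long grpmask;   /* this node's bit in parent's qsmask */
    struct node *parent;
};

static void report_qs(struct node *rnp, unsigned long mask)
{
    for (;;) {
        rnp->qsmask &= ~mask;           /* this child has reported */
        if (rnp->qsmask != 0 || !rnp->parent)
            break;                      /* still waiting, or at the root */
        mask = rnp->grpmask;            /* our bit in the parent's mask */
        rnp = rnp->parent;
    }
    if (!rnp->parent && rnp->qsmask == 0)
        printf("grace period can end\n");
}

int main(void)
{
    struct node root  = { .qsmask = 0x3 };
    struct node leaf0 = { .qsmask = 0x1, .grpmask = 0x1, .parent = &root };
    struct node leaf1 = { .qsmask = 0x1, .grpmask = 0x2, .parent = &root };

    report_qs(&leaf0, 0x1);    /* root still waits on leaf1 */
    report_qs(&leaf1, 0x1);    /* prints "grace period can end" */
    return 0;
}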
| 317 | |||
| 318 | /* | ||
| 319 | * Advance a ->blkd_tasks-list pointer to the next entry, instead | ||
| 320 | * returning NULL if at the end of the list. | ||
| 321 | */ | ||
| 322 | static struct list_head *rcu_next_node_entry(struct task_struct *t, | ||
| 323 | struct rcu_node *rnp) | ||
| 324 | { | ||
| 325 | struct list_head *np; | ||
| 326 | |||
| 327 | np = t->rcu_node_entry.next; | ||
| 328 | if (np == &rnp->blkd_tasks) | ||
| 329 | np = NULL; | ||
| 330 | return np; | ||
| 331 | } | ||
| 332 | |||
| 333 | /* | ||
| 334 | * Handle special cases during rcu_read_unlock(), such as needing to | ||
| 335 | * notify RCU core processing or task having blocked during the RCU | ||
| 336 | * read-side critical section. | ||
| 337 | */ | ||
| 338 | void rcu_read_unlock_special(struct task_struct *t) | ||
| 339 | { | ||
| 340 | int empty; | ||
| 341 | int empty_exp; | ||
| 342 | int empty_exp_now; | ||
| 343 | unsigned long flags; | ||
| 344 | struct list_head *np; | ||
| 345 | #ifdef CONFIG_RCU_BOOST | ||
| 346 | struct rt_mutex *rbmp = NULL; | ||
| 347 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 348 | struct rcu_node *rnp; | ||
| 349 | int special; | ||
| 350 | |||
| 351 | /* NMI handlers cannot block and cannot safely manipulate state. */ | ||
| 352 | if (in_nmi()) | ||
| 353 | return; | ||
| 354 | |||
| 355 | local_irq_save(flags); | ||
| 356 | |||
| 357 | /* | ||
| 358 | * If RCU core is waiting for this CPU to exit critical section, | ||
| 359 | * let it know that we have done so. | ||
| 360 | */ | ||
| 361 | special = t->rcu_read_unlock_special; | ||
| 362 | if (special & RCU_READ_UNLOCK_NEED_QS) { | ||
| 363 | rcu_preempt_qs(smp_processor_id()); | ||
| 364 | } | ||
| 365 | |||
| 366 | /* Hardware IRQ and softirq handlers cannot block. */ | ||
| 367 | if (in_irq() || in_serving_softirq()) { | ||
| 368 | local_irq_restore(flags); | ||
| 369 | return; | ||
| 370 | } | ||
| 371 | |||
| 372 | /* Clean up if blocked during RCU read-side critical section. */ | ||
| 373 | if (special & RCU_READ_UNLOCK_BLOCKED) { | ||
| 374 | t->rcu_read_unlock_special &= ~RCU_READ_UNLOCK_BLOCKED; | ||
| 375 | |||
| 376 | /* | ||
| 377 | * Remove this task from the list it blocked on. The | ||
| 378 | * task can migrate while we acquire the lock, but at | ||
| 379 | * most one time, so there are at most two passes through the loop. | ||
| 380 | */ | ||
| 381 | for (;;) { | ||
| 382 | rnp = t->rcu_blocked_node; | ||
| 383 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | ||
| 384 | if (rnp == t->rcu_blocked_node) | ||
| 385 | break; | ||
| 386 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | ||
| 387 | } | ||
| 388 | empty = !rcu_preempt_blocked_readers_cgp(rnp); | ||
| 389 | empty_exp = !rcu_preempted_readers_exp(rnp); | ||
| 390 | smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ | ||
| 391 | np = rcu_next_node_entry(t, rnp); | ||
| 392 | list_del_init(&t->rcu_node_entry); | ||
| 393 | t->rcu_blocked_node = NULL; | ||
| 394 | trace_rcu_unlock_preempted_task(TPS("rcu_preempt"), | ||
| 395 | rnp->gpnum, t->pid); | ||
| 396 | if (&t->rcu_node_entry == rnp->gp_tasks) | ||
| 397 | rnp->gp_tasks = np; | ||
| 398 | if (&t->rcu_node_entry == rnp->exp_tasks) | ||
| 399 | rnp->exp_tasks = np; | ||
| 400 | #ifdef CONFIG_RCU_BOOST | ||
| 401 | if (&t->rcu_node_entry == rnp->boost_tasks) | ||
| 402 | rnp->boost_tasks = np; | ||
| 403 | /* Snapshot/clear ->rcu_boost_mutex with rcu_node lock held. */ | ||
| 404 | if (t->rcu_boost_mutex) { | ||
| 405 | rbmp = t->rcu_boost_mutex; | ||
| 406 | t->rcu_boost_mutex = NULL; | ||
| 407 | } | ||
| 408 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 409 | |||
| 410 | /* | ||
| 411 | * If this was the last task on the current list, and if | ||
| 412 | * we aren't waiting on any CPUs, report the quiescent state. | ||
| 413 | * Note that rcu_report_unblock_qs_rnp() releases rnp->lock, | ||
| 414 | * so we must take a snapshot of the expedited state. | ||
| 415 | */ | ||
| 416 | empty_exp_now = !rcu_preempted_readers_exp(rnp); | ||
| 417 | if (!empty && !rcu_preempt_blocked_readers_cgp(rnp)) { | ||
| 418 | trace_rcu_quiescent_state_report(TPS("preempt_rcu"), | ||
| 419 | rnp->gpnum, | ||
| 420 | 0, rnp->qsmask, | ||
| 421 | rnp->level, | ||
| 422 | rnp->grplo, | ||
| 423 | rnp->grphi, | ||
| 424 | !!rnp->gp_tasks); | ||
| 425 | rcu_report_unblock_qs_rnp(rnp, flags); | ||
| 426 | } else { | ||
| 427 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 428 | } | ||
| 429 | |||
| 430 | #ifdef CONFIG_RCU_BOOST | ||
| 431 | /* Unboost if we were boosted. */ | ||
| 432 | if (rbmp) | ||
| 433 | rt_mutex_unlock(rbmp); | ||
| 434 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 435 | |||
| 436 | /* | ||
| 437 | * If this was the last task on the expedited lists, | ||
| 438 | * then we need to report up the rcu_node hierarchy. | ||
| 439 | */ | ||
| 440 | if (!empty_exp && empty_exp_now) | ||
| 441 | rcu_report_exp_rnp(&rcu_preempt_state, rnp, true); | ||
| 442 | } else { | ||
| 443 | local_irq_restore(flags); | ||
| 444 | } | ||
| 445 | } | ||
| 446 | |||
| 447 | #ifdef CONFIG_RCU_CPU_STALL_VERBOSE | ||
| 448 | |||
| 449 | /* | ||
| 450 | * Dump detailed information for all tasks blocking the current RCU | ||
| 451 | * grace period on the specified rcu_node structure. | ||
| 452 | */ | ||
| 453 | static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) | ||
| 454 | { | ||
| 455 | unsigned long flags; | ||
| 456 | struct task_struct *t; | ||
| 457 | |||
| 458 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
| 459 | if (!rcu_preempt_blocked_readers_cgp(rnp)) { | ||
| 460 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 461 | return; | ||
| 462 | } | ||
| 463 | t = list_entry(rnp->gp_tasks, | ||
| 464 | struct task_struct, rcu_node_entry); | ||
| 465 | list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) | ||
| 466 | sched_show_task(t); | ||
| 467 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 468 | } | ||
| 469 | |||
| 470 | /* | ||
| 471 | * Dump detailed information for all tasks blocking the current RCU | ||
| 472 | * grace period. | ||
| 473 | */ | ||
| 474 | static void rcu_print_detail_task_stall(struct rcu_state *rsp) | ||
| 475 | { | ||
| 476 | struct rcu_node *rnp = rcu_get_root(rsp); | ||
| 477 | |||
| 478 | rcu_print_detail_task_stall_rnp(rnp); | ||
| 479 | rcu_for_each_leaf_node(rsp, rnp) | ||
| 480 | rcu_print_detail_task_stall_rnp(rnp); | ||
| 481 | } | ||
| 482 | |||
| 483 | #else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ | ||
| 484 | |||
| 485 | static void rcu_print_detail_task_stall(struct rcu_state *rsp) | ||
| 486 | { | ||
| 487 | } | ||
| 488 | |||
| 489 | #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ | ||
| 490 | |||
| 491 | #ifdef CONFIG_RCU_CPU_STALL_INFO | ||
| 492 | |||
| 493 | static void rcu_print_task_stall_begin(struct rcu_node *rnp) | ||
| 494 | { | ||
| 495 | pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):", | ||
| 496 | rnp->level, rnp->grplo, rnp->grphi); | ||
| 497 | } | ||
| 498 | |||
| 499 | static void rcu_print_task_stall_end(void) | ||
| 500 | { | ||
| 501 | pr_cont("\n"); | ||
| 502 | } | ||
| 503 | |||
| 504 | #else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ | ||
| 505 | |||
| 506 | static void rcu_print_task_stall_begin(struct rcu_node *rnp) | ||
| 507 | { | ||
| 508 | } | ||
| 509 | |||
| 510 | static void rcu_print_task_stall_end(void) | ||
| 511 | { | ||
| 512 | } | ||
| 513 | |||
| 514 | #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ | ||
| 515 | |||
| 516 | /* | ||
| 517 | * Scan the current list of tasks blocked within RCU read-side critical | ||
| 518 | * sections, printing out the tid of each. | ||
| 519 | */ | ||
| 520 | static int rcu_print_task_stall(struct rcu_node *rnp) | ||
| 521 | { | ||
| 522 | struct task_struct *t; | ||
| 523 | int ndetected = 0; | ||
| 524 | |||
| 525 | if (!rcu_preempt_blocked_readers_cgp(rnp)) | ||
| 526 | return 0; | ||
| 527 | rcu_print_task_stall_begin(rnp); | ||
| 528 | t = list_entry(rnp->gp_tasks, | ||
| 529 | struct task_struct, rcu_node_entry); | ||
| 530 | list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) { | ||
| 531 | pr_cont(" P%d", t->pid); | ||
| 532 | ndetected++; | ||
| 533 | } | ||
| 534 | rcu_print_task_stall_end(); | ||
| 535 | return ndetected; | ||
| 536 | } | ||
| 537 | |||
| 538 | /* | ||
| 539 | * Check that the list of blocked tasks for the newly completed grace | ||
| 540 | * period is in fact empty. It is a serious bug to complete a grace | ||
| 541 | * period that still has RCU readers blocked! This function must be | ||
| 542 | * invoked -before- updating this rnp's ->gpnum, and the rnp's ->lock | ||
| 543 | * must be held by the caller. | ||
| 544 | * | ||
| 545 | * Also, if there are blocked tasks on the list, they automatically | ||
| 546 | * block the newly created grace period, so set up ->gp_tasks accordingly. | ||
| 547 | */ | ||
| 548 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) | ||
| 549 | { | ||
| 550 | WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); | ||
| 551 | if (!list_empty(&rnp->blkd_tasks)) | ||
| 552 | rnp->gp_tasks = rnp->blkd_tasks.next; | ||
| 553 | WARN_ON_ONCE(rnp->qsmask); | ||
| 554 | } | ||
| 555 | |||
| 556 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 557 | |||
| 558 | /* | ||
| 559 | * Handle tasklist migration for case in which all CPUs covered by the | ||
| 560 | * specified rcu_node have gone offline. Move them up to the root | ||
| 561 | * rcu_node. The reason for not just moving them to the immediate | ||
| 562 | * parent is to remove the need for rcu_read_unlock_special() to | ||
| 563 | * make more than two attempts to acquire the target rcu_node's lock. | ||
| 564 | * Returns a mask of RCU_OFL_TASKS_NORM_GP and RCU_OFL_TASKS_EXP_GP | ||
| 565 | * bits indicating whether tasks queued on the specified rcu_node | ||
| 566 | * structure were blocking the current normal and/or expedited RCU | ||
| 567 | * grace period, or zero if no queued task was blocking either kind | ||
| 568 | * of grace period. | ||
| 569 | * | ||
| 570 | * The caller must hold rnp->lock with irqs disabled. | ||
| 571 | */ | ||
| 572 | static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | ||
| 573 | struct rcu_node *rnp, | ||
| 574 | struct rcu_data *rdp) | ||
| 575 | { | ||
| 576 | struct list_head *lp; | ||
| 577 | struct list_head *lp_root; | ||
| 578 | int retval = 0; | ||
| 579 | struct rcu_node *rnp_root = rcu_get_root(rsp); | ||
| 580 | struct task_struct *t; | ||
| 581 | |||
| 582 | if (rnp == rnp_root) { | ||
| 583 | WARN_ONCE(1, "Last CPU thought to be offlined?"); | ||
| 584 | return 0; /* Shouldn't happen: at least one CPU online. */ | ||
| 585 | } | ||
| 586 | |||
| 587 | /* If we are on an internal node, complain bitterly. */ | ||
| 588 | WARN_ON_ONCE(rnp != rdp->mynode); | ||
| 589 | |||
| 590 | /* | ||
| 591 | * Move tasks up to root rcu_node. Don't try to get fancy for | ||
| 592 | * this corner-case operation -- just put this node's tasks | ||
| 593 | * at the head of the root node's list, and update the root node's | ||
| 594 | * ->gp_tasks and ->exp_tasks pointers to those of this node's, | ||
| 595 | * if non-NULL. This might result in waiting for more tasks than | ||
| 596 | * absolutely necessary, but this is a good performance/complexity | ||
| 597 | * tradeoff. | ||
| 598 | */ | ||
| 599 | if (rcu_preempt_blocked_readers_cgp(rnp) && rnp->qsmask == 0) | ||
| 600 | retval |= RCU_OFL_TASKS_NORM_GP; | ||
| 601 | if (rcu_preempted_readers_exp(rnp)) | ||
| 602 | retval |= RCU_OFL_TASKS_EXP_GP; | ||
| 603 | lp = &rnp->blkd_tasks; | ||
| 604 | lp_root = &rnp_root->blkd_tasks; | ||
| 605 | while (!list_empty(lp)) { | ||
| 606 | t = list_entry(lp->next, typeof(*t), rcu_node_entry); | ||
| 607 | raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ | ||
| 608 | list_del(&t->rcu_node_entry); | ||
| 609 | t->rcu_blocked_node = rnp_root; | ||
| 610 | list_add(&t->rcu_node_entry, lp_root); | ||
| 611 | if (&t->rcu_node_entry == rnp->gp_tasks) | ||
| 612 | rnp_root->gp_tasks = rnp->gp_tasks; | ||
| 613 | if (&t->rcu_node_entry == rnp->exp_tasks) | ||
| 614 | rnp_root->exp_tasks = rnp->exp_tasks; | ||
| 615 | #ifdef CONFIG_RCU_BOOST | ||
| 616 | if (&t->rcu_node_entry == rnp->boost_tasks) | ||
| 617 | rnp_root->boost_tasks = rnp->boost_tasks; | ||
| 618 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 619 | raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ | ||
| 620 | } | ||
| 621 | |||
| 622 | rnp->gp_tasks = NULL; | ||
| 623 | rnp->exp_tasks = NULL; | ||
| 624 | #ifdef CONFIG_RCU_BOOST | ||
| 625 | rnp->boost_tasks = NULL; | ||
| 626 | /* | ||
| 627 | * If the root was being boosted but this leaf was not, make sure | ||
| 628 | * that the tasks blocking the current grace period are boosted | ||
| 629 | * on the root as well. | ||
| 630 | */ | ||
| 631 | raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ | ||
| 632 | if (rnp_root->boost_tasks != NULL && | ||
| 633 | rnp_root->boost_tasks != rnp_root->gp_tasks && | ||
| 634 | rnp_root->boost_tasks != rnp_root->exp_tasks) | ||
| 635 | rnp_root->boost_tasks = rnp_root->gp_tasks; | ||
| 636 | raw_spin_unlock(&rnp_root->lock); /* irqs still disabled */ | ||
| 637 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 638 | |||
| 639 | return retval; | ||
| 640 | } | ||
| 641 | |||
| 642 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
| 643 | |||
| 644 | /* | ||
| 645 | * Check for a quiescent state from the current CPU. When a task blocks, | ||
| 646 | * the task is recorded in the corresponding CPU's rcu_node structure, | ||
| 647 | * which is checked elsewhere. | ||
| 648 | * | ||
| 649 | * Caller must disable hard irqs. | ||
| 650 | */ | ||
| 651 | static void rcu_preempt_check_callbacks(int cpu) | ||
| 652 | { | ||
| 653 | struct task_struct *t = current; | ||
| 654 | |||
| 655 | if (t->rcu_read_lock_nesting == 0) { | ||
| 656 | rcu_preempt_qs(cpu); | ||
| 657 | return; | ||
| 658 | } | ||
| 659 | if (t->rcu_read_lock_nesting > 0 && | ||
| 660 | per_cpu(rcu_preempt_data, cpu).qs_pending) | ||
| 661 | t->rcu_read_unlock_special |= RCU_READ_UNLOCK_NEED_QS; | ||
| 662 | } | ||
| 663 | |||
| 664 | #ifdef CONFIG_RCU_BOOST | ||
| 665 | |||
| 666 | static void rcu_preempt_do_callbacks(void) | ||
| 667 | { | ||
| 668 | rcu_do_batch(&rcu_preempt_state, this_cpu_ptr(&rcu_preempt_data)); | ||
| 669 | } | ||
| 670 | |||
| 671 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 672 | |||
| 673 | /* | ||
| 674 | * Queue a preemptible-RCU callback for invocation after a grace period. | ||
| 675 | */ | ||
| 676 | void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu)) | ||
| 677 | { | ||
| 678 | __call_rcu(head, func, &rcu_preempt_state, -1, 0); | ||
| 679 | } | ||
| 680 | EXPORT_SYMBOL_GPL(call_rcu); | ||
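For reference, a typical call_rcu() caller embeds the rcu_head in the structure it protects and reclaims the structure from the callback. A minimal sketch, not part of this file (struct foo, foo_reclaim(), and foo_remove() are hypothetical names):

#include <linux/list.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>

struct foo {
        struct list_head list;          /* Linked on an RCU-protected list. */
        int data;
        struct rcu_head rcu;            /* Storage for the deferred free. */
};

static void foo_reclaim(struct rcu_head *rcu)
{
        struct foo *fp = container_of(rcu, struct foo, rcu);

        kfree(fp);                      /* Runs only after a grace period. */
}

static void foo_remove(struct foo *fp)
{
        list_del_rcu(&fp->list);        /* Hide fp from new readers. */
        call_rcu(&fp->rcu, foo_reclaim);/* Defer the free past old readers. */
}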
| 681 | |||
| 682 | /* | ||
| 683 | * Queue an RCU callback for lazy invocation after a grace period. | ||
| 684 | * This will likely be later named something like "call_rcu_lazy()", | ||
| 685 | * but this change will require some way of tagging the lazy RCU | ||
| 686 | * callbacks in the list of pending callbacks. Until then, this | ||
| 687 | * function may only be called from __kfree_rcu(). | ||
| 688 | */ | ||
| 689 | void kfree_call_rcu(struct rcu_head *head, | ||
| 690 | void (*func)(struct rcu_head *rcu)) | ||
| 691 | { | ||
| 692 | __call_rcu(head, func, &rcu_preempt_state, -1, 1); | ||
| 693 | } | ||
| 694 | EXPORT_SYMBOL_GPL(kfree_call_rcu); | ||
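Callers normally reach kfree_call_rcu() through the kfree_rcu() macro rather than invoking it directly; a hypothetical user might look like this sketch (struct bar and bar_remove() are illustrative only):

struct bar {
        int data;
        struct rcu_head rcu;
};

static void bar_remove(struct bar *bp)
{
        /* Queues a lazy callback that kfree()s bp after a grace period. */
        kfree_rcu(bp, rcu);
}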
| 695 | |||
| 696 | /** | ||
| 697 | * synchronize_rcu - wait until a grace period has elapsed. | ||
| 698 | * | ||
| 699 | * Control will return to the caller some time after a full grace | ||
| 700 | * period has elapsed, in other words after all currently executing RCU | ||
| 701 | * read-side critical sections have completed. Note, however, that | ||
| 702 | * upon return from synchronize_rcu(), the caller might well be executing | ||
| 703 | * concurrently with new RCU read-side critical sections that began while | ||
| 704 | * synchronize_rcu() was waiting. RCU read-side critical sections are | ||
| 705 | * delimited by rcu_read_lock() and rcu_read_unlock(), and may be nested. | ||
| 706 | * | ||
| 707 | * See the description of synchronize_sched() for more detailed information | ||
| 708 | * on memory ordering guarantees. | ||
| 709 | */ | ||
| 710 | void synchronize_rcu(void) | ||
| 711 | { | ||
| 712 | rcu_lockdep_assert(!lock_is_held(&rcu_bh_lock_map) && | ||
| 713 | !lock_is_held(&rcu_lock_map) && | ||
| 714 | !lock_is_held(&rcu_sched_lock_map), | ||
| 715 | "Illegal synchronize_rcu() in RCU read-side critical section"); | ||
| 716 | if (!rcu_scheduler_active) | ||
| 717 | return; | ||
| 718 | if (rcu_expedited) | ||
| 719 | synchronize_rcu_expedited(); | ||
| 720 | else | ||
| 721 | wait_rcu_gp(call_rcu); | ||
| 722 | } | ||
| 723 | EXPORT_SYMBOL_GPL(synchronize_rcu); | ||
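To illustrate the guarantee, here is a minimal update-side and read-side sketch, reusing the hypothetical struct foo above; foo_mutex, global_foo, foo_replace(), and foo_get_data() are likewise assumptions, not part of this file:

static DEFINE_MUTEX(foo_mutex);                 /* Serializes updaters. */
static struct foo __rcu *global_foo;            /* RCU-protected pointer. */

static void foo_replace(struct foo *new_fp)
{
        struct foo *old_fp;

        mutex_lock(&foo_mutex);
        old_fp = rcu_dereference_protected(global_foo,
                                           lockdep_is_held(&foo_mutex));
        rcu_assign_pointer(global_foo, new_fp); /* Publish the new version. */
        mutex_unlock(&foo_mutex);
        synchronize_rcu();      /* Wait out readers that might see old_fp. */
        kfree(old_fp);          /* No reader can still hold a reference. */
}

static int foo_get_data(void)
{
        struct foo *fp;
        int ret = -1;

        rcu_read_lock();
        fp = rcu_dereference(global_foo);
        if (fp)
                ret = fp->data;
        rcu_read_unlock();
        return ret;
}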
| 724 | |||
| 725 | static DECLARE_WAIT_QUEUE_HEAD(sync_rcu_preempt_exp_wq); | ||
| 726 | static unsigned long sync_rcu_preempt_exp_count; | ||
| 727 | static DEFINE_MUTEX(sync_rcu_preempt_exp_mutex); | ||
| 728 | |||
| 729 | /* | ||
| 730 | * Return non-zero if there are any tasks in RCU read-side critical | ||
| 731 | * sections blocking the current preemptible-RCU expedited grace period. | ||
| 732 | * If there is no preemptible-RCU expedited grace period currently in | ||
| 733 | * progress, returns zero unconditionally. | ||
| 734 | */ | ||
| 735 | static int rcu_preempted_readers_exp(struct rcu_node *rnp) | ||
| 736 | { | ||
| 737 | return rnp->exp_tasks != NULL; | ||
| 738 | } | ||
| 739 | |||
| 740 | /* | ||
| 741 | * Return non-zero if there is no RCU expedited grace period in progress | ||
| 742 | * for the specified rcu_node structure, in other words, if all CPUs and | ||
| 743 | * tasks covered by the specified rcu_node structure have done their bit | ||
| 744 | * for the current expedited grace period. Works only for preemptible | ||
| 745 | * RCU -- other RCU implementations use other means. | ||
| 746 | * | ||
| 747 | * Caller must hold sync_rcu_preempt_exp_mutex. | ||
| 748 | */ | ||
| 749 | static int sync_rcu_preempt_exp_done(struct rcu_node *rnp) | ||
| 750 | { | ||
| 751 | return !rcu_preempted_readers_exp(rnp) && | ||
| 752 | ACCESS_ONCE(rnp->expmask) == 0; | ||
| 753 | } | ||
| 754 | |||
| 755 | /* | ||
| 756 | * Report the exit from RCU read-side critical section for the last task | ||
| 757 | * that queued itself during or before the current expedited preemptible-RCU | ||
| 758 | * grace period. This event is reported either to the rcu_node structure on | ||
| 759 | * which the task was queued or to one of that rcu_node structure's ancestors, | ||
| 760 | * recursively up the tree. (Calm down, calm down, we do the recursion | ||
| 761 | * iteratively!) | ||
| 762 | * | ||
| 763 | * Most callers will set the "wake" flag, but the task initiating the | ||
| 764 | * expedited grace period need not wake itself. | ||
| 765 | * | ||
| 766 | * Caller must hold sync_rcu_preempt_exp_mutex. | ||
| 767 | */ | ||
| 768 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | ||
| 769 | bool wake) | ||
| 770 | { | ||
| 771 | unsigned long flags; | ||
| 772 | unsigned long mask; | ||
| 773 | |||
| 774 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
| 775 | for (;;) { | ||
| 776 | if (!sync_rcu_preempt_exp_done(rnp)) { | ||
| 777 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 778 | break; | ||
| 779 | } | ||
| 780 | if (rnp->parent == NULL) { | ||
| 781 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 782 | if (wake) | ||
| 783 | wake_up(&sync_rcu_preempt_exp_wq); | ||
| 784 | break; | ||
| 785 | } | ||
| 786 | mask = rnp->grpmask; | ||
| 787 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ | ||
| 788 | rnp = rnp->parent; | ||
| 789 | raw_spin_lock(&rnp->lock); /* irqs already disabled */ | ||
| 790 | rnp->expmask &= ~mask; | ||
| 791 | } | ||
| 792 | } | ||
| 793 | |||
| 794 | /* | ||
| 795 | * Snapshot the tasks blocking the newly started preemptible-RCU expedited | ||
| 796 | * grace period for the specified rcu_node structure. If there are no such | ||
| 797 | * tasks, report it up the rcu_node hierarchy. | ||
| 798 | * | ||
| 799 | * Caller must hold sync_rcu_preempt_exp_mutex and must exclude | ||
| 800 | * CPU hotplug operations. | ||
| 801 | */ | ||
| 802 | static void | ||
| 803 | sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) | ||
| 804 | { | ||
| 805 | unsigned long flags; | ||
| 806 | int must_wait = 0; | ||
| 807 | |||
| 808 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
| 809 | if (list_empty(&rnp->blkd_tasks)) { | ||
| 810 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 811 | } else { | ||
| 812 | rnp->exp_tasks = rnp->blkd_tasks.next; | ||
| 813 | rcu_initiate_boost(rnp, flags); /* releases rnp->lock */ | ||
| 814 | must_wait = 1; | ||
| 815 | } | ||
| 816 | if (!must_wait) | ||
| 817 | rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */ | ||
| 818 | } | ||
| 819 | |||
| 820 | /** | ||
| 821 | * synchronize_rcu_expedited - Brute-force RCU grace period | ||
| 822 | * | ||
| 823 | * Wait for an RCU-preempt grace period, but expedite it. The basic | ||
| 824 | * idea is to invoke synchronize_sched_expedited() to push all the tasks to | ||
| 825 | * the ->blkd_tasks lists and wait for those lists to drain. This consumes | ||
| 826 | * significant time on all CPUs and is unfriendly to real-time workloads, | ||
| 827 | * so is thus not recommended for any sort of common-case code. | ||
| 828 | * In fact, if you are using synchronize_rcu_expedited() in a loop, | ||
| 829 | * please restructure your code to batch your updates, and then use a | ||
| 830 | * single synchronize_rcu() instead. | ||
| 831 | * | ||
| 832 | * Note that it is illegal to call this function while holding any lock | ||
| 833 | * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal | ||
| 834 | * to call this function from a CPU-hotplug notifier. Failing to observe | ||
| 835 | * these restrictions will result in deadlock. | ||
| 836 | */ | ||
| 837 | void synchronize_rcu_expedited(void) | ||
| 838 | { | ||
| 839 | unsigned long flags; | ||
| 840 | struct rcu_node *rnp; | ||
| 841 | struct rcu_state *rsp = &rcu_preempt_state; | ||
| 842 | unsigned long snap; | ||
| 843 | int trycount = 0; | ||
| 844 | |||
| 845 | smp_mb(); /* Caller's modifications seen first by other CPUs. */ | ||
| 846 | snap = ACCESS_ONCE(sync_rcu_preempt_exp_count) + 1; | ||
| 847 | smp_mb(); /* Above access cannot bleed into critical section. */ | ||
| 848 | |||
| 849 | /* | ||
| 850 | * Block CPU-hotplug operations. This means that any CPU-hotplug | ||
| 851 | * operation that finds an rcu_node structure with tasks in the | ||
| 852 | * process of being boosted will know that all tasks blocking | ||
| 853 | * this expedited grace period will already be in the process of | ||
| 854 | * being boosted. This simplifies the process of moving tasks | ||
| 855 | * from leaf to root rcu_node structures. | ||
| 856 | */ | ||
| 857 | get_online_cpus(); | ||
| 858 | |||
| 859 | /* | ||
| 860 | * Acquire lock, falling back to synchronize_rcu() if too many | ||
| 861 | * lock-acquisition failures. Of course, if someone does the | ||
| 862 | * expedited grace period for us, just leave. | ||
| 863 | */ | ||
| 864 | while (!mutex_trylock(&sync_rcu_preempt_exp_mutex)) { | ||
| 865 | if (ULONG_CMP_LT(snap, | ||
| 866 | ACCESS_ONCE(sync_rcu_preempt_exp_count))) { | ||
| 867 | put_online_cpus(); | ||
| 868 | goto mb_ret; /* Others did our work for us. */ | ||
| 869 | } | ||
| 870 | if (trycount++ < 10) { | ||
| 871 | udelay(trycount * num_online_cpus()); | ||
| 872 | } else { | ||
| 873 | put_online_cpus(); | ||
| 874 | wait_rcu_gp(call_rcu); | ||
| 875 | return; | ||
| 876 | } | ||
| 877 | } | ||
| 878 | if (ULONG_CMP_LT(snap, ACCESS_ONCE(sync_rcu_preempt_exp_count))) { | ||
| 879 | put_online_cpus(); | ||
| 880 | goto unlock_mb_ret; /* Others did our work for us. */ | ||
| 881 | } | ||
| 882 | |||
| 883 | /* Force all RCU readers onto ->blkd_tasks lists. */ | ||
| 884 | synchronize_sched_expedited(); | ||
| 885 | |||
| 886 | /* Initialize ->expmask for all non-leaf rcu_node structures. */ | ||
| 887 | rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { | ||
| 888 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
| 889 | rnp->expmask = rnp->qsmaskinit; | ||
| 890 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 891 | } | ||
| 892 | |||
| 893 | /* Snapshot current state of ->blkd_tasks lists. */ | ||
| 894 | rcu_for_each_leaf_node(rsp, rnp) | ||
| 895 | sync_rcu_preempt_exp_init(rsp, rnp); | ||
| 896 | if (NUM_RCU_NODES > 1) | ||
| 897 | sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); | ||
| 898 | |||
| 899 | put_online_cpus(); | ||
| 900 | |||
| 901 | /* Wait for snapshotted ->blkd_tasks lists to drain. */ | ||
| 902 | rnp = rcu_get_root(rsp); | ||
| 903 | wait_event(sync_rcu_preempt_exp_wq, | ||
| 904 | sync_rcu_preempt_exp_done(rnp)); | ||
| 905 | |||
| 906 | /* Clean up and exit. */ | ||
| 907 | smp_mb(); /* ensure expedited GP seen before counter increment. */ | ||
| 908 | ACCESS_ONCE(sync_rcu_preempt_exp_count)++; | ||
| 909 | unlock_mb_ret: | ||
| 910 | mutex_unlock(&sync_rcu_preempt_exp_mutex); | ||
| 911 | mb_ret: | ||
| 912 | smp_mb(); /* ensure subsequent action seen after grace period. */ | ||
| 913 | } | ||
| 914 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | ||
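To illustrate the batching advice above, the following sketch removes many elements under one update-side lock and then waits for a single normal grace period; foo_lock, foo_list, foo_prune_unused(), and the in_use/dispose fields of the hypothetical struct foo are assumptions, not part of this file:

static DEFINE_SPINLOCK(foo_lock);
static LIST_HEAD(foo_list);

static void foo_prune_unused(void)
{
        LIST_HEAD(doomed);
        struct foo *fp, *tmp;

        spin_lock(&foo_lock);
        list_for_each_entry_safe(fp, tmp, &foo_list, list) {
                if (!fp->in_use) {
                        list_del_rcu(&fp->list);         /* Unpublish. */
                        list_add(&fp->dispose, &doomed); /* Private list. */
                }
        }
        spin_unlock(&foo_lock);

        synchronize_rcu();      /* One grace period covers every removal. */

        list_for_each_entry_safe(fp, tmp, &doomed, dispose)
                kfree(fp);
}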
| 915 | |||
| 916 | /** | ||
| 917 | * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete. | ||
| 918 | * | ||
| 919 | * Note that this primitive does not necessarily wait for an RCU grace period | ||
| 920 | * to complete. For example, if there are no RCU callbacks queued anywhere | ||
| 921 | * in the system, then rcu_barrier() is within its rights to return | ||
| 922 | * immediately, without waiting for anything, much less an RCU grace period. | ||
| 923 | */ | ||
| 924 | void rcu_barrier(void) | ||
| 925 | { | ||
| 926 | _rcu_barrier(&rcu_preempt_state); | ||
| 927 | } | ||
| 928 | EXPORT_SYMBOL_GPL(rcu_barrier); | ||
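A common use is a module-exit path that must not return while call_rcu() callbacks might still reference module text or data; a hedged sketch (foo_unregister_interfaces() and foo_module_exit() are hypothetical):

static void __exit foo_module_exit(void)
{
        foo_unregister_interfaces();    /* Stop posting new callbacks. */
        rcu_barrier();                  /* Wait for the in-flight ones. */
        /* Only now is it safe for foo_reclaim() to disappear. */
}
module_exit(foo_module_exit);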
| 929 | |||
| 930 | /* | ||
| 931 | * Initialize preemptible RCU's state structures. | ||
| 932 | */ | ||
| 933 | static void __init __rcu_init_preempt(void) | ||
| 934 | { | ||
| 935 | rcu_init_one(&rcu_preempt_state, &rcu_preempt_data); | ||
| 936 | } | ||
| 937 | |||
| 938 | /* | ||
| 939 | * Check for a task exiting while in a preemptible-RCU read-side | ||
| 940 | * critical section, clean up if so. No need to issue warnings, | ||
| 941 | * as debug_check_no_locks_held() already does this if lockdep | ||
| 942 | * is enabled. | ||
| 943 | */ | ||
| 944 | void exit_rcu(void) | ||
| 945 | { | ||
| 946 | struct task_struct *t = current; | ||
| 947 | |||
| 948 | if (likely(list_empty(¤t->rcu_node_entry))) | ||
| 949 | return; | ||
| 950 | t->rcu_read_lock_nesting = 1; | ||
| 951 | barrier(); | ||
| 952 | t->rcu_read_unlock_special = RCU_READ_UNLOCK_BLOCKED; | ||
| 953 | __rcu_read_unlock(); | ||
| 954 | } | ||
| 955 | |||
| 956 | #else /* #ifdef CONFIG_TREE_PREEMPT_RCU */ | ||
| 957 | |||
| 958 | static struct rcu_state *rcu_state = &rcu_sched_state; | ||
| 959 | |||
| 960 | /* | ||
| 961 | * Tell them what RCU they are running. | ||
| 962 | */ | ||
| 963 | static void __init rcu_bootup_announce(void) | ||
| 964 | { | ||
| 965 | pr_info("Hierarchical RCU implementation.\n"); | ||
| 966 | rcu_bootup_announce_oddness(); | ||
| 967 | } | ||
| 968 | |||
| 969 | /* | ||
| 970 | * Return the number of RCU batches processed thus far for debug & stats. | ||
| 971 | */ | ||
| 972 | long rcu_batches_completed(void) | ||
| 973 | { | ||
| 974 | return rcu_batches_completed_sched(); | ||
| 975 | } | ||
| 976 | EXPORT_SYMBOL_GPL(rcu_batches_completed); | ||
| 977 | |||
| 978 | /* | ||
| 979 | * Force a quiescent state for RCU, which, because there is no preemptible | ||
| 980 | * RCU, becomes the same as rcu-sched. | ||
| 981 | */ | ||
| 982 | void rcu_force_quiescent_state(void) | ||
| 983 | { | ||
| 984 | rcu_sched_force_quiescent_state(); | ||
| 985 | } | ||
| 986 | EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); | ||
| 987 | |||
| 988 | /* | ||
| 989 | * Because preemptible RCU does not exist, we never have to check for | ||
| 990 | * CPUs being in quiescent states. | ||
| 991 | */ | ||
| 992 | static void rcu_preempt_note_context_switch(int cpu) | ||
| 993 | { | ||
| 994 | } | ||
| 995 | |||
| 996 | /* | ||
| 997 | * Because preemptible RCU does not exist, there are never any preempted | ||
| 998 | * RCU readers. | ||
| 999 | */ | ||
| 1000 | static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp) | ||
| 1001 | { | ||
| 1002 | return 0; | ||
| 1003 | } | ||
| 1004 | |||
| 1005 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 1006 | |||
| 1007 | /* Because preemptible RCU does not exist, no quieting of tasks. */ | ||
| 1008 | static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) | ||
| 1009 | { | ||
| 1010 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 1011 | } | ||
| 1012 | |||
| 1013 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
| 1014 | |||
| 1015 | /* | ||
| 1016 | * Because preemptible RCU does not exist, we never have to check for | ||
| 1017 | * tasks blocked within RCU read-side critical sections. | ||
| 1018 | */ | ||
| 1019 | static void rcu_print_detail_task_stall(struct rcu_state *rsp) | ||
| 1020 | { | ||
| 1021 | } | ||
| 1022 | |||
| 1023 | /* | ||
| 1024 | * Because preemptible RCU does not exist, we never have to check for | ||
| 1025 | * tasks blocked within RCU read-side critical sections. | ||
| 1026 | */ | ||
| 1027 | static int rcu_print_task_stall(struct rcu_node *rnp) | ||
| 1028 | { | ||
| 1029 | return 0; | ||
| 1030 | } | ||
| 1031 | |||
| 1032 | /* | ||
| 1033 | * Because there is no preemptible RCU, there can be no readers blocked, | ||
| 1034 | * so there is no need to check for blocked tasks. So check only for | ||
| 1035 | * bogus qsmask values. | ||
| 1036 | */ | ||
| 1037 | static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) | ||
| 1038 | { | ||
| 1039 | WARN_ON_ONCE(rnp->qsmask); | ||
| 1040 | } | ||
| 1041 | |||
| 1042 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 1043 | |||
| 1044 | /* | ||
| 1045 | * Because preemptible RCU does not exist, it never needs to migrate | ||
| 1046 | * tasks that were blocked within RCU read-side critical sections, and | ||
| 1047 | * such non-existent tasks cannot possibly have been blocking the current | ||
| 1048 | * grace period. | ||
| 1049 | */ | ||
| 1050 | static int rcu_preempt_offline_tasks(struct rcu_state *rsp, | ||
| 1051 | struct rcu_node *rnp, | ||
| 1052 | struct rcu_data *rdp) | ||
| 1053 | { | ||
| 1054 | return 0; | ||
| 1055 | } | ||
| 1056 | |||
| 1057 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
| 1058 | |||
| 1059 | /* | ||
| 1060 | * Because preemptible RCU does not exist, it never has any callbacks | ||
| 1061 | * to check. | ||
| 1062 | */ | ||
| 1063 | static void rcu_preempt_check_callbacks(int cpu) | ||
| 1064 | { | ||
| 1065 | } | ||
| 1066 | |||
| 1067 | /* | ||
| 1068 | * Queue an RCU callback for lazy invocation after a grace period. | ||
| 1069 | * This will likely be later named something like "call_rcu_lazy()", | ||
| 1070 | * but this change will require some way of tagging the lazy RCU | ||
| 1071 | * callbacks in the list of pending callbacks. Until then, this | ||
| 1072 | * function may only be called from __kfree_rcu(). | ||
| 1073 | * | ||
| 1074 | * Because there is no preemptible RCU, we use RCU-sched instead. | ||
| 1075 | */ | ||
| 1076 | void kfree_call_rcu(struct rcu_head *head, | ||
| 1077 | void (*func)(struct rcu_head *rcu)) | ||
| 1078 | { | ||
| 1079 | __call_rcu(head, func, &rcu_sched_state, -1, 1); | ||
| 1080 | } | ||
| 1081 | EXPORT_SYMBOL_GPL(kfree_call_rcu); | ||
| 1082 | |||
| 1083 | /* | ||
| 1084 | * Wait for an rcu-preempt grace period, but make it happen quickly. | ||
| 1085 | * But because preemptible RCU does not exist, map to rcu-sched. | ||
| 1086 | */ | ||
| 1087 | void synchronize_rcu_expedited(void) | ||
| 1088 | { | ||
| 1089 | synchronize_sched_expedited(); | ||
| 1090 | } | ||
| 1091 | EXPORT_SYMBOL_GPL(synchronize_rcu_expedited); | ||
| 1092 | |||
| 1093 | #ifdef CONFIG_HOTPLUG_CPU | ||
| 1094 | |||
| 1095 | /* | ||
| 1096 | * Because preemptible RCU does not exist, there is never any need to | ||
| 1097 | * report on tasks preempted in RCU read-side critical sections during | ||
| 1098 | * expedited RCU grace periods. | ||
| 1099 | */ | ||
| 1100 | static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, | ||
| 1101 | bool wake) | ||
| 1102 | { | ||
| 1103 | } | ||
| 1104 | |||
| 1105 | #endif /* #ifdef CONFIG_HOTPLUG_CPU */ | ||
| 1106 | |||
| 1107 | /* | ||
| 1108 | * Because preemptible RCU does not exist, rcu_barrier() is just | ||
| 1109 | * another name for rcu_barrier_sched(). | ||
| 1110 | */ | ||
| 1111 | void rcu_barrier(void) | ||
| 1112 | { | ||
| 1113 | rcu_barrier_sched(); | ||
| 1114 | } | ||
| 1115 | EXPORT_SYMBOL_GPL(rcu_barrier); | ||
| 1116 | |||
| 1117 | /* | ||
| 1118 | * Because preemptible RCU does not exist, it need not be initialized. | ||
| 1119 | */ | ||
| 1120 | static void __init __rcu_init_preempt(void) | ||
| 1121 | { | ||
| 1122 | } | ||
| 1123 | |||
| 1124 | /* | ||
| 1125 | * Because preemptible RCU does not exist, tasks cannot possibly exit | ||
| 1126 | * while in preemptible RCU read-side critical sections. | ||
| 1127 | */ | ||
| 1128 | void exit_rcu(void) | ||
| 1129 | { | ||
| 1130 | } | ||
| 1131 | |||
| 1132 | #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ | ||
| 1133 | |||
| 1134 | #ifdef CONFIG_RCU_BOOST | ||
| 1135 | |||
| 1136 | #include "../rtmutex_common.h" | ||
| 1137 | |||
| 1138 | #ifdef CONFIG_RCU_TRACE | ||
| 1139 | |||
| 1140 | static void rcu_initiate_boost_trace(struct rcu_node *rnp) | ||
| 1141 | { | ||
| 1142 | if (list_empty(&rnp->blkd_tasks)) | ||
| 1143 | rnp->n_balk_blkd_tasks++; | ||
| 1144 | else if (rnp->exp_tasks == NULL && rnp->gp_tasks == NULL) | ||
| 1145 | rnp->n_balk_exp_gp_tasks++; | ||
| 1146 | else if (rnp->gp_tasks != NULL && rnp->boost_tasks != NULL) | ||
| 1147 | rnp->n_balk_boost_tasks++; | ||
| 1148 | else if (rnp->gp_tasks != NULL && rnp->qsmask != 0) | ||
| 1149 | rnp->n_balk_notblocked++; | ||
| 1150 | else if (rnp->gp_tasks != NULL && | ||
| 1151 | ULONG_CMP_LT(jiffies, rnp->boost_time)) | ||
| 1152 | rnp->n_balk_notyet++; | ||
| 1153 | else | ||
| 1154 | rnp->n_balk_nos++; | ||
| 1155 | } | ||
| 1156 | |||
| 1157 | #else /* #ifdef CONFIG_RCU_TRACE */ | ||
| 1158 | |||
| 1159 | static void rcu_initiate_boost_trace(struct rcu_node *rnp) | ||
| 1160 | { | ||
| 1161 | } | ||
| 1162 | |||
| 1163 | #endif /* #else #ifdef CONFIG_RCU_TRACE */ | ||
| 1164 | |||
| 1165 | static void rcu_wake_cond(struct task_struct *t, int status) | ||
| 1166 | { | ||
| 1167 | /* | ||
| 1168 | * If the thread is yielding, only wake it when this | ||
| 1169 | * is invoked from idle | ||
| 1170 | */ | ||
| 1171 | if (status != RCU_KTHREAD_YIELDING || is_idle_task(current)) | ||
| 1172 | wake_up_process(t); | ||
| 1173 | } | ||
| 1174 | |||
| 1175 | /* | ||
| 1176 | * Carry out RCU priority boosting on the task indicated by ->exp_tasks | ||
| 1177 | * or ->boost_tasks, advancing the pointer to the next task in the | ||
| 1178 | * ->blkd_tasks list. | ||
| 1179 | * | ||
| 1180 | * Note that irqs must be enabled: boosting the task can block. | ||
| 1181 | * Returns 1 if there are more tasks needing to be boosted. | ||
| 1182 | */ | ||
| 1183 | static int rcu_boost(struct rcu_node *rnp) | ||
| 1184 | { | ||
| 1185 | unsigned long flags; | ||
| 1186 | struct rt_mutex mtx; | ||
| 1187 | struct task_struct *t; | ||
| 1188 | struct list_head *tb; | ||
| 1189 | |||
| 1190 | if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) | ||
| 1191 | return 0; /* Nothing left to boost. */ | ||
| 1192 | |||
| 1193 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
| 1194 | |||
| 1195 | /* | ||
| 1196 | * Recheck under the lock: all tasks in need of boosting | ||
| 1197 | * might exit their RCU read-side critical sections on their own. | ||
| 1198 | */ | ||
| 1199 | if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) { | ||
| 1200 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 1201 | return 0; | ||
| 1202 | } | ||
| 1203 | |||
| 1204 | /* | ||
| 1205 | * Preferentially boost tasks blocking expedited grace periods. | ||
| 1206 | * This cannot starve the normal grace periods because a second | ||
| 1207 | * expedited grace period must boost all blocked tasks, including | ||
| 1208 | * those blocking the pre-existing normal grace period. | ||
| 1209 | */ | ||
| 1210 | if (rnp->exp_tasks != NULL) { | ||
| 1211 | tb = rnp->exp_tasks; | ||
| 1212 | rnp->n_exp_boosts++; | ||
| 1213 | } else { | ||
| 1214 | tb = rnp->boost_tasks; | ||
| 1215 | rnp->n_normal_boosts++; | ||
| 1216 | } | ||
| 1217 | rnp->n_tasks_boosted++; | ||
| 1218 | |||
| 1219 | /* | ||
| 1220 | * We boost task t by manufacturing an rt_mutex that appears to | ||
| 1221 | * be held by task t. We leave a pointer to that rt_mutex where | ||
| 1222 | * task t can find it, and task t will release the mutex when it | ||
| 1223 | * exits its outermost RCU read-side critical section. Then | ||
| 1224 | * simply acquiring this artificial rt_mutex will boost task | ||
| 1225 | * t's priority. (Thanks to tglx for suggesting this approach!) | ||
| 1226 | * | ||
| 1227 | * Note that task t must acquire rnp->lock to remove itself from | ||
| 1228 | * the ->blkd_tasks list, which it will do from exit() if from | ||
| 1229 | * nowhere else. We therefore are guaranteed that task t will | ||
| 1230 | * stay around at least until we drop rnp->lock. Note that | ||
| 1231 | * rnp->lock also resolves races between our priority boosting | ||
| 1232 | * and task t's exiting its outermost RCU read-side critical | ||
| 1233 | * section. | ||
| 1234 | */ | ||
| 1235 | t = container_of(tb, struct task_struct, rcu_node_entry); | ||
| 1236 | rt_mutex_init_proxy_locked(&mtx, t); | ||
| 1237 | t->rcu_boost_mutex = &mtx; | ||
| 1238 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 1239 | rt_mutex_lock(&mtx); /* Side effect: boosts task t's priority. */ | ||
| 1240 | rt_mutex_unlock(&mtx); /* Keep lockdep happy. */ | ||
| 1241 | |||
| 1242 | return ACCESS_ONCE(rnp->exp_tasks) != NULL || | ||
| 1243 | ACCESS_ONCE(rnp->boost_tasks) != NULL; | ||
| 1244 | } | ||
| 1245 | |||
| 1246 | /* | ||
| 1247 | * Priority-boosting kthread. One per leaf rcu_node and one for the | ||
| 1248 | * root rcu_node. | ||
| 1249 | */ | ||
| 1250 | static int rcu_boost_kthread(void *arg) | ||
| 1251 | { | ||
| 1252 | struct rcu_node *rnp = (struct rcu_node *)arg; | ||
| 1253 | int spincnt = 0; | ||
| 1254 | int more2boost; | ||
| 1255 | |||
| 1256 | trace_rcu_utilization(TPS("Start boost kthread@init")); | ||
| 1257 | for (;;) { | ||
| 1258 | rnp->boost_kthread_status = RCU_KTHREAD_WAITING; | ||
| 1259 | trace_rcu_utilization(TPS("End boost kthread@rcu_wait")); | ||
| 1260 | rcu_wait(rnp->boost_tasks || rnp->exp_tasks); | ||
| 1261 | trace_rcu_utilization(TPS("Start boost kthread@rcu_wait")); | ||
| 1262 | rnp->boost_kthread_status = RCU_KTHREAD_RUNNING; | ||
| 1263 | more2boost = rcu_boost(rnp); | ||
| 1264 | if (more2boost) | ||
| 1265 | spincnt++; | ||
| 1266 | else | ||
| 1267 | spincnt = 0; | ||
| 1268 | if (spincnt > 10) { | ||
| 1269 | rnp->boost_kthread_status = RCU_KTHREAD_YIELDING; | ||
| 1270 | trace_rcu_utilization(TPS("End boost kthread@rcu_yield")); | ||
| 1271 | schedule_timeout_interruptible(2); | ||
| 1272 | trace_rcu_utilization(TPS("Start boost kthread@rcu_yield")); | ||
| 1273 | spincnt = 0; | ||
| 1274 | } | ||
| 1275 | } | ||
| 1276 | /* NOTREACHED */ | ||
| 1277 | trace_rcu_utilization(TPS("End boost kthread@notreached")); | ||
| 1278 | return 0; | ||
| 1279 | } | ||
| 1280 | |||
| 1281 | /* | ||
| 1282 | * Check to see if it is time to start boosting RCU readers that are | ||
| 1283 | * blocking the current grace period, and, if so, tell the per-rcu_node | ||
| 1284 | * kthread to start boosting them. If there is an expedited grace | ||
| 1285 | * period in progress, it is always time to boost. | ||
| 1286 | * | ||
| 1287 | * The caller must hold rnp->lock, which this function releases. | ||
| 1288 | * The ->boost_kthread_task is immortal, so we don't need to worry | ||
| 1289 | * about it going away. | ||
| 1290 | */ | ||
| 1291 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) | ||
| 1292 | { | ||
| 1293 | struct task_struct *t; | ||
| 1294 | |||
| 1295 | if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { | ||
| 1296 | rnp->n_balk_exp_gp_tasks++; | ||
| 1297 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 1298 | return; | ||
| 1299 | } | ||
| 1300 | if (rnp->exp_tasks != NULL || | ||
| 1301 | (rnp->gp_tasks != NULL && | ||
| 1302 | rnp->boost_tasks == NULL && | ||
| 1303 | rnp->qsmask == 0 && | ||
| 1304 | ULONG_CMP_GE(jiffies, rnp->boost_time))) { | ||
| 1305 | if (rnp->exp_tasks == NULL) | ||
| 1306 | rnp->boost_tasks = rnp->gp_tasks; | ||
| 1307 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 1308 | t = rnp->boost_kthread_task; | ||
| 1309 | if (t) | ||
| 1310 | rcu_wake_cond(t, rnp->boost_kthread_status); | ||
| 1311 | } else { | ||
| 1312 | rcu_initiate_boost_trace(rnp); | ||
| 1313 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 1314 | } | ||
| 1315 | } | ||
| 1316 | |||
| 1317 | /* | ||
| 1318 | * Wake up the per-CPU kthread to invoke RCU callbacks. | ||
| 1319 | */ | ||
| 1320 | static void invoke_rcu_callbacks_kthread(void) | ||
| 1321 | { | ||
| 1322 | unsigned long flags; | ||
| 1323 | |||
| 1324 | local_irq_save(flags); | ||
| 1325 | __this_cpu_write(rcu_cpu_has_work, 1); | ||
| 1326 | if (__this_cpu_read(rcu_cpu_kthread_task) != NULL && | ||
| 1327 | current != __this_cpu_read(rcu_cpu_kthread_task)) { | ||
| 1328 | rcu_wake_cond(__this_cpu_read(rcu_cpu_kthread_task), | ||
| 1329 | __this_cpu_read(rcu_cpu_kthread_status)); | ||
| 1330 | } | ||
| 1331 | local_irq_restore(flags); | ||
| 1332 | } | ||
| 1333 | |||
| 1334 | /* | ||
| 1335 | * Is the current CPU running the RCU-callbacks kthread? | ||
| 1336 | * Caller must have preemption disabled. | ||
| 1337 | */ | ||
| 1338 | static bool rcu_is_callbacks_kthread(void) | ||
| 1339 | { | ||
| 1340 | return __this_cpu_read(rcu_cpu_kthread_task) == current; | ||
| 1341 | } | ||
| 1342 | |||
| 1343 | #define RCU_BOOST_DELAY_JIFFIES DIV_ROUND_UP(CONFIG_RCU_BOOST_DELAY * HZ, 1000) | ||
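For example, assuming the Kconfig default of CONFIG_RCU_BOOST_DELAY=500 and HZ=1000, this evaluates to DIV_ROUND_UP(500 * 1000, 1000) = 500 jiffies, so boosting is considered roughly half a second after the grace period begins; the actual values are of course build-dependent.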
| 1344 | |||
| 1345 | /* | ||
| 1346 | * Do priority-boost accounting for the start of a new grace period. | ||
| 1347 | */ | ||
| 1348 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) | ||
| 1349 | { | ||
| 1350 | rnp->boost_time = jiffies + RCU_BOOST_DELAY_JIFFIES; | ||
| 1351 | } | ||
| 1352 | |||
| 1353 | /* | ||
| 1354 | * Create an RCU-boost kthread for the specified node if one does not | ||
| 1355 | * already exist. We only create this kthread for preemptible RCU. | ||
| 1356 | * Returns zero if all is well, a negated errno otherwise. | ||
| 1357 | */ | ||
| 1358 | static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, | ||
| 1359 | struct rcu_node *rnp) | ||
| 1360 | { | ||
| 1361 | int rnp_index = rnp - &rsp->node[0]; | ||
| 1362 | unsigned long flags; | ||
| 1363 | struct sched_param sp; | ||
| 1364 | struct task_struct *t; | ||
| 1365 | |||
| 1366 | if (&rcu_preempt_state != rsp) | ||
| 1367 | return 0; | ||
| 1368 | |||
| 1369 | if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0) | ||
| 1370 | return 0; | ||
| 1371 | |||
| 1372 | rsp->boost = 1; | ||
| 1373 | if (rnp->boost_kthread_task != NULL) | ||
| 1374 | return 0; | ||
| 1375 | t = kthread_create(rcu_boost_kthread, (void *)rnp, | ||
| 1376 | "rcub/%d", rnp_index); | ||
| 1377 | if (IS_ERR(t)) | ||
| 1378 | return PTR_ERR(t); | ||
| 1379 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
| 1380 | rnp->boost_kthread_task = t; | ||
| 1381 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 1382 | sp.sched_priority = RCU_BOOST_PRIO; | ||
| 1383 | sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); | ||
| 1384 | wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ | ||
| 1385 | return 0; | ||
| 1386 | } | ||
| 1387 | |||
| 1388 | static void rcu_kthread_do_work(void) | ||
| 1389 | { | ||
| 1390 | rcu_do_batch(&rcu_sched_state, this_cpu_ptr(&rcu_sched_data)); | ||
| 1391 | rcu_do_batch(&rcu_bh_state, this_cpu_ptr(&rcu_bh_data)); | ||
| 1392 | rcu_preempt_do_callbacks(); | ||
| 1393 | } | ||
| 1394 | |||
| 1395 | static void rcu_cpu_kthread_setup(unsigned int cpu) | ||
| 1396 | { | ||
| 1397 | struct sched_param sp; | ||
| 1398 | |||
| 1399 | sp.sched_priority = RCU_KTHREAD_PRIO; | ||
| 1400 | sched_setscheduler_nocheck(current, SCHED_FIFO, &sp); | ||
| 1401 | } | ||
| 1402 | |||
| 1403 | static void rcu_cpu_kthread_park(unsigned int cpu) | ||
| 1404 | { | ||
| 1405 | per_cpu(rcu_cpu_kthread_status, cpu) = RCU_KTHREAD_OFFCPU; | ||
| 1406 | } | ||
| 1407 | |||
| 1408 | static int rcu_cpu_kthread_should_run(unsigned int cpu) | ||
| 1409 | { | ||
| 1410 | return __this_cpu_read(rcu_cpu_has_work); | ||
| 1411 | } | ||
| 1412 | |||
| 1413 | /* | ||
| 1414 | * Per-CPU kernel thread that invokes RCU callbacks. This replaces the | ||
| 1415 | * RCU softirq used in flavors and configurations of RCU that do not | ||
| 1416 | * support RCU priority boosting. | ||
| 1417 | */ | ||
| 1418 | static void rcu_cpu_kthread(unsigned int cpu) | ||
| 1419 | { | ||
| 1420 | unsigned int *statusp = this_cpu_ptr(&rcu_cpu_kthread_status); | ||
| 1421 | char work, *workp = this_cpu_ptr(&rcu_cpu_has_work); | ||
| 1422 | int spincnt; | ||
| 1423 | |||
| 1424 | for (spincnt = 0; spincnt < 10; spincnt++) { | ||
| 1425 | trace_rcu_utilization(TPS("Start CPU kthread@rcu_wait")); | ||
| 1426 | local_bh_disable(); | ||
| 1427 | *statusp = RCU_KTHREAD_RUNNING; | ||
| 1428 | this_cpu_inc(rcu_cpu_kthread_loops); | ||
| 1429 | local_irq_disable(); | ||
| 1430 | work = *workp; | ||
| 1431 | *workp = 0; | ||
| 1432 | local_irq_enable(); | ||
| 1433 | if (work) | ||
| 1434 | rcu_kthread_do_work(); | ||
| 1435 | local_bh_enable(); | ||
| 1436 | if (*workp == 0) { | ||
| 1437 | trace_rcu_utilization(TPS("End CPU kthread@rcu_wait")); | ||
| 1438 | *statusp = RCU_KTHREAD_WAITING; | ||
| 1439 | return; | ||
| 1440 | } | ||
| 1441 | } | ||
| 1442 | *statusp = RCU_KTHREAD_YIELDING; | ||
| 1443 | trace_rcu_utilization(TPS("Start CPU kthread@rcu_yield")); | ||
| 1444 | schedule_timeout_interruptible(2); | ||
| 1445 | trace_rcu_utilization(TPS("End CPU kthread@rcu_yield")); | ||
| 1446 | *statusp = RCU_KTHREAD_WAITING; | ||
| 1447 | } | ||
| 1448 | |||
| 1449 | /* | ||
| 1450 | * Set the per-rcu_node kthread's affinity to cover all CPUs that are | ||
| 1451 | * served by the rcu_node in question. The CPU hotplug lock is still | ||
| 1452 | * held, so the value of rnp->qsmaskinit will be stable. | ||
| 1453 | * | ||
| 1454 | * The outgoing CPU is excluded from the affinity set; callers pass -1 | ||
| 1455 | * when there is no outgoing CPU. If no CPUs would be left in the | ||
| 1456 | * affinity set, this function allows the kthread to execute on any CPU. | ||
| 1457 | */ | ||
| 1458 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) | ||
| 1459 | { | ||
| 1460 | struct task_struct *t = rnp->boost_kthread_task; | ||
| 1461 | unsigned long mask = rnp->qsmaskinit; | ||
| 1462 | cpumask_var_t cm; | ||
| 1463 | int cpu; | ||
| 1464 | |||
| 1465 | if (!t) | ||
| 1466 | return; | ||
| 1467 | if (!zalloc_cpumask_var(&cm, GFP_KERNEL)) | ||
| 1468 | return; | ||
| 1469 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask >>= 1) | ||
| 1470 | if ((mask & 0x1) && cpu != outgoingcpu) | ||
| 1471 | cpumask_set_cpu(cpu, cm); | ||
| 1472 | if (cpumask_weight(cm) == 0) { | ||
| 1473 | cpumask_setall(cm); | ||
| 1474 | for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++) | ||
| 1475 | cpumask_clear_cpu(cpu, cm); | ||
| 1476 | WARN_ON_ONCE(cpumask_weight(cm) == 0); | ||
| 1477 | } | ||
| 1478 | set_cpus_allowed_ptr(t, cm); | ||
| 1479 | free_cpumask_var(cm); | ||
| 1480 | } | ||
| 1481 | |||
| 1482 | static struct smp_hotplug_thread rcu_cpu_thread_spec = { | ||
| 1483 | .store = &rcu_cpu_kthread_task, | ||
| 1484 | .thread_should_run = rcu_cpu_kthread_should_run, | ||
| 1485 | .thread_fn = rcu_cpu_kthread, | ||
| 1486 | .thread_comm = "rcuc/%u", | ||
| 1487 | .setup = rcu_cpu_kthread_setup, | ||
| 1488 | .park = rcu_cpu_kthread_park, | ||
| 1489 | }; | ||
| 1490 | |||
| 1491 | /* | ||
| 1492 | * Spawn all kthreads -- called as soon as the scheduler is running. | ||
| 1493 | */ | ||
| 1494 | static int __init rcu_spawn_kthreads(void) | ||
| 1495 | { | ||
| 1496 | struct rcu_node *rnp; | ||
| 1497 | int cpu; | ||
| 1498 | |||
| 1499 | rcu_scheduler_fully_active = 1; | ||
| 1500 | for_each_possible_cpu(cpu) | ||
| 1501 | per_cpu(rcu_cpu_has_work, cpu) = 0; | ||
| 1502 | BUG_ON(smpboot_register_percpu_thread(&rcu_cpu_thread_spec)); | ||
| 1503 | rnp = rcu_get_root(rcu_state); | ||
| 1504 | (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); | ||
| 1505 | if (NUM_RCU_NODES > 1) { | ||
| 1506 | rcu_for_each_leaf_node(rcu_state, rnp) | ||
| 1507 | (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); | ||
| 1508 | } | ||
| 1509 | return 0; | ||
| 1510 | } | ||
| 1511 | early_initcall(rcu_spawn_kthreads); | ||
| 1512 | |||
| 1513 | static void rcu_prepare_kthreads(int cpu) | ||
| 1514 | { | ||
| 1515 | struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); | ||
| 1516 | struct rcu_node *rnp = rdp->mynode; | ||
| 1517 | |||
| 1518 | /* Fire up the incoming CPU's kthread and leaf rcu_node kthread. */ | ||
| 1519 | if (rcu_scheduler_fully_active) | ||
| 1520 | (void)rcu_spawn_one_boost_kthread(rcu_state, rnp); | ||
| 1521 | } | ||
| 1522 | |||
| 1523 | #else /* #ifdef CONFIG_RCU_BOOST */ | ||
| 1524 | |||
| 1525 | static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) | ||
| 1526 | { | ||
| 1527 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 1528 | } | ||
| 1529 | |||
| 1530 | static void invoke_rcu_callbacks_kthread(void) | ||
| 1531 | { | ||
| 1532 | WARN_ON_ONCE(1); | ||
| 1533 | } | ||
| 1534 | |||
| 1535 | static bool rcu_is_callbacks_kthread(void) | ||
| 1536 | { | ||
| 1537 | return false; | ||
| 1538 | } | ||
| 1539 | |||
| 1540 | static void rcu_preempt_boost_start_gp(struct rcu_node *rnp) | ||
| 1541 | { | ||
| 1542 | } | ||
| 1543 | |||
| 1544 | static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu) | ||
| 1545 | { | ||
| 1546 | } | ||
| 1547 | |||
| 1548 | static int __init rcu_scheduler_really_started(void) | ||
| 1549 | { | ||
| 1550 | rcu_scheduler_fully_active = 1; | ||
| 1551 | return 0; | ||
| 1552 | } | ||
| 1553 | early_initcall(rcu_scheduler_really_started); | ||
| 1554 | |||
| 1555 | static void rcu_prepare_kthreads(int cpu) | ||
| 1556 | { | ||
| 1557 | } | ||
| 1558 | |||
| 1559 | #endif /* #else #ifdef CONFIG_RCU_BOOST */ | ||
| 1560 | |||
| 1561 | #if !defined(CONFIG_RCU_FAST_NO_HZ) | ||
| 1562 | |||
| 1563 | /* | ||
| 1564 | * Check to see if any future RCU-related work will need to be done | ||
| 1565 | * by the current CPU, even if none need be done immediately, returning | ||
| 1566 | * 1 if so. This function is part of the RCU implementation; it is -not- | ||
| 1567 | * an exported member of the RCU API. | ||
| 1568 | * | ||
| 1569 | * Because we do not have RCU_FAST_NO_HZ, just check whether this CPU needs | ||
| 1570 | * any flavor of RCU. | ||
| 1571 | */ | ||
| 1572 | int rcu_needs_cpu(int cpu, unsigned long *delta_jiffies) | ||
| 1573 | { | ||
| 1574 | *delta_jiffies = ULONG_MAX; | ||
| 1575 | return rcu_cpu_has_callbacks(cpu, NULL); | ||
| 1576 | } | ||
| 1577 | |||
| 1578 | /* | ||
| 1579 | * Because we do not have RCU_FAST_NO_HZ, don't bother cleaning up | ||
| 1580 | * after it. | ||
| 1581 | */ | ||
| 1582 | static void rcu_cleanup_after_idle(int cpu) | ||
| 1583 | { | ||
| 1584 | } | ||
| 1585 | |||
| 1586 | /* | ||
| 1587 | * Do the idle-entry grace-period work, which, because CONFIG_RCU_FAST_NO_HZ=n, | ||
| 1588 | * is nothing. | ||
| 1589 | */ | ||
| 1590 | static void rcu_prepare_for_idle(int cpu) | ||
| 1591 | { | ||
| 1592 | } | ||
| 1593 | |||
| 1594 | /* | ||
| 1595 | * Don't bother keeping a running count of the number of RCU callbacks | ||
| 1596 | * posted because CONFIG_RCU_FAST_NO_HZ=n. | ||
| 1597 | */ | ||
| 1598 | static void rcu_idle_count_callbacks_posted(void) | ||
| 1599 | { | ||
| 1600 | } | ||
| 1601 | |||
| 1602 | #else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ | ||
| 1603 | |||
| 1604 | /* | ||
| 1605 | * This code is invoked when a CPU goes idle, at which point we want | ||
| 1606 | * to have the CPU do everything required for RCU so that it can enter | ||
| 1607 | * the energy-efficient dyntick-idle mode. This is handled by a | ||
| 1608 | * state machine implemented by rcu_prepare_for_idle() below. | ||
| 1609 | * | ||
| 1610 | * The following two preprocessor symbols control this state machine: | ||
| 1611 | * | ||
| 1612 | * RCU_IDLE_GP_DELAY gives the number of jiffies that a CPU is permitted | ||
| 1613 | * to sleep in dyntick-idle mode with RCU callbacks pending. This | ||
| 1614 | * is sized to be roughly one RCU grace period. Those energy-efficiency | ||
| 1615 | * benchmarkers who might otherwise be tempted to set this to a large | ||
| 1616 | * number, be warned: Setting RCU_IDLE_GP_DELAY too high can hang your | ||
| 1617 | * system. And if you are -that- concerned about energy efficiency, | ||
| 1618 | * just power the system down and be done with it! | ||
| 1619 | * RCU_IDLE_LAZY_GP_DELAY gives the number of jiffies that a CPU is | ||
| 1620 | * permitted to sleep in dyntick-idle mode with only lazy RCU | ||
| 1621 | * callbacks pending. Setting this too high can OOM your system. | ||
| 1622 | * | ||
| 1623 | * The values below work well in practice. If future workloads require | ||
| 1624 | * adjustment, they can be converted into kernel config parameters, though | ||
| 1625 | * making the state machine smarter might be a better option. | ||
| 1626 | */ | ||
| 1627 | #define RCU_IDLE_GP_DELAY 4 /* Roughly one grace period. */ | ||
| 1628 | #define RCU_IDLE_LAZY_GP_DELAY (6 * HZ) /* Roughly six seconds. */ | ||
| 1629 | |||
| 1630 | static int rcu_idle_gp_delay = RCU_IDLE_GP_DELAY; | ||
| 1631 | module_param(rcu_idle_gp_delay, int, 0644); | ||
| 1632 | static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY; | ||
| 1633 | module_param(rcu_idle_lazy_gp_delay, int, 0644); | ||
| 1634 | |||
| 1635 | extern int tick_nohz_enabled; | ||
| 1636 | |||
| 1637 | /* | ||
| 1638 | * Try to advance callbacks for all flavors of RCU on the current CPU, but | ||
| 1639 | * only if it has been a while since the last time we did so. Afterwards, | ||
| 1640 | * if there are any callbacks ready for immediate invocation, return true. | ||
| 1641 | */ | ||
| 1642 | static bool rcu_try_advance_all_cbs(void) | ||
| 1643 | { | ||
| 1644 | bool cbs_ready = false; | ||
| 1645 | struct rcu_data *rdp; | ||
| 1646 | struct rcu_dynticks *rdtp = this_cpu_ptr(&rcu_dynticks); | ||
| 1647 | struct rcu_node *rnp; | ||
| 1648 | struct rcu_state *rsp; | ||
| 1649 | |||
| 1650 | /* Exit early if we advanced recently. */ | ||
| 1651 | if (jiffies == rdtp->last_advance_all) | ||
| 1652 | return 0; | ||
| 1653 | rdtp->last_advance_all = jiffies; | ||
| 1654 | |||
| 1655 | for_each_rcu_flavor(rsp) { | ||
| 1656 | rdp = this_cpu_ptr(rsp->rda); | ||
| 1657 | rnp = rdp->mynode; | ||
| 1658 | |||
| 1659 | /* | ||
| 1660 | * Don't bother checking unless a grace period has | ||
| 1661 | * completed since we last checked and there are | ||
| 1662 | * callbacks not yet ready to invoke. | ||
| 1663 | */ | ||
| 1664 | if (rdp->completed != rnp->completed && | ||
| 1665 | rdp->nxttail[RCU_DONE_TAIL] != rdp->nxttail[RCU_NEXT_TAIL]) | ||
| 1666 | note_gp_changes(rsp, rdp); | ||
| 1667 | |||
| 1668 | if (cpu_has_callbacks_ready_to_invoke(rdp)) | ||
| 1669 | cbs_ready = true; | ||
| 1670 | } | ||
| 1671 | return cbs_ready; | ||
| 1672 | } | ||
| 1673 | |||
| 1674 | /* | ||
| 1675 | * Allow the CPU to enter dyntick-idle mode unless it has callbacks ready | ||
| 1676 | * to invoke. If the CPU has callbacks, try to advance them. Tell the | ||
| 1677 | * caller to set the timeout based on whether or not there are non-lazy | ||
| 1678 | * callbacks. | ||
| 1679 | * | ||
| 1680 | * The caller must have disabled interrupts. | ||
| 1681 | */ | ||
| 1682 | int rcu_needs_cpu(int cpu, unsigned long *dj) | ||
| 1683 | { | ||
| 1684 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | ||
| 1685 | |||
| 1686 | /* Snapshot to detect later posting of non-lazy callback. */ | ||
| 1687 | rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; | ||
| 1688 | |||
| 1689 | /* If no callbacks, RCU doesn't need the CPU. */ | ||
| 1690 | if (!rcu_cpu_has_callbacks(cpu, &rdtp->all_lazy)) { | ||
| 1691 | *dj = ULONG_MAX; | ||
| 1692 | return 0; | ||
| 1693 | } | ||
| 1694 | |||
| 1695 | /* Attempt to advance callbacks. */ | ||
| 1696 | if (rcu_try_advance_all_cbs()) { | ||
| 1697 | /* Some ready to invoke, so initiate later invocation. */ | ||
| 1698 | invoke_rcu_core(); | ||
| 1699 | return 1; | ||
| 1700 | } | ||
| 1701 | rdtp->last_accelerate = jiffies; | ||
| 1702 | |||
| 1703 | /* Request timer delay depending on laziness, and round. */ | ||
| 1704 | if (!rdtp->all_lazy) { | ||
| 1705 | *dj = round_up(rcu_idle_gp_delay + jiffies, | ||
| 1706 | rcu_idle_gp_delay) - jiffies; | ||
| 1707 | } else { | ||
| 1708 | *dj = round_jiffies(rcu_idle_lazy_gp_delay + jiffies) - jiffies; | ||
| 1709 | } | ||
| 1710 | return 0; | ||
| 1711 | } | ||
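As a worked example of the rounding above (assuming the default delays and HZ=1000): a CPU with non-lazy callbacks that goes idle at jiffies == 1002 requests round_up(4 + 1002, 4) - 1002 = 1008 - 1002 = 6 jiffies, so idle CPUs tend to wake on common four-jiffy boundaries and their timers can be serviced together; with only lazy callbacks, round_jiffies(6 * HZ + jiffies) pushes the wakeup out roughly six seconds and aligns it to a whole second.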
| 1712 | |||
| 1713 | /* | ||
| 1714 | * Prepare a CPU for idle from an RCU perspective. The first major task | ||
| 1715 | * is to sense whether nohz mode has been enabled or disabled via sysfs. | ||
| 1716 | * The second major task is to check to see if a non-lazy callback has | ||
| 1717 | * arrived at a CPU that previously had only lazy callbacks. The third | ||
| 1718 | * major task is to accelerate (that is, assign grace-period numbers to) | ||
| 1719 | * any recently arrived callbacks. | ||
| 1720 | * | ||
| 1721 | * The caller must have disabled interrupts. | ||
| 1722 | */ | ||
| 1723 | static void rcu_prepare_for_idle(int cpu) | ||
| 1724 | { | ||
| 1725 | struct rcu_data *rdp; | ||
| 1726 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | ||
| 1727 | struct rcu_node *rnp; | ||
| 1728 | struct rcu_state *rsp; | ||
| 1729 | int tne; | ||
| 1730 | |||
| 1731 | /* Handle nohz enablement switches conservatively. */ | ||
| 1732 | tne = ACCESS_ONCE(tick_nohz_enabled); | ||
| 1733 | if (tne != rdtp->tick_nohz_enabled_snap) { | ||
| 1734 | if (rcu_cpu_has_callbacks(cpu, NULL)) | ||
| 1735 | invoke_rcu_core(); /* force nohz to see update. */ | ||
| 1736 | rdtp->tick_nohz_enabled_snap = tne; | ||
| 1737 | return; | ||
| 1738 | } | ||
| 1739 | if (!tne) | ||
| 1740 | return; | ||
| 1741 | |||
| 1742 | /* If this is a no-CBs CPU, no callbacks, just return. */ | ||
| 1743 | if (rcu_is_nocb_cpu(cpu)) | ||
| 1744 | return; | ||
| 1745 | |||
| 1746 | /* | ||
| 1747 | * If a non-lazy callback arrived at a CPU having only lazy | ||
| 1748 | * callbacks, invoke RCU core for the side-effect of recalculating | ||
| 1749 | * idle duration on re-entry to idle. | ||
| 1750 | */ | ||
| 1751 | if (rdtp->all_lazy && | ||
| 1752 | rdtp->nonlazy_posted != rdtp->nonlazy_posted_snap) { | ||
| 1753 | rdtp->all_lazy = false; | ||
| 1754 | rdtp->nonlazy_posted_snap = rdtp->nonlazy_posted; | ||
| 1755 | invoke_rcu_core(); | ||
| 1756 | return; | ||
| 1757 | } | ||
| 1758 | |||
| 1759 | /* | ||
| 1760 | * If we have not yet accelerated this jiffy, accelerate all | ||
| 1761 | * callbacks on this CPU. | ||
| 1762 | */ | ||
| 1763 | if (rdtp->last_accelerate == jiffies) | ||
| 1764 | return; | ||
| 1765 | rdtp->last_accelerate = jiffies; | ||
| 1766 | for_each_rcu_flavor(rsp) { | ||
| 1767 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
| 1768 | if (!*rdp->nxttail[RCU_DONE_TAIL]) | ||
| 1769 | continue; | ||
| 1770 | rnp = rdp->mynode; | ||
| 1771 | raw_spin_lock(&rnp->lock); /* irqs already disabled. */ | ||
| 1772 | rcu_accelerate_cbs(rsp, rnp, rdp); | ||
| 1773 | raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ | ||
| 1774 | } | ||
| 1775 | } | ||
| 1776 | |||
| 1777 | /* | ||
| 1778 | * Clean up for exit from idle. Attempt to advance callbacks based on | ||
| 1779 | * any grace periods that elapsed while the CPU was idle, and if any | ||
| 1780 | * callbacks are now ready to invoke, initiate invocation. | ||
| 1781 | */ | ||
| 1782 | static void rcu_cleanup_after_idle(int cpu) | ||
| 1783 | { | ||
| 1784 | |||
| 1785 | if (rcu_is_nocb_cpu(cpu)) | ||
| 1786 | return; | ||
| 1787 | if (rcu_try_advance_all_cbs()) | ||
| 1788 | invoke_rcu_core(); | ||
| 1789 | } | ||
| 1790 | |||
| 1791 | /* | ||
| 1792 | * Keep a running count of the number of non-lazy callbacks posted | ||
| 1793 | * on this CPU. This running counter (which is never decremented) allows | ||
| 1794 | * rcu_prepare_for_idle() to detect when something out of the idle loop | ||
| 1795 | * posts a callback, even if an equal number of callbacks are invoked. | ||
| 1796 | * Of course, callbacks should only be posted from within a trace event | ||
| 1797 | * designed to be called from idle or from within RCU_NONIDLE(). | ||
| 1798 | */ | ||
| 1799 | static void rcu_idle_count_callbacks_posted(void) | ||
| 1800 | { | ||
| 1801 | __this_cpu_add(rcu_dynticks.nonlazy_posted, 1); | ||
| 1802 | } | ||
| 1803 | |||
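The never-decremented counter works together with the nonlazy_posted_snap snapshot taken in rcu_needs_cpu() and rcu_prepare_for_idle(): because invocation never decreases the count, a mismatch with the snapshot reveals that a non-lazy callback arrived while idle even if just as many callbacks were invoked in the meantime. A tiny stand-alone sketch of that snapshot-compare idiom (hypothetical names, not kernel code):

#include <stdbool.h>
#include <stdio.h>

static unsigned long nonlazy_posted;            /* only ever incremented */
static unsigned long nonlazy_posted_snap;       /* snapshot taken at idle entry */

static void post_nonlazy_callback(void)
{
        nonlazy_posted++;
}

static void enter_idle(void)
{
        nonlazy_posted_snap = nonlazy_posted;
}

static bool nonlazy_arrived_while_idle(void)
{
        /* Still true even if an equal number of callbacks was invoked,
         * because invocation never decrements the counter. */
        return nonlazy_posted != nonlazy_posted_snap;
}

int main(void)
{
        enter_idle();
        post_nonlazy_callback();
        printf("%d\n", nonlazy_arrived_while_idle());   /* prints 1 */
        return 0;
}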
| 1804 | /* | ||
| 1805 | * Data for flushing lazy RCU callbacks at OOM time. | ||
| 1806 | */ | ||
| 1807 | static atomic_t oom_callback_count; | ||
| 1808 | static DECLARE_WAIT_QUEUE_HEAD(oom_callback_wq); | ||
| 1809 | |||
| 1810 | /* | ||
| 1811 | * RCU OOM callback -- decrement the outstanding count and deliver the | ||
| 1812 | * wake-up if we are the last one. | ||
| 1813 | */ | ||
| 1814 | static void rcu_oom_callback(struct rcu_head *rhp) | ||
| 1815 | { | ||
| 1816 | if (atomic_dec_and_test(&oom_callback_count)) | ||
| 1817 | wake_up(&oom_callback_wq); | ||
| 1818 | } | ||
| 1819 | |||
| 1820 | /* | ||
| 1821 | * Post an rcu_oom_notify callback on the current CPU if it has at | ||
| 1822 | * least one lazy callback. This will unnecessarily post callbacks | ||
| 1823 | * to CPUs that already have a non-lazy callback at the end of their | ||
| 1824 | * callback list, but this is an infrequent operation, so accept some | ||
| 1825 | * extra overhead to keep things simple. | ||
| 1826 | */ | ||
| 1827 | static void rcu_oom_notify_cpu(void *unused) | ||
| 1828 | { | ||
| 1829 | struct rcu_state *rsp; | ||
| 1830 | struct rcu_data *rdp; | ||
| 1831 | |||
| 1832 | for_each_rcu_flavor(rsp) { | ||
| 1833 | rdp = __this_cpu_ptr(rsp->rda); | ||
| 1834 | if (rdp->qlen_lazy != 0) { | ||
| 1835 | atomic_inc(&oom_callback_count); | ||
| 1836 | rsp->call(&rdp->oom_head, rcu_oom_callback); | ||
| 1837 | } | ||
| 1838 | } | ||
| 1839 | } | ||
| 1840 | |||
| 1841 | /* | ||
| 1842 | * If low on memory, ensure that each CPU has a non-lazy callback. | ||
| 1843 | * This will wake up CPUs that have only lazy callbacks, in turn | ||
| 1844 | * ensuring that they free up the corresponding memory in a timely manner. | ||
| 1845 | * Because an uncertain amount of memory will be freed in some uncertain | ||
| 1846 | * timeframe, we do not claim to have freed anything. | ||
| 1847 | */ | ||
| 1848 | static int rcu_oom_notify(struct notifier_block *self, | ||
| 1849 | unsigned long notused, void *nfreed) | ||
| 1850 | { | ||
| 1851 | int cpu; | ||
| 1852 | |||
| 1853 | /* Wait for callbacks from earlier instance to complete. */ | ||
| 1854 | wait_event(oom_callback_wq, atomic_read(&oom_callback_count) == 0); | ||
| 1855 | |||
| 1856 | /* | ||
| 1857 | * Prevent premature wakeup: ensure that all increments happen | ||
| 1858 | * before there is a chance of the counter reaching zero. | ||
| 1859 | */ | ||
| 1860 | atomic_set(&oom_callback_count, 1); | ||
| 1861 | |||
| 1862 | get_online_cpus(); | ||
| 1863 | for_each_online_cpu(cpu) { | ||
| 1864 | smp_call_function_single(cpu, rcu_oom_notify_cpu, NULL, 1); | ||
| 1865 | cond_resched(); | ||
| 1866 | } | ||
| 1867 | put_online_cpus(); | ||
| 1868 | |||
| 1869 | /* Unconditionally decrement: no need to wake ourselves up. */ | ||
| 1870 | atomic_dec(&oom_callback_count); | ||
| 1871 | |||
| 1872 | return NOTIFY_OK; | ||
| 1873 | } | ||
| 1874 | |||
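The counter handling above is the usual bias-by-one completion pattern: rcu_oom_notify() pre-loads the count with 1 before posting any callbacks, so a waiter cannot observe zero until the final unconditional decrement, no matter how quickly the individual rcu_oom_callback() invocations complete. A user-space sketch of the same idea with C11 atomics and threads (illustrative only; the names and timings are made up):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <unistd.h>

static atomic_int callback_count;

/* Stand-in for rcu_oom_callback(): the decrement that reaches zero "wakes" the waiter. */
static void *oom_callback(void *unused)
{
        (void)unused;
        usleep(1000);                           /* pretend to free some memory */
        if (atomic_fetch_sub(&callback_count, 1) == 1)
                printf("last callback done, waiter may proceed\n");
        return NULL;
}

int main(void)
{
        pthread_t tid[4];
        int i;

        /* Bias the count before posting any work, as the code above does
         * with atomic_set(&oom_callback_count, 1), so it cannot reach
         * zero until the final unconditional decrement below. */
        atomic_store(&callback_count, 1);

        for (i = 0; i < 4; i++) {
                atomic_fetch_add(&callback_count, 1);   /* one per posted callback */
                pthread_create(&tid[i], NULL, oom_callback, NULL);
        }

        /* Drop the bias; no need to wake ourselves up. */
        if (atomic_fetch_sub(&callback_count, 1) == 1)
                printf("no callbacks were posted\n");

        for (i = 0; i < 4; i++)
                pthread_join(tid[i], NULL);
        return 0;
}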
| 1875 | static struct notifier_block rcu_oom_nb = { | ||
| 1876 | .notifier_call = rcu_oom_notify | ||
| 1877 | }; | ||
| 1878 | |||
| 1879 | static int __init rcu_register_oom_notifier(void) | ||
| 1880 | { | ||
| 1881 | register_oom_notifier(&rcu_oom_nb); | ||
| 1882 | return 0; | ||
| 1883 | } | ||
| 1884 | early_initcall(rcu_register_oom_notifier); | ||
| 1885 | |||
| 1886 | #endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ | ||
| 1887 | |||
| 1888 | #ifdef CONFIG_RCU_CPU_STALL_INFO | ||
| 1889 | |||
| 1890 | #ifdef CONFIG_RCU_FAST_NO_HZ | ||
| 1891 | |||
| 1892 | static void print_cpu_stall_fast_no_hz(char *cp, int cpu) | ||
| 1893 | { | ||
| 1894 | struct rcu_dynticks *rdtp = &per_cpu(rcu_dynticks, cpu); | ||
| 1895 | unsigned long nlpd = rdtp->nonlazy_posted - rdtp->nonlazy_posted_snap; | ||
| 1896 | |||
| 1897 | sprintf(cp, "last_accelerate: %04lx/%04lx, nonlazy_posted: %ld, %c%c", | ||
| 1898 | rdtp->last_accelerate & 0xffff, jiffies & 0xffff, | ||
| 1899 | ulong2long(nlpd), | ||
| 1900 | rdtp->all_lazy ? 'L' : '.', | ||
| 1901 | rdtp->tick_nohz_enabled_snap ? '.' : 'D'); | ||
| 1902 | } | ||
| 1903 | |||
| 1904 | #else /* #ifdef CONFIG_RCU_FAST_NO_HZ */ | ||
| 1905 | |||
| 1906 | static void print_cpu_stall_fast_no_hz(char *cp, int cpu) | ||
| 1907 | { | ||
| 1908 | *cp = '\0'; | ||
| 1909 | } | ||
| 1910 | |||
| 1911 | #endif /* #else #ifdef CONFIG_RCU_FAST_NO_HZ */ | ||
| 1912 | |||
| 1913 | /* Initiate the stall-info list. */ | ||
| 1914 | static void print_cpu_stall_info_begin(void) | ||
| 1915 | { | ||
| 1916 | pr_cont("\n"); | ||
| 1917 | } | ||
| 1918 | |||
| 1919 | /* | ||
| 1920 | * Print out diagnostic information for the specified stalled CPU. | ||
| 1921 | * | ||
| 1922 | * If the specified CPU is aware of the current RCU grace period | ||
| 1923 | * (flavor specified by rsp), then print the number of scheduling | ||
| 1924 | * clock interrupts the CPU has taken during the time that it has | ||
| 1925 | * been aware. Otherwise, print the number of RCU grace periods | ||
| 1926 | * that this CPU is ignorant of, for example, "1" if the CPU was | ||
| 1927 | * aware of the previous grace period. | ||
| 1928 | * | ||
| 1929 | * Also print out idle and (if CONFIG_RCU_FAST_NO_HZ) idle-entry info. | ||
| 1930 | */ | ||
| 1931 | static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) | ||
| 1932 | { | ||
| 1933 | char fast_no_hz[72]; | ||
| 1934 | struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); | ||
| 1935 | struct rcu_dynticks *rdtp = rdp->dynticks; | ||
| 1936 | char *ticks_title; | ||
| 1937 | unsigned long ticks_value; | ||
| 1938 | |||
| 1939 | if (rsp->gpnum == rdp->gpnum) { | ||
| 1940 | ticks_title = "ticks this GP"; | ||
| 1941 | ticks_value = rdp->ticks_this_gp; | ||
| 1942 | } else { | ||
| 1943 | ticks_title = "GPs behind"; | ||
| 1944 | ticks_value = rsp->gpnum - rdp->gpnum; | ||
| 1945 | } | ||
| 1946 | print_cpu_stall_fast_no_hz(fast_no_hz, cpu); | ||
| 1947 | pr_err("\t%d: (%lu %s) idle=%03x/%llx/%d softirq=%u/%u %s\n", | ||
| 1948 | cpu, ticks_value, ticks_title, | ||
| 1949 | atomic_read(&rdtp->dynticks) & 0xfff, | ||
| 1950 | rdtp->dynticks_nesting, rdtp->dynticks_nmi_nesting, | ||
| 1951 | rdp->softirq_snap, kstat_softirqs_cpu(RCU_SOFTIRQ, cpu), | ||
| 1952 | fast_no_hz); | ||
| 1953 | } | ||
| 1954 | |||
| 1955 | /* Terminate the stall-info list. */ | ||
| 1956 | static void print_cpu_stall_info_end(void) | ||
| 1957 | { | ||
| 1958 | pr_err("\t"); | ||
| 1959 | } | ||
| 1960 | |||
| 1961 | /* Zero ->ticks_this_gp for all flavors of RCU. */ | ||
| 1962 | static void zero_cpu_stall_ticks(struct rcu_data *rdp) | ||
| 1963 | { | ||
| 1964 | rdp->ticks_this_gp = 0; | ||
| 1965 | rdp->softirq_snap = kstat_softirqs_cpu(RCU_SOFTIRQ, smp_processor_id()); | ||
| 1966 | } | ||
| 1967 | |||
| 1968 | /* Increment ->ticks_this_gp for all flavors of RCU. */ | ||
| 1969 | static void increment_cpu_stall_ticks(void) | ||
| 1970 | { | ||
| 1971 | struct rcu_state *rsp; | ||
| 1972 | |||
| 1973 | for_each_rcu_flavor(rsp) | ||
| 1974 | __this_cpu_ptr(rsp->rda)->ticks_this_gp++; | ||
| 1975 | } | ||
| 1976 | |||
| 1977 | #else /* #ifdef CONFIG_RCU_CPU_STALL_INFO */ | ||
| 1978 | |||
| 1979 | static void print_cpu_stall_info_begin(void) | ||
| 1980 | { | ||
| 1981 | pr_cont(" {"); | ||
| 1982 | } | ||
| 1983 | |||
| 1984 | static void print_cpu_stall_info(struct rcu_state *rsp, int cpu) | ||
| 1985 | { | ||
| 1986 | pr_cont(" %d", cpu); | ||
| 1987 | } | ||
| 1988 | |||
| 1989 | static void print_cpu_stall_info_end(void) | ||
| 1990 | { | ||
| 1991 | pr_cont("} "); | ||
| 1992 | } | ||
| 1993 | |||
| 1994 | static void zero_cpu_stall_ticks(struct rcu_data *rdp) | ||
| 1995 | { | ||
| 1996 | } | ||
| 1997 | |||
| 1998 | static void increment_cpu_stall_ticks(void) | ||
| 1999 | { | ||
| 2000 | } | ||
| 2001 | |||
| 2002 | #endif /* #else #ifdef CONFIG_RCU_CPU_STALL_INFO */ | ||
| 2003 | |||
| 2004 | #ifdef CONFIG_RCU_NOCB_CPU | ||
| 2005 | |||
| 2006 | /* | ||
| 2007 | * Offload callback processing from the boot-time-specified set of CPUs | ||
| 2008 | * specified by rcu_nocb_mask. For each CPU in the set, there is a | ||
| 2009 | * kthread created that pulls the callbacks from the corresponding CPU, | ||
| 2010 | * waits for a grace period to elapse, and invokes the callbacks. | ||
| 2011 | * The no-CBs CPUs do a wake_up() on their kthread when they insert | ||
| 2012 | * a callback into any empty list, unless the rcu_nocb_poll boot parameter | ||
| 2013 | * has been specified, in which case each kthread actively polls its | ||
| 2014 | * CPU. (Which isn't so great for energy efficiency, but which does | ||
| 2015 | * reduce RCU's overhead on that CPU.) | ||
| 2016 | * | ||
| 2017 | * This is intended to be used in conjunction with Frederic Weisbecker's | ||
| 2018 | * adaptive-idle work, which would seriously reduce OS jitter on CPUs | ||
| 2019 | * running CPU-bound user-mode computations. | ||
| 2020 | * | ||
| 2021 | * Offloading of callback processing could also in theory be used as | ||
| 2022 | * an energy-efficiency measure because CPUs with no RCU callbacks | ||
| 2023 | * queued are more aggressive about entering dyntick-idle mode. | ||
| 2024 | */ | ||
| 2025 | |||
| 2026 | |||
| 2027 | /* Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters. */ | ||
| 2028 | static int __init rcu_nocb_setup(char *str) | ||
| 2029 | { | ||
| 2030 | alloc_bootmem_cpumask_var(&rcu_nocb_mask); | ||
| 2031 | have_rcu_nocb_mask = true; | ||
| 2032 | cpulist_parse(str, rcu_nocb_mask); | ||
| 2033 | return 1; | ||
| 2034 | } | ||
| 2035 | __setup("rcu_nocbs=", rcu_nocb_setup); | ||
| 2036 | |||
| 2037 | static int __init parse_rcu_nocb_poll(char *arg) | ||
| 2038 | { | ||
| 2039 | rcu_nocb_poll = 1; | ||
| 2040 | return 0; | ||
| 2041 | } | ||
| 2042 | early_param("rcu_nocb_poll", parse_rcu_nocb_poll); | ||
| 2043 | |||
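For reference, both of the hooks above are kernel boot parameters: rcu_nocbs= takes a cpulist (handed to cpulist_parse()) naming the CPUs whose callback processing should be offloaded, and rcu_nocb_poll switches the rcuo kthreads to polling. A hypothetical command-line fragment offloading CPUs 1-7 with polling enabled would look something like:

        rcu_nocbs=1-7 rcu_nocb_poll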
| 2044 | /* | ||
| 2045 | * Do any no-CBs CPUs need another grace period? | ||
| 2046 | * | ||
| 2047 | * Interrupts must be disabled. If the caller does not hold the root | ||
| 2048 | * rcu_node structure's ->lock, the results are advisory only. | ||
| 2049 | */ | ||
| 2050 | static int rcu_nocb_needs_gp(struct rcu_state *rsp) | ||
| 2051 | { | ||
| 2052 | struct rcu_node *rnp = rcu_get_root(rsp); | ||
| 2053 | |||
| 2054 | return rnp->need_future_gp[(ACCESS_ONCE(rnp->completed) + 1) & 0x1]; | ||
| 2055 | } | ||
| 2056 | |||
| 2057 | /* | ||
| 2058 | * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended | ||
| 2059 | * grace period. | ||
| 2060 | */ | ||
| 2061 | static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) | ||
| 2062 | { | ||
| 2063 | wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]); | ||
| 2064 | } | ||
| 2065 | |||
| 2066 | /* | ||
| 2067 | * Set the root rcu_node structure's ->need_future_gp field | ||
| 2068 | * based on the sum of those of all rcu_node structures. This does | ||
| 2069 | * double-count the root rcu_node structure's requests, but this | ||
| 2070 | * is necessary to handle the possibility of an rcu_nocb_kthread() | ||
| 2071 | * having awakened during the time that the rcu_node structures | ||
| 2072 | * were being updated for the end of the previous grace period. | ||
| 2073 | */ | ||
| 2074 | static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) | ||
| 2075 | { | ||
| 2076 | rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq; | ||
| 2077 | } | ||
| 2078 | |||
| 2079 | static void rcu_init_one_nocb(struct rcu_node *rnp) | ||
| 2080 | { | ||
| 2081 | init_waitqueue_head(&rnp->nocb_gp_wq[0]); | ||
| 2082 | init_waitqueue_head(&rnp->nocb_gp_wq[1]); | ||
| 2083 | } | ||
| 2084 | |||
| 2085 | /* Is the specified CPU a no-CBs CPU? */ | ||
| 2086 | bool rcu_is_nocb_cpu(int cpu) | ||
| 2087 | { | ||
| 2088 | if (have_rcu_nocb_mask) | ||
| 2089 | return cpumask_test_cpu(cpu, rcu_nocb_mask); | ||
| 2090 | return false; | ||
| 2091 | } | ||
| 2092 | |||
| 2093 | /* | ||
| 2094 | * Enqueue the specified string of rcu_head structures onto the specified | ||
| 2095 | * CPU's no-CBs lists. The CPU is specified by rdp, the head of the | ||
| 2096 | * string by rhp, and the tail of the string by rhtp. The non-lazy/lazy | ||
| 2097 | * counts are supplied by rhcount and rhcount_lazy. | ||
| 2098 | * | ||
| 2099 | * If warranted, also wake up the kthread servicing this CPU's queues. | ||
| 2100 | */ | ||
| 2101 | static void __call_rcu_nocb_enqueue(struct rcu_data *rdp, | ||
| 2102 | struct rcu_head *rhp, | ||
| 2103 | struct rcu_head **rhtp, | ||
| 2104 | int rhcount, int rhcount_lazy) | ||
| 2105 | { | ||
| 2106 | int len; | ||
| 2107 | struct rcu_head **old_rhpp; | ||
| 2108 | struct task_struct *t; | ||
| 2109 | |||
| 2110 | /* Enqueue the callback on the nocb list and update counts. */ | ||
| 2111 | old_rhpp = xchg(&rdp->nocb_tail, rhtp); | ||
| 2112 | ACCESS_ONCE(*old_rhpp) = rhp; | ||
| 2113 | atomic_long_add(rhcount, &rdp->nocb_q_count); | ||
| 2114 | atomic_long_add(rhcount_lazy, &rdp->nocb_q_count_lazy); | ||
| 2115 | |||
| 2116 | /* If we are not being polled and there is a kthread, awaken it ... */ | ||
| 2117 | t = ACCESS_ONCE(rdp->nocb_kthread); | ||
| 2118 | if (rcu_nocb_poll || !t) { | ||
| 2119 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
| 2120 | TPS("WakeNotPoll")); | ||
| 2121 | return; | ||
| 2122 | } | ||
| 2123 | len = atomic_long_read(&rdp->nocb_q_count); | ||
| 2124 | if (old_rhpp == &rdp->nocb_head) { | ||
| 2125 | wake_up(&rdp->nocb_wq); /* ... only if queue was empty ... */ | ||
| 2126 | rdp->qlen_last_fqs_check = 0; | ||
| 2127 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeEmpty")); | ||
| 2128 | } else if (len > rdp->qlen_last_fqs_check + qhimark) { | ||
| 2129 | wake_up_process(t); /* ... or if many callbacks queued. */ | ||
| 2130 | rdp->qlen_last_fqs_check = LONG_MAX / 2; | ||
| 2131 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeOvf")); | ||
| 2132 | } else { | ||
| 2133 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, TPS("WakeNot")); | ||
| 2134 | } | ||
| 2135 | return; | ||
| 2136 | } | ||
| 2137 | |||
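The enqueue above is lock-free: the producer first swings the queue's tail pointer to the new element's ->next field with xchg(), and only afterwards links the element in through the old tail pointer. That ordering is why a consumer can momentarily see a NULL link, the window that rcu_nocb_kthread() below waits out. A stand-alone C11 sketch of the producer side of this idiom (simplified types, not the kernel's implementation):

#include <stdatomic.h>
#include <stdio.h>

struct cb {
        _Atomic(struct cb *) next;
        int id;
};

/* tail points at the link the next enqueue must fill in: initially &head,
 * afterwards &last_element->next (compare rdp->nocb_tail). */
static _Atomic(struct cb *) head;
static _Atomic(struct cb *) * _Atomic tail;

static void enqueue(struct cb *new)
{
        _Atomic(struct cb *) *old_tail;

        atomic_store(&new->next, NULL);
        /* Claim the tail slot first, like xchg(&rdp->nocb_tail, rhtp)... */
        old_tail = atomic_exchange(&tail, &new->next);
        /* ...then publish the element through the old tail pointer; between
         * the two steps a consumer may observe a NULL link. */
        atomic_store(old_tail, new);
}

int main(void)
{
        struct cb a, b, *p;

        a.id = 1;
        b.id = 2;
        atomic_store(&tail, &head);     /* empty queue: tail points at head */
        enqueue(&a);
        enqueue(&b);
        for (p = atomic_load(&head); p; p = atomic_load(&p->next))
                printf("cb %d\n", p->id);       /* prints cb 1 then cb 2 */
        return 0;
}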
| 2138 | /* | ||
| 2139 | * This is a helper for __call_rcu(), which invokes this when the normal | ||
| 2140 | * callback queue is inoperable. If this is not a no-CBs CPU, this | ||
| 2141 | * function returns failure back to __call_rcu(), which can complain | ||
| 2142 | * appropriately. | ||
| 2143 | * | ||
| 2144 | * Otherwise, this function queues the callback where the corresponding | ||
| 2145 | * "rcuo" kthread can find it. | ||
| 2146 | */ | ||
| 2147 | static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, | ||
| 2148 | bool lazy) | ||
| 2149 | { | ||
| 2150 | |||
| 2151 | if (!rcu_is_nocb_cpu(rdp->cpu)) | ||
| 2152 | return 0; | ||
| 2153 | __call_rcu_nocb_enqueue(rdp, rhp, &rhp->next, 1, lazy); | ||
| 2154 | if (__is_kfree_rcu_offset((unsigned long)rhp->func)) | ||
| 2155 | trace_rcu_kfree_callback(rdp->rsp->name, rhp, | ||
| 2156 | (unsigned long)rhp->func, | ||
| 2157 | -atomic_long_read(&rdp->nocb_q_count_lazy), | ||
| 2158 | -atomic_long_read(&rdp->nocb_q_count)); | ||
| 2159 | else | ||
| 2160 | trace_rcu_callback(rdp->rsp->name, rhp, | ||
| 2161 | -atomic_long_read(&rdp->nocb_q_count_lazy), | ||
| 2162 | -atomic_long_read(&rdp->nocb_q_count)); | ||
| 2163 | return 1; | ||
| 2164 | } | ||
| 2165 | |||
| 2166 | /* | ||
| 2167 | * Adopt orphaned callbacks on a no-CBs CPU, or return 0 if this is | ||
| 2168 | * not a no-CBs CPU. | ||
| 2169 | */ | ||
| 2170 | static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, | ||
| 2171 | struct rcu_data *rdp) | ||
| 2172 | { | ||
| 2173 | long ql = rsp->qlen; | ||
| 2174 | long qll = rsp->qlen_lazy; | ||
| 2175 | |||
| 2176 | /* If this is not a no-CBs CPU, tell the caller to do it the old way. */ | ||
| 2177 | if (!rcu_is_nocb_cpu(smp_processor_id())) | ||
| 2178 | return 0; | ||
| 2179 | rsp->qlen = 0; | ||
| 2180 | rsp->qlen_lazy = 0; | ||
| 2181 | |||
| 2182 | /* First, enqueue the donelist, if any. This preserves CB ordering. */ | ||
| 2183 | if (rsp->orphan_donelist != NULL) { | ||
| 2184 | __call_rcu_nocb_enqueue(rdp, rsp->orphan_donelist, | ||
| 2185 | rsp->orphan_donetail, ql, qll); | ||
| 2186 | ql = qll = 0; | ||
| 2187 | rsp->orphan_donelist = NULL; | ||
| 2188 | rsp->orphan_donetail = &rsp->orphan_donelist; | ||
| 2189 | } | ||
| 2190 | if (rsp->orphan_nxtlist != NULL) { | ||
| 2191 | __call_rcu_nocb_enqueue(rdp, rsp->orphan_nxtlist, | ||
| 2192 | rsp->orphan_nxttail, ql, qll); | ||
| 2193 | ql = qll = 0; | ||
| 2194 | rsp->orphan_nxtlist = NULL; | ||
| 2195 | rsp->orphan_nxttail = &rsp->orphan_nxtlist; | ||
| 2196 | } | ||
| 2197 | return 1; | ||
| 2198 | } | ||
| 2199 | |||
| 2200 | /* | ||
| 2201 | * If necessary, kick off a new grace period, and either way wait | ||
| 2202 | * for a subsequent grace period to complete. | ||
| 2203 | */ | ||
| 2204 | static void rcu_nocb_wait_gp(struct rcu_data *rdp) | ||
| 2205 | { | ||
| 2206 | unsigned long c; | ||
| 2207 | bool d; | ||
| 2208 | unsigned long flags; | ||
| 2209 | struct rcu_node *rnp = rdp->mynode; | ||
| 2210 | |||
| 2211 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
| 2212 | c = rcu_start_future_gp(rnp, rdp); | ||
| 2213 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 2214 | |||
| 2215 | /* | ||
| 2216 | * Wait for the grace period. Do so interruptibly to avoid messing | ||
| 2217 | * up the load average. | ||
| 2218 | */ | ||
| 2219 | trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait")); | ||
| 2220 | for (;;) { | ||
| 2221 | wait_event_interruptible( | ||
| 2222 | rnp->nocb_gp_wq[c & 0x1], | ||
| 2223 | (d = ULONG_CMP_GE(ACCESS_ONCE(rnp->completed), c))); | ||
| 2224 | if (likely(d)) | ||
| 2225 | break; | ||
| 2226 | flush_signals(current); | ||
| 2227 | trace_rcu_future_gp(rnp, rdp, c, TPS("ResumeWait")); | ||
| 2228 | } | ||
| 2229 | trace_rcu_future_gp(rnp, rdp, c, TPS("EndWait")); | ||
| 2230 | smp_mb(); /* Ensure that CB invocation happens after GP end. */ | ||
| 2231 | } | ||
| 2232 | |||
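Worked example of the parity indexing used here and in rcu_nocb_gp_cleanup(): if rcu_start_future_gp() reports that grace period c = 42 is needed, the kthread sleeps on rnp->nocb_gp_wq[42 & 0x1], that is, nocb_gp_wq[0]; when that grace period ends and rnp->completed becomes 42, rcu_nocb_gp_cleanup() wakes exactly nocb_gp_wq[0], while kthreads that need grace period 43 keep sleeping on nocb_gp_wq[1]. Two wait queues per rcu_node therefore suffice, because only two consecutive grace-period numbers can be awaited at any given time.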
| 2233 | /* | ||
| 2234 | * Per-rcu_data kthread, but only for no-CBs CPUs. Each kthread invokes | ||
| 2235 | * callbacks queued by the corresponding no-CBs CPU. | ||
| 2236 | */ | ||
| 2237 | static int rcu_nocb_kthread(void *arg) | ||
| 2238 | { | ||
| 2239 | int c, cl; | ||
| 2240 | bool firsttime = 1; | ||
| 2241 | struct rcu_head *list; | ||
| 2242 | struct rcu_head *next; | ||
| 2243 | struct rcu_head **tail; | ||
| 2244 | struct rcu_data *rdp = arg; | ||
| 2245 | |||
| 2246 | /* Each pass through this loop invokes one batch of callbacks */ | ||
| 2247 | for (;;) { | ||
| 2248 | /* If not polling, wait for next batch of callbacks. */ | ||
| 2249 | if (!rcu_nocb_poll) { | ||
| 2250 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
| 2251 | TPS("Sleep")); | ||
| 2252 | wait_event_interruptible(rdp->nocb_wq, rdp->nocb_head); | ||
| 2253 | } else if (firsttime) { | ||
| 2254 | firsttime = 0; | ||
| 2255 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
| 2256 | TPS("Poll")); | ||
| 2257 | } | ||
| 2258 | list = ACCESS_ONCE(rdp->nocb_head); | ||
| 2259 | if (!list) { | ||
| 2260 | if (!rcu_nocb_poll) | ||
| 2261 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
| 2262 | TPS("WokeEmpty")); | ||
| 2263 | schedule_timeout_interruptible(1); | ||
| 2264 | flush_signals(current); | ||
| 2265 | continue; | ||
| 2266 | } | ||
| 2267 | firsttime = 1; | ||
| 2268 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
| 2269 | TPS("WokeNonEmpty")); | ||
| 2270 | |||
| 2271 | /* | ||
| 2272 | * Extract queued callbacks, update counts, and wait | ||
| 2273 | * for a grace period to elapse. | ||
| 2274 | */ | ||
| 2275 | ACCESS_ONCE(rdp->nocb_head) = NULL; | ||
| 2276 | tail = xchg(&rdp->nocb_tail, &rdp->nocb_head); | ||
| 2277 | c = atomic_long_xchg(&rdp->nocb_q_count, 0); | ||
| 2278 | cl = atomic_long_xchg(&rdp->nocb_q_count_lazy, 0); | ||
| 2279 | ACCESS_ONCE(rdp->nocb_p_count) += c; | ||
| 2280 | ACCESS_ONCE(rdp->nocb_p_count_lazy) += cl; | ||
| 2281 | rcu_nocb_wait_gp(rdp); | ||
| 2282 | |||
| 2283 | /* Each pass through the following loop invokes a callback. */ | ||
| 2284 | trace_rcu_batch_start(rdp->rsp->name, cl, c, -1); | ||
| 2285 | c = cl = 0; | ||
| 2286 | while (list) { | ||
| 2287 | next = list->next; | ||
| 2288 | /* Wait for enqueuing to complete, if needed. */ | ||
| 2289 | while (next == NULL && &list->next != tail) { | ||
| 2290 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
| 2291 | TPS("WaitQueue")); | ||
| 2292 | schedule_timeout_interruptible(1); | ||
| 2293 | trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, | ||
| 2294 | TPS("WokeQueue")); | ||
| 2295 | next = list->next; | ||
| 2296 | } | ||
| 2297 | debug_rcu_head_unqueue(list); | ||
| 2298 | local_bh_disable(); | ||
| 2299 | if (__rcu_reclaim(rdp->rsp->name, list)) | ||
| 2300 | cl++; | ||
| 2301 | c++; | ||
| 2302 | local_bh_enable(); | ||
| 2303 | list = next; | ||
| 2304 | } | ||
| 2305 | trace_rcu_batch_end(rdp->rsp->name, c, !!list, 0, 0, 1); | ||
| 2306 | ACCESS_ONCE(rdp->nocb_p_count) -= c; | ||
| 2307 | ACCESS_ONCE(rdp->nocb_p_count_lazy) -= cl; | ||
| 2308 | rdp->n_nocbs_invoked += c; | ||
| 2309 | } | ||
| 2310 | return 0; | ||
| 2311 | } | ||
| 2312 | |||
| 2313 | /* Initialize per-rcu_data variables for no-CBs CPUs. */ | ||
| 2314 | static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) | ||
| 2315 | { | ||
| 2316 | rdp->nocb_tail = &rdp->nocb_head; | ||
| 2317 | init_waitqueue_head(&rdp->nocb_wq); | ||
| 2318 | } | ||
| 2319 | |||
| 2320 | /* Create a kthread for each RCU flavor for each no-CBs CPU. */ | ||
| 2321 | static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) | ||
| 2322 | { | ||
| 2323 | int cpu; | ||
| 2324 | struct rcu_data *rdp; | ||
| 2325 | struct task_struct *t; | ||
| 2326 | |||
| 2327 | if (rcu_nocb_mask == NULL) | ||
| 2328 | return; | ||
| 2329 | for_each_cpu(cpu, rcu_nocb_mask) { | ||
| 2330 | rdp = per_cpu_ptr(rsp->rda, cpu); | ||
| 2331 | t = kthread_run(rcu_nocb_kthread, rdp, | ||
| 2332 | "rcuo%c/%d", rsp->abbr, cpu); | ||
| 2333 | BUG_ON(IS_ERR(t)); | ||
| 2334 | ACCESS_ONCE(rdp->nocb_kthread) = t; | ||
| 2335 | } | ||
| 2336 | } | ||
| 2337 | |||
| 2338 | /* Prevent __call_rcu() from enqueuing callbacks on no-CBs CPUs */ | ||
| 2339 | static bool init_nocb_callback_list(struct rcu_data *rdp) | ||
| 2340 | { | ||
| 2341 | if (rcu_nocb_mask == NULL || | ||
| 2342 | !cpumask_test_cpu(rdp->cpu, rcu_nocb_mask)) | ||
| 2343 | return false; | ||
| 2344 | rdp->nxttail[RCU_NEXT_TAIL] = NULL; | ||
| 2345 | return true; | ||
| 2346 | } | ||
| 2347 | |||
| 2348 | #else /* #ifdef CONFIG_RCU_NOCB_CPU */ | ||
| 2349 | |||
| 2350 | static int rcu_nocb_needs_gp(struct rcu_state *rsp) | ||
| 2351 | { | ||
| 2352 | return 0; | ||
| 2353 | } | ||
| 2354 | |||
| 2355 | static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) | ||
| 2356 | { | ||
| 2357 | } | ||
| 2358 | |||
| 2359 | static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) | ||
| 2360 | { | ||
| 2361 | } | ||
| 2362 | |||
| 2363 | static void rcu_init_one_nocb(struct rcu_node *rnp) | ||
| 2364 | { | ||
| 2365 | } | ||
| 2366 | |||
| 2367 | static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, | ||
| 2368 | bool lazy) | ||
| 2369 | { | ||
| 2370 | return 0; | ||
| 2371 | } | ||
| 2372 | |||
| 2373 | static bool __maybe_unused rcu_nocb_adopt_orphan_cbs(struct rcu_state *rsp, | ||
| 2374 | struct rcu_data *rdp) | ||
| 2375 | { | ||
| 2376 | return 0; | ||
| 2377 | } | ||
| 2378 | |||
| 2379 | static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) | ||
| 2380 | { | ||
| 2381 | } | ||
| 2382 | |||
| 2383 | static void __init rcu_spawn_nocb_kthreads(struct rcu_state *rsp) | ||
| 2384 | { | ||
| 2385 | } | ||
| 2386 | |||
| 2387 | static bool init_nocb_callback_list(struct rcu_data *rdp) | ||
| 2388 | { | ||
| 2389 | return false; | ||
| 2390 | } | ||
| 2391 | |||
| 2392 | #endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */ | ||
| 2393 | |||
| 2394 | /* | ||
| 2395 | * An adaptive-ticks CPU can potentially execute in kernel mode for an | ||
| 2396 | * arbitrarily long period of time with the scheduling-clock tick turned | ||
| 2397 | * off. RCU will be paying attention to this CPU because it is in the | ||
| 2398 | * kernel, but the CPU cannot be guaranteed to be executing the RCU state | ||
| 2399 | * machine because the scheduling-clock tick has been disabled. Therefore, | ||
| 2400 | * if an adaptive-ticks CPU is failing to respond to the current grace | ||
| 2401 | * period and has not been idle from an RCU perspective, kick it. | ||
| 2402 | */ | ||
| 2403 | static void rcu_kick_nohz_cpu(int cpu) | ||
| 2404 | { | ||
| 2405 | #ifdef CONFIG_NO_HZ_FULL | ||
| 2406 | if (tick_nohz_full_cpu(cpu)) | ||
| 2407 | smp_send_reschedule(cpu); | ||
| 2408 | #endif /* #ifdef CONFIG_NO_HZ_FULL */ | ||
| 2409 | } | ||
| 2410 | |||
| 2411 | |||
| 2412 | #ifdef CONFIG_NO_HZ_FULL_SYSIDLE | ||
| 2413 | |||
| 2414 | /* | ||
| 2415 | * Define RCU flavor that holds sysidle state. This needs to be the | ||
| 2416 | * most active flavor of RCU. | ||
| 2417 | */ | ||
| 2418 | #ifdef CONFIG_PREEMPT_RCU | ||
| 2419 | static struct rcu_state *rcu_sysidle_state = &rcu_preempt_state; | ||
| 2420 | #else /* #ifdef CONFIG_PREEMPT_RCU */ | ||
| 2421 | static struct rcu_state *rcu_sysidle_state = &rcu_sched_state; | ||
| 2422 | #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ | ||
| 2423 | |||
| 2424 | static int full_sysidle_state; /* Current system-idle state. */ | ||
| 2425 | #define RCU_SYSIDLE_NOT 0 /* Some CPU is not idle. */ | ||
| 2426 | #define RCU_SYSIDLE_SHORT 1 /* All CPUs idle for brief period. */ | ||
| 2427 | #define RCU_SYSIDLE_LONG 2 /* All CPUs idle for long enough. */ | ||
| 2428 | #define RCU_SYSIDLE_FULL 3 /* All CPUs idle, ready for sysidle. */ | ||
| 2429 | #define RCU_SYSIDLE_FULL_NOTED 4 /* Actually entered sysidle state. */ | ||
| 2430 | |||
| 2431 | /* | ||
| 2432 | * Invoked to note exit from irq or task transition to idle. Note that | ||
| 2433 | * usermode execution does -not- count as idle here! After all, we want | ||
| 2434 | * to detect full-system idle states, not RCU quiescent states and grace | ||
| 2435 | * periods. The caller must have disabled interrupts. | ||
| 2436 | */ | ||
| 2437 | static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) | ||
| 2438 | { | ||
| 2439 | unsigned long j; | ||
| 2440 | |||
| 2441 | /* Adjust nesting, check for fully idle. */ | ||
| 2442 | if (irq) { | ||
| 2443 | rdtp->dynticks_idle_nesting--; | ||
| 2444 | WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); | ||
| 2445 | if (rdtp->dynticks_idle_nesting != 0) | ||
| 2446 | return; /* Still not fully idle. */ | ||
| 2447 | } else { | ||
| 2448 | if ((rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) == | ||
| 2449 | DYNTICK_TASK_NEST_VALUE) { | ||
| 2450 | rdtp->dynticks_idle_nesting = 0; | ||
| 2451 | } else { | ||
| 2452 | rdtp->dynticks_idle_nesting -= DYNTICK_TASK_NEST_VALUE; | ||
| 2453 | WARN_ON_ONCE(rdtp->dynticks_idle_nesting < 0); | ||
| 2454 | return; /* Still not fully idle. */ | ||
| 2455 | } | ||
| 2456 | } | ||
| 2457 | |||
| 2458 | /* Record start of fully idle period. */ | ||
| 2459 | j = jiffies; | ||
| 2460 | ACCESS_ONCE(rdtp->dynticks_idle_jiffies) = j; | ||
| 2461 | smp_mb__before_atomic_inc(); | ||
| 2462 | atomic_inc(&rdtp->dynticks_idle); | ||
| 2463 | smp_mb__after_atomic_inc(); | ||
| 2464 | WARN_ON_ONCE(atomic_read(&rdtp->dynticks_idle) & 0x1); | ||
| 2465 | } | ||
| 2466 | |||
| 2467 | /* | ||
| 2468 | * Unconditionally force exit from full system-idle state. This is | ||
| 2469 | * invoked when a normal CPU exits idle, but must be called separately | ||
| 2470 | * for the timekeeping CPU (tick_do_timer_cpu). The reason for this | ||
| 2471 | * is that the timekeeping CPU is permitted to take scheduling-clock | ||
| 2472 | * interrupts while the system is in system-idle state, and of course | ||
| 2473 | * rcu_sysidle_exit() has no way of distinguishing a scheduling-clock | ||
| 2474 | * interrupt from any other type of interrupt. | ||
| 2475 | */ | ||
| 2476 | void rcu_sysidle_force_exit(void) | ||
| 2477 | { | ||
| 2478 | int oldstate = ACCESS_ONCE(full_sysidle_state); | ||
| 2479 | int newoldstate; | ||
| 2480 | |||
| 2481 | /* | ||
| 2482 | * Each pass through the following loop attempts to exit full | ||
| 2483 | * system-idle state. If contention proves to be a problem, | ||
| 2484 | * a trylock-based contention tree could be used here. | ||
| 2485 | */ | ||
| 2486 | while (oldstate > RCU_SYSIDLE_SHORT) { | ||
| 2487 | newoldstate = cmpxchg(&full_sysidle_state, | ||
| 2488 | oldstate, RCU_SYSIDLE_NOT); | ||
| 2489 | if (oldstate == newoldstate && | ||
| 2490 | oldstate == RCU_SYSIDLE_FULL_NOTED) { | ||
| 2491 | rcu_kick_nohz_cpu(tick_do_timer_cpu); | ||
| 2492 | return; /* We cleared it, done! */ | ||
| 2493 | } | ||
| 2494 | oldstate = newoldstate; | ||
| 2495 | } | ||
| 2496 | smp_mb(); /* Order initial oldstate fetch vs. later non-idle work. */ | ||
| 2497 | } | ||
| 2498 | |||
| 2499 | /* | ||
| 2500 | * Invoked to note entry to irq or task transition from idle. Note that | ||
| 2501 | * usermode execution does -not- count as idle here! The caller must | ||
| 2502 | * have disabled interrupts. | ||
| 2503 | */ | ||
| 2504 | static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) | ||
| 2505 | { | ||
| 2506 | /* Adjust nesting, check for already non-idle. */ | ||
| 2507 | if (irq) { | ||
| 2508 | rdtp->dynticks_idle_nesting++; | ||
| 2509 | WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); | ||
| 2510 | if (rdtp->dynticks_idle_nesting != 1) | ||
| 2511 | return; /* Already non-idle. */ | ||
| 2512 | } else { | ||
| 2513 | /* | ||
| 2514 | * Allow for irq misnesting. Yes, it really is possible | ||
| 2515 | * to enter an irq handler then never leave it, and maybe | ||
| 2516 | * also vice versa. Handle both possibilities. | ||
| 2517 | */ | ||
| 2518 | if (rdtp->dynticks_idle_nesting & DYNTICK_TASK_NEST_MASK) { | ||
| 2519 | rdtp->dynticks_idle_nesting += DYNTICK_TASK_NEST_VALUE; | ||
| 2520 | WARN_ON_ONCE(rdtp->dynticks_idle_nesting <= 0); | ||
| 2521 | return; /* Already non-idle. */ | ||
| 2522 | } else { | ||
| 2523 | rdtp->dynticks_idle_nesting = DYNTICK_TASK_EXIT_IDLE; | ||
| 2524 | } | ||
| 2525 | } | ||
| 2526 | |||
| 2527 | /* Record end of idle period. */ | ||
| 2528 | smp_mb__before_atomic_inc(); | ||
| 2529 | atomic_inc(&rdtp->dynticks_idle); | ||
| 2530 | smp_mb__after_atomic_inc(); | ||
| 2531 | WARN_ON_ONCE(!(atomic_read(&rdtp->dynticks_idle) & 0x1)); | ||
| 2532 | |||
| 2533 | /* | ||
| 2534 | * If we are the timekeeping CPU, we are permitted to be non-idle | ||
| 2535 | * during a system-idle state. This must be the case, because | ||
| 2536 | * the timekeeping CPU has to take scheduling-clock interrupts | ||
| 2537 | * during the time that the system is transitioning to full | ||
| 2538 | * system-idle state. This means that the timekeeping CPU must | ||
| 2539 | * invoke rcu_sysidle_force_exit() directly if it does anything | ||
| 2540 | * more than take a scheduling-clock interrupt. | ||
| 2541 | */ | ||
| 2542 | if (smp_processor_id() == tick_do_timer_cpu) | ||
| 2543 | return; | ||
| 2544 | |||
| 2545 | /* Update system-idle state: We are clearly no longer fully idle! */ | ||
| 2546 | rcu_sysidle_force_exit(); | ||
| 2547 | } | ||
| 2548 | |||
| 2549 | /* | ||
| 2550 | * Check to see if the current CPU is idle. Note that usermode execution | ||
| 2551 | * does not count as idle. The caller must have disabled interrupts. | ||
| 2552 | */ | ||
| 2553 | static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, | ||
| 2554 | unsigned long *maxj) | ||
| 2555 | { | ||
| 2556 | int cur; | ||
| 2557 | unsigned long j; | ||
| 2558 | struct rcu_dynticks *rdtp = rdp->dynticks; | ||
| 2559 | |||
| 2560 | /* | ||
| 2561 | * If some other CPU has already reported non-idle, if this is | ||
| 2562 | * not the flavor of RCU that tracks sysidle state, or if this | ||
| 2563 | * is an offline or the timekeeping CPU, nothing to do. | ||
| 2564 | */ | ||
| 2565 | if (!*isidle || rdp->rsp != rcu_sysidle_state || | ||
| 2566 | cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu) | ||
| 2567 | return; | ||
| 2568 | if (rcu_gp_in_progress(rdp->rsp)) | ||
| 2569 | WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu); | ||
| 2570 | |||
| 2571 | /* Pick up current idle and NMI-nesting counter and check. */ | ||
| 2572 | cur = atomic_read(&rdtp->dynticks_idle); | ||
| 2573 | if (cur & 0x1) { | ||
| 2574 | *isidle = false; /* We are not idle! */ | ||
| 2575 | return; | ||
| 2576 | } | ||
| 2577 | smp_mb(); /* Read counters before timestamps. */ | ||
| 2578 | |||
| 2579 | /* Pick up timestamps. */ | ||
| 2580 | j = ACCESS_ONCE(rdtp->dynticks_idle_jiffies); | ||
| 2581 | /* If this CPU entered idle more recently, update maxj timestamp. */ | ||
| 2582 | if (ULONG_CMP_LT(*maxj, j)) | ||
| 2583 | *maxj = j; | ||
| 2584 | } | ||
| 2585 | |||
| 2586 | /* | ||
| 2587 | * Is this the flavor of RCU that is handling full-system idle? | ||
| 2588 | */ | ||
| 2589 | static bool is_sysidle_rcu_state(struct rcu_state *rsp) | ||
| 2590 | { | ||
| 2591 | return rsp == rcu_sysidle_state; | ||
| 2592 | } | ||
| 2593 | |||
| 2594 | /* | ||
| 2595 | * Bind the grace-period kthread for the sysidle flavor of RCU to the | ||
| 2596 | * timekeeping CPU. | ||
| 2597 | */ | ||
| 2598 | static void rcu_bind_gp_kthread(void) | ||
| 2599 | { | ||
| 2600 | int cpu = ACCESS_ONCE(tick_do_timer_cpu); | ||
| 2601 | |||
| 2602 | if (cpu < 0 || cpu >= nr_cpu_ids) | ||
| 2603 | return; | ||
| 2604 | if (raw_smp_processor_id() != cpu) | ||
| 2605 | set_cpus_allowed_ptr(current, cpumask_of(cpu)); | ||
| 2606 | } | ||
| 2607 | |||
| 2608 | /* | ||
| 2609 | * Return a delay in jiffies based on the number of CPUs, rcu_node | ||
| 2610 | * leaf fanout, and jiffies tick rate. The idea is to allow larger | ||
| 2611 | * systems more time to transition to full-idle state in order to | ||
| 2612 | * avoid the cache thrashing that would otherwise occur on the state variable. | ||
| 2613 | * Really small systems (less than a couple of tens of CPUs) should | ||
| 2614 | * instead use a single global atomically incremented counter, and later | ||
| 2615 | * versions of this will automatically reconfigure themselves accordingly. | ||
| 2616 | */ | ||
| 2617 | static unsigned long rcu_sysidle_delay(void) | ||
| 2618 | { | ||
| 2619 | if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) | ||
| 2620 | return 0; | ||
| 2621 | return DIV_ROUND_UP(nr_cpu_ids * HZ, rcu_fanout_leaf * 1000); | ||
| 2622 | } | ||
| 2623 | |||
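As a worked example of the formula above (with made-up parameters): on a hypothetical 4096-CPU system with HZ = 1000 and rcu_fanout_leaf = 16, DIV_ROUND_UP(4096 * 1000, 16 * 1000) = 256, so each sysidle state transition dwells for 256 jiffies, roughly a quarter of a second at that HZ, while any system at or below CONFIG_NO_HZ_FULL_SYSIDLE_SMALL CPUs skips the delay entirely.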
| 2624 | /* | ||
| 2625 | * Advance the full-system-idle state. This is invoked when all of | ||
| 2626 | * the non-timekeeping CPUs are idle. | ||
| 2627 | */ | ||
| 2628 | static void rcu_sysidle(unsigned long j) | ||
| 2629 | { | ||
| 2630 | /* Check the current state. */ | ||
| 2631 | switch (ACCESS_ONCE(full_sysidle_state)) { | ||
| 2632 | case RCU_SYSIDLE_NOT: | ||
| 2633 | |||
| 2634 | /* First time all are idle, so note a short idle period. */ | ||
| 2635 | ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_SHORT; | ||
| 2636 | break; | ||
| 2637 | |||
| 2638 | case RCU_SYSIDLE_SHORT: | ||
| 2639 | |||
| 2640 | /* | ||
| 2641 | * Idle for a bit, time to advance to next state? | ||
| 2642 | * cmpxchg failure means race with non-idle, let them win. | ||
| 2643 | */ | ||
| 2644 | if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) | ||
| 2645 | (void)cmpxchg(&full_sysidle_state, | ||
| 2646 | RCU_SYSIDLE_SHORT, RCU_SYSIDLE_LONG); | ||
| 2647 | break; | ||
| 2648 | |||
| 2649 | case RCU_SYSIDLE_LONG: | ||
| 2650 | |||
| 2651 | /* | ||
| 2652 | * Do an additional check pass before advancing to full. | ||
| 2653 | * cmpxchg failure means race with non-idle, let them win. | ||
| 2654 | */ | ||
| 2655 | if (ULONG_CMP_GE(jiffies, j + rcu_sysidle_delay())) | ||
| 2656 | (void)cmpxchg(&full_sysidle_state, | ||
| 2657 | RCU_SYSIDLE_LONG, RCU_SYSIDLE_FULL); | ||
| 2658 | break; | ||
| 2659 | |||
| 2660 | default: | ||
| 2661 | break; | ||
| 2662 | } | ||
| 2663 | } | ||
| 2664 | |||
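Putting the pieces together: if the force-quiescent-state scan finds every non-timekeeping CPU idle, with the most recent idle entry at jiffy j, the state ratchets from RCU_SYSIDLE_NOT to SHORT immediately, from SHORT to LONG no sooner than j + rcu_sysidle_delay(), and from LONG to FULL only after a further delay-qualified pass; any non-idle observation in between drops it straight back to RCU_SYSIDLE_NOT via rcu_sysidle_cancel() below.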
| 2665 | /* | ||
| 2666 | * Found a non-idle non-timekeeping CPU, so kick the system-idle state | ||
| 2667 | * back to the beginning. | ||
| 2668 | */ | ||
| 2669 | static void rcu_sysidle_cancel(void) | ||
| 2670 | { | ||
| 2671 | smp_mb(); | ||
| 2672 | ACCESS_ONCE(full_sysidle_state) = RCU_SYSIDLE_NOT; | ||
| 2673 | } | ||
| 2674 | |||
| 2675 | /* | ||
| 2676 | * Update the sysidle state based on the results of a force-quiescent-state | ||
| 2677 | * scan of the CPUs' dyntick-idle state. | ||
| 2678 | */ | ||
| 2679 | static void rcu_sysidle_report(struct rcu_state *rsp, int isidle, | ||
| 2680 | unsigned long maxj, bool gpkt) | ||
| 2681 | { | ||
| 2682 | if (rsp != rcu_sysidle_state) | ||
| 2683 | return; /* Wrong flavor, ignore. */ | ||
| 2684 | if (gpkt && nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) | ||
| 2685 | return; /* Running state machine from timekeeping CPU. */ | ||
| 2686 | if (isidle) | ||
| 2687 | rcu_sysidle(maxj); /* More idle! */ | ||
| 2688 | else | ||
| 2689 | rcu_sysidle_cancel(); /* Idle is over. */ | ||
| 2690 | } | ||
| 2691 | |||
| 2692 | /* | ||
| 2693 | * Wrapper for rcu_sysidle_report() when called from the grace-period | ||
| 2694 | * kthread's context. | ||
| 2695 | */ | ||
| 2696 | static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, | ||
| 2697 | unsigned long maxj) | ||
| 2698 | { | ||
| 2699 | rcu_sysidle_report(rsp, isidle, maxj, true); | ||
| 2700 | } | ||
| 2701 | |||
| 2702 | /* Callback and function for forcing an RCU grace period. */ | ||
| 2703 | struct rcu_sysidle_head { | ||
| 2704 | struct rcu_head rh; | ||
| 2705 | int inuse; | ||
| 2706 | }; | ||
| 2707 | |||
| 2708 | static void rcu_sysidle_cb(struct rcu_head *rhp) | ||
| 2709 | { | ||
| 2710 | struct rcu_sysidle_head *rshp; | ||
| 2711 | |||
| 2712 | /* | ||
| 2713 | * The following memory barrier is needed to replace the | ||
| 2714 | * memory barriers that would normally be in the memory | ||
| 2715 | * allocator. | ||
| 2716 | */ | ||
| 2717 | smp_mb(); /* grace period precedes setting inuse. */ | ||
| 2718 | |||
| 2719 | rshp = container_of(rhp, struct rcu_sysidle_head, rh); | ||
| 2720 | ACCESS_ONCE(rshp->inuse) = 0; | ||
| 2721 | } | ||
| 2722 | |||
| 2723 | /* | ||
| 2724 | * Check to see if the system is fully idle, other than the timekeeping CPU. | ||
| 2725 | * The caller must have disabled interrupts. | ||
| 2726 | */ | ||
| 2727 | bool rcu_sys_is_idle(void) | ||
| 2728 | { | ||
| 2729 | static struct rcu_sysidle_head rsh; | ||
| 2730 | int rss = ACCESS_ONCE(full_sysidle_state); | ||
| 2731 | |||
| 2732 | if (WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu)) | ||
| 2733 | return false; | ||
| 2734 | |||
| 2735 | /* Handle small-system case by doing a full scan of CPUs. */ | ||
| 2736 | if (nr_cpu_ids <= CONFIG_NO_HZ_FULL_SYSIDLE_SMALL) { | ||
| 2737 | int oldrss = rss - 1; | ||
| 2738 | |||
| 2739 | /* | ||
| 2740 | * One pass to advance to each state up to _FULL. | ||
| 2741 | * Give up if any pass fails to advance the state. | ||
| 2742 | */ | ||
| 2743 | while (rss < RCU_SYSIDLE_FULL && oldrss < rss) { | ||
| 2744 | int cpu; | ||
| 2745 | bool isidle = true; | ||
| 2746 | unsigned long maxj = jiffies - ULONG_MAX / 4; | ||
| 2747 | struct rcu_data *rdp; | ||
| 2748 | |||
| 2749 | /* Scan all the CPUs looking for nonidle CPUs. */ | ||
| 2750 | for_each_possible_cpu(cpu) { | ||
| 2751 | rdp = per_cpu_ptr(rcu_sysidle_state->rda, cpu); | ||
| 2752 | rcu_sysidle_check_cpu(rdp, &isidle, &maxj); | ||
| 2753 | if (!isidle) | ||
| 2754 | break; | ||
| 2755 | } | ||
| 2756 | rcu_sysidle_report(rcu_sysidle_state, | ||
| 2757 | isidle, maxj, false); | ||
| 2758 | oldrss = rss; | ||
| 2759 | rss = ACCESS_ONCE(full_sysidle_state); | ||
| 2760 | } | ||
| 2761 | } | ||
| 2762 | |||
| 2763 | /* If this is the first observation of an idle period, record it. */ | ||
| 2764 | if (rss == RCU_SYSIDLE_FULL) { | ||
| 2765 | rss = cmpxchg(&full_sysidle_state, | ||
| 2766 | RCU_SYSIDLE_FULL, RCU_SYSIDLE_FULL_NOTED); | ||
| 2767 | return rss == RCU_SYSIDLE_FULL; | ||
| 2768 | } | ||
| 2769 | |||
| 2770 | smp_mb(); /* ensure rss load happens before later caller actions. */ | ||
| 2771 | |||
| 2772 | /* If already fully idle, tell the caller (in case of races). */ | ||
| 2773 | if (rss == RCU_SYSIDLE_FULL_NOTED) | ||
| 2774 | return true; | ||
| 2775 | |||
| 2776 | /* | ||
| 2777 | * If we aren't there yet, and a grace period is not in flight, | ||
| 2778 | * initiate a grace period. Either way, tell the caller that | ||
| 2779 | * we are not there yet. We use an xchg() rather than an assignment | ||
| 2780 | * to make up for the memory barriers that would otherwise be | ||
| 2781 | * provided by the memory allocator. | ||
| 2782 | */ | ||
| 2783 | if (nr_cpu_ids > CONFIG_NO_HZ_FULL_SYSIDLE_SMALL && | ||
| 2784 | !rcu_gp_in_progress(rcu_sysidle_state) && | ||
| 2785 | !rsh.inuse && xchg(&rsh.inuse, 1) == 0) | ||
| 2786 | call_rcu(&rsh.rh, rcu_sysidle_cb); | ||
| 2787 | return false; | ||
| 2788 | } | ||
| 2789 | |||
| 2790 | /* | ||
| 2791 | * Initialize dynticks sysidle state for CPUs coming online. | ||
| 2792 | */ | ||
| 2793 | static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) | ||
| 2794 | { | ||
| 2795 | rdtp->dynticks_idle_nesting = DYNTICK_TASK_NEST_VALUE; | ||
| 2796 | } | ||
| 2797 | |||
| 2798 | #else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | ||
| 2799 | |||
| 2800 | static void rcu_sysidle_enter(struct rcu_dynticks *rdtp, int irq) | ||
| 2801 | { | ||
| 2802 | } | ||
| 2803 | |||
| 2804 | static void rcu_sysidle_exit(struct rcu_dynticks *rdtp, int irq) | ||
| 2805 | { | ||
| 2806 | } | ||
| 2807 | |||
| 2808 | static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle, | ||
| 2809 | unsigned long *maxj) | ||
| 2810 | { | ||
| 2811 | } | ||
| 2812 | |||
| 2813 | static bool is_sysidle_rcu_state(struct rcu_state *rsp) | ||
| 2814 | { | ||
| 2815 | return false; | ||
| 2816 | } | ||
| 2817 | |||
| 2818 | static void rcu_bind_gp_kthread(void) | ||
| 2819 | { | ||
| 2820 | } | ||
| 2821 | |||
| 2822 | static void rcu_sysidle_report_gp(struct rcu_state *rsp, int isidle, | ||
| 2823 | unsigned long maxj) | ||
| 2824 | { | ||
| 2825 | } | ||
| 2826 | |||
| 2827 | static void rcu_sysidle_init_percpu_data(struct rcu_dynticks *rdtp) | ||
| 2828 | { | ||
| 2829 | } | ||
| 2830 | |||
| 2831 | #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */ | ||
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c new file mode 100644 index 000000000000..3596797b7e46 --- /dev/null +++ b/kernel/rcu/tree_trace.c | |||
| @@ -0,0 +1,500 @@ | |||
| 1 | /* | ||
| 2 | * Read-Copy Update tracing for classic implementation | ||
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or modify | ||
| 5 | * it under the terms of the GNU General Public License as published by | ||
| 6 | * the Free Software Foundation; either version 2 of the License, or | ||
| 7 | * (at your option) any later version. | ||
| 8 | * | ||
| 9 | * This program is distributed in the hope that it will be useful, | ||
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 12 | * GNU General Public License for more details. | ||
| 13 | * | ||
| 14 | * You should have received a copy of the GNU General Public License | ||
| 15 | * along with this program; if not, write to the Free Software | ||
| 16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
| 17 | * | ||
| 18 | * Copyright IBM Corporation, 2008 | ||
| 19 | * | ||
| 20 | * Papers: http://www.rdrop.com/users/paulmck/RCU | ||
| 21 | * | ||
| 22 | * For detailed explanation of Read-Copy Update mechanism see - | ||
| 23 | * Documentation/RCU | ||
| 24 | * | ||
| 25 | */ | ||
| 26 | #include <linux/types.h> | ||
| 27 | #include <linux/kernel.h> | ||
| 28 | #include <linux/init.h> | ||
| 29 | #include <linux/spinlock.h> | ||
| 30 | #include <linux/smp.h> | ||
| 31 | #include <linux/rcupdate.h> | ||
| 32 | #include <linux/interrupt.h> | ||
| 33 | #include <linux/sched.h> | ||
| 34 | #include <linux/atomic.h> | ||
| 35 | #include <linux/bitops.h> | ||
| 36 | #include <linux/module.h> | ||
| 37 | #include <linux/completion.h> | ||
| 38 | #include <linux/moduleparam.h> | ||
| 39 | #include <linux/percpu.h> | ||
| 40 | #include <linux/notifier.h> | ||
| 41 | #include <linux/cpu.h> | ||
| 42 | #include <linux/mutex.h> | ||
| 43 | #include <linux/debugfs.h> | ||
| 44 | #include <linux/seq_file.h> | ||
| 45 | |||
| 46 | #define RCU_TREE_NONCORE | ||
| 47 | #include "tree.h" | ||
| 48 | |||
| 49 | static int r_open(struct inode *inode, struct file *file, | ||
| 50 | const struct seq_operations *op) | ||
| 51 | { | ||
| 52 | int ret = seq_open(file, op); | ||
| 53 | if (!ret) { | ||
| 54 | struct seq_file *m = (struct seq_file *)file->private_data; | ||
| 55 | m->private = inode->i_private; | ||
| 56 | } | ||
| 57 | return ret; | ||
| 58 | } | ||
| 59 | |||
| 60 | static void *r_start(struct seq_file *m, loff_t *pos) | ||
| 61 | { | ||
| 62 | struct rcu_state *rsp = (struct rcu_state *)m->private; | ||
| 63 | *pos = cpumask_next(*pos - 1, cpu_possible_mask); | ||
| 64 | if ((*pos) < nr_cpu_ids) | ||
| 65 | return per_cpu_ptr(rsp->rda, *pos); | ||
| 66 | return NULL; | ||
| 67 | } | ||
| 68 | |||
| 69 | static void *r_next(struct seq_file *m, void *v, loff_t *pos) | ||
| 70 | { | ||
| 71 | (*pos)++; | ||
| 72 | return r_start(m, pos); | ||
| 73 | } | ||
| 74 | |||
| 75 | static void r_stop(struct seq_file *m, void *v) | ||
| 76 | { | ||
| 77 | } | ||
| 78 | |||
| 79 | static int show_rcubarrier(struct seq_file *m, void *v) | ||
| 80 | { | ||
| 81 | struct rcu_state *rsp = (struct rcu_state *)m->private; | ||
| 82 | seq_printf(m, "bcc: %d nbd: %lu\n", | ||
| 83 | atomic_read(&rsp->barrier_cpu_count), | ||
| 84 | rsp->n_barrier_done); | ||
| 85 | return 0; | ||
| 86 | } | ||
| 87 | |||
| 88 | static int rcubarrier_open(struct inode *inode, struct file *file) | ||
| 89 | { | ||
| 90 | return single_open(file, show_rcubarrier, inode->i_private); | ||
| 91 | } | ||
| 92 | |||
| 93 | static const struct file_operations rcubarrier_fops = { | ||
| 94 | .owner = THIS_MODULE, | ||
| 95 | .open = rcubarrier_open, | ||
| 96 | .read = seq_read, | ||
| 97 | .llseek = no_llseek, | ||
| 98 | .release = single_release, | ||
| 99 | }; | ||
| 100 | |||
| 101 | #ifdef CONFIG_RCU_BOOST | ||
| 102 | |||
| 103 | static char convert_kthread_status(unsigned int kthread_status) | ||
| 104 | { | ||
| 105 | if (kthread_status > RCU_KTHREAD_MAX) | ||
| 106 | return '?'; | ||
| 107 | return "SRWOY"[kthread_status]; | ||
| 108 | } | ||
| 109 | |||
| 110 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 111 | |||
| 112 | static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) | ||
| 113 | { | ||
| 114 | long ql, qll; | ||
| 115 | |||
| 116 | if (!rdp->beenonline) | ||
| 117 | return; | ||
| 118 | seq_printf(m, "%3d%cc=%ld g=%ld pq=%d qp=%d", | ||
| 119 | rdp->cpu, | ||
| 120 | cpu_is_offline(rdp->cpu) ? '!' : ' ', | ||
| 121 | ulong2long(rdp->completed), ulong2long(rdp->gpnum), | ||
| 122 | rdp->passed_quiesce, rdp->qs_pending); | ||
| 123 | seq_printf(m, " dt=%d/%llx/%d df=%lu", | ||
| 124 | atomic_read(&rdp->dynticks->dynticks), | ||
| 125 | rdp->dynticks->dynticks_nesting, | ||
| 126 | rdp->dynticks->dynticks_nmi_nesting, | ||
| 127 | rdp->dynticks_fqs); | ||
| 128 | seq_printf(m, " of=%lu", rdp->offline_fqs); | ||
| 129 | rcu_nocb_q_lengths(rdp, &ql, &qll); | ||
| 130 | qll += rdp->qlen_lazy; | ||
| 131 | ql += rdp->qlen; | ||
| 132 | seq_printf(m, " ql=%ld/%ld qs=%c%c%c%c", | ||
| 133 | qll, ql, | ||
| 134 | ".N"[rdp->nxttail[RCU_NEXT_READY_TAIL] != | ||
| 135 | rdp->nxttail[RCU_NEXT_TAIL]], | ||
| 136 | ".R"[rdp->nxttail[RCU_WAIT_TAIL] != | ||
| 137 | rdp->nxttail[RCU_NEXT_READY_TAIL]], | ||
| 138 | ".W"[rdp->nxttail[RCU_DONE_TAIL] != | ||
| 139 | rdp->nxttail[RCU_WAIT_TAIL]], | ||
| 140 | ".D"[&rdp->nxtlist != rdp->nxttail[RCU_DONE_TAIL]]); | ||
| 141 | #ifdef CONFIG_RCU_BOOST | ||
| 142 | seq_printf(m, " kt=%d/%c ktl=%x", | ||
| 143 | per_cpu(rcu_cpu_has_work, rdp->cpu), | ||
| 144 | convert_kthread_status(per_cpu(rcu_cpu_kthread_status, | ||
| 145 | rdp->cpu)), | ||
| 146 | per_cpu(rcu_cpu_kthread_loops, rdp->cpu) & 0xffff); | ||
| 147 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 148 | seq_printf(m, " b=%ld", rdp->blimit); | ||
| 149 | seq_printf(m, " ci=%lu nci=%lu co=%lu ca=%lu\n", | ||
| 150 | rdp->n_cbs_invoked, rdp->n_nocbs_invoked, | ||
| 151 | rdp->n_cbs_orphaned, rdp->n_cbs_adopted); | ||
| 152 | } | ||
| 153 | |||
| 154 | static int show_rcudata(struct seq_file *m, void *v) | ||
| 155 | { | ||
| 156 | print_one_rcu_data(m, (struct rcu_data *)v); | ||
| 157 | return 0; | ||
| 158 | } | ||
| 159 | |||
| 160 | static const struct seq_operations rcudate_op = { | ||
| 161 | .start = r_start, | ||
| 162 | .next = r_next, | ||
| 163 | .stop = r_stop, | ||
| 164 | .show = show_rcudata, | ||
| 165 | }; | ||
| 166 | |||
| 167 | static int rcudata_open(struct inode *inode, struct file *file) | ||
| 168 | { | ||
| 169 | return r_open(inode, file, &rcudate_op); | ||
| 170 | } | ||
| 171 | |||
| 172 | static const struct file_operations rcudata_fops = { | ||
| 173 | .owner = THIS_MODULE, | ||
| 174 | .open = rcudata_open, | ||
| 175 | .read = seq_read, | ||
| 176 | .llseek = no_llseek, | ||
| 177 | .release = seq_release, | ||
| 178 | }; | ||
| 179 | |||
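In a running kernel these seq_file hooks are wired to debugfs entries by setup code later in this file, conventionally under a path such as /sys/kernel/debug/rcu/<flavor>/rcudata, so the per-CPU state printed by print_one_rcu_data() can be inspected with an ordinary read, for example (hypothetical path):

        cat /sys/kernel/debug/rcu/rcu_sched/rcudata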
| 180 | static int show_rcuexp(struct seq_file *m, void *v) | ||
| 181 | { | ||
| 182 | struct rcu_state *rsp = (struct rcu_state *)m->private; | ||
| 183 | |||
| 184 | seq_printf(m, "s=%lu d=%lu w=%lu tf=%lu wd1=%lu wd2=%lu n=%lu sc=%lu dt=%lu dl=%lu dx=%lu\n", | ||
| 185 | atomic_long_read(&rsp->expedited_start), | ||
| 186 | atomic_long_read(&rsp->expedited_done), | ||
| 187 | atomic_long_read(&rsp->expedited_wrap), | ||
| 188 | atomic_long_read(&rsp->expedited_tryfail), | ||
| 189 | atomic_long_read(&rsp->expedited_workdone1), | ||
| 190 | atomic_long_read(&rsp->expedited_workdone2), | ||
| 191 | atomic_long_read(&rsp->expedited_normal), | ||
| 192 | atomic_long_read(&rsp->expedited_stoppedcpus), | ||
| 193 | atomic_long_read(&rsp->expedited_done_tries), | ||
| 194 | atomic_long_read(&rsp->expedited_done_lost), | ||
| 195 | atomic_long_read(&rsp->expedited_done_exit)); | ||
| 196 | return 0; | ||
| 197 | } | ||
| 198 | |||
| 199 | static int rcuexp_open(struct inode *inode, struct file *file) | ||
| 200 | { | ||
| 201 | return single_open(file, show_rcuexp, inode->i_private); | ||
| 202 | } | ||
| 203 | |||
| 204 | static const struct file_operations rcuexp_fops = { | ||
| 205 | .owner = THIS_MODULE, | ||
| 206 | .open = rcuexp_open, | ||
| 207 | .read = seq_read, | ||
| 208 | .llseek = no_llseek, | ||
| 209 | .release = single_release, | ||
| 210 | }; | ||
| 211 | |||
| 212 | #ifdef CONFIG_RCU_BOOST | ||
| 213 | |||
| 214 | static void print_one_rcu_node_boost(struct seq_file *m, struct rcu_node *rnp) | ||
| 215 | { | ||
| 216 | seq_printf(m, "%d:%d tasks=%c%c%c%c kt=%c ntb=%lu neb=%lu nnb=%lu ", | ||
| 217 | rnp->grplo, rnp->grphi, | ||
| 218 | "T."[list_empty(&rnp->blkd_tasks)], | ||
| 219 | "N."[!rnp->gp_tasks], | ||
| 220 | "E."[!rnp->exp_tasks], | ||
| 221 | "B."[!rnp->boost_tasks], | ||
| 222 | convert_kthread_status(rnp->boost_kthread_status), | ||
| 223 | rnp->n_tasks_boosted, rnp->n_exp_boosts, | ||
| 224 | rnp->n_normal_boosts); | ||
| 225 | seq_printf(m, "j=%04x bt=%04x\n", | ||
| 226 | (int)(jiffies & 0xffff), | ||
| 227 | (int)(rnp->boost_time & 0xffff)); | ||
| 228 | seq_printf(m, " balk: nt=%lu egt=%lu bt=%lu nb=%lu ny=%lu nos=%lu\n", | ||
| 229 | rnp->n_balk_blkd_tasks, | ||
| 230 | rnp->n_balk_exp_gp_tasks, | ||
| 231 | rnp->n_balk_boost_tasks, | ||
| 232 | rnp->n_balk_notblocked, | ||
| 233 | rnp->n_balk_notyet, | ||
| 234 | rnp->n_balk_nos); | ||
| 235 | } | ||
| 236 | |||
| 237 | static int show_rcu_node_boost(struct seq_file *m, void *unused) | ||
| 238 | { | ||
| 239 | struct rcu_node *rnp; | ||
| 240 | |||
| 241 | rcu_for_each_leaf_node(&rcu_preempt_state, rnp) | ||
| 242 | print_one_rcu_node_boost(m, rnp); | ||
| 243 | return 0; | ||
| 244 | } | ||
| 245 | |||
| 246 | static int rcu_node_boost_open(struct inode *inode, struct file *file) | ||
| 247 | { | ||
| 248 | return single_open(file, show_rcu_node_boost, NULL); | ||
| 249 | } | ||
| 250 | |||
| 251 | static const struct file_operations rcu_node_boost_fops = { | ||
| 252 | .owner = THIS_MODULE, | ||
| 253 | .open = rcu_node_boost_open, | ||
| 254 | .read = seq_read, | ||
| 255 | .llseek = no_llseek, | ||
| 256 | .release = single_release, | ||
| 257 | }; | ||
| 258 | |||
| 259 | #endif /* #ifdef CONFIG_RCU_BOOST */ | ||
| 260 | |||
| 261 | static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) | ||
| 262 | { | ||
| 263 | unsigned long gpnum; | ||
| 264 | int level = 0; | ||
| 265 | struct rcu_node *rnp; | ||
| 266 | |||
| 267 | gpnum = rsp->gpnum; | ||
| 268 | seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x ", | ||
| 269 | ulong2long(rsp->completed), ulong2long(gpnum), | ||
| 270 | rsp->fqs_state, | ||
| 271 | (long)(rsp->jiffies_force_qs - jiffies), | ||
| 272 | (int)(jiffies & 0xffff)); | ||
| 273 | seq_printf(m, "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld/%ld\n", | ||
| 274 | rsp->n_force_qs, rsp->n_force_qs_ngp, | ||
| 275 | rsp->n_force_qs - rsp->n_force_qs_ngp, | ||
| 276 | rsp->n_force_qs_lh, rsp->qlen_lazy, rsp->qlen); | ||
| 277 | for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < rcu_num_nodes; rnp++) { | ||
| 278 | if (rnp->level != level) { | ||
| 279 | seq_puts(m, "\n"); | ||
| 280 | level = rnp->level; | ||
| 281 | } | ||
| 282 | seq_printf(m, "%lx/%lx %c%c>%c %d:%d ^%d ", | ||
| 283 | rnp->qsmask, rnp->qsmaskinit, | ||
| 284 | ".G"[rnp->gp_tasks != NULL], | ||
| 285 | ".E"[rnp->exp_tasks != NULL], | ||
| 286 | ".T"[!list_empty(&rnp->blkd_tasks)], | ||
| 287 | rnp->grplo, rnp->grphi, rnp->grpnum); | ||
| 288 | } | ||
| 289 | seq_puts(m, "\n"); | ||
| 290 | } | ||
| 291 | |||
| 292 | static int show_rcuhier(struct seq_file *m, void *v) | ||
| 293 | { | ||
| 294 | struct rcu_state *rsp = (struct rcu_state *)m->private; | ||
| 295 | print_one_rcu_state(m, rsp); | ||
| 296 | return 0; | ||
| 297 | } | ||
| 298 | |||
| 299 | static int rcuhier_open(struct inode *inode, struct file *file) | ||
| 300 | { | ||
| 301 | return single_open(file, show_rcuhier, inode->i_private); | ||
| 302 | } | ||
| 303 | |||
| 304 | static const struct file_operations rcuhier_fops = { | ||
| 305 | .owner = THIS_MODULE, | ||
| 306 | .open = rcuhier_open, | ||
| 307 | .read = seq_read, | ||
| 308 | .llseek = no_llseek, | ||
| 309 | .release = single_release, | ||
| 310 | }; | ||
| 311 | |||
| 312 | static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) | ||
| 313 | { | ||
| 314 | unsigned long flags; | ||
| 315 | unsigned long completed; | ||
| 316 | unsigned long gpnum; | ||
| 317 | unsigned long gpage; | ||
| 318 | unsigned long gpmax; | ||
| 319 | struct rcu_node *rnp = &rsp->node[0]; | ||
| 320 | |||
| 321 | raw_spin_lock_irqsave(&rnp->lock, flags); | ||
| 322 | completed = ACCESS_ONCE(rsp->completed); | ||
| 323 | gpnum = ACCESS_ONCE(rsp->gpnum); | ||
| 324 | if (completed == gpnum) | ||
| 325 | gpage = 0; | ||
| 326 | else | ||
| 327 | gpage = jiffies - rsp->gp_start; | ||
| 328 | gpmax = rsp->gp_max; | ||
| 329 | raw_spin_unlock_irqrestore(&rnp->lock, flags); | ||
| 330 | seq_printf(m, "completed=%ld gpnum=%ld age=%ld max=%ld\n", | ||
| 331 | ulong2long(completed), ulong2long(gpnum), gpage, gpmax); | ||
| 332 | } | ||
| 333 | |||
| 334 | static int show_rcugp(struct seq_file *m, void *v) | ||
| 335 | { | ||
| 336 | struct rcu_state *rsp = (struct rcu_state *)m->private; | ||
| 337 | show_one_rcugp(m, rsp); | ||
| 338 | return 0; | ||
| 339 | } | ||
| 340 | |||
| 341 | static int rcugp_open(struct inode *inode, struct file *file) | ||
| 342 | { | ||
| 343 | return single_open(file, show_rcugp, inode->i_private); | ||
| 344 | } | ||
| 345 | |||
| 346 | static const struct file_operations rcugp_fops = { | ||
| 347 | .owner = THIS_MODULE, | ||
| 348 | .open = rcugp_open, | ||
| 349 | .read = seq_read, | ||
| 350 | .llseek = no_llseek, | ||
| 351 | .release = single_release, | ||
| 352 | }; | ||
| 353 | |||
| 354 | static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) | ||
| 355 | { | ||
| 356 | if (!rdp->beenonline) | ||
| 357 | return; | ||
| 358 | seq_printf(m, "%3d%cnp=%ld ", | ||
| 359 | rdp->cpu, | ||
| 360 | cpu_is_offline(rdp->cpu) ? '!' : ' ', | ||
| 361 | rdp->n_rcu_pending); | ||
| 362 | seq_printf(m, "qsp=%ld rpq=%ld cbr=%ld cng=%ld ", | ||
| 363 | rdp->n_rp_qs_pending, | ||
| 364 | rdp->n_rp_report_qs, | ||
| 365 | rdp->n_rp_cb_ready, | ||
| 366 | rdp->n_rp_cpu_needs_gp); | ||
| 367 | seq_printf(m, "gpc=%ld gps=%ld nn=%ld\n", | ||
| 368 | rdp->n_rp_gp_completed, | ||
| 369 | rdp->n_rp_gp_started, | ||
| 370 | rdp->n_rp_need_nothing); | ||
| 371 | } | ||
| 372 | |||
| 373 | static int show_rcu_pending(struct seq_file *m, void *v) | ||
| 374 | { | ||
| 375 | print_one_rcu_pending(m, (struct rcu_data *)v); | ||
| 376 | return 0; | ||
| 377 | } | ||
| 378 | |||
| 379 | static const struct seq_operations rcu_pending_op = { | ||
| 380 | .start = r_start, | ||
| 381 | .next = r_next, | ||
| 382 | .stop = r_stop, | ||
| 383 | .show = show_rcu_pending, | ||
| 384 | }; | ||
| 385 | |||
| 386 | static int rcu_pending_open(struct inode *inode, struct file *file) | ||
| 387 | { | ||
| 388 | return r_open(inode, file, &rcu_pending_op); | ||
| 389 | } | ||
| 390 | |||
| 391 | static const struct file_operations rcu_pending_fops = { | ||
| 392 | .owner = THIS_MODULE, | ||
| 393 | .open = rcu_pending_open, | ||
| 394 | .read = seq_read, | ||
| 395 | .llseek = no_llseek, | ||
| 396 | .release = seq_release, | ||
| 397 | }; | ||
| 398 | |||
| 399 | static int show_rcutorture(struct seq_file *m, void *unused) | ||
| 400 | { | ||
| 401 | seq_printf(m, "rcutorture test sequence: %lu %s\n", | ||
| 402 | rcutorture_testseq >> 1, | ||
| 403 | (rcutorture_testseq & 0x1) ? "(test in progress)" : ""); | ||
| 404 | seq_printf(m, "rcutorture update version number: %lu\n", | ||
| 405 | rcutorture_vernum); | ||
| 406 | return 0; | ||
| 407 | } | ||
| 408 | |||
| 409 | static int rcutorture_open(struct inode *inode, struct file *file) | ||
| 410 | { | ||
| 411 | return single_open(file, show_rcutorture, NULL); | ||
| 412 | } | ||
| 413 | |||
| 414 | static const struct file_operations rcutorture_fops = { | ||
| 415 | .owner = THIS_MODULE, | ||
| 416 | .open = rcutorture_open, | ||
| 417 | .read = seq_read, | ||
| 418 | .llseek = seq_lseek, | ||
| 419 | .release = single_release, | ||
| 420 | }; | ||
| 421 | |||
| 422 | static struct dentry *rcudir; | ||
| 423 | |||
| 424 | static int __init rcutree_trace_init(void) | ||
| 425 | { | ||
| 426 | struct rcu_state *rsp; | ||
| 427 | struct dentry *retval; | ||
| 428 | struct dentry *rspdir; | ||
| 429 | |||
| 430 | rcudir = debugfs_create_dir("rcu", NULL); | ||
| 431 | if (!rcudir) | ||
| 432 | goto free_out; | ||
| 433 | |||
| 434 | for_each_rcu_flavor(rsp) { | ||
| 435 | rspdir = debugfs_create_dir(rsp->name, rcudir); | ||
| 436 | if (!rspdir) | ||
| 437 | goto free_out; | ||
| 438 | |||
| 439 | retval = debugfs_create_file("rcudata", 0444, | ||
| 440 | rspdir, rsp, &rcudata_fops); | ||
| 441 | if (!retval) | ||
| 442 | goto free_out; | ||
| 443 | |||
| 444 | retval = debugfs_create_file("rcuexp", 0444, | ||
| 445 | rspdir, rsp, &rcuexp_fops); | ||
| 446 | if (!retval) | ||
| 447 | goto free_out; | ||
| 448 | |||
| 449 | retval = debugfs_create_file("rcu_pending", 0444, | ||
| 450 | rspdir, rsp, &rcu_pending_fops); | ||
| 451 | if (!retval) | ||
| 452 | goto free_out; | ||
| 453 | |||
| 454 | retval = debugfs_create_file("rcubarrier", 0444, | ||
| 455 | rspdir, rsp, &rcubarrier_fops); | ||
| 456 | if (!retval) | ||
| 457 | goto free_out; | ||
| 458 | |||
| 459 | #ifdef CONFIG_RCU_BOOST | ||
| 460 | if (rsp == &rcu_preempt_state) { | ||
| 461 | retval = debugfs_create_file("rcuboost", 0444, | ||
| 462 | rspdir, NULL, &rcu_node_boost_fops); | ||
| 463 | if (!retval) | ||
| 464 | goto free_out; | ||
| 465 | } | ||
| 466 | #endif | ||
| 467 | |||
| 468 | retval = debugfs_create_file("rcugp", 0444, | ||
| 469 | rspdir, rsp, &rcugp_fops); | ||
| 470 | if (!retval) | ||
| 471 | goto free_out; | ||
| 472 | |||
| 473 | retval = debugfs_create_file("rcuhier", 0444, | ||
| 474 | rspdir, rsp, &rcuhier_fops); | ||
| 475 | if (!retval) | ||
| 476 | goto free_out; | ||
| 477 | } | ||
| 478 | |||
| 479 | retval = debugfs_create_file("rcutorture", 0444, rcudir, | ||
| 480 | NULL, &rcutorture_fops); | ||
| 481 | if (!retval) | ||
| 482 | goto free_out; | ||
| 483 | return 0; | ||
| 484 | free_out: | ||
| 485 | debugfs_remove_recursive(rcudir); | ||
| 486 | return 1; | ||
| 487 | } | ||
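Assuming debugfs is mounted at /sys/kernel/debug, the loop above yields one directory per registered flavor (for example rcu/rcu_sched/ and rcu/rcu_bh/, plus rcu/rcu_preempt/ on CONFIG_TREE_PREEMPT_RCU=y kernels), each holding the rcudata, rcuexp, rcu_pending, rcubarrier, rcugp and rcuhier files, with rcuboost added to the preemptible flavor's directory when CONFIG_RCU_BOOST=y; the single rcutorture file lives directly under rcu/.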
| 488 | |||
| 489 | static void __exit rcutree_trace_cleanup(void) | ||
| 490 | { | ||
| 491 | debugfs_remove_recursive(rcudir); | ||
| 492 | } | ||
| 493 | |||
| 494 | |||
| 495 | module_init(rcutree_trace_init); | ||
| 496 | module_exit(rcutree_trace_cleanup); | ||
| 497 | |||
| 498 | MODULE_AUTHOR("Paul E. McKenney"); | ||
| 499 | MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation"); | ||
| 500 | MODULE_LICENSE("GPL"); | ||
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c new file mode 100644 index 000000000000..6cb3dff89e2b --- /dev/null +++ b/kernel/rcu/update.c | |||
| @@ -0,0 +1,347 @@ | |||
| 1 | /* | ||
| 2 | * Read-Copy Update mechanism for mutual exclusion | ||
| 3 | * | ||
| 4 | * This program is free software; you can redistribute it and/or modify | ||
| 5 | * it under the terms of the GNU General Public License as published by | ||
| 6 | * the Free Software Foundation; either version 2 of the License, or | ||
| 7 | * (at your option) any later version. | ||
| 8 | * | ||
| 9 | * This program is distributed in the hope that it will be useful, | ||
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | ||
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | ||
| 12 | * GNU General Public License for more details. | ||
| 13 | * | ||
| 14 | * You should have received a copy of the GNU General Public License | ||
| 15 | * along with this program; if not, write to the Free Software | ||
| 16 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. | ||
| 17 | * | ||
| 18 | * Copyright IBM Corporation, 2001 | ||
| 19 | * | ||
| 20 | * Authors: Dipankar Sarma <dipankar@in.ibm.com> | ||
| 21 | * Manfred Spraul <manfred@colorfullife.com> | ||
| 22 | * | ||
| 23 | * Based on the original work by Paul McKenney <paulmck@us.ibm.com> | ||
| 24 | * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. | ||
| 25 | * Papers: | ||
| 26 | * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf | ||
| 27 | * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001) | ||
| 28 | * | ||
| 29 | * For detailed explanation of Read-Copy Update mechanism see - | ||
| 30 | * http://lse.sourceforge.net/locking/rcupdate.html | ||
| 31 | * | ||
| 32 | */ | ||
| 33 | #include <linux/types.h> | ||
| 34 | #include <linux/kernel.h> | ||
| 35 | #include <linux/init.h> | ||
| 36 | #include <linux/spinlock.h> | ||
| 37 | #include <linux/smp.h> | ||
| 38 | #include <linux/interrupt.h> | ||
| 39 | #include <linux/sched.h> | ||
| 40 | #include <linux/atomic.h> | ||
| 41 | #include <linux/bitops.h> | ||
| 42 | #include <linux/percpu.h> | ||
| 43 | #include <linux/notifier.h> | ||
| 44 | #include <linux/cpu.h> | ||
| 45 | #include <linux/mutex.h> | ||
| 46 | #include <linux/export.h> | ||
| 47 | #include <linux/hardirq.h> | ||
| 48 | #include <linux/delay.h> | ||
| 49 | #include <linux/module.h> | ||
| 50 | |||
| 51 | #define CREATE_TRACE_POINTS | ||
| 52 | #include <trace/events/rcu.h> | ||
| 53 | |||
| 54 | #include "rcu.h" | ||
| 55 | |||
| 56 | MODULE_ALIAS("rcupdate"); | ||
| 57 | #ifdef MODULE_PARAM_PREFIX | ||
| 58 | #undef MODULE_PARAM_PREFIX | ||
| 59 | #endif | ||
| 60 | #define MODULE_PARAM_PREFIX "rcupdate." | ||
| 61 | |||
| 62 | module_param(rcu_expedited, int, 0); | ||
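Because of the MODULE_PARAM_PREFIX defined above, this parameter is named rcupdate.rcu_expedited on the kernel command line (for example rcupdate.rcu_expedited=1); the permission argument of 0 means no entry is created under /sys/module/rcupdate/parameters/.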
| 63 | |||
| 64 | #ifdef CONFIG_PREEMPT_RCU | ||
| 65 | |||
| 66 | /* | ||
| 67 | * Preemptible RCU implementation for rcu_read_lock(). | ||
| 68 | * Just increment ->rcu_read_lock_nesting; shared state will be updated | ||
| 69 | * if we block. | ||
| 70 | */ | ||
| 71 | void __rcu_read_lock(void) | ||
| 72 | { | ||
| 73 | current->rcu_read_lock_nesting++; | ||
| 74 | barrier(); /* critical section after entry code. */ | ||
| 75 | } | ||
| 76 | EXPORT_SYMBOL_GPL(__rcu_read_lock); | ||
| 77 | |||
| 78 | /* | ||
| 79 | * Preemptible RCU implementation for rcu_read_unlock(). | ||
| 80 | * Decrement ->rcu_read_lock_nesting. If the result is zero (outermost | ||
| 81 | * rcu_read_unlock()) and ->rcu_read_unlock_special is non-zero, then | ||
| 82 | * invoke rcu_read_unlock_special() to clean up after a context switch | ||
| 83 | * in an RCU read-side critical section and other special cases. | ||
| 84 | */ | ||
| 85 | void __rcu_read_unlock(void) | ||
| 86 | { | ||
| 87 | struct task_struct *t = current; | ||
| 88 | |||
| 89 | if (t->rcu_read_lock_nesting != 1) { | ||
| 90 | --t->rcu_read_lock_nesting; | ||
| 91 | } else { | ||
| 92 | barrier(); /* critical section before exit code. */ | ||
| 93 | t->rcu_read_lock_nesting = INT_MIN; | ||
| 94 | #ifdef CONFIG_PROVE_RCU_DELAY | ||
| 95 | udelay(10); /* Make preemption more probable. */ | ||
| 96 | #endif /* #ifdef CONFIG_PROVE_RCU_DELAY */ | ||
| 97 | barrier(); /* assign before ->rcu_read_unlock_special load */ | ||
| 98 | if (unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) | ||
| 99 | rcu_read_unlock_special(t); | ||
| 100 | barrier(); /* ->rcu_read_unlock_special load before assign */ | ||
| 101 | t->rcu_read_lock_nesting = 0; | ||
| 102 | } | ||
| 103 | #ifdef CONFIG_PROVE_LOCKING | ||
| 104 | { | ||
| 105 | int rrln = ACCESS_ONCE(t->rcu_read_lock_nesting); | ||
| 106 | |||
| 107 | WARN_ON_ONCE(rrln < 0 && rrln > INT_MIN / 2); | ||
| 108 | } | ||
| 109 | #endif /* #ifdef CONFIG_PROVE_LOCKING */ | ||
| 110 | } | ||
| 111 | EXPORT_SYMBOL_GPL(__rcu_read_unlock); | ||
| 112 | |||
| 113 | #endif /* #ifdef CONFIG_PREEMPT_RCU */ | ||
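For orientation, a minimal reader of the sort these primitives serve: under CONFIG_PREEMPT_RCU, rcu_read_lock() and rcu_read_unlock() map onto the __rcu_read_lock() and __rcu_read_unlock() functions above. The structure and variable names below are illustrative only, not part of this file:

	#include <linux/rcupdate.h>

	struct demo_node {
		int value;
		struct rcu_head rh;
	};

	static struct demo_node __rcu *demo_ptr;

	static int demo_read_value(void)
	{
		struct demo_node *p;
		int val = -1;

		rcu_read_lock();		/* increments ->rcu_read_lock_nesting */
		p = rcu_dereference(demo_ptr);	/* fetch the RCU-protected pointer */
		if (p)
			val = p->value;
		rcu_read_unlock();		/* may end up in rcu_read_unlock_special() */
		return val;
	}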
| 114 | |||
| 115 | #ifdef CONFIG_DEBUG_LOCK_ALLOC | ||
| 116 | static struct lock_class_key rcu_lock_key; | ||
| 117 | struct lockdep_map rcu_lock_map = | ||
| 118 | STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); | ||
| 119 | EXPORT_SYMBOL_GPL(rcu_lock_map); | ||
| 120 | |||
| 121 | static struct lock_class_key rcu_bh_lock_key; | ||
| 122 | struct lockdep_map rcu_bh_lock_map = | ||
| 123 | STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_bh", &rcu_bh_lock_key); | ||
| 124 | EXPORT_SYMBOL_GPL(rcu_bh_lock_map); | ||
| 125 | |||
| 126 | static struct lock_class_key rcu_sched_lock_key; | ||
| 127 | struct lockdep_map rcu_sched_lock_map = | ||
| 128 | STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key); | ||
| 129 | EXPORT_SYMBOL_GPL(rcu_sched_lock_map); | ||
| 130 | |||
| 131 | int notrace debug_lockdep_rcu_enabled(void) | ||
| 132 | { | ||
| 133 | return rcu_scheduler_active && debug_locks && | ||
| 134 | current->lockdep_recursion == 0; | ||
| 135 | } | ||
| 136 | EXPORT_SYMBOL_GPL(debug_lockdep_rcu_enabled); | ||
| 137 | |||
| 138 | /** | ||
| 139 | * rcu_read_lock_bh_held() - might we be in RCU-bh read-side critical section? | ||
| 140 | * | ||
| 141 | * Check for bottom half being disabled, which covers both the | ||
| 142 | * CONFIG_PROVE_RCU and !CONFIG_PROVE_RCU cases. Note that if someone uses | ||
| 143 | * rcu_read_lock_bh(), but then later enables BH, lockdep (if enabled) | ||
| 144 | * will show the situation. This is useful for debug checks in functions | ||
| 145 | * that require that they be called within an RCU read-side critical | ||
| 146 | * section. | ||
| 147 | * | ||
| 148 | * Check debug_lockdep_rcu_enabled() to prevent false positives during boot. | ||
| 149 | * | ||
| 150 | * Note that rcu_read_lock() is disallowed if the CPU is either idle or | ||
| 151 | * offline from an RCU perspective, so check for those as well. | ||
| 152 | */ | ||
| 153 | int rcu_read_lock_bh_held(void) | ||
| 154 | { | ||
| 155 | if (!debug_lockdep_rcu_enabled()) | ||
| 156 | return 1; | ||
| 157 | if (!rcu_is_watching()) | ||
| 158 | return 0; | ||
| 159 | if (!rcu_lockdep_current_cpu_online()) | ||
| 160 | return 0; | ||
| 161 | return in_softirq() || irqs_disabled(); | ||
| 162 | } | ||
| 163 | EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held); | ||
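A typical debug use of rcu_read_lock_bh_held() is as the lockdep condition guarding an RCU-bh protected pointer; a hedged sketch with hypothetical names:

	#include <linux/rcupdate.h>

	struct demo_entry;

	/* Hypothetical pointer whose readers run under rcu_read_lock_bh(). */
	static struct demo_entry __rcu *demo_table;

	static struct demo_entry *demo_lookup(void)
	{
		/* Under CONFIG_PROVE_RCU, lockdep complains unless BH is disabled. */
		return rcu_dereference_check(demo_table, rcu_read_lock_bh_held());
	}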
| 164 | |||
| 165 | #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | ||
| 166 | |||
| 167 | struct rcu_synchronize { | ||
| 168 | struct rcu_head head; | ||
| 169 | struct completion completion; | ||
| 170 | }; | ||
| 171 | |||
| 172 | /* | ||
| 173 | * Awaken the corresponding synchronize_rcu() instance now that a | ||
| 174 | * grace period has elapsed. | ||
| 175 | */ | ||
| 176 | static void wakeme_after_rcu(struct rcu_head *head) | ||
| 177 | { | ||
| 178 | struct rcu_synchronize *rcu; | ||
| 179 | |||
| 180 | rcu = container_of(head, struct rcu_synchronize, head); | ||
| 181 | complete(&rcu->completion); | ||
| 182 | } | ||
| 183 | |||
| 184 | void wait_rcu_gp(call_rcu_func_t crf) | ||
| 185 | { | ||
| 186 | struct rcu_synchronize rcu; | ||
| 187 | |||
| 188 | init_rcu_head_on_stack(&rcu.head); | ||
| 189 | init_completion(&rcu.completion); | ||
| 190 | /* Will wake me after RCU finishes. */ | ||
| 191 | crf(&rcu.head, wakeme_after_rcu); | ||
| 192 | /* Wait for it. */ | ||
| 193 | wait_for_completion(&rcu.completion); | ||
| 194 | destroy_rcu_head_on_stack(&rcu.head); | ||
| 195 | } | ||
| 196 | EXPORT_SYMBOL_GPL(wait_rcu_gp); | ||
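The synchronous grace-period primitives are thin wrappers around this helper; a simplified sketch of the pattern (the in-tree synchronize_rcu() adds expedited handling and debugging checks):

	#include <linux/rcupdate.h>

	/* Simplified: block the caller until a full grace period has elapsed. */
	static void demo_synchronize_rcu(void)
	{
		wait_rcu_gp(call_rcu);	/* queues wakeme_after_rcu() and waits */
	}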
| 197 | |||
| 198 | #ifdef CONFIG_PROVE_RCU | ||
| 199 | /* | ||
| 200 | * Wrapper function to avoid #include problems. | ||
| 201 | */ | ||
| 202 | int rcu_my_thread_group_empty(void) | ||
| 203 | { | ||
| 204 | return thread_group_empty(current); | ||
| 205 | } | ||
| 206 | EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty); | ||
| 207 | #endif /* #ifdef CONFIG_PROVE_RCU */ | ||
| 208 | |||
| 209 | #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD | ||
| 210 | static inline void debug_init_rcu_head(struct rcu_head *head) | ||
| 211 | { | ||
| 212 | debug_object_init(head, &rcuhead_debug_descr); | ||
| 213 | } | ||
| 214 | |||
| 215 | static inline void debug_rcu_head_free(struct rcu_head *head) | ||
| 216 | { | ||
| 217 | debug_object_free(head, &rcuhead_debug_descr); | ||
| 218 | } | ||
| 219 | |||
| 220 | /* | ||
| 221 | * fixup_activate is called when: | ||
| 222 | * - an active object is activated | ||
| 223 | * - an unknown object is activated (might be a statically initialized object) | ||
| 224 | * Activation is performed internally by call_rcu(). | ||
| 225 | */ | ||
| 226 | static int rcuhead_fixup_activate(void *addr, enum debug_obj_state state) | ||
| 227 | { | ||
| 228 | struct rcu_head *head = addr; | ||
| 229 | |||
| 230 | switch (state) { | ||
| 231 | |||
| 232 | case ODEBUG_STATE_NOTAVAILABLE: | ||
| 233 | /* | ||
| 234 | * This is not really a fixup. We just make sure that it is | ||
| 235 | * tracked in the object tracker. | ||
| 236 | */ | ||
| 237 | debug_object_init(head, &rcuhead_debug_descr); | ||
| 238 | debug_object_activate(head, &rcuhead_debug_descr); | ||
| 239 | return 0; | ||
| 240 | default: | ||
| 241 | return 1; | ||
| 242 | } | ||
| 243 | } | ||
| 244 | |||
| 245 | /** | ||
| 246 | * init_rcu_head_on_stack() - initialize on-stack rcu_head for debugobjects | ||
| 247 | * @head: pointer to rcu_head structure to be initialized | ||
| 248 | * | ||
| 249 | * This function informs debugobjects of a new rcu_head structure that | ||
| 250 | * has been allocated as an auto variable on the stack. This function | ||
| 251 | * is not required for rcu_head structures that are statically defined or | ||
| 252 | * that are dynamically allocated on the heap. This function has no | ||
| 253 | * effect for !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds. | ||
| 254 | */ | ||
| 255 | void init_rcu_head_on_stack(struct rcu_head *head) | ||
| 256 | { | ||
| 257 | debug_object_init_on_stack(head, &rcuhead_debug_descr); | ||
| 258 | } | ||
| 259 | EXPORT_SYMBOL_GPL(init_rcu_head_on_stack); | ||
| 260 | |||
| 261 | /** | ||
| 262 | * destroy_rcu_head_on_stack() - destroy on-stack rcu_head for debugobjects | ||
| 263 | * @head: pointer to rcu_head structure that is about to go out of scope | ||
| 264 | * | ||
| 265 | * This function informs debugobjects that an on-stack rcu_head structure | ||
| 266 | * is about to go out of scope. As with init_rcu_head_on_stack(), this | ||
| 267 | * function is not required for rcu_head structures that are statically | ||
| 268 | * defined or that are dynamically allocated on the heap. Also as with | ||
| 269 | * init_rcu_head_on_stack(), this function has no effect for | ||
| 270 | * !CONFIG_DEBUG_OBJECTS_RCU_HEAD kernel builds. | ||
| 271 | */ | ||
| 272 | void destroy_rcu_head_on_stack(struct rcu_head *head) | ||
| 273 | { | ||
| 274 | debug_object_free(head, &rcuhead_debug_descr); | ||
| 275 | } | ||
| 276 | EXPORT_SYMBOL_GPL(destroy_rcu_head_on_stack); | ||
| 277 | |||
| 278 | struct debug_obj_descr rcuhead_debug_descr = { | ||
| 279 | .name = "rcu_head", | ||
| 280 | .fixup_activate = rcuhead_fixup_activate, | ||
| 281 | }; | ||
| 282 | EXPORT_SYMBOL_GPL(rcuhead_debug_descr); | ||
| 283 | #endif /* #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD */ | ||
| 284 | |||
| 285 | #if defined(CONFIG_TREE_RCU) || defined(CONFIG_TREE_PREEMPT_RCU) || defined(CONFIG_RCU_TRACE) | ||
| 286 | void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, | ||
| 287 | unsigned long secs, | ||
| 288 | unsigned long c_old, unsigned long c) | ||
| 289 | { | ||
| 290 | trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c); | ||
| 291 | } | ||
| 292 | EXPORT_SYMBOL_GPL(do_trace_rcu_torture_read); | ||
| 293 | #else | ||
| 294 | #define do_trace_rcu_torture_read(rcutorturename, rhp, secs, c_old, c) \ | ||
| 295 | do { } while (0) | ||
| 296 | #endif | ||
| 297 | |||
| 298 | #ifdef CONFIG_RCU_STALL_COMMON | ||
| 299 | |||
| 300 | #ifdef CONFIG_PROVE_RCU | ||
| 301 | #define RCU_STALL_DELAY_DELTA (5 * HZ) | ||
| 302 | #else | ||
| 303 | #define RCU_STALL_DELAY_DELTA 0 | ||
| 304 | #endif | ||
| 305 | |||
| 306 | int rcu_cpu_stall_suppress __read_mostly; /* 1 = suppress stall warnings. */ | ||
| 307 | static int rcu_cpu_stall_timeout __read_mostly = CONFIG_RCU_CPU_STALL_TIMEOUT; | ||
| 308 | |||
| 309 | module_param(rcu_cpu_stall_suppress, int, 0644); | ||
| 310 | module_param(rcu_cpu_stall_timeout, int, 0644); | ||
| 311 | |||
| 312 | int rcu_jiffies_till_stall_check(void) | ||
| 313 | { | ||
| 314 | int till_stall_check = ACCESS_ONCE(rcu_cpu_stall_timeout); | ||
| 315 | |||
| 316 | /* | ||
| 317 | * Limit check must be consistent with the Kconfig limits | ||
| 318 | * for CONFIG_RCU_CPU_STALL_TIMEOUT. | ||
| 319 | */ | ||
| 320 | if (till_stall_check < 3) { | ||
| 321 | ACCESS_ONCE(rcu_cpu_stall_timeout) = 3; | ||
| 322 | till_stall_check = 3; | ||
| 323 | } else if (till_stall_check > 300) { | ||
| 324 | ACCESS_ONCE(rcu_cpu_stall_timeout) = 300; | ||
| 325 | till_stall_check = 300; | ||
| 326 | } | ||
| 327 | return till_stall_check * HZ + RCU_STALL_DELAY_DELTA; | ||
| 328 | } | ||
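For example, leaving rcu_cpu_stall_timeout at the usual Kconfig default of 21 seconds yields 21 * HZ jiffies on a CONFIG_PROVE_RCU=n build and 26 * HZ when CONFIG_PROVE_RCU=y (the extra RCU_STALL_DELAY_DELTA of 5 * HZ); out-of-range settings are first clamped to the 3..300 second window.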
| 329 | |||
| 330 | static int rcu_panic(struct notifier_block *this, unsigned long ev, void *ptr) | ||
| 331 | { | ||
| 332 | rcu_cpu_stall_suppress = 1; | ||
| 333 | return NOTIFY_DONE; | ||
| 334 | } | ||
| 335 | |||
| 336 | static struct notifier_block rcu_panic_block = { | ||
| 337 | .notifier_call = rcu_panic, | ||
| 338 | }; | ||
| 339 | |||
| 340 | static int __init check_cpu_stall_init(void) | ||
| 341 | { | ||
| 342 | atomic_notifier_chain_register(&panic_notifier_list, &rcu_panic_block); | ||
| 343 | return 0; | ||
| 344 | } | ||
| 345 | early_initcall(check_cpu_stall_init); | ||
| 346 | |||
| 347 | #endif /* #ifdef CONFIG_RCU_STALL_COMMON */ | ||
