Diffstat (limited to 'kernel/srcu.c')
-rw-r--r-- | kernel/srcu.c | 548 |
1 file changed, 435 insertions, 113 deletions
diff --git a/kernel/srcu.c b/kernel/srcu.c
index ba35f3a4a1f4..2095be3318d5 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -34,10 +34,77 @@
34 | #include <linux/delay.h> | 34 | #include <linux/delay.h> |
35 | #include <linux/srcu.h> | 35 | #include <linux/srcu.h> |
36 | 36 | ||
37 | /* | ||
38 | * Initialize an rcu_batch structure to empty. | ||
39 | */ | ||
40 | static inline void rcu_batch_init(struct rcu_batch *b) | ||
41 | { | ||
42 | b->head = NULL; | ||
43 | b->tail = &b->head; | ||
44 | } | ||
45 | |||
46 | /* | ||
47 | * Enqueue a callback onto the tail of the specified rcu_batch structure. | ||
48 | */ | ||
49 | static inline void rcu_batch_queue(struct rcu_batch *b, struct rcu_head *head) | ||
50 | { | ||
51 | *b->tail = head; | ||
52 | b->tail = &head->next; | ||
53 | } | ||
54 | |||
55 | /* | ||
56 | * Is the specified rcu_batch structure empty? | ||
57 | */ | ||
58 | static inline bool rcu_batch_empty(struct rcu_batch *b) | ||
59 | { | ||
60 | return b->tail == &b->head; | ||
61 | } | ||
62 | |||
63 | /* | ||
64 | * Remove the callback at the head of the specified rcu_batch structure | ||
65 | * and return a pointer to it, or return NULL if the structure is empty. | ||
66 | */ | ||
67 | static inline struct rcu_head *rcu_batch_dequeue(struct rcu_batch *b) | ||
68 | { | ||
69 | struct rcu_head *head; | ||
70 | |||
71 | if (rcu_batch_empty(b)) | ||
72 | return NULL; | ||
73 | |||
74 | head = b->head; | ||
75 | b->head = head->next; | ||
76 | if (b->tail == &head->next) | ||
77 | rcu_batch_init(b); | ||
78 | |||
79 | return head; | ||
80 | } | ||
81 | |||
82 | /* | ||
83 | * Move all callbacks from the rcu_batch structure specified by "from" to | ||
84 | * the structure specified by "to". | ||
85 | */ | ||
86 | static inline void rcu_batch_move(struct rcu_batch *to, struct rcu_batch *from) | ||
87 | { | ||
88 | if (!rcu_batch_empty(from)) { | ||
89 | *to->tail = from->head; | ||
90 | to->tail = from->tail; | ||
91 | rcu_batch_init(from); | ||
92 | } | ||
93 | } | ||
94 | |||
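These rcu_batch helpers are the usual singly linked queue with a tail pointer that addresses the final ->next slot (or ->head when the queue is empty), which is what makes enqueue and the whole-list splice in rcu_batch_move() O(1). Below is a minimal stand-alone sketch of the same idiom, assuming nothing beyond standard C; the struct names merely mirror the ones added above and the program is illustrative only.

#include <stdio.h>
#include <stddef.h>

struct rcu_head { struct rcu_head *next; };
struct rcu_batch { struct rcu_head *head, **tail; };

static void batch_init(struct rcu_batch *b)
{
	b->head = NULL;
	b->tail = &b->head;		/* tail points back at head when empty */
}

static void batch_queue(struct rcu_batch *b, struct rcu_head *h)
{
	*b->tail = h;			/* link after the current last element */
	b->tail = &h->next;		/* tail always addresses the last ->next slot */
}

int main(void)
{
	struct rcu_batch b;
	struct rcu_head a1 = { NULL }, a2 = { NULL };

	batch_init(&b);
	batch_queue(&b, &a1);
	batch_queue(&b, &a2);

	/* The emptiness test used by rcu_batch_empty(): prints 0 here. */
	printf("empty? %d\n", b.tail == &b.head);
	return 0;
}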
95 | /* single-threaded state machine */ | ||
96 | static void process_srcu(struct work_struct *work); | ||
97 | |||
37 | static int init_srcu_struct_fields(struct srcu_struct *sp) | 98 | static int init_srcu_struct_fields(struct srcu_struct *sp) |
38 | { | 99 | { |
39 | sp->completed = 0; | 100 | sp->completed = 0; |
40 | mutex_init(&sp->mutex); | 101 | spin_lock_init(&sp->queue_lock); |
102 | sp->running = false; | ||
103 | rcu_batch_init(&sp->batch_queue); | ||
104 | rcu_batch_init(&sp->batch_check0); | ||
105 | rcu_batch_init(&sp->batch_check1); | ||
106 | rcu_batch_init(&sp->batch_done); | ||
107 | INIT_DELAYED_WORK(&sp->work, process_srcu); | ||
41 | sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); | 108 | sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); |
42 | return sp->per_cpu_ref ? 0 : -ENOMEM; | 109 | return sp->per_cpu_ref ? 0 : -ENOMEM; |
43 | } | 110 | } |
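For orientation, here is a hedged sketch of how a caller typically brackets this initialization path; my_srcu and the module hooks are placeholder names, not part of this patch.

#include <linux/module.h>
#include <linux/srcu.h>

static struct srcu_struct my_srcu;	/* hypothetical SRCU domain */

static int __init my_init(void)
{
	/* Runs the init_srcu_struct_fields() shown above. */
	return init_srcu_struct(&my_srcu);
}

static void __exit my_exit(void)
{
	/* All readers gone and callbacks drained before this point. */
	cleanup_srcu_struct(&my_srcu);
}

module_init(my_init);
module_exit(my_exit);
MODULE_LICENSE("GPL");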
@@ -73,21 +140,116 @@ EXPORT_SYMBOL_GPL(init_srcu_struct); | |||
73 | #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ | 140 | #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ |
74 | 141 | ||
75 | /* | 142 | /* |
76 | * srcu_readers_active_idx -- returns approximate number of readers | 143 | * Returns approximate total of the readers' ->seq[] values for the |
77 | * active on the specified rank of per-CPU counters. | 144 | * rank of per-CPU counters specified by idx. |
78 | */ | 145 | */ |
146 | static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx) | ||
147 | { | ||
148 | int cpu; | ||
149 | unsigned long sum = 0; | ||
150 | unsigned long t; | ||
79 | 151 | ||
80 | static int srcu_readers_active_idx(struct srcu_struct *sp, int idx) | 152 | for_each_possible_cpu(cpu) { |
153 | t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]); | ||
154 | sum += t; | ||
155 | } | ||
156 | return sum; | ||
157 | } | ||
158 | |||
159 | /* | ||
160 | * Returns approximate number of readers active on the specified rank | ||
161 | * of the per-CPU ->c[] counters. | ||
162 | */ | ||
163 | static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx) | ||
81 | { | 164 | { |
82 | int cpu; | 165 | int cpu; |
83 | int sum; | 166 | unsigned long sum = 0; |
167 | unsigned long t; | ||
84 | 168 | ||
85 | sum = 0; | 169 | for_each_possible_cpu(cpu) { |
86 | for_each_possible_cpu(cpu) | 170 | t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]); |
87 | sum += per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]; | 171 | sum += t; |
172 | } | ||
88 | return sum; | 173 | return sum; |
89 | } | 174 | } |
90 | 175 | ||
176 | /* | ||
177 | * Return true if the number of pre-existing readers is determined to | ||
178 | * be stably zero. An example unstable zero can occur if the call | ||
179 | * to srcu_readers_active_idx() misses an __srcu_read_lock() increment, | ||
180 | * but due to task migration, sees the corresponding __srcu_read_unlock() | ||
181 | * decrement. This can happen because srcu_readers_active_idx() takes | ||
182 | * time to sum the array, and might in fact be interrupted or preempted | ||
183 | * partway through the summation. | ||
184 | */ | ||
185 | static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx) | ||
186 | { | ||
187 | unsigned long seq; | ||
188 | |||
189 | seq = srcu_readers_seq_idx(sp, idx); | ||
190 | |||
191 | /* | ||
192 | * The following smp_mb() A pairs with the smp_mb() B located in | ||
193 | * __srcu_read_lock(). This pairing ensures that if an | ||
194 | * __srcu_read_lock() increments its counter after the summation | ||
195 | * in srcu_readers_active_idx(), then the corresponding SRCU read-side | ||
196 | * critical section will see any changes made prior to the start | ||
197 | * of the current SRCU grace period. | ||
198 | * | ||
199 | * Also, if the above call to srcu_readers_seq_idx() saw the | ||
200 | * increment of ->seq[], then the call to srcu_readers_active_idx() | ||
201 | * must see the increment of ->c[]. | ||
202 | */ | ||
203 | smp_mb(); /* A */ | ||
204 | |||
205 | /* | ||
206 | * Note that srcu_readers_active_idx() can incorrectly return | ||
207 | * zero even though there is a pre-existing reader throughout. | ||
208 | * To see this, suppose that task A is in a very long SRCU | ||
209 | * read-side critical section that started on CPU 0, and that | ||
210 | * no other reader exists, so that the sum of the counters | ||
211 | * is equal to one. Then suppose that task B starts executing | ||
212 | * srcu_readers_active_idx(), summing up to CPU 1, and then that | ||
213 | * task C starts reading on CPU 0, so that its increment is not | ||
214 | * summed, but finishes reading on CPU 2, so that its decrement | ||
215 | * -is- summed. Then when task B completes its sum, it will | ||
216 | * incorrectly get zero, despite the fact that task A has been | ||
217 | * in its SRCU read-side critical section the whole time. | ||
218 | * | ||
219 | * We therefore do a validation step should srcu_readers_active_idx() | ||
220 | * return zero. | ||
221 | */ | ||
222 | if (srcu_readers_active_idx(sp, idx) != 0) | ||
223 | return false; | ||
224 | |||
225 | /* | ||
226 | * The remainder of this function is the validation step. | ||
227 | * The following smp_mb() D pairs with the smp_mb() C in | ||
228 | * __srcu_read_unlock(). If the __srcu_read_unlock() was seen | ||
229 | * by srcu_readers_active_idx() above, then any destructive | ||
230 | * operation performed after the grace period will happen after | ||
231 | * the corresponding SRCU read-side critical section. | ||
232 | * | ||
233 | * Note that there can be at most NR_CPUS worth of readers using | ||
234 | * the old index, which is not enough to overflow even a 32-bit | ||
235 | * integer. (Yes, this does mean that systems having more than | ||
236 | * a billion or so CPUs need to be 64-bit systems.) Therefore, | ||
237 | * the sum of the ->seq[] counters cannot possibly overflow. | ||
238 | * Therefore, the only way that the return values of the two | ||
239 | * calls to srcu_readers_seq_idx() can be equal is if there were | ||
240 | * no increments of the corresponding rank of ->seq[] counts | ||
241 | * in the interim. But the missed-increment scenario laid out | ||
242 | * above includes an increment of the ->seq[] counter by | ||
243 | * the corresponding __srcu_read_lock(). Therefore, if this | ||
244 | * scenario occurs, the return values from the two calls to | ||
245 | * srcu_readers_seq_idx() will differ, and thus the validation | ||
246 | * step below suffices. | ||
247 | */ | ||
248 | smp_mb(); /* D */ | ||
249 | |||
250 | return srcu_readers_seq_idx(sp, idx) == seq; | ||
251 | } | ||
252 | |||
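The scenario described in the comment above can be reproduced in a few lines. The following user-space model (an illustration only, not kernel code) scripts the interleaving with three "CPUs": the updater's partial sum of ->c[] misses reader C's increment on CPU 0 but sees its decrement on CPU 2, yielding a bogus zero while reader A is still active, whereas re-reading the ->seq[] sum catches the missed increment and forces a retry.

#include <stdio.h>

#define NR_CPUS 3

static unsigned long c[NR_CPUS];	/* per-CPU ->c[idx] for one idx */
static unsigned long seq[NR_CPUS];	/* per-CPU ->seq[idx] for one idx */

static unsigned long sum(unsigned long *a)	/* full sum, as srcu_readers_seq_idx() */
{
	unsigned long s = 0;
	for (int i = 0; i < NR_CPUS; i++)
		s += a[i];
	return s;
}

int main(void)
{
	unsigned long snap, partial = 0;

	/* Task A: long-running reader enters on CPU 0. */
	c[0]++; seq[0]++;

	/* Updater: snapshot ->seq[], then start summing ->c[]. */
	snap = sum(seq);
	partial += c[0];		/* sees A's increment */
	partial += c[1];		/* ...updater delayed here... */

	/* Task C: enters on CPU 0 (missed), exits on CPU 2 (seen). */
	c[0]++; seq[0]++;
	c[2]--;

	partial += c[2];		/* updater resumes summing */

	printf("sum of ->c[] = %ld (bogus zero, A still reads)\n", (long)partial);
	printf("seq changed?  %s\n", sum(seq) != snap ? "yes, retry" : "no");
	return 0;
}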
91 | /** | 253 | /** |
92 | * srcu_readers_active - returns approximate number of readers. | 254 | * srcu_readers_active - returns approximate number of readers. |
93 | * @sp: which srcu_struct to count active readers (holding srcu_read_lock). | 255 | * @sp: which srcu_struct to count active readers (holding srcu_read_lock). |
@@ -98,7 +260,14 @@ static int srcu_readers_active_idx(struct srcu_struct *sp, int idx) | |||
98 | */ | 260 | */ |
99 | static int srcu_readers_active(struct srcu_struct *sp) | 261 | static int srcu_readers_active(struct srcu_struct *sp) |
100 | { | 262 | { |
101 | return srcu_readers_active_idx(sp, 0) + srcu_readers_active_idx(sp, 1); | 263 | int cpu; |
264 | unsigned long sum = 0; | ||
265 | |||
266 | for_each_possible_cpu(cpu) { | ||
267 | sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[0]); | ||
268 | sum += ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[1]); | ||
269 | } | ||
270 | return sum; | ||
102 | } | 271 | } |
103 | 272 | ||
104 | /** | 273 | /** |
@@ -131,10 +300,11 @@ int __srcu_read_lock(struct srcu_struct *sp) | |||
131 | int idx; | 300 | int idx; |
132 | 301 | ||
133 | preempt_disable(); | 302 | preempt_disable(); |
134 | idx = sp->completed & 0x1; | 303 | idx = rcu_dereference_index_check(sp->completed, |
135 | barrier(); /* ensure compiler looks -once- at sp->completed. */ | 304 | rcu_read_lock_sched_held()) & 0x1; |
136 | per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]++; | 305 | ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1; |
137 | srcu_barrier(); /* ensure compiler won't misorder critical section. */ | 306 | smp_mb(); /* B */ /* Avoid leaking the critical section. */ |
307 | ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1; | ||
138 | preempt_enable(); | 308 | preempt_enable(); |
139 | return idx; | 309 | return idx; |
140 | } | 310 | } |
@@ -149,8 +319,8 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock); | |||
149 | void __srcu_read_unlock(struct srcu_struct *sp, int idx) | 319 | void __srcu_read_unlock(struct srcu_struct *sp, int idx) |
150 | { | 320 | { |
151 | preempt_disable(); | 321 | preempt_disable(); |
152 | srcu_barrier(); /* ensure compiler won't misorder critical section. */ | 322 | smp_mb(); /* C */ /* Avoid leaking the critical section. */ |
153 | per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; | 323 | ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) -= 1; |
154 | preempt_enable(); | 324 | preempt_enable(); |
155 | } | 325 | } |
156 | EXPORT_SYMBOL_GPL(__srcu_read_unlock); | 326 | EXPORT_SYMBOL_GPL(__srcu_read_unlock); |
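For reference, a minimal sketch of the read-side usage that these primitives back, via the srcu_read_lock()/srcu_read_unlock() wrappers; my_srcu, gp, and struct my_data are hypothetical names, not part of this patch.

#include <linux/rcupdate.h>
#include <linux/srcu.h>

struct my_data { int a; };

static struct srcu_struct my_srcu;	/* hypothetical SRCU domain */
static struct my_data __rcu *gp;	/* SRCU-protected pointer */

static int read_a(void)
{
	struct my_data *p;
	int idx, val = -1;

	idx = srcu_read_lock(&my_srcu);		/* enters __srcu_read_lock() */
	p = srcu_dereference(gp, &my_srcu);	/* fetch with lockdep checking */
	if (p)
		val = p->a;
	srcu_read_unlock(&my_srcu, idx);	/* pairs with the lock above */
	return val;
}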
@@ -163,106 +333,119 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock); | |||
163 | * we repeatedly block for 1-millisecond time periods. This approach | 333 | * we repeatedly block for 1-millisecond time periods. This approach |
164 | * has done well in testing, so there is no need for a config parameter. | 334 | * has done well in testing, so there is no need for a config parameter. |
165 | */ | 335 | */ |
166 | #define SYNCHRONIZE_SRCU_READER_DELAY 10 | 336 | #define SRCU_RETRY_CHECK_DELAY 5 |
337 | #define SYNCHRONIZE_SRCU_TRYCOUNT 2 | ||
338 | #define SYNCHRONIZE_SRCU_EXP_TRYCOUNT 12 | ||
167 | 339 | ||
168 | /* | 340 | /* |
169 | * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). | 341 | * @@@ Wait until all pre-existing readers complete. Such readers |
342 | * will have used the index specified by "idx". | ||
343 | * the caller should ensures the ->completed is not changed while checking | ||
344 | * and idx = (->completed & 1) ^ 1 | ||
170 | */ | 345 | */ |
171 | static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) | 346 | static bool try_check_zero(struct srcu_struct *sp, int idx, int trycount) |
172 | { | 347 | { |
173 | int idx; | 348 | for (;;) { |
174 | 349 | if (srcu_readers_active_idx_check(sp, idx)) | |
175 | rcu_lockdep_assert(!lock_is_held(&sp->dep_map) && | 350 | return true; |
176 | !lock_is_held(&rcu_bh_lock_map) && | 351 | if (--trycount <= 0) |
177 | !lock_is_held(&rcu_lock_map) && | 352 | return false; |
178 | !lock_is_held(&rcu_sched_lock_map), | 353 | udelay(SRCU_RETRY_CHECK_DELAY); |
179 | "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); | 354 | } |
180 | 355 | } | |
181 | idx = sp->completed; | ||
182 | mutex_lock(&sp->mutex); | ||
183 | 356 | ||
184 | /* | 357 | /* |
185 | * Check to see if someone else did the work for us while we were | 358 | * Increment the ->completed counter so that future SRCU readers will |
186 | * waiting to acquire the lock. We need -two- advances of | 359 | * use the other rank of the ->c[] and ->seq[] arrays. This allows |
187 | * the counter, not just one. If there was but one, we might have | 360 | * us to wait for pre-existing readers in a starvation-free manner. |
188 | * shown up -after- our helper's first synchronize_sched(), thus | 361 | */ |
189 | * having failed to prevent CPU-reordering races with concurrent | 362 | static void srcu_flip(struct srcu_struct *sp) |
190 | * srcu_read_unlock()s on other CPUs (see comment below). So we | 363 | { |
191 | * either (1) wait for two or (2) supply the second ourselves. | 364 | sp->completed++; |
192 | */ | 365 | } |
193 | 366 | ||
194 | if ((sp->completed - idx) >= 2) { | 367 | /* |
195 | mutex_unlock(&sp->mutex); | 368 | * Enqueue an SRCU callback on the specified srcu_struct structure, |
196 | return; | 369 | * initiating grace-period processing if it is not already running. |
370 | */ | ||
371 | void call_srcu(struct srcu_struct *sp, struct rcu_head *head, | ||
372 | void (*func)(struct rcu_head *head)) | ||
373 | { | ||
374 | unsigned long flags; | ||
375 | |||
376 | head->next = NULL; | ||
377 | head->func = func; | ||
378 | spin_lock_irqsave(&sp->queue_lock, flags); | ||
379 | rcu_batch_queue(&sp->batch_queue, head); | ||
380 | if (!sp->running) { | ||
381 | sp->running = true; | ||
382 | queue_delayed_work(system_nrt_wq, &sp->work, 0); | ||
197 | } | 383 | } |
384 | spin_unlock_irqrestore(&sp->queue_lock, flags); | ||
385 | } | ||
386 | EXPORT_SYMBOL_GPL(call_srcu); | ||
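A hedged sketch of the asymmetric update side that call_srcu() enables: the updater unlinks an element and lets the callback free it once all pre-existing SRCU readers are done. struct foo, foo_del(), and the update-side locking are illustrative assumptions, not part of this patch.

#include <linux/rculist.h>
#include <linux/slab.h>
#include <linux/srcu.h>

struct foo {
	int key;
	struct list_head list;
	struct rcu_head rh;		/* queued onto sp->batch_queue by call_srcu() */
};

static void foo_reclaim(struct rcu_head *rh)
{
	kfree(container_of(rh, struct foo, rh));
}

/* Caller holds the update-side lock (assumption). */
static void foo_del(struct srcu_struct *sp, struct foo *fp)
{
	list_del_rcu(&fp->list);		/* readers may still hold fp */
	call_srcu(sp, &fp->rh, foo_reclaim);	/* freed after a grace period */
}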
198 | 387 | ||
199 | sync_func(); /* Force memory barrier on all CPUs. */ | 388 | struct rcu_synchronize { |
389 | struct rcu_head head; | ||
390 | struct completion completion; | ||
391 | }; | ||
200 | 392 | ||
201 | /* | 393 | /* |
202 | * The preceding synchronize_sched() ensures that any CPU that | 394 | * Awaken the corresponding synchronize_srcu() instance now that a |
203 | * sees the new value of sp->completed will also see any preceding | 395 | * grace period has elapsed. |
204 | * changes to data structures made by this CPU. This prevents | 396 | */ |
205 | * some other CPU from reordering the accesses in its SRCU | 397 | static void wakeme_after_rcu(struct rcu_head *head) |
206 | * read-side critical section to precede the corresponding | 398 | { |
207 | * srcu_read_lock() -- ensuring that such references will in | 399 | struct rcu_synchronize *rcu; |
208 | * fact be protected. | ||
209 | * | ||
210 | * So it is now safe to do the flip. | ||
211 | */ | ||
212 | 400 | ||
213 | idx = sp->completed & 0x1; | 401 | rcu = container_of(head, struct rcu_synchronize, head); |
214 | sp->completed++; | 402 | complete(&rcu->completion); |
403 | } | ||
215 | 404 | ||
216 | sync_func(); /* Force memory barrier on all CPUs. */ | 405 | static void srcu_advance_batches(struct srcu_struct *sp, int trycount); |
406 | static void srcu_reschedule(struct srcu_struct *sp); | ||
217 | 407 | ||
218 | /* | 408 | /* |
219 | * At this point, because of the preceding synchronize_sched(), | 409 | * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). |
220 | * all srcu_read_lock() calls using the old counters have completed. | 410 | */ |
221 | * Their corresponding critical sections might well be still | 411 | static void __synchronize_srcu(struct srcu_struct *sp, int trycount) |
222 | * executing, but the srcu_read_lock() primitives themselves | 412 | { |
223 | * will have finished executing. We initially give readers | 413 | struct rcu_synchronize rcu; |
224 | * an arbitrarily chosen 10 microseconds to get out of their | 414 | struct rcu_head *head = &rcu.head; |
225 | * SRCU read-side critical sections, then loop waiting 1/HZ | 415 | bool done = false; |
226 | * seconds per iteration. The 10-microsecond value has done | ||
227 | * very well in testing. | ||
228 | */ | ||
229 | |||
230 | if (srcu_readers_active_idx(sp, idx)) | ||
231 | udelay(SYNCHRONIZE_SRCU_READER_DELAY); | ||
232 | while (srcu_readers_active_idx(sp, idx)) | ||
233 | schedule_timeout_interruptible(1); | ||
234 | 416 | ||
235 | sync_func(); /* Force memory barrier on all CPUs. */ | 417 | rcu_lockdep_assert(!lock_is_held(&sp->dep_map) && |
418 | !lock_is_held(&rcu_bh_lock_map) && | ||
419 | !lock_is_held(&rcu_lock_map) && | ||
420 | !lock_is_held(&rcu_sched_lock_map), | ||
421 | "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); | ||
236 | 422 | ||
237 | /* | 423 | init_completion(&rcu.completion); |
238 | * The preceding synchronize_sched() forces all srcu_read_unlock() | 424 | |
239 | * primitives that were executing concurrently with the preceding | 425 | head->next = NULL; |
240 | * for_each_possible_cpu() loop to have completed by this point. | 426 | head->func = wakeme_after_rcu; |
241 | * More importantly, it also forces the corresponding SRCU read-side | 427 | spin_lock_irq(&sp->queue_lock); |
242 | * critical sections to have also completed, and the corresponding | 428 | if (!sp->running) { |
243 | * references to SRCU-protected data items to be dropped. | 429 | /* steal the processing owner */ |
244 | * | 430 | sp->running = true; |
245 | * Note: | 431 | rcu_batch_queue(&sp->batch_check0, head); |
246 | * | 432 | spin_unlock_irq(&sp->queue_lock); |
247 | * Despite what you might think at first glance, the | 433 | |
248 | * preceding synchronize_sched() -must- be within the | 434 | srcu_advance_batches(sp, trycount); |
249 | * critical section ended by the following mutex_unlock(). | 435 | if (!rcu_batch_empty(&sp->batch_done)) { |
250 | * Otherwise, a task taking the early exit can race | 436 | BUG_ON(sp->batch_done.head != head); |
251 | * with a srcu_read_unlock(), which might have executed | 437 | rcu_batch_dequeue(&sp->batch_done); |
252 | * just before the preceding srcu_readers_active() check, | 438 | done = true; |
253 | * and whose CPU might have reordered the srcu_read_unlock() | 439 | } |
254 | * with the preceding critical section. In this case, there | 440 | /* give the processing owner to work_struct */ |
255 | * is nothing preventing the synchronize_sched() task that is | 441 | srcu_reschedule(sp); |
256 | * taking the early exit from freeing a data structure that | 442 | } else { |
257 | * is still being referenced (out of order) by the task | 443 | rcu_batch_queue(&sp->batch_queue, head); |
258 | * doing the srcu_read_unlock(). | 444 | spin_unlock_irq(&sp->queue_lock); |
259 | * | 445 | } |
260 | * Alternatively, the comparison with "2" on the early exit | ||
261 | * could be changed to "3", but this increases synchronize_srcu() | ||
262 | * latency for bulk loads. So the current code is preferred. | ||
263 | */ | ||
264 | 446 | ||
265 | mutex_unlock(&sp->mutex); | 447 | if (!done) |
448 | wait_for_completion(&rcu.completion); | ||
266 | } | 449 | } |
267 | 450 | ||
268 | /** | 451 | /** |
@@ -281,7 +464,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) | |||
281 | */ | 464 | */ |
282 | void synchronize_srcu(struct srcu_struct *sp) | 465 | void synchronize_srcu(struct srcu_struct *sp) |
283 | { | 466 | { |
284 | __synchronize_srcu(sp, synchronize_sched); | 467 | __synchronize_srcu(sp, SYNCHRONIZE_SRCU_TRYCOUNT); |
285 | } | 468 | } |
286 | EXPORT_SYMBOL_GPL(synchronize_srcu); | 469 | EXPORT_SYMBOL_GPL(synchronize_srcu); |
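The blocking form follows the usual publish-then-retire pattern. A minimal sketch, assuming gp and struct my_data from the reader sketch earlier and a hypothetical my_lock serializing updaters:

#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/srcu.h>

static DEFINE_SPINLOCK(my_lock);	/* hypothetical update-side lock */

static void update_a(struct srcu_struct *sp, struct my_data *newp)
{
	struct my_data *old;

	spin_lock(&my_lock);
	old = rcu_dereference_protected(gp, lockdep_is_held(&my_lock));
	rcu_assign_pointer(gp, newp);	/* publish the new version */
	spin_unlock(&my_lock);

	synchronize_srcu(sp);		/* wait for pre-existing readers */
	kfree(old);			/* no reader can still reference old */
}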
287 | 470 | ||
@@ -289,18 +472,11 @@ EXPORT_SYMBOL_GPL(synchronize_srcu); | |||
289 | * synchronize_srcu_expedited - Brute-force SRCU grace period | 472 | * synchronize_srcu_expedited - Brute-force SRCU grace period |
290 | * @sp: srcu_struct with which to synchronize. | 473 | * @sp: srcu_struct with which to synchronize. |
291 | * | 474 | * |
292 | * Wait for an SRCU grace period to elapse, but use a "big hammer" | 475 | * Wait for an SRCU grace period to elapse, but be more aggressive about |
293 | * approach to force the grace period to end quickly. This consumes | 476 | * spinning rather than blocking when waiting. |
294 | * significant time on all CPUs and is unfriendly to real-time workloads, | ||
295 | * so is thus not recommended for any sort of common-case code. In fact, | ||
296 | * if you are using synchronize_srcu_expedited() in a loop, please | ||
297 | * restructure your code to batch your updates, and then use a single | ||
298 | * synchronize_srcu() instead. | ||
299 | * | 477 | * |
300 | * Note that it is illegal to call this function while holding any lock | 478 | * Note that it is illegal to call this function while holding any lock |
301 | * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal | 479 | * that is acquired by a CPU-hotplug notifier. It is also illegal to call |
302 | * to call this function from a CPU-hotplug notifier. Failing to observe | ||
303 | * these restriction will result in deadlock. It is also illegal to call | ||
304 | * synchronize_srcu_expedited() from the corresponding SRCU read-side | 480 | * synchronize_srcu_expedited() from the corresponding SRCU read-side |
305 | * critical section; doing so will result in deadlock. However, it is | 481 | * critical section; doing so will result in deadlock. However, it is |
306 | * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct | 482 | * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct |
@@ -309,20 +485,166 @@ EXPORT_SYMBOL_GPL(synchronize_srcu); | |||
309 | */ | 485 | */ |
310 | void synchronize_srcu_expedited(struct srcu_struct *sp) | 486 | void synchronize_srcu_expedited(struct srcu_struct *sp) |
311 | { | 487 | { |
312 | __synchronize_srcu(sp, synchronize_sched_expedited); | 488 | __synchronize_srcu(sp, SYNCHRONIZE_SRCU_EXP_TRYCOUNT); |
313 | } | 489 | } |
314 | EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); | 490 | EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); |
315 | 491 | ||
316 | /** | 492 | /** |
493 | * srcu_barrier - Wait until all in-flight call_srcu() callbacks complete. | ||
494 | */ | ||
495 | void srcu_barrier(struct srcu_struct *sp) | ||
496 | { | ||
497 | synchronize_srcu(sp); | ||
498 | } | ||
499 | EXPORT_SYMBOL_GPL(srcu_barrier); | ||
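A hedged sketch of the intended teardown ordering: the caller first stops posting new callbacks, then drains the in-flight ones with srcu_barrier(), and only then releases the domain. stop_queueing_callbacks() is a placeholder for whatever mechanism the caller uses.

extern void stop_queueing_callbacks(void);	/* hypothetical */

static void my_srcu_teardown(struct srcu_struct *sp)
{
	stop_queueing_callbacks();	/* no new call_srcu() after this */
	srcu_barrier(sp);		/* wait for already-queued callbacks to run */
	cleanup_srcu_struct(sp);	/* now safe to release the domain */
}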
500 | |||
501 | /** | ||
317 | * srcu_batches_completed - return batches completed. | 502 | * srcu_batches_completed - return batches completed. |
318 | * @sp: srcu_struct on which to report batch completion. | 503 | * @sp: srcu_struct on which to report batch completion. |
319 | * | 504 | * |
320 | * Report the number of batches, correlated with, but not necessarily | 505 | * Report the number of batches, correlated with, but not necessarily |
321 | * precisely the same as, the number of grace periods that have elapsed. | 506 | * precisely the same as, the number of grace periods that have elapsed. |
322 | */ | 507 | */ |
323 | |||
324 | long srcu_batches_completed(struct srcu_struct *sp) | 508 | long srcu_batches_completed(struct srcu_struct *sp) |
325 | { | 509 | { |
326 | return sp->completed; | 510 | return sp->completed; |
327 | } | 511 | } |
328 | EXPORT_SYMBOL_GPL(srcu_batches_completed); | 512 | EXPORT_SYMBOL_GPL(srcu_batches_completed); |
513 | |||
514 | #define SRCU_CALLBACK_BATCH 10 | ||
515 | #define SRCU_INTERVAL 1 | ||
516 | |||
517 | /* | ||
518 | * Move any new SRCU callbacks to the first stage of the SRCU grace | ||
519 | * period pipeline. | ||
520 | */ | ||
521 | static void srcu_collect_new(struct srcu_struct *sp) | ||
522 | { | ||
523 | if (!rcu_batch_empty(&sp->batch_queue)) { | ||
524 | spin_lock_irq(&sp->queue_lock); | ||
525 | rcu_batch_move(&sp->batch_check0, &sp->batch_queue); | ||
526 | spin_unlock_irq(&sp->queue_lock); | ||
527 | } | ||
528 | } | ||
529 | |||
530 | /* | ||
531 | * Core SRCU state machine. Advance callbacks from ->batch_check0 to | ||
532 | * ->batch_check1 and then to ->batch_done as readers drain. | ||
533 | */ | ||
534 | static void srcu_advance_batches(struct srcu_struct *sp, int trycount) | ||
535 | { | ||
536 | int idx = 1 ^ (sp->completed & 1); | ||
537 | |||
538 | /* | ||
539 | * Because readers might be delayed for an extended period after | ||
540 | * fetching ->completed for their index, at any point in time there | ||
541 | * might well be readers using both idx=0 and idx=1. We therefore | ||
542 | * need to wait for readers to clear from both index values before | ||
543 | * invoking a callback. | ||
544 | */ | ||
545 | |||
546 | if (rcu_batch_empty(&sp->batch_check0) && | ||
547 | rcu_batch_empty(&sp->batch_check1)) | ||
548 | return; /* no callbacks need to be advanced */ | ||
549 | |||
550 | if (!try_check_zero(sp, idx, trycount)) | ||
551 | return; /* failed to advance, will try after SRCU_INTERVAL */ | ||
552 | |||
553 | /* | ||
554 | * The callbacks in ->batch_check1 already went through their first | ||
555 | * zero check and the counter flip back when they were enqueued on | ||
556 | * ->batch_check0 in a previous invocation of srcu_advance_batches(). | ||
557 | * (Presumably try_check_zero() returned false during that | ||
558 | * invocation, leaving the callbacks stranded on ->batch_check1.) | ||
559 | * They are therefore ready to invoke, so move them to ->batch_done. | ||
560 | */ | ||
561 | rcu_batch_move(&sp->batch_done, &sp->batch_check1); | ||
562 | |||
563 | if (rcu_batch_empty(&sp->batch_check0)) | ||
564 | return; /* no callbacks need to be advanced */ | ||
565 | srcu_flip(sp); | ||
566 | |||
567 | /* | ||
568 | * The callbacks in ->batch_check0 just finished their first | ||
569 | * zero check and the flip, so move them to ->batch_check1 | ||
570 | * for a future check on the other idx. | ||
571 | */ | ||
572 | rcu_batch_move(&sp->batch_check1, &sp->batch_check0); | ||
573 | |||
574 | /* | ||
575 | * SRCU read-side critical sections are normally short, so check | ||
576 | * at least twice in quick succession after a flip. | ||
577 | */ | ||
578 | trycount = trycount < 2 ? 2 : trycount; | ||
579 | if (!try_check_zero(sp, idx^1, trycount)) | ||
580 | return; /* failed to advance, will try after SRCU_INTERVAL */ | ||
581 | |||
582 | /* | ||
583 | * The callbacks in ->batch_check1 have now waited for all | ||
584 | * pre-existing readers using both idx values. They are therefore | ||
585 | * ready to invoke, so move them to ->batch_done. | ||
586 | */ | ||
587 | rcu_batch_move(&sp->batch_done, &sp->batch_check1); | ||
588 | } | ||
589 | |||
590 | /* | ||
591 | * Invoke a limited number of SRCU callbacks that have passed through | ||
592 | * their grace period. If there are more to do, SRCU will reschedule | ||
593 | * the workqueue. | ||
594 | */ | ||
595 | static void srcu_invoke_callbacks(struct srcu_struct *sp) | ||
596 | { | ||
597 | int i; | ||
598 | struct rcu_head *head; | ||
599 | |||
600 | for (i = 0; i < SRCU_CALLBACK_BATCH; i++) { | ||
601 | head = rcu_batch_dequeue(&sp->batch_done); | ||
602 | if (!head) | ||
603 | break; | ||
604 | local_bh_disable(); | ||
605 | head->func(head); | ||
606 | local_bh_enable(); | ||
607 | } | ||
608 | } | ||
609 | |||
610 | /* | ||
611 | * Finished one round of SRCU grace period. Start another if there are | ||
612 | * more SRCU callbacks queued, otherwise put SRCU into not-running state. | ||
613 | */ | ||
614 | static void srcu_reschedule(struct srcu_struct *sp) | ||
615 | { | ||
616 | bool pending = true; | ||
617 | |||
618 | if (rcu_batch_empty(&sp->batch_done) && | ||
619 | rcu_batch_empty(&sp->batch_check1) && | ||
620 | rcu_batch_empty(&sp->batch_check0) && | ||
621 | rcu_batch_empty(&sp->batch_queue)) { | ||
622 | spin_lock_irq(&sp->queue_lock); | ||
623 | if (rcu_batch_empty(&sp->batch_done) && | ||
624 | rcu_batch_empty(&sp->batch_check1) && | ||
625 | rcu_batch_empty(&sp->batch_check0) && | ||
626 | rcu_batch_empty(&sp->batch_queue)) { | ||
627 | sp->running = false; | ||
628 | pending = false; | ||
629 | } | ||
630 | spin_unlock_irq(&sp->queue_lock); | ||
631 | } | ||
632 | |||
633 | if (pending) | ||
634 | queue_delayed_work(system_nrt_wq, &sp->work, SRCU_INTERVAL); | ||
635 | } | ||
636 | |||
637 | /* | ||
638 | * This is the work-queue function that handles SRCU grace periods. | ||
639 | */ | ||
640 | static void process_srcu(struct work_struct *work) | ||
641 | { | ||
642 | struct srcu_struct *sp; | ||
643 | |||
644 | sp = container_of(work, struct srcu_struct, work.work); | ||
645 | |||
646 | srcu_collect_new(sp); | ||
647 | srcu_advance_batches(sp, 1); | ||
648 | srcu_invoke_callbacks(sp); | ||
649 | srcu_reschedule(sp); | ||
650 | } | ||
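process_srcu() above is a standard self-rescheduling delayed-work state machine: each pass does a bounded amount of work and, via srcu_reschedule(), requeues itself only while work remains. A minimal sketch of that pattern in isolation; my_engine, the counters, and the one-jiffy interval are illustrative, and an ordinary workqueue stands in for system_nrt_wq.

#include <linux/spinlock.h>
#include <linux/workqueue.h>

struct my_engine {
	spinlock_t lock;
	bool running;			/* compare sp->running */
	int todo;			/* stand-in for the callback batches */
	struct delayed_work work;
};

static void my_engine_step(struct work_struct *work)
{
	struct my_engine *e = container_of(work, struct my_engine, work.work);
	bool pending;

	/* Do one bounded chunk of work (compare srcu_invoke_callbacks()). */
	spin_lock_irq(&e->lock);
	if (e->todo > 0)
		e->todo--;
	pending = e->todo > 0;
	if (!pending)
		e->running = false;	/* compare srcu_reschedule() */
	spin_unlock_irq(&e->lock);

	if (pending)
		schedule_delayed_work(&e->work, 1);	/* SRCU_INTERVAL analogue */
}

static void my_engine_kick(struct my_engine *e)
{
	unsigned long flags;

	spin_lock_irqsave(&e->lock, flags);
	e->todo++;
	if (!e->running) {		/* compare call_srcu() starting the machine */
		e->running = true;
		schedule_delayed_work(&e->work, 0);
	}
	spin_unlock_irqrestore(&e->lock, flags);
}

static void my_engine_init(struct my_engine *e)
{
	spin_lock_init(&e->lock);
	e->running = false;
	e->todo = 0;
	INIT_DELAYED_WORK(&e->work, my_engine_step);
}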