author		Paul E. McKenney <paul.mckenney@linaro.org>	2012-02-05 10:42:44 -0500
committer	Paul E. McKenney <paulmck@linux.vnet.ibm.com>	2012-04-30 13:48:19 -0400
commit		cef50120b61c2af4ce34bc165e19cad66296f93d
tree		963a9473155bcf6a8aa12ce92ff842242c9c3575 /kernel/srcu.c
parent		fae4b54f28f034d228fa3bfc98858c698b64e89c
rcu: Direct algorithmic SRCU implementation
The current implementation of synchronize_srcu_expedited() can cause
severe OS jitter due to its use of synchronize_sched(), which in turn
invokes try_stop_cpus(), which causes each CPU to be sent an IPI.
This can result in severe performance degradation for real-time workloads
and especially for short-iteration-length HPC workloads. Furthermore,
because only one instance of try_stop_cpus() can be making forward progress
at a given time, only one instance of synchronize_srcu_expedited() can
make forward progress at a time, even if they are all operating on
distinct srcu_struct structures.
This commit, inspired by an earlier implementation by Peter Zijlstra
(https://lkml.org/lkml/2012/1/31/211) and by further offline discussions,
takes a strictly algorithmic bits-in-memory approach. This has the
disadvantage of requiring one explicit memory-barrier instruction in
each of srcu_read_lock() and srcu_read_unlock(), but on the other hand
completely dispenses with OS jitter and furthermore allows SRCU to be
used freely by CPUs that RCU believes to be idle or offline.
The update-side implementation handles the single read-side memory
barrier by rechecking the per-CPU counters after summing them and
by running through the update-side state machine twice.
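To make the counter arithmetic concrete, here is a minimal single-threaded
user-space sketch of that sum-then-recheck idea. It is illustrative only, not
the kernel code: the names (MODEL_NR_CPUS, MODEL_USAGE_COUNT, MODEL_REF_MASK,
and the model_* functions) are invented for this sketch, the bit split is a toy
one rather than the patch's SRCU_USAGE_COUNT/SRCU_REF_MASK, and the per-CPU
accesses and memory barriers (smp_mb() A-E in the patch below) are omitted.
Each lock adds USAGE_COUNT + 1 and each unlock adds USAGE_COUNT - 1, so the low
bits track the number of active readers while the high bits change on every
operation, which is what lets the update side detect a counter that moved
between its two summing passes.

/*
 * Toy model of the SRCU counter scheme -- names and bit split are invented.
 */
#include <stdio.h>
#include <stdbool.h>

#define MODEL_NR_CPUS		4
#define MODEL_USAGE_SHIFT	(8 * sizeof(unsigned long) / 2)	/* toy split */
#define MODEL_USAGE_COUNT	(1UL << MODEL_USAGE_SHIFT)
#define MODEL_REF_MASK		(MODEL_USAGE_COUNT - 1)

static unsigned long c[2][MODEL_NR_CPUS];	/* two ranks of per-CPU counters */
static unsigned long snap[MODEL_NR_CPUS];	/* snapshot for the recheck pass */

/* Sum one rank, snapshotting each counter, and return the modulo sum. */
static unsigned long model_readers_active_idx(int idx)
{
	unsigned long sum = 0;
	int cpu;

	for (cpu = 0; cpu < MODEL_NR_CPUS; cpu++) {
		snap[cpu] = c[idx][cpu];
		sum += snap[cpu];
	}
	return sum & MODEL_REF_MASK;
}

/* True only if the sum is zero and no counter moved since the snapshot. */
static bool model_readers_active_idx_check(int idx)
{
	int cpu;

	if (model_readers_active_idx(idx) != 0)
		return false;
	for (cpu = 0; cpu < MODEL_NR_CPUS; cpu++)
		if (snap[cpu] != c[idx][cpu])
			return false;	/* false zero reading */
	return true;
}

int main(void)
{
	/* Reader enters on "CPU" 1: lock adds USAGE_COUNT + 1. */
	c[0][1] += MODEL_USAGE_COUNT + 1;
	printf("rank 0 quiescent? %d\n", model_readers_active_idx_check(0));

	/* Same reader exits on "CPU" 2: unlock adds USAGE_COUNT - 1. */
	c[0][2] += MODEL_USAGE_COUNT - 1;
	printf("rank 0 quiescent? %d\n", model_readers_active_idx_check(0));
	return 0;
}

Compiled as ordinary C, the first check reports the rank as not quiescent and
the second reports it quiescent, even though that reader's increment and
decrement landed on different model CPUs: the low bits sum to zero while the
high bits record that the counters moved.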
This implementation has passed moderate rcutorture testing on both
x86 and Power. Also updated to use this_cpu_ptr() instead of per_cpu_ptr(),
as suggested by Peter Zijlstra.
Reported-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Diffstat (limited to 'kernel/srcu.c')
-rw-r--r--	kernel/srcu.c	284
1 file changed, 189 insertions, 95 deletions
diff --git a/kernel/srcu.c b/kernel/srcu.c
index ba35f3a4a1f4..84c9b97dc3d9 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -73,19 +73,102 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
 #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
 /*
- * srcu_readers_active_idx -- returns approximate number of readers
- * active on the specified rank of per-CPU counters.
+ * Returns approximate number of readers active on the specified rank
+ * of per-CPU counters.  Also snapshots each counter's value in the
+ * corresponding element of sp->snap[] for later use validating
+ * the sum.
  */
+static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
+{
+	int cpu;
+	unsigned long sum = 0;
+	unsigned long t;
 
-static int srcu_readers_active_idx(struct srcu_struct *sp, int idx)
+	for_each_possible_cpu(cpu) {
+		t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
+		sum += t;
+		sp->snap[cpu] = t;
+	}
+	return sum & SRCU_REF_MASK;
+}
+
+/*
+ * To be called from the update side after an index flip.  Returns true
+ * if the modulo sum of the counters is stably zero, false if there is
+ * some possibility of non-zero.
+ */
+static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
 {
 	int cpu;
-	int sum;
 
-	sum = 0;
+	/*
+	 * Note that srcu_readers_active_idx() can incorrectly return
+	 * zero even though there is a pre-existing reader throughout.
+	 * To see this, suppose that task A is in a very long SRCU
+	 * read-side critical section that started on CPU 0, and that
+	 * no other reader exists, so that the modulo sum of the counters
+	 * is equal to one.  Then suppose that task B starts executing
+	 * srcu_readers_active_idx(), summing up to CPU 1, and then that
+	 * task C starts reading on CPU 0, so that its increment is not
+	 * summed, but finishes reading on CPU 2, so that its decrement
+	 * -is- summed.  Then when task B completes its sum, it will
+	 * incorrectly get zero, despite the fact that task A has been
+	 * in its SRCU read-side critical section the whole time.
+	 *
+	 * We therefore do a validation step should srcu_readers_active_idx()
+	 * return zero.
+	 */
+	if (srcu_readers_active_idx(sp, idx) != 0)
+		return false;
+
+	/*
+	 * Since the caller recently flipped ->completed, we can see at
+	 * most one increment of each CPU's counter from this point
+	 * forward.  The reason for this is that the reader CPU must have
+	 * fetched the index before srcu_readers_active_idx() checked
+	 * that CPU's counter, but not yet incremented its counter.
+	 * Its eventual counter increment will follow the read in
+	 * srcu_readers_active_idx(), and that increment is immediately
+	 * followed by smp_mb() B.  Because smp_mb() D is between
+	 * the ->completed flip and srcu_readers_active_idx()'s read,
+	 * that CPU's subsequent load of ->completed must see the new
+	 * value, and therefore increment the counter in the other rank.
+	 */
+	smp_mb(); /* A */
+
+	/*
+	 * Now, we check the ->snap array that srcu_readers_active_idx()
+	 * filled in from the per-CPU counter values.  Since both
+	 * __srcu_read_lock() and __srcu_read_unlock() increment the
+	 * upper bits of the per-CPU counter, an increment/decrement
+	 * pair will change the value of the counter.  Since there is
+	 * only one possible increment, the only way to wrap the counter
+	 * is to have a huge number of counter decrements, which requires
+	 * a huge number of tasks and huge SRCU read-side critical-section
+	 * nesting levels, even on 32-bit systems.
+	 *
+	 * All of the ways of confusing the readings require that the scan
+	 * in srcu_readers_active_idx() see the read-side task's decrement,
+	 * but not its increment.  However, between that decrement and
+	 * increment are smp_mb() B and C.  Either or both of these pair
+	 * with smp_mb() A above to ensure that the scan below will see
+	 * the read-side task's increment, thus noting a difference in
+	 * the counter values between the two passes.
+	 *
+	 * Therefore, if srcu_readers_active_idx() returned zero, and
+	 * none of the counters changed, we know that the zero was the
+	 * correct sum.
+	 *
+	 * Of course, it is possible that a task might be delayed
+	 * for a very long time in __srcu_read_lock() after fetching
+	 * the index but before incrementing its counter.  This
+	 * possibility will be dealt with in __synchronize_srcu().
+	 */
 	for_each_possible_cpu(cpu)
-		sum += per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx];
-	return sum;
+		if (sp->snap[cpu] !=
+		    ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]))
+			return false;  /* False zero reading! */
+	return true;
 }
 
 /**
@@ -131,10 +214,11 @@ int __srcu_read_lock(struct srcu_struct *sp)
 	int idx;
 
 	preempt_disable();
-	idx = sp->completed & 0x1;
-	barrier();  /* ensure compiler looks -once- at sp->completed. */
-	per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]++;
-	srcu_barrier();  /* ensure compiler won't misorder critical section. */
+	idx = rcu_dereference_index_check(sp->completed,
+					  rcu_read_lock_sched_held()) & 0x1;
+	ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) +=
+		SRCU_USAGE_COUNT + 1;
+	smp_mb(); /* B */  /* Avoid leaking the critical section. */
 	preempt_enable();
 	return idx;
 }
@@ -149,8 +233,9 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
 void __srcu_read_unlock(struct srcu_struct *sp, int idx)
 {
 	preempt_disable();
-	srcu_barrier();  /* ensure compiler won't misorder critical section. */
-	per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--;
+	smp_mb(); /* C */  /* Avoid leaking the critical section. */
+	ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) +=
+		SRCU_USAGE_COUNT - 1;
 	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(__srcu_read_unlock);
@@ -163,12 +248,65 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock);
  * we repeatedly block for 1-millisecond time periods.  This approach
  * has done well in testing, so there is no need for a config parameter.
  */
-#define SYNCHRONIZE_SRCU_READER_DELAY 10
+#define SYNCHRONIZE_SRCU_READER_DELAY 5
+
+/*
+ * Flip the readers' index by incrementing ->completed, then wait
+ * until there are no more readers using the counters referenced by
+ * the old index value.  (Recall that the index is the bottom bit
+ * of ->completed.)
+ *
+ * Of course, it is possible that a reader might be delayed for the
+ * full duration of flip_idx_and_wait() between fetching the
+ * index and incrementing its counter.  This possibility is handled
+ * by __synchronize_srcu() invoking flip_idx_and_wait() twice.
+ */
+static void flip_idx_and_wait(struct srcu_struct *sp, bool expedited)
+{
+	int idx;
+	int trycount = 0;
+
+	idx = sp->completed++ & 0x1;
+
+	/*
+	 * If a reader fetches the index before the above increment,
+	 * but increments its counter after srcu_readers_active_idx_check()
+	 * sums it, then smp_mb() D will pair with __srcu_read_lock()'s
+	 * smp_mb() B to ensure that the SRCU read-side critical section
+	 * will see any updates that the current task performed before its
+	 * call to synchronize_srcu(), or to synchronize_srcu_expedited(),
+	 * as the case may be.
+	 */
+	smp_mb(); /* D */
+
+	/*
+	 * SRCU read-side critical sections are normally short, so wait
+	 * a small amount of time before possibly blocking.
+	 */
+	if (!srcu_readers_active_idx_check(sp, idx)) {
+		udelay(SYNCHRONIZE_SRCU_READER_DELAY);
+		while (!srcu_readers_active_idx_check(sp, idx)) {
+			if (expedited && ++ trycount < 10)
+				udelay(SYNCHRONIZE_SRCU_READER_DELAY);
+			else
+				schedule_timeout_interruptible(1);
+		}
+	}
+
+	/*
+	 * The following smp_mb() E pairs with srcu_read_unlock()'s
+	 * smp_mb() C to ensure that if srcu_readers_active_idx_check()
+	 * sees srcu_read_unlock()'s counter decrement, then any
+	 * of the current task's subsequent code will happen after
+	 * that SRCU read-side critical section.
+	 */
+	smp_mb(); /* E */
+}
 
 /*
  * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
  */
-static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
+static void __synchronize_srcu(struct srcu_struct *sp, bool expedited)
 {
 	int idx;
 
@@ -178,90 +316,53 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
 			   !lock_is_held(&rcu_sched_lock_map),
 			   "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section");
 
-	idx = sp->completed;
+	smp_mb(); /* Ensure prior action happens before grace period. */
+	idx = ACCESS_ONCE(sp->completed);
+	smp_mb(); /* Access to ->completed before lock acquisition. */
 	mutex_lock(&sp->mutex);
 
 	/*
 	 * Check to see if someone else did the work for us while we were
-	 * waiting to acquire the lock.  We need -two- advances of
+	 * waiting to acquire the lock.  We need -three- advances of
 	 * the counter, not just one.  If there was but one, we might have
 	 * shown up -after- our helper's first synchronize_sched(), thus
 	 * having failed to prevent CPU-reordering races with concurrent
-	 * srcu_read_unlock()s on other CPUs (see comment below).  So we
-	 * either (1) wait for two or (2) supply the second ourselves.
+	 * srcu_read_unlock()s on other CPUs (see comment below).  If there
+	 * was only two, we are guaranteed to have waited through only one
+	 * full index-flip phase.  So we either (1) wait for three or
+	 * (2) supply the additional ones we need.
 	 */
 
-	if ((sp->completed - idx) >= 2) {
+	if (sp->completed == idx + 2)
+		idx = 1;
+	else if (sp->completed == idx + 3) {
 		mutex_unlock(&sp->mutex);
 		return;
-	}
-
-	sync_func();  /* Force memory barrier on all CPUs. */
+	} else
+		idx = 0;
 
 	/*
-	 * The preceding synchronize_sched() ensures that any CPU that
-	 * sees the new value of sp->completed will also see any preceding
-	 * changes to data structures made by this CPU.  This prevents
-	 * some other CPU from reordering the accesses in its SRCU
-	 * read-side critical section to precede the corresponding
-	 * srcu_read_lock() -- ensuring that such references will in
-	 * fact be protected.
+	 * If there were no helpers, then we need to do two flips of
+	 * the index.  The first flip is required if there are any
+	 * outstanding SRCU readers even if there are no new readers
+	 * running concurrently with the first counter flip.
 	 *
-	 * So it is now safe to do the flip.
-	 */
-
-	idx = sp->completed & 0x1;
-	sp->completed++;
-
-	sync_func();  /* Force memory barrier on all CPUs. */
-
-	/*
-	 * At this point, because of the preceding synchronize_sched(),
-	 * all srcu_read_lock() calls using the old counters have completed.
-	 * Their corresponding critical sections might well be still
-	 * executing, but the srcu_read_lock() primitives themselves
-	 * will have finished executing.  We initially give readers
-	 * an arbitrarily chosen 10 microseconds to get out of their
-	 * SRCU read-side critical sections, then loop waiting 1/HZ
-	 * seconds per iteration.  The 10-microsecond value has done
-	 * very well in testing.
-	 */
-
-	if (srcu_readers_active_idx(sp, idx))
-		udelay(SYNCHRONIZE_SRCU_READER_DELAY);
-	while (srcu_readers_active_idx(sp, idx))
-		schedule_timeout_interruptible(1);
-
-	sync_func();  /* Force memory barrier on all CPUs. */
-
-	/*
-	 * The preceding synchronize_sched() forces all srcu_read_unlock()
-	 * primitives that were executing concurrently with the preceding
-	 * for_each_possible_cpu() loop to have completed by this point.
-	 * More importantly, it also forces the corresponding SRCU read-side
-	 * critical sections to have also completed, and the corresponding
-	 * references to SRCU-protected data items to be dropped.
+	 * The second flip is required when a new reader picks up
+	 * the old value of the index, but does not increment its
+	 * counter until after its counter is summed/rechecked by
+	 * srcu_readers_active_idx_check().  In this case, the current SRCU
+	 * grace period would be OK because the SRCU read-side critical
+	 * section started after this SRCU grace period started, so the
+	 * grace period is not required to wait for the reader.
 	 *
-	 * Note:
-	 *
-	 * Despite what you might think at first glance, the
-	 * preceding synchronize_sched() -must- be within the
-	 * critical section ended by the following mutex_unlock().
-	 * Otherwise, a task taking the early exit can race
-	 * with a srcu_read_unlock(), which might have executed
-	 * just before the preceding srcu_readers_active() check,
-	 * and whose CPU might have reordered the srcu_read_unlock()
-	 * with the preceding critical section.  In this case, there
-	 * is nothing preventing the synchronize_sched() task that is
-	 * taking the early exit from freeing a data structure that
-	 * is still being referenced (out of order) by the task
-	 * doing the srcu_read_unlock().
-	 *
-	 * Alternatively, the comparison with "2" on the early exit
-	 * could be changed to "3", but this increases synchronize_srcu()
-	 * latency for bulk loads.  So the current code is preferred.
+	 * However, the next SRCU grace period would be waiting for the
+	 * other set of counters to go to zero, and therefore would not
+	 * wait for the reader, which would be very bad.  To avoid this
+	 * bad scenario, we flip and wait twice, clearing out both sets
+	 * of counters.
 	 */
-
+	for (; idx < 2; idx++)
+		flip_idx_and_wait(sp, expedited);
 	mutex_unlock(&sp->mutex);
 }
 
@@ -281,7 +382,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
  */
 void synchronize_srcu(struct srcu_struct *sp)
 {
-	__synchronize_srcu(sp, synchronize_sched);
+	__synchronize_srcu(sp, 0);
 }
 EXPORT_SYMBOL_GPL(synchronize_srcu);
 
@@ -289,18 +390,11 @@ EXPORT_SYMBOL_GPL(synchronize_srcu);
  * synchronize_srcu_expedited - Brute-force SRCU grace period
  * @sp: srcu_struct with which to synchronize.
  *
- * Wait for an SRCU grace period to elapse, but use a "big hammer"
- * approach to force the grace period to end quickly.  This consumes
- * significant time on all CPUs and is unfriendly to real-time workloads,
- * so is thus not recommended for any sort of common-case code.  In fact,
- * if you are using synchronize_srcu_expedited() in a loop, please
- * restructure your code to batch your updates, and then use a single
- * synchronize_srcu() instead.
+ * Wait for an SRCU grace period to elapse, but be more aggressive about
+ * spinning rather than blocking when waiting.
  *
  * Note that it is illegal to call this function while holding any lock
- * that is acquired by a CPU-hotplug notifier.  And yes, it is also illegal
- * to call this function from a CPU-hotplug notifier.  Failing to observe
- * these restriction will result in deadlock.  It is also illegal to call
+ * that is acquired by a CPU-hotplug notifier.  It is also illegal to call
  * synchronize_srcu_expedited() from the corresponding SRCU read-side
  * critical section; doing so will result in deadlock.  However, it is
  * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct
@@ -309,7 +403,7 @@ EXPORT_SYMBOL_GPL(synchronize_srcu);
  */
 void synchronize_srcu_expedited(struct srcu_struct *sp)
 {
-	__synchronize_srcu(sp, synchronize_sched_expedited);
+	__synchronize_srcu(sp, 1);
 }
 EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
 