aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/srcu.c
diff options
context:
space:
mode:
authorPaul E. McKenney <paul.mckenney@linaro.org>2012-02-05 10:42:44 -0500
committerPaul E. McKenney <paulmck@linux.vnet.ibm.com>2012-04-30 13:48:19 -0400
commitcef50120b61c2af4ce34bc165e19cad66296f93d (patch)
tree963a9473155bcf6a8aa12ce92ff842242c9c3575 /kernel/srcu.c
parentfae4b54f28f034d228fa3bfc98858c698b64e89c (diff)
rcu: Direct algorithmic SRCU implementation
The current implementation of synchronize_srcu_expedited() can cause severe OS jitter due to its use of synchronize_sched(), which in turn invokes try_stop_cpus(), which causes each CPU to be sent an IPI. This can result in severe performance degradation for real-time workloads and especially for short-interation-length HPC workloads. Furthermore, because only one instance of try_stop_cpus() can be making forward progress at a given time, only one instance of synchronize_srcu_expedited() can make forward progress at a time, even if they are all operating on distinct srcu_struct structures. This commit, inspired by an earlier implementation by Peter Zijlstra (https://lkml.org/lkml/2012/1/31/211) and by further offline discussions, takes a strictly algorithmic bits-in-memory approach. This has the disadvantage of requiring one explicit memory-barrier instruction in each of srcu_read_lock() and srcu_read_unlock(), but on the other hand completely dispenses with OS jitter and furthermore allows SRCU to be used freely by CPUs that RCU believes to be idle or offline. The update-side implementation handles the single read-side memory barrier by rechecking the per-CPU counters after summing them and by running through the update-side state machine twice. This implementation has passed moderate rcutorture testing on both x86 and Power. Also updated to use this_cpu_ptr() instead of per_cpu_ptr(), as suggested by Peter Zijlstra. Reported-by: Peter Zijlstra <peterz@infradead.org> Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Diffstat (limited to 'kernel/srcu.c')
-rw-r--r--kernel/srcu.c284
1 files changed, 189 insertions, 95 deletions
diff --git a/kernel/srcu.c b/kernel/srcu.c
index ba35f3a4a1f..84c9b97dc3d 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -73,19 +73,102 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
73#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ 73#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
74 74
75/* 75/*
76 * srcu_readers_active_idx -- returns approximate number of readers 76 * Returns approximate number of readers active on the specified rank
77 * active on the specified rank of per-CPU counters. 77 * of per-CPU counters. Also snapshots each counter's value in the
78 * corresponding element of sp->snap[] for later use validating
79 * the sum.
78 */ 80 */
81static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
82{
83 int cpu;
84 unsigned long sum = 0;
85 unsigned long t;
79 86
80static int srcu_readers_active_idx(struct srcu_struct *sp, int idx) 87 for_each_possible_cpu(cpu) {
88 t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
89 sum += t;
90 sp->snap[cpu] = t;
91 }
92 return sum & SRCU_REF_MASK;
93}
94
95/*
96 * To be called from the update side after an index flip. Returns true
97 * if the modulo sum of the counters is stably zero, false if there is
98 * some possibility of non-zero.
99 */
100static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
81{ 101{
82 int cpu; 102 int cpu;
83 int sum;
84 103
85 sum = 0; 104 /*
105 * Note that srcu_readers_active_idx() can incorrectly return
106 * zero even though there is a pre-existing reader throughout.
107 * To see this, suppose that task A is in a very long SRCU
108 * read-side critical section that started on CPU 0, and that
109 * no other reader exists, so that the modulo sum of the counters
110 * is equal to one. Then suppose that task B starts executing
111 * srcu_readers_active_idx(), summing up to CPU 1, and then that
112 * task C starts reading on CPU 0, so that its increment is not
113 * summed, but finishes reading on CPU 2, so that its decrement
114 * -is- summed. Then when task B completes its sum, it will
115 * incorrectly get zero, despite the fact that task A has been
116 * in its SRCU read-side critical section the whole time.
117 *
118 * We therefore do a validation step should srcu_readers_active_idx()
119 * return zero.
120 */
121 if (srcu_readers_active_idx(sp, idx) != 0)
122 return false;
123
124 /*
125 * Since the caller recently flipped ->completed, we can see at
126 * most one increment of each CPU's counter from this point
127 * forward. The reason for this is that the reader CPU must have
128 * fetched the index before srcu_readers_active_idx checked
129 * that CPU's counter, but not yet incremented its counter.
130 * Its eventual counter increment will follow the read in
131 * srcu_readers_active_idx(), and that increment is immediately
132 * followed by smp_mb() B. Because smp_mb() D is between
133 * the ->completed flip and srcu_readers_active_idx()'s read,
134 * that CPU's subsequent load of ->completed must see the new
135 * value, and therefore increment the counter in the other rank.
136 */
137 smp_mb(); /* A */
138
139 /*
140 * Now, we check the ->snap array that srcu_readers_active_idx()
141 * filled in from the per-CPU counter values. Since both
142 * __srcu_read_lock() and __srcu_read_unlock() increment the
143 * upper bits of the per-CPU counter, an increment/decrement
144 * pair will change the value of the counter. Since there is
145 * only one possible increment, the only way to wrap the counter
146 * is to have a huge number of counter decrements, which requires
147 * a huge number of tasks and huge SRCU read-side critical-section
148 * nesting levels, even on 32-bit systems.
149 *
150 * All of the ways of confusing the readings require that the scan
151 * in srcu_readers_active_idx() see the read-side task's decrement,
152 * but not its increment. However, between that decrement and
153 * increment are smb_mb() B and C. Either or both of these pair
154 * with smp_mb() A above to ensure that the scan below will see
155 * the read-side tasks's increment, thus noting a difference in
156 * the counter values between the two passes.
157 *
158 * Therefore, if srcu_readers_active_idx() returned zero, and
159 * none of the counters changed, we know that the zero was the
160 * correct sum.
161 *
162 * Of course, it is possible that a task might be delayed
163 * for a very long time in __srcu_read_lock() after fetching
164 * the index but before incrementing its counter. This
165 * possibility will be dealt with in __synchronize_srcu().
166 */
86 for_each_possible_cpu(cpu) 167 for_each_possible_cpu(cpu)
87 sum += per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]; 168 if (sp->snap[cpu] !=
88 return sum; 169 ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]))
170 return false; /* False zero reading! */
171 return true;
89} 172}
90 173
91/** 174/**
@@ -131,10 +214,11 @@ int __srcu_read_lock(struct srcu_struct *sp)
131 int idx; 214 int idx;
132 215
133 preempt_disable(); 216 preempt_disable();
134 idx = sp->completed & 0x1; 217 idx = rcu_dereference_index_check(sp->completed,
135 barrier(); /* ensure compiler looks -once- at sp->completed. */ 218 rcu_read_lock_sched_held()) & 0x1;
136 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]++; 219 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) +=
137 srcu_barrier(); /* ensure compiler won't misorder critical section. */ 220 SRCU_USAGE_COUNT + 1;
221 smp_mb(); /* B */ /* Avoid leaking the critical section. */
138 preempt_enable(); 222 preempt_enable();
139 return idx; 223 return idx;
140} 224}
@@ -149,8 +233,9 @@ EXPORT_SYMBOL_GPL(__srcu_read_lock);
149void __srcu_read_unlock(struct srcu_struct *sp, int idx) 233void __srcu_read_unlock(struct srcu_struct *sp, int idx)
150{ 234{
151 preempt_disable(); 235 preempt_disable();
152 srcu_barrier(); /* ensure compiler won't misorder critical section. */ 236 smp_mb(); /* C */ /* Avoid leaking the critical section. */
153 per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; 237 ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) +=
238 SRCU_USAGE_COUNT - 1;
154 preempt_enable(); 239 preempt_enable();
155} 240}
156EXPORT_SYMBOL_GPL(__srcu_read_unlock); 241EXPORT_SYMBOL_GPL(__srcu_read_unlock);
@@ -163,12 +248,65 @@ EXPORT_SYMBOL_GPL(__srcu_read_unlock);
163 * we repeatedly block for 1-millisecond time periods. This approach 248 * we repeatedly block for 1-millisecond time periods. This approach
164 * has done well in testing, so there is no need for a config parameter. 249 * has done well in testing, so there is no need for a config parameter.
165 */ 250 */
166#define SYNCHRONIZE_SRCU_READER_DELAY 10 251#define SYNCHRONIZE_SRCU_READER_DELAY 5
252
253/*
254 * Flip the readers' index by incrementing ->completed, then wait
255 * until there are no more readers using the counters referenced by
256 * the old index value. (Recall that the index is the bottom bit
257 * of ->completed.)
258 *
259 * Of course, it is possible that a reader might be delayed for the
260 * full duration of flip_idx_and_wait() between fetching the
261 * index and incrementing its counter. This possibility is handled
262 * by __synchronize_srcu() invoking flip_idx_and_wait() twice.
263 */
264static void flip_idx_and_wait(struct srcu_struct *sp, bool expedited)
265{
266 int idx;
267 int trycount = 0;
268
269 idx = sp->completed++ & 0x1;
270
271 /*
272 * If a reader fetches the index before the above increment,
273 * but increments its counter after srcu_readers_active_idx_check()
274 * sums it, then smp_mb() D will pair with __srcu_read_lock()'s
275 * smp_mb() B to ensure that the SRCU read-side critical section
276 * will see any updates that the current task performed before its
277 * call to synchronize_srcu(), or to synchronize_srcu_expedited(),
278 * as the case may be.
279 */
280 smp_mb(); /* D */
281
282 /*
283 * SRCU read-side critical sections are normally short, so wait
284 * a small amount of time before possibly blocking.
285 */
286 if (!srcu_readers_active_idx_check(sp, idx)) {
287 udelay(SYNCHRONIZE_SRCU_READER_DELAY);
288 while (!srcu_readers_active_idx_check(sp, idx)) {
289 if (expedited && ++ trycount < 10)
290 udelay(SYNCHRONIZE_SRCU_READER_DELAY);
291 else
292 schedule_timeout_interruptible(1);
293 }
294 }
295
296 /*
297 * The following smp_mb() E pairs with srcu_read_unlock()'s
298 * smp_mb C to ensure that if srcu_readers_active_idx_check()
299 * sees srcu_read_unlock()'s counter decrement, then any
300 * of the current task's subsequent code will happen after
301 * that SRCU read-side critical section.
302 */
303 smp_mb(); /* E */
304}
167 305
168/* 306/*
169 * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). 307 * Helper function for synchronize_srcu() and synchronize_srcu_expedited().
170 */ 308 */
171static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) 309static void __synchronize_srcu(struct srcu_struct *sp, bool expedited)
172{ 310{
173 int idx; 311 int idx;
174 312
@@ -178,90 +316,53 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
178 !lock_is_held(&rcu_sched_lock_map), 316 !lock_is_held(&rcu_sched_lock_map),
179 "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section"); 317 "Illegal synchronize_srcu() in same-type SRCU (or RCU) read-side critical section");
180 318
181 idx = sp->completed; 319 smp_mb(); /* Ensure prior action happens before grace period. */
320 idx = ACCESS_ONCE(sp->completed);
321 smp_mb(); /* Access to ->completed before lock acquisition. */
182 mutex_lock(&sp->mutex); 322 mutex_lock(&sp->mutex);
183 323
184 /* 324 /*
185 * Check to see if someone else did the work for us while we were 325 * Check to see if someone else did the work for us while we were
186 * waiting to acquire the lock. We need -two- advances of 326 * waiting to acquire the lock. We need -three- advances of
187 * the counter, not just one. If there was but one, we might have 327 * the counter, not just one. If there was but one, we might have
188 * shown up -after- our helper's first synchronize_sched(), thus 328 * shown up -after- our helper's first synchronize_sched(), thus
189 * having failed to prevent CPU-reordering races with concurrent 329 * having failed to prevent CPU-reordering races with concurrent
190 * srcu_read_unlock()s on other CPUs (see comment below). So we 330 * srcu_read_unlock()s on other CPUs (see comment below). If there
191 * either (1) wait for two or (2) supply the second ourselves. 331 * was only two, we are guaranteed to have waited through only one
332 * full index-flip phase. So we either (1) wait for three or
333 * (2) supply the additional ones we need.
192 */ 334 */
193 335
194 if ((sp->completed - idx) >= 2) { 336 if (sp->completed == idx + 2)
337 idx = 1;
338 else if (sp->completed == idx + 3) {
195 mutex_unlock(&sp->mutex); 339 mutex_unlock(&sp->mutex);
196 return; 340 return;
197 } 341 } else
198 342 idx = 0;
199 sync_func(); /* Force memory barrier on all CPUs. */
200 343
201 /* 344 /*
202 * The preceding synchronize_sched() ensures that any CPU that 345 * If there were no helpers, then we need to do two flips of
203 * sees the new value of sp->completed will also see any preceding 346 * the index. The first flip is required if there are any
204 * changes to data structures made by this CPU. This prevents 347 * outstanding SRCU readers even if there are no new readers
205 * some other CPU from reordering the accesses in its SRCU 348 * running concurrently with the first counter flip.
206 * read-side critical section to precede the corresponding
207 * srcu_read_lock() -- ensuring that such references will in
208 * fact be protected.
209 * 349 *
210 * So it is now safe to do the flip. 350 * The second flip is required when a new reader picks up
211 */ 351 * the old value of the index, but does not increment its
212 352 * counter until after its counters is summed/rechecked by
213 idx = sp->completed & 0x1; 353 * srcu_readers_active_idx_check(). In this case, the current SRCU
214 sp->completed++; 354 * grace period would be OK because the SRCU read-side critical
215 355 * section started after this SRCU grace period started, so the
216 sync_func(); /* Force memory barrier on all CPUs. */ 356 * grace period is not required to wait for the reader.
217
218 /*
219 * At this point, because of the preceding synchronize_sched(),
220 * all srcu_read_lock() calls using the old counters have completed.
221 * Their corresponding critical sections might well be still
222 * executing, but the srcu_read_lock() primitives themselves
223 * will have finished executing. We initially give readers
224 * an arbitrarily chosen 10 microseconds to get out of their
225 * SRCU read-side critical sections, then loop waiting 1/HZ
226 * seconds per iteration. The 10-microsecond value has done
227 * very well in testing.
228 */
229
230 if (srcu_readers_active_idx(sp, idx))
231 udelay(SYNCHRONIZE_SRCU_READER_DELAY);
232 while (srcu_readers_active_idx(sp, idx))
233 schedule_timeout_interruptible(1);
234
235 sync_func(); /* Force memory barrier on all CPUs. */
236
237 /*
238 * The preceding synchronize_sched() forces all srcu_read_unlock()
239 * primitives that were executing concurrently with the preceding
240 * for_each_possible_cpu() loop to have completed by this point.
241 * More importantly, it also forces the corresponding SRCU read-side
242 * critical sections to have also completed, and the corresponding
243 * references to SRCU-protected data items to be dropped.
244 * 357 *
245 * Note: 358 * However, the next SRCU grace period would be waiting for the
246 * 359 * other set of counters to go to zero, and therefore would not
247 * Despite what you might think at first glance, the 360 * wait for the reader, which would be very bad. To avoid this
248 * preceding synchronize_sched() -must- be within the 361 * bad scenario, we flip and wait twice, clearing out both sets
249 * critical section ended by the following mutex_unlock(). 362 * of counters.
250 * Otherwise, a task taking the early exit can race
251 * with a srcu_read_unlock(), which might have executed
252 * just before the preceding srcu_readers_active() check,
253 * and whose CPU might have reordered the srcu_read_unlock()
254 * with the preceding critical section. In this case, there
255 * is nothing preventing the synchronize_sched() task that is
256 * taking the early exit from freeing a data structure that
257 * is still being referenced (out of order) by the task
258 * doing the srcu_read_unlock().
259 *
260 * Alternatively, the comparison with "2" on the early exit
261 * could be changed to "3", but this increases synchronize_srcu()
262 * latency for bulk loads. So the current code is preferred.
263 */ 363 */
264 364 for (; idx < 2; idx++)
365 flip_idx_and_wait(sp, expedited);
265 mutex_unlock(&sp->mutex); 366 mutex_unlock(&sp->mutex);
266} 367}
267 368
@@ -281,7 +382,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void))
281 */ 382 */
282void synchronize_srcu(struct srcu_struct *sp) 383void synchronize_srcu(struct srcu_struct *sp)
283{ 384{
284 __synchronize_srcu(sp, synchronize_sched); 385 __synchronize_srcu(sp, 0);
285} 386}
286EXPORT_SYMBOL_GPL(synchronize_srcu); 387EXPORT_SYMBOL_GPL(synchronize_srcu);
287 388
@@ -289,18 +390,11 @@ EXPORT_SYMBOL_GPL(synchronize_srcu);
289 * synchronize_srcu_expedited - Brute-force SRCU grace period 390 * synchronize_srcu_expedited - Brute-force SRCU grace period
290 * @sp: srcu_struct with which to synchronize. 391 * @sp: srcu_struct with which to synchronize.
291 * 392 *
292 * Wait for an SRCU grace period to elapse, but use a "big hammer" 393 * Wait for an SRCU grace period to elapse, but be more aggressive about
293 * approach to force the grace period to end quickly. This consumes 394 * spinning rather than blocking when waiting.
294 * significant time on all CPUs and is unfriendly to real-time workloads,
295 * so is thus not recommended for any sort of common-case code. In fact,
296 * if you are using synchronize_srcu_expedited() in a loop, please
297 * restructure your code to batch your updates, and then use a single
298 * synchronize_srcu() instead.
299 * 395 *
300 * Note that it is illegal to call this function while holding any lock 396 * Note that it is illegal to call this function while holding any lock
301 * that is acquired by a CPU-hotplug notifier. And yes, it is also illegal 397 * that is acquired by a CPU-hotplug notifier. It is also illegal to call
302 * to call this function from a CPU-hotplug notifier. Failing to observe
303 * these restriction will result in deadlock. It is also illegal to call
304 * synchronize_srcu_expedited() from the corresponding SRCU read-side 398 * synchronize_srcu_expedited() from the corresponding SRCU read-side
305 * critical section; doing so will result in deadlock. However, it is 399 * critical section; doing so will result in deadlock. However, it is
306 * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct 400 * perfectly legal to call synchronize_srcu_expedited() on one srcu_struct
@@ -309,7 +403,7 @@ EXPORT_SYMBOL_GPL(synchronize_srcu);
309 */ 403 */
310void synchronize_srcu_expedited(struct srcu_struct *sp) 404void synchronize_srcu_expedited(struct srcu_struct *sp)
311{ 405{
312 __synchronize_srcu(sp, synchronize_sched_expedited); 406 __synchronize_srcu(sp, 1);
313} 407}
314EXPORT_SYMBOL_GPL(synchronize_srcu_expedited); 408EXPORT_SYMBOL_GPL(synchronize_srcu_expedited);
315 409