author		Lai Jiangshan <laijs@cn.fujitsu.com>	2012-02-27 12:29:09 -0500
committer	Paul E. McKenney <paulmck@linux.vnet.ibm.com>	2012-04-30 13:48:22 -0400
commit		b52ce066c55a6a53cf1f8d71308d74f908e31b99
tree		e814e4e175f2bd8e1c0795247f413d711c7350df /kernel/srcu.c
parent		18108ebfebe9e871d0a9af830baf8f5df69eb5fc
rcu: Implement a variant of Peter's SRCU algorithm
This commit implements a variant of Peter's algorithm, which may be found
at https://lkml.org/lkml/2012/2/1/119.

o	Make the checking lock-free to enable parallel checking.
	Parallel checking is required when (1) the original checking
	task is preempted for a long time, (2) synchronize_srcu_expedited()
	starts during an ongoing SRCU grace period, or (3) we wish to
	avoid acquiring a lock.

o	Since the checking is lock-free, we avoid a mutex in the state
	machine for call_srcu().

o	Remove the SRCU_REF_MASK and remove the coupling with the flipping.
	This might allow us to remove the preempt_disable() in future
	versions, though such removal will need great care because it
	rescinds the one-old-reader-per-CPU guarantee.

o	Remove a smp_mb(), simplify the comments and make the smp_mb() pairs
	more intuitive.

Inspired-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
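As an illustration of the counting scheme the patch below implements (this sketch is not part of the patch, and every name in it is hypothetical), the following standalone C11 program models the two per-CPU counter arrays: the read lock bumps ->c[] and then ->seq[], and the lock-free check samples ->seq[], scans ->c[], and re-samples ->seq[] to decide whether a zero sum is stable. atomic_thread_fence() stands in for the kernel's smp_mb(), and the unlock-side ordering is inferred from the barrier-pairing comments in the patch.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define TOY_NR_CPUS 4

struct toy_srcu {
	atomic_ulong c[TOY_NR_CPUS][2];		/* models the per-CPU ->c[] counters */
	atomic_ulong seq[TOY_NR_CPUS][2];	/* models the per-CPU ->seq[] counters */
	atomic_int completed;
};

/* Reader side: increment ->c[], barrier B, then increment ->seq[]. */
static int toy_read_lock(struct toy_srcu *sp, int cpu)
{
	int idx = atomic_load(&sp->completed) & 0x1;

	atomic_fetch_add(&sp->c[cpu][idx], 1);
	atomic_thread_fence(memory_order_seq_cst);	/* B */
	atomic_fetch_add(&sp->seq[cpu][idx], 1);
	return idx;
}

/* Reader side: barrier C, then decrement ->c[] on whatever CPU we run on now. */
static void toy_read_unlock(struct toy_srcu *sp, int cpu, int idx)
{
	atomic_thread_fence(memory_order_seq_cst);	/* C */
	atomic_fetch_sub(&sp->c[cpu][idx], 1);
}

static unsigned long toy_sum(atomic_ulong (*counts)[2], int idx)
{
	unsigned long sum = 0;

	for (int cpu = 0; cpu < TOY_NR_CPUS; cpu++)
		sum += atomic_load(&counts[cpu][idx]);
	return sum;
}

/* Update side: lock-free "stable zero" check, in the spirit of srcu_readers_active_idx_check(). */
static bool toy_readers_gone(struct toy_srcu *sp, int idx)
{
	unsigned long seq = toy_sum(sp->seq, idx);

	atomic_thread_fence(memory_order_seq_cst);	/* A */
	if (toy_sum(sp->c, idx) != 0)
		return false;				/* readers still present */
	atomic_thread_fence(memory_order_seq_cst);	/* D */
	return toy_sum(sp->seq, idx) == seq;		/* unchanged ->seq[]: the zero is stable */
}

int main(void)
{
	static struct toy_srcu sp;
	int idx = toy_read_lock(&sp, 0);

	printf("readers gone? %d\n", toy_readers_gone(&sp, idx));	/* 0: reader active */
	toy_read_unlock(&sp, 0, idx);
	printf("readers gone? %d\n", toy_readers_gone(&sp, idx));	/* 1: stable zero */
	return 0;
}

Because a reader can lock on one CPU and unlock on another, an individual per-CPU count may wrap below zero; only the sum across all CPUs is meaningful, which is why both the kernel code and this sketch always sum over every possible CPU.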
Diffstat (limited to 'kernel/srcu.c')
-rw-r--r--	kernel/srcu.c	149
1 file changed, 69 insertions(+), 80 deletions(-)
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 1fecb4d858ed..e0139a274856 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -73,10 +73,25 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
 #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
 /*
+ * Returns approximate total of the readers' ->seq[] values for the
+ * rank of per-CPU counters specified by idx.
+ */
+static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx)
+{
+	int cpu;
+	unsigned long sum = 0;
+	unsigned long t;
+
+	for_each_possible_cpu(cpu) {
+		t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
+		sum += t;
+	}
+	return sum;
+}
+
+/*
  * Returns approximate number of readers active on the specified rank
- * of per-CPU counters.  Also snapshots each counter's value in the
- * corresponding element of sp->snap[] for later use validating
- * the sum.
+ * of the per-CPU ->c[] counters.
  */
 static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
 {
@@ -87,26 +102,45 @@ static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
 	for_each_possible_cpu(cpu) {
 		t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
 		sum += t;
-		sp->snap[cpu] = t;
 	}
-	return sum & SRCU_REF_MASK;
+	return sum;
 }
 
 /*
- * To be called from the update side after an index flip.  Returns true
- * if the modulo sum of the counters is stably zero, false if there is
- * some possibility of non-zero.
+ * Return true if the number of pre-existing readers is determined to
+ * be stably zero.  An example unstable zero can occur if the call
+ * to srcu_readers_active_idx() misses an __srcu_read_lock() increment,
+ * but due to task migration, sees the corresponding __srcu_read_unlock()
+ * decrement.  This can happen because srcu_readers_active_idx() takes
+ * time to sum the array, and might in fact be interrupted or preempted
+ * partway through the summation.
  */
 static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
 {
-	int cpu;
+	unsigned long seq;
+
+	seq = srcu_readers_seq_idx(sp, idx);
+
+	/*
+	 * The following smp_mb() A pairs with the smp_mb() B located in
+	 * __srcu_read_lock().  This pairing ensures that if an
+	 * __srcu_read_lock() increments its counter after the summation
+	 * in srcu_readers_active_idx(), then the corresponding SRCU read-side
+	 * critical section will see any changes made prior to the start
+	 * of the current SRCU grace period.
+	 *
+	 * Also, if the above call to srcu_readers_seq_idx() saw the
+	 * increment of ->seq[], then the call to srcu_readers_active_idx()
+	 * must see the increment of ->c[].
+	 */
+	smp_mb(); /* A */
 
 	/*
 	 * Note that srcu_readers_active_idx() can incorrectly return
 	 * zero even though there is a pre-existing reader throughout.
 	 * To see this, suppose that task A is in a very long SRCU
 	 * read-side critical section that started on CPU 0, and that
-	 * no other reader exists, so that the modulo sum of the counters
+	 * no other reader exists, so that the sum of the counters
 	 * is equal to one.  Then suppose that task B starts executing
 	 * srcu_readers_active_idx(), summing up to CPU 1, and then that
 	 * task C starts reading on CPU 0, so that its increment is not
@@ -122,53 +156,31 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
 		return false;
 
 	/*
-	 * Since the caller recently flipped ->completed, we can see at
-	 * most one increment of each CPU's counter from this point
-	 * forward.  The reason for this is that the reader CPU must have
-	 * fetched the index before srcu_readers_active_idx checked
-	 * that CPU's counter, but not yet incremented its counter.
-	 * Its eventual counter increment will follow the read in
-	 * srcu_readers_active_idx(), and that increment is immediately
-	 * followed by smp_mb() B.  Because smp_mb() D is between
-	 * the ->completed flip and srcu_readers_active_idx()'s read,
-	 * that CPU's subsequent load of ->completed must see the new
-	 * value, and therefore increment the counter in the other rank.
-	 */
-	smp_mb(); /* A */
-
-	/*
-	 * Now, we check the ->snap array that srcu_readers_active_idx()
-	 * filled in from the per-CPU counter values.  Since
-	 * __srcu_read_lock() increments the upper bits of the per-CPU
-	 * counter, an increment/decrement pair will change the value
-	 * of the counter.  Since there is only one possible increment,
-	 * the only way to wrap the counter is to have a huge number of
-	 * counter decrements, which requires a huge number of tasks and
-	 * huge SRCU read-side critical-section nesting levels, even on
-	 * 32-bit systems.
+	 * The remainder of this function is the validation step.
+	 * The following smp_mb() D pairs with the smp_mb() C in
+	 * __srcu_read_unlock().  If the __srcu_read_unlock() was seen
+	 * by srcu_readers_active_idx() above, then any destructive
+	 * operation performed after the grace period will happen after
+	 * the corresponding SRCU read-side critical section.
 	 *
-	 * All of the ways of confusing the readings require that the scan
-	 * in srcu_readers_active_idx() see the read-side task's decrement,
-	 * but not its increment.  However, between that decrement and
-	 * increment are smb_mb() B and C.  Either or both of these pair
-	 * with smp_mb() A above to ensure that the scan below will see
-	 * the read-side tasks's increment, thus noting a difference in
-	 * the counter values between the two passes.
-	 *
-	 * Therefore, if srcu_readers_active_idx() returned zero, and
-	 * none of the counters changed, we know that the zero was the
-	 * correct sum.
-	 *
-	 * Of course, it is possible that a task might be delayed
-	 * for a very long time in __srcu_read_lock() after fetching
-	 * the index but before incrementing its counter.  This
-	 * possibility will be dealt with in __synchronize_srcu().
+	 * Note that there can be at most NR_CPUS worth of readers using
+	 * the old index, which is not enough to overflow even a 32-bit
+	 * integer.  (Yes, this does mean that systems having more than
+	 * a billion or so CPUs need to be 64-bit systems.)  Therefore,
+	 * the sum of the ->seq[] counters cannot possibly overflow.
+	 * Therefore, the only way that the return values of the two
+	 * calls to srcu_readers_seq_idx() can be equal is if there were
+	 * no increments of the corresponding rank of ->seq[] counts
+	 * in the interim.  But the missed-increment scenario laid out
+	 * above includes an increment of the ->seq[] counter by
+	 * the corresponding __srcu_read_lock().  Therefore, if this
+	 * scenario occurs, the return values from the two calls to
+	 * srcu_readers_seq_idx() will differ, and thus the validation
+	 * step below suffices.
 	 */
-	for_each_possible_cpu(cpu)
-		if (sp->snap[cpu] !=
-		    ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]))
-			return false; /* False zero reading! */
-	return true;
+	smp_mb(); /* D */
+
+	return srcu_readers_seq_idx(sp, idx) == seq;
 }
 
 /**
@@ -216,9 +228,9 @@ int __srcu_read_lock(struct srcu_struct *sp)
 	preempt_disable();
 	idx = rcu_dereference_index_check(sp->completed,
 					  rcu_read_lock_sched_held()) & 0x1;
-	ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) +=
-		SRCU_USAGE_COUNT + 1;
+	ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1;
 	smp_mb(); /* B */  /* Avoid leaking the critical section. */
+	ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1;
 	preempt_enable();
 	return idx;
 }
@@ -258,17 +270,6 @@ static void wait_idx(struct srcu_struct *sp, int idx, bool expedited)
 	int trycount = 0;
 
 	/*
-	 * If a reader fetches the index before the ->completed increment,
-	 * but increments its counter after srcu_readers_active_idx_check()
-	 * sums it, then smp_mb() D will pair with __srcu_read_lock()'s
-	 * smp_mb() B to ensure that the SRCU read-side critical section
-	 * will see any updates that the current task performed before its
-	 * call to synchronize_srcu(), or to synchronize_srcu_expedited(),
-	 * as the case may be.
-	 */
-	smp_mb(); /* D */
-
-	/*
 	 * SRCU read-side critical sections are normally short, so wait
 	 * a small amount of time before possibly blocking.
 	 */
@@ -281,18 +282,6 @@ static void wait_idx(struct srcu_struct *sp, int idx, bool expedited)
 			schedule_timeout_interruptible(1);
 		}
 	}
-
-	/*
-	 * The following smp_mb() E pairs with srcu_read_unlock()'s
-	 * smp_mb C to ensure that if srcu_readers_active_idx_check()
-	 * sees srcu_read_unlock()'s counter decrement, then any
-	 * of the current task's subsequent code will happen after
-	 * that SRCU read-side critical section.
-	 *
-	 * It also ensures the order between the above waiting and
-	 * the next flipping.
-	 */
-	smp_mb(); /* E */
 }
 
 static void srcu_flip(struct srcu_struct *sp)
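To see why the ->seq[] re-check suffices, the short program below replays one possible interleaving of the unstable-zero scenario described in the comments above (the details beyond the quoted comments are assumed for illustration; none of this code is part of the patch): reader A holds the lock on CPU 0 throughout, reader C's lock increment on CPU 0 happens after that CPU has been scanned, and C's unlock decrement lands on CPU 2 before that CPU is scanned, so the unsigned ->c[] sum wraps to exactly zero. The two ->seq[] passes differ, so the validation step rejects the false zero.

#include <stdio.h>

#define NCPU 3

int main(void)
{
	/* One rank (idx) of the per-CPU counters; reader A already holds the lock on CPU 0. */
	unsigned long c[NCPU]   = { 1, 0, 0 };
	unsigned long seq[NCPU] = { 1, 0, 0 };
	unsigned long seq1 = 0, seq2 = 0, sum = 0;
	int cpu;

	for (cpu = 0; cpu < NCPU; cpu++)	/* update side: first ->seq[] pass */
		seq1 += seq[cpu];

	sum += c[0] + c[1];			/* update side: ->c[] scan reaches CPU 1 */

	c[0] += 1;				/* reader C: __srcu_read_lock() on CPU 0, missed by the scan */
	seq[0] += 1;

	c[2] -= 1;				/* reader C migrates, __srcu_read_unlock() on CPU 2 */

	sum += c[2];				/* update side: ->c[] scan finishes; sum wraps to zero */

	for (cpu = 0; cpu < NCPU; cpu++)	/* update side: second ->seq[] pass (validation) */
		seq2 += seq[cpu];

	printf("c[] sum = %lu (false zero: reader A is still active)\n", sum);
	printf("seq[] passes: %lu then %lu -> %s\n", seq1, seq2,
	       seq1 == seq2 ? "stable zero" : "unstable, must retry");
	return 0;
}

Compiled with any C compiler, this prints a ->c[] sum of 0 but unequal ->seq[] sums (1 and 2), so the grace period correctly keeps waiting.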