author     Lai Jiangshan <laijs@cn.fujitsu.com>            2012-02-27 12:29:09 -0500
committer  Paul E. McKenney <paulmck@linux.vnet.ibm.com>   2012-04-30 13:48:22 -0400
commit     b52ce066c55a6a53cf1f8d71308d74f908e31b99
tree       e814e4e175f2bd8e1c0795247f413d711c7350df /kernel/srcu.c
parent     18108ebfebe9e871d0a9af830baf8f5df69eb5fc
rcu: Implement a variant of Peter's SRCU algorithm
This commit implements a variant of Peter's algorithm, which may be found
at https://lkml.org/lkml/2012/2/1/119.
o Make the checking lock-free to enable parallel checking.
Parallel checking is required when (1) the original checking
task is preempted for a long time, (2) synchronize_srcu_expedited()
starts during an ongoing SRCU grace period, or (3) we wish to
avoid acquiring a lock.
o Since the checking is lock-free, we avoid the need for a mutex in the
state machine for call_srcu().
o Remove the SRCU_REF_MASK and remove the coupling with the flipping.
This might allow us to remove the preempt_disable() in future
versions, though such removal will need great care because it
rescinds the one-old-reader-per-CPU guarantee.
o Remove an smp_mb(), simplify the comments, and make the smp_mb() pairs
more intuitive.
Inspired-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Diffstat (limited to 'kernel/srcu.c')
-rw-r--r--  kernel/srcu.c  149
1 file changed, 69 insertions, 80 deletions
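For orientation before the diff: the new scheme sums a per-CPU ->seq[] rank, issues a full barrier, checks that the sum of the corresponding ->c[] rank is zero, issues another barrier, and then re-sums ->seq[] to validate that no reader slipped past the scan. A rough user-space model of that check is sketched below; NCPUS, the flat c[]/seq[] arrays, main(), the readers_*() names, and the C11 fences standing in for smp_mb() are illustrative stand-ins, not the kernel implementation.

/*
 * Illustrative user-space model of the check added by this patch; the
 * per-CPU machinery is replaced by flat arrays indexed by a fake CPU id.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define NCPUS 4

atomic_ulong c[NCPUS][2];	/* models ->c[]:  bumped by lock, dropped by unlock */
atomic_ulong seq[NCPUS][2];	/* models ->seq[]: bumped by lock only */

/* Models srcu_readers_seq_idx(): approximate sum of one ->seq[] rank. */
static unsigned long readers_seq_idx(int idx)
{
	unsigned long sum = 0;

	for (int cpu = 0; cpu < NCPUS; cpu++)
		sum += atomic_load(&seq[cpu][idx]);
	return sum;
}

/* Models srcu_readers_active_idx(): approximate sum of one ->c[] rank. */
static unsigned long readers_active_idx(int idx)
{
	unsigned long sum = 0;

	for (int cpu = 0; cpu < NCPUS; cpu++)
		sum += atomic_load(&c[cpu][idx]);
	return sum;
}

/*
 * Models srcu_readers_active_idx_check(): a zero sum of ->c[] is trusted
 * only if the ->seq[] sum is unchanged across the check, which rules out
 * the scenario where the ->c[] scan missed a lock but saw its unlock.
 */
static bool readers_active_idx_check(int idx)
{
	unsigned long s = readers_seq_idx(idx);

	atomic_thread_fence(memory_order_seq_cst);	/* plays the role of smp_mb() A */
	if (readers_active_idx(idx) != 0)
		return false;
	atomic_thread_fence(memory_order_seq_cst);	/* plays the role of smp_mb() D */
	return readers_seq_idx(idx) == s;
}

int main(void)
{
	/* With no readers at all, both ranks report a stable zero. */
	printf("idx 0 stable zero: %d\n", readers_active_idx_check(0));
	printf("idx 1 stable zero: %d\n", readers_active_idx_check(1));
	return 0;
}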
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 1fecb4d858ed..e0139a274856 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -73,10 +73,25 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
 #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
 /*
+ * Returns approximate total of the readers' ->seq[] values for the
+ * rank of per-CPU counters specified by idx.
+ */
+static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx)
+{
+	int cpu;
+	unsigned long sum = 0;
+	unsigned long t;
+
+	for_each_possible_cpu(cpu) {
+		t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
+		sum += t;
+	}
+	return sum;
+}
+
+/*
  * Returns approximate number of readers active on the specified rank
- * of per-CPU counters. Also snapshots each counter's value in the
- * corresponding element of sp->snap[] for later use validating
- * the sum.
+ * of the per-CPU ->c[] counters.
  */
 static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
 {
@@ -87,26 +102,45 @@ static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
 	for_each_possible_cpu(cpu) {
 		t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
 		sum += t;
-		sp->snap[cpu] = t;
 	}
-	return sum & SRCU_REF_MASK;
+	return sum;
 }
 
 /*
- * To be called from the update side after an index flip. Returns true
- * if the modulo sum of the counters is stably zero, false if there is
- * some possibility of non-zero.
+ * Return true if the number of pre-existing readers is determined to
+ * be stably zero. An example unstable zero can occur if the call
+ * to srcu_readers_active_idx() misses an __srcu_read_lock() increment,
+ * but due to task migration, sees the corresponding __srcu_read_unlock()
+ * decrement. This can happen because srcu_readers_active_idx() takes
+ * time to sum the array, and might in fact be interrupted or preempted
+ * partway through the summation.
  */
 static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
 {
-	int cpu;
+	unsigned long seq;
+
+	seq = srcu_readers_seq_idx(sp, idx);
+
+	/*
+	 * The following smp_mb() A pairs with the smp_mb() B located in
+	 * __srcu_read_lock(). This pairing ensures that if an
+	 * __srcu_read_lock() increments its counter after the summation
+	 * in srcu_readers_active_idx(), then the corresponding SRCU read-side
+	 * critical section will see any changes made prior to the start
+	 * of the current SRCU grace period.
+	 *
+	 * Also, if the above call to srcu_readers_seq_idx() saw the
+	 * increment of ->seq[], then the call to srcu_readers_active_idx()
+	 * must see the increment of ->c[].
+	 */
+	smp_mb(); /* A */
 
 	/*
 	 * Note that srcu_readers_active_idx() can incorrectly return
 	 * zero even though there is a pre-existing reader throughout.
 	 * To see this, suppose that task A is in a very long SRCU
 	 * read-side critical section that started on CPU 0, and that
-	 * no other reader exists, so that the modulo sum of the counters
+	 * no other reader exists, so that the sum of the counters
 	 * is equal to one. Then suppose that task B starts executing
 	 * srcu_readers_active_idx(), summing up to CPU 1, and then that
 	 * task C starts reading on CPU 0, so that its increment is not
@@ -122,53 +156,31 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
 		return false;
 
 	/*
-	 * Since the caller recently flipped ->completed, we can see at
-	 * most one increment of each CPU's counter from this point
-	 * forward. The reason for this is that the reader CPU must have
-	 * fetched the index before srcu_readers_active_idx checked
-	 * that CPU's counter, but not yet incremented its counter.
-	 * Its eventual counter increment will follow the read in
-	 * srcu_readers_active_idx(), and that increment is immediately
-	 * followed by smp_mb() B. Because smp_mb() D is between
-	 * the ->completed flip and srcu_readers_active_idx()'s read,
-	 * that CPU's subsequent load of ->completed must see the new
-	 * value, and therefore increment the counter in the other rank.
-	 */
-	smp_mb(); /* A */
-
-	/*
-	 * Now, we check the ->snap array that srcu_readers_active_idx()
-	 * filled in from the per-CPU counter values. Since
-	 * __srcu_read_lock() increments the upper bits of the per-CPU
-	 * counter, an increment/decrement pair will change the value
-	 * of the counter. Since there is only one possible increment,
-	 * the only way to wrap the counter is to have a huge number of
-	 * counter decrements, which requires a huge number of tasks and
-	 * huge SRCU read-side critical-section nesting levels, even on
-	 * 32-bit systems.
+	 * The remainder of this function is the validation step.
+	 * The following smp_mb() D pairs with the smp_mb() C in
+	 * __srcu_read_unlock(). If the __srcu_read_unlock() was seen
+	 * by srcu_readers_active_idx() above, then any destructive
+	 * operation performed after the grace period will happen after
+	 * the corresponding SRCU read-side critical section.
 	 *
-	 * All of the ways of confusing the readings require that the scan
-	 * in srcu_readers_active_idx() see the read-side task's decrement,
-	 * but not its increment. However, between that decrement and
-	 * increment are smb_mb() B and C. Either or both of these pair
-	 * with smp_mb() A above to ensure that the scan below will see
-	 * the read-side tasks's increment, thus noting a difference in
-	 * the counter values between the two passes.
-	 *
-	 * Therefore, if srcu_readers_active_idx() returned zero, and
-	 * none of the counters changed, we know that the zero was the
-	 * correct sum.
-	 *
-	 * Of course, it is possible that a task might be delayed
-	 * for a very long time in __srcu_read_lock() after fetching
-	 * the index but before incrementing its counter. This
-	 * possibility will be dealt with in __synchronize_srcu().
+	 * Note that there can be at most NR_CPUS worth of readers using
+	 * the old index, which is not enough to overflow even a 32-bit
+	 * integer. (Yes, this does mean that systems having more than
+	 * a billion or so CPUs need to be 64-bit systems.) Therefore,
+	 * the sum of the ->seq[] counters cannot possibly overflow.
+	 * Therefore, the only way that the return values of the two
+	 * calls to srcu_readers_seq_idx() can be equal is if there were
+	 * no increments of the corresponding rank of ->seq[] counts
+	 * in the interim. But the missed-increment scenario laid out
+	 * above includes an increment of the ->seq[] counter by
+	 * the corresponding __srcu_read_lock(). Therefore, if this
+	 * scenario occurs, the return values from the two calls to
+	 * srcu_readers_seq_idx() will differ, and thus the validation
+	 * step below suffices.
 	 */
-	for_each_possible_cpu(cpu)
-		if (sp->snap[cpu] !=
-		    ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]))
-			return false; /* False zero reading! */
-	return true;
+	smp_mb(); /* D */
+
+	return srcu_readers_seq_idx(sp, idx) == seq;
 }
 
 /**
@@ -216,9 +228,9 @@ int __srcu_read_lock(struct srcu_struct *sp)
 	preempt_disable();
 	idx = rcu_dereference_index_check(sp->completed,
 					  rcu_read_lock_sched_held()) & 0x1;
-	ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) +=
-		SRCU_USAGE_COUNT + 1;
+	ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1;
 	smp_mb(); /* B */ /* Avoid leaking the critical section. */
+	ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1;
 	preempt_enable();
 	return idx;
 }
@@ -258,17 +270,6 @@ static void wait_idx(struct srcu_struct *sp, int idx, bool expedited)
 	int trycount = 0;
 
 	/*
-	 * If a reader fetches the index before the ->completed increment,
-	 * but increments its counter after srcu_readers_active_idx_check()
-	 * sums it, then smp_mb() D will pair with __srcu_read_lock()'s
-	 * smp_mb() B to ensure that the SRCU read-side critical section
-	 * will see any updates that the current task performed before its
-	 * call to synchronize_srcu(), or to synchronize_srcu_expedited(),
-	 * as the case may be.
-	 */
-	smp_mb(); /* D */
-
-	/*
 	 * SRCU read-side critical sections are normally short, so wait
 	 * a small amount of time before possibly blocking.
 	 */
@@ -281,18 +282,6 @@ static void wait_idx(struct srcu_struct *sp, int idx, bool expedited)
 			schedule_timeout_interruptible(1);
 		}
 	}
-
-	/*
-	 * The following smp_mb() E pairs with srcu_read_unlock()'s
-	 * smp_mb C to ensure that if srcu_readers_active_idx_check()
-	 * sees srcu_read_unlock()'s counter decrement, then any
-	 * of the current task's subsequent code will happen after
-	 * that SRCU read-side critical section.
-	 *
-	 * It also ensures the order between the above waiting and
-	 * the next flipping.
-	 */
-	smp_mb(); /* E */
 }
 
 static void srcu_flip(struct srcu_struct *sp)
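A companion sketch of the reader side, mirroring the __srcu_read_lock() hunk above; __srcu_read_unlock() is not touched by this diff and is modeled here only from the smp_mb() C / smp_mb() D pairing described in the new comments. It links against the c[]/seq[] arrays defined in the earlier sketch and, like that sketch, is an illustrative user-space model rather than kernel code.

#include <stdatomic.h>

#define NCPUS 4

extern atomic_ulong c[NCPUS][2];	/* defined in the sketch near the top */
extern atomic_ulong seq[NCPUS][2];

/* Models __srcu_read_lock() as changed by the -216,9 hunk. */
int read_lock_model(int cpu, int completed)
{
	int idx = completed & 0x1;		/* pick the current rank */

	atomic_fetch_add(&c[cpu][idx], 1);	/* reader becomes active */
	atomic_thread_fence(memory_order_seq_cst);	/* smp_mb() B, pairs with A */
	atomic_fetch_add(&seq[cpu][idx], 1);	/* record that a lock happened */
	return idx;
}

/* Models __srcu_read_unlock(), which this diff leaves unchanged. */
void read_unlock_model(int cpu, int idx)
{
	atomic_thread_fence(memory_order_seq_cst);	/* smp_mb() C, pairs with D */
	atomic_fetch_sub(&c[cpu][idx], 1);	/* reader no longer active */
}

The ordering is the one the new comments rely on: the B/A pairing guarantees that if the first ->seq[] sum sees a lock, the ->c[] sum sees its increment too, while the C/D pairing ensures that a lock whose ->c[] increment the scan missed still shows up in the second ->seq[] sum.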