author		Lai Jiangshan <laijs@cn.fujitsu.com>	2012-02-27 12:29:09 -0500
committer	Paul E. McKenney <paulmck@linux.vnet.ibm.com>	2012-04-30 13:48:22 -0400
commit		b52ce066c55a6a53cf1f8d71308d74f908e31b99
tree		e814e4e175f2bd8e1c0795247f413d711c7350df /kernel/srcu.c
parent		18108ebfebe9e871d0a9af830baf8f5df69eb5fc
rcu: Implement a variant of Peter's SRCU algorithm
This commit implements a variant of Peter's algorithm, which may be found
at https://lkml.org/lkml/2012/2/1/119.

o	Make the checking lock-free to enable parallel checking.
	Parallel checking is required when (1) the original checking
	task is preempted for a long time, (2) synchronize_srcu_expedited()
	starts during an ongoing SRCU grace period, or (3) we wish to
	avoid acquiring a lock.

o	Since the checking is lock-free, we avoid a mutex in the state
	machine for call_srcu().

o	Remove the SRCU_REF_MASK and remove the coupling with the flipping.
	This might allow us to remove the preempt_disable() in future
	versions, though such removal will need great care because it
	rescinds the one-old-reader-per-CPU guarantee.

o	Remove a smp_mb(), simplify the comments and make the smp_mb() pairs
	more intuitive.

Inspired-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
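As an illustration of the counting scheme the patch below implements (this sketch is not part of the patch, and every name in it is hypothetical), the following standalone C11 program models the two per-CPU counter arrays: the read lock bumps ->c[] and then ->seq[], and the lock-free check samples ->seq[], scans ->c[], and re-samples ->seq[] to decide whether a zero sum is stable. atomic_thread_fence() stands in for the kernel's smp_mb(), and the unlock-side ordering is inferred from the barrier-pairing comments in the patch.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define TOY_NR_CPUS 4

struct toy_srcu {
	atomic_ulong c[TOY_NR_CPUS][2];		/* models the per-CPU ->c[] counters */
	atomic_ulong seq[TOY_NR_CPUS][2];	/* models the per-CPU ->seq[] counters */
	atomic_int completed;
};

/* Reader side: increment ->c[], barrier B, then increment ->seq[]. */
static int toy_read_lock(struct toy_srcu *sp, int cpu)
{
	int idx = atomic_load(&sp->completed) & 0x1;

	atomic_fetch_add(&sp->c[cpu][idx], 1);
	atomic_thread_fence(memory_order_seq_cst);	/* B */
	atomic_fetch_add(&sp->seq[cpu][idx], 1);
	return idx;
}

/* Reader side: barrier C, then decrement ->c[] on whatever CPU we run on now. */
static void toy_read_unlock(struct toy_srcu *sp, int cpu, int idx)
{
	atomic_thread_fence(memory_order_seq_cst);	/* C */
	atomic_fetch_sub(&sp->c[cpu][idx], 1);
}

static unsigned long toy_sum(atomic_ulong (*counts)[2], int idx)
{
	unsigned long sum = 0;

	for (int cpu = 0; cpu < TOY_NR_CPUS; cpu++)
		sum += atomic_load(&counts[cpu][idx]);
	return sum;
}

/* Update side: lock-free "stable zero" check, in the spirit of srcu_readers_active_idx_check(). */
static bool toy_readers_gone(struct toy_srcu *sp, int idx)
{
	unsigned long seq = toy_sum(sp->seq, idx);

	atomic_thread_fence(memory_order_seq_cst);	/* A */
	if (toy_sum(sp->c, idx) != 0)
		return false;				/* readers still present */
	atomic_thread_fence(memory_order_seq_cst);	/* D */
	return toy_sum(sp->seq, idx) == seq;		/* unchanged ->seq[]: the zero is stable */
}

int main(void)
{
	static struct toy_srcu sp;
	int idx = toy_read_lock(&sp, 0);

	printf("readers gone? %d\n", toy_readers_gone(&sp, idx));	/* 0: reader active */
	toy_read_unlock(&sp, 0, idx);
	printf("readers gone? %d\n", toy_readers_gone(&sp, idx));	/* 1: stable zero */
	return 0;
}

Because a reader can lock on one CPU and unlock on another, an individual per-CPU count may wrap below zero; only the sum across all CPUs is meaningful, which is why both the kernel code and this sketch always sum over every possible CPU.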
Diffstat (limited to 'kernel/srcu.c')
-rw-r--r--	kernel/srcu.c	149
1 file changed, 69 insertions(+), 80 deletions(-)
diff --git a/kernel/srcu.c b/kernel/srcu.c
index 1fecb4d858ed..e0139a274856 100644
--- a/kernel/srcu.c
+++ b/kernel/srcu.c
@@ -73,10 +73,25 @@ EXPORT_SYMBOL_GPL(init_srcu_struct);
 #endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
 /*
+ * Returns approximate total of the readers' ->seq[] values for the
+ * rank of per-CPU counters specified by idx.
+ */
+static unsigned long srcu_readers_seq_idx(struct srcu_struct *sp, int idx)
+{
+	int cpu;
+	unsigned long sum = 0;
+	unsigned long t;
+
+	for_each_possible_cpu(cpu) {
+		t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->seq[idx]);
+		sum += t;
+	}
+	return sum;
+}
+
+/*
  * Returns approximate number of readers active on the specified rank
- * of per-CPU counters.  Also snapshots each counter's value in the
- * corresponding element of sp->snap[] for later use validating
- * the sum.
+ * of the per-CPU ->c[] counters.
  */
 static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
 {
@@ -87,26 +102,45 @@ static unsigned long srcu_readers_active_idx(struct srcu_struct *sp, int idx)
 	for_each_possible_cpu(cpu) {
 		t = ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]);
 		sum += t;
-		sp->snap[cpu] = t;
 	}
-	return sum & SRCU_REF_MASK;
+	return sum;
 }
 
 /*
- * To be called from the update side after an index flip.  Returns true
- * if the modulo sum of the counters is stably zero, false if there is
- * some possibility of non-zero.
+ * Return true if the number of pre-existing readers is determined to
+ * be stably zero.  An example unstable zero can occur if the call
+ * to srcu_readers_active_idx() misses an __srcu_read_lock() increment,
+ * but due to task migration, sees the corresponding __srcu_read_unlock()
+ * decrement.  This can happen because srcu_readers_active_idx() takes
+ * time to sum the array, and might in fact be interrupted or preempted
+ * partway through the summation.
  */
 static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
 {
-	int cpu;
+	unsigned long seq;
+
+	seq = srcu_readers_seq_idx(sp, idx);
+
+	/*
+	 * The following smp_mb() A pairs with the smp_mb() B located in
+	 * __srcu_read_lock().  This pairing ensures that if an
+	 * __srcu_read_lock() increments its counter after the summation
+	 * in srcu_readers_active_idx(), then the corresponding SRCU read-side
+	 * critical section will see any changes made prior to the start
+	 * of the current SRCU grace period.
+	 *
+	 * Also, if the above call to srcu_readers_seq_idx() saw the
+	 * increment of ->seq[], then the call to srcu_readers_active_idx()
+	 * must see the increment of ->c[].
+	 */
+	smp_mb(); /* A */
 
 	/*
 	 * Note that srcu_readers_active_idx() can incorrectly return
 	 * zero even though there is a pre-existing reader throughout.
 	 * To see this, suppose that task A is in a very long SRCU
 	 * read-side critical section that started on CPU 0, and that
-	 * no other reader exists, so that the modulo sum of the counters
+	 * no other reader exists, so that the sum of the counters
 	 * is equal to one.  Then suppose that task B starts executing
 	 * srcu_readers_active_idx(), summing up to CPU 1, and then that
 	 * task C starts reading on CPU 0, so that its increment is not
@@ -122,53 +156,31 @@ static bool srcu_readers_active_idx_check(struct srcu_struct *sp, int idx)
 		return false;
 
 	/*
-	 * Since the caller recently flipped ->completed, we can see at
-	 * most one increment of each CPU's counter from this point
-	 * forward.  The reason for this is that the reader CPU must have
-	 * fetched the index before srcu_readers_active_idx checked
-	 * that CPU's counter, but not yet incremented its counter.
-	 * Its eventual counter increment will follow the read in
-	 * srcu_readers_active_idx(), and that increment is immediately
-	 * followed by smp_mb() B.  Because smp_mb() D is between
-	 * the ->completed flip and srcu_readers_active_idx()'s read,
-	 * that CPU's subsequent load of ->completed must see the new
-	 * value, and therefore increment the counter in the other rank.
-	 */
-	smp_mb(); /* A */
-
-	/*
-	 * Now, we check the ->snap array that srcu_readers_active_idx()
-	 * filled in from the per-CPU counter values.  Since
-	 * __srcu_read_lock() increments the upper bits of the per-CPU
-	 * counter, an increment/decrement pair will change the value
-	 * of the counter.  Since there is only one possible increment,
-	 * the only way to wrap the counter is to have a huge number of
-	 * counter decrements, which requires a huge number of tasks and
-	 * huge SRCU read-side critical-section nesting levels, even on
-	 * 32-bit systems.
+	 * The remainder of this function is the validation step.
+	 * The following smp_mb() D pairs with the smp_mb() C in
+	 * __srcu_read_unlock().  If the __srcu_read_unlock() was seen
+	 * by srcu_readers_active_idx() above, then any destructive
+	 * operation performed after the grace period will happen after
+	 * the corresponding SRCU read-side critical section.
 	 *
-	 * All of the ways of confusing the readings require that the scan
-	 * in srcu_readers_active_idx() see the read-side task's decrement,
-	 * but not its increment.  However, between that decrement and
-	 * increment are smb_mb() B and C.  Either or both of these pair
-	 * with smp_mb() A above to ensure that the scan below will see
-	 * the read-side tasks's increment, thus noting a difference in
-	 * the counter values between the two passes.
-	 *
-	 * Therefore, if srcu_readers_active_idx() returned zero, and
-	 * none of the counters changed, we know that the zero was the
-	 * correct sum.
-	 *
-	 * Of course, it is possible that a task might be delayed
-	 * for a very long time in __srcu_read_lock() after fetching
-	 * the index but before incrementing its counter.  This
-	 * possibility will be dealt with in __synchronize_srcu().
+	 * Note that there can be at most NR_CPUS worth of readers using
+	 * the old index, which is not enough to overflow even a 32-bit
+	 * integer.  (Yes, this does mean that systems having more than
+	 * a billion or so CPUs need to be 64-bit systems.)  Therefore,
+	 * the sum of the ->seq[] counters cannot possibly overflow.
+	 * Therefore, the only way that the return values of the two
+	 * calls to srcu_readers_seq_idx() can be equal is if there were
+	 * no increments of the corresponding rank of ->seq[] counts
+	 * in the interim.  But the missed-increment scenario laid out
+	 * above includes an increment of the ->seq[] counter by
+	 * the corresponding __srcu_read_lock().  Therefore, if this
+	 * scenario occurs, the return values from the two calls to
+	 * srcu_readers_seq_idx() will differ, and thus the validation
+	 * step below suffices.
 	 */
-	for_each_possible_cpu(cpu)
-		if (sp->snap[cpu] !=
-		    ACCESS_ONCE(per_cpu_ptr(sp->per_cpu_ref, cpu)->c[idx]))
-			return false; /* False zero reading! */
-	return true;
+	smp_mb(); /* D */
+
+	return srcu_readers_seq_idx(sp, idx) == seq;
 }
 
 /**
@@ -216,9 +228,9 @@ int __srcu_read_lock(struct srcu_struct *sp)
 	preempt_disable();
 	idx = rcu_dereference_index_check(sp->completed,
 					  rcu_read_lock_sched_held()) & 0x1;
-	ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) +=
-		SRCU_USAGE_COUNT + 1;
+	ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->c[idx]) += 1;
 	smp_mb(); /* B */  /* Avoid leaking the critical section. */
+	ACCESS_ONCE(this_cpu_ptr(sp->per_cpu_ref)->seq[idx]) += 1;
 	preempt_enable();
 	return idx;
 }
@@ -258,17 +270,6 @@ static void wait_idx(struct srcu_struct *sp, int idx, bool expedited)
 	int trycount = 0;
 
 	/*
-	 * If a reader fetches the index before the ->completed increment,
-	 * but increments its counter after srcu_readers_active_idx_check()
-	 * sums it, then smp_mb() D will pair with __srcu_read_lock()'s
-	 * smp_mb() B to ensure that the SRCU read-side critical section
-	 * will see any updates that the current task performed before its
-	 * call to synchronize_srcu(), or to synchronize_srcu_expedited(),
-	 * as the case may be.
-	 */
-	smp_mb(); /* D */
-
-	/*
 	 * SRCU read-side critical sections are normally short, so wait
 	 * a small amount of time before possibly blocking.
 	 */
@@ -281,18 +282,6 @@ static void wait_idx(struct srcu_struct *sp, int idx, bool expedited)
 			schedule_timeout_interruptible(1);
 		}
 	}
-
-	/*
-	 * The following smp_mb() E pairs with srcu_read_unlock()'s
-	 * smp_mb C to ensure that if srcu_readers_active_idx_check()
-	 * sees srcu_read_unlock()'s counter decrement, then any
-	 * of the current task's subsequent code will happen after
-	 * that SRCU read-side critical section.
-	 *
-	 * It also ensures the order between the above waiting and
-	 * the next flipping.
-	 */
-	smp_mb(); /* E */
 }
 
 static void srcu_flip(struct srcu_struct *sp)
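To see why the ->seq[] re-check suffices, the short program below replays one possible interleaving of the unstable-zero scenario described in the comments above (the details beyond the quoted comments are assumed for illustration; none of this code is part of the patch): reader A holds the lock on CPU 0 throughout, reader C's lock increment on CPU 0 happens after that CPU has been scanned, and C's unlock decrement lands on CPU 2 before that CPU is scanned, so the unsigned ->c[] sum wraps to exactly zero. The two ->seq[] passes differ, so the validation step rejects the false zero.

#include <stdio.h>

#define NCPU 3

int main(void)
{
	/* One rank (idx) of the per-CPU counters; reader A already holds the lock on CPU 0. */
	unsigned long c[NCPU]   = { 1, 0, 0 };
	unsigned long seq[NCPU] = { 1, 0, 0 };
	unsigned long seq1 = 0, seq2 = 0, sum = 0;
	int cpu;

	for (cpu = 0; cpu < NCPU; cpu++)	/* update side: first ->seq[] pass */
		seq1 += seq[cpu];

	sum += c[0] + c[1];			/* update side: ->c[] scan reaches CPU 1 */

	c[0] += 1;				/* reader C: __srcu_read_lock() on CPU 0, missed by the scan */
	seq[0] += 1;

	c[2] -= 1;				/* reader C migrates, __srcu_read_unlock() on CPU 2 */

	sum += c[2];				/* update side: ->c[] scan finishes; sum wraps to zero */

	for (cpu = 0; cpu < NCPU; cpu++)	/* update side: second ->seq[] pass (validation) */
		seq2 += seq[cpu];

	printf("c[] sum = %lu (false zero: reader A is still active)\n", sum);
	printf("seq[] passes: %lu then %lu -> %s\n", seq1, seq2,
	       seq1 == seq2 ? "stable zero" : "unstable, must retry");
	return 0;
}

Compiled with any C compiler, this prints a ->c[] sum of 0 but unequal ->seq[] sums (1 and 2), so the grace period correctly keeps waiting.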