author     Tejun Heo <tj@kernel.org>                         2010-11-23 00:36:11 -0500
committer  Paul E. McKenney <paulmck@linux.vnet.ibm.com>     2010-12-17 15:34:08 -0500
commit     e27fc9641e8ddc8146f8e01f06e5eba2469698de (patch)
tree       5cff4c23cb113b901718d9c0f4a6e3081f3173bd /kernel/rcutree_plugin.h
parent     46fdb0937f26124700fc9fc80da4776330cc00d3 (diff)
rcu: increase synchronize_sched_expedited() batching
The fix in commit #6a0cc49 requires more than three concurrent instances of
synchronize_sched_expedited() before batching is possible. This patch uses a
ticket-counter-like approach that is also not unrelated to Lai Jiangshan's
Ring RCU to allow sharing of expedited grace periods even when there are only
two concurrent instances of synchronize_sched_expedited().

This commit builds on Tejun's original posting, which may be found at
http://lkml.org/lkml/2010/11/9/204, adding memory barriers, avoiding overflow
of signed integers (other than via atomic_t), and fixing the detection of
batching.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
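To make the two-counter scheme concrete, here is a stand-alone user-space
sketch using C11 atomics. It is illustrative only: expedited(),
force_grace_period(), started, done, and CMP_GE() are invented names,
force_grace_period() merely stands in for try_stop_cpus(), and the real
kernel code is the diff below.

#include <limits.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Wraparound-safe "a is at or after b", in the spirit of UINT_CMP_GE(). */
#define CMP_GE(a, b)	(UINT_MAX / 2 >= (unsigned)(a) - (unsigned)(b))

static atomic_int started;	/* tickets handed out so far */
static atomic_int done;		/* latest ticket known to be covered */

/* Stand-in for try_stop_cpus(); pretend it always succeeds here. */
static bool force_grace_period(void)
{
	return true;
}

static void expedited(void)
{
	/* Take a ticket; the post-increment value is our token. */
	int firstsnap = atomic_fetch_add(&started, 1) + 1;
	int snap = firstsnap;

	while (!force_grace_period()) {
		/* Did somebody else's grace period already cover us? */
		int s = atomic_load(&done);
		if (CMP_GE(s, firstsnap))
			return;
		/* Retry, letting everyone who has already started piggyback. */
		snap = atomic_load(&started) - 1;
	}

	/*
	 * Record that everything up to our snapshot is covered, unless a
	 * later caller has already pushed "done" further along.
	 */
	int s = atomic_load(&done);
	while (!CMP_GE(s, snap) &&
	       !atomic_compare_exchange_weak(&done, &s, snap))
		;	/* failed CAS reloads s; loop rechecks */
}

int main(void)
{
	expedited();
	expedited();	/* a second caller gets its own ticket */
	printf("started=%d done=%d\n",
	       atomic_load(&started), atomic_load(&done));
	return 0;
}

As the comments in the patch put it, "done" catching up to our ticket means
someone else has already done our work for us, so the early return is safe;
losing the final compare-and-swap race is likewise harmless, because the
winner's update covers our snapshot as well.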
Diffstat (limited to 'kernel/rcutree_plugin.h')
-rw-r--r--  kernel/rcutree_plugin.h | 82
1 file changed, 62 insertions(+), 20 deletions(-)
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index c22c4ef2a0d0..a3638710dc67 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1025,7 +1025,8 @@ EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
 
 #else /* #ifndef CONFIG_SMP */
 
-static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
+static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
+static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
 
 static int synchronize_sched_expedited_cpu_stop(void *data)
 {
@@ -1041,8 +1042,6 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
 	 * robustness against future implementation changes.
 	 */
 	smp_mb(); /* See above comment block. */
-	if (cpumask_first(cpu_online_mask) == smp_processor_id())
-		atomic_inc(&synchronize_sched_expedited_count);
 	return 0;
 }
 
@@ -1056,43 +1055,86 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
  * lock that is acquired by a CPU-hotplug notifier. Failing to
  * observe this restriction will result in deadlock.
  *
- * The synchronize_sched_expedited_cpu_stop() function is called
- * in stop-CPU context, but in order to keep overhead down to a dull
- * roar, we don't force this function to wait for its counterparts
- * on other CPUs. One instance of this function will increment the
- * synchronize_sched_expedited_count variable per call to
- * try_stop_cpus(), but there is no guarantee what order this instance
- * will occur in. The worst case is that it is last on one call
- * to try_stop_cpus(), and the first on the next call. This means
- * that piggybacking requires that synchronize_sched_expedited_count
- * be incremented by 3: this guarantees that the piggybacking
- * task has waited through an entire cycle of context switches,
- * even in the worst case.
+ * This implementation can be thought of as an application of ticket
+ * locking to RCU, with sync_sched_expedited_started and
+ * sync_sched_expedited_done taking on the roles of the halves
+ * of the ticket-lock word. Each task atomically increments
+ * sync_sched_expedited_started upon entry, snapshotting the old value,
+ * then attempts to stop all the CPUs. If this succeeds, then each
+ * CPU will have executed a context switch, resulting in an RCU-sched
+ * grace period. We are then done, so we use atomic_cmpxchg() to
+ * update sync_sched_expedited_done to match our snapshot -- but
+ * only if someone else has not already advanced past our snapshot.
+ *
+ * On the other hand, if try_stop_cpus() fails, we check the value
+ * of sync_sched_expedited_done. If it has advanced past our
+ * initial snapshot, then someone else must have forced a grace period
+ * some time after we took our snapshot. In this case, our work is
+ * done for us, and we can simply return. Otherwise, we try again,
+ * but keep our initial snapshot for purposes of checking for someone
+ * doing our work for us.
+ *
+ * If we fail too many times in a row, we fall back to synchronize_sched().
  */
 void synchronize_sched_expedited(void)
 {
-	int snap, trycount = 0;
+	int firstsnap, s, snap, trycount = 0;
 
-	smp_mb(); /* ensure prior mod happens before capturing snap. */
-	snap = atomic_read(&synchronize_sched_expedited_count) + 2;
+	/* Note that atomic_inc_return() implies full memory barrier. */
+	firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started);
 	get_online_cpus();
+
+	/*
+	 * Each pass through the following loop attempts to force a
+	 * context switch on each CPU.
+	 */
 	while (try_stop_cpus(cpu_online_mask,
 			     synchronize_sched_expedited_cpu_stop,
 			     NULL) == -EAGAIN) {
 		put_online_cpus();
+
+		/* No joy, try again later. Or just synchronize_sched(). */
 		if (trycount++ < 10)
 			udelay(trycount * num_online_cpus());
 		else {
 			synchronize_sched();
 			return;
 		}
-		if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
+
+		/* Check to see if someone else did our work for us. */
+		s = atomic_read(&sync_sched_expedited_done);
+		if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) {
 			smp_mb(); /* ensure test happens before caller kfree */
 			return;
 		}
+
+		/*
+		 * Refetching sync_sched_expedited_started allows later
+		 * callers to piggyback on our grace period. We subtract
+		 * 1 to get the same token that the last incrementer got.
+		 * We retry after they started, so our grace period works
+		 * for them, and they started after our first try, so their
+		 * grace period works for us.
+		 */
 		get_online_cpus();
+		snap = atomic_read(&sync_sched_expedited_started) - 1;
+		smp_mb(); /* ensure read is before try_stop_cpus(). */
 	}
-	smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
+
+	/*
+	 * Everyone up to our most recent fetch is covered by our grace
+	 * period. Update the counter, but only if our work is still
+	 * relevant -- which it won't be if someone who started later
+	 * than we did beat us to the punch.
+	 */
+	do {
+		s = atomic_read(&sync_sched_expedited_done);
+		if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) {
+			smp_mb(); /* ensure test happens before caller kfree */
+			break;
+		}
+	} while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s);
+
 	put_online_cpus();
 }
 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
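A note on the comparisons above: the pre-patch check
"atomic_read(...) - snap > 0" relied on signed subtraction, which the commit
message calls out as an overflow hazard; the new code instead compares the
counters through UINT_CMP_GE() on unsigned values. UINT_CMP_GE() is defined
elsewhere in the RCU code and is not part of this diff, so the macro below is
only an assumed, illustrative equivalent showing why the unsigned form stays
correct across counter wraparound.

/*
 * Illustration of a wraparound-safe ">=" for counters like
 * sync_sched_expedited_started/done. MY_UINT_CMP_GE() is a stand-in,
 * not necessarily the kernel's exact UINT_CMP_GE() definition.
 */
#include <assert.h>
#include <limits.h>

#define MY_UINT_CMP_GE(a, b)	(UINT_MAX / 2 >= (unsigned)(a) - (unsigned)(b))

int main(void)
{
	/* Ordinary cases. */
	assert(MY_UINT_CMP_GE(10u, 3u));
	assert(!MY_UINT_CMP_GE(3u, 10u));

	/* After wraparound: UINT_MAX + 2 wraps to 1, which is "after" UINT_MAX. */
	unsigned newer = UINT_MAX + 2u;		/* wraps to 1 */
	assert(MY_UINT_CMP_GE(newer, UINT_MAX));
	assert(!MY_UINT_CMP_GE(UINT_MAX, newer));
	return 0;
}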