author     Tejun Heo <tj@kernel.org>                         2010-11-23 00:36:11 -0500
committer  Paul E. McKenney <paulmck@linux.vnet.ibm.com>     2010-12-17 15:34:08 -0500
commit     e27fc9641e8ddc8146f8e01f06e5eba2469698de (patch)
tree       5cff4c23cb113b901718d9c0f4a6e3081f3173bd /kernel/rcutree_plugin.h
parent     46fdb0937f26124700fc9fc80da4776330cc00d3 (diff)
rcu: increase synchronize_sched_expedited() batching
The fix in commit #6a0cc49 requires more than three concurrent instances of
synchronize_sched_expedited() before batching is possible. This patch uses a
ticket-counter-like approach that is also not unrelated to Lai Jiangshan's
Ring RCU to allow sharing of expedited grace periods even when there are only
two concurrent instances of synchronize_sched_expedited().

This commit builds on Tejun's original posting, which may be found at
http://lkml.org/lkml/2010/11/9/204, adding memory barriers, avoiding overflow
of signed integers (other than via atomic_t), and fixing the detection of
batching.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
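To make the two-counter scheme concrete, here is a stand-alone user-space
sketch using C11 atomics. It is illustrative only: expedited(),
force_grace_period(), started, done, and CMP_GE() are invented names,
force_grace_period() merely stands in for try_stop_cpus(), and the real
kernel code is the diff below.

#include <limits.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Wraparound-safe "a is at or after b", in the spirit of UINT_CMP_GE(). */
#define CMP_GE(a, b)	(UINT_MAX / 2 >= (unsigned)(a) - (unsigned)(b))

static atomic_int started;	/* tickets handed out so far */
static atomic_int done;		/* latest ticket known to be covered */

/* Stand-in for try_stop_cpus(); pretend it always succeeds here. */
static bool force_grace_period(void)
{
	return true;
}

static void expedited(void)
{
	/* Take a ticket; the post-increment value is our token. */
	int firstsnap = atomic_fetch_add(&started, 1) + 1;
	int snap = firstsnap;

	while (!force_grace_period()) {
		/* Did somebody else's grace period already cover us? */
		int s = atomic_load(&done);
		if (CMP_GE(s, firstsnap))
			return;
		/* Retry, letting everyone who has already started piggyback. */
		snap = atomic_load(&started) - 1;
	}

	/*
	 * Record that everything up to our snapshot is covered, unless a
	 * later caller has already pushed "done" further along.
	 */
	int s = atomic_load(&done);
	while (!CMP_GE(s, snap) &&
	       !atomic_compare_exchange_weak(&done, &s, snap))
		;	/* failed CAS reloads s; loop rechecks */
}

int main(void)
{
	expedited();
	expedited();	/* a second caller gets its own ticket */
	printf("started=%d done=%d\n",
	       atomic_load(&started), atomic_load(&done));
	return 0;
}

As the comments in the patch put it, "done" catching up to our ticket means
someone else has already done our work for us, so the early return is safe;
losing the final compare-and-swap race is likewise harmless, because the
winner's update covers our snapshot as well.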
Diffstat (limited to 'kernel/rcutree_plugin.h')
-rw-r--r--  kernel/rcutree_plugin.h | 82
1 file changed, 62 insertions(+), 20 deletions(-)
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index c22c4ef2a0d0..a3638710dc67 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1025,7 +1025,8 @@ EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
 
 #else /* #ifndef CONFIG_SMP */
 
-static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
+static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
+static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
 
 static int synchronize_sched_expedited_cpu_stop(void *data)
 {
@@ -1041,8 +1042,6 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
 	 * robustness against future implementation changes.
 	 */
 	smp_mb(); /* See above comment block. */
-	if (cpumask_first(cpu_online_mask) == smp_processor_id())
-		atomic_inc(&synchronize_sched_expedited_count);
 	return 0;
 }
 
@@ -1056,43 +1055,86 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
  * lock that is acquired by a CPU-hotplug notifier. Failing to
  * observe this restriction will result in deadlock.
  *
- * The synchronize_sched_expedited_cpu_stop() function is called
- * in stop-CPU context, but in order to keep overhead down to a dull
- * roar, we don't force this function to wait for its counterparts
- * on other CPUs. One instance of this function will increment the
- * synchronize_sched_expedited_count variable per call to
- * try_stop_cpus(), but there is no guarantee what order this instance
- * will occur in. The worst case is that it is last on one call
- * to try_stop_cpus(), and the first on the next call. This means
- * that piggybacking requires that synchronize_sched_expedited_count
- * be incremented by 3: this guarantees that the piggybacking
- * task has waited through an entire cycle of context switches,
- * even in the worst case.
+ * This implementation can be thought of as an application of ticket
+ * locking to RCU, with sync_sched_expedited_started and
+ * sync_sched_expedited_done taking on the roles of the halves
+ * of the ticket-lock word. Each task atomically increments
+ * sync_sched_expedited_started upon entry, snapshotting the old value,
+ * then attempts to stop all the CPUs. If this succeeds, then each
+ * CPU will have executed a context switch, resulting in an RCU-sched
+ * grace period. We are then done, so we use atomic_cmpxchg() to
+ * update sync_sched_expedited_done to match our snapshot -- but
+ * only if someone else has not already advanced past our snapshot.
+ *
+ * On the other hand, if try_stop_cpus() fails, we check the value
+ * of sync_sched_expedited_done. If it has advanced past our
+ * initial snapshot, then someone else must have forced a grace period
+ * some time after we took our snapshot. In this case, our work is
+ * done for us, and we can simply return. Otherwise, we try again,
+ * but keep our initial snapshot for purposes of checking for someone
+ * doing our work for us.
+ *
+ * If we fail too many times in a row, we fall back to synchronize_sched().
  */
 void synchronize_sched_expedited(void)
 {
-	int snap, trycount = 0;
+	int firstsnap, s, snap, trycount = 0;
 
-	smp_mb(); /* ensure prior mod happens before capturing snap. */
-	snap = atomic_read(&synchronize_sched_expedited_count) + 2;
+	/* Note that atomic_inc_return() implies full memory barrier. */
+	firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started);
 	get_online_cpus();
+
+	/*
+	 * Each pass through the following loop attempts to force a
+	 * context switch on each CPU.
+	 */
 	while (try_stop_cpus(cpu_online_mask,
 			     synchronize_sched_expedited_cpu_stop,
 			     NULL) == -EAGAIN) {
 		put_online_cpus();
+
+		/* No joy, try again later. Or just synchronize_sched(). */
 		if (trycount++ < 10)
 			udelay(trycount * num_online_cpus());
 		else {
 			synchronize_sched();
 			return;
 		}
-		if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
+
+		/* Check to see if someone else did our work for us. */
+		s = atomic_read(&sync_sched_expedited_done);
+		if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) {
 			smp_mb(); /* ensure test happens before caller kfree */
 			return;
 		}
+
+		/*
+		 * Refetching sync_sched_expedited_started allows later
+		 * callers to piggyback on our grace period. We subtract
+		 * 1 to get the same token that the last incrementer got.
+		 * We retry after they started, so our grace period works
+		 * for them, and they started after our first try, so their
+		 * grace period works for us.
+		 */
 		get_online_cpus();
+		snap = atomic_read(&sync_sched_expedited_started) - 1;
+		smp_mb(); /* ensure read is before try_stop_cpus(). */
 	}
-	smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
+
+	/*
+	 * Everyone up to our most recent fetch is covered by our grace
+	 * period. Update the counter, but only if our work is still
+	 * relevant -- which it won't be if someone who started later
+	 * than we did beat us to the punch.
+	 */
+	do {
+		s = atomic_read(&sync_sched_expedited_done);
+		if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) {
+			smp_mb(); /* ensure test happens before caller kfree */
+			break;
+		}
+	} while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s);
+
 	put_online_cpus();
 }
 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
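A note on the comparisons above: the pre-patch check
"atomic_read(...) - snap > 0" relied on signed subtraction, which the commit
message calls out as an overflow hazard; the new code instead compares the
counters through UINT_CMP_GE() on unsigned values. UINT_CMP_GE() is defined
elsewhere in the RCU code and is not part of this diff, so the macro below is
only an assumed, illustrative equivalent showing why the unsigned form stays
correct across counter wraparound.

/*
 * Illustration of a wraparound-safe ">=" for counters like
 * sync_sched_expedited_started/done. MY_UINT_CMP_GE() is a stand-in,
 * not necessarily the kernel's exact UINT_CMP_GE() definition.
 */
#include <assert.h>
#include <limits.h>

#define MY_UINT_CMP_GE(a, b)	(UINT_MAX / 2 >= (unsigned)(a) - (unsigned)(b))

int main(void)
{
	/* Ordinary cases. */
	assert(MY_UINT_CMP_GE(10u, 3u));
	assert(!MY_UINT_CMP_GE(3u, 10u));

	/* After wraparound: UINT_MAX + 2 wraps to 1, which is "after" UINT_MAX. */
	unsigned newer = UINT_MAX + 2u;		/* wraps to 1 */
	assert(MY_UINT_CMP_GE(newer, UINT_MAX));
	assert(!MY_UINT_CMP_GE(UINT_MAX, newer));
	return 0;
}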