author		Tejun Heo <tj@kernel.org>	2010-11-23 00:36:11 -0500
committer	Paul E. McKenney <paulmck@linux.vnet.ibm.com>	2010-12-17 15:34:08 -0500
commit		e27fc9641e8ddc8146f8e01f06e5eba2469698de (patch)
tree		5cff4c23cb113b901718d9c0f4a6e3081f3173bd
parent		46fdb0937f26124700fc9fc80da4776330cc00d3 (diff)
rcu: increase synchronize_sched_expedited() batching
The fix in commit #6a0cc49 requires more than three concurrent instances
of synchronize_sched_expedited() before batching is possible. This
patch uses a ticket-counter-like approach that is also not unrelated to
Lai Jiangshan's Ring RCU to allow sharing of expedited grace periods even
when there are only two concurrent instances of synchronize_sched_expedited().
This commit builds on Tejun's original posting, which may be found at
http://lkml.org/lkml/2010/11/9/204, adding memory barriers, avoiding
overflow of signed integers (other than via atomic_t), and fixing the
detection of batching.
Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
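The ticket-counter scheme described above can be illustrated with a stripped-down user-space model. This is a sketch only: it uses C11 atomics in place of the kernel's atomic_t, a stub stop_all_cpus() in place of try_stop_cpus(), and omits the retry backoff, the synchronize_sched() fallback, and all CPU-hotplug handling.

#include <limits.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Wraparound-safe comparison, as added to rcupdate.h by this patch. */
#define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b))

static atomic_uint started;	/* tickets handed out on entry */
static atomic_uint done;	/* highest ticket covered by a completed grace period */

/* Stand-in for try_stop_cpus(); in this model the attempt always "succeeds". */
static bool stop_all_cpus(void)
{
	return true;
}

static void expedited_model(void)
{
	unsigned int firstsnap, snap, s;

	/* Take a ticket; fetch_add + 1 mirrors atomic_inc_return(). */
	firstsnap = snap = atomic_fetch_add(&started, 1) + 1;

	while (!stop_all_cpus()) {
		/* Did someone else's grace period already cover our ticket? */
		s = atomic_load(&done);
		if (UINT_CMP_GE(s, firstsnap))
			return;

		/* Refetch so later arrivals can piggyback on our retry. */
		snap = atomic_load(&started) - 1;
	}

	/* Publish our grace period unless someone already advanced further. */
	do {
		s = atomic_load(&done);
		if (UINT_CMP_GE(s, snap))
			break;
	} while (!atomic_compare_exchange_weak(&done, &s, snap));
}

int main(void)
{
	expedited_model();
	expedited_model();
	printf("started=%u done=%u\n", atomic_load(&started), atomic_load(&done));
	return 0;
}

The point of the model is the batching test: a caller whose first ticket is already covered by the done counter returns without forcing a grace period of its own, which is how two concurrent callers can share one expedited grace period.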
-rw-r--r--	include/linux/rcupdate.h	 2
-rw-r--r--	kernel/rcutree_plugin.h		82
2 files changed, 64 insertions, 20 deletions
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 49e8e16308e1..af5614856285 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -47,6 +47,8 @@
 extern int rcutorture_runnable; /* for sysctl */
 #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
 
+#define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b))
+#define UINT_CMP_LT(a, b) (UINT_MAX / 2 < (a) - (b))
 #define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b))
 #define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b))
 
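The new UINT_CMP_GE()/UINT_CMP_LT() macros compare free-running counters modulo 2^32, so the ordering test remains correct after the counter wraps, where a plain >= on the raw values would give the wrong answer. A standalone check (not part of the patch):

#include <assert.h>
#include <limits.h>

#define UINT_CMP_GE(a, b) (UINT_MAX / 2 >= (a) - (b))
#define UINT_CMP_LT(a, b) (UINT_MAX / 2 < (a) - (b))

int main(void)
{
	unsigned int before = UINT_MAX - 1;	/* counter just below wraparound */
	unsigned int after = before + 3;	/* wraps around to 1 */

	assert(UINT_CMP_GE(after, before));	/* "after" is still seen as later */
	assert(UINT_CMP_LT(before, after));	/* and "before" as earlier */
	assert(UINT_CMP_GE(7u, 7u));		/* equal values compare as GE */
	return 0;
}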
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index c22c4ef2a0d0..a3638710dc67 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -1025,7 +1025,8 @@ EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
 
 #else /* #ifndef CONFIG_SMP */
 
-static atomic_t synchronize_sched_expedited_count = ATOMIC_INIT(0);
+static atomic_t sync_sched_expedited_started = ATOMIC_INIT(0);
+static atomic_t sync_sched_expedited_done = ATOMIC_INIT(0);
 
 static int synchronize_sched_expedited_cpu_stop(void *data)
 {
@@ -1041,8 +1042,6 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
 	 * robustness against future implementation changes.
 	 */
 	smp_mb(); /* See above comment block. */
-	if (cpumask_first(cpu_online_mask) == smp_processor_id())
-		atomic_inc(&synchronize_sched_expedited_count);
 	return 0;
 }
 
@@ -1056,43 +1055,86 @@ static int synchronize_sched_expedited_cpu_stop(void *data)
  * lock that is acquired by a CPU-hotplug notifier. Failing to
  * observe this restriction will result in deadlock.
  *
- * The synchronize_sched_expedited_cpu_stop() function is called
- * in stop-CPU context, but in order to keep overhead down to a dull
- * roar, we don't force this function to wait for its counterparts
- * on other CPUs. One instance of this function will increment the
- * synchronize_sched_expedited_count variable per call to
- * try_stop_cpus(), but there is no guarantee what order this instance
- * will occur in. The worst case is that it is last on one call
- * to try_stop_cpus(), and the first on the next call. This means
- * that piggybacking requires that synchronize_sched_expedited_count
- * be incremented by 3: this guarantees that the piggybacking
- * task has waited through an entire cycle of context switches,
- * even in the worst case.
+ * This implementation can be thought of as an application of ticket
+ * locking to RCU, with sync_sched_expedited_started and
+ * sync_sched_expedited_done taking on the roles of the halves
+ * of the ticket-lock word. Each task atomically increments
+ * sync_sched_expedited_started upon entry, snapshotting the old value,
+ * then attempts to stop all the CPUs. If this succeeds, then each
+ * CPU will have executed a context switch, resulting in an RCU-sched
+ * grace period. We are then done, so we use atomic_cmpxchg() to
+ * update sync_sched_expedited_done to match our snapshot -- but
+ * only if someone else has not already advanced past our snapshot.
+ *
+ * On the other hand, if try_stop_cpus() fails, we check the value
+ * of sync_sched_expedited_done. If it has advanced past our
+ * initial snapshot, then someone else must have forced a grace period
+ * some time after we took our snapshot. In this case, our work is
+ * done for us, and we can simply return. Otherwise, we try again,
+ * but keep our initial snapshot for purposes of checking for someone
+ * doing our work for us.
+ *
+ * If we fail too many times in a row, we fall back to synchronize_sched().
  */
 void synchronize_sched_expedited(void)
 {
-	int snap, trycount = 0;
+	int firstsnap, s, snap, trycount = 0;
 
-	smp_mb(); /* ensure prior mod happens before capturing snap. */
-	snap = atomic_read(&synchronize_sched_expedited_count) + 2;
+	/* Note that atomic_inc_return() implies full memory barrier. */
+	firstsnap = snap = atomic_inc_return(&sync_sched_expedited_started);
 	get_online_cpus();
+
+	/*
+	 * Each pass through the following loop attempts to force a
+	 * context switch on each CPU.
+	 */
 	while (try_stop_cpus(cpu_online_mask,
 			     synchronize_sched_expedited_cpu_stop,
 			     NULL) == -EAGAIN) {
 		put_online_cpus();
+
+		/* No joy, try again later. Or just synchronize_sched(). */
 		if (trycount++ < 10)
 			udelay(trycount * num_online_cpus());
 		else {
 			synchronize_sched();
 			return;
 		}
-		if (atomic_read(&synchronize_sched_expedited_count) - snap > 0) {
+
+		/* Check to see if someone else did our work for us. */
+		s = atomic_read(&sync_sched_expedited_done);
+		if (UINT_CMP_GE((unsigned)s, (unsigned)firstsnap)) {
 			smp_mb(); /* ensure test happens before caller kfree */
 			return;
 		}
+
+		/*
+		 * Refetching sync_sched_expedited_started allows later
+		 * callers to piggyback on our grace period. We subtract
+		 * 1 to get the same token that the last incrementer got.
+		 * We retry after they started, so our grace period works
+		 * for them, and they started after our first try, so their
+		 * grace period works for us.
+		 */
 		get_online_cpus();
+		snap = atomic_read(&sync_sched_expedited_started) - 1;
+		smp_mb(); /* ensure read is before try_stop_cpus(). */
 	}
-	smp_mb__after_atomic_inc(); /* ensure post-GP actions seen after GP. */
+
+	/*
+	 * Everyone up to our most recent fetch is covered by our grace
+	 * period. Update the counter, but only if our work is still
+	 * relevant -- which it won't be if someone who started later
+	 * than we did beat us to the punch.
+	 */
+	do {
+		s = atomic_read(&sync_sched_expedited_done);
+		if (UINT_CMP_GE((unsigned)s, (unsigned)snap)) {
+			smp_mb(); /* ensure test happens before caller kfree */
			break;
+		}
+	} while (atomic_cmpxchg(&sync_sched_expedited_done, s, snap) != s);
+
 	put_online_cpus();
 }
 EXPORT_SYMBOL_GPL(synchronize_sched_expedited);
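The repeated "ensure test happens before caller kfree" barriers reflect the typical use of synchronize_sched_expedited(): an updater publishes a new version of a data structure, waits for the expedited grace period, and then frees the old version. The following caller is purely illustrative; struct foo, gp, gp_lock, and the reader/updater functions are hypothetical and not part of this patch.

#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

struct foo {
	int data;
};

static struct foo *gp;
static DEFINE_SPINLOCK(gp_lock);

/* RCU-sched reader: a preemption-disabled region is a read-side section. */
static int read_foo(void)
{
	struct foo *p;
	int val;

	rcu_read_lock_sched();
	p = rcu_dereference_sched(gp);
	val = p ? p->data : -1;
	rcu_read_unlock_sched();
	return val;
}

/* Updater: publish the new version, then wait before freeing the old one. */
static void update_foo(struct foo *newp)
{
	struct foo *oldp;

	spin_lock(&gp_lock);
	oldp = gp;
	rcu_assign_pointer(gp, newp);
	spin_unlock(&gp_lock);

	synchronize_sched_expedited();	/* all pre-existing readers have finished */
	kfree(oldp);			/* so freeing the old version is now safe */
}

Because RCU-sched read-side critical sections run with preemption disabled, forcing every CPU through a context switch, which is what the try_stop_cpus() call in the patch achieves, guarantees that all readers that began before the call have completed by the time it returns.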