author     Paul E. McKenney <paulmck@linux.vnet.ibm.com>    2012-12-03 16:52:00 -0500
committer  Paul E. McKenney <paulmck@linux.vnet.ibm.com>    2013-01-08 17:15:57 -0500
commit     dc35c8934eba959b690921615fcd987e8bc17e4a
tree       c8020ec8ac191505e35905e89e58f02aa4e5fc31
parent     1b0048a44c502c5ab850203e6e0a6498d7d8676d
rcu: Tag callback lists with corresponding grace-period number

Currently, callbacks are advanced each time the corresponding CPU
notices a change in its leaf rcu_node structure's ->completed value
(this value counts grace-period completions). This approach has worked
quite well, but with the advent of RCU_FAST_NO_HZ, we cannot count on
a given CPU seeing all the grace-period completions. When a CPU misses
a grace-period completion that occurs while it is in dyntick-idle mode,
invocation of its callbacks is delayed.

In addition, acceleration of callbacks (when RCU realizes that a given
callback need only wait until the end of the next grace period, rather
than having to wait for a partial grace period followed by a full
grace period) must be carried out extremely carefully. Insufficient
acceleration will result in unnecessarily long grace-period latencies,
while excessive acceleration will result in premature callback
invocation. Changes that involve this tradeoff are therefore among the
most nerve-wracking changes to RCU.

This commit therefore explicitly tags groups of callbacks with the
number of the grace period that they are waiting for. This means that
the callback-advancement and callback-acceleration functions are
idempotent, so that excessive acceleration merely wastes a few CPU
cycles. This also allows a CPU to take full advantage of any grace
periods that have elapsed while it has been in dyntick-idle mode. It
should also enable simultaneous simplifications to and optimizations
of RCU_FAST_NO_HZ.
Signed-off-by: Paul E. McKenney <paul.mckenney@linaro.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
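To make the tagging scheme concrete before diving into the diff, here
is a stand-alone sketch of the idea. Every name in it (cb_group,
accelerate(), advance()) is invented for illustration only; the real
implementation below works on the segmented rdp->nxttail[] /
rdp->nxtcompleted[] arrays via rcu_accelerate_cbs() and
rcu_advance_cbs(), and uses wraparound-safe comparisons such as
ULONG_CMP_LT() where this toy uses plain integer comparisons.

#include <stdio.h>

#define NGROUPS 4  /* stand-in for the fixed number of callback sublists */

struct cb_group {
        int ncbs;          /* callbacks in this group */
        unsigned long gp;  /* grace period they wait for; 0 = untagged */
};

/*
 * Tag every non-empty untagged group with the grace period that will
 * satisfy it, and tighten any tag that proved too conservative.
 * Repeating the call with the same next_gp changes nothing, which is
 * the idempotency property the commit message relies on: excessive
 * acceleration merely wastes a few CPU cycles.
 */
static void accelerate(struct cb_group *g, int n, unsigned long next_gp)
{
        for (int i = 0; i < n; i++)
                if (g[i].ncbs && (g[i].gp == 0 || g[i].gp > next_gp))
                        g[i].gp = next_gp;
}

/*
 * Invoke every group whose tagged grace period has completed. Because
 * each tag records an absolute grace-period number, a CPU that slept
 * through several grace periods still catches up in a single pass.
 */
static void advance(struct cb_group *g, int n, unsigned long completed)
{
        for (int i = 0; i < n; i++)
                if (g[i].ncbs && g[i].gp && g[i].gp <= completed) {
                        printf("invoking %d callbacks tagged for GP %lu\n",
                               g[i].ncbs, g[i].gp);
                        g[i].ncbs = 0;
                }
}

int main(void)
{
        struct cb_group g[NGROUPS] = { { 3, 0 }, { 2, 0 } };

        accelerate(g, NGROUPS, 5);  /* tag both groups for GP 5 */
        accelerate(g, NGROUPS, 5);  /* harmless repeat: idempotent */
        advance(g, NGROUPS, 4);     /* GP 4 done: nothing ready yet */
        advance(g, NGROUPS, 7);     /* CPU idle through GPs 5-7: both fire */
        return 0;
}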
Diffstat (limited to 'kernel/rcutree.c')
-rw-r--r--  kernel/rcutree.c  195
1 file changed, 167 insertions, 28 deletions
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index e441b77b614e..ac6a75d152ef 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -305,17 +305,27 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
 }
 
 /*
- * Does the current CPU require a yet-as-unscheduled grace period?
+ * Does the current CPU require a not-yet-started grace period?
+ * The caller must have disabled interrupts to prevent races with
+ * normal callback registry.
  */
 static int
 cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
 {
-        struct rcu_head **ntp;
+        int i;
 
-        ntp = rdp->nxttail[RCU_DONE_TAIL +
-                           (ACCESS_ONCE(rsp->completed) != rdp->completed)];
-        return rdp->nxttail[RCU_DONE_TAIL] && ntp && *ntp &&
-               !rcu_gp_in_progress(rsp);
+        if (rcu_gp_in_progress(rsp))
+                return 0;  /* No, a grace period is already in progress. */
+        if (!rdp->nxttail[RCU_NEXT_TAIL])
+                return 0;  /* No, this is a no-CBs (or offline) CPU. */
+        if (*rdp->nxttail[RCU_NEXT_READY_TAIL])
+                return 1;  /* Yes, this CPU has newly registered callbacks. */
+        for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++)
+                if (rdp->nxttail[i - 1] != rdp->nxttail[i] &&
+                    ULONG_CMP_LT(ACCESS_ONCE(rsp->completed),
+                                 rdp->nxtcompleted[i]))
+                        return 1;  /* Yes, CBs for future grace period. */
+        return 0;  /* No grace period needed. */
 }
 
 /*
@@ -1071,6 +1081,139 @@ static void init_callback_list(struct rcu_data *rdp)
 }
 
 /*
+ * Determine the value that ->completed will have at the end of the
+ * next subsequent grace period. This is used to tag callbacks so that
+ * a CPU can invoke callbacks in a timely fashion even if that CPU has
+ * been dyntick-idle for an extended period with callbacks under the
+ * influence of RCU_FAST_NO_HZ.
+ *
+ * The caller must hold rnp->lock with interrupts disabled.
+ */
+static unsigned long rcu_cbs_completed(struct rcu_state *rsp,
+                                       struct rcu_node *rnp)
+{
+        /*
+         * If RCU is idle, we just wait for the next grace period.
+         * But we can only be sure that RCU is idle if we are looking
+         * at the root rcu_node structure -- otherwise, a new grace
+         * period might have started, but just not yet gotten around
+         * to initializing the current non-root rcu_node structure.
+         */
+        if (rcu_get_root(rsp) == rnp && rnp->gpnum == rnp->completed)
+                return rnp->completed + 1;
+
+        /*
+         * Otherwise, wait for a possible partial grace period and
+         * then the subsequent full grace period.
+         */
+        return rnp->completed + 2;
+}
+
+/*
+ * If there is room, assign a ->completed number to any callbacks on
+ * this CPU that have not already been assigned. Also accelerate any
+ * callbacks that were previously assigned a ->completed number that has
+ * since proven to be too conservative, which can happen if callbacks get
+ * assigned a ->completed number while RCU is idle, but with reference to
+ * a non-root rcu_node structure. This function is idempotent, so it does
+ * not hurt to call it repeatedly.
+ *
+ * The caller must hold rnp->lock with interrupts disabled.
+ */
+static void rcu_accelerate_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
+                               struct rcu_data *rdp)
+{
+        unsigned long c;
+        int i;
+
+        /* If the CPU has no callbacks, nothing to do. */
+        if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
+                return;
+
+        /*
+         * Starting from the sublist containing the callbacks most
+         * recently assigned a ->completed number and working down, find the
+         * first sublist that is not assignable to an upcoming grace period.
+         * Such a sublist has something in it (first two tests) and has
+         * a ->completed number assigned that will complete sooner than
+         * the ->completed number for newly arrived callbacks (last test).
+         *
+         * The key point is that any later sublist can be assigned the
+         * same ->completed number as the newly arrived callbacks, which
+         * means that the callbacks in any of these later sublists can be
+         * grouped into a single sublist, whether or not they have already
+         * been assigned a ->completed number.
+         */
+        c = rcu_cbs_completed(rsp, rnp);
+        for (i = RCU_NEXT_TAIL - 1; i > RCU_DONE_TAIL; i--)
+                if (rdp->nxttail[i] != rdp->nxttail[i - 1] &&
+                    !ULONG_CMP_GE(rdp->nxtcompleted[i], c))
+                        break;
+
+        /*
+         * If there is no sublist for unassigned callbacks, leave.
+         * At the same time, advance "i" one sublist, so that "i" indexes
+         * the sublist into which all the remaining callbacks should be
+         * grouped.
+         */
+        if (++i >= RCU_NEXT_TAIL)
+                return;
+
+        /*
+         * Assign all subsequent callbacks' ->completed number to the next
+         * full grace period and group them all in the sublist initially
+         * indexed by "i".
+         */
+        for (; i <= RCU_NEXT_TAIL; i++) {
+                rdp->nxttail[i] = rdp->nxttail[RCU_NEXT_TAIL];
+                rdp->nxtcompleted[i] = c;
+        }
+}
+
+/*
+ * Move any callbacks whose grace period has completed to the
+ * RCU_DONE_TAIL sublist, then compact the remaining sublists and
+ * assign ->completed numbers to any callbacks in the RCU_NEXT_TAIL
+ * sublist. This function is idempotent, so it does not hurt to
+ * invoke it repeatedly. As long as it is not invoked -too- often...
+ *
+ * The caller must hold rnp->lock with interrupts disabled.
+ */
+static void rcu_advance_cbs(struct rcu_state *rsp, struct rcu_node *rnp,
+                            struct rcu_data *rdp)
+{
+        int i, j;
+
+        /* If the CPU has no callbacks, nothing to do. */
+        if (!rdp->nxttail[RCU_NEXT_TAIL] || !*rdp->nxttail[RCU_DONE_TAIL])
+                return;
+
+        /*
+         * Find all callbacks whose ->completed numbers indicate that they
+         * are ready to invoke, and put them into the RCU_DONE_TAIL sublist.
+         */
+        for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) {
+                if (ULONG_CMP_LT(rnp->completed, rdp->nxtcompleted[i]))
+                        break;
+                rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[i];
+        }
+        /* Clean up any sublist tail pointers that were misordered above. */
+        for (j = RCU_WAIT_TAIL; j < i; j++)
+                rdp->nxttail[j] = rdp->nxttail[RCU_DONE_TAIL];
+
+        /* Copy down callbacks to fill in empty sublists. */
+        for (j = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++, j++) {
+                if (rdp->nxttail[j] == rdp->nxttail[RCU_NEXT_TAIL])
+                        break;
+                rdp->nxttail[j] = rdp->nxttail[i];
+                rdp->nxtcompleted[j] = rdp->nxtcompleted[i];
+        }
+
+        /* Classify any remaining callbacks. */
+        rcu_accelerate_cbs(rsp, rnp, rdp);
+}
+
+/*
  * Advance this CPU's callbacks, but only if the current grace period
  * has ended. This may be called only from the CPU to whom the rdp
  * belongs. In addition, the corresponding leaf rcu_node structure's
@@ -1080,12 +1223,15 @@ static void
 __rcu_process_gp_end(struct rcu_state *rsp, struct rcu_node *rnp, struct rcu_data *rdp)
 {
         /* Did another grace period end? */
-        if (rdp->completed != rnp->completed) {
+        if (rdp->completed == rnp->completed) {
+
+                /* No, so just accelerate recent callbacks. */
+                rcu_accelerate_cbs(rsp, rnp, rdp);
 
-                /* Advance callbacks. No harm if list empty. */
-                rdp->nxttail[RCU_DONE_TAIL] = rdp->nxttail[RCU_WAIT_TAIL];
-                rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_READY_TAIL];
-                rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+        } else {
+
+                /* Advance callbacks. */
+                rcu_advance_cbs(rsp, rnp, rdp);
 
                 /* Remember that we saw this grace-period completion. */
                 rdp->completed = rnp->completed;
@@ -1392,17 +1538,10 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
         /*
          * Because there is no grace period in progress right now,
          * any callbacks we have up to this point will be satisfied
-         * by the next grace period. So promote all callbacks to be
-         * handled after the end of the next grace period. If the
-         * CPU is not yet aware of the end of the previous grace period,
-         * we need to allow for the callback advancement that will
-         * occur when it does become aware. Deadlock prevents us from
-         * making it aware at this point: We cannot acquire a leaf
-         * rcu_node ->lock while holding the root rcu_node ->lock.
+         * by the next grace period. So this is a good place to
+         * assign a grace period number to recently posted callbacks.
          */
-        rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
-        if (rdp->completed == rsp->completed)
-                rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+        rcu_accelerate_cbs(rsp, rnp, rdp);
 
         rsp->gp_flags = RCU_GP_FLAG_INIT;
         raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
@@ -1527,7 +1666,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
                  * This GP can't end until cpu checks in, so all of our
                  * callbacks can be processed during the next GP.
                  */
-                rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
+                rcu_accelerate_cbs(rsp, rnp, rdp);
 
                 rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */
         }
@@ -1779,7 +1918,7 @@ static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
         long bl, count, count_lazy;
         int i;
 
-        /* If no callbacks are ready, just return.*/
+        /* If no callbacks are ready, just return. */
         if (!cpu_has_callbacks_ready_to_invoke(rdp)) {
                 trace_rcu_batch_start(rsp->name, rdp->qlen_lazy, rdp->qlen, 0);
                 trace_rcu_batch_end(rsp->name, 0, !!ACCESS_ONCE(rdp->nxtlist),
@@ -2008,19 +2147,19 @@ __rcu_process_callbacks(struct rcu_state *rsp)
 
         WARN_ON_ONCE(rdp->beenonline == 0);
 
-        /*
-         * Advance callbacks in response to end of earlier grace
-         * period that some other CPU ended.
-         */
+        /* Handle the end of a grace period that some other CPU ended. */
         rcu_process_gp_end(rsp, rdp);
 
         /* Update RCU state based on any recent quiescent states. */
         rcu_check_quiescent_state(rsp, rdp);
 
         /* Does this CPU require a not-yet-started grace period? */
+        local_irq_save(flags);
         if (cpu_needs_another_gp(rsp, rdp)) {
-                raw_spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags);
+                raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */
                 rcu_start_gp(rsp, flags);  /* releases above lock */
+        } else {
+                local_irq_restore(flags);
         }
 
         /* If there are callbacks ready, invoke them. */