aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Paul E. McKenney <paulmck@linux.vnet.ibm.com> 2013-09-04 13:51:13 -0400
committer Paul E. McKenney <paulmck@linux.vnet.ibm.com> 2013-09-23 12:15:30 -0400
commit 26cdfedf6a902345f8604ea8e0b7dd2566b37a46 (patch)
tree 27ae87c5a6de2e1077c760d054558b9aa0834842
parent 69c8d28c96445e28f081fcd987e34ea2afa65039 (diff)
rcu: Reject memory-order-induced stall-warning false positives
If a system is idle from an RCU perspective for longer than specified by CONFIG_RCU_CPU_STALL_TIMEOUT, and if one CPU starts a grace period just as a second checks for CPU stalls, and if this second CPU happens to see the old value of rsp->jiffies_stall, it will incorrectly report a CPU stall. This is quite rare, but apparently occurs deterministically on systems with about 6TB of memory.

This commit therefore orders accesses to the data used to determine whether or not a CPU stall is in progress. Grace-period initialization and cleanup first increment rsp->completed to mark the end of the previous grace period, then record the current jiffies in rsp->gp_start, then record the jiffies at which a stall can be expected to occur in rsp->jiffies_stall, and finally increment rsp->gpnum to mark the start of the new grace period. Now, this ordering by itself does not prevent false positives. For example, if grace-period initialization was delayed between recording rsp->gp_start and rsp->jiffies_stall, the CPU stall warning code might still see an old value of rsp->jiffies_stall.

Therefore, this commit also orders the CPU stall warning accesses, loading rsp->gpnum and jiffies, then rsp->jiffies_stall, then rsp->gp_start, and finally rsp->completed. This ordering means that the false-positive scenario in the previous paragraph would result in rsp->completed being greater than or equal to rsp->gpnum, which is never valid for a CPU stall, allowing the false positive to be rejected. Furthermore, any fetch that gets an old value of rsp->jiffies_stall must also get an old value of rsp->gpnum, which will again be rejected by the comparison of rsp->gpnum and rsp->completed. Situations where rsp->gp_start is later than rsp->jiffies_stall are also rejected, as are situations where jiffies is less than rsp->jiffies_stall.
Although use of unsynchronized accesses means that there are likely still some false-positive scenarios (synchronization has proven to be a very bad idea on large systems), this should get rid of a large class of these scenarios.

Reported-by: Fabian Herschel <fabian.herschel@suse.com>
Reported-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Michal Hocko <mhocko@suse.cz>
Tested-by: Jochen Striepe <jochen@tolot.escape.de>
-rw-r--r-- kernel/rcutree.c 45
1 files changed, 40 insertions, 5 deletions
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 49464aded7f7..b618d72bd8ec 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -804,8 +804,11 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp,
804 804
805static void record_gp_stall_check_time(struct rcu_state *rsp) 805static void record_gp_stall_check_time(struct rcu_state *rsp)
806{ 806{
807 rsp->gp_start = jiffies; 807 unsigned long j = ACCESS_ONCE(jiffies);
808 rsp->jiffies_stall = jiffies + rcu_jiffies_till_stall_check(); 808
809 rsp->gp_start = j;
810 smp_wmb(); /* Record start time before stall time. */
811 rsp->jiffies_stall = j + rcu_jiffies_till_stall_check();
809} 812}
810 813
811/* 814/*
@@ -934,17 +937,48 @@ static void print_cpu_stall(struct rcu_state *rsp)
934 937
935static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) 938static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
936{ 939{
940 unsigned long completed;
941 unsigned long gpnum;
942 unsigned long gps;
937 unsigned long j; 943 unsigned long j;
938 unsigned long js; 944 unsigned long js;
939 struct rcu_node *rnp; 945 struct rcu_node *rnp;
940 946
941 if (rcu_cpu_stall_suppress) 947 if (rcu_cpu_stall_suppress || !rcu_gp_in_progress(rsp))
942 return; 948 return;
943 j = ACCESS_ONCE(jiffies); 949 j = ACCESS_ONCE(jiffies);
950
951 /*
952 * Lots of memory barriers to reject false positives.
953 *
954 * The idea is to pick up rsp->gpnum, then rsp->jiffies_stall,
955 * then rsp->gp_start, and finally rsp->completed. These values
956 * are updated in the opposite order with memory barriers (or
957 * equivalent) during grace-period initialization and cleanup.
958 * Now, a false positive can occur if we get a new value of
959 * rsp->gp_start and an old value of rsp->jiffies_stall. But given
960 * the memory barriers, the only way that this can happen is if one
961 * grace period ends and another starts between these two fetches.
962 * Detect this by comparing rsp->completed with the previous fetch
963 * from rsp->gpnum.
964 *
965 * Given this check, comparisons of jiffies, rsp->jiffies_stall,
966 * and rsp->gp_start suffice to forestall false positives.
967 */
968 gpnum = ACCESS_ONCE(rsp->gpnum);
969 smp_rmb(); /* Pick up ->gpnum first... */
944 js = ACCESS_ONCE(rsp->jiffies_stall); 970 js = ACCESS_ONCE(rsp->jiffies_stall);
971 smp_rmb(); /* ...then ->jiffies_stall before the rest... */
972 gps = ACCESS_ONCE(rsp->gp_start);
973 smp_rmb(); /* ...and finally ->gp_start before ->completed. */
974 completed = ACCESS_ONCE(rsp->completed);
975 if (ULONG_CMP_GE(completed, gpnum) ||
976 ULONG_CMP_LT(j, js) ||
977 ULONG_CMP_GE(gps, js))
978 return; /* No stall or GP completed since entering function. */
945 rnp = rdp->mynode; 979 rnp = rdp->mynode;
946 if (rcu_gp_in_progress(rsp) && 980 if (rcu_gp_in_progress(rsp) &&
947 (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask) && ULONG_CMP_GE(j, js)) { 981 (ACCESS_ONCE(rnp->qsmask) & rdp->grpmask)) {
948 982
949 /* We haven't checked in, so go dump stack. */ 983 /* We haven't checked in, so go dump stack. */
950 print_cpu_stall(rsp); 984 print_cpu_stall(rsp);
@@ -1317,9 +1351,10 @@ static int rcu_gp_init(struct rcu_state *rsp)
1317 } 1351 }
1318 1352
1319 /* Advance to a new grace period and initialize state. */ 1353 /* Advance to a new grace period and initialize state. */
1354 record_gp_stall_check_time(rsp);
1355 smp_wmb(); /* Record GP times before starting GP. */
1320 rsp->gpnum++; 1356 rsp->gpnum++;
1321 trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); 1357 trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start"));
1322 record_gp_stall_check_time(rsp);
1323 raw_spin_unlock_irq(&rnp->lock); 1358 raw_spin_unlock_irq(&rnp->lock);
1324 1359
1325 /* Exclude any concurrent CPU-hotplug operations. */ 1360 /* Exclude any concurrent CPU-hotplug operations. */