aboutsummaryrefslogtreecommitdiffstats
path: root/kernel/rcu/tree.c
diff options
context:
space:
mode:
author: Paul E. McKenney <paulmck@linux.vnet.ibm.com> 2018-04-09 14:04:46 -0400
committer: Paul E. McKenney <paulmck@linux.vnet.ibm.com> 2018-06-26 15:25:56 -0400
commit: 8c42b1f39fdf9fde7cfc4024397255f31a860db6 (patch)
tree: cafa367b0ac5f20e061959ce35dc52acf7f53be8 /kernel/rcu/tree.c
parent: ce11fae8d43fe9a36823fbbfe7c44de775b7e346 (diff)
rcu: Exclude near-simultaneous RCU CPU stall warnings
There is a two-jiffy delay between the time that a CPU will self-report an RCU CPU stall warning and the time that some other CPU will report a warning on behalf of the first CPU. This has worked well in the past, but on busy systems, it is possible for the two warnings to overlap, which makes interpreting them extremely difficult. This commit therefore uses a cmpxchg-based timing decision that allows only one report in a given one-minute period (assuming default stall-warning Kconfig parameters). This approach will of course fail if you are seeing minute-long vCPU preemption, but in that case the overlapping RCU CPU stall warnings are the least of your worries.

Reported-by: Dmitry Vyukov <dvyukov@google.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Diffstat (limited to 'kernel/rcu/tree.c')
-rw-r--r-- kernel/rcu/tree.c 26
1 files changed, 11 insertions, 15 deletions
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 79c7fe978b17..b1fffa21b9e4 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1368,7 +1368,6 @@ static inline void panic_on_rcu_stall(void)
1368static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) 1368static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
1369{ 1369{
1370 int cpu; 1370 int cpu;
1371 long delta;
1372 unsigned long flags; 1371 unsigned long flags;
1373 unsigned long gpa; 1372 unsigned long gpa;
1374 unsigned long j; 1373 unsigned long j;
@@ -1381,18 +1380,6 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
1381 if (rcu_cpu_stall_suppress) 1380 if (rcu_cpu_stall_suppress)
1382 return; 1381 return;
1383 1382
1384 /* Only let one CPU complain about others per time interval. */
1385
1386 raw_spin_lock_irqsave_rcu_node(rnp, flags);
1387 delta = jiffies - READ_ONCE(rsp->jiffies_stall);
1388 if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
1389 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1390 return;
1391 }
1392 WRITE_ONCE(rsp->jiffies_stall,
1393 jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
1394 raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
1395
1396 /* 1383 /*
1397 * OK, time to rat on our buddy... 1384 * OK, time to rat on our buddy...
1398 * See Documentation/RCU/stallwarn.txt for info on how to debug 1385 * See Documentation/RCU/stallwarn.txt for info on how to debug
@@ -1441,6 +1428,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
1441 sched_show_task(current); 1428 sched_show_task(current);
1442 } 1429 }
1443 } 1430 }
1431 /* Rewrite if needed in case of slow consoles. */
1432 if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall)))
1433 WRITE_ONCE(rsp->jiffies_stall,
1434 jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
1444 1435
1445 rcu_check_gp_kthread_starvation(rsp); 1436 rcu_check_gp_kthread_starvation(rsp);
1446 1437
@@ -1485,6 +1476,7 @@ static void print_cpu_stall(struct rcu_state *rsp)
1485 rcu_dump_cpu_stacks(rsp); 1476 rcu_dump_cpu_stacks(rsp);
1486 1477
1487 raw_spin_lock_irqsave_rcu_node(rnp, flags); 1478 raw_spin_lock_irqsave_rcu_node(rnp, flags);
1479 /* Rewrite if needed in case of slow consoles. */
1488 if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall))) 1480 if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall)))
1489 WRITE_ONCE(rsp->jiffies_stall, 1481 WRITE_ONCE(rsp->jiffies_stall,
1490 jiffies + 3 * rcu_jiffies_till_stall_check() + 3); 1482 jiffies + 3 * rcu_jiffies_till_stall_check() + 3);
@@ -1508,6 +1500,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
1508 unsigned long gpnum; 1500 unsigned long gpnum;
1509 unsigned long gps; 1501 unsigned long gps;
1510 unsigned long j; 1502 unsigned long j;
1503 unsigned long jn;
1511 unsigned long js; 1504 unsigned long js;
1512 struct rcu_node *rnp; 1505 struct rcu_node *rnp;
1513 1506
@@ -1546,14 +1539,17 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
1546 ULONG_CMP_GE(gps, js)) 1539 ULONG_CMP_GE(gps, js))
1547 return; /* No stall or GP completed since entering function. */ 1540 return; /* No stall or GP completed since entering function. */
1548 rnp = rdp->mynode; 1541 rnp = rdp->mynode;
1542 jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
1549 if (rcu_gp_in_progress(rsp) && 1543 if (rcu_gp_in_progress(rsp) &&
1550 (READ_ONCE(rnp->qsmask) & rdp->grpmask)) { 1544 (READ_ONCE(rnp->qsmask) & rdp->grpmask) &&
1545 cmpxchg(&rsp->jiffies_stall, js, jn) == js) {
1551 1546
1552 /* We haven't checked in, so go dump stack. */ 1547 /* We haven't checked in, so go dump stack. */
1553 print_cpu_stall(rsp); 1548 print_cpu_stall(rsp);
1554 1549
1555 } else if (rcu_gp_in_progress(rsp) && 1550 } else if (rcu_gp_in_progress(rsp) &&
1556 ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) { 1551 ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) &&
1552 cmpxchg(&rsp->jiffies_stall, js, jn) == js) {
1557 1553
1558 /* They had a few time units to dump stack, so complain. */ 1554 /* They had a few time units to dump stack, so complain. */
1559 print_other_cpu_stall(rsp, gpnum); 1555 print_other_cpu_stall(rsp, gpnum);