diff options
author | Paul E. McKenney <paulmck@linux.vnet.ibm.com> | 2014-12-11 13:20:59 -0500 |
---|---|---|
committer | Paul E. McKenney <paulmck@linux.vnet.ibm.com> | 2015-01-06 14:05:27 -0500 |
commit | 6ccd2ecd422644277b7d8b37222e3af3f43ea9ae (patch) | |
tree | 7aabe254edd3dc05e992d631123a3275ef4dd1f6 /kernel/rcu/tree.c | |
parent | fc908ed33e7c1428f799abb12399f906da03b397 (diff) |
rcu: Improve diagnostics for spurious RCU CPU stall warnings
The current RCU CPU stall warning code will print "Stall ended before
state dump start" any time that the stall-warning code is triggered on
a CPU that has already reported a quiescent state for the current grace
period and if all quiescent states have been reported for the current
grace period. However, a true stall can result in these symptoms, for
example, by preventing RCU's grace-period kthreads from ever running
This commit therefore checks for this condition, reporting the end of
the stall only if one of the grace-period counters has actually advanced.
Otherwise, it reports the last time that the grace-period kthread made
meaningful progress. (In normal situations, the grace-period kthread
should make meaningful progress at least every jiffies_till_next_fqs
jiffies.)
Reported-by: Miroslav Benes <mbenes@suse.cz>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Tested-by: Miroslav Benes <mbenes@suse.cz>
Diffstat (limited to 'kernel/rcu/tree.c')
-rw-r--r-- | kernel/rcu/tree.c | 32 |
1 files changed, 27 insertions, 5 deletions
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 654b15be1e36..a2ceb66bcd67 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c | |||
@@ -1066,11 +1066,13 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) | |||
1066 | } | 1066 | } |
1067 | } | 1067 | } |
1068 | 1068 | ||
1069 | static void print_other_cpu_stall(struct rcu_state *rsp) | 1069 | static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) |
1070 | { | 1070 | { |
1071 | int cpu; | 1071 | int cpu; |
1072 | long delta; | 1072 | long delta; |
1073 | unsigned long flags; | 1073 | unsigned long flags; |
1074 | unsigned long gpa; | ||
1075 | unsigned long j; | ||
1074 | int ndetected = 0; | 1076 | int ndetected = 0; |
1075 | struct rcu_node *rnp = rcu_get_root(rsp); | 1077 | struct rcu_node *rnp = rcu_get_root(rsp); |
1076 | long totqlen = 0; | 1078 | long totqlen = 0; |
@@ -1123,10 +1125,22 @@ static void print_other_cpu_stall(struct rcu_state *rsp) | |||
1123 | pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n", | 1125 | pr_cont("(detected by %d, t=%ld jiffies, g=%ld, c=%ld, q=%lu)\n", |
1124 | smp_processor_id(), (long)(jiffies - rsp->gp_start), | 1126 | smp_processor_id(), (long)(jiffies - rsp->gp_start), |
1125 | (long)rsp->gpnum, (long)rsp->completed, totqlen); | 1127 | (long)rsp->gpnum, (long)rsp->completed, totqlen); |
1126 | if (ndetected == 0) | 1128 | if (ndetected) { |
1127 | pr_err("INFO: Stall ended before state dump start\n"); | ||
1128 | else | ||
1129 | rcu_dump_cpu_stacks(rsp); | 1129 | rcu_dump_cpu_stacks(rsp); |
1130 | } else { | ||
1131 | if (ACCESS_ONCE(rsp->gpnum) != gpnum || | ||
1132 | ACCESS_ONCE(rsp->completed) == gpnum) { | ||
1133 | pr_err("INFO: Stall ended before state dump start\n"); | ||
1134 | } else { | ||
1135 | j = jiffies; | ||
1136 | gpa = ACCESS_ONCE(rsp->gp_activity); | ||
1137 | pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld\n", | ||
1138 | rsp->name, j - gpa, j, gpa, | ||
1139 | jiffies_till_next_fqs); | ||
1140 | /* In this case, the current CPU might be at fault. */ | ||
1141 | sched_show_task(current); | ||
1142 | } | ||
1143 | } | ||
1130 | 1144 | ||
1131 | /* Complain about tasks blocking the grace period. */ | 1145 | /* Complain about tasks blocking the grace period. */ |
1132 | 1146 | ||
@@ -1226,7 +1240,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp) | |||
1226 | ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) { | 1240 | ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY)) { |
1227 | 1241 | ||
1228 | /* They had a few time units to dump stack, so complain. */ | 1242 | /* They had a few time units to dump stack, so complain. */ |
1229 | print_other_cpu_stall(rsp); | 1243 | print_other_cpu_stall(rsp, gpnum); |
1230 | } | 1244 | } |
1231 | } | 1245 | } |
1232 | 1246 | ||
@@ -1622,6 +1636,7 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
1622 | struct rcu_data *rdp; | 1636 | struct rcu_data *rdp; |
1623 | struct rcu_node *rnp = rcu_get_root(rsp); | 1637 | struct rcu_node *rnp = rcu_get_root(rsp); |
1624 | 1638 | ||
1639 | ACCESS_ONCE(rsp->gp_activity) = jiffies; | ||
1625 | rcu_bind_gp_kthread(); | 1640 | rcu_bind_gp_kthread(); |
1626 | raw_spin_lock_irq(&rnp->lock); | 1641 | raw_spin_lock_irq(&rnp->lock); |
1627 | smp_mb__after_unlock_lock(); | 1642 | smp_mb__after_unlock_lock(); |
@@ -1682,6 +1697,7 @@ static int rcu_gp_init(struct rcu_state *rsp) | |||
1682 | rnp->grphi, rnp->qsmask); | 1697 | rnp->grphi, rnp->qsmask); |
1683 | raw_spin_unlock_irq(&rnp->lock); | 1698 | raw_spin_unlock_irq(&rnp->lock); |
1684 | cond_resched_rcu_qs(); | 1699 | cond_resched_rcu_qs(); |
1700 | ACCESS_ONCE(rsp->gp_activity) = jiffies; | ||
1685 | } | 1701 | } |
1686 | 1702 | ||
1687 | mutex_unlock(&rsp->onoff_mutex); | 1703 | mutex_unlock(&rsp->onoff_mutex); |
@@ -1698,6 +1714,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in) | |||
1698 | unsigned long maxj; | 1714 | unsigned long maxj; |
1699 | struct rcu_node *rnp = rcu_get_root(rsp); | 1715 | struct rcu_node *rnp = rcu_get_root(rsp); |
1700 | 1716 | ||
1717 | ACCESS_ONCE(rsp->gp_activity) = jiffies; | ||
1701 | rsp->n_force_qs++; | 1718 | rsp->n_force_qs++; |
1702 | if (fqs_state == RCU_SAVE_DYNTICK) { | 1719 | if (fqs_state == RCU_SAVE_DYNTICK) { |
1703 | /* Collect dyntick-idle snapshots. */ | 1720 | /* Collect dyntick-idle snapshots. */ |
@@ -1736,6 +1753,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) | |||
1736 | struct rcu_data *rdp; | 1753 | struct rcu_data *rdp; |
1737 | struct rcu_node *rnp = rcu_get_root(rsp); | 1754 | struct rcu_node *rnp = rcu_get_root(rsp); |
1738 | 1755 | ||
1756 | ACCESS_ONCE(rsp->gp_activity) = jiffies; | ||
1739 | raw_spin_lock_irq(&rnp->lock); | 1757 | raw_spin_lock_irq(&rnp->lock); |
1740 | smp_mb__after_unlock_lock(); | 1758 | smp_mb__after_unlock_lock(); |
1741 | gp_duration = jiffies - rsp->gp_start; | 1759 | gp_duration = jiffies - rsp->gp_start; |
@@ -1772,6 +1790,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) | |||
1772 | nocb += rcu_future_gp_cleanup(rsp, rnp); | 1790 | nocb += rcu_future_gp_cleanup(rsp, rnp); |
1773 | raw_spin_unlock_irq(&rnp->lock); | 1791 | raw_spin_unlock_irq(&rnp->lock); |
1774 | cond_resched_rcu_qs(); | 1792 | cond_resched_rcu_qs(); |
1793 | ACCESS_ONCE(rsp->gp_activity) = jiffies; | ||
1775 | } | 1794 | } |
1776 | rnp = rcu_get_root(rsp); | 1795 | rnp = rcu_get_root(rsp); |
1777 | raw_spin_lock_irq(&rnp->lock); | 1796 | raw_spin_lock_irq(&rnp->lock); |
@@ -1821,6 +1840,7 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
1821 | if (rcu_gp_init(rsp)) | 1840 | if (rcu_gp_init(rsp)) |
1822 | break; | 1841 | break; |
1823 | cond_resched_rcu_qs(); | 1842 | cond_resched_rcu_qs(); |
1843 | ACCESS_ONCE(rsp->gp_activity) = jiffies; | ||
1824 | WARN_ON(signal_pending(current)); | 1844 | WARN_ON(signal_pending(current)); |
1825 | trace_rcu_grace_period(rsp->name, | 1845 | trace_rcu_grace_period(rsp->name, |
1826 | ACCESS_ONCE(rsp->gpnum), | 1846 | ACCESS_ONCE(rsp->gpnum), |
@@ -1864,9 +1884,11 @@ static int __noreturn rcu_gp_kthread(void *arg) | |||
1864 | ACCESS_ONCE(rsp->gpnum), | 1884 | ACCESS_ONCE(rsp->gpnum), |
1865 | TPS("fqsend")); | 1885 | TPS("fqsend")); |
1866 | cond_resched_rcu_qs(); | 1886 | cond_resched_rcu_qs(); |
1887 | ACCESS_ONCE(rsp->gp_activity) = jiffies; | ||
1867 | } else { | 1888 | } else { |
1868 | /* Deal with stray signal. */ | 1889 | /* Deal with stray signal. */ |
1869 | cond_resched_rcu_qs(); | 1890 | cond_resched_rcu_qs(); |
1891 | ACCESS_ONCE(rsp->gp_activity) = jiffies; | ||
1870 | WARN_ON(signal_pending(current)); | 1892 | WARN_ON(signal_pending(current)); |
1871 | trace_rcu_grace_period(rsp->name, | 1893 | trace_rcu_grace_period(rsp->name, |
1872 | ACCESS_ONCE(rsp->gpnum), | 1894 | ACCESS_ONCE(rsp->gpnum), |