aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Sunil Mushran <sunil.mushran@oracle.com> 2011-05-04 13:28:00 -0400
committer Joel Becker <jlbec@evilplan.org> 2011-05-13 14:26:48 -0400
commit 76d9fc2954d057b19bf5d7b854df2b621b00fdec (patch)
tree ca6725a08b9ffd79030d0bb22ec4c162f341f40e
parent 4da6dc293604f55d156148b8f60b94053e3195fc (diff)
ocfs2/cluster: Increase the live threshold for global heartbeat
We have seen isolated cases (very few, I might add) of o2hb not detecting all live nodes on startup. One plausible reason is that the other node had a heartbeat I/O delay at the same time. The live threshold, set at 2 (as low as it can be), could be increased to ameliorate the situation, but increasing the threshold directly affects mount time. Currently it takes around 5 seconds to mount a volume in an o2cb cluster with local heartbeat; increasing the threshold would make mounts even slower. As the issue itself is rare, we have left things as they are for the local heartbeat mode.

However, we can improve the situation for global heartbeat mode because, in that mode, we start the heartbeat well before the mount. This patch doubles the live threshold for the start of the first region in global heartbeat mode.

Addresses internal Oracle bug#10635585.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Acked-by: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Joel Becker <jlbec@evilplan.org>
-rw-r--r-- fs/ocfs2/cluster/heartbeat.c 13
1 files changed, 12 insertions, 1 deletions
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 643720209a98..1d28505caff8 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -1690,6 +1690,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1690 struct file *filp = NULL; 1690 struct file *filp = NULL;
1691 struct inode *inode = NULL; 1691 struct inode *inode = NULL;
1692 ssize_t ret = -EINVAL; 1692 ssize_t ret = -EINVAL;
1693 int live_threshold;
1693 1694
1694 if (reg->hr_bdev) 1695 if (reg->hr_bdev)
1695 goto out; 1696 goto out;
@@ -1766,8 +1767,18 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1766 * A node is considered live after it has beat LIVE_THRESHOLD 1767 * A node is considered live after it has beat LIVE_THRESHOLD
1767 * times. We're not steady until we've given them a chance 1768 * times. We're not steady until we've given them a chance
1768 * _after_ our first read. 1769 * _after_ our first read.
1770 * The default threshold is bare minimum so as to limit the delay
1771 * during mounts. For global heartbeat, the threshold is doubled for the
1772 * first region.
1769 */ 1773 */
1770 atomic_set(&reg->hr_steady_iterations, O2HB_LIVE_THRESHOLD + 1); 1774 live_threshold = O2HB_LIVE_THRESHOLD;
1775 if (o2hb_global_heartbeat_active()) {
1776 spin_lock(&o2hb_live_lock);
1777 if (o2hb_pop_count(&o2hb_region_bitmap, O2NM_MAX_REGIONS) == 1)
1778 live_threshold <<= 1;
1779 spin_unlock(&o2hb_live_lock);
1780 }
1781 atomic_set(&reg->hr_steady_iterations, live_threshold + 1);
1771 1782
1772 hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s", 1783 hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s",
1773 reg->hr_item.ci_name); 1784 reg->hr_item.ci_name);