diff options
author | Mark Fasheh <mark.fasheh@oracle.com> | 2006-05-09 18:09:35 -0400 |
---|---|---|
committer | Mark Fasheh <mark.fasheh@oracle.com> | 2006-06-29 18:45:35 -0400 |
commit | 0db638f44e7db9732d9c5704ca837f57ce061f42 (patch) | |
tree | 0ac8c092bfa27a9d88431b3c404af19fa0b1bdd1 /fs/ocfs2/cluster | |
parent | 4ba63adce06bc7549e1dd36344123dbaccdaa52f (diff) |
ocfs2: warn the user on a dead timeout mismatch
Print a warning to the user when a node whose dead timeout differs from
ours joins the region.
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Diffstat (limited to 'fs/ocfs2/cluster')
-rw-r--r-- | fs/ocfs2/cluster/heartbeat.c | 20 | ||||
-rw-r--r-- | fs/ocfs2/cluster/ocfs2_heartbeat.h | 1 |
2 files changed, 21 insertions, 0 deletions
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 1d26cfcd9f84..504595d6cf65 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c | |||
@@ -517,6 +517,7 @@ static inline void o2hb_prepare_block(struct o2hb_region *reg, | |||
517 | hb_block->hb_seq = cpu_to_le64(cputime); | 517 | hb_block->hb_seq = cpu_to_le64(cputime); |
518 | hb_block->hb_node = node_num; | 518 | hb_block->hb_node = node_num; |
519 | hb_block->hb_generation = cpu_to_le64(generation); | 519 | hb_block->hb_generation = cpu_to_le64(generation); |
520 | hb_block->hb_dead_ms = cpu_to_le32(o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS); | ||
520 | 521 | ||
521 | /* This step must always happen last! */ | 522 | /* This step must always happen last! */ |
522 | hb_block->hb_cksum = cpu_to_le32(o2hb_compute_block_crc_le(reg, | 523 | hb_block->hb_cksum = cpu_to_le32(o2hb_compute_block_crc_le(reg, |
@@ -645,6 +646,8 @@ static int o2hb_check_slot(struct o2hb_region *reg, | |||
645 | struct o2nm_node *node; | 646 | struct o2nm_node *node; |
646 | struct o2hb_disk_heartbeat_block *hb_block = reg->hr_tmp_block; | 647 | struct o2hb_disk_heartbeat_block *hb_block = reg->hr_tmp_block; |
647 | u64 cputime; | 648 | u64 cputime; |
649 | unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS; | ||
650 | unsigned int slot_dead_ms; | ||
648 | 651 | ||
649 | memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes); | 652 | memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes); |
650 | 653 | ||
@@ -733,6 +736,23 @@ fire_callbacks: | |||
733 | &o2hb_live_slots[slot->ds_node_num]); | 736 | &o2hb_live_slots[slot->ds_node_num]); |
734 | 737 | ||
735 | slot->ds_equal_samples = 0; | 738 | slot->ds_equal_samples = 0; |
739 | |||
740 | /* We want to be sure that all nodes agree on the | ||
741 | * number of milliseconds before a node will be | ||
742 | * considered dead. The self-fencing timeout is | ||
743 | * computed from this value, and a discrepancy might | ||
744 | * result in heartbeat calling a node dead when it | ||
745 | * hasn't self-fenced yet. */ | ||
746 | slot_dead_ms = le32_to_cpu(hb_block->hb_dead_ms); | ||
747 | if (slot_dead_ms && slot_dead_ms != dead_ms) { | ||
748 | /* TODO: Perhaps we can fail the region here. */ | ||
749 | mlog(ML_ERROR, "Node %d on device %s has a dead count " | ||
750 | "of %u ms, but our count is %u ms.\n" | ||
751 | "Please double check your configuration values " | ||
752 | "for 'O2CB_HEARTBEAT_THRESHOLD'\n", | ||
753 | slot->ds_node_num, reg->hr_dev_name, slot_dead_ms, | ||
754 | dead_ms); | ||
755 | } | ||
736 | goto out; | 756 | goto out; |
737 | } | 757 | } |
738 | 758 | ||
diff --git a/fs/ocfs2/cluster/ocfs2_heartbeat.h b/fs/ocfs2/cluster/ocfs2_heartbeat.h index 94096069cb43..3f4151da9709 100644 --- a/fs/ocfs2/cluster/ocfs2_heartbeat.h +++ b/fs/ocfs2/cluster/ocfs2_heartbeat.h | |||
@@ -32,6 +32,7 @@ struct o2hb_disk_heartbeat_block { | |||
32 | __u8 hb_pad1[3]; | 32 | __u8 hb_pad1[3]; |
33 | __le32 hb_cksum; | 33 | __le32 hb_cksum; |
34 | __le64 hb_generation; | 34 | __le64 hb_generation; |
35 | __le32 hb_dead_ms; | ||
35 | }; | 36 | }; |
36 | 37 | ||
37 | #endif /* _OCFS2_HEARTBEAT_H */ | 38 | #endif /* _OCFS2_HEARTBEAT_H */ |