diff options
author | Sunil Mushran <sunil.mushran@oracle.com> | 2010-10-07 20:00:16 -0400 |
---|---|---|
committer | Sunil Mushran <sunil.mushran@oracle.com> | 2010-10-07 20:00:16 -0400 |
commit | 0e105d37c2adb19cb777aa6701a866f211764a30 (patch) | |
tree | 288022158da0392b240484784db4aaca0a44915d /fs/ocfs2 | |
parent | 39a298563e0619b1b6e2e0974e58801de780621c (diff) |
ocfs2/cluster: Check slots for unconfigured live nodes
o2hb currently checks slots for configured nodes only. This patch makes
it also check the slots of live nodes, in order to handle a race in which
a node is removed from the configuration but not from the live map.
Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Diffstat (limited to 'fs/ocfs2')
-rw-r--r-- | fs/ocfs2/cluster/heartbeat.c | 38 | ||||
-rw-r--r-- | fs/ocfs2/cluster/tcp.c | 5 |
2 files changed, 36 insertions, 7 deletions
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 12bb12ba864..a8f10649674 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c | |||
@@ -541,6 +541,8 @@ static void o2hb_queue_node_event(struct o2hb_node_event *event, | |||
541 | { | 541 | { |
542 | assert_spin_locked(&o2hb_live_lock); | 542 | assert_spin_locked(&o2hb_live_lock); |
543 | 543 | ||
544 | BUG_ON((!node) && (type != O2HB_NODE_DOWN_CB)); | ||
545 | |||
544 | event->hn_event_type = type; | 546 | event->hn_event_type = type; |
545 | event->hn_node = node; | 547 | event->hn_node = node; |
546 | event->hn_node_num = node_num; | 548 | event->hn_node_num = node_num; |
@@ -593,14 +595,22 @@ static int o2hb_check_slot(struct o2hb_region *reg, | |||
593 | u64 cputime; | 595 | u64 cputime; |
594 | unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS; | 596 | unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS; |
595 | unsigned int slot_dead_ms; | 597 | unsigned int slot_dead_ms; |
598 | int tmp; | ||
596 | 599 | ||
597 | memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes); | 600 | memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes); |
598 | 601 | ||
599 | /* Is this correct? Do we assume that the node doesn't exist | 602 | /* |
600 | * if we're not configured for him? */ | 603 | * If a node is no longer configured but is still in the livemap, we |
604 | * may need to clear that bit from the livemap. | ||
605 | */ | ||
601 | node = o2nm_get_node_by_num(slot->ds_node_num); | 606 | node = o2nm_get_node_by_num(slot->ds_node_num); |
602 | if (!node) | 607 | if (!node) { |
603 | return 0; | 608 | spin_lock(&o2hb_live_lock); |
609 | tmp = test_bit(slot->ds_node_num, o2hb_live_node_bitmap); | ||
610 | spin_unlock(&o2hb_live_lock); | ||
611 | if (!tmp) | ||
612 | return 0; | ||
613 | } | ||
604 | 614 | ||
605 | if (!o2hb_verify_crc(reg, hb_block)) { | 615 | if (!o2hb_verify_crc(reg, hb_block)) { |
606 | /* all paths from here will drop o2hb_live_lock for | 616 | /* all paths from here will drop o2hb_live_lock for |
@@ -717,8 +727,9 @@ fire_callbacks: | |||
717 | if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { | 727 | if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { |
718 | clear_bit(slot->ds_node_num, o2hb_live_node_bitmap); | 728 | clear_bit(slot->ds_node_num, o2hb_live_node_bitmap); |
719 | 729 | ||
720 | o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node, | 730 | /* node can be null */ |
721 | slot->ds_node_num); | 731 | o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, |
732 | node, slot->ds_node_num); | ||
722 | 733 | ||
723 | changed = 1; | 734 | changed = 1; |
724 | } | 735 | } |
@@ -738,7 +749,8 @@ out: | |||
738 | 749 | ||
739 | o2hb_run_event_list(&event); | 750 | o2hb_run_event_list(&event); |
740 | 751 | ||
741 | o2nm_node_put(node); | 752 | if (node) |
753 | o2nm_node_put(node); | ||
742 | return changed; | 754 | return changed; |
743 | } | 755 | } |
744 | 756 | ||
@@ -765,6 +777,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) | |||
765 | { | 777 | { |
766 | int i, ret, highest_node, change = 0; | 778 | int i, ret, highest_node, change = 0; |
767 | unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; | 779 | unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; |
780 | unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; | ||
768 | struct o2hb_bio_wait_ctxt write_wc; | 781 | struct o2hb_bio_wait_ctxt write_wc; |
769 | 782 | ||
770 | ret = o2nm_configured_node_map(configured_nodes, | 783 | ret = o2nm_configured_node_map(configured_nodes, |
@@ -774,6 +787,17 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) | |||
774 | return ret; | 787 | return ret; |
775 | } | 788 | } |
776 | 789 | ||
790 | /* | ||
791 | * If a node is not configured but is in the livemap, we still need | ||
792 | * to read the slot so as to be able to remove it from the livemap. | ||
793 | */ | ||
794 | o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap)); | ||
795 | i = -1; | ||
796 | while ((i = find_next_bit(live_node_bitmap, | ||
797 | O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { | ||
798 | set_bit(i, configured_nodes); | ||
799 | } | ||
800 | |||
777 | highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); | 801 | highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); |
778 | if (highest_node >= O2NM_MAX_NODES) { | 802 | if (highest_node >= O2NM_MAX_NODES) { |
779 | mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n"); | 803 | mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n"); |
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index cbe2f057cc2..9aa426e4212 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c | |||
@@ -1696,6 +1696,9 @@ static void o2net_hb_node_down_cb(struct o2nm_node *node, int node_num, | |||
1696 | { | 1696 | { |
1697 | o2quo_hb_down(node_num); | 1697 | o2quo_hb_down(node_num); |
1698 | 1698 | ||
1699 | if (!node) | ||
1700 | return; | ||
1701 | |||
1699 | if (node_num != o2nm_this_node()) | 1702 | if (node_num != o2nm_this_node()) |
1700 | o2net_disconnect_node(node); | 1703 | o2net_disconnect_node(node); |
1701 | 1704 | ||
@@ -1709,6 +1712,8 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num, | |||
1709 | 1712 | ||
1710 | o2quo_hb_up(node_num); | 1713 | o2quo_hb_up(node_num); |
1711 | 1714 | ||
1715 | BUG_ON(!node); | ||
1716 | |||
1712 | /* ensure an immediate connect attempt */ | 1717 | /* ensure an immediate connect attempt */ |
1713 | nn->nn_last_connect_attempt = jiffies - | 1718 | nn->nn_last_connect_attempt = jiffies - |
1714 | (msecs_to_jiffies(o2net_reconnect_delay()) + 1); | 1719 | (msecs_to_jiffies(o2net_reconnect_delay()) + 1); |