aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ocfs2
diff options
context:
space:
mode:
author Sunil Mushran <sunil.mushran@oracle.com> 2010-10-07 20:00:16 -0400
committer Sunil Mushran <sunil.mushran@oracle.com> 2010-10-07 20:00:16 -0400
commit 0e105d37c2adb19cb777aa6701a866f211764a30 (patch)
tree 288022158da0392b240484784db4aaca0a44915d /fs/ocfs2
parent 39a298563e0619b1b6e2e0974e58801de780621c (diff)
ocfs2/cluster: Check slots for unconfigured live nodes
o2hb currently checks slots for configured nodes only. This patch makes it also check the slots of live nodes, to handle a race in which a node is removed from the configuration but not from the live map.

Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Diffstat (limited to 'fs/ocfs2')
-rw-r--r-- fs/ocfs2/cluster/heartbeat.c 38
-rw-r--r-- fs/ocfs2/cluster/tcp.c 5
2 files changed, 36 insertions, 7 deletions
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 12bb12ba864..a8f10649674 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -541,6 +541,8 @@ static void o2hb_queue_node_event(struct o2hb_node_event *event,
541{ 541{
542 assert_spin_locked(&o2hb_live_lock); 542 assert_spin_locked(&o2hb_live_lock);
543 543
544 BUG_ON((!node) && (type != O2HB_NODE_DOWN_CB));
545
544 event->hn_event_type = type; 546 event->hn_event_type = type;
545 event->hn_node = node; 547 event->hn_node = node;
546 event->hn_node_num = node_num; 548 event->hn_node_num = node_num;
@@ -593,14 +595,22 @@ static int o2hb_check_slot(struct o2hb_region *reg,
593 u64 cputime; 595 u64 cputime;
594 unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS; 596 unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS;
595 unsigned int slot_dead_ms; 597 unsigned int slot_dead_ms;
598 int tmp;
596 599
597 memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes); 600 memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes);
598 601
599 /* Is this correct? Do we assume that the node doesn't exist 602 /*
600 * if we're not configured for him? */ 603 * If a node is no longer configured but is still in the livemap, we
604 * may need to clear that bit from the livemap.
605 */
601 node = o2nm_get_node_by_num(slot->ds_node_num); 606 node = o2nm_get_node_by_num(slot->ds_node_num);
602 if (!node) 607 if (!node) {
603 return 0; 608 spin_lock(&o2hb_live_lock);
609 tmp = test_bit(slot->ds_node_num, o2hb_live_node_bitmap);
610 spin_unlock(&o2hb_live_lock);
611 if (!tmp)
612 return 0;
613 }
604 614
605 if (!o2hb_verify_crc(reg, hb_block)) { 615 if (!o2hb_verify_crc(reg, hb_block)) {
606 /* all paths from here will drop o2hb_live_lock for 616 /* all paths from here will drop o2hb_live_lock for
@@ -717,8 +727,9 @@ fire_callbacks:
717 if (list_empty(&o2hb_live_slots[slot->ds_node_num])) { 727 if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
718 clear_bit(slot->ds_node_num, o2hb_live_node_bitmap); 728 clear_bit(slot->ds_node_num, o2hb_live_node_bitmap);
719 729
720 o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node, 730 /* node can be null */
721 slot->ds_node_num); 731 o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB,
732 node, slot->ds_node_num);
722 733
723 changed = 1; 734 changed = 1;
724 } 735 }
@@ -738,7 +749,8 @@ out:
738 749
739 o2hb_run_event_list(&event); 750 o2hb_run_event_list(&event);
740 751
741 o2nm_node_put(node); 752 if (node)
753 o2nm_node_put(node);
742 return changed; 754 return changed;
743} 755}
744 756
@@ -765,6 +777,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
765{ 777{
766 int i, ret, highest_node, change = 0; 778 int i, ret, highest_node, change = 0;
767 unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; 779 unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
780 unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
768 struct o2hb_bio_wait_ctxt write_wc; 781 struct o2hb_bio_wait_ctxt write_wc;
769 782
770 ret = o2nm_configured_node_map(configured_nodes, 783 ret = o2nm_configured_node_map(configured_nodes,
@@ -774,6 +787,17 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
774 return ret; 787 return ret;
775 } 788 }
776 789
790 /*
791 * If a node is not configured but is in the livemap, we still need
792 * to read the slot so as to be able to remove it from the livemap.
793 */
794 o2hb_fill_node_map(live_node_bitmap, sizeof(live_node_bitmap));
795 i = -1;
796 while ((i = find_next_bit(live_node_bitmap,
797 O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
798 set_bit(i, configured_nodes);
799 }
800
777 highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); 801 highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES);
778 if (highest_node >= O2NM_MAX_NODES) { 802 if (highest_node >= O2NM_MAX_NODES) {
779 mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n"); 803 mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n");
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index cbe2f057cc2..9aa426e4212 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1696,6 +1696,9 @@ static void o2net_hb_node_down_cb(struct o2nm_node *node, int node_num,
1696{ 1696{
1697 o2quo_hb_down(node_num); 1697 o2quo_hb_down(node_num);
1698 1698
1699 if (!node)
1700 return;
1701
1699 if (node_num != o2nm_this_node()) 1702 if (node_num != o2nm_this_node())
1700 o2net_disconnect_node(node); 1703 o2net_disconnect_node(node);
1701 1704
@@ -1709,6 +1712,8 @@ static void o2net_hb_node_up_cb(struct o2nm_node *node, int node_num,
1709 1712
1710 o2quo_hb_up(node_num); 1713 o2quo_hb_up(node_num);
1711 1714
1715 BUG_ON(!node);
1716
1712 /* ensure an immediate connect attempt */ 1717 /* ensure an immediate connect attempt */
1713 nn->nn_last_connect_attempt = jiffies - 1718 nn->nn_last_connect_attempt = jiffies -
1714 (msecs_to_jiffies(o2net_reconnect_delay()) + 1); 1719 (msecs_to_jiffies(o2net_reconnect_delay()) + 1);