aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ocfs2
diff options
context:
space:
mode:
authorSunil Mushran <sunil.mushran@oracle.com>2011-07-24 13:21:54 -0400
committerSunil Mushran <sunil.mushran@oracle.com>2011-07-24 13:21:54 -0400
commitd2eece376648d2f7ba0a7d78f3c4d0421e608ac2 (patch)
tree6d43ac36bcbdf01f544027ec5d2c1427411ab498 /fs/ocfs2
parentb6844e8f64920cdee620157252169ba63afb0c89 (diff)
ocfs2/cluster: Abort heartbeat start on hard-ro devices
Currently if the heartbeat device is hard-ro, the o2hb thread keeps chugging along and dumping errors along the way. The user needs to manually stop the heartbeat. The patch addresses this shortcoming by adding a limit to the number of times the hb thread will iterate in an unsteady state. If the hb thread does not ready steady state in that many interation, the start is aborted. Signed-off-by: Sunil Mushran <sunil.mushran@oracle.com>
Diffstat (limited to 'fs/ocfs2')
-rw-r--r--fs/ocfs2/cluster/heartbeat.c185
1 files changed, 116 insertions, 69 deletions
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 9a3e6bbff27b..4728a746281d 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -216,6 +216,7 @@ struct o2hb_region {
216 216
217 struct list_head hr_all_item; 217 struct list_head hr_all_item;
218 unsigned hr_unclean_stop:1, 218 unsigned hr_unclean_stop:1,
219 hr_aborted_start:1,
219 hr_item_pinned:1, 220 hr_item_pinned:1,
220 hr_item_dropped:1; 221 hr_item_dropped:1;
221 222
@@ -254,6 +255,10 @@ struct o2hb_region {
254 * a more complete api that doesn't lead to this sort of fragility. */ 255 * a more complete api that doesn't lead to this sort of fragility. */
255 atomic_t hr_steady_iterations; 256 atomic_t hr_steady_iterations;
256 257
258 /* terminate o2hb thread if it does not reach steady state
259 * (hr_steady_iterations == 0) within hr_unsteady_iterations */
260 atomic_t hr_unsteady_iterations;
261
257 char hr_dev_name[BDEVNAME_SIZE]; 262 char hr_dev_name[BDEVNAME_SIZE];
258 263
259 unsigned int hr_timeout_ms; 264 unsigned int hr_timeout_ms;
@@ -324,6 +329,10 @@ static void o2hb_write_timeout(struct work_struct *work)
324 329
325static void o2hb_arm_write_timeout(struct o2hb_region *reg) 330static void o2hb_arm_write_timeout(struct o2hb_region *reg)
326{ 331{
332 /* Arm writeout only after thread reaches steady state */
333 if (atomic_read(&reg->hr_steady_iterations) != 0)
334 return;
335
327 mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n", 336 mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n",
328 O2HB_MAX_WRITE_TIMEOUT_MS); 337 O2HB_MAX_WRITE_TIMEOUT_MS);
329 338
@@ -537,9 +546,14 @@ static int o2hb_verify_crc(struct o2hb_region *reg,
537 return read == computed; 546 return read == computed;
538} 547}
539 548
540/* We want to make sure that nobody is heartbeating on top of us -- 549/*
541 * this will help detect an invalid configuration. */ 550 * Compare the slot data with what we wrote in the last iteration.
542static void o2hb_check_last_timestamp(struct o2hb_region *reg) 551 * If the match fails, print an appropriate error message. This is to
552 * detect errors like... another node hearting on the same slot,
553 * flaky device that is losing writes, etc.
554 * Returns 1 if check succeeds, 0 otherwise.
555 */
556static int o2hb_check_own_slot(struct o2hb_region *reg)
543{ 557{
544 struct o2hb_disk_slot *slot; 558 struct o2hb_disk_slot *slot;
545 struct o2hb_disk_heartbeat_block *hb_block; 559 struct o2hb_disk_heartbeat_block *hb_block;
@@ -548,13 +562,13 @@ static void o2hb_check_last_timestamp(struct o2hb_region *reg)
548 slot = &reg->hr_slots[o2nm_this_node()]; 562 slot = &reg->hr_slots[o2nm_this_node()];
549 /* Don't check on our 1st timestamp */ 563 /* Don't check on our 1st timestamp */
550 if (!slot->ds_last_time) 564 if (!slot->ds_last_time)
551 return; 565 return 0;
552 566
553 hb_block = slot->ds_raw_block; 567 hb_block = slot->ds_raw_block;
554 if (le64_to_cpu(hb_block->hb_seq) == slot->ds_last_time && 568 if (le64_to_cpu(hb_block->hb_seq) == slot->ds_last_time &&
555 le64_to_cpu(hb_block->hb_generation) == slot->ds_last_generation && 569 le64_to_cpu(hb_block->hb_generation) == slot->ds_last_generation &&
556 hb_block->hb_node == slot->ds_node_num) 570 hb_block->hb_node == slot->ds_node_num)
557 return; 571 return 1;
558 572
559#define ERRSTR1 "Another node is heartbeating on device" 573#define ERRSTR1 "Another node is heartbeating on device"
560#define ERRSTR2 "Heartbeat generation mismatch on device" 574#define ERRSTR2 "Heartbeat generation mismatch on device"
@@ -574,6 +588,8 @@ static void o2hb_check_last_timestamp(struct o2hb_region *reg)
574 (unsigned long long)slot->ds_last_time, hb_block->hb_node, 588 (unsigned long long)slot->ds_last_time, hb_block->hb_node,
575 (unsigned long long)le64_to_cpu(hb_block->hb_generation), 589 (unsigned long long)le64_to_cpu(hb_block->hb_generation),
576 (unsigned long long)le64_to_cpu(hb_block->hb_seq)); 590 (unsigned long long)le64_to_cpu(hb_block->hb_seq));
591
592 return 0;
577} 593}
578 594
579static inline void o2hb_prepare_block(struct o2hb_region *reg, 595static inline void o2hb_prepare_block(struct o2hb_region *reg,
@@ -719,17 +735,24 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
719 o2nm_node_put(node); 735 o2nm_node_put(node);
720} 736}
721 737
722static void o2hb_set_quorum_device(struct o2hb_region *reg, 738static void o2hb_set_quorum_device(struct o2hb_region *reg)
723 struct o2hb_disk_slot *slot)
724{ 739{
725 assert_spin_locked(&o2hb_live_lock);
726
727 if (!o2hb_global_heartbeat_active()) 740 if (!o2hb_global_heartbeat_active())
728 return; 741 return;
729 742
730 if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) 743 /* Prevent race with o2hb_heartbeat_group_drop_item() */
744 if (kthread_should_stop())
745 return;
746
747 /* Tag region as quorum only after thread reaches steady state */
748 if (atomic_read(&reg->hr_steady_iterations) != 0)
731 return; 749 return;
732 750
751 spin_lock(&o2hb_live_lock);
752
753 if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
754 goto unlock;
755
733 /* 756 /*
734 * A region can be added to the quorum only when it sees all 757 * A region can be added to the quorum only when it sees all
735 * live nodes heartbeat on it. In other words, the region has been 758 * live nodes heartbeat on it. In other words, the region has been
@@ -737,13 +760,10 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg,
737 */ 760 */
738 if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap, 761 if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap,
739 sizeof(o2hb_live_node_bitmap))) 762 sizeof(o2hb_live_node_bitmap)))
740 return; 763 goto unlock;
741
742 if (slot->ds_changed_samples < O2HB_LIVE_THRESHOLD)
743 return;
744 764
745 printk(KERN_NOTICE "o2hb: Region %s is now a quorum device\n", 765 printk(KERN_NOTICE "o2hb: Region %s (%s) is now a quorum device\n",
746 config_item_name(&reg->hr_item)); 766 config_item_name(&reg->hr_item), reg->hr_dev_name);
747 767
748 set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); 768 set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
749 769
@@ -754,6 +774,8 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg,
754 if (o2hb_pop_count(&o2hb_quorum_region_bitmap, 774 if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
755 O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF) 775 O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF)
756 o2hb_region_unpin(NULL); 776 o2hb_region_unpin(NULL);
777unlock:
778 spin_unlock(&o2hb_live_lock);
757} 779}
758 780
759static int o2hb_check_slot(struct o2hb_region *reg, 781static int o2hb_check_slot(struct o2hb_region *reg,
@@ -925,8 +947,6 @@ fire_callbacks:
925 slot->ds_equal_samples = 0; 947 slot->ds_equal_samples = 0;
926 } 948 }
927out: 949out:
928 o2hb_set_quorum_device(reg, slot);
929
930 spin_unlock(&o2hb_live_lock); 950 spin_unlock(&o2hb_live_lock);
931 951
932 o2hb_run_event_list(&event); 952 o2hb_run_event_list(&event);
@@ -957,7 +977,8 @@ static int o2hb_highest_node(unsigned long *nodes,
957 977
958static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) 978static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
959{ 979{
960 int i, ret, highest_node, change = 0; 980 int i, ret, highest_node;
981 int membership_change = 0, own_slot_ok = 0;
961 unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; 982 unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
962 unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; 983 unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
963 struct o2hb_bio_wait_ctxt write_wc; 984 struct o2hb_bio_wait_ctxt write_wc;
@@ -966,7 +987,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
966 sizeof(configured_nodes)); 987 sizeof(configured_nodes));
967 if (ret) { 988 if (ret) {
968 mlog_errno(ret); 989 mlog_errno(ret);
969 return ret; 990 goto bail;
970 } 991 }
971 992
972 /* 993 /*
@@ -982,8 +1003,9 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
982 1003
983 highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); 1004 highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES);
984 if (highest_node >= O2NM_MAX_NODES) { 1005 if (highest_node >= O2NM_MAX_NODES) {
985 mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n"); 1006 mlog(ML_NOTICE, "o2hb: No configured nodes found!\n");
986 return -EINVAL; 1007 ret = -EINVAL;
1008 goto bail;
987 } 1009 }
988 1010
989 /* No sense in reading the slots of nodes that don't exist 1011 /* No sense in reading the slots of nodes that don't exist
@@ -993,29 +1015,27 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
993 ret = o2hb_read_slots(reg, highest_node + 1); 1015 ret = o2hb_read_slots(reg, highest_node + 1);
994 if (ret < 0) { 1016 if (ret < 0) {
995 mlog_errno(ret); 1017 mlog_errno(ret);
996 return ret; 1018 goto bail;
997 } 1019 }
998 1020
999 /* With an up to date view of the slots, we can check that no 1021 /* With an up to date view of the slots, we can check that no
1000 * other node has been improperly configured to heartbeat in 1022 * other node has been improperly configured to heartbeat in
1001 * our slot. */ 1023 * our slot. */
1002 o2hb_check_last_timestamp(reg); 1024 own_slot_ok = o2hb_check_own_slot(reg);
1003 1025
1004 /* fill in the proper info for our next heartbeat */ 1026 /* fill in the proper info for our next heartbeat */
1005 o2hb_prepare_block(reg, reg->hr_generation); 1027 o2hb_prepare_block(reg, reg->hr_generation);
1006 1028
1007 /* And fire off the write. Note that we don't wait on this I/O
1008 * until later. */
1009 ret = o2hb_issue_node_write(reg, &write_wc); 1029 ret = o2hb_issue_node_write(reg, &write_wc);
1010 if (ret < 0) { 1030 if (ret < 0) {
1011 mlog_errno(ret); 1031 mlog_errno(ret);
1012 return ret; 1032 goto bail;
1013 } 1033 }
1014 1034
1015 i = -1; 1035 i = -1;
1016 while((i = find_next_bit(configured_nodes, 1036 while((i = find_next_bit(configured_nodes,
1017 O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { 1037 O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
1018 change |= o2hb_check_slot(reg, &reg->hr_slots[i]); 1038 membership_change |= o2hb_check_slot(reg, &reg->hr_slots[i]);
1019 } 1039 }
1020 1040
1021 /* 1041 /*
@@ -1030,18 +1050,39 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
1030 * disk */ 1050 * disk */
1031 mlog(ML_ERROR, "Write error %d on device \"%s\"\n", 1051 mlog(ML_ERROR, "Write error %d on device \"%s\"\n",
1032 write_wc.wc_error, reg->hr_dev_name); 1052 write_wc.wc_error, reg->hr_dev_name);
1033 return write_wc.wc_error; 1053 ret = write_wc.wc_error;
1054 goto bail;
1034 } 1055 }
1035 1056
1036 o2hb_arm_write_timeout(reg); 1057 /* Skip disarming the timeout if own slot has stale/bad data */
1058 if (own_slot_ok) {
1059 o2hb_set_quorum_device(reg);
1060 o2hb_arm_write_timeout(reg);
1061 }
1037 1062
1063bail:
1038 /* let the person who launched us know when things are steady */ 1064 /* let the person who launched us know when things are steady */
1039 if (!change && (atomic_read(&reg->hr_steady_iterations) != 0)) { 1065 if (atomic_read(&reg->hr_steady_iterations) != 0) {
1040 if (atomic_dec_and_test(&reg->hr_steady_iterations)) 1066 if (!ret && own_slot_ok && !membership_change) {
1067 if (atomic_dec_and_test(&reg->hr_steady_iterations))
1068 wake_up(&o2hb_steady_queue);
1069 }
1070 }
1071
1072 if (atomic_read(&reg->hr_steady_iterations) != 0) {
1073 if (atomic_dec_and_test(&reg->hr_unsteady_iterations)) {
1074 printk(KERN_NOTICE "o2hb: Unable to stabilize "
1075 "heartbeart on region %s (%s)\n",
1076 config_item_name(&reg->hr_item),
1077 reg->hr_dev_name);
1078 atomic_set(&reg->hr_steady_iterations, 0);
1079 reg->hr_aborted_start = 1;
1041 wake_up(&o2hb_steady_queue); 1080 wake_up(&o2hb_steady_queue);
1081 ret = -EIO;
1082 }
1042 } 1083 }
1043 1084
1044 return 0; 1085 return ret;
1045} 1086}
1046 1087
1047/* Subtract b from a, storing the result in a. a *must* have a larger 1088/* Subtract b from a, storing the result in a. a *must* have a larger
@@ -1095,7 +1136,8 @@ static int o2hb_thread(void *data)
1095 /* Pin node */ 1136 /* Pin node */
1096 o2nm_depend_this_node(); 1137 o2nm_depend_this_node();
1097 1138
1098 while (!kthread_should_stop() && !reg->hr_unclean_stop) { 1139 while (!kthread_should_stop() &&
1140 !reg->hr_unclean_stop && !reg->hr_aborted_start) {
1099 /* We track the time spent inside 1141 /* We track the time spent inside
1100 * o2hb_do_disk_heartbeat so that we avoid more than 1142 * o2hb_do_disk_heartbeat so that we avoid more than
1101 * hr_timeout_ms between disk writes. On busy systems 1143 * hr_timeout_ms between disk writes. On busy systems
@@ -1103,10 +1145,7 @@ static int o2hb_thread(void *data)
1103 * likely to time itself out. */ 1145 * likely to time itself out. */
1104 do_gettimeofday(&before_hb); 1146 do_gettimeofday(&before_hb);
1105 1147
1106 i = 0; 1148 ret = o2hb_do_disk_heartbeat(reg);
1107 do {
1108 ret = o2hb_do_disk_heartbeat(reg);
1109 } while (ret && ++i < 2);
1110 1149
1111 do_gettimeofday(&after_hb); 1150 do_gettimeofday(&after_hb);
1112 elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); 1151 elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);
@@ -1117,7 +1156,8 @@ static int o2hb_thread(void *data)
1117 after_hb.tv_sec, (unsigned long) after_hb.tv_usec, 1156 after_hb.tv_sec, (unsigned long) after_hb.tv_usec,
1118 elapsed_msec); 1157 elapsed_msec);
1119 1158
1120 if (elapsed_msec < reg->hr_timeout_ms) { 1159 if (!kthread_should_stop() &&
1160 elapsed_msec < reg->hr_timeout_ms) {
1121 /* the kthread api has blocked signals for us so no 1161 /* the kthread api has blocked signals for us so no
1122 * need to record the return value. */ 1162 * need to record the return value. */
1123 msleep_interruptible(reg->hr_timeout_ms - elapsed_msec); 1163 msleep_interruptible(reg->hr_timeout_ms - elapsed_msec);
@@ -1134,20 +1174,20 @@ static int o2hb_thread(void *data)
1134 * to timeout on this region when we could just as easily 1174 * to timeout on this region when we could just as easily
1135 * write a clear generation - thus indicating to them that 1175 * write a clear generation - thus indicating to them that
1136 * this node has left this region. 1176 * this node has left this region.
1137 * 1177 */
1138 * XXX: Should we skip this on unclean_stop? */ 1178 if (!reg->hr_unclean_stop && !reg->hr_aborted_start) {
1139 o2hb_prepare_block(reg, 0); 1179 o2hb_prepare_block(reg, 0);
1140 ret = o2hb_issue_node_write(reg, &write_wc); 1180 ret = o2hb_issue_node_write(reg, &write_wc);
1141 if (ret == 0) { 1181 if (ret == 0)
1142 o2hb_wait_on_io(reg, &write_wc); 1182 o2hb_wait_on_io(reg, &write_wc);
1143 } else { 1183 else
1144 mlog_errno(ret); 1184 mlog_errno(ret);
1145 } 1185 }
1146 1186
1147 /* Unpin node */ 1187 /* Unpin node */
1148 o2nm_undepend_this_node(); 1188 o2nm_undepend_this_node();
1149 1189
1150 mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n"); 1190 mlog(ML_HEARTBEAT|ML_KTHREAD, "o2hb thread exiting\n");
1151 1191
1152 return 0; 1192 return 0;
1153} 1193}
@@ -1426,6 +1466,8 @@ static void o2hb_region_release(struct config_item *item)
1426 struct page *page; 1466 struct page *page;
1427 struct o2hb_region *reg = to_o2hb_region(item); 1467 struct o2hb_region *reg = to_o2hb_region(item);
1428 1468
1469 mlog(ML_HEARTBEAT, "hb region release (%s)\n", reg->hr_dev_name);
1470
1429 if (reg->hr_tmp_block) 1471 if (reg->hr_tmp_block)
1430 kfree(reg->hr_tmp_block); 1472 kfree(reg->hr_tmp_block);
1431 1473
@@ -1792,7 +1834,10 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1792 live_threshold <<= 1; 1834 live_threshold <<= 1;
1793 spin_unlock(&o2hb_live_lock); 1835 spin_unlock(&o2hb_live_lock);
1794 } 1836 }
1795 atomic_set(&reg->hr_steady_iterations, live_threshold + 1); 1837 ++live_threshold;
1838 atomic_set(&reg->hr_steady_iterations, live_threshold);
1839 /* unsteady_iterations is double the steady_iterations */
1840 atomic_set(&reg->hr_unsteady_iterations, (live_threshold << 1));
1796 1841
1797 hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s", 1842 hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s",
1798 reg->hr_item.ci_name); 1843 reg->hr_item.ci_name);
@@ -1809,14 +1854,12 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1809 ret = wait_event_interruptible(o2hb_steady_queue, 1854 ret = wait_event_interruptible(o2hb_steady_queue,
1810 atomic_read(&reg->hr_steady_iterations) == 0); 1855 atomic_read(&reg->hr_steady_iterations) == 0);
1811 if (ret) { 1856 if (ret) {
1812 /* We got interrupted (hello ptrace!). Clean up */ 1857 atomic_set(&reg->hr_steady_iterations, 0);
1813 spin_lock(&o2hb_live_lock); 1858 reg->hr_aborted_start = 1;
1814 hb_task = reg->hr_task; 1859 }
1815 reg->hr_task = NULL;
1816 spin_unlock(&o2hb_live_lock);
1817 1860
1818 if (hb_task) 1861 if (reg->hr_aborted_start) {
1819 kthread_stop(hb_task); 1862 ret = -EIO;
1820 goto out; 1863 goto out;
1821 } 1864 }
1822 1865
@@ -1833,8 +1876,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1833 ret = -EIO; 1876 ret = -EIO;
1834 1877
1835 if (hb_task && o2hb_global_heartbeat_active()) 1878 if (hb_task && o2hb_global_heartbeat_active())
1836 printk(KERN_NOTICE "o2hb: Heartbeat started on region %s\n", 1879 printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%s)\n",
1837 config_item_name(&reg->hr_item)); 1880 config_item_name(&reg->hr_item), reg->hr_dev_name);
1838 1881
1839out: 1882out:
1840 if (filp) 1883 if (filp)
@@ -2092,13 +2135,6 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
2092 2135
2093 /* stop the thread when the user removes the region dir */ 2136 /* stop the thread when the user removes the region dir */
2094 spin_lock(&o2hb_live_lock); 2137 spin_lock(&o2hb_live_lock);
2095 if (o2hb_global_heartbeat_active()) {
2096 clear_bit(reg->hr_region_num, o2hb_region_bitmap);
2097 clear_bit(reg->hr_region_num, o2hb_live_region_bitmap);
2098 if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
2099 quorum_region = 1;
2100 clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
2101 }
2102 hb_task = reg->hr_task; 2138 hb_task = reg->hr_task;
2103 reg->hr_task = NULL; 2139 reg->hr_task = NULL;
2104 reg->hr_item_dropped = 1; 2140 reg->hr_item_dropped = 1;
@@ -2107,19 +2143,30 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
2107 if (hb_task) 2143 if (hb_task)
2108 kthread_stop(hb_task); 2144 kthread_stop(hb_task);
2109 2145
2146 if (o2hb_global_heartbeat_active()) {
2147 spin_lock(&o2hb_live_lock);
2148 clear_bit(reg->hr_region_num, o2hb_region_bitmap);
2149 clear_bit(reg->hr_region_num, o2hb_live_region_bitmap);
2150 if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
2151 quorum_region = 1;
2152 clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
2153 spin_unlock(&o2hb_live_lock);
2154 printk(KERN_NOTICE "o2hb: Heartbeat %s on region %s (%s)\n",
2155 ((atomic_read(&reg->hr_steady_iterations) == 0) ?
2156 "stopped" : "start aborted"), config_item_name(item),
2157 reg->hr_dev_name);
2158 }
2159
2110 /* 2160 /*
2111 * If we're racing a dev_write(), we need to wake them. They will 2161 * If we're racing a dev_write(), we need to wake them. They will
2112 * check reg->hr_task 2162 * check reg->hr_task
2113 */ 2163 */
2114 if (atomic_read(&reg->hr_steady_iterations) != 0) { 2164 if (atomic_read(&reg->hr_steady_iterations) != 0) {
2165 reg->hr_aborted_start = 1;
2115 atomic_set(&reg->hr_steady_iterations, 0); 2166 atomic_set(&reg->hr_steady_iterations, 0);
2116 wake_up(&o2hb_steady_queue); 2167 wake_up(&o2hb_steady_queue);
2117 } 2168 }
2118 2169
2119 if (o2hb_global_heartbeat_active())
2120 printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n",
2121 config_item_name(&reg->hr_item));
2122
2123 config_item_put(item); 2170 config_item_put(item);
2124 2171
2125 if (!o2hb_global_heartbeat_active() || !quorum_region) 2172 if (!o2hb_global_heartbeat_active() || !quorum_region)