diff options
-rw-r--r-- | fs/ocfs2/cluster/heartbeat.c | 41 |
1 files changed, 41 insertions, 0 deletions
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 62a8af271344..f890656127fa 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c | |||
@@ -68,10 +68,12 @@ static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue); | |||
68 | * - o2hb_live_region_bitmap tracks live regions (seen steady iterations). | 68 | * - o2hb_live_region_bitmap tracks live regions (seen steady iterations). |
69 | * - o2hb_quorum_region_bitmap tracks live regions that have seen all nodes | 69 | * - o2hb_quorum_region_bitmap tracks live regions that have seen all nodes |
70 | * heartbeat on it. | 70 | * heartbeat on it. |
71 | * - o2hb_failed_region_bitmap tracks the regions that have seen io timeouts. | ||
71 | */ | 72 | */ |
72 | static unsigned long o2hb_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; | 73 | static unsigned long o2hb_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; |
73 | static unsigned long o2hb_live_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; | 74 | static unsigned long o2hb_live_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; |
74 | static unsigned long o2hb_quorum_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; | 75 | static unsigned long o2hb_quorum_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; |
76 | static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; | ||
75 | 77 | ||
76 | #define O2HB_DB_TYPE_LIVENODES 0 | 78 | #define O2HB_DB_TYPE_LIVENODES 0 |
77 | struct o2hb_debug_buf { | 79 | struct o2hb_debug_buf { |
@@ -217,8 +219,19 @@ struct o2hb_bio_wait_ctxt { | |||
217 | int wc_error; | 219 | int wc_error; |
218 | }; | 220 | }; |
219 | 221 | ||
222 | static int o2hb_pop_count(void *map, int count) | ||
223 | { | ||
224 | int i = -1, pop = 0; | ||
225 | |||
226 | while ((i = find_next_bit(map, count, i + 1)) < count) | ||
227 | pop++; | ||
228 | return pop; | ||
229 | } | ||
230 | |||
220 | static void o2hb_write_timeout(struct work_struct *work) | 231 | static void o2hb_write_timeout(struct work_struct *work) |
221 | { | 232 | { |
233 | int failed, quorum; | ||
234 | unsigned long flags; | ||
222 | struct o2hb_region *reg = | 235 | struct o2hb_region *reg = |
223 | container_of(work, struct o2hb_region, | 236 | container_of(work, struct o2hb_region, |
224 | hr_write_timeout_work.work); | 237 | hr_write_timeout_work.work); |
@@ -226,6 +239,28 @@ static void o2hb_write_timeout(struct work_struct *work) | |||
226 | mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u " | 239 | mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u " |
227 | "milliseconds\n", reg->hr_dev_name, | 240 | "milliseconds\n", reg->hr_dev_name, |
228 | jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); | 241 | jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); |
242 | |||
243 | if (o2hb_global_heartbeat_active()) { | ||
244 | spin_lock_irqsave(&o2hb_live_lock, flags); | ||
245 | if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) | ||
246 | set_bit(reg->hr_region_num, o2hb_failed_region_bitmap); | ||
247 | failed = o2hb_pop_count(&o2hb_failed_region_bitmap, | ||
248 | O2NM_MAX_REGIONS); | ||
249 | quorum = o2hb_pop_count(&o2hb_quorum_region_bitmap, | ||
250 | O2NM_MAX_REGIONS); | ||
251 | spin_unlock_irqrestore(&o2hb_live_lock, flags); | ||
252 | |||
253 | mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n", | ||
254 | quorum, failed); | ||
255 | |||
256 | /* | ||
257 | * Fence if the number of failed regions >= half the number | ||
258 | * of quorum regions | ||
259 | */ | ||
260 | if ((failed << 1) < quorum) | ||
261 | return; | ||
262 | } | ||
263 | |||
229 | o2quo_disk_timeout(); | 264 | o2quo_disk_timeout(); |
230 | } | 265 | } |
231 | 266 | ||
@@ -234,6 +269,11 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg) | |||
234 | mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n", | 269 | mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n", |
235 | O2HB_MAX_WRITE_TIMEOUT_MS); | 270 | O2HB_MAX_WRITE_TIMEOUT_MS); |
236 | 271 | ||
272 | if (o2hb_global_heartbeat_active()) { | ||
273 | spin_lock(&o2hb_live_lock); | ||
274 | clear_bit(reg->hr_region_num, o2hb_failed_region_bitmap); | ||
275 | spin_unlock(&o2hb_live_lock); | ||
276 | } | ||
237 | cancel_delayed_work(&reg->hr_write_timeout_work); | 277 | cancel_delayed_work(&reg->hr_write_timeout_work); |
238 | reg->hr_last_timeout_start = jiffies; | 278 | reg->hr_last_timeout_start = jiffies; |
239 | schedule_delayed_work(&reg->hr_write_timeout_work, | 279 | schedule_delayed_work(&reg->hr_write_timeout_work, |
@@ -1173,6 +1213,7 @@ int o2hb_init(void) | |||
1173 | memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap)); | 1213 | memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap)); |
1174 | memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap)); | 1214 | memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap)); |
1175 | memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap)); | 1215 | memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap)); |
1216 | memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap)); | ||
1176 | 1217 | ||
1177 | return o2hb_debug_init(); | 1218 | return o2hb_debug_init(); |
1178 | } | 1219 | } |