about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
-rw-r--r-- fs/ocfs2/cluster/heartbeat.c 41
1 files changed, 41 insertions, 0 deletions
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 62a8af271344..f890656127fa 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -68,10 +68,12 @@ static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue);
68 * - o2hb_live_region_bitmap tracks live regions (seen steady iterations). 68 * - o2hb_live_region_bitmap tracks live regions (seen steady iterations).
69 * - o2hb_quorum_region_bitmap tracks live regions that have seen all nodes 69 * - o2hb_quorum_region_bitmap tracks live regions that have seen all nodes
70 * heartbeat on it. 70 * heartbeat on it.
71 * - o2hb_failed_region_bitmap tracks the regions that have seen io timeouts.
71 */ 72 */
72static unsigned long o2hb_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; 73static unsigned long o2hb_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
73static unsigned long o2hb_live_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; 74static unsigned long o2hb_live_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
74static unsigned long o2hb_quorum_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)]; 75static unsigned long o2hb_quorum_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
76static unsigned long o2hb_failed_region_bitmap[BITS_TO_LONGS(O2NM_MAX_REGIONS)];
75 77
76#define O2HB_DB_TYPE_LIVENODES 0 78#define O2HB_DB_TYPE_LIVENODES 0
77struct o2hb_debug_buf { 79struct o2hb_debug_buf {
@@ -217,8 +219,19 @@ struct o2hb_bio_wait_ctxt {
217 int wc_error; 219 int wc_error;
218}; 220};
219 221
/*
 * o2hb_pop_count - count the set bits in a bitmap
 * @map: bitmap to scan; handed to find_next_bit() unchanged
 * @count: number of bits, starting at bit 0, to examine
 *
 * Steps from one set bit to the next with find_next_bit() and stops as
 * soon as the returned index is no longer below @count.
 *
 * Returns the number of set bits found below @count.
 */
static int o2hb_pop_count(void *map, int count)
{
	int nr_set = 0;
	int bit;

	for (bit = find_next_bit(map, count, 0); bit < count;
	     bit = find_next_bit(map, count, bit + 1))
		nr_set++;

	return nr_set;
}
230
220static void o2hb_write_timeout(struct work_struct *work) 231static void o2hb_write_timeout(struct work_struct *work)
221{ 232{
233 int failed, quorum;
234 unsigned long flags;
222 struct o2hb_region *reg = 235 struct o2hb_region *reg =
223 container_of(work, struct o2hb_region, 236 container_of(work, struct o2hb_region,
224 hr_write_timeout_work.work); 237 hr_write_timeout_work.work);
@@ -226,6 +239,28 @@ static void o2hb_write_timeout(struct work_struct *work)
226 mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u " 239 mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u "
227 "milliseconds\n", reg->hr_dev_name, 240 "milliseconds\n", reg->hr_dev_name,
228 jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); 241 jiffies_to_msecs(jiffies - reg->hr_last_timeout_start));
242
243 if (o2hb_global_heartbeat_active()) {
244 spin_lock_irqsave(&o2hb_live_lock, flags);
245 if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
246 set_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
247 failed = o2hb_pop_count(&o2hb_failed_region_bitmap,
248 O2NM_MAX_REGIONS);
249 quorum = o2hb_pop_count(&o2hb_quorum_region_bitmap,
250 O2NM_MAX_REGIONS);
251 spin_unlock_irqrestore(&o2hb_live_lock, flags);
252
253 mlog(ML_HEARTBEAT, "Number of regions %d, failed regions %d\n",
254 quorum, failed);
255
256 /*
257 * Fence if the number of failed regions >= half the number
258 * of quorum regions
259 */
260 if ((failed << 1) < quorum)
261 return;
262 }
263
229 o2quo_disk_timeout(); 264 o2quo_disk_timeout();
230} 265}
231 266
@@ -234,6 +269,11 @@ static void o2hb_arm_write_timeout(struct o2hb_region *reg)
234 mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n", 269 mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n",
235 O2HB_MAX_WRITE_TIMEOUT_MS); 270 O2HB_MAX_WRITE_TIMEOUT_MS);
236 271
272 if (o2hb_global_heartbeat_active()) {
273 spin_lock(&o2hb_live_lock);
274 clear_bit(reg->hr_region_num, o2hb_failed_region_bitmap);
275 spin_unlock(&o2hb_live_lock);
276 }
237 cancel_delayed_work(&reg->hr_write_timeout_work); 277 cancel_delayed_work(&reg->hr_write_timeout_work);
238 reg->hr_last_timeout_start = jiffies; 278 reg->hr_last_timeout_start = jiffies;
239 schedule_delayed_work(&reg->hr_write_timeout_work, 279 schedule_delayed_work(&reg->hr_write_timeout_work,
@@ -1173,6 +1213,7 @@ int o2hb_init(void)
1173 memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap)); 1213 memset(o2hb_region_bitmap, 0, sizeof(o2hb_region_bitmap));
1174 memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap)); 1214 memset(o2hb_live_region_bitmap, 0, sizeof(o2hb_live_region_bitmap));
1175 memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap)); 1215 memset(o2hb_quorum_region_bitmap, 0, sizeof(o2hb_quorum_region_bitmap));
1216 memset(o2hb_failed_region_bitmap, 0, sizeof(o2hb_failed_region_bitmap));
1176 1217
1177 return o2hb_debug_init(); 1218 return o2hb_debug_init();
1178} 1219}