about summary refs log tree commit diff stats
path: root/fs/ocfs2
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ocfs2')
-rw-r--r-- fs/ocfs2/acl.c 4
-rw-r--r-- fs/ocfs2/acl.h 2
-rw-r--r-- fs/ocfs2/aops.c 61
-rw-r--r-- fs/ocfs2/cluster/heartbeat.c 194
-rw-r--r-- fs/ocfs2/cluster/netdebug.c 102
-rw-r--r-- fs/ocfs2/cluster/tcp.c 138
-rw-r--r-- fs/ocfs2/cluster/tcp.h 2
-rw-r--r-- fs/ocfs2/dlm/dlmcommon.h 56
-rw-r--r-- fs/ocfs2/dlm/dlmdomain.c 44
-rw-r--r-- fs/ocfs2/dlm/dlmlock.c 52
-rw-r--r-- fs/ocfs2/dlm/dlmmaster.c 175
-rw-r--r-- fs/ocfs2/dlm/dlmrecovery.c 164
-rw-r--r-- fs/ocfs2/dlm/dlmthread.c 16
-rw-r--r-- fs/ocfs2/extent_map.c 96
-rw-r--r-- fs/ocfs2/extent_map.h 2
-rw-r--r-- fs/ocfs2/file.c 96
-rw-r--r-- fs/ocfs2/file.h 2
-rw-r--r-- fs/ocfs2/journal.c 23
-rw-r--r-- fs/ocfs2/mmap.c 53
-rw-r--r-- fs/ocfs2/namei.c 1
-rw-r--r-- fs/ocfs2/quota_local.c 13
-rw-r--r-- fs/ocfs2/refcounttree.c 49
-rw-r--r-- fs/ocfs2/slot_map.c 4
-rw-r--r-- fs/ocfs2/stack_o2cb.c 71
-rw-r--r-- fs/ocfs2/super.c 12
25 files changed, 865 insertions, 567 deletions
diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c
index e913ad130fdd..1cee970eb55a 100644
--- a/fs/ocfs2/acl.c
+++ b/fs/ocfs2/acl.c
@@ -290,14 +290,14 @@ static int ocfs2_set_acl(handle_t *handle,
290 return ret; 290 return ret;
291} 291}
292 292
293int ocfs2_check_acl(struct inode *inode, int mask, unsigned int flags) 293int ocfs2_check_acl(struct inode *inode, int mask)
294{ 294{
295 struct ocfs2_super *osb; 295 struct ocfs2_super *osb;
296 struct buffer_head *di_bh = NULL; 296 struct buffer_head *di_bh = NULL;
297 struct posix_acl *acl; 297 struct posix_acl *acl;
298 int ret = -EAGAIN; 298 int ret = -EAGAIN;
299 299
300 if (flags & IPERM_FLAG_RCU) 300 if (mask & MAY_NOT_BLOCK)
301 return -ECHILD; 301 return -ECHILD;
302 302
303 osb = OCFS2_SB(inode->i_sb); 303 osb = OCFS2_SB(inode->i_sb);
diff --git a/fs/ocfs2/acl.h b/fs/ocfs2/acl.h
index 4fe7c9cf4bfb..5c5d31f05853 100644
--- a/fs/ocfs2/acl.h
+++ b/fs/ocfs2/acl.h
@@ -26,7 +26,7 @@ struct ocfs2_acl_entry {
26 __le32 e_id; 26 __le32 e_id;
27}; 27};
28 28
29extern int ocfs2_check_acl(struct inode *, int, unsigned int); 29extern int ocfs2_check_acl(struct inode *, int);
30extern int ocfs2_acl_chmod(struct inode *); 30extern int ocfs2_acl_chmod(struct inode *);
31extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *, 31extern int ocfs2_init_acl(handle_t *, struct inode *, struct inode *,
32 struct buffer_head *, struct buffer_head *, 32 struct buffer_head *, struct buffer_head *,
diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c
index ba3ca1e63b51..78b68af3b0e3 100644
--- a/fs/ocfs2/aops.c
+++ b/fs/ocfs2/aops.c
@@ -559,9 +559,8 @@ bail:
559 559
560/* 560/*
561 * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're 561 * ocfs2_dio_end_io is called by the dio core when a dio is finished. We're
562 * particularly interested in the aio/dio case. Like the core uses 562 * particularly interested in the aio/dio case. We use the rw_lock DLM lock
563 * i_alloc_sem, we use the rw_lock DLM lock to protect io on one node from 563 * to protect io on one node from truncation on another.
564 * truncation on another.
565 */ 564 */
566static void ocfs2_dio_end_io(struct kiocb *iocb, 565static void ocfs2_dio_end_io(struct kiocb *iocb,
567 loff_t offset, 566 loff_t offset,
@@ -577,10 +576,8 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
577 /* this io's submitter should not have unlocked this before we could */ 576 /* this io's submitter should not have unlocked this before we could */
578 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb)); 577 BUG_ON(!ocfs2_iocb_is_rw_locked(iocb));
579 578
580 if (ocfs2_iocb_is_sem_locked(iocb)) { 579 if (ocfs2_iocb_is_sem_locked(iocb))
581 up_read(&inode->i_alloc_sem);
582 ocfs2_iocb_clear_sem_locked(iocb); 580 ocfs2_iocb_clear_sem_locked(iocb);
583 }
584 581
585 if (ocfs2_iocb_is_unaligned_aio(iocb)) { 582 if (ocfs2_iocb_is_unaligned_aio(iocb)) {
586 ocfs2_iocb_clear_unaligned_aio(iocb); 583 ocfs2_iocb_clear_unaligned_aio(iocb);
@@ -598,6 +595,7 @@ static void ocfs2_dio_end_io(struct kiocb *iocb,
598 595
599 if (is_async) 596 if (is_async)
600 aio_complete(iocb, ret, 0); 597 aio_complete(iocb, ret, 0);
598 inode_dio_done(inode);
601} 599}
602 600
603/* 601/*
@@ -883,6 +881,12 @@ struct ocfs2_write_ctxt {
883 struct page *w_target_page; 881 struct page *w_target_page;
884 882
885 /* 883 /*
884 * w_target_locked is used for page_mkwrite path indicating no unlocking
885 * against w_target_page in ocfs2_write_end_nolock.
886 */
887 unsigned int w_target_locked:1;
888
889 /*
886 * ocfs2_write_end() uses this to know what the real range to 890 * ocfs2_write_end() uses this to know what the real range to
887 * write in the target should be. 891 * write in the target should be.
888 */ 892 */
@@ -915,6 +919,24 @@ void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages)
915 919
916static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc) 920static void ocfs2_free_write_ctxt(struct ocfs2_write_ctxt *wc)
917{ 921{
922 int i;
923
924 /*
925 * w_target_locked is only set to true in the page_mkwrite() case.
926 * The intent is to allow us to lock the target page from write_begin()
927 * to write_end(). The caller must hold a ref on w_target_page.
928 */
929 if (wc->w_target_locked) {
930 BUG_ON(!wc->w_target_page);
931 for (i = 0; i < wc->w_num_pages; i++) {
932 if (wc->w_target_page == wc->w_pages[i]) {
933 wc->w_pages[i] = NULL;
934 break;
935 }
936 }
937 mark_page_accessed(wc->w_target_page);
938 page_cache_release(wc->w_target_page);
939 }
918 ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages); 940 ocfs2_unlock_and_free_pages(wc->w_pages, wc->w_num_pages);
919 941
920 brelse(wc->w_di_bh); 942 brelse(wc->w_di_bh);
@@ -1152,20 +1174,17 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
1152 */ 1174 */
1153 lock_page(mmap_page); 1175 lock_page(mmap_page);
1154 1176
1177 /* Exit and let the caller retry */
1155 if (mmap_page->mapping != mapping) { 1178 if (mmap_page->mapping != mapping) {
1179 WARN_ON(mmap_page->mapping);
1156 unlock_page(mmap_page); 1180 unlock_page(mmap_page);
1157 /* 1181 ret = -EAGAIN;
1158 * Sanity check - the locking in
1159 * ocfs2_pagemkwrite() should ensure
1160 * that this code doesn't trigger.
1161 */
1162 ret = -EINVAL;
1163 mlog_errno(ret);
1164 goto out; 1182 goto out;
1165 } 1183 }
1166 1184
1167 page_cache_get(mmap_page); 1185 page_cache_get(mmap_page);
1168 wc->w_pages[i] = mmap_page; 1186 wc->w_pages[i] = mmap_page;
1187 wc->w_target_locked = true;
1169 } else { 1188 } else {
1170 wc->w_pages[i] = find_or_create_page(mapping, index, 1189 wc->w_pages[i] = find_or_create_page(mapping, index,
1171 GFP_NOFS); 1190 GFP_NOFS);
@@ -1180,6 +1199,8 @@ static int ocfs2_grab_pages_for_write(struct address_space *mapping,
1180 wc->w_target_page = wc->w_pages[i]; 1199 wc->w_target_page = wc->w_pages[i];
1181 } 1200 }
1182out: 1201out:
1202 if (ret)
1203 wc->w_target_locked = false;
1183 return ret; 1204 return ret;
1184} 1205}
1185 1206
@@ -1837,11 +1858,23 @@ try_again:
1837 */ 1858 */
1838 ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len, 1859 ret = ocfs2_grab_pages_for_write(mapping, wc, wc->w_cpos, pos, len,
1839 cluster_of_pages, mmap_page); 1860 cluster_of_pages, mmap_page);
1840 if (ret) { 1861 if (ret && ret != -EAGAIN) {
1841 mlog_errno(ret); 1862 mlog_errno(ret);
1842 goto out_quota; 1863 goto out_quota;
1843 } 1864 }
1844 1865
1866 /*
1867 * ocfs2_grab_pages_for_write() returns -EAGAIN if it could not lock
1868 * the target page. In this case, we exit with no error and no target
1869 * page. This will trigger the caller, page_mkwrite(), to re-try
1870 * the operation.
1871 */
1872 if (ret == -EAGAIN) {
1873 BUG_ON(wc->w_target_page);
1874 ret = 0;
1875 goto out_quota;
1876 }
1877
1845 ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos, 1878 ret = ocfs2_write_cluster_by_desc(mapping, data_ac, meta_ac, wc, pos,
1846 len); 1879 len);
1847 if (ret) { 1880 if (ret) {
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 9a3e6bbff27b..a4e855e3690e 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -216,6 +216,7 @@ struct o2hb_region {
216 216
217 struct list_head hr_all_item; 217 struct list_head hr_all_item;
218 unsigned hr_unclean_stop:1, 218 unsigned hr_unclean_stop:1,
219 hr_aborted_start:1,
219 hr_item_pinned:1, 220 hr_item_pinned:1,
220 hr_item_dropped:1; 221 hr_item_dropped:1;
221 222
@@ -254,6 +255,10 @@ struct o2hb_region {
254 * a more complete api that doesn't lead to this sort of fragility. */ 255 * a more complete api that doesn't lead to this sort of fragility. */
255 atomic_t hr_steady_iterations; 256 atomic_t hr_steady_iterations;
256 257
258 /* terminate o2hb thread if it does not reach steady state
259 * (hr_steady_iterations == 0) within hr_unsteady_iterations */
260 atomic_t hr_unsteady_iterations;
261
257 char hr_dev_name[BDEVNAME_SIZE]; 262 char hr_dev_name[BDEVNAME_SIZE];
258 263
259 unsigned int hr_timeout_ms; 264 unsigned int hr_timeout_ms;
@@ -324,6 +329,10 @@ static void o2hb_write_timeout(struct work_struct *work)
324 329
325static void o2hb_arm_write_timeout(struct o2hb_region *reg) 330static void o2hb_arm_write_timeout(struct o2hb_region *reg)
326{ 331{
332 /* Arm writeout only after thread reaches steady state */
333 if (atomic_read(&reg->hr_steady_iterations) != 0)
334 return;
335
327 mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n", 336 mlog(ML_HEARTBEAT, "Queue write timeout for %u ms\n",
328 O2HB_MAX_WRITE_TIMEOUT_MS); 337 O2HB_MAX_WRITE_TIMEOUT_MS);
329 338
@@ -537,9 +546,14 @@ static int o2hb_verify_crc(struct o2hb_region *reg,
537 return read == computed; 546 return read == computed;
538} 547}
539 548
540/* We want to make sure that nobody is heartbeating on top of us -- 549/*
541 * this will help detect an invalid configuration. */ 550 * Compare the slot data with what we wrote in the last iteration.
542static void o2hb_check_last_timestamp(struct o2hb_region *reg) 551 * If the match fails, print an appropriate error message. This is to
552 * detect errors like... another node hearting on the same slot,
553 * flaky device that is losing writes, etc.
554 * Returns 1 if check succeeds, 0 otherwise.
555 */
556static int o2hb_check_own_slot(struct o2hb_region *reg)
543{ 557{
544 struct o2hb_disk_slot *slot; 558 struct o2hb_disk_slot *slot;
545 struct o2hb_disk_heartbeat_block *hb_block; 559 struct o2hb_disk_heartbeat_block *hb_block;
@@ -548,13 +562,13 @@ static void o2hb_check_last_timestamp(struct o2hb_region *reg)
548 slot = &reg->hr_slots[o2nm_this_node()]; 562 slot = &reg->hr_slots[o2nm_this_node()];
549 /* Don't check on our 1st timestamp */ 563 /* Don't check on our 1st timestamp */
550 if (!slot->ds_last_time) 564 if (!slot->ds_last_time)
551 return; 565 return 0;
552 566
553 hb_block = slot->ds_raw_block; 567 hb_block = slot->ds_raw_block;
554 if (le64_to_cpu(hb_block->hb_seq) == slot->ds_last_time && 568 if (le64_to_cpu(hb_block->hb_seq) == slot->ds_last_time &&
555 le64_to_cpu(hb_block->hb_generation) == slot->ds_last_generation && 569 le64_to_cpu(hb_block->hb_generation) == slot->ds_last_generation &&
556 hb_block->hb_node == slot->ds_node_num) 570 hb_block->hb_node == slot->ds_node_num)
557 return; 571 return 1;
558 572
559#define ERRSTR1 "Another node is heartbeating on device" 573#define ERRSTR1 "Another node is heartbeating on device"
560#define ERRSTR2 "Heartbeat generation mismatch on device" 574#define ERRSTR2 "Heartbeat generation mismatch on device"
@@ -574,6 +588,8 @@ static void o2hb_check_last_timestamp(struct o2hb_region *reg)
574 (unsigned long long)slot->ds_last_time, hb_block->hb_node, 588 (unsigned long long)slot->ds_last_time, hb_block->hb_node,
575 (unsigned long long)le64_to_cpu(hb_block->hb_generation), 589 (unsigned long long)le64_to_cpu(hb_block->hb_generation),
576 (unsigned long long)le64_to_cpu(hb_block->hb_seq)); 590 (unsigned long long)le64_to_cpu(hb_block->hb_seq));
591
592 return 0;
577} 593}
578 594
579static inline void o2hb_prepare_block(struct o2hb_region *reg, 595static inline void o2hb_prepare_block(struct o2hb_region *reg,
@@ -719,17 +735,24 @@ static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
719 o2nm_node_put(node); 735 o2nm_node_put(node);
720} 736}
721 737
722static void o2hb_set_quorum_device(struct o2hb_region *reg, 738static void o2hb_set_quorum_device(struct o2hb_region *reg)
723 struct o2hb_disk_slot *slot)
724{ 739{
725 assert_spin_locked(&o2hb_live_lock);
726
727 if (!o2hb_global_heartbeat_active()) 740 if (!o2hb_global_heartbeat_active())
728 return; 741 return;
729 742
730 if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap)) 743 /* Prevent race with o2hb_heartbeat_group_drop_item() */
744 if (kthread_should_stop())
745 return;
746
747 /* Tag region as quorum only after thread reaches steady state */
748 if (atomic_read(&reg->hr_steady_iterations) != 0)
731 return; 749 return;
732 750
751 spin_lock(&o2hb_live_lock);
752
753 if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
754 goto unlock;
755
733 /* 756 /*
734 * A region can be added to the quorum only when it sees all 757 * A region can be added to the quorum only when it sees all
735 * live nodes heartbeat on it. In other words, the region has been 758 * live nodes heartbeat on it. In other words, the region has been
@@ -737,13 +760,10 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg,
737 */ 760 */
738 if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap, 761 if (memcmp(reg->hr_live_node_bitmap, o2hb_live_node_bitmap,
739 sizeof(o2hb_live_node_bitmap))) 762 sizeof(o2hb_live_node_bitmap)))
740 return; 763 goto unlock;
741
742 if (slot->ds_changed_samples < O2HB_LIVE_THRESHOLD)
743 return;
744 764
745 printk(KERN_NOTICE "o2hb: Region %s is now a quorum device\n", 765 printk(KERN_NOTICE "o2hb: Region %s (%s) is now a quorum device\n",
746 config_item_name(&reg->hr_item)); 766 config_item_name(&reg->hr_item), reg->hr_dev_name);
747 767
748 set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap); 768 set_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
749 769
@@ -754,6 +774,8 @@ static void o2hb_set_quorum_device(struct o2hb_region *reg,
754 if (o2hb_pop_count(&o2hb_quorum_region_bitmap, 774 if (o2hb_pop_count(&o2hb_quorum_region_bitmap,
755 O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF) 775 O2NM_MAX_REGIONS) > O2HB_PIN_CUT_OFF)
756 o2hb_region_unpin(NULL); 776 o2hb_region_unpin(NULL);
777unlock:
778 spin_unlock(&o2hb_live_lock);
757} 779}
758 780
759static int o2hb_check_slot(struct o2hb_region *reg, 781static int o2hb_check_slot(struct o2hb_region *reg,
@@ -925,8 +947,6 @@ fire_callbacks:
925 slot->ds_equal_samples = 0; 947 slot->ds_equal_samples = 0;
926 } 948 }
927out: 949out:
928 o2hb_set_quorum_device(reg, slot);
929
930 spin_unlock(&o2hb_live_lock); 950 spin_unlock(&o2hb_live_lock);
931 951
932 o2hb_run_event_list(&event); 952 o2hb_run_event_list(&event);
@@ -957,7 +977,8 @@ static int o2hb_highest_node(unsigned long *nodes,
957 977
958static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) 978static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
959{ 979{
960 int i, ret, highest_node, change = 0; 980 int i, ret, highest_node;
981 int membership_change = 0, own_slot_ok = 0;
961 unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; 982 unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
962 unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)]; 983 unsigned long live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
963 struct o2hb_bio_wait_ctxt write_wc; 984 struct o2hb_bio_wait_ctxt write_wc;
@@ -966,7 +987,7 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
966 sizeof(configured_nodes)); 987 sizeof(configured_nodes));
967 if (ret) { 988 if (ret) {
968 mlog_errno(ret); 989 mlog_errno(ret);
969 return ret; 990 goto bail;
970 } 991 }
971 992
972 /* 993 /*
@@ -982,8 +1003,9 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
982 1003
983 highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); 1004 highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES);
984 if (highest_node >= O2NM_MAX_NODES) { 1005 if (highest_node >= O2NM_MAX_NODES) {
985 mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n"); 1006 mlog(ML_NOTICE, "o2hb: No configured nodes found!\n");
986 return -EINVAL; 1007 ret = -EINVAL;
1008 goto bail;
987 } 1009 }
988 1010
989 /* No sense in reading the slots of nodes that don't exist 1011 /* No sense in reading the slots of nodes that don't exist
@@ -993,29 +1015,27 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
993 ret = o2hb_read_slots(reg, highest_node + 1); 1015 ret = o2hb_read_slots(reg, highest_node + 1);
994 if (ret < 0) { 1016 if (ret < 0) {
995 mlog_errno(ret); 1017 mlog_errno(ret);
996 return ret; 1018 goto bail;
997 } 1019 }
998 1020
999 /* With an up to date view of the slots, we can check that no 1021 /* With an up to date view of the slots, we can check that no
1000 * other node has been improperly configured to heartbeat in 1022 * other node has been improperly configured to heartbeat in
1001 * our slot. */ 1023 * our slot. */
1002 o2hb_check_last_timestamp(reg); 1024 own_slot_ok = o2hb_check_own_slot(reg);
1003 1025
1004 /* fill in the proper info for our next heartbeat */ 1026 /* fill in the proper info for our next heartbeat */
1005 o2hb_prepare_block(reg, reg->hr_generation); 1027 o2hb_prepare_block(reg, reg->hr_generation);
1006 1028
1007 /* And fire off the write. Note that we don't wait on this I/O
1008 * until later. */
1009 ret = o2hb_issue_node_write(reg, &write_wc); 1029 ret = o2hb_issue_node_write(reg, &write_wc);
1010 if (ret < 0) { 1030 if (ret < 0) {
1011 mlog_errno(ret); 1031 mlog_errno(ret);
1012 return ret; 1032 goto bail;
1013 } 1033 }
1014 1034
1015 i = -1; 1035 i = -1;
1016 while((i = find_next_bit(configured_nodes, 1036 while((i = find_next_bit(configured_nodes,
1017 O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) { 1037 O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {
1018 change |= o2hb_check_slot(reg, &reg->hr_slots[i]); 1038 membership_change |= o2hb_check_slot(reg, &reg->hr_slots[i]);
1019 } 1039 }
1020 1040
1021 /* 1041 /*
@@ -1030,18 +1050,39 @@ static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
1030 * disk */ 1050 * disk */
1031 mlog(ML_ERROR, "Write error %d on device \"%s\"\n", 1051 mlog(ML_ERROR, "Write error %d on device \"%s\"\n",
1032 write_wc.wc_error, reg->hr_dev_name); 1052 write_wc.wc_error, reg->hr_dev_name);
1033 return write_wc.wc_error; 1053 ret = write_wc.wc_error;
1054 goto bail;
1034 } 1055 }
1035 1056
1036 o2hb_arm_write_timeout(reg); 1057 /* Skip disarming the timeout if own slot has stale/bad data */
1058 if (own_slot_ok) {
1059 o2hb_set_quorum_device(reg);
1060 o2hb_arm_write_timeout(reg);
1061 }
1037 1062
1063bail:
1038 /* let the person who launched us know when things are steady */ 1064 /* let the person who launched us know when things are steady */
1039 if (!change && (atomic_read(&reg->hr_steady_iterations) != 0)) { 1065 if (atomic_read(&reg->hr_steady_iterations) != 0) {
1040 if (atomic_dec_and_test(&reg->hr_steady_iterations)) 1066 if (!ret && own_slot_ok && !membership_change) {
1067 if (atomic_dec_and_test(&reg->hr_steady_iterations))
1068 wake_up(&o2hb_steady_queue);
1069 }
1070 }
1071
1072 if (atomic_read(&reg->hr_steady_iterations) != 0) {
1073 if (atomic_dec_and_test(&reg->hr_unsteady_iterations)) {
1074 printk(KERN_NOTICE "o2hb: Unable to stabilize "
1075 "heartbeart on region %s (%s)\n",
1076 config_item_name(&reg->hr_item),
1077 reg->hr_dev_name);
1078 atomic_set(&reg->hr_steady_iterations, 0);
1079 reg->hr_aborted_start = 1;
1041 wake_up(&o2hb_steady_queue); 1080 wake_up(&o2hb_steady_queue);
1081 ret = -EIO;
1082 }
1042 } 1083 }
1043 1084
1044 return 0; 1085 return ret;
1045} 1086}
1046 1087
1047/* Subtract b from a, storing the result in a. a *must* have a larger 1088/* Subtract b from a, storing the result in a. a *must* have a larger
@@ -1095,7 +1136,8 @@ static int o2hb_thread(void *data)
1095 /* Pin node */ 1136 /* Pin node */
1096 o2nm_depend_this_node(); 1137 o2nm_depend_this_node();
1097 1138
1098 while (!kthread_should_stop() && !reg->hr_unclean_stop) { 1139 while (!kthread_should_stop() &&
1140 !reg->hr_unclean_stop && !reg->hr_aborted_start) {
1099 /* We track the time spent inside 1141 /* We track the time spent inside
1100 * o2hb_do_disk_heartbeat so that we avoid more than 1142 * o2hb_do_disk_heartbeat so that we avoid more than
1101 * hr_timeout_ms between disk writes. On busy systems 1143 * hr_timeout_ms between disk writes. On busy systems
@@ -1103,10 +1145,7 @@ static int o2hb_thread(void *data)
1103 * likely to time itself out. */ 1145 * likely to time itself out. */
1104 do_gettimeofday(&before_hb); 1146 do_gettimeofday(&before_hb);
1105 1147
1106 i = 0; 1148 ret = o2hb_do_disk_heartbeat(reg);
1107 do {
1108 ret = o2hb_do_disk_heartbeat(reg);
1109 } while (ret && ++i < 2);
1110 1149
1111 do_gettimeofday(&after_hb); 1150 do_gettimeofday(&after_hb);
1112 elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); 1151 elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);
@@ -1117,7 +1156,8 @@ static int o2hb_thread(void *data)
1117 after_hb.tv_sec, (unsigned long) after_hb.tv_usec, 1156 after_hb.tv_sec, (unsigned long) after_hb.tv_usec,
1118 elapsed_msec); 1157 elapsed_msec);
1119 1158
1120 if (elapsed_msec < reg->hr_timeout_ms) { 1159 if (!kthread_should_stop() &&
1160 elapsed_msec < reg->hr_timeout_ms) {
1121 /* the kthread api has blocked signals for us so no 1161 /* the kthread api has blocked signals for us so no
1122 * need to record the return value. */ 1162 * need to record the return value. */
1123 msleep_interruptible(reg->hr_timeout_ms - elapsed_msec); 1163 msleep_interruptible(reg->hr_timeout_ms - elapsed_msec);
@@ -1134,20 +1174,20 @@ static int o2hb_thread(void *data)
1134 * to timeout on this region when we could just as easily 1174 * to timeout on this region when we could just as easily
1135 * write a clear generation - thus indicating to them that 1175 * write a clear generation - thus indicating to them that
1136 * this node has left this region. 1176 * this node has left this region.
1137 * 1177 */
1138 * XXX: Should we skip this on unclean_stop? */ 1178 if (!reg->hr_unclean_stop && !reg->hr_aborted_start) {
1139 o2hb_prepare_block(reg, 0); 1179 o2hb_prepare_block(reg, 0);
1140 ret = o2hb_issue_node_write(reg, &write_wc); 1180 ret = o2hb_issue_node_write(reg, &write_wc);
1141 if (ret == 0) { 1181 if (ret == 0)
1142 o2hb_wait_on_io(reg, &write_wc); 1182 o2hb_wait_on_io(reg, &write_wc);
1143 } else { 1183 else
1144 mlog_errno(ret); 1184 mlog_errno(ret);
1145 } 1185 }
1146 1186
1147 /* Unpin node */ 1187 /* Unpin node */
1148 o2nm_undepend_this_node(); 1188 o2nm_undepend_this_node();
1149 1189
1150 mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n"); 1190 mlog(ML_HEARTBEAT|ML_KTHREAD, "o2hb thread exiting\n");
1151 1191
1152 return 0; 1192 return 0;
1153} 1193}
@@ -1158,6 +1198,7 @@ static int o2hb_debug_open(struct inode *inode, struct file *file)
1158 struct o2hb_debug_buf *db = inode->i_private; 1198 struct o2hb_debug_buf *db = inode->i_private;
1159 struct o2hb_region *reg; 1199 struct o2hb_region *reg;
1160 unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)]; 1200 unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
1201 unsigned long lts;
1161 char *buf = NULL; 1202 char *buf = NULL;
1162 int i = -1; 1203 int i = -1;
1163 int out = 0; 1204 int out = 0;
@@ -1194,9 +1235,11 @@ static int o2hb_debug_open(struct inode *inode, struct file *file)
1194 1235
1195 case O2HB_DB_TYPE_REGION_ELAPSED_TIME: 1236 case O2HB_DB_TYPE_REGION_ELAPSED_TIME:
1196 reg = (struct o2hb_region *)db->db_data; 1237 reg = (struct o2hb_region *)db->db_data;
1197 out += snprintf(buf + out, PAGE_SIZE - out, "%u\n", 1238 lts = reg->hr_last_timeout_start;
1198 jiffies_to_msecs(jiffies - 1239 /* If 0, it has never been set before */
1199 reg->hr_last_timeout_start)); 1240 if (lts)
1241 lts = jiffies_to_msecs(jiffies - lts);
1242 out += snprintf(buf + out, PAGE_SIZE - out, "%lu\n", lts);
1200 goto done; 1243 goto done;
1201 1244
1202 case O2HB_DB_TYPE_REGION_PINNED: 1245 case O2HB_DB_TYPE_REGION_PINNED:
@@ -1426,6 +1469,8 @@ static void o2hb_region_release(struct config_item *item)
1426 struct page *page; 1469 struct page *page;
1427 struct o2hb_region *reg = to_o2hb_region(item); 1470 struct o2hb_region *reg = to_o2hb_region(item);
1428 1471
1472 mlog(ML_HEARTBEAT, "hb region release (%s)\n", reg->hr_dev_name);
1473
1429 if (reg->hr_tmp_block) 1474 if (reg->hr_tmp_block)
1430 kfree(reg->hr_tmp_block); 1475 kfree(reg->hr_tmp_block);
1431 1476
@@ -1792,7 +1837,10 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1792 live_threshold <<= 1; 1837 live_threshold <<= 1;
1793 spin_unlock(&o2hb_live_lock); 1838 spin_unlock(&o2hb_live_lock);
1794 } 1839 }
1795 atomic_set(&reg->hr_steady_iterations, live_threshold + 1); 1840 ++live_threshold;
1841 atomic_set(&reg->hr_steady_iterations, live_threshold);
1842 /* unsteady_iterations is double the steady_iterations */
1843 atomic_set(&reg->hr_unsteady_iterations, (live_threshold << 1));
1796 1844
1797 hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s", 1845 hb_task = kthread_run(o2hb_thread, reg, "o2hb-%s",
1798 reg->hr_item.ci_name); 1846 reg->hr_item.ci_name);
@@ -1809,14 +1857,12 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1809 ret = wait_event_interruptible(o2hb_steady_queue, 1857 ret = wait_event_interruptible(o2hb_steady_queue,
1810 atomic_read(&reg->hr_steady_iterations) == 0); 1858 atomic_read(&reg->hr_steady_iterations) == 0);
1811 if (ret) { 1859 if (ret) {
1812 /* We got interrupted (hello ptrace!). Clean up */ 1860 atomic_set(&reg->hr_steady_iterations, 0);
1813 spin_lock(&o2hb_live_lock); 1861 reg->hr_aborted_start = 1;
1814 hb_task = reg->hr_task; 1862 }
1815 reg->hr_task = NULL;
1816 spin_unlock(&o2hb_live_lock);
1817 1863
1818 if (hb_task) 1864 if (reg->hr_aborted_start) {
1819 kthread_stop(hb_task); 1865 ret = -EIO;
1820 goto out; 1866 goto out;
1821 } 1867 }
1822 1868
@@ -1833,8 +1879,8 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg,
1833 ret = -EIO; 1879 ret = -EIO;
1834 1880
1835 if (hb_task && o2hb_global_heartbeat_active()) 1881 if (hb_task && o2hb_global_heartbeat_active())
1836 printk(KERN_NOTICE "o2hb: Heartbeat started on region %s\n", 1882 printk(KERN_NOTICE "o2hb: Heartbeat started on region %s (%s)\n",
1837 config_item_name(&reg->hr_item)); 1883 config_item_name(&reg->hr_item), reg->hr_dev_name);
1838 1884
1839out: 1885out:
1840 if (filp) 1886 if (filp)
@@ -2092,13 +2138,6 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
2092 2138
2093 /* stop the thread when the user removes the region dir */ 2139 /* stop the thread when the user removes the region dir */
2094 spin_lock(&o2hb_live_lock); 2140 spin_lock(&o2hb_live_lock);
2095 if (o2hb_global_heartbeat_active()) {
2096 clear_bit(reg->hr_region_num, o2hb_region_bitmap);
2097 clear_bit(reg->hr_region_num, o2hb_live_region_bitmap);
2098 if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
2099 quorum_region = 1;
2100 clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
2101 }
2102 hb_task = reg->hr_task; 2141 hb_task = reg->hr_task;
2103 reg->hr_task = NULL; 2142 reg->hr_task = NULL;
2104 reg->hr_item_dropped = 1; 2143 reg->hr_item_dropped = 1;
@@ -2107,19 +2146,30 @@ static void o2hb_heartbeat_group_drop_item(struct config_group *group,
2107 if (hb_task) 2146 if (hb_task)
2108 kthread_stop(hb_task); 2147 kthread_stop(hb_task);
2109 2148
2149 if (o2hb_global_heartbeat_active()) {
2150 spin_lock(&o2hb_live_lock);
2151 clear_bit(reg->hr_region_num, o2hb_region_bitmap);
2152 clear_bit(reg->hr_region_num, o2hb_live_region_bitmap);
2153 if (test_bit(reg->hr_region_num, o2hb_quorum_region_bitmap))
2154 quorum_region = 1;
2155 clear_bit(reg->hr_region_num, o2hb_quorum_region_bitmap);
2156 spin_unlock(&o2hb_live_lock);
2157 printk(KERN_NOTICE "o2hb: Heartbeat %s on region %s (%s)\n",
2158 ((atomic_read(&reg->hr_steady_iterations) == 0) ?
2159 "stopped" : "start aborted"), config_item_name(item),
2160 reg->hr_dev_name);
2161 }
2162
2110 /* 2163 /*
2111 * If we're racing a dev_write(), we need to wake them. They will 2164 * If we're racing a dev_write(), we need to wake them. They will
2112 * check reg->hr_task 2165 * check reg->hr_task
2113 */ 2166 */
2114 if (atomic_read(&reg->hr_steady_iterations) != 0) { 2167 if (atomic_read(&reg->hr_steady_iterations) != 0) {
2168 reg->hr_aborted_start = 1;
2115 atomic_set(&reg->hr_steady_iterations, 0); 2169 atomic_set(&reg->hr_steady_iterations, 0);
2116 wake_up(&o2hb_steady_queue); 2170 wake_up(&o2hb_steady_queue);
2117 } 2171 }
2118 2172
2119 if (o2hb_global_heartbeat_active())
2120 printk(KERN_NOTICE "o2hb: Heartbeat stopped on region %s\n",
2121 config_item_name(&reg->hr_item));
2122
2123 config_item_put(item); 2173 config_item_put(item);
2124 2174
2125 if (!o2hb_global_heartbeat_active() || !quorum_region) 2175 if (!o2hb_global_heartbeat_active() || !quorum_region)
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index 3a5835904b3d..dc45deb19e68 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -47,6 +47,7 @@
47#define SC_DEBUG_NAME "sock_containers" 47#define SC_DEBUG_NAME "sock_containers"
48#define NST_DEBUG_NAME "send_tracking" 48#define NST_DEBUG_NAME "send_tracking"
49#define STATS_DEBUG_NAME "stats" 49#define STATS_DEBUG_NAME "stats"
50#define NODES_DEBUG_NAME "connected_nodes"
50 51
51#define SHOW_SOCK_CONTAINERS 0 52#define SHOW_SOCK_CONTAINERS 0
52#define SHOW_SOCK_STATS 1 53#define SHOW_SOCK_STATS 1
@@ -55,6 +56,7 @@ static struct dentry *o2net_dentry;
55static struct dentry *sc_dentry; 56static struct dentry *sc_dentry;
56static struct dentry *nst_dentry; 57static struct dentry *nst_dentry;
57static struct dentry *stats_dentry; 58static struct dentry *stats_dentry;
59static struct dentry *nodes_dentry;
58 60
59static DEFINE_SPINLOCK(o2net_debug_lock); 61static DEFINE_SPINLOCK(o2net_debug_lock);
60 62
@@ -491,53 +493,87 @@ static const struct file_operations sc_seq_fops = {
491 .release = sc_fop_release, 493 .release = sc_fop_release,
492}; 494};
493 495
494int o2net_debugfs_init(void) 496static int o2net_fill_bitmap(char *buf, int len)
495{ 497{
496 o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL); 498 unsigned long map[BITS_TO_LONGS(O2NM_MAX_NODES)];
497 if (!o2net_dentry) { 499 int i = -1, out = 0;
498 mlog_errno(-ENOMEM);
499 goto bail;
500 }
501 500
502 nst_dentry = debugfs_create_file(NST_DEBUG_NAME, S_IFREG|S_IRUSR, 501 o2net_fill_node_map(map, sizeof(map));
503 o2net_dentry, NULL,
504 &nst_seq_fops);
505 if (!nst_dentry) {
506 mlog_errno(-ENOMEM);
507 goto bail;
508 }
509 502
510 sc_dentry = debugfs_create_file(SC_DEBUG_NAME, S_IFREG|S_IRUSR, 503 while ((i = find_next_bit(map, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES)
511 o2net_dentry, NULL, 504 out += snprintf(buf + out, PAGE_SIZE - out, "%d ", i);
512 &sc_seq_fops); 505 out += snprintf(buf + out, PAGE_SIZE - out, "\n");
513 if (!sc_dentry) {
514 mlog_errno(-ENOMEM);
515 goto bail;
516 }
517 506
518 stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, S_IFREG|S_IRUSR, 507 return out;
519 o2net_dentry, NULL, 508}
520 &stats_seq_fops); 509
521 if (!stats_dentry) { 510static int nodes_fop_open(struct inode *inode, struct file *file)
522 mlog_errno(-ENOMEM); 511{
523 goto bail; 512 char *buf;
524 } 513
514 buf = kmalloc(PAGE_SIZE, GFP_KERNEL);
515 if (!buf)
516 return -ENOMEM;
517
518 i_size_write(inode, o2net_fill_bitmap(buf, PAGE_SIZE));
519
520 file->private_data = buf;
525 521
526 return 0; 522 return 0;
527bail:
528 debugfs_remove(stats_dentry);
529 debugfs_remove(sc_dentry);
530 debugfs_remove(nst_dentry);
531 debugfs_remove(o2net_dentry);
532 return -ENOMEM;
533} 523}
534 524
525static int o2net_debug_release(struct inode *inode, struct file *file)
526{
527 kfree(file->private_data);
528 return 0;
529}
530
531static ssize_t o2net_debug_read(struct file *file, char __user *buf,
532 size_t nbytes, loff_t *ppos)
533{
534 return simple_read_from_buffer(buf, nbytes, ppos, file->private_data,
535 i_size_read(file->f_mapping->host));
536}
537
538static const struct file_operations nodes_fops = {
539 .open = nodes_fop_open,
540 .release = o2net_debug_release,
541 .read = o2net_debug_read,
542 .llseek = generic_file_llseek,
543};
544
535void o2net_debugfs_exit(void) 545void o2net_debugfs_exit(void)
536{ 546{
547 debugfs_remove(nodes_dentry);
537 debugfs_remove(stats_dentry); 548 debugfs_remove(stats_dentry);
538 debugfs_remove(sc_dentry); 549 debugfs_remove(sc_dentry);
539 debugfs_remove(nst_dentry); 550 debugfs_remove(nst_dentry);
540 debugfs_remove(o2net_dentry); 551 debugfs_remove(o2net_dentry);
541} 552}
542 553
554int o2net_debugfs_init(void)
555{
556 mode_t mode = S_IFREG|S_IRUSR;
557
558 o2net_dentry = debugfs_create_dir(O2NET_DEBUG_DIR, NULL);
559 if (o2net_dentry)
560 nst_dentry = debugfs_create_file(NST_DEBUG_NAME, mode,
561 o2net_dentry, NULL, &nst_seq_fops);
562 if (nst_dentry)
563 sc_dentry = debugfs_create_file(SC_DEBUG_NAME, mode,
564 o2net_dentry, NULL, &sc_seq_fops);
565 if (sc_dentry)
566 stats_dentry = debugfs_create_file(STATS_DEBUG_NAME, mode,
567 o2net_dentry, NULL, &stats_seq_fops);
568 if (stats_dentry)
569 nodes_dentry = debugfs_create_file(NODES_DEBUG_NAME, mode,
570 o2net_dentry, NULL, &nodes_fops);
571 if (nodes_dentry)
572 return 0;
573
574 o2net_debugfs_exit();
575 mlog_errno(-ENOMEM);
576 return -ENOMEM;
577}
578
543#endif /* CONFIG_DEBUG_FS */ 579#endif /* CONFIG_DEBUG_FS */
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index db5ee4b4f47a..ae13d5ca7908 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -545,7 +545,7 @@ static void o2net_set_nn_state(struct o2net_node *nn,
545 } 545 }
546 546
547 if (was_valid && !valid) { 547 if (was_valid && !valid) {
548 printk(KERN_NOTICE "o2net: no longer connected to " 548 printk(KERN_NOTICE "o2net: No longer connected to "
549 SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc)); 549 SC_NODEF_FMT "\n", SC_NODEF_ARGS(old_sc));
550 o2net_complete_nodes_nsw(nn); 550 o2net_complete_nodes_nsw(nn);
551 } 551 }
@@ -555,7 +555,7 @@ static void o2net_set_nn_state(struct o2net_node *nn,
555 cancel_delayed_work(&nn->nn_connect_expired); 555 cancel_delayed_work(&nn->nn_connect_expired);
556 printk(KERN_NOTICE "o2net: %s " SC_NODEF_FMT "\n", 556 printk(KERN_NOTICE "o2net: %s " SC_NODEF_FMT "\n",
557 o2nm_this_node() > sc->sc_node->nd_num ? 557 o2nm_this_node() > sc->sc_node->nd_num ?
558 "connected to" : "accepted connection from", 558 "Connected to" : "Accepted connection from",
559 SC_NODEF_ARGS(sc)); 559 SC_NODEF_ARGS(sc));
560 } 560 }
561 561
@@ -643,7 +643,7 @@ static void o2net_state_change(struct sock *sk)
643 o2net_sc_queue_work(sc, &sc->sc_connect_work); 643 o2net_sc_queue_work(sc, &sc->sc_connect_work);
644 break; 644 break;
645 default: 645 default:
646 printk(KERN_INFO "o2net: connection to " SC_NODEF_FMT 646 printk(KERN_INFO "o2net: Connection to " SC_NODEF_FMT
647 " shutdown, state %d\n", 647 " shutdown, state %d\n",
648 SC_NODEF_ARGS(sc), sk->sk_state); 648 SC_NODEF_ARGS(sc), sk->sk_state);
649 o2net_sc_queue_work(sc, &sc->sc_shutdown_work); 649 o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
@@ -1034,6 +1034,25 @@ static int o2net_tx_can_proceed(struct o2net_node *nn,
1034 return ret; 1034 return ret;
1035} 1035}
1036 1036
1037/* Get a map of all nodes to which this node is currently connected to */
1038void o2net_fill_node_map(unsigned long *map, unsigned bytes)
1039{
1040 struct o2net_sock_container *sc;
1041 int node, ret;
1042
1043 BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long)));
1044
1045 memset(map, 0, bytes);
1046 for (node = 0; node < O2NM_MAX_NODES; ++node) {
1047 o2net_tx_can_proceed(o2net_nn_from_num(node), &sc, &ret);
1048 if (!ret) {
1049 set_bit(node, map);
1050 sc_put(sc);
1051 }
1052 }
1053}
1054EXPORT_SYMBOL_GPL(o2net_fill_node_map);
1055
1037int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec, 1056int o2net_send_message_vec(u32 msg_type, u32 key, struct kvec *caller_vec,
1038 size_t caller_veclen, u8 target_node, int *status) 1057 size_t caller_veclen, u8 target_node, int *status)
1039{ 1058{
@@ -1284,11 +1303,11 @@ static int o2net_check_handshake(struct o2net_sock_container *sc)
1284 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); 1303 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
1285 1304
1286 if (hand->protocol_version != cpu_to_be64(O2NET_PROTOCOL_VERSION)) { 1305 if (hand->protocol_version != cpu_to_be64(O2NET_PROTOCOL_VERSION)) {
1287 mlog(ML_NOTICE, SC_NODEF_FMT " advertised net protocol " 1306 printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " Advertised net "
1288 "version %llu but %llu is required, disconnecting\n", 1307 "protocol version %llu but %llu is required. "
1289 SC_NODEF_ARGS(sc), 1308 "Disconnecting.\n", SC_NODEF_ARGS(sc),
1290 (unsigned long long)be64_to_cpu(hand->protocol_version), 1309 (unsigned long long)be64_to_cpu(hand->protocol_version),
1291 O2NET_PROTOCOL_VERSION); 1310 O2NET_PROTOCOL_VERSION);
1292 1311
1293 /* don't bother reconnecting if its the wrong version. */ 1312 /* don't bother reconnecting if its the wrong version. */
1294 o2net_ensure_shutdown(nn, sc, -ENOTCONN); 1313 o2net_ensure_shutdown(nn, sc, -ENOTCONN);
@@ -1302,33 +1321,33 @@ static int o2net_check_handshake(struct o2net_sock_container *sc)
1302 */ 1321 */
1303 if (be32_to_cpu(hand->o2net_idle_timeout_ms) != 1322 if (be32_to_cpu(hand->o2net_idle_timeout_ms) !=
1304 o2net_idle_timeout()) { 1323 o2net_idle_timeout()) {
1305 mlog(ML_NOTICE, SC_NODEF_FMT " uses a network idle timeout of " 1324 printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a network "
1306 "%u ms, but we use %u ms locally. disconnecting\n", 1325 "idle timeout of %u ms, but we use %u ms locally. "
1307 SC_NODEF_ARGS(sc), 1326 "Disconnecting.\n", SC_NODEF_ARGS(sc),
1308 be32_to_cpu(hand->o2net_idle_timeout_ms), 1327 be32_to_cpu(hand->o2net_idle_timeout_ms),
1309 o2net_idle_timeout()); 1328 o2net_idle_timeout());
1310 o2net_ensure_shutdown(nn, sc, -ENOTCONN); 1329 o2net_ensure_shutdown(nn, sc, -ENOTCONN);
1311 return -1; 1330 return -1;
1312 } 1331 }
1313 1332
1314 if (be32_to_cpu(hand->o2net_keepalive_delay_ms) != 1333 if (be32_to_cpu(hand->o2net_keepalive_delay_ms) !=
1315 o2net_keepalive_delay()) { 1334 o2net_keepalive_delay()) {
1316 mlog(ML_NOTICE, SC_NODEF_FMT " uses a keepalive delay of " 1335 printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a keepalive "
1317 "%u ms, but we use %u ms locally. disconnecting\n", 1336 "delay of %u ms, but we use %u ms locally. "
1318 SC_NODEF_ARGS(sc), 1337 "Disconnecting.\n", SC_NODEF_ARGS(sc),
1319 be32_to_cpu(hand->o2net_keepalive_delay_ms), 1338 be32_to_cpu(hand->o2net_keepalive_delay_ms),
1320 o2net_keepalive_delay()); 1339 o2net_keepalive_delay());
1321 o2net_ensure_shutdown(nn, sc, -ENOTCONN); 1340 o2net_ensure_shutdown(nn, sc, -ENOTCONN);
1322 return -1; 1341 return -1;
1323 } 1342 }
1324 1343
1325 if (be32_to_cpu(hand->o2hb_heartbeat_timeout_ms) != 1344 if (be32_to_cpu(hand->o2hb_heartbeat_timeout_ms) !=
1326 O2HB_MAX_WRITE_TIMEOUT_MS) { 1345 O2HB_MAX_WRITE_TIMEOUT_MS) {
1327 mlog(ML_NOTICE, SC_NODEF_FMT " uses a heartbeat timeout of " 1346 printk(KERN_NOTICE "o2net: " SC_NODEF_FMT " uses a heartbeat "
1328 "%u ms, but we use %u ms locally. disconnecting\n", 1347 "timeout of %u ms, but we use %u ms locally. "
1329 SC_NODEF_ARGS(sc), 1348 "Disconnecting.\n", SC_NODEF_ARGS(sc),
1330 be32_to_cpu(hand->o2hb_heartbeat_timeout_ms), 1349 be32_to_cpu(hand->o2hb_heartbeat_timeout_ms),
1331 O2HB_MAX_WRITE_TIMEOUT_MS); 1350 O2HB_MAX_WRITE_TIMEOUT_MS);
1332 o2net_ensure_shutdown(nn, sc, -ENOTCONN); 1351 o2net_ensure_shutdown(nn, sc, -ENOTCONN);
1333 return -1; 1352 return -1;
1334 } 1353 }
@@ -1539,28 +1558,16 @@ static void o2net_idle_timer(unsigned long data)
1539{ 1558{
1540 struct o2net_sock_container *sc = (struct o2net_sock_container *)data; 1559 struct o2net_sock_container *sc = (struct o2net_sock_container *)data;
1541 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num); 1560 struct o2net_node *nn = o2net_nn_from_num(sc->sc_node->nd_num);
1542
1543#ifdef CONFIG_DEBUG_FS 1561#ifdef CONFIG_DEBUG_FS
1544 ktime_t now = ktime_get(); 1562 unsigned long msecs = ktime_to_ms(ktime_get()) -
1563 ktime_to_ms(sc->sc_tv_timer);
1564#else
1565 unsigned long msecs = o2net_idle_timeout();
1545#endif 1566#endif
1546 1567
1547 printk(KERN_NOTICE "o2net: connection to " SC_NODEF_FMT " has been idle for %u.%u " 1568 printk(KERN_NOTICE "o2net: Connection to " SC_NODEF_FMT " has been "
1548 "seconds, shutting it down.\n", SC_NODEF_ARGS(sc), 1569 "idle for %lu.%lu secs, shutting it down.\n", SC_NODEF_ARGS(sc),
1549 o2net_idle_timeout() / 1000, 1570 msecs / 1000, msecs % 1000);
1550 o2net_idle_timeout() % 1000);
1551
1552#ifdef CONFIG_DEBUG_FS
1553 mlog(ML_NOTICE, "Here are some times that might help debug the "
1554 "situation: (Timer: %lld, Now %lld, DataReady %lld, Advance %lld-%lld, "
1555 "Key 0x%08x, Func %u, FuncTime %lld-%lld)\n",
1556 (long long)ktime_to_us(sc->sc_tv_timer), (long long)ktime_to_us(now),
1557 (long long)ktime_to_us(sc->sc_tv_data_ready),
1558 (long long)ktime_to_us(sc->sc_tv_advance_start),
1559 (long long)ktime_to_us(sc->sc_tv_advance_stop),
1560 sc->sc_msg_key, sc->sc_msg_type,
1561 (long long)ktime_to_us(sc->sc_tv_func_start),
1562 (long long)ktime_to_us(sc->sc_tv_func_stop));
1563#endif
1564 1571
1565 /* 1572 /*
1566 * Initialize the nn_timeout so that the next connection attempt 1573 * Initialize the nn_timeout so that the next connection attempt
@@ -1693,8 +1700,8 @@ static void o2net_start_connect(struct work_struct *work)
1693 1700
1694out: 1701out:
1695 if (ret) { 1702 if (ret) {
1696 mlog(ML_NOTICE, "connect attempt to " SC_NODEF_FMT " failed " 1703 printk(KERN_NOTICE "o2net: Connect attempt to " SC_NODEF_FMT
1697 "with errno %d\n", SC_NODEF_ARGS(sc), ret); 1704 " failed with errno %d\n", SC_NODEF_ARGS(sc), ret);
1698 /* 0 err so that another will be queued and attempted 1705 /* 0 err so that another will be queued and attempted
1699 * from set_nn_state */ 1706 * from set_nn_state */
1700 if (sc) 1707 if (sc)
@@ -1717,8 +1724,8 @@ static void o2net_connect_expired(struct work_struct *work)
1717 1724
1718 spin_lock(&nn->nn_lock); 1725 spin_lock(&nn->nn_lock);
1719 if (!nn->nn_sc_valid) { 1726 if (!nn->nn_sc_valid) {
1720 mlog(ML_ERROR, "no connection established with node %u after " 1727 printk(KERN_NOTICE "o2net: No connection established with "
1721 "%u.%u seconds, giving up and returning errors.\n", 1728 "node %u after %u.%u seconds, giving up.\n",
1722 o2net_num_from_nn(nn), 1729 o2net_num_from_nn(nn),
1723 o2net_idle_timeout() / 1000, 1730 o2net_idle_timeout() / 1000,
1724 o2net_idle_timeout() % 1000); 1731 o2net_idle_timeout() % 1000);
@@ -1861,21 +1868,21 @@ static int o2net_accept_one(struct socket *sock)
1861 1868
1862 node = o2nm_get_node_by_ip(sin.sin_addr.s_addr); 1869 node = o2nm_get_node_by_ip(sin.sin_addr.s_addr);
1863 if (node == NULL) { 1870 if (node == NULL) {
1864 mlog(ML_NOTICE, "attempt to connect from unknown node at %pI4:%d\n", 1871 printk(KERN_NOTICE "o2net: Attempt to connect from unknown "
1865 &sin.sin_addr.s_addr, ntohs(sin.sin_port)); 1872 "node at %pI4:%d\n", &sin.sin_addr.s_addr,
1873 ntohs(sin.sin_port));
1866 ret = -EINVAL; 1874 ret = -EINVAL;
1867 goto out; 1875 goto out;
1868 } 1876 }
1869 1877
1870 if (o2nm_this_node() >= node->nd_num) { 1878 if (o2nm_this_node() >= node->nd_num) {
1871 local_node = o2nm_get_node_by_num(o2nm_this_node()); 1879 local_node = o2nm_get_node_by_num(o2nm_this_node());
1872 mlog(ML_NOTICE, "unexpected connect attempt seen at node '%s' (" 1880 printk(KERN_NOTICE "o2net: Unexpected connect attempt seen "
1873 "%u, %pI4:%d) from node '%s' (%u, %pI4:%d)\n", 1881 "at node '%s' (%u, %pI4:%d) from node '%s' (%u, "
1874 local_node->nd_name, local_node->nd_num, 1882 "%pI4:%d)\n", local_node->nd_name, local_node->nd_num,
1875 &(local_node->nd_ipv4_address), 1883 &(local_node->nd_ipv4_address),
1876 ntohs(local_node->nd_ipv4_port), 1884 ntohs(local_node->nd_ipv4_port), node->nd_name,
1877 node->nd_name, node->nd_num, &sin.sin_addr.s_addr, 1885 node->nd_num, &sin.sin_addr.s_addr, ntohs(sin.sin_port));
1878 ntohs(sin.sin_port));
1879 ret = -EINVAL; 1886 ret = -EINVAL;
1880 goto out; 1887 goto out;
1881 } 1888 }
@@ -1900,10 +1907,10 @@ static int o2net_accept_one(struct socket *sock)
1900 ret = 0; 1907 ret = 0;
1901 spin_unlock(&nn->nn_lock); 1908 spin_unlock(&nn->nn_lock);
1902 if (ret) { 1909 if (ret) {
1903 mlog(ML_NOTICE, "attempt to connect from node '%s' at " 1910 printk(KERN_NOTICE "o2net: Attempt to connect from node '%s' "
1904 "%pI4:%d but it already has an open connection\n", 1911 "at %pI4:%d but it already has an open connection\n",
1905 node->nd_name, &sin.sin_addr.s_addr, 1912 node->nd_name, &sin.sin_addr.s_addr,
1906 ntohs(sin.sin_port)); 1913 ntohs(sin.sin_port));
1907 goto out; 1914 goto out;
1908 } 1915 }
1909 1916
@@ -1983,7 +1990,7 @@ static int o2net_open_listening_sock(__be32 addr, __be16 port)
1983 1990
1984 ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); 1991 ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
1985 if (ret < 0) { 1992 if (ret < 0) {
1986 mlog(ML_ERROR, "unable to create socket, ret=%d\n", ret); 1993 printk(KERN_ERR "o2net: Error %d while creating socket\n", ret);
1987 goto out; 1994 goto out;
1988 } 1995 }
1989 1996
@@ -2000,16 +2007,15 @@ static int o2net_open_listening_sock(__be32 addr, __be16 port)
2000 sock->sk->sk_reuse = 1; 2007 sock->sk->sk_reuse = 1;
2001 ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin)); 2008 ret = sock->ops->bind(sock, (struct sockaddr *)&sin, sizeof(sin));
2002 if (ret < 0) { 2009 if (ret < 0) {
2003 mlog(ML_ERROR, "unable to bind socket at %pI4:%u, " 2010 printk(KERN_ERR "o2net: Error %d while binding socket at "
2004 "ret=%d\n", &addr, ntohs(port), ret); 2011 "%pI4:%u\n", ret, &addr, ntohs(port));
2005 goto out; 2012 goto out;
2006 } 2013 }
2007 2014
2008 ret = sock->ops->listen(sock, 64); 2015 ret = sock->ops->listen(sock, 64);
2009 if (ret < 0) { 2016 if (ret < 0)
2010 mlog(ML_ERROR, "unable to listen on %pI4:%u, ret=%d\n", 2017 printk(KERN_ERR "o2net: Error %d while listening on %pI4:%u\n",
2011 &addr, ntohs(port), ret); 2018 ret, &addr, ntohs(port));
2012 }
2013 2019
2014out: 2020out:
2015 if (ret) { 2021 if (ret) {
diff --git a/fs/ocfs2/cluster/tcp.h b/fs/ocfs2/cluster/tcp.h
index fd6179eb26d4..5bada2a69b50 100644
--- a/fs/ocfs2/cluster/tcp.h
+++ b/fs/ocfs2/cluster/tcp.h
@@ -106,6 +106,8 @@ int o2net_register_handler(u32 msg_type, u32 key, u32 max_len,
106 struct list_head *unreg_list); 106 struct list_head *unreg_list);
107void o2net_unregister_handler_list(struct list_head *list); 107void o2net_unregister_handler_list(struct list_head *list);
108 108
109void o2net_fill_node_map(unsigned long *map, unsigned bytes);
110
109struct o2nm_node; 111struct o2nm_node;
110int o2net_register_hb_callbacks(void); 112int o2net_register_hb_callbacks(void);
111void o2net_unregister_hb_callbacks(void); 113void o2net_unregister_hb_callbacks(void);
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index d602abb51b61..a5952ceecba5 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -859,8 +859,8 @@ void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
859void dlm_wait_for_recovery(struct dlm_ctxt *dlm); 859void dlm_wait_for_recovery(struct dlm_ctxt *dlm);
860void dlm_kick_recovery_thread(struct dlm_ctxt *dlm); 860void dlm_kick_recovery_thread(struct dlm_ctxt *dlm);
861int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node); 861int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node);
862int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout); 862void dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout);
863int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout); 863void dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout);
864 864
865void dlm_put(struct dlm_ctxt *dlm); 865void dlm_put(struct dlm_ctxt *dlm);
866struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm); 866struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm);
@@ -877,9 +877,8 @@ static inline void dlm_lockres_get(struct dlm_lock_resource *res)
877 kref_get(&res->refs); 877 kref_get(&res->refs);
878} 878}
879void dlm_lockres_put(struct dlm_lock_resource *res); 879void dlm_lockres_put(struct dlm_lock_resource *res);
880void __dlm_unhash_lockres(struct dlm_lock_resource *res); 880void __dlm_unhash_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
881void __dlm_insert_lockres(struct dlm_ctxt *dlm, 881void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res);
882 struct dlm_lock_resource *res);
883struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, 882struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm,
884 const char *name, 883 const char *name,
885 unsigned int len, 884 unsigned int len,
@@ -902,46 +901,15 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm,
902 const char *name, 901 const char *name,
903 unsigned int namelen); 902 unsigned int namelen);
904 903
905#define dlm_lockres_set_refmap_bit(bit,res) \ 904void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm,
906 __dlm_lockres_set_refmap_bit(bit,res,__FILE__,__LINE__) 905 struct dlm_lock_resource *res, int bit);
907#define dlm_lockres_clear_refmap_bit(bit,res) \ 906void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm,
908 __dlm_lockres_clear_refmap_bit(bit,res,__FILE__,__LINE__) 907 struct dlm_lock_resource *res, int bit);
909 908
910static inline void __dlm_lockres_set_refmap_bit(int bit, 909void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
911 struct dlm_lock_resource *res, 910 struct dlm_lock_resource *res);
912 const char *file, 911void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
913 int line) 912 struct dlm_lock_resource *res);
914{
915 //printk("%s:%d:%.*s: setting bit %d\n", file, line,
916 // res->lockname.len, res->lockname.name, bit);
917 set_bit(bit, res->refmap);
918}
919
920static inline void __dlm_lockres_clear_refmap_bit(int bit,
921 struct dlm_lock_resource *res,
922 const char *file,
923 int line)
924{
925 //printk("%s:%d:%.*s: clearing bit %d\n", file, line,
926 // res->lockname.len, res->lockname.name, bit);
927 clear_bit(bit, res->refmap);
928}
929
930void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
931 struct dlm_lock_resource *res,
932 const char *file,
933 int line);
934void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
935 struct dlm_lock_resource *res,
936 int new_lockres,
937 const char *file,
938 int line);
939#define dlm_lockres_drop_inflight_ref(d,r) \
940 __dlm_lockres_drop_inflight_ref(d,r,__FILE__,__LINE__)
941#define dlm_lockres_grab_inflight_ref(d,r) \
942 __dlm_lockres_grab_inflight_ref(d,r,0,__FILE__,__LINE__)
943#define dlm_lockres_grab_inflight_ref_new(d,r) \
944 __dlm_lockres_grab_inflight_ref(d,r,1,__FILE__,__LINE__)
945 913
946void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock); 914void dlm_queue_ast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
947void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock); 915void dlm_queue_bast(struct dlm_ctxt *dlm, struct dlm_lock *lock);
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index 6ed6b95dcf93..92f2ead0fab6 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -157,16 +157,18 @@ static int dlm_protocol_compare(struct dlm_protocol_version *existing,
157 157
158static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm); 158static void dlm_unregister_domain_handlers(struct dlm_ctxt *dlm);
159 159
160void __dlm_unhash_lockres(struct dlm_lock_resource *lockres) 160void __dlm_unhash_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
161{ 161{
162 if (!hlist_unhashed(&lockres->hash_node)) { 162 if (hlist_unhashed(&res->hash_node))
163 hlist_del_init(&lockres->hash_node); 163 return;
164 dlm_lockres_put(lockres); 164
165 } 165 mlog(0, "%s: Unhash res %.*s\n", dlm->name, res->lockname.len,
166 res->lockname.name);
167 hlist_del_init(&res->hash_node);
168 dlm_lockres_put(res);
166} 169}
167 170
168void __dlm_insert_lockres(struct dlm_ctxt *dlm, 171void __dlm_insert_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
169 struct dlm_lock_resource *res)
170{ 172{
171 struct hlist_head *bucket; 173 struct hlist_head *bucket;
172 struct qstr *q; 174 struct qstr *q;
@@ -180,6 +182,9 @@ void __dlm_insert_lockres(struct dlm_ctxt *dlm,
180 dlm_lockres_get(res); 182 dlm_lockres_get(res);
181 183
182 hlist_add_head(&res->hash_node, bucket); 184 hlist_add_head(&res->hash_node, bucket);
185
186 mlog(0, "%s: Hash res %.*s\n", dlm->name, res->lockname.len,
187 res->lockname.name);
183} 188}
184 189
185struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm, 190struct dlm_lock_resource * __dlm_lookup_lockres_full(struct dlm_ctxt *dlm,
@@ -539,17 +544,17 @@ again:
539 544
540static void __dlm_print_nodes(struct dlm_ctxt *dlm) 545static void __dlm_print_nodes(struct dlm_ctxt *dlm)
541{ 546{
542 int node = -1; 547 int node = -1, num = 0;
543 548
544 assert_spin_locked(&dlm->spinlock); 549 assert_spin_locked(&dlm->spinlock);
545 550
546 printk(KERN_NOTICE "o2dlm: Nodes in domain %s: ", dlm->name); 551 printk("( ");
547
548 while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES, 552 while ((node = find_next_bit(dlm->domain_map, O2NM_MAX_NODES,
549 node + 1)) < O2NM_MAX_NODES) { 553 node + 1)) < O2NM_MAX_NODES) {
550 printk("%d ", node); 554 printk("%d ", node);
555 ++num;
551 } 556 }
552 printk("\n"); 557 printk(") %u nodes\n", num);
553} 558}
554 559
555static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data, 560static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
@@ -566,11 +571,10 @@ static int dlm_exit_domain_handler(struct o2net_msg *msg, u32 len, void *data,
566 571
567 node = exit_msg->node_idx; 572 node = exit_msg->node_idx;
568 573
569 printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s\n", node, dlm->name);
570
571 spin_lock(&dlm->spinlock); 574 spin_lock(&dlm->spinlock);
572 clear_bit(node, dlm->domain_map); 575 clear_bit(node, dlm->domain_map);
573 clear_bit(node, dlm->exit_domain_map); 576 clear_bit(node, dlm->exit_domain_map);
577 printk(KERN_NOTICE "o2dlm: Node %u leaves domain %s ", node, dlm->name);
574 __dlm_print_nodes(dlm); 578 __dlm_print_nodes(dlm);
575 579
576 /* notify anything attached to the heartbeat events */ 580 /* notify anything attached to the heartbeat events */
@@ -755,6 +759,7 @@ void dlm_unregister_domain(struct dlm_ctxt *dlm)
755 759
756 dlm_mark_domain_leaving(dlm); 760 dlm_mark_domain_leaving(dlm);
757 dlm_leave_domain(dlm); 761 dlm_leave_domain(dlm);
762 printk(KERN_NOTICE "o2dlm: Leaving domain %s\n", dlm->name);
758 dlm_force_free_mles(dlm); 763 dlm_force_free_mles(dlm);
759 dlm_complete_dlm_shutdown(dlm); 764 dlm_complete_dlm_shutdown(dlm);
760 } 765 }
@@ -970,7 +975,7 @@ static int dlm_assert_joined_handler(struct o2net_msg *msg, u32 len, void *data,
970 clear_bit(assert->node_idx, dlm->exit_domain_map); 975 clear_bit(assert->node_idx, dlm->exit_domain_map);
971 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); 976 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
972 977
973 printk(KERN_NOTICE "o2dlm: Node %u joins domain %s\n", 978 printk(KERN_NOTICE "o2dlm: Node %u joins domain %s ",
974 assert->node_idx, dlm->name); 979 assert->node_idx, dlm->name);
975 __dlm_print_nodes(dlm); 980 __dlm_print_nodes(dlm);
976 981
@@ -1701,8 +1706,10 @@ static int dlm_try_to_join_domain(struct dlm_ctxt *dlm)
1701bail: 1706bail:
1702 spin_lock(&dlm->spinlock); 1707 spin_lock(&dlm->spinlock);
1703 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN); 1708 __dlm_set_joining_node(dlm, DLM_LOCK_RES_OWNER_UNKNOWN);
1704 if (!status) 1709 if (!status) {
1710 printk(KERN_NOTICE "o2dlm: Joining domain %s ", dlm->name);
1705 __dlm_print_nodes(dlm); 1711 __dlm_print_nodes(dlm);
1712 }
1706 spin_unlock(&dlm->spinlock); 1713 spin_unlock(&dlm->spinlock);
1707 1714
1708 if (ctxt) { 1715 if (ctxt) {
@@ -2131,13 +2138,6 @@ struct dlm_ctxt * dlm_register_domain(const char *domain,
2131 goto leave; 2138 goto leave;
2132 } 2139 }
2133 2140
2134 if (!o2hb_check_local_node_heartbeating()) {
2135 mlog(ML_ERROR, "the local node has not been configured, or is "
2136 "not heartbeating\n");
2137 ret = -EPROTO;
2138 goto leave;
2139 }
2140
2141 mlog(0, "register called for domain \"%s\"\n", domain); 2141 mlog(0, "register called for domain \"%s\"\n", domain);
2142 2142
2143retry: 2143retry:
diff --git a/fs/ocfs2/dlm/dlmlock.c b/fs/ocfs2/dlm/dlmlock.c
index 8d39e0fd66f7..f32fcba04923 100644
--- a/fs/ocfs2/dlm/dlmlock.c
+++ b/fs/ocfs2/dlm/dlmlock.c
@@ -183,10 +183,6 @@ static enum dlm_status dlmlock_master(struct dlm_ctxt *dlm,
183 kick_thread = 1; 183 kick_thread = 1;
184 } 184 }
185 } 185 }
186 /* reduce the inflight count, this may result in the lockres
187 * being purged below during calc_usage */
188 if (lock->ml.node == dlm->node_num)
189 dlm_lockres_drop_inflight_ref(dlm, res);
190 186
191 spin_unlock(&res->spinlock); 187 spin_unlock(&res->spinlock);
192 wake_up(&res->wq); 188 wake_up(&res->wq);
@@ -231,10 +227,16 @@ static enum dlm_status dlmlock_remote(struct dlm_ctxt *dlm,
231 lock->ml.type, res->lockname.len, 227 lock->ml.type, res->lockname.len,
232 res->lockname.name, flags); 228 res->lockname.name, flags);
233 229
230 /*
231 * Wait if resource is getting recovered, remastered, etc.
232 * If the resource was remastered and new owner is self, then exit.
233 */
234 spin_lock(&res->spinlock); 234 spin_lock(&res->spinlock);
235
236 /* will exit this call with spinlock held */
237 __dlm_wait_on_lockres(res); 235 __dlm_wait_on_lockres(res);
236 if (res->owner == dlm->node_num) {
237 spin_unlock(&res->spinlock);
238 return DLM_RECOVERING;
239 }
238 res->state |= DLM_LOCK_RES_IN_PROGRESS; 240 res->state |= DLM_LOCK_RES_IN_PROGRESS;
239 241
240 /* add lock to local (secondary) queue */ 242 /* add lock to local (secondary) queue */
@@ -319,27 +321,23 @@ static enum dlm_status dlm_send_remote_lock_request(struct dlm_ctxt *dlm,
319 tmpret = o2net_send_message(DLM_CREATE_LOCK_MSG, dlm->key, &create, 321 tmpret = o2net_send_message(DLM_CREATE_LOCK_MSG, dlm->key, &create,
320 sizeof(create), res->owner, &status); 322 sizeof(create), res->owner, &status);
321 if (tmpret >= 0) { 323 if (tmpret >= 0) {
322 // successfully sent and received 324 ret = status;
323 ret = status; // this is already a dlm_status
324 if (ret == DLM_REJECTED) { 325 if (ret == DLM_REJECTED) {
325 mlog(ML_ERROR, "%s:%.*s: BUG. this is a stale lockres " 326 mlog(ML_ERROR, "%s: res %.*s, Stale lockres no longer "
326 "no longer owned by %u. that node is coming back " 327 "owned by node %u. That node is coming back up "
327 "up currently.\n", dlm->name, create.namelen, 328 "currently.\n", dlm->name, create.namelen,
328 create.name, res->owner); 329 create.name, res->owner);
329 dlm_print_one_lock_resource(res); 330 dlm_print_one_lock_resource(res);
330 BUG(); 331 BUG();
331 } 332 }
332 } else { 333 } else {
333 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " 334 mlog(ML_ERROR, "%s: res %.*s, Error %d send CREATE LOCK to "
334 "node %u\n", tmpret, DLM_CREATE_LOCK_MSG, dlm->key, 335 "node %u\n", dlm->name, create.namelen, create.name,
335 res->owner); 336 tmpret, res->owner);
336 if (dlm_is_host_down(tmpret)) { 337 if (dlm_is_host_down(tmpret))
337 ret = DLM_RECOVERING; 338 ret = DLM_RECOVERING;
338 mlog(0, "node %u died so returning DLM_RECOVERING " 339 else
339 "from lock message!\n", res->owner);
340 } else {
341 ret = dlm_err_to_dlm_status(tmpret); 340 ret = dlm_err_to_dlm_status(tmpret);
342 }
343 } 341 }
344 342
345 return ret; 343 return ret;
@@ -718,18 +716,10 @@ retry_lock:
718 716
719 if (status == DLM_RECOVERING || status == DLM_MIGRATING || 717 if (status == DLM_RECOVERING || status == DLM_MIGRATING ||
720 status == DLM_FORWARD) { 718 status == DLM_FORWARD) {
721 mlog(0, "retrying lock with migration/"
722 "recovery/in progress\n");
723 msleep(100); 719 msleep(100);
724 /* no waiting for dlm_reco_thread */
725 if (recovery) { 720 if (recovery) {
726 if (status != DLM_RECOVERING) 721 if (status != DLM_RECOVERING)
727 goto retry_lock; 722 goto retry_lock;
728
729 mlog(0, "%s: got RECOVERING "
730 "for $RECOVERY lock, master "
731 "was %u\n", dlm->name,
732 res->owner);
733 /* wait to see the node go down, then 723 /* wait to see the node go down, then
734 * drop down and allow the lockres to 724 * drop down and allow the lockres to
735 * get cleaned up. need to remaster. */ 725 * get cleaned up. need to remaster. */
@@ -741,6 +731,14 @@ retry_lock:
741 } 731 }
742 } 732 }
743 733
734 /* Inflight taken in dlm_get_lock_resource() is dropped here */
735 spin_lock(&res->spinlock);
736 dlm_lockres_drop_inflight_ref(dlm, res);
737 spin_unlock(&res->spinlock);
738
739 dlm_lockres_calc_usage(dlm, res);
740 dlm_kick_thread(dlm, res);
741
744 if (status != DLM_NORMAL) { 742 if (status != DLM_NORMAL) {
745 lock->lksb->flags &= ~DLM_LKSB_GET_LVB; 743 lock->lksb->flags &= ~DLM_LKSB_GET_LVB;
746 if (status != DLM_NOTQUEUED) 744 if (status != DLM_NOTQUEUED)
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 11eefb8c12e9..005261c333b0 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -631,39 +631,54 @@ error:
631 return NULL; 631 return NULL;
632} 632}
633 633
634void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, 634void dlm_lockres_set_refmap_bit(struct dlm_ctxt *dlm,
635 struct dlm_lock_resource *res, 635 struct dlm_lock_resource *res, int bit)
636 int new_lockres,
637 const char *file,
638 int line)
639{ 636{
640 if (!new_lockres) 637 assert_spin_locked(&res->spinlock);
641 assert_spin_locked(&res->spinlock); 638
639 mlog(0, "res %.*s, set node %u, %ps()\n", res->lockname.len,
640 res->lockname.name, bit, __builtin_return_address(0));
641
642 set_bit(bit, res->refmap);
643}
644
645void dlm_lockres_clear_refmap_bit(struct dlm_ctxt *dlm,
646 struct dlm_lock_resource *res, int bit)
647{
648 assert_spin_locked(&res->spinlock);
649
650 mlog(0, "res %.*s, clr node %u, %ps()\n", res->lockname.len,
651 res->lockname.name, bit, __builtin_return_address(0));
652
653 clear_bit(bit, res->refmap);
654}
655
656
657void dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm,
658 struct dlm_lock_resource *res)
659{
660 assert_spin_locked(&res->spinlock);
642 661
643 if (!test_bit(dlm->node_num, res->refmap)) {
644 BUG_ON(res->inflight_locks != 0);
645 dlm_lockres_set_refmap_bit(dlm->node_num, res);
646 }
647 res->inflight_locks++; 662 res->inflight_locks++;
648 mlog(0, "%s:%.*s: inflight++: now %u\n", 663
649 dlm->name, res->lockname.len, res->lockname.name, 664 mlog(0, "%s: res %.*s, inflight++: now %u, %ps()\n", dlm->name,
650 res->inflight_locks); 665 res->lockname.len, res->lockname.name, res->inflight_locks,
666 __builtin_return_address(0));
651} 667}
652 668
653void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, 669void dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm,
654 struct dlm_lock_resource *res, 670 struct dlm_lock_resource *res)
655 const char *file,
656 int line)
657{ 671{
658 assert_spin_locked(&res->spinlock); 672 assert_spin_locked(&res->spinlock);
659 673
660 BUG_ON(res->inflight_locks == 0); 674 BUG_ON(res->inflight_locks == 0);
675
661 res->inflight_locks--; 676 res->inflight_locks--;
662 mlog(0, "%s:%.*s: inflight--: now %u\n", 677
663 dlm->name, res->lockname.len, res->lockname.name, 678 mlog(0, "%s: res %.*s, inflight--: now %u, %ps()\n", dlm->name,
664 res->inflight_locks); 679 res->lockname.len, res->lockname.name, res->inflight_locks,
665 if (res->inflight_locks == 0) 680 __builtin_return_address(0));
666 dlm_lockres_clear_refmap_bit(dlm->node_num, res); 681
667 wake_up(&res->wq); 682 wake_up(&res->wq);
668} 683}
669 684
@@ -697,7 +712,6 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm,
697 unsigned int hash; 712 unsigned int hash;
698 int tries = 0; 713 int tries = 0;
699 int bit, wait_on_recovery = 0; 714 int bit, wait_on_recovery = 0;
700 int drop_inflight_if_nonlocal = 0;
701 715
702 BUG_ON(!lockid); 716 BUG_ON(!lockid);
703 717
@@ -709,36 +723,33 @@ lookup:
709 spin_lock(&dlm->spinlock); 723 spin_lock(&dlm->spinlock);
710 tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash); 724 tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash);
711 if (tmpres) { 725 if (tmpres) {
712 int dropping_ref = 0;
713
714 spin_unlock(&dlm->spinlock); 726 spin_unlock(&dlm->spinlock);
715
716 spin_lock(&tmpres->spinlock); 727 spin_lock(&tmpres->spinlock);
717 /* We wait for the other thread that is mastering the resource */ 728 /* Wait on the thread that is mastering the resource */
718 if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) { 729 if (tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN) {
719 __dlm_wait_on_lockres(tmpres); 730 __dlm_wait_on_lockres(tmpres);
720 BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN); 731 BUG_ON(tmpres->owner == DLM_LOCK_RES_OWNER_UNKNOWN);
732 spin_unlock(&tmpres->spinlock);
733 dlm_lockres_put(tmpres);
734 tmpres = NULL;
735 goto lookup;
721 } 736 }
722 737
723 if (tmpres->owner == dlm->node_num) { 738 /* Wait on the resource purge to complete before continuing */
724 BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF); 739 if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) {
725 dlm_lockres_grab_inflight_ref(dlm, tmpres); 740 BUG_ON(tmpres->owner == dlm->node_num);
726 } else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) 741 __dlm_wait_on_lockres_flags(tmpres,
727 dropping_ref = 1; 742 DLM_LOCK_RES_DROPPING_REF);
728 spin_unlock(&tmpres->spinlock);
729
730 /* wait until done messaging the master, drop our ref to allow
731 * the lockres to be purged, start over. */
732 if (dropping_ref) {
733 spin_lock(&tmpres->spinlock);
734 __dlm_wait_on_lockres_flags(tmpres, DLM_LOCK_RES_DROPPING_REF);
735 spin_unlock(&tmpres->spinlock); 743 spin_unlock(&tmpres->spinlock);
736 dlm_lockres_put(tmpres); 744 dlm_lockres_put(tmpres);
737 tmpres = NULL; 745 tmpres = NULL;
738 goto lookup; 746 goto lookup;
739 } 747 }
740 748
741 mlog(0, "found in hash!\n"); 749 /* Grab inflight ref to pin the resource */
750 dlm_lockres_grab_inflight_ref(dlm, tmpres);
751
752 spin_unlock(&tmpres->spinlock);
742 if (res) 753 if (res)
743 dlm_lockres_put(res); 754 dlm_lockres_put(res);
744 res = tmpres; 755 res = tmpres;
@@ -829,8 +840,8 @@ lookup:
829 * but they might own this lockres. wait on them. */ 840 * but they might own this lockres. wait on them. */
830 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); 841 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
831 if (bit < O2NM_MAX_NODES) { 842 if (bit < O2NM_MAX_NODES) {
832 mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to " 843 mlog(0, "%s: res %.*s, At least one node (%d) "
833 "recover before lock mastery can begin\n", 844 "to recover before lock mastery can begin\n",
834 dlm->name, namelen, (char *)lockid, bit); 845 dlm->name, namelen, (char *)lockid, bit);
835 wait_on_recovery = 1; 846 wait_on_recovery = 1;
836 } 847 }
@@ -843,12 +854,11 @@ lookup:
843 854
844 /* finally add the lockres to its hash bucket */ 855 /* finally add the lockres to its hash bucket */
845 __dlm_insert_lockres(dlm, res); 856 __dlm_insert_lockres(dlm, res);
846 /* since this lockres is new it doesn't not require the spinlock */
847 dlm_lockres_grab_inflight_ref_new(dlm, res);
848 857
849 /* if this node does not become the master make sure to drop 858 /* Grab inflight ref to pin the resource */
850 * this inflight reference below */ 859 spin_lock(&res->spinlock);
851 drop_inflight_if_nonlocal = 1; 860 dlm_lockres_grab_inflight_ref(dlm, res);
861 spin_unlock(&res->spinlock);
852 862
853 /* get an extra ref on the mle in case this is a BLOCK 863 /* get an extra ref on the mle in case this is a BLOCK
854 * if so, the creator of the BLOCK may try to put the last 864 * if so, the creator of the BLOCK may try to put the last
@@ -864,8 +874,8 @@ redo_request:
864 * dlm spinlock would be detectable be a change on the mle, 874 * dlm spinlock would be detectable be a change on the mle,
865 * so we only need to clear out the recovery map once. */ 875 * so we only need to clear out the recovery map once. */
866 if (dlm_is_recovery_lock(lockid, namelen)) { 876 if (dlm_is_recovery_lock(lockid, namelen)) {
867 mlog(ML_NOTICE, "%s: recovery map is not empty, but " 877 mlog(0, "%s: Recovery map is not empty, but must "
868 "must master $RECOVERY lock now\n", dlm->name); 878 "master $RECOVERY lock now\n", dlm->name);
869 if (!dlm_pre_master_reco_lockres(dlm, res)) 879 if (!dlm_pre_master_reco_lockres(dlm, res))
870 wait_on_recovery = 0; 880 wait_on_recovery = 0;
871 else { 881 else {
@@ -883,8 +893,8 @@ redo_request:
883 spin_lock(&dlm->spinlock); 893 spin_lock(&dlm->spinlock);
884 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); 894 bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0);
885 if (bit < O2NM_MAX_NODES) { 895 if (bit < O2NM_MAX_NODES) {
886 mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to " 896 mlog(0, "%s: res %.*s, At least one node (%d) "
887 "recover before lock mastery can begin\n", 897 "to recover before lock mastery can begin\n",
888 dlm->name, namelen, (char *)lockid, bit); 898 dlm->name, namelen, (char *)lockid, bit);
889 wait_on_recovery = 1; 899 wait_on_recovery = 1;
890 } else 900 } else
@@ -913,8 +923,8 @@ redo_request:
913 * yet, keep going until it does. this is how the 923 * yet, keep going until it does. this is how the
914 * master will know that asserts are needed back to 924 * master will know that asserts are needed back to
915 * the lower nodes. */ 925 * the lower nodes. */
916 mlog(0, "%s:%.*s: requests only up to %u but master " 926 mlog(0, "%s: res %.*s, Requests only up to %u but "
917 "is %u, keep going\n", dlm->name, namelen, 927 "master is %u, keep going\n", dlm->name, namelen,
918 lockid, nodenum, mle->master); 928 lockid, nodenum, mle->master);
919 } 929 }
920 } 930 }
@@ -924,13 +934,12 @@ wait:
924 ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked); 934 ret = dlm_wait_for_lock_mastery(dlm, res, mle, &blocked);
925 if (ret < 0) { 935 if (ret < 0) {
926 wait_on_recovery = 1; 936 wait_on_recovery = 1;
927 mlog(0, "%s:%.*s: node map changed, redo the " 937 mlog(0, "%s: res %.*s, Node map changed, redo the master "
928 "master request now, blocked=%d\n", 938 "request now, blocked=%d\n", dlm->name, res->lockname.len,
929 dlm->name, res->lockname.len,
930 res->lockname.name, blocked); 939 res->lockname.name, blocked);
931 if (++tries > 20) { 940 if (++tries > 20) {
932 mlog(ML_ERROR, "%s:%.*s: spinning on " 941 mlog(ML_ERROR, "%s: res %.*s, Spinning on "
933 "dlm_wait_for_lock_mastery, blocked=%d\n", 942 "dlm_wait_for_lock_mastery, blocked = %d\n",
934 dlm->name, res->lockname.len, 943 dlm->name, res->lockname.len,
935 res->lockname.name, blocked); 944 res->lockname.name, blocked);
936 dlm_print_one_lock_resource(res); 945 dlm_print_one_lock_resource(res);
@@ -940,7 +949,8 @@ wait:
940 goto redo_request; 949 goto redo_request;
941 } 950 }
942 951
943 mlog(0, "lockres mastered by %u\n", res->owner); 952 mlog(0, "%s: res %.*s, Mastered by %u\n", dlm->name, res->lockname.len,
953 res->lockname.name, res->owner);
944 /* make sure we never continue without this */ 954 /* make sure we never continue without this */
945 BUG_ON(res->owner == O2NM_MAX_NODES); 955 BUG_ON(res->owner == O2NM_MAX_NODES);
946 956
@@ -952,8 +962,6 @@ wait:
952 962
953wake_waiters: 963wake_waiters:
954 spin_lock(&res->spinlock); 964 spin_lock(&res->spinlock);
955 if (res->owner != dlm->node_num && drop_inflight_if_nonlocal)
956 dlm_lockres_drop_inflight_ref(dlm, res);
957 res->state &= ~DLM_LOCK_RES_IN_PROGRESS; 965 res->state &= ~DLM_LOCK_RES_IN_PROGRESS;
958 spin_unlock(&res->spinlock); 966 spin_unlock(&res->spinlock);
959 wake_up(&res->wq); 967 wake_up(&res->wq);
@@ -1426,9 +1434,7 @@ way_up_top:
1426 } 1434 }
1427 1435
1428 if (res->owner == dlm->node_num) { 1436 if (res->owner == dlm->node_num) {
1429 mlog(0, "%s:%.*s: setting bit %u in refmap\n", 1437 dlm_lockres_set_refmap_bit(dlm, res, request->node_idx);
1430 dlm->name, namelen, name, request->node_idx);
1431 dlm_lockres_set_refmap_bit(request->node_idx, res);
1432 spin_unlock(&res->spinlock); 1438 spin_unlock(&res->spinlock);
1433 response = DLM_MASTER_RESP_YES; 1439 response = DLM_MASTER_RESP_YES;
1434 if (mle) 1440 if (mle)
@@ -1493,10 +1499,8 @@ way_up_top:
1493 * go back and clean the mles on any 1499 * go back and clean the mles on any
1494 * other nodes */ 1500 * other nodes */
1495 dispatch_assert = 1; 1501 dispatch_assert = 1;
1496 dlm_lockres_set_refmap_bit(request->node_idx, res); 1502 dlm_lockres_set_refmap_bit(dlm, res,
1497 mlog(0, "%s:%.*s: setting bit %u in refmap\n", 1503 request->node_idx);
1498 dlm->name, namelen, name,
1499 request->node_idx);
1500 } else 1504 } else
1501 response = DLM_MASTER_RESP_NO; 1505 response = DLM_MASTER_RESP_NO;
1502 } else { 1506 } else {
@@ -1702,7 +1706,7 @@ again:
1702 "lockres, set the bit in the refmap\n", 1706 "lockres, set the bit in the refmap\n",
1703 namelen, lockname, to); 1707 namelen, lockname, to);
1704 spin_lock(&res->spinlock); 1708 spin_lock(&res->spinlock);
1705 dlm_lockres_set_refmap_bit(to, res); 1709 dlm_lockres_set_refmap_bit(dlm, res, to);
1706 spin_unlock(&res->spinlock); 1710 spin_unlock(&res->spinlock);
1707 } 1711 }
1708 } 1712 }
@@ -2187,8 +2191,6 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2187 namelen = res->lockname.len; 2191 namelen = res->lockname.len;
2188 BUG_ON(namelen > O2NM_MAX_NAME_LEN); 2192 BUG_ON(namelen > O2NM_MAX_NAME_LEN);
2189 2193
2190 mlog(0, "%s:%.*s: sending deref to %d\n",
2191 dlm->name, namelen, lockname, res->owner);
2192 memset(&deref, 0, sizeof(deref)); 2194 memset(&deref, 0, sizeof(deref));
2193 deref.node_idx = dlm->node_num; 2195 deref.node_idx = dlm->node_num;
2194 deref.namelen = namelen; 2196 deref.namelen = namelen;
@@ -2197,14 +2199,12 @@ int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res)
2197 ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key, 2199 ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key,
2198 &deref, sizeof(deref), res->owner, &r); 2200 &deref, sizeof(deref), res->owner, &r);
2199 if (ret < 0) 2201 if (ret < 0)
2200 mlog(ML_ERROR, "Error %d when sending message %u (key 0x%x) to " 2202 mlog(ML_ERROR, "%s: res %.*s, error %d send DEREF to node %u\n",
2201 "node %u\n", ret, DLM_DEREF_LOCKRES_MSG, dlm->key, 2203 dlm->name, namelen, lockname, ret, res->owner);
2202 res->owner);
2203 else if (r < 0) { 2204 else if (r < 0) {
2204 /* BAD. other node says I did not have a ref. */ 2205 /* BAD. other node says I did not have a ref. */
2205 mlog(ML_ERROR,"while dropping ref on %s:%.*s " 2206 mlog(ML_ERROR, "%s: res %.*s, DEREF to node %u got %d\n",
2206 "(master=%u) got %d.\n", dlm->name, namelen, 2207 dlm->name, namelen, lockname, res->owner, r);
2207 lockname, res->owner, r);
2208 dlm_print_one_lock_resource(res); 2208 dlm_print_one_lock_resource(res);
2209 BUG(); 2209 BUG();
2210 } 2210 }
@@ -2260,7 +2260,7 @@ int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data,
2260 else { 2260 else {
2261 BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); 2261 BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
2262 if (test_bit(node, res->refmap)) { 2262 if (test_bit(node, res->refmap)) {
2263 dlm_lockres_clear_refmap_bit(node, res); 2263 dlm_lockres_clear_refmap_bit(dlm, res, node);
2264 cleared = 1; 2264 cleared = 1;
2265 } 2265 }
2266 } 2266 }
@@ -2320,7 +2320,7 @@ static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data)
2320 BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); 2320 BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF);
2321 if (test_bit(node, res->refmap)) { 2321 if (test_bit(node, res->refmap)) {
2322 __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); 2322 __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG);
2323 dlm_lockres_clear_refmap_bit(node, res); 2323 dlm_lockres_clear_refmap_bit(dlm, res, node);
2324 cleared = 1; 2324 cleared = 1;
2325 } 2325 }
2326 spin_unlock(&res->spinlock); 2326 spin_unlock(&res->spinlock);
@@ -2802,7 +2802,8 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
2802 BUG_ON(!list_empty(&lock->bast_list)); 2802 BUG_ON(!list_empty(&lock->bast_list));
2803 BUG_ON(lock->ast_pending); 2803 BUG_ON(lock->ast_pending);
2804 BUG_ON(lock->bast_pending); 2804 BUG_ON(lock->bast_pending);
2805 dlm_lockres_clear_refmap_bit(lock->ml.node, res); 2805 dlm_lockres_clear_refmap_bit(dlm, res,
2806 lock->ml.node);
2806 list_del_init(&lock->list); 2807 list_del_init(&lock->list);
2807 dlm_lock_put(lock); 2808 dlm_lock_put(lock);
2808 /* In a normal unlock, we would have added a 2809 /* In a normal unlock, we would have added a
@@ -2823,7 +2824,7 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm,
2823 mlog(0, "%s:%.*s: node %u had a ref to this " 2824 mlog(0, "%s:%.*s: node %u had a ref to this "
2824 "migrating lockres, clearing\n", dlm->name, 2825 "migrating lockres, clearing\n", dlm->name,
2825 res->lockname.len, res->lockname.name, bit); 2826 res->lockname.len, res->lockname.name, bit);
2826 dlm_lockres_clear_refmap_bit(bit, res); 2827 dlm_lockres_clear_refmap_bit(dlm, res, bit);
2827 } 2828 }
2828 bit++; 2829 bit++;
2829 } 2830 }
@@ -2916,9 +2917,9 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
2916 &migrate, sizeof(migrate), nodenum, 2917 &migrate, sizeof(migrate), nodenum,
2917 &status); 2918 &status);
2918 if (ret < 0) { 2919 if (ret < 0) {
2919 mlog(ML_ERROR, "Error %d when sending message %u (key " 2920 mlog(ML_ERROR, "%s: res %.*s, Error %d send "
2920 "0x%x) to node %u\n", ret, DLM_MIGRATE_REQUEST_MSG, 2921 "MIGRATE_REQUEST to node %u\n", dlm->name,
2921 dlm->key, nodenum); 2922 migrate.namelen, migrate.name, ret, nodenum);
2922 if (!dlm_is_host_down(ret)) { 2923 if (!dlm_is_host_down(ret)) {
2923 mlog(ML_ERROR, "unhandled error=%d!\n", ret); 2924 mlog(ML_ERROR, "unhandled error=%d!\n", ret);
2924 BUG(); 2925 BUG();
@@ -2937,7 +2938,7 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm,
2937 dlm->name, res->lockname.len, res->lockname.name, 2938 dlm->name, res->lockname.len, res->lockname.name,
2938 nodenum); 2939 nodenum);
2939 spin_lock(&res->spinlock); 2940 spin_lock(&res->spinlock);
2940 dlm_lockres_set_refmap_bit(nodenum, res); 2941 dlm_lockres_set_refmap_bit(dlm, res, nodenum);
2941 spin_unlock(&res->spinlock); 2942 spin_unlock(&res->spinlock);
2942 } 2943 }
2943 } 2944 }
@@ -3271,7 +3272,7 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
3271 * mastery reference here since old_master will briefly have 3272 * mastery reference here since old_master will briefly have
3272 * a reference after the migration completes */ 3273 * a reference after the migration completes */
3273 spin_lock(&res->spinlock); 3274 spin_lock(&res->spinlock);
3274 dlm_lockres_set_refmap_bit(old_master, res); 3275 dlm_lockres_set_refmap_bit(dlm, res, old_master);
3275 spin_unlock(&res->spinlock); 3276 spin_unlock(&res->spinlock);
3276 3277
3277 mlog(0, "now time to do a migrate request to other nodes\n"); 3278 mlog(0, "now time to do a migrate request to other nodes\n");
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 7efab6d28a21..01ebfd0bdad7 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -362,40 +362,38 @@ static int dlm_is_node_recovered(struct dlm_ctxt *dlm, u8 node)
362} 362}
363 363
364 364
365int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout) 365void dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
366{ 366{
367 if (timeout) { 367 if (dlm_is_node_dead(dlm, node))
368 mlog(ML_NOTICE, "%s: waiting %dms for notification of " 368 return;
369 "death of node %u\n", dlm->name, timeout, node); 369
370 printk(KERN_NOTICE "o2dlm: Waiting on the death of node %u in "
371 "domain %s\n", node, dlm->name);
372
373 if (timeout)
370 wait_event_timeout(dlm->dlm_reco_thread_wq, 374 wait_event_timeout(dlm->dlm_reco_thread_wq,
371 dlm_is_node_dead(dlm, node), 375 dlm_is_node_dead(dlm, node),
372 msecs_to_jiffies(timeout)); 376 msecs_to_jiffies(timeout));
373 } else { 377 else
374 mlog(ML_NOTICE, "%s: waiting indefinitely for notification "
375 "of death of node %u\n", dlm->name, node);
376 wait_event(dlm->dlm_reco_thread_wq, 378 wait_event(dlm->dlm_reco_thread_wq,
377 dlm_is_node_dead(dlm, node)); 379 dlm_is_node_dead(dlm, node));
378 }
379 /* for now, return 0 */
380 return 0;
381} 380}
382 381
383int dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout) 382void dlm_wait_for_node_recovery(struct dlm_ctxt *dlm, u8 node, int timeout)
384{ 383{
385 if (timeout) { 384 if (dlm_is_node_recovered(dlm, node))
386 mlog(0, "%s: waiting %dms for notification of " 385 return;
387 "recovery of node %u\n", dlm->name, timeout, node); 386
387 printk(KERN_NOTICE "o2dlm: Waiting on the recovery of node %u in "
388 "domain %s\n", node, dlm->name);
389
390 if (timeout)
388 wait_event_timeout(dlm->dlm_reco_thread_wq, 391 wait_event_timeout(dlm->dlm_reco_thread_wq,
389 dlm_is_node_recovered(dlm, node), 392 dlm_is_node_recovered(dlm, node),
390 msecs_to_jiffies(timeout)); 393 msecs_to_jiffies(timeout));
391 } else { 394 else
392 mlog(0, "%s: waiting indefinitely for notification "
393 "of recovery of node %u\n", dlm->name, node);
394 wait_event(dlm->dlm_reco_thread_wq, 395 wait_event(dlm->dlm_reco_thread_wq,
395 dlm_is_node_recovered(dlm, node)); 396 dlm_is_node_recovered(dlm, node));
396 }
397 /* for now, return 0 */
398 return 0;
399} 397}
400 398
401/* callers of the top-level api calls (dlmlock/dlmunlock) should 399/* callers of the top-level api calls (dlmlock/dlmunlock) should
@@ -430,6 +428,8 @@ static void dlm_begin_recovery(struct dlm_ctxt *dlm)
430{ 428{
431 spin_lock(&dlm->spinlock); 429 spin_lock(&dlm->spinlock);
432 BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE); 430 BUG_ON(dlm->reco.state & DLM_RECO_STATE_ACTIVE);
431 printk(KERN_NOTICE "o2dlm: Begin recovery on domain %s for node %u\n",
432 dlm->name, dlm->reco.dead_node);
433 dlm->reco.state |= DLM_RECO_STATE_ACTIVE; 433 dlm->reco.state |= DLM_RECO_STATE_ACTIVE;
434 spin_unlock(&dlm->spinlock); 434 spin_unlock(&dlm->spinlock);
435} 435}
@@ -440,9 +440,18 @@ static void dlm_end_recovery(struct dlm_ctxt *dlm)
440 BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE)); 440 BUG_ON(!(dlm->reco.state & DLM_RECO_STATE_ACTIVE));
441 dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE; 441 dlm->reco.state &= ~DLM_RECO_STATE_ACTIVE;
442 spin_unlock(&dlm->spinlock); 442 spin_unlock(&dlm->spinlock);
443 printk(KERN_NOTICE "o2dlm: End recovery on domain %s\n", dlm->name);
443 wake_up(&dlm->reco.event); 444 wake_up(&dlm->reco.event);
444} 445}
445 446
447static void dlm_print_recovery_master(struct dlm_ctxt *dlm)
448{
449 printk(KERN_NOTICE "o2dlm: Node %u (%s) is the Recovery Master for the "
450 "dead node %u in domain %s\n", dlm->reco.new_master,
451 (dlm->node_num == dlm->reco.new_master ? "me" : "he"),
452 dlm->reco.dead_node, dlm->name);
453}
454
446static int dlm_do_recovery(struct dlm_ctxt *dlm) 455static int dlm_do_recovery(struct dlm_ctxt *dlm)
447{ 456{
448 int status = 0; 457 int status = 0;
@@ -505,9 +514,8 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
505 } 514 }
506 mlog(0, "another node will master this recovery session.\n"); 515 mlog(0, "another node will master this recovery session.\n");
507 } 516 }
508 mlog(0, "dlm=%s (%d), new_master=%u, this node=%u, dead_node=%u\n", 517
509 dlm->name, task_pid_nr(dlm->dlm_reco_thread_task), dlm->reco.new_master, 518 dlm_print_recovery_master(dlm);
510 dlm->node_num, dlm->reco.dead_node);
511 519
512 /* it is safe to start everything back up here 520 /* it is safe to start everything back up here
513 * because all of the dead node's lock resources 521 * because all of the dead node's lock resources
@@ -518,15 +526,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
518 return 0; 526 return 0;
519 527
520master_here: 528master_here:
521 mlog(ML_NOTICE, "(%d) Node %u is the Recovery Master for the Dead Node " 529 dlm_print_recovery_master(dlm);
522 "%u for Domain %s\n", task_pid_nr(dlm->dlm_reco_thread_task),
523 dlm->node_num, dlm->reco.dead_node, dlm->name);
524 530
525 status = dlm_remaster_locks(dlm, dlm->reco.dead_node); 531 status = dlm_remaster_locks(dlm, dlm->reco.dead_node);
526 if (status < 0) { 532 if (status < 0) {
527 /* we should never hit this anymore */ 533 /* we should never hit this anymore */
528 mlog(ML_ERROR, "error %d remastering locks for node %u, " 534 mlog(ML_ERROR, "%s: Error %d remastering locks for node %u, "
529 "retrying.\n", status, dlm->reco.dead_node); 535 "retrying.\n", dlm->name, status, dlm->reco.dead_node);
530 /* yield a bit to allow any final network messages 536 /* yield a bit to allow any final network messages
531 * to get handled on remaining nodes */ 537 * to get handled on remaining nodes */
532 msleep(100); 538 msleep(100);
@@ -567,7 +573,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
567 BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT); 573 BUG_ON(ndata->state != DLM_RECO_NODE_DATA_INIT);
568 ndata->state = DLM_RECO_NODE_DATA_REQUESTING; 574 ndata->state = DLM_RECO_NODE_DATA_REQUESTING;
569 575
570 mlog(0, "requesting lock info from node %u\n", 576 mlog(0, "%s: Requesting lock info from node %u\n", dlm->name,
571 ndata->node_num); 577 ndata->node_num);
572 578
573 if (ndata->node_num == dlm->node_num) { 579 if (ndata->node_num == dlm->node_num) {
@@ -640,7 +646,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
640 spin_unlock(&dlm_reco_state_lock); 646 spin_unlock(&dlm_reco_state_lock);
641 } 647 }
642 648
643 mlog(0, "done requesting all lock info\n"); 649 mlog(0, "%s: Done requesting all lock info\n", dlm->name);
644 650
645 /* nodes should be sending reco data now 651 /* nodes should be sending reco data now
646 * just need to wait */ 652 * just need to wait */
@@ -802,10 +808,9 @@ static int dlm_request_all_locks(struct dlm_ctxt *dlm, u8 request_from,
802 808
803 /* negative status is handled by caller */ 809 /* negative status is handled by caller */
804 if (ret < 0) 810 if (ret < 0)
805 mlog(ML_ERROR, "Error %d when sending message %u (key " 811 mlog(ML_ERROR, "%s: Error %d send LOCK_REQUEST to node %u "
806 "0x%x) to node %u\n", ret, DLM_LOCK_REQUEST_MSG, 812 "to recover dead node %u\n", dlm->name, ret,
807 dlm->key, request_from); 813 request_from, dead_node);
808
809 // return from here, then 814 // return from here, then
810 // sleep until all received or error 815 // sleep until all received or error
811 return ret; 816 return ret;
@@ -956,9 +961,9 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
956 ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg, 961 ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg,
957 sizeof(done_msg), send_to, &tmpret); 962 sizeof(done_msg), send_to, &tmpret);
958 if (ret < 0) { 963 if (ret < 0) {
959 mlog(ML_ERROR, "Error %d when sending message %u (key " 964 mlog(ML_ERROR, "%s: Error %d send RECO_DATA_DONE to node %u "
960 "0x%x) to node %u\n", ret, DLM_RECO_DATA_DONE_MSG, 965 "to recover dead node %u\n", dlm->name, ret, send_to,
961 dlm->key, send_to); 966 dead_node);
962 if (!dlm_is_host_down(ret)) { 967 if (!dlm_is_host_down(ret)) {
963 BUG(); 968 BUG();
964 } 969 }
@@ -1127,9 +1132,11 @@ static int dlm_send_mig_lockres_msg(struct dlm_ctxt *dlm,
1127 if (ret < 0) { 1132 if (ret < 0) {
1128 /* XXX: negative status is not handled. 1133 /* XXX: negative status is not handled.
1129 * this will end up killing this node. */ 1134 * this will end up killing this node. */
1130 mlog(ML_ERROR, "Error %d when sending message %u (key " 1135 mlog(ML_ERROR, "%s: res %.*s, Error %d send MIG_LOCKRES to "
1131 "0x%x) to node %u\n", ret, DLM_MIG_LOCKRES_MSG, 1136 "node %u (%s)\n", dlm->name, mres->lockname_len,
1132 dlm->key, send_to); 1137 mres->lockname, ret, send_to,
1138 (orig_flags & DLM_MRES_MIGRATION ?
1139 "migration" : "recovery"));
1133 } else { 1140 } else {
1134 /* might get an -ENOMEM back here */ 1141 /* might get an -ENOMEM back here */
1135 ret = status; 1142 ret = status;
@@ -1767,7 +1774,7 @@ static int dlm_process_recovery_data(struct dlm_ctxt *dlm,
1767 dlm->name, mres->lockname_len, mres->lockname, 1774 dlm->name, mres->lockname_len, mres->lockname,
1768 from); 1775 from);
1769 spin_lock(&res->spinlock); 1776 spin_lock(&res->spinlock);
1770 dlm_lockres_set_refmap_bit(from, res); 1777 dlm_lockres_set_refmap_bit(dlm, res, from);
1771 spin_unlock(&res->spinlock); 1778 spin_unlock(&res->spinlock);
1772 added++; 1779 added++;
1773 break; 1780 break;
@@ -1965,7 +1972,7 @@ skip_lvb:
1965 mlog(0, "%s:%.*s: added lock for node %u, " 1972 mlog(0, "%s:%.*s: added lock for node %u, "
1966 "setting refmap bit\n", dlm->name, 1973 "setting refmap bit\n", dlm->name,
1967 res->lockname.len, res->lockname.name, ml->node); 1974 res->lockname.len, res->lockname.name, ml->node);
1968 dlm_lockres_set_refmap_bit(ml->node, res); 1975 dlm_lockres_set_refmap_bit(dlm, res, ml->node);
1969 added++; 1976 added++;
1970 } 1977 }
1971 spin_unlock(&res->spinlock); 1978 spin_unlock(&res->spinlock);
@@ -2084,6 +2091,9 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
2084 2091
2085 list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) { 2092 list_for_each_entry_safe(res, next, &dlm->reco.resources, recovering) {
2086 if (res->owner == dead_node) { 2093 if (res->owner == dead_node) {
2094 mlog(0, "%s: res %.*s, Changing owner from %u to %u\n",
2095 dlm->name, res->lockname.len, res->lockname.name,
2096 res->owner, new_master);
2087 list_del_init(&res->recovering); 2097 list_del_init(&res->recovering);
2088 spin_lock(&res->spinlock); 2098 spin_lock(&res->spinlock);
2089 /* new_master has our reference from 2099 /* new_master has our reference from
@@ -2105,40 +2115,30 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
2105 for (i = 0; i < DLM_HASH_BUCKETS; i++) { 2115 for (i = 0; i < DLM_HASH_BUCKETS; i++) {
2106 bucket = dlm_lockres_hash(dlm, i); 2116 bucket = dlm_lockres_hash(dlm, i);
2107 hlist_for_each_entry(res, hash_iter, bucket, hash_node) { 2117 hlist_for_each_entry(res, hash_iter, bucket, hash_node) {
2108 if (res->state & DLM_LOCK_RES_RECOVERING) { 2118 if (!(res->state & DLM_LOCK_RES_RECOVERING))
2109 if (res->owner == dead_node) { 2119 continue;
2110 mlog(0, "(this=%u) res %.*s owner=%u "
2111 "was not on recovering list, but "
2112 "clearing state anyway\n",
2113 dlm->node_num, res->lockname.len,
2114 res->lockname.name, new_master);
2115 } else if (res->owner == dlm->node_num) {
2116 mlog(0, "(this=%u) res %.*s owner=%u "
2117 "was not on recovering list, "
2118 "owner is THIS node, clearing\n",
2119 dlm->node_num, res->lockname.len,
2120 res->lockname.name, new_master);
2121 } else
2122 continue;
2123 2120
2124 if (!list_empty(&res->recovering)) { 2121 if (res->owner != dead_node &&
2125 mlog(0, "%s:%.*s: lockres was " 2122 res->owner != dlm->node_num)
2126 "marked RECOVERING, owner=%u\n", 2123 continue;
2127 dlm->name, res->lockname.len, 2124
2128 res->lockname.name, res->owner); 2125 if (!list_empty(&res->recovering)) {
2129 list_del_init(&res->recovering); 2126 list_del_init(&res->recovering);
2130 dlm_lockres_put(res); 2127 dlm_lockres_put(res);
2131 }
2132 spin_lock(&res->spinlock);
2133 /* new_master has our reference from
2134 * the lock state sent during recovery */
2135 dlm_change_lockres_owner(dlm, res, new_master);
2136 res->state &= ~DLM_LOCK_RES_RECOVERING;
2137 if (__dlm_lockres_has_locks(res))
2138 __dlm_dirty_lockres(dlm, res);
2139 spin_unlock(&res->spinlock);
2140 wake_up(&res->wq);
2141 } 2128 }
2129
2130 /* new_master has our reference from
2131 * the lock state sent during recovery */
2132 mlog(0, "%s: res %.*s, Changing owner from %u to %u\n",
2133 dlm->name, res->lockname.len, res->lockname.name,
2134 res->owner, new_master);
2135 spin_lock(&res->spinlock);
2136 dlm_change_lockres_owner(dlm, res, new_master);
2137 res->state &= ~DLM_LOCK_RES_RECOVERING;
2138 if (__dlm_lockres_has_locks(res))
2139 __dlm_dirty_lockres(dlm, res);
2140 spin_unlock(&res->spinlock);
2141 wake_up(&res->wq);
2142 } 2142 }
2143 } 2143 }
2144} 2144}
@@ -2252,12 +2252,12 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
2252 res->lockname.len, res->lockname.name, freed, dead_node); 2252 res->lockname.len, res->lockname.name, freed, dead_node);
2253 __dlm_print_one_lock_resource(res); 2253 __dlm_print_one_lock_resource(res);
2254 } 2254 }
2255 dlm_lockres_clear_refmap_bit(dead_node, res); 2255 dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
2256 } else if (test_bit(dead_node, res->refmap)) { 2256 } else if (test_bit(dead_node, res->refmap)) {
2257 mlog(0, "%s:%.*s: dead node %u had a ref, but had " 2257 mlog(0, "%s:%.*s: dead node %u had a ref, but had "
2258 "no locks and had not purged before dying\n", dlm->name, 2258 "no locks and had not purged before dying\n", dlm->name,
2259 res->lockname.len, res->lockname.name, dead_node); 2259 res->lockname.len, res->lockname.name, dead_node);
2260 dlm_lockres_clear_refmap_bit(dead_node, res); 2260 dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
2261 } 2261 }
2262 2262
2263 /* do not kick thread yet */ 2263 /* do not kick thread yet */
@@ -2324,9 +2324,9 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
2324 dlm_revalidate_lvb(dlm, res, dead_node); 2324 dlm_revalidate_lvb(dlm, res, dead_node);
2325 if (res->owner == dead_node) { 2325 if (res->owner == dead_node) {
2326 if (res->state & DLM_LOCK_RES_DROPPING_REF) { 2326 if (res->state & DLM_LOCK_RES_DROPPING_REF) {
2327 mlog(ML_NOTICE, "Ignore %.*s for " 2327 mlog(ML_NOTICE, "%s: res %.*s, Skip "
2328 "recovery as it is being freed\n", 2328 "recovery as it is being freed\n",
2329 res->lockname.len, 2329 dlm->name, res->lockname.len,
2330 res->lockname.name); 2330 res->lockname.name);
2331 } else 2331 } else
2332 dlm_move_lockres_to_recovery_list(dlm, 2332 dlm_move_lockres_to_recovery_list(dlm,
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 1d6d1d22c471..e73c833fc2a1 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -94,24 +94,26 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res)
94{ 94{
95 int bit; 95 int bit;
96 96
97 assert_spin_locked(&res->spinlock);
98
97 if (__dlm_lockres_has_locks(res)) 99 if (__dlm_lockres_has_locks(res))
98 return 0; 100 return 0;
99 101
102 /* Locks are in the process of being created */
103 if (res->inflight_locks)
104 return 0;
105
100 if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY) 106 if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY)
101 return 0; 107 return 0;
102 108
103 if (res->state & DLM_LOCK_RES_RECOVERING) 109 if (res->state & DLM_LOCK_RES_RECOVERING)
104 return 0; 110 return 0;
105 111
112 /* Another node has this resource with this node as the master */
106 bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0); 113 bit = find_next_bit(res->refmap, O2NM_MAX_NODES, 0);
107 if (bit < O2NM_MAX_NODES) 114 if (bit < O2NM_MAX_NODES)
108 return 0; 115 return 0;
109 116
110 /*
111 * since the bit for dlm->node_num is not set, inflight_locks better
112 * be zero
113 */
114 BUG_ON(res->inflight_locks != 0);
115 return 1; 117 return 1;
116} 118}
117 119
@@ -185,8 +187,6 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
185 /* clear our bit from the master's refmap, ignore errors */ 187 /* clear our bit from the master's refmap, ignore errors */
186 ret = dlm_drop_lockres_ref(dlm, res); 188 ret = dlm_drop_lockres_ref(dlm, res);
187 if (ret < 0) { 189 if (ret < 0) {
188 mlog(ML_ERROR, "%s: deref %.*s failed %d\n", dlm->name,
189 res->lockname.len, res->lockname.name, ret);
190 if (!dlm_is_host_down(ret)) 190 if (!dlm_is_host_down(ret))
191 BUG(); 191 BUG();
192 } 192 }
@@ -209,7 +209,7 @@ static void dlm_purge_lockres(struct dlm_ctxt *dlm,
209 BUG(); 209 BUG();
210 } 210 }
211 211
212 __dlm_unhash_lockres(res); 212 __dlm_unhash_lockres(dlm, res);
213 213
214 /* lockres is not in the hash now. drop the flag and wake up 214 /* lockres is not in the hash now. drop the flag and wake up
215 * any processes waiting in dlm_get_lock_resource. */ 215 * any processes waiting in dlm_get_lock_resource. */
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index 23457b491e8c..2f5b92ef0e53 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -832,6 +832,102 @@ out:
832 return ret; 832 return ret;
833} 833}
834 834
835int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin)
836{
837 struct inode *inode = file->f_mapping->host;
838 int ret;
839 unsigned int is_last = 0, is_data = 0;
840 u16 cs_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits;
841 u32 cpos, cend, clen, hole_size;
842 u64 extoff, extlen;
843 struct buffer_head *di_bh = NULL;
844 struct ocfs2_extent_rec rec;
845
846 BUG_ON(origin != SEEK_DATA && origin != SEEK_HOLE);
847
848 ret = ocfs2_inode_lock(inode, &di_bh, 0);
849 if (ret) {
850 mlog_errno(ret);
851 goto out;
852 }
853
854 down_read(&OCFS2_I(inode)->ip_alloc_sem);
855
856 if (*offset >= inode->i_size) {
857 ret = -ENXIO;
858 goto out_unlock;
859 }
860
861 if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
862 if (origin == SEEK_HOLE)
863 *offset = inode->i_size;
864 goto out_unlock;
865 }
866
867 clen = 0;
868 cpos = *offset >> cs_bits;
869 cend = ocfs2_clusters_for_bytes(inode->i_sb, inode->i_size);
870
871 while (cpos < cend && !is_last) {
872 ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, &hole_size,
873 &rec, &is_last);
874 if (ret) {
875 mlog_errno(ret);
876 goto out_unlock;
877 }
878
879 extoff = cpos;
880 extoff <<= cs_bits;
881
882 if (rec.e_blkno == 0ULL) {
883 clen = hole_size;
884 is_data = 0;
885 } else {
886 clen = le16_to_cpu(rec.e_leaf_clusters) -
887 (cpos - le32_to_cpu(rec.e_cpos));
888 is_data = (rec.e_flags & OCFS2_EXT_UNWRITTEN) ? 0 : 1;
889 }
890
891 if ((!is_data && origin == SEEK_HOLE) ||
892 (is_data && origin == SEEK_DATA)) {
893 if (extoff > *offset)
894 *offset = extoff;
895 goto out_unlock;
896 }
897
898 if (!is_last)
899 cpos += clen;
900 }
901
902 if (origin == SEEK_HOLE) {
903 extoff = cpos;
904 extoff <<= cs_bits;
905 extlen = clen;
906 extlen <<= cs_bits;
907
908 if ((extoff + extlen) > inode->i_size)
909 extlen = inode->i_size - extoff;
910 extoff += extlen;
911 if (extoff > *offset)
912 *offset = extoff;
913 goto out_unlock;
914 }
915
916 ret = -ENXIO;
917
918out_unlock:
919
920 brelse(di_bh);
921
922 up_read(&OCFS2_I(inode)->ip_alloc_sem);
923
924 ocfs2_inode_unlock(inode, 0);
925out:
926 if (ret && ret != -ENXIO)
927 ret = -ENXIO;
928 return ret;
929}
930
835int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr, 931int ocfs2_read_virt_blocks(struct inode *inode, u64 v_block, int nr,
836 struct buffer_head *bhs[], int flags, 932 struct buffer_head *bhs[], int flags,
837 int (*validate)(struct super_block *sb, 933 int (*validate)(struct super_block *sb,
diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h
index e79d41c2c909..67ea57d2fd59 100644
--- a/fs/ocfs2/extent_map.h
+++ b/fs/ocfs2/extent_map.h
@@ -53,6 +53,8 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno,
53int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, 53int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
54 u64 map_start, u64 map_len); 54 u64 map_start, u64 map_len);
55 55
56int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin);
57
56int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, 58int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster,
57 u32 *p_cluster, u32 *num_clusters, 59 u32 *p_cluster, u32 *num_clusters,
58 struct ocfs2_extent_list *el, 60 struct ocfs2_extent_list *el,
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index 145f4533a936..5c4a74e04ab4 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -171,7 +171,8 @@ static int ocfs2_dir_release(struct inode *inode, struct file *file)
171 return 0; 171 return 0;
172} 172}
173 173
174static int ocfs2_sync_file(struct file *file, int datasync) 174static int ocfs2_sync_file(struct file *file, loff_t start, loff_t end,
175 int datasync)
175{ 176{
176 int err = 0; 177 int err = 0;
177 journal_t *journal; 178 journal_t *journal;
@@ -184,6 +185,16 @@ static int ocfs2_sync_file(struct file *file, int datasync)
184 file->f_path.dentry->d_name.name, 185 file->f_path.dentry->d_name.name,
185 (unsigned long long)datasync); 186 (unsigned long long)datasync);
186 187
188 err = filemap_write_and_wait_range(inode->i_mapping, start, end);
189 if (err)
190 return err;
191
192 /*
193 * Probably don't need the i_mutex at all in here, just putting it here
194 * to be consistent with how fsync used to be called, someone more
195 * familiar with the fs could possibly remove it.
196 */
197 mutex_lock(&inode->i_mutex);
187 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) { 198 if (datasync && !(inode->i_state & I_DIRTY_DATASYNC)) {
188 /* 199 /*
189 * We still have to flush drive's caches to get data to the 200 * We still have to flush drive's caches to get data to the
@@ -200,6 +211,7 @@ static int ocfs2_sync_file(struct file *file, int datasync)
200bail: 211bail:
201 if (err) 212 if (err)
202 mlog_errno(err); 213 mlog_errno(err);
214 mutex_unlock(&inode->i_mutex);
203 215
204 return (err < 0) ? -EIO : 0; 216 return (err < 0) ? -EIO : 0;
205} 217}
@@ -1142,6 +1154,8 @@ int ocfs2_setattr(struct dentry *dentry, struct iattr *attr)
1142 if (status) 1154 if (status)
1143 goto bail_unlock; 1155 goto bail_unlock;
1144 1156
1157 inode_dio_wait(inode);
1158
1145 if (i_size_read(inode) > attr->ia_size) { 1159 if (i_size_read(inode) > attr->ia_size) {
1146 if (ocfs2_should_order_data(inode)) { 1160 if (ocfs2_should_order_data(inode)) {
1147 status = ocfs2_begin_ordered_truncate(inode, 1161 status = ocfs2_begin_ordered_truncate(inode,
@@ -1279,11 +1293,11 @@ bail:
1279 return err; 1293 return err;
1280} 1294}
1281 1295
1282int ocfs2_permission(struct inode *inode, int mask, unsigned int flags) 1296int ocfs2_permission(struct inode *inode, int mask)
1283{ 1297{
1284 int ret; 1298 int ret;
1285 1299
1286 if (flags & IPERM_FLAG_RCU) 1300 if (mask & MAY_NOT_BLOCK)
1287 return -ECHILD; 1301 return -ECHILD;
1288 1302
1289 ret = ocfs2_inode_lock(inode, NULL, 0); 1303 ret = ocfs2_inode_lock(inode, NULL, 0);
@@ -1293,7 +1307,7 @@ int ocfs2_permission(struct inode *inode, int mask, unsigned int flags)
1293 goto out; 1307 goto out;
1294 } 1308 }
1295 1309
1296 ret = generic_permission(inode, mask, flags, ocfs2_check_acl); 1310 ret = generic_permission(inode, mask);
1297 1311
1298 ocfs2_inode_unlock(inode, 0); 1312 ocfs2_inode_unlock(inode, 0);
1299out: 1313out:
@@ -2254,9 +2268,8 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
2254 ocfs2_iocb_clear_sem_locked(iocb); 2268 ocfs2_iocb_clear_sem_locked(iocb);
2255 2269
2256relock: 2270relock:
2257 /* to match setattr's i_mutex -> i_alloc_sem -> rw_lock ordering */ 2271 /* to match setattr's i_mutex -> rw_lock ordering */
2258 if (direct_io) { 2272 if (direct_io) {
2259 down_read(&inode->i_alloc_sem);
2260 have_alloc_sem = 1; 2273 have_alloc_sem = 1;
2261 /* communicate with ocfs2_dio_end_io */ 2274 /* communicate with ocfs2_dio_end_io */
2262 ocfs2_iocb_set_sem_locked(iocb); 2275 ocfs2_iocb_set_sem_locked(iocb);
@@ -2312,7 +2325,6 @@ relock:
2312 */ 2325 */
2313 if (direct_io && !can_do_direct) { 2326 if (direct_io && !can_do_direct) {
2314 ocfs2_rw_unlock(inode, rw_level); 2327 ocfs2_rw_unlock(inode, rw_level);
2315 up_read(&inode->i_alloc_sem);
2316 2328
2317 have_alloc_sem = 0; 2329 have_alloc_sem = 0;
2318 rw_level = -1; 2330 rw_level = -1;
@@ -2395,8 +2407,7 @@ out_dio:
2395 /* 2407 /*
2396 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io 2408 * deep in g_f_a_w_n()->ocfs2_direct_IO we pass in a ocfs2_dio_end_io
2397 * function pointer which is called when o_direct io completes so that 2409 * function pointer which is called when o_direct io completes so that
2398 * it can unlock our rw lock. (it's the clustered equivalent of 2410 * it can unlock our rw lock.
2399 * i_alloc_sem; protects truncate from racing with pending ios).
2400 * Unfortunately there are error cases which call end_io and others 2411 * Unfortunately there are error cases which call end_io and others
2401 * that don't. so we don't have to unlock the rw_lock if either an 2412 * that don't. so we don't have to unlock the rw_lock if either an
2402 * async dio is going to do it in the future or an end_io after an 2413 * async dio is going to do it in the future or an end_io after an
@@ -2416,10 +2427,8 @@ out:
2416 ocfs2_rw_unlock(inode, rw_level); 2427 ocfs2_rw_unlock(inode, rw_level);
2417 2428
2418out_sems: 2429out_sems:
2419 if (have_alloc_sem) { 2430 if (have_alloc_sem)
2420 up_read(&inode->i_alloc_sem);
2421 ocfs2_iocb_clear_sem_locked(iocb); 2431 ocfs2_iocb_clear_sem_locked(iocb);
2422 }
2423 2432
2424 mutex_unlock(&inode->i_mutex); 2433 mutex_unlock(&inode->i_mutex);
2425 2434
@@ -2569,7 +2578,6 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2569 * need locks to protect pending reads from racing with truncate. 2578 * need locks to protect pending reads from racing with truncate.
2570 */ 2579 */
2571 if (filp->f_flags & O_DIRECT) { 2580 if (filp->f_flags & O_DIRECT) {
2572 down_read(&inode->i_alloc_sem);
2573 have_alloc_sem = 1; 2581 have_alloc_sem = 1;
2574 ocfs2_iocb_set_sem_locked(iocb); 2582 ocfs2_iocb_set_sem_locked(iocb);
2575 2583
@@ -2612,16 +2620,66 @@ static ssize_t ocfs2_file_aio_read(struct kiocb *iocb,
2612 } 2620 }
2613 2621
2614bail: 2622bail:
2615 if (have_alloc_sem) { 2623 if (have_alloc_sem)
2616 up_read(&inode->i_alloc_sem);
2617 ocfs2_iocb_clear_sem_locked(iocb); 2624 ocfs2_iocb_clear_sem_locked(iocb);
2618 } 2625
2619 if (rw_level != -1) 2626 if (rw_level != -1)
2620 ocfs2_rw_unlock(inode, rw_level); 2627 ocfs2_rw_unlock(inode, rw_level);
2621 2628
2622 return ret; 2629 return ret;
2623} 2630}
2624 2631
2632/* Refer to generic_file_llseek_unlocked() */
2633static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int origin)
2634{
2635 struct inode *inode = file->f_mapping->host;
2636 int ret = 0;
2637
2638 mutex_lock(&inode->i_mutex);
2639
2640 switch (origin) {
2641 case SEEK_SET:
2642 break;
2643 case SEEK_END:
2644 offset += inode->i_size;
2645 break;
2646 case SEEK_CUR:
2647 if (offset == 0) {
2648 offset = file->f_pos;
2649 goto out;
2650 }
2651 offset += file->f_pos;
2652 break;
2653 case SEEK_DATA:
2654 case SEEK_HOLE:
2655 ret = ocfs2_seek_data_hole_offset(file, &offset, origin);
2656 if (ret)
2657 goto out;
2658 break;
2659 default:
2660 ret = -EINVAL;
2661 goto out;
2662 }
2663
2664 if (offset < 0 && !(file->f_mode & FMODE_UNSIGNED_OFFSET))
2665 ret = -EINVAL;
2666 if (!ret && offset > inode->i_sb->s_maxbytes)
2667 ret = -EINVAL;
2668 if (ret)
2669 goto out;
2670
2671 if (offset != file->f_pos) {
2672 file->f_pos = offset;
2673 file->f_version = 0;
2674 }
2675
2676out:
2677 mutex_unlock(&inode->i_mutex);
2678 if (ret)
2679 return ret;
2680 return offset;
2681}
2682
2625const struct inode_operations ocfs2_file_iops = { 2683const struct inode_operations ocfs2_file_iops = {
2626 .setattr = ocfs2_setattr, 2684 .setattr = ocfs2_setattr,
2627 .getattr = ocfs2_getattr, 2685 .getattr = ocfs2_getattr,
@@ -2631,12 +2689,14 @@ const struct inode_operations ocfs2_file_iops = {
2631 .listxattr = ocfs2_listxattr, 2689 .listxattr = ocfs2_listxattr,
2632 .removexattr = generic_removexattr, 2690 .removexattr = generic_removexattr,
2633 .fiemap = ocfs2_fiemap, 2691 .fiemap = ocfs2_fiemap,
2692 .check_acl = ocfs2_check_acl,
2634}; 2693};
2635 2694
2636const struct inode_operations ocfs2_special_file_iops = { 2695const struct inode_operations ocfs2_special_file_iops = {
2637 .setattr = ocfs2_setattr, 2696 .setattr = ocfs2_setattr,
2638 .getattr = ocfs2_getattr, 2697 .getattr = ocfs2_getattr,
2639 .permission = ocfs2_permission, 2698 .permission = ocfs2_permission,
2699 .check_acl = ocfs2_check_acl,
2640}; 2700};
2641 2701
2642/* 2702/*
@@ -2644,7 +2704,7 @@ const struct inode_operations ocfs2_special_file_iops = {
2644 * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks! 2704 * ocfs2_fops_no_plocks and ocfs2_dops_no_plocks!
2645 */ 2705 */
2646const struct file_operations ocfs2_fops = { 2706const struct file_operations ocfs2_fops = {
2647 .llseek = generic_file_llseek, 2707 .llseek = ocfs2_file_llseek,
2648 .read = do_sync_read, 2708 .read = do_sync_read,
2649 .write = do_sync_write, 2709 .write = do_sync_write,
2650 .mmap = ocfs2_mmap, 2710 .mmap = ocfs2_mmap,
@@ -2692,7 +2752,7 @@ const struct file_operations ocfs2_dops = {
2692 * the cluster. 2752 * the cluster.
2693 */ 2753 */
2694const struct file_operations ocfs2_fops_no_plocks = { 2754const struct file_operations ocfs2_fops_no_plocks = {
2695 .llseek = generic_file_llseek, 2755 .llseek = ocfs2_file_llseek,
2696 .read = do_sync_read, 2756 .read = do_sync_read,
2697 .write = do_sync_write, 2757 .write = do_sync_write,
2698 .mmap = ocfs2_mmap, 2758 .mmap = ocfs2_mmap,
diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h
index f5afbbef6703..97bf761c9e7c 100644
--- a/fs/ocfs2/file.h
+++ b/fs/ocfs2/file.h
@@ -61,7 +61,7 @@ int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh,
61int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); 61int ocfs2_setattr(struct dentry *dentry, struct iattr *attr);
62int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry, 62int ocfs2_getattr(struct vfsmount *mnt, struct dentry *dentry,
63 struct kstat *stat); 63 struct kstat *stat);
64int ocfs2_permission(struct inode *inode, int mask, unsigned int flags); 64int ocfs2_permission(struct inode *inode, int mask);
65 65
66int ocfs2_should_update_atime(struct inode *inode, 66int ocfs2_should_update_atime(struct inode *inode,
67 struct vfsmount *vfsmnt); 67 struct vfsmount *vfsmnt);
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 295d56454e8b..0a42ae96dca7 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -1544,9 +1544,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1544 /* we need to run complete recovery for offline orphan slots */ 1544 /* we need to run complete recovery for offline orphan slots */
1545 ocfs2_replay_map_set_state(osb, REPLAY_NEEDED); 1545 ocfs2_replay_map_set_state(osb, REPLAY_NEEDED);
1546 1546
1547 mlog(ML_NOTICE, "Recovering node %d from slot %d on device (%u,%u)\n", 1547 printk(KERN_NOTICE "ocfs2: Begin replay journal (node %d, slot %d) on "\
1548 node_num, slot_num, 1548 "device (%u,%u)\n", node_num, slot_num, MAJOR(osb->sb->s_dev),
1549 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev)); 1549 MINOR(osb->sb->s_dev));
1550 1550
1551 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters); 1551 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters);
1552 1552
@@ -1601,6 +1601,9 @@ static int ocfs2_replay_journal(struct ocfs2_super *osb,
1601 1601
1602 jbd2_journal_destroy(journal); 1602 jbd2_journal_destroy(journal);
1603 1603
1604 printk(KERN_NOTICE "ocfs2: End replay journal (node %d, slot %d) on "\
1605 "device (%u,%u)\n", node_num, slot_num, MAJOR(osb->sb->s_dev),
1606 MINOR(osb->sb->s_dev));
1604done: 1607done:
1605 /* drop the lock on this nodes journal */ 1608 /* drop the lock on this nodes journal */
1606 if (got_lock) 1609 if (got_lock)
@@ -1808,6 +1811,20 @@ static inline unsigned long ocfs2_orphan_scan_timeout(void)
1808 * every slot, queuing a recovery of the slot on the ocfs2_wq thread. This 1811 * every slot, queuing a recovery of the slot on the ocfs2_wq thread. This
1809 * is done to catch any orphans that are left over in orphan directories. 1812 * is done to catch any orphans that are left over in orphan directories.
1810 * 1813 *
1814 * It scans all slots, even ones that are in use. It does so to handle the
1815 * case described below:
1816 *
1817 * Node 1 has an inode it was using. The dentry went away due to memory
1818 * pressure. Node 1 closes the inode, but it's on the free list. The node
1819 * has the open lock.
1820 * Node 2 unlinks the inode. It grabs the dentry lock to notify others,
1821 * but node 1 has no dentry and doesn't get the message. It trylocks the
1822 * open lock, sees that another node has a PR, and does nothing.
1823 * Later node 2 runs its orphan dir. It igets the inode, trylocks the
1824 * open lock, sees the PR still, and does nothing.
1825 * Basically, we have to trigger an orphan iput on node 1. The only way
1826 * for this to happen is if node 1 runs node 2's orphan dir.
1827 *
1811 * ocfs2_queue_orphan_scan gets called every ORPHAN_SCAN_SCHEDULE_TIMEOUT 1828 * ocfs2_queue_orphan_scan gets called every ORPHAN_SCAN_SCHEDULE_TIMEOUT
1812 * seconds. It gets an EX lock on os_lockres and checks sequence number 1829 * seconds. It gets an EX lock on os_lockres and checks sequence number
1813 * stored in LVB. If the sequence number has changed, it means some other 1830 * stored in LVB. If the sequence number has changed, it means some other
diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c
index 3e9393ca39eb..9cd41083e991 100644
--- a/fs/ocfs2/mmap.c
+++ b/fs/ocfs2/mmap.c
@@ -61,7 +61,7 @@ static int ocfs2_fault(struct vm_area_struct *area, struct vm_fault *vmf)
61static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, 61static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
62 struct page *page) 62 struct page *page)
63{ 63{
64 int ret; 64 int ret = VM_FAULT_NOPAGE;
65 struct inode *inode = file->f_path.dentry->d_inode; 65 struct inode *inode = file->f_path.dentry->d_inode;
66 struct address_space *mapping = inode->i_mapping; 66 struct address_space *mapping = inode->i_mapping;
67 loff_t pos = page_offset(page); 67 loff_t pos = page_offset(page);
@@ -71,32 +71,25 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
71 void *fsdata; 71 void *fsdata;
72 loff_t size = i_size_read(inode); 72 loff_t size = i_size_read(inode);
73 73
74 /*
75 * Another node might have truncated while we were waiting on
76 * cluster locks.
77 * We don't check size == 0 before the shift. This is borrowed
78 * from do_generic_file_read.
79 */
80 last_index = (size - 1) >> PAGE_CACHE_SHIFT; 74 last_index = (size - 1) >> PAGE_CACHE_SHIFT;
81 if (unlikely(!size || page->index > last_index)) {
82 ret = -EINVAL;
83 goto out;
84 }
85 75
86 /* 76 /*
87 * The i_size check above doesn't catch the case where nodes 77 * There are cases where the page no longer belongs to the
88 * truncated and then re-extended the file. We'll re-check the 78 * mapping.
89 * page mapping after taking the page lock inside of 79 * 1) pagecache truncates locally due to memory pressure.
90 * ocfs2_write_begin_nolock(). 80 * 2) pagecache truncates when another node is taking EX lock against
81 * inode lock. see ocfs2_data_convert_worker.
82 *
83 * The i_size check doesn't catch the case where nodes truncated and
84 * then re-extended the file. We'll re-check the page mapping after
85 * taking the page lock inside of ocfs2_write_begin_nolock().
86 *
87 * Let VM retry with these cases.
91 */ 88 */
92 if (!PageUptodate(page) || page->mapping != inode->i_mapping) { 89 if ((page->mapping != inode->i_mapping) ||
93 /* 90 (!PageUptodate(page)) ||
94 * the page has been umapped in ocfs2_data_downconvert_worker. 91 (page_offset(page) >= size))
95 * So return 0 here and let VFS retry.
96 */
97 ret = 0;
98 goto out; 92 goto out;
99 }
100 93
101 /* 94 /*
102 * Call ocfs2_write_begin() and ocfs2_write_end() to take 95 * Call ocfs2_write_begin() and ocfs2_write_end() to take
@@ -116,17 +109,21 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh,
116 if (ret) { 109 if (ret) {
117 if (ret != -ENOSPC) 110 if (ret != -ENOSPC)
118 mlog_errno(ret); 111 mlog_errno(ret);
112 if (ret == -ENOMEM)
113 ret = VM_FAULT_OOM;
114 else
115 ret = VM_FAULT_SIGBUS;
119 goto out; 116 goto out;
120 } 117 }
121 118
122 ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page, 119 if (!locked_page) {
123 fsdata); 120 ret = VM_FAULT_NOPAGE;
124 if (ret < 0) {
125 mlog_errno(ret);
126 goto out; 121 goto out;
127 } 122 }
123 ret = ocfs2_write_end_nolock(mapping, pos, len, len, locked_page,
124 fsdata);
128 BUG_ON(ret != len); 125 BUG_ON(ret != len);
129 ret = 0; 126 ret = VM_FAULT_LOCKED;
130out: 127out:
131 return ret; 128 return ret;
132} 129}
@@ -168,8 +165,6 @@ static int ocfs2_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
168 165
169out: 166out:
170 ocfs2_unblock_signals(&oldset); 167 ocfs2_unblock_signals(&oldset);
171 if (ret)
172 ret = VM_FAULT_SIGBUS;
173 return ret; 168 return ret;
174} 169}
175 170
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index e5d738cd9cc0..33889dc52dd7 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -2498,4 +2498,5 @@ const struct inode_operations ocfs2_dir_iops = {
2498 .listxattr = ocfs2_listxattr, 2498 .listxattr = ocfs2_listxattr,
2499 .removexattr = generic_removexattr, 2499 .removexattr = generic_removexattr,
2500 .fiemap = ocfs2_fiemap, 2500 .fiemap = ocfs2_fiemap,
2501 .check_acl = ocfs2_check_acl,
2501}; 2502};
diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c
index dc8007fc9247..942fd65bdad3 100644
--- a/fs/ocfs2/quota_local.c
+++ b/fs/ocfs2/quota_local.c
@@ -404,7 +404,9 @@ struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery(
404 int status = 0; 404 int status = 0;
405 struct ocfs2_quota_recovery *rec; 405 struct ocfs2_quota_recovery *rec;
406 406
407 mlog(ML_NOTICE, "Beginning quota recovery in slot %u\n", slot_num); 407 printk(KERN_NOTICE "ocfs2: Beginning quota recovery on device (%s) for "
408 "slot %u\n", osb->dev_str, slot_num);
409
408 rec = ocfs2_alloc_quota_recovery(); 410 rec = ocfs2_alloc_quota_recovery();
409 if (!rec) 411 if (!rec)
410 return ERR_PTR(-ENOMEM); 412 return ERR_PTR(-ENOMEM);
@@ -596,7 +598,9 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
596 struct inode *lqinode; 598 struct inode *lqinode;
597 unsigned int flags; 599 unsigned int flags;
598 600
599 mlog(ML_NOTICE, "Finishing quota recovery in slot %u\n", slot_num); 601 printk(KERN_NOTICE "ocfs2: Finishing quota recovery on device (%s) for "
602 "slot %u\n", osb->dev_str, slot_num);
603
600 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); 604 mutex_lock(&sb_dqopt(sb)->dqonoff_mutex);
601 for (type = 0; type < MAXQUOTAS; type++) { 605 for (type = 0; type < MAXQUOTAS; type++) {
602 if (list_empty(&(rec->r_list[type]))) 606 if (list_empty(&(rec->r_list[type])))
@@ -612,8 +616,9 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb,
612 /* Someone else is holding the lock? Then he must be 616 /* Someone else is holding the lock? Then he must be
613 * doing the recovery. Just skip the file... */ 617 * doing the recovery. Just skip the file... */
614 if (status == -EAGAIN) { 618 if (status == -EAGAIN) {
615 mlog(ML_NOTICE, "skipping quota recovery for slot %d " 619 printk(KERN_NOTICE "ocfs2: Skipping quota recovery on "
616 "because quota file is locked.\n", slot_num); 620 "device (%s) for slot %d because quota file is "
621 "locked.\n", osb->dev_str, slot_num);
617 status = 0; 622 status = 0;
618 goto out_put; 623 goto out_put;
619 } else if (status < 0) { 624 } else if (status < 0) {
diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c
index ebfd3825f12a..cf7823382664 100644
--- a/fs/ocfs2/refcounttree.c
+++ b/fs/ocfs2/refcounttree.c
@@ -4368,25 +4368,6 @@ static inline int ocfs2_may_create(struct inode *dir, struct dentry *child)
4368 return inode_permission(dir, MAY_WRITE | MAY_EXEC); 4368 return inode_permission(dir, MAY_WRITE | MAY_EXEC);
4369} 4369}
4370 4370
4371/* copied from user_path_parent. */
4372static int ocfs2_user_path_parent(const char __user *path,
4373 struct nameidata *nd, char **name)
4374{
4375 char *s = getname(path);
4376 int error;
4377
4378 if (IS_ERR(s))
4379 return PTR_ERR(s);
4380
4381 error = kern_path_parent(s, nd);
4382 if (error)
4383 putname(s);
4384 else
4385 *name = s;
4386
4387 return error;
4388}
4389
4390/** 4371/**
4391 * ocfs2_vfs_reflink - Create a reference-counted link 4372 * ocfs2_vfs_reflink - Create a reference-counted link
4392 * 4373 *
@@ -4460,10 +4441,8 @@ int ocfs2_reflink_ioctl(struct inode *inode,
4460 bool preserve) 4441 bool preserve)
4461{ 4442{
4462 struct dentry *new_dentry; 4443 struct dentry *new_dentry;
4463 struct nameidata nd; 4444 struct path old_path, new_path;
4464 struct path old_path;
4465 int error; 4445 int error;
4466 char *to = NULL;
4467 4446
4468 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) 4447 if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
4469 return -EOPNOTSUPP; 4448 return -EOPNOTSUPP;
@@ -4474,39 +4453,33 @@ int ocfs2_reflink_ioctl(struct inode *inode,
4474 return error; 4453 return error;
4475 } 4454 }
4476 4455
4477 error = ocfs2_user_path_parent(newname, &nd, &to); 4456 new_dentry = user_path_create(AT_FDCWD, newname, &new_path, 0);
4478 if (error) { 4457 error = PTR_ERR(new_dentry);
4458 if (IS_ERR(new_dentry)) {
4479 mlog_errno(error); 4459 mlog_errno(error);
4480 goto out; 4460 goto out;
4481 } 4461 }
4482 4462
4483 error = -EXDEV; 4463 error = -EXDEV;
4484 if (old_path.mnt != nd.path.mnt) 4464 if (old_path.mnt != new_path.mnt) {
4485 goto out_release;
4486 new_dentry = lookup_create(&nd, 0);
4487 error = PTR_ERR(new_dentry);
4488 if (IS_ERR(new_dentry)) {
4489 mlog_errno(error); 4465 mlog_errno(error);
4490 goto out_unlock; 4466 goto out_dput;
4491 } 4467 }
4492 4468
4493 error = mnt_want_write(nd.path.mnt); 4469 error = mnt_want_write(new_path.mnt);
4494 if (error) { 4470 if (error) {
4495 mlog_errno(error); 4471 mlog_errno(error);
4496 goto out_dput; 4472 goto out_dput;
4497 } 4473 }
4498 4474
4499 error = ocfs2_vfs_reflink(old_path.dentry, 4475 error = ocfs2_vfs_reflink(old_path.dentry,
4500 nd.path.dentry->d_inode, 4476 new_path.dentry->d_inode,
4501 new_dentry, preserve); 4477 new_dentry, preserve);
4502 mnt_drop_write(nd.path.mnt); 4478 mnt_drop_write(new_path.mnt);
4503out_dput: 4479out_dput:
4504 dput(new_dentry); 4480 dput(new_dentry);
4505out_unlock: 4481 mutex_unlock(&new_path.dentry->d_inode->i_mutex);
4506 mutex_unlock(&nd.path.dentry->d_inode->i_mutex); 4482 path_put(&new_path);
4507out_release:
4508 path_put(&nd.path);
4509 putname(to);
4510out: 4483out:
4511 path_put(&old_path); 4484 path_put(&old_path);
4512 4485
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 26fc0014d509..1424c151cccc 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -493,8 +493,8 @@ int ocfs2_find_slot(struct ocfs2_super *osb)
493 goto bail; 493 goto bail;
494 } 494 }
495 } else 495 } else
496 mlog(ML_NOTICE, "slot %d is already allocated to this node!\n", 496 printk(KERN_INFO "ocfs2: Slot %d on device (%s) was already "
497 slot); 497 "allocated to this node!\n", slot, osb->dev_str);
498 498
499 ocfs2_set_slot(si, slot, osb->node_num); 499 ocfs2_set_slot(si, slot, osb->node_num);
500 osb->slot_num = slot; 500 osb->slot_num = slot;
diff --git a/fs/ocfs2/stack_o2cb.c b/fs/ocfs2/stack_o2cb.c
index 19965b00c43c..94368017edb3 100644
--- a/fs/ocfs2/stack_o2cb.c
+++ b/fs/ocfs2/stack_o2cb.c
@@ -28,6 +28,7 @@
28#include "cluster/masklog.h" 28#include "cluster/masklog.h"
29#include "cluster/nodemanager.h" 29#include "cluster/nodemanager.h"
30#include "cluster/heartbeat.h" 30#include "cluster/heartbeat.h"
31#include "cluster/tcp.h"
31 32
32#include "stackglue.h" 33#include "stackglue.h"
33 34
@@ -256,6 +257,61 @@ static void o2cb_dump_lksb(struct ocfs2_dlm_lksb *lksb)
256} 257}
257 258
258/* 259/*
260 * Check if this node is heartbeating and is connected to all other
261 * heartbeating nodes.
262 */
263static int o2cb_cluster_check(void)
264{
265 u8 node_num;
266 int i;
267 unsigned long hbmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
268 unsigned long netmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
269
270 node_num = o2nm_this_node();
271 if (node_num == O2NM_MAX_NODES) {
272 printk(KERN_ERR "o2cb: This node has not been configured.\n");
273 return -EINVAL;
274 }
275
276 /*
277 * o2dlm expects o2net sockets to be created. If not, then
278 * dlm_join_domain() fails with a stack of errors which are both cryptic
279 * and incomplete. The idea here is to detect upfront whether we have
280 * managed to connect to all nodes or not. If not, then list the nodes
281 * to allow the user to check the configuration (incorrect IP, firewall,
282 * etc.) Yes, this is racy. But it's not the end of the world.
283 */
284#define O2CB_MAP_STABILIZE_COUNT 60
285 for (i = 0; i < O2CB_MAP_STABILIZE_COUNT; ++i) {
286 o2hb_fill_node_map(hbmap, sizeof(hbmap));
287 if (!test_bit(node_num, hbmap)) {
288 printk(KERN_ERR "o2cb: %s heartbeat has not been "
289 "started.\n", (o2hb_global_heartbeat_active() ?
290 "Global" : "Local"));
291 return -EINVAL;
292 }
293 o2net_fill_node_map(netmap, sizeof(netmap));
294 /* Force set the current node to allow easy compare */
295 set_bit(node_num, netmap);
296 if (!memcmp(hbmap, netmap, sizeof(hbmap)))
297 return 0;
298 if (i < O2CB_MAP_STABILIZE_COUNT)
299 msleep(1000);
300 }
301
302 printk(KERN_ERR "o2cb: This node could not connect to nodes:");
303 i = -1;
304 while ((i = find_next_bit(hbmap, O2NM_MAX_NODES,
305 i + 1)) < O2NM_MAX_NODES) {
306 if (!test_bit(i, netmap))
307 printk(" %u", i);
308 }
309 printk(".\n");
310
311 return -ENOTCONN;
312}
313
314/*
259 * Called from the dlm when it's about to evict a node. This is how the 315 * Called from the dlm when it's about to evict a node. This is how the
260 * classic stack signals node death. 316 * classic stack signals node death.
261 */ 317 */
@@ -263,8 +319,8 @@ static void o2dlm_eviction_cb(int node_num, void *data)
263{ 319{
264 struct ocfs2_cluster_connection *conn = data; 320 struct ocfs2_cluster_connection *conn = data;
265 321
266 mlog(ML_NOTICE, "o2dlm has evicted node %d from group %.*s\n", 322 printk(KERN_NOTICE "o2cb: o2dlm has evicted node %d from domain %.*s\n",
267 node_num, conn->cc_namelen, conn->cc_name); 323 node_num, conn->cc_namelen, conn->cc_name);
268 324
269 conn->cc_recovery_handler(node_num, conn->cc_recovery_data); 325 conn->cc_recovery_handler(node_num, conn->cc_recovery_data);
270} 326}
@@ -280,12 +336,11 @@ static int o2cb_cluster_connect(struct ocfs2_cluster_connection *conn)
280 BUG_ON(conn == NULL); 336 BUG_ON(conn == NULL);
281 BUG_ON(conn->cc_proto == NULL); 337 BUG_ON(conn->cc_proto == NULL);
282 338
283 /* for now we only have one cluster/node, make sure we see it 339 /* Ensure cluster stack is up and all nodes are connected */
284 * in the heartbeat universe */ 340 rc = o2cb_cluster_check();
285 if (!o2hb_check_local_node_heartbeating()) { 341 if (rc) {
286 if (o2hb_global_heartbeat_active()) 342 printk(KERN_ERR "o2cb: Cluster check failed. Fix errors "
287 mlog(ML_ERROR, "Global heartbeat not started\n"); 343 "before retrying.\n");
288 rc = -EINVAL;
289 goto out; 344 goto out;
290 } 345 }
291 346
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 603f5fe9f816..938e2b2b0c9c 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -1073,7 +1073,7 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1073 1073
1074 sb->s_magic = OCFS2_SUPER_MAGIC; 1074 sb->s_magic = OCFS2_SUPER_MAGIC;
1075 1075
1076 sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | 1076 sb->s_flags = (sb->s_flags & ~(MS_POSIXACL | MS_NOSEC)) |
1077 ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0); 1077 ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) ? MS_POSIXACL : 0);
1078 1078
1079 /* Hard readonly mode only if: bdev_read_only, MS_RDONLY, 1079 /* Hard readonly mode only if: bdev_read_only, MS_RDONLY,
@@ -1108,9 +1108,9 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent)
1108 1108
1109 ocfs2_set_ro_flag(osb, 1); 1109 ocfs2_set_ro_flag(osb, 1);
1110 1110
1111 printk(KERN_NOTICE "Readonly device detected. No cluster " 1111 printk(KERN_NOTICE "ocfs2: Readonly device (%s) detected. "
1112 "services will be utilized for this mount. Recovery " 1112 "Cluster services will not be used for this mount. "
1113 "will be skipped.\n"); 1113 "Recovery will be skipped.\n", osb->dev_str);
1114 } 1114 }
1115 1115
1116 if (!ocfs2_is_hard_readonly(osb)) { 1116 if (!ocfs2_is_hard_readonly(osb)) {
@@ -2469,8 +2469,8 @@ static int ocfs2_check_volume(struct ocfs2_super *osb)
2469 goto finally; 2469 goto finally;
2470 } 2470 }
2471 } else { 2471 } else {
2472 mlog(ML_NOTICE, "File system was not unmounted cleanly, " 2472 printk(KERN_NOTICE "ocfs2: File system on device (%s) was not "
2473 "recovering volume.\n"); 2473 "unmounted cleanly, recovering it.\n", osb->dev_str);
2474 } 2474 }
2475 2475
2476 local = ocfs2_mount_local(osb); 2476 local = ocfs2_mount_local(osb);