author	Linus Torvalds <torvalds@g5.osdl.org>	2006-02-03 18:21:40 -0500
committer	Linus Torvalds <torvalds@g5.osdl.org>	2006-02-03 18:21:40 -0500
commit	d1ffa5669cd834f901141756e63195f48c1bfbf9 (patch)
tree	f0bed266c1f3fef528bbced56b48aac63e0a26b1 /fs/ocfs2
parent	d6c8f6aaa1d7f68c1e6471ab0839d9047cdd159f (diff)
parent	6eff5790d57a5c9c01489c95946881808a4b2a2c (diff)
Merge branch 'upstream-linus' of git://oss.oracle.com/home/sourcebo/git/ocfs2
Diffstat (limited to 'fs/ocfs2')
-rw-r--r--	fs/ocfs2/buffer_head_io.c	10
-rw-r--r--	fs/ocfs2/cluster/heartbeat.c	5
-rw-r--r--	fs/ocfs2/cluster/tcp.c	16
-rw-r--r--	fs/ocfs2/dlm/dlmcommon.h	1
-rw-r--r--	fs/ocfs2/dlm/dlmdomain.c	18
-rw-r--r--	fs/ocfs2/dlm/dlmmaster.c	24
-rw-r--r--	fs/ocfs2/dlm/dlmrecovery.c	250
-rw-r--r--	fs/ocfs2/dlm/dlmunlock.c	13
-rw-r--r--	fs/ocfs2/dlm/userdlm.c	2
-rw-r--r--	fs/ocfs2/extent_map.c	12
-rw-r--r--	fs/ocfs2/file.c	10
-rw-r--r--	fs/ocfs2/inode.c	6
-rw-r--r--	fs/ocfs2/inode.h	4
-rw-r--r--	fs/ocfs2/journal.c	32
-rw-r--r--	fs/ocfs2/ocfs2.h	3
-rw-r--r--	fs/ocfs2/super.c	11
-rw-r--r--	fs/ocfs2/sysfile.c	6
-rw-r--r--	fs/ocfs2/uptodate.c	12
-rw-r--r--	fs/ocfs2/uptodate.h	2
19 files changed, 323 insertions, 114 deletions
diff --git a/fs/ocfs2/buffer_head_io.c b/fs/ocfs2/buffer_head_io.c
index d424041b38e9..bae3d7548bea 100644
--- a/fs/ocfs2/buffer_head_io.c
+++ b/fs/ocfs2/buffer_head_io.c
@@ -58,7 +58,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
 		goto out;
 	}
 
-	down(&OCFS2_I(inode)->ip_io_sem);
+	mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
 
 	lock_buffer(bh);
 	set_buffer_uptodate(bh);
@@ -82,7 +82,7 @@ int ocfs2_write_block(struct ocfs2_super *osb, struct buffer_head *bh,
 		brelse(bh);
 	}
 
-	up(&OCFS2_I(inode)->ip_io_sem);
+	mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
 out:
 	mlog_exit(ret);
 	return ret;
@@ -125,13 +125,13 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
 	flags &= ~OCFS2_BH_CACHED;
 
 	if (inode)
-		down(&OCFS2_I(inode)->ip_io_sem);
+		mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
 	for (i = 0 ; i < nr ; i++) {
 		if (bhs[i] == NULL) {
 			bhs[i] = sb_getblk(sb, block++);
 			if (bhs[i] == NULL) {
 				if (inode)
-					up(&OCFS2_I(inode)->ip_io_sem);
+					mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
 				status = -EIO;
 				mlog_errno(status);
 				goto bail;
@@ -220,7 +220,7 @@ int ocfs2_read_blocks(struct ocfs2_super *osb, u64 block, int nr,
 			ocfs2_set_buffer_uptodate(inode, bh);
 	}
 	if (inode)
-		up(&OCFS2_I(inode)->ip_io_sem);
+		mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
 
 	mlog(ML_BH_IO, "block=(%"MLFu64"), nr=(%d), cached=%s\n", block, nr,
 	     (!(flags & OCFS2_BH_CACHED) || ignore_cache) ? "no" : "yes");
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 7307ba528913..d08971d29b63 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -917,8 +917,9 @@ static int o2hb_thread(void *data)
 		elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);
 
 		mlog(0, "start = %lu.%lu, end = %lu.%lu, msec = %u\n",
-		     before_hb.tv_sec, before_hb.tv_usec,
-		     after_hb.tv_sec, after_hb.tv_usec, elapsed_msec);
+		     before_hb.tv_sec, (unsigned long) before_hb.tv_usec,
+		     after_hb.tv_sec, (unsigned long) after_hb.tv_usec,
+		     elapsed_msec);
 
 		if (elapsed_msec < reg->hr_timeout_ms) {
 			/* the kthread api has blocked signals for us so no
diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c
index 35d92c01a972..d22d4cf08db1 100644
--- a/fs/ocfs2/cluster/tcp.c
+++ b/fs/ocfs2/cluster/tcp.c
@@ -1285,14 +1285,16 @@ static void o2net_idle_timer(unsigned long data)
 	mlog(ML_NOTICE, "here are some times that might help debug the "
 	     "situation: (tmr %ld.%ld now %ld.%ld dr %ld.%ld adv "
 	     "%ld.%ld:%ld.%ld func (%08x:%u) %ld.%ld:%ld.%ld)\n",
-	     sc->sc_tv_timer.tv_sec, sc->sc_tv_timer.tv_usec,
-	     now.tv_sec, now.tv_usec,
-	     sc->sc_tv_data_ready.tv_sec, sc->sc_tv_data_ready.tv_usec,
-	     sc->sc_tv_advance_start.tv_sec, sc->sc_tv_advance_start.tv_usec,
-	     sc->sc_tv_advance_stop.tv_sec, sc->sc_tv_advance_stop.tv_usec,
+	     sc->sc_tv_timer.tv_sec, (long) sc->sc_tv_timer.tv_usec,
+	     now.tv_sec, (long) now.tv_usec,
+	     sc->sc_tv_data_ready.tv_sec, (long) sc->sc_tv_data_ready.tv_usec,
+	     sc->sc_tv_advance_start.tv_sec,
+	     (long) sc->sc_tv_advance_start.tv_usec,
+	     sc->sc_tv_advance_stop.tv_sec,
+	     (long) sc->sc_tv_advance_stop.tv_usec,
 	     sc->sc_msg_key, sc->sc_msg_type,
-	     sc->sc_tv_func_start.tv_sec, sc->sc_tv_func_start.tv_usec,
-	     sc->sc_tv_func_stop.tv_sec, sc->sc_tv_func_stop.tv_usec);
+	     sc->sc_tv_func_start.tv_sec, (long) sc->sc_tv_func_start.tv_usec,
+	     sc->sc_tv_func_stop.tv_sec, (long) sc->sc_tv_func_stop.tv_usec);
 
 	o2net_sc_queue_work(sc, &sc->sc_shutdown_work);
 }
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 3fecba0a6023..42eb53b5293b 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -657,6 +657,7 @@ void dlm_complete_thread(struct dlm_ctxt *dlm);
 int dlm_launch_recovery_thread(struct dlm_ctxt *dlm);
 void dlm_complete_recovery_thread(struct dlm_ctxt *dlm);
 void dlm_wait_for_recovery(struct dlm_ctxt *dlm);
+int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node);
 
 void dlm_put(struct dlm_ctxt *dlm);
 struct dlm_ctxt *dlm_grab(struct dlm_ctxt *dlm);
diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c
index da3c22045f89..6ee30837389c 100644
--- a/fs/ocfs2/dlm/dlmdomain.c
+++ b/fs/ocfs2/dlm/dlmdomain.c
@@ -573,8 +573,11 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data)
 	spin_lock(&dlm_domain_lock);
 	dlm = __dlm_lookup_domain_full(query->domain, query->name_len);
 	/* Once the dlm ctxt is marked as leaving then we don't want
-	 * to be put in someone's domain map. */
+	 * to be put in someone's domain map.
+	 * Also, explicitly disallow joining at certain troublesome
+	 * times (ie. during recovery). */
 	if (dlm && dlm->dlm_state != DLM_CTXT_LEAVING) {
+		int bit = query->node_idx;
 		spin_lock(&dlm->spinlock);
 
 		if (dlm->dlm_state == DLM_CTXT_NEW &&
@@ -586,6 +589,19 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data)
 		} else if (dlm->joining_node != DLM_LOCK_RES_OWNER_UNKNOWN) {
 			/* Disallow parallel joins. */
 			response = JOIN_DISALLOW;
+		} else if (dlm->reco.state & DLM_RECO_STATE_ACTIVE) {
+			mlog(ML_NOTICE, "node %u trying to join, but recovery "
+			     "is ongoing.\n", bit);
+			response = JOIN_DISALLOW;
+		} else if (test_bit(bit, dlm->recovery_map)) {
+			mlog(ML_NOTICE, "node %u trying to join, but it "
+			     "still needs recovery.\n", bit);
+			response = JOIN_DISALLOW;
+		} else if (test_bit(bit, dlm->domain_map)) {
+			mlog(ML_NOTICE, "node %u trying to join, but it "
+			     "is still in the domain! needs recovery?\n",
+			     bit);
+			response = JOIN_DISALLOW;
 		} else {
 			/* Alright we're fully a part of this domain
 			 * so we keep some state as to who's joining
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 27e984f7e4cd..a3194fe173d9 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -1050,17 +1050,10 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm,
 	node = dlm_bitmap_diff_iter_next(&bdi, &sc);
 	while (node >= 0) {
 		if (sc == NODE_UP) {
-			/* a node came up.  easy.  might not even need
-			 * to talk to it if its node number is higher
-			 * or if we are already blocked. */
-			mlog(0, "node up! %d\n", node);
-			if (blocked)
-				goto next;
-
-			if (node > dlm->node_num) {
-				mlog(0, "node > this node. skipping.\n");
-				goto next;
-			}
+			/* a node came up.  clear any old vote from
+			 * the response map and set it in the vote map
+			 * then restart the mastery. */
+			mlog(ML_NOTICE, "node %d up while restarting\n", node);
 
 			/* redo the master request, but only for the new node */
 			mlog(0, "sending request to new node\n");
@@ -2005,6 +1998,15 @@ fail:
 			break;
 
 		mlog(0, "timed out during migration\n");
+		/* avoid hang during shutdown when migrating lockres
+		 * to a node which also goes down */
+		if (dlm_is_node_dead(dlm, target)) {
+			mlog(0, "%s:%.*s: expected migration target %u "
+			     "is no longer up.  restarting.\n",
+			     dlm->name, res->lockname.len,
+			     res->lockname.name, target);
+			ret = -ERESTARTSYS;
+		}
 	}
 	if (ret == -ERESTARTSYS) {
 		/* migration failed, detach and clean up mle */
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 0c8eb1093f00..186e9a76aa58 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -39,6 +39,7 @@
 #include <linux/inet.h>
 #include <linux/timer.h>
 #include <linux/kthread.h>
+#include <linux/delay.h>
 
 
 #include "cluster/heartbeat.h"
@@ -256,6 +257,27 @@ static int dlm_recovery_thread(void *data)
 	return 0;
 }
 
+/* returns true when the recovery master has contacted us */
+static int dlm_reco_master_ready(struct dlm_ctxt *dlm)
+{
+	int ready;
+	spin_lock(&dlm->spinlock);
+	ready = (dlm->reco.new_master != O2NM_INVALID_NODE_NUM);
+	spin_unlock(&dlm->spinlock);
+	return ready;
+}
+
+/* returns true if node is no longer in the domain
+ * could be dead or just not joined */
+int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node)
+{
+	int dead;
+	spin_lock(&dlm->spinlock);
+	dead = !test_bit(node, dlm->domain_map);
+	spin_unlock(&dlm->spinlock);
+	return dead;
+}
+
 /* callers of the top-level api calls (dlmlock/dlmunlock) should
  * block on the dlm->reco.event when recovery is in progress.
  * the dlm recovery thread will set this state when it begins
@@ -297,6 +319,7 @@ static void dlm_end_recovery(struct dlm_ctxt *dlm)
 static int dlm_do_recovery(struct dlm_ctxt *dlm)
 {
 	int status = 0;
+	int ret;
 
 	spin_lock(&dlm->spinlock);
 
@@ -343,10 +366,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
 		goto master_here;
 
 	if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) {
-		/* choose a new master */
-		if (!dlm_pick_recovery_master(dlm)) {
+		/* choose a new master, returns 0 if this node
+		 * is the master, -EEXIST if it's another node.
+		 * this does not return until a new master is chosen
+		 * or recovery completes entirely. */
+		ret = dlm_pick_recovery_master(dlm);
+		if (!ret) {
 			/* already notified everyone.  go. */
-			dlm->reco.new_master = dlm->node_num;
 			goto master_here;
 		}
 		mlog(0, "another node will master this recovery session.\n");
@@ -371,8 +397,13 @@ master_here:
 	if (status < 0) {
 		mlog(ML_ERROR, "error %d remastering locks for node %u, "
 		     "retrying.\n", status, dlm->reco.dead_node);
+		/* yield a bit to allow any final network messages
+		 * to get handled on remaining nodes */
+		msleep(100);
 	} else {
 		/* success!  see if any other nodes need recovery */
+		mlog(0, "DONE mastering recovery of %s:%u here(this=%u)!\n",
+		     dlm->name, dlm->reco.dead_node, dlm->node_num);
 		dlm_reset_recovery(dlm);
 	}
 	dlm_end_recovery(dlm);
@@ -477,7 +508,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 				BUG();
 				break;
 			case DLM_RECO_NODE_DATA_DEAD:
-				mlog(0, "node %u died after "
+				mlog(ML_NOTICE, "node %u died after "
 				     "requesting recovery info for "
 				     "node %u\n", ndata->node_num,
 				     dead_node);
@@ -485,6 +516,19 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
 				// start all over
 				destroy = 1;
 				status = -EAGAIN;
+				/* instead of spinning like crazy here,
+				 * wait for the domain map to catch up
+				 * with the network state.  otherwise this
+				 * can be hit hundreds of times before
+				 * the node is really seen as dead. */
+				wait_event_timeout(dlm->dlm_reco_thread_wq,
+						   dlm_is_node_dead(dlm,
+							ndata->node_num),
+						   msecs_to_jiffies(1000));
+				mlog(0, "waited 1 sec for %u, "
+				     "dead? %s\n", ndata->node_num,
+				     dlm_is_node_dead(dlm, ndata->node_num) ?
+				     "yes" : "no");
 				goto leave;
 			case DLM_RECO_NODE_DATA_RECEIVING:
 			case DLM_RECO_NODE_DATA_REQUESTED:
@@ -678,11 +722,27 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
 	dlm = item->dlm;
 	dead_node = item->u.ral.dead_node;
 	reco_master = item->u.ral.reco_master;
+	mres = (struct dlm_migratable_lockres *)data;
+
+	if (dead_node != dlm->reco.dead_node ||
+	    reco_master != dlm->reco.new_master) {
+		/* show extra debug info if the recovery state is messed */
+		mlog(ML_ERROR, "%s: bad reco state: reco(dead=%u, master=%u), "
+		     "request(dead=%u, master=%u)\n",
+		     dlm->name, dlm->reco.dead_node, dlm->reco.new_master,
+		     dead_node, reco_master);
+		mlog(ML_ERROR, "%s: name=%.*s master=%u locks=%u/%u flags=%u "
+		     "entry[0]={c=%"MLFu64",l=%u,f=%u,t=%d,ct=%d,hb=%d,n=%u}\n",
+		     dlm->name, mres->lockname_len, mres->lockname, mres->master,
+		     mres->num_locks, mres->total_locks, mres->flags,
+		     mres->ml[0].cookie, mres->ml[0].list, mres->ml[0].flags,
+		     mres->ml[0].type, mres->ml[0].convert_type,
+		     mres->ml[0].highest_blocked, mres->ml[0].node);
+		BUG();
+	}
 	BUG_ON(dead_node != dlm->reco.dead_node);
 	BUG_ON(reco_master != dlm->reco.new_master);
 
-	mres = (struct dlm_migratable_lockres *)data;
-
 	/* lock resources should have already been moved to the
 	 * dlm->reco.resources list.  now move items from that list
 	 * to a temp list if the dead owner matches.  note that the
@@ -757,15 +817,18 @@ int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data)
 			continue;
 
 		switch (ndata->state) {
+			/* should have moved beyond INIT but not to FINALIZE yet */
 			case DLM_RECO_NODE_DATA_INIT:
 			case DLM_RECO_NODE_DATA_DEAD:
-			case DLM_RECO_NODE_DATA_DONE:
 			case DLM_RECO_NODE_DATA_FINALIZE_SENT:
 				mlog(ML_ERROR, "bad ndata state for node %u:"
 				     " state=%d\n", ndata->node_num,
 				     ndata->state);
 				BUG();
 				break;
+			/* these states are possible at this point, anywhere along
+			 * the line of recovery */
+			case DLM_RECO_NODE_DATA_DONE:
 			case DLM_RECO_NODE_DATA_RECEIVING:
 			case DLM_RECO_NODE_DATA_REQUESTED:
 			case DLM_RECO_NODE_DATA_REQUESTING:
@@ -799,13 +862,31 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
 {
 	struct dlm_lock_resource *res;
 	struct list_head *iter, *iter2;
+	struct dlm_lock *lock;
 
 	spin_lock(&dlm->spinlock);
 	list_for_each_safe(iter, iter2, &dlm->reco.resources) {
 		res = list_entry (iter, struct dlm_lock_resource, recovering);
+		/* always prune any $RECOVERY entries for dead nodes,
+		 * otherwise hangs can occur during later recovery */
 		if (dlm_is_recovery_lock(res->lockname.name,
-					 res->lockname.len))
+					 res->lockname.len)) {
+			spin_lock(&res->spinlock);
+			list_for_each_entry(lock, &res->granted, list) {
+				if (lock->ml.node == dead_node) {
+					mlog(0, "AHA! there was "
+					     "a $RECOVERY lock for dead "
+					     "node %u (%s)!\n",
+					     dead_node, dlm->name);
+					list_del_init(&lock->list);
+					dlm_lock_put(lock);
+					break;
+				}
+			}
+			spin_unlock(&res->spinlock);
 			continue;
+		}
+
 		if (res->owner == dead_node) {
 			mlog(0, "found lockres owned by dead node while "
 			     "doing recovery for node %u. sending it.\n",
@@ -1179,7 +1260,7 @@ static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data)
 again:
 	ret = dlm_lockres_master_requery(dlm, res, &real_master);
 	if (ret < 0) {
-		mlog(0, "dlm_lockres_master_requery failure: %d\n",
+		mlog(0, "dlm_lockres_master_requery ret=%d\n",
 		     ret);
 		goto again;
 	}
@@ -1757,6 +1838,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
 	struct dlm_lock_resource *res;
 	int i;
 	struct list_head *bucket;
+	struct dlm_lock *lock;
 
 
 	/* purge any stale mles */
@@ -1780,10 +1862,25 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
 		bucket = &(dlm->resources[i]);
 		list_for_each(iter, bucket) {
 			res = list_entry (iter, struct dlm_lock_resource, list);
+			/* always prune any $RECOVERY entries for dead nodes,
+			 * otherwise hangs can occur during later recovery */
 			if (dlm_is_recovery_lock(res->lockname.name,
-						 res->lockname.len))
+						 res->lockname.len)) {
+				spin_lock(&res->spinlock);
+				list_for_each_entry(lock, &res->granted, list) {
+					if (lock->ml.node == dead_node) {
+						mlog(0, "AHA! there was "
+						     "a $RECOVERY lock for dead "
+						     "node %u (%s)!\n",
+						     dead_node, dlm->name);
+						list_del_init(&lock->list);
+						dlm_lock_put(lock);
+						break;
+					}
+				}
+				spin_unlock(&res->spinlock);
 				continue;
-
+			}
 			spin_lock(&res->spinlock);
 			/* zero the lvb if necessary */
 			dlm_revalidate_lvb(dlm, res, dead_node);
@@ -1869,12 +1966,9 @@ void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data)
 		return;
 
 	spin_lock(&dlm->spinlock);
-
 	set_bit(idx, dlm->live_nodes_map);
-
-	/* notify any mles attached to the heartbeat events */
-	dlm_hb_event_notify_attached(dlm, idx, 1);
-
+	/* do NOT notify mle attached to the heartbeat events.
+	 * new nodes are not interesting in mastery until joined. */
 	spin_unlock(&dlm->spinlock);
 
 	dlm_put(dlm);
@@ -1897,7 +1991,18 @@ static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st)
 	mlog(0, "unlockast for recovery lock fired!\n");
 }
 
-
+/*
+ * dlm_pick_recovery_master will continually attempt to use
+ * dlmlock() on the special "$RECOVERY" lockres with the
+ * LKM_NOQUEUE flag to get an EX. every thread that enters
+ * this function on each node racing to become the recovery
+ * master will not stop attempting this until either:
+ * a) this node gets the EX (and becomes the recovery master),
+ * or b) dlm->reco.new_master gets set to some nodenum
+ * != O2NM_INVALID_NODE_NUM (another node will do the reco).
+ * so each time a recovery master is needed, the entire cluster
+ * will sync at this point. if the new master dies, that will
+ * be detected in dlm_do_recovery */
 static int dlm_pick_recovery_master(struct dlm_ctxt *dlm)
 {
 	enum dlm_status ret;
@@ -1906,23 +2011,45 @@ static int dlm_pick_recovery_master(struct dlm_ctxt *dlm)
 
 	mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n",
 	     dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num);
-retry:
+again:
 	memset(&lksb, 0, sizeof(lksb));
 
 	ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY,
 		      DLM_RECOVERY_LOCK_NAME, dlm_reco_ast, dlm, dlm_reco_bast);
 
+	mlog(0, "%s: dlmlock($RECOVERY) returned %d, lksb=%d\n",
+	     dlm->name, ret, lksb.status);
+
 	if (ret == DLM_NORMAL) {
 		mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n",
 		     dlm->name, dlm->node_num);
-		/* I am master, send message to all nodes saying
-		 * that I am beginning a recovery session */
-		status = dlm_send_begin_reco_message(dlm,
-						     dlm->reco.dead_node);
+
+		/* got the EX lock.  check to see if another node
+		 * just became the reco master */
+		if (dlm_reco_master_ready(dlm)) {
+			mlog(0, "%s: got reco EX lock, but %u will "
+			     "do the recovery\n", dlm->name,
+			     dlm->reco.new_master);
+			status = -EEXIST;
+		} else {
+			status = dlm_send_begin_reco_message(dlm,
+							     dlm->reco.dead_node);
+			/* this always succeeds */
+			BUG_ON(status);
+
+			/* set the new_master to this node */
+			spin_lock(&dlm->spinlock);
+			dlm->reco.new_master = dlm->node_num;
+			spin_unlock(&dlm->spinlock);
+		}
 
 		/* recovery lock is a special case.  ast will not get fired,
 		 * so just go ahead and unlock it. */
 		ret = dlmunlock(dlm, &lksb, 0, dlm_reco_unlock_ast, dlm);
+		if (ret == DLM_DENIED) {
+			mlog(0, "got DLM_DENIED, trying LKM_CANCEL\n");
+			ret = dlmunlock(dlm, &lksb, LKM_CANCEL, dlm_reco_unlock_ast, dlm);
+		}
 		if (ret != DLM_NORMAL) {
 			/* this would really suck. this could only happen
 			 * if there was a network error during the unlock
@@ -1930,20 +2057,42 @@ retry:
 			 * is actually "done" and the lock structure is
 			 * even freed.  we can continue, but only
 			 * because this specific lock name is special. */
-			mlog(0, "dlmunlock returned %d\n", ret);
-		}
-
-		if (status < 0) {
-			mlog(0, "failed to send recovery message. "
-			     "must retry with new node map.\n");
-			goto retry;
+			mlog(ML_ERROR, "dlmunlock returned %d\n", ret);
 		}
 	} else if (ret == DLM_NOTQUEUED) {
 		mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n",
 		     dlm->name, dlm->node_num);
 		/* another node is master. wait on
-		 * reco.new_master != O2NM_INVALID_NODE_NUM */
+		 * reco.new_master != O2NM_INVALID_NODE_NUM
+		 * for at most one second */
+		wait_event_timeout(dlm->dlm_reco_thread_wq,
+				   dlm_reco_master_ready(dlm),
+				   msecs_to_jiffies(1000));
+		if (!dlm_reco_master_ready(dlm)) {
+			mlog(0, "%s: reco master taking awhile\n",
+			     dlm->name);
+			goto again;
+		}
+		/* another node has informed this one that it is reco master */
+		mlog(0, "%s: reco master %u is ready to recover %u\n",
+		     dlm->name, dlm->reco.new_master, dlm->reco.dead_node);
 		status = -EEXIST;
+	} else {
+		struct dlm_lock_resource *res;
+
+		/* dlmlock returned something other than NOTQUEUED or NORMAL */
+		mlog(ML_ERROR, "%s: got %s from dlmlock($RECOVERY), "
+		     "lksb.status=%s\n", dlm->name, dlm_errname(ret),
+		     dlm_errname(lksb.status));
+		res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME,
+					 DLM_RECOVERY_LOCK_NAME_LEN);
+		if (res) {
+			dlm_print_one_lock_resource(res);
+			dlm_lockres_put(res);
+		} else {
+			mlog(ML_ERROR, "recovery lock not found\n");
+		}
+		BUG();
 	}
 
 	return status;
@@ -1982,7 +2131,7 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
 			mlog(0, "not sending begin reco to self\n");
 			continue;
 		}
-
+retry:
 		ret = -EINVAL;
 		mlog(0, "attempting to send begin reco msg to %d\n",
 		     nodenum);
@@ -1991,8 +2140,17 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
 		/* negative status is handled ok by caller here */
 		if (ret >= 0)
 			ret = status;
+		if (dlm_is_host_down(ret)) {
+			/* node is down.  not involved in recovery
+			 * so just keep going */
+			mlog(0, "%s: node %u was down when sending "
+			     "begin reco msg (%d)\n", dlm->name, nodenum, ret);
+			ret = 0;
+		}
 		if (ret < 0) {
 			struct dlm_lock_resource *res;
+			/* this is now a serious problem, possibly ENOMEM
+			 * in the network stack.  must retry */
 			mlog_errno(ret);
 			mlog(ML_ERROR, "begin reco of dlm %s to node %u "
 			     " returned %d\n", dlm->name, nodenum, ret);
@@ -2004,7 +2162,10 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
 			} else {
 				mlog(ML_ERROR, "recovery lock not found\n");
 			}
-			break;
+			/* sleep for a bit in hopes that we can avoid
+			 * another ENOMEM */
+			msleep(100);
+			goto retry;
 		}
 	}
 
@@ -2027,19 +2188,34 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data)
 
 	spin_lock(&dlm->spinlock);
 	if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) {
-		mlog(0, "new_master already set to %u!\n",
-		     dlm->reco.new_master);
+		if (test_bit(dlm->reco.new_master, dlm->recovery_map)) {
+			mlog(0, "%s: new_master %u died, changing "
+			     "to %u\n", dlm->name, dlm->reco.new_master,
+			     br->node_idx);
+		} else {
+			mlog(0, "%s: new_master %u NOT DEAD, changing "
+			     "to %u\n", dlm->name, dlm->reco.new_master,
+			     br->node_idx);
+			/* may not have seen the new master as dead yet */
+		}
 	}
 	if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) {
-		mlog(0, "dead_node already set to %u!\n",
-		     dlm->reco.dead_node);
+		mlog(ML_NOTICE, "%s: dead_node previously set to %u, "
+		     "node %u changing it to %u\n", dlm->name,
+		     dlm->reco.dead_node, br->node_idx, br->dead_node);
 	}
 	dlm->reco.new_master = br->node_idx;
 	dlm->reco.dead_node = br->dead_node;
 	if (!test_bit(br->dead_node, dlm->recovery_map)) {
-		mlog(ML_ERROR, "recovery master %u sees %u as dead, but this "
+		mlog(0, "recovery master %u sees %u as dead, but this "
 		     "node has not yet.  marking %u as dead\n",
 		     br->node_idx, br->dead_node, br->dead_node);
+		if (!test_bit(br->dead_node, dlm->domain_map) ||
+		    !test_bit(br->dead_node, dlm->live_nodes_map))
+			mlog(0, "%u not in domain/live_nodes map "
+			     "so setting it in reco map manually\n",
+			     br->dead_node);
+		set_bit(br->dead_node, dlm->recovery_map);
 		__dlm_hb_node_down(dlm, br->dead_node);
 	}
 	spin_unlock(&dlm->spinlock);
diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c
index cec2ce1cd318..c95f08d2e925 100644
--- a/fs/ocfs2/dlm/dlmunlock.c
+++ b/fs/ocfs2/dlm/dlmunlock.c
@@ -188,6 +188,19 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm,
 		actions &= ~(DLM_UNLOCK_REMOVE_LOCK|
 			     DLM_UNLOCK_REGRANT_LOCK|
 			     DLM_UNLOCK_CLEAR_CONVERT_TYPE);
+	} else if (status == DLM_RECOVERING ||
+		   status == DLM_MIGRATING ||
+		   status == DLM_FORWARD) {
+		/* must clear the actions because this unlock
+		 * is about to be retried.  cannot free or do
+		 * any list manipulation. */
+		mlog(0, "%s:%.*s: clearing actions, %s\n",
+		     dlm->name, res->lockname.len,
+		     res->lockname.name,
+		     status==DLM_RECOVERING?"recovering":
+		     (status==DLM_MIGRATING?"migrating":
+		      "forward"));
+		actions = 0;
 	}
 	if (flags & LKM_CANCEL)
 		lock->cancel_pending = 0;
diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c
index e1fdd288796e..c3764f4744ee 100644
--- a/fs/ocfs2/dlm/userdlm.c
+++ b/fs/ocfs2/dlm/userdlm.c
@@ -27,7 +27,7 @@
  * Boston, MA 021110-1307, USA.
  */
 
-#include <asm/signal.h>
+#include <linux/signal.h>
 
 #include <linux/module.h>
 #include <linux/fs.h>
diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c
index f2fb40cd296a..b6ba292e9544 100644
--- a/fs/ocfs2/extent_map.c
+++ b/fs/ocfs2/extent_map.c
@@ -262,8 +262,7 @@ static int ocfs2_extent_map_find_leaf(struct inode *inode,
 		el = &eb->h_list;
 	}
 
-	if (el->l_tree_depth)
-		BUG();
+	BUG_ON(el->l_tree_depth);
 
 	for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
 		rec = &el->l_recs[i];
@@ -364,8 +363,8 @@ static int ocfs2_extent_map_lookup_read(struct inode *inode,
 		return ret;
 	}
 
-	if (ent->e_tree_depth)
-		BUG();  /* FIXME: Make sure this isn't a corruption */
+	/* FIXME: Make sure this isn't a corruption */
+	BUG_ON(ent->e_tree_depth);
 
 	*ret_ent = ent;
 
@@ -423,8 +422,7 @@ static int ocfs2_extent_map_try_insert(struct inode *inode,
 					    le32_to_cpu(rec->e_clusters), NULL,
 					    NULL);
 
-	if (!old_ent)
-		BUG();
+	BUG_ON(!old_ent);
 
 	ret = -EEXIST;
 	if (old_ent->e_tree_depth < tree_depth)
@@ -988,7 +986,7 @@ int __init init_ocfs2_extent_maps(void)
 	return 0;
 }
 
-void __exit exit_ocfs2_extent_maps(void)
+void exit_ocfs2_extent_maps(void)
 {
 	kmem_cache_destroy(ocfs2_em_ent_cachep);
 }
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index eaf33caa0a1f..1715bc90e705 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1022,8 +1022,9 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
 	}
 	newsize = count + saved_pos;
 
-	mlog(0, "pos=%lld newsize=%"MLFu64" cursize=%lld\n",
-	     saved_pos, newsize, i_size_read(inode));
+	mlog(0, "pos=%lld newsize=%lld cursize=%lld\n",
+	     (long long) saved_pos, (long long) newsize,
+	     (long long) i_size_read(inode));
 
 	/* No need for a higher level metadata lock if we're
 	 * never going past i_size. */
@@ -1042,8 +1043,9 @@ static ssize_t ocfs2_file_aio_write(struct kiocb *iocb,
 	spin_unlock(&OCFS2_I(inode)->ip_lock);
 
 	mlog(0, "Writing at EOF, may need more allocation: "
-	     "i_size = %lld, newsize = %"MLFu64", need %u clusters\n",
-	     i_size_read(inode), newsize, clusters);
+	     "i_size = %lld, newsize = %lld, need %u clusters\n",
+	     (long long) i_size_read(inode), (long long) newsize,
+	     clusters);
 
 	/* We only want to continue the rest of this loop if
 	 * our extend will actually require more
diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c
index d4ecc0627716..8122489c5762 100644
--- a/fs/ocfs2/inode.c
+++ b/fs/ocfs2/inode.c
@@ -903,10 +903,10 @@ void ocfs2_clear_inode(struct inode *inode)
903 "Clear inode of %"MLFu64", inode is locked\n", 903 "Clear inode of %"MLFu64", inode is locked\n",
904 oi->ip_blkno); 904 oi->ip_blkno);
905 905
906 mlog_bug_on_msg(down_trylock(&oi->ip_io_sem), 906 mlog_bug_on_msg(!mutex_trylock(&oi->ip_io_mutex),
907 "Clear inode of %"MLFu64", io_sem is locked\n", 907 "Clear inode of %"MLFu64", io_mutex is locked\n",
908 oi->ip_blkno); 908 oi->ip_blkno);
909 up(&oi->ip_io_sem); 909 mutex_unlock(&oi->ip_io_mutex);
910 910
911 /* 911 /*
912 * down_trylock() returns 0, down_write_trylock() returns 1 912 * down_trylock() returns 0, down_write_trylock() returns 1
diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h
index 9b0177433653..84c507961287 100644
--- a/fs/ocfs2/inode.h
+++ b/fs/ocfs2/inode.h
@@ -46,10 +46,10 @@ struct ocfs2_inode_info
 	struct list_head		ip_io_markers;
 	int				ip_orphaned_slot;
 
-	struct semaphore		ip_io_sem;
+	struct mutex			ip_io_mutex;
 
 	/* Used by the journalling code to attach an inode to a
-	 * handle.  These are protected by ip_io_sem in order to lock
+	 * handle.  These are protected by ip_io_mutex in order to lock
 	 * out other I/O to the inode until we either commit or
 	 * abort. */
 	struct list_head		ip_handle_list;
diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c
index 303c8d96457f..fa0bcac5ceae 100644
--- a/fs/ocfs2/journal.c
+++ b/fs/ocfs2/journal.c
@@ -147,8 +147,7 @@ struct ocfs2_journal_handle *ocfs2_start_trans(struct ocfs2_super *osb,
 
 	mlog_entry("(max_buffs = %d)\n", max_buffs);
 
-	if (!osb || !osb->journal->j_journal)
-		BUG();
+	BUG_ON(!osb || !osb->journal->j_journal);
 
 	if (ocfs2_is_hard_readonly(osb)) {
 		ret = -EROFS;
@@ -401,7 +400,7 @@ int ocfs2_journal_access(struct ocfs2_journal_handle *handle,
 	 * j_trans_barrier for us. */
 	ocfs2_set_inode_lock_trans(OCFS2_SB(inode->i_sb)->journal, inode);
 
-	down(&OCFS2_I(inode)->ip_io_sem);
+	mutex_lock(&OCFS2_I(inode)->ip_io_mutex);
 	switch (type) {
 	case OCFS2_JOURNAL_ACCESS_CREATE:
 	case OCFS2_JOURNAL_ACCESS_WRITE:
@@ -416,7 +415,7 @@ int ocfs2_journal_access(struct ocfs2_journal_handle *handle,
 		status = -EINVAL;
 		mlog(ML_ERROR, "Uknown access type!\n");
 	}
-	up(&OCFS2_I(inode)->ip_io_sem);
+	mutex_unlock(&OCFS2_I(inode)->ip_io_mutex);
 
 	if (status < 0)
 		mlog(ML_ERROR, "Error %d getting %d access to buffer!\n",
@@ -561,7 +560,11 @@ int ocfs2_journal_init(struct ocfs2_journal *journal, int *dirty)
 	SET_INODE_JOURNAL(inode);
 	OCFS2_I(inode)->ip_open_count++;
 
-	status = ocfs2_meta_lock(inode, NULL, &bh, 1);
+	/* Skip recovery waits here - journal inode metadata never
+	 * changes in a live cluster so it can be considered an
+	 * exception to the rule. */
+	status = ocfs2_meta_lock_full(inode, NULL, &bh, 1,
+				      OCFS2_META_LOCK_RECOVERY);
 	if (status < 0) {
 		if (status != -ERESTARTSYS)
 			mlog(ML_ERROR, "Could not get lock on journal!\n");
@@ -672,8 +675,7 @@ void ocfs2_journal_shutdown(struct ocfs2_super *osb)
 
 	mlog_entry_void();
 
-	if (!osb)
-		BUG();
+	BUG_ON(!osb);
 
 	journal = osb->journal;
 	if (!journal)
@@ -805,8 +807,7 @@ int ocfs2_journal_wipe(struct ocfs2_journal *journal, int full)
 
 	mlog_entry_void();
 
-	if (!journal)
-		BUG();
+	BUG_ON(!journal);
 
 	status = journal_wipe(journal->j_journal, full);
 	if (status < 0) {
@@ -1072,10 +1073,10 @@ restart:
 					NULL);
 
 bail:
-	down(&osb->recovery_lock);
+	mutex_lock(&osb->recovery_lock);
 	if (!status &&
 	    !ocfs2_node_map_is_empty(osb, &osb->recovery_map)) {
-		up(&osb->recovery_lock);
+		mutex_unlock(&osb->recovery_lock);
 		goto restart;
 	}
 
@@ -1083,7 +1084,7 @@ bail:
 	mb(); /* sync with ocfs2_recovery_thread_running */
 	wake_up(&osb->recovery_event);
 
-	up(&osb->recovery_lock);
+	mutex_unlock(&osb->recovery_lock);
 
 	mlog_exit(status);
 	/* no one is callint kthread_stop() for us so the kthread() api
@@ -1098,7 +1099,7 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
 	mlog_entry("(node_num=%d, osb->node_num = %d)\n",
 		   node_num, osb->node_num);
 
-	down(&osb->recovery_lock);
+	mutex_lock(&osb->recovery_lock);
 	if (osb->disable_recovery)
 		goto out;
 
@@ -1120,7 +1121,7 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num)
 	}
 
 out:
-	up(&osb->recovery_lock);
+	mutex_unlock(&osb->recovery_lock);
 	wake_up(&osb->recovery_event);
 
 	mlog_exit_void();
@@ -1271,8 +1272,7 @@ static int ocfs2_recover_node(struct ocfs2_super *osb,
 
 	/* Should not ever be called to recover ourselves -- in that
 	 * case we should've called ocfs2_journal_load instead. */
-	if (osb->node_num == node_num)
-		BUG();
+	BUG_ON(osb->node_num == node_num);
 
 	slot_num = ocfs2_node_num_to_slot(si, node_num);
 	if (slot_num == OCFS2_INVALID_SLOT) {
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index f468c600cf92..8d8e4779df92 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -33,6 +33,7 @@
 #include <linux/rbtree.h>
 #include <linux/workqueue.h>
 #include <linux/kref.h>
+#include <linux/mutex.h>
 
 #include "cluster/nodemanager.h"
 #include "cluster/heartbeat.h"
@@ -233,7 +234,7 @@ struct ocfs2_super
 	struct proc_dir_entry *proc_sub_dir; /* points to /proc/fs/ocfs2/<maj_min> */
 
 	atomic_t vol_state;
-	struct semaphore recovery_lock;
+	struct mutex recovery_lock;
 	struct task_struct *recovery_thread_task;
 	int disable_recovery;
 	wait_queue_head_t checkpoint_event;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 364d64bd5f10..046824b6b625 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -932,7 +932,7 @@ static void ocfs2_inode_init_once(void *data,
 	oi->ip_dir_start_lookup = 0;
 
 	init_rwsem(&oi->ip_alloc_sem);
-	init_MUTEX(&(oi->ip_io_sem));
+	mutex_init(&oi->ip_io_mutex);
 
 	oi->ip_blkno = 0ULL;
 	oi->ip_clusters = 0;
@@ -1137,9 +1137,9 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err)
 
 	/* disable any new recovery threads and wait for any currently
 	 * running ones to exit. Do this before setting the vol_state. */
-	down(&osb->recovery_lock);
+	mutex_lock(&osb->recovery_lock);
 	osb->disable_recovery = 1;
-	up(&osb->recovery_lock);
+	mutex_unlock(&osb->recovery_lock);
 	wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb));
 
 	/* At this point, we know that no more recovery threads can be
@@ -1254,8 +1254,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	osb->sb = sb;
 	/* Save off for ocfs2_rw_direct */
 	osb->s_sectsize_bits = blksize_bits(sector_size);
-	if (!osb->s_sectsize_bits)
-		BUG();
+	BUG_ON(!osb->s_sectsize_bits);
 
 	osb->net_response_ids = 0;
 	spin_lock_init(&osb->net_response_lock);
@@ -1283,7 +1282,7 @@ static int ocfs2_initialize_super(struct super_block *sb,
 	snprintf(osb->dev_str, sizeof(osb->dev_str), "%u,%u",
 		 MAJOR(osb->sb->s_dev), MINOR(osb->sb->s_dev));
 
-	init_MUTEX(&osb->recovery_lock);
+	mutex_init(&osb->recovery_lock);
 
 	osb->disable_recovery = 0;
 	osb->recovery_thread_task = NULL;
diff --git a/fs/ocfs2/sysfile.c b/fs/ocfs2/sysfile.c
index 600a8bc5b541..fc29cb7a437d 100644
--- a/fs/ocfs2/sysfile.c
+++ b/fs/ocfs2/sysfile.c
@@ -77,8 +77,7 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
 	if (arr && ((inode = *arr) != NULL)) {
 		/* get a ref in addition to the array ref */
 		inode = igrab(inode);
-		if (!inode)
-			BUG();
+		BUG_ON(!inode);
 
 		return inode;
 	}
@@ -89,8 +88,7 @@ struct inode *ocfs2_get_system_file_inode(struct ocfs2_super *osb,
 	/* add one more if putting into array for first time */
 	if (arr && inode) {
 		*arr = igrab(inode);
-		if (!*arr)
-			BUG();
+		BUG_ON(!*arr);
 	}
 	return inode;
 }
diff --git a/fs/ocfs2/uptodate.c b/fs/ocfs2/uptodate.c
index 3a0458fd3e1b..300b5bedfb21 100644
--- a/fs/ocfs2/uptodate.c
+++ b/fs/ocfs2/uptodate.c
@@ -388,7 +388,7 @@ out_free:
 	}
 }
 
-/* Item insertion is guarded by ip_io_sem, so the insertion path takes
+/* Item insertion is guarded by ip_io_mutex, so the insertion path takes
  * advantage of this by not rechecking for a duplicate insert during
  * the slow case. Additionally, if the cache needs to be bumped up to
  * a tree, the code will not recheck after acquiring the lock --
@@ -418,7 +418,7 @@ void ocfs2_set_buffer_uptodate(struct inode *inode,
 	     (unsigned long long) bh->b_blocknr);
 
 	/* No need to recheck under spinlock - insertion is guarded by
-	 * ip_io_sem */
+	 * ip_io_mutex */
 	spin_lock(&oi->ip_lock);
 	if (ocfs2_insert_can_use_array(oi, ci)) {
 		/* Fast case - it's an array and there's a free
@@ -440,7 +440,7 @@ void ocfs2_set_buffer_uptodate(struct inode *inode,
 
 /* Called against a newly allocated buffer. Most likely nobody should
  * be able to read this sort of metadata while it's still being
- * allocated, but this is careful to take ip_io_sem anyway. */
+ * allocated, but this is careful to take ip_io_mutex anyway. */
 void ocfs2_set_new_buffer_uptodate(struct inode *inode,
 				   struct buffer_head *bh)
 {
@@ -451,9 +451,9 @@ void ocfs2_set_new_buffer_uptodate(struct inode *inode,
 
 	set_buffer_uptodate(bh);
 
-	down(&oi->ip_io_sem);
+	mutex_lock(&oi->ip_io_mutex);
 	ocfs2_set_buffer_uptodate(inode, bh);
-	up(&oi->ip_io_sem);
+	mutex_unlock(&oi->ip_io_mutex);
 }
 
 /* Requires ip_lock. */
@@ -537,7 +537,7 @@ int __init init_ocfs2_uptodate_cache(void)
 	return 0;
 }
 
-void __exit exit_ocfs2_uptodate_cache(void)
+void exit_ocfs2_uptodate_cache(void)
 {
 	if (ocfs2_uptodate_cachep)
 		kmem_cache_destroy(ocfs2_uptodate_cachep);
diff --git a/fs/ocfs2/uptodate.h b/fs/ocfs2/uptodate.h
index e5aacdf4eabf..01cd32d26b06 100644
--- a/fs/ocfs2/uptodate.h
+++ b/fs/ocfs2/uptodate.h
@@ -27,7 +27,7 @@
 #define OCFS2_UPTODATE_H
 
 int __init init_ocfs2_uptodate_cache(void);
-void __exit exit_ocfs2_uptodate_cache(void);
+void exit_ocfs2_uptodate_cache(void);
 
 void ocfs2_metadata_cache_init(struct inode *inode);
 void ocfs2_metadata_cache_purge(struct inode *inode);
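
Note on the recurring pattern above: the hunks in buffer_head_io.c, inode.c, journal.c, super.c and uptodate.c all convert a struct semaphore used as a binary lock into a struct mutex. A minimal kernel-C sketch of the before/after shape; the example_* names are illustrative only, while ip_io_mutex and the mutex_* / init_MUTEX calls are the ones appearing in the diff:

	#include <linux/mutex.h>

	struct example_inode_info {
		struct mutex ip_io_mutex;	/* was: struct semaphore ip_io_sem; */
	};

	static void example_init(struct example_inode_info *oi)
	{
		mutex_init(&oi->ip_io_mutex);	/* was: init_MUTEX(&oi->ip_io_sem); */
	}

	static void example_serialized_io(struct example_inode_info *oi)
	{
		mutex_lock(&oi->ip_io_mutex);	/* was: down(&oi->ip_io_sem); */
		/* ... per-inode I/O that must not run concurrently ... */
		mutex_unlock(&oi->ip_io_mutex);	/* was: up(&oi->ip_io_sem); */
	}

Unlike a counting semaphore, a mutex must be released by the task that acquired it and participates in the kernel's debug checking, which is why ocfs2_clear_inode() above can assert via mutex_trylock() that nobody still holds ip_io_mutex before the inode is torn down.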