diff options
author | Linus Torvalds <torvalds@g5.osdl.org> | 2006-02-03 18:21:40 -0500 |
---|---|---|
committer | Linus Torvalds <torvalds@g5.osdl.org> | 2006-02-03 18:21:40 -0500 |
commit | d1ffa5669cd834f901141756e63195f48c1bfbf9 (patch) | |
tree | f0bed266c1f3fef528bbced56b48aac63e0a26b1 /fs/ocfs2/dlm/dlmrecovery.c | |
parent | d6c8f6aaa1d7f68c1e6471ab0839d9047cdd159f (diff) | |
parent | 6eff5790d57a5c9c01489c95946881808a4b2a2c (diff) |
Merge branch 'upstream-linus' of git://oss.oracle.com/home/sourcebo/git/ocfs2
Diffstat (limited to 'fs/ocfs2/dlm/dlmrecovery.c')
-rw-r--r-- | fs/ocfs2/dlm/dlmrecovery.c | 250 |
1 files changed, 213 insertions, 37 deletions
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 0c8eb1093f00..186e9a76aa58 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c | |||
@@ -39,6 +39,7 @@ | |||
39 | #include <linux/inet.h> | 39 | #include <linux/inet.h> |
40 | #include <linux/timer.h> | 40 | #include <linux/timer.h> |
41 | #include <linux/kthread.h> | 41 | #include <linux/kthread.h> |
42 | #include <linux/delay.h> | ||
42 | 43 | ||
43 | 44 | ||
44 | #include "cluster/heartbeat.h" | 45 | #include "cluster/heartbeat.h" |
@@ -256,6 +257,27 @@ static int dlm_recovery_thread(void *data) | |||
256 | return 0; | 257 | return 0; |
257 | } | 258 | } |
258 | 259 | ||
260 | /* returns true when the recovery master has contacted us, i.e. dlm->reco.new_master is no longer O2NM_INVALID_NODE_NUM; the field is sampled under dlm->spinlock */ | ||
261 | static int dlm_reco_master_ready(struct dlm_ctxt *dlm) | ||
262 | { | ||
263 | int ready; | ||
264 | spin_lock(&dlm->spinlock); | ||
265 | ready = (dlm->reco.new_master != O2NM_INVALID_NODE_NUM); | ||
266 | spin_unlock(&dlm->spinlock); | ||
267 | return ready; | ||
268 | } | ||
269 | |||
270 | /* returns true if node is no longer in the domain, i.e. its bit is | ||
271 | * clear in dlm->domain_map (checked under dlm->spinlock). could be dead or just not joined */ | ||
272 | int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node) | ||
273 | { | ||
274 | int dead; | ||
275 | spin_lock(&dlm->spinlock); | ||
276 | dead = !test_bit(node, dlm->domain_map); | ||
277 | spin_unlock(&dlm->spinlock); | ||
278 | return dead; | ||
279 | } | ||
280 | |||
259 | /* callers of the top-level api calls (dlmlock/dlmunlock) should | 281 | /* callers of the top-level api calls (dlmlock/dlmunlock) should |
260 | * block on the dlm->reco.event when recovery is in progress. | 282 | * block on the dlm->reco.event when recovery is in progress. |
261 | * the dlm recovery thread will set this state when it begins | 283 | * the dlm recovery thread will set this state when it begins |
@@ -297,6 +319,7 @@ static void dlm_end_recovery(struct dlm_ctxt *dlm) | |||
297 | static int dlm_do_recovery(struct dlm_ctxt *dlm) | 319 | static int dlm_do_recovery(struct dlm_ctxt *dlm) |
298 | { | 320 | { |
299 | int status = 0; | 321 | int status = 0; |
322 | int ret; | ||
300 | 323 | ||
301 | spin_lock(&dlm->spinlock); | 324 | spin_lock(&dlm->spinlock); |
302 | 325 | ||
@@ -343,10 +366,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm) | |||
343 | goto master_here; | 366 | goto master_here; |
344 | 367 | ||
345 | if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) { | 368 | if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) { |
346 | /* choose a new master */ | 369 | /* choose a new master, returns 0 if this node |
347 | if (!dlm_pick_recovery_master(dlm)) { | 370 | * is the master, -EEXIST if it's another node. |
371 | * this does not return until a new master is chosen | ||
372 | * or recovery completes entirely. */ | ||
373 | ret = dlm_pick_recovery_master(dlm); | ||
374 | if (!ret) { | ||
348 | /* already notified everyone. go. */ | 375 | /* already notified everyone. go. */ |
349 | dlm->reco.new_master = dlm->node_num; | ||
350 | goto master_here; | 376 | goto master_here; |
351 | } | 377 | } |
352 | mlog(0, "another node will master this recovery session.\n"); | 378 | mlog(0, "another node will master this recovery session.\n"); |
@@ -371,8 +397,13 @@ master_here: | |||
371 | if (status < 0) { | 397 | if (status < 0) { |
372 | mlog(ML_ERROR, "error %d remastering locks for node %u, " | 398 | mlog(ML_ERROR, "error %d remastering locks for node %u, " |
373 | "retrying.\n", status, dlm->reco.dead_node); | 399 | "retrying.\n", status, dlm->reco.dead_node); |
400 | /* yield a bit to allow any final network messages | ||
401 | * to get handled on remaining nodes */ | ||
402 | msleep(100); | ||
374 | } else { | 403 | } else { |
375 | /* success! see if any other nodes need recovery */ | 404 | /* success! see if any other nodes need recovery */ |
405 | mlog(0, "DONE mastering recovery of %s:%u here(this=%u)!\n", | ||
406 | dlm->name, dlm->reco.dead_node, dlm->node_num); | ||
376 | dlm_reset_recovery(dlm); | 407 | dlm_reset_recovery(dlm); |
377 | } | 408 | } |
378 | dlm_end_recovery(dlm); | 409 | dlm_end_recovery(dlm); |
@@ -477,7 +508,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) | |||
477 | BUG(); | 508 | BUG(); |
478 | break; | 509 | break; |
479 | case DLM_RECO_NODE_DATA_DEAD: | 510 | case DLM_RECO_NODE_DATA_DEAD: |
480 | mlog(0, "node %u died after " | 511 | mlog(ML_NOTICE, "node %u died after " |
481 | "requesting recovery info for " | 512 | "requesting recovery info for " |
482 | "node %u\n", ndata->node_num, | 513 | "node %u\n", ndata->node_num, |
483 | dead_node); | 514 | dead_node); |
@@ -485,6 +516,19 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node) | |||
485 | // start all over | 516 | // start all over |
486 | destroy = 1; | 517 | destroy = 1; |
487 | status = -EAGAIN; | 518 | status = -EAGAIN; |
519 | /* instead of spinning like crazy here, | ||
520 | * wait for the domain map to catch up | ||
521 | * with the network state. otherwise this | ||
522 | * can be hit hundreds of times before | ||
523 | * the node is really seen as dead. */ | ||
524 | wait_event_timeout(dlm->dlm_reco_thread_wq, | ||
525 | dlm_is_node_dead(dlm, | ||
526 | ndata->node_num), | ||
527 | msecs_to_jiffies(1000)); | ||
528 | mlog(0, "waited 1 sec for %u, " | ||
529 | "dead? %s\n", ndata->node_num, | ||
530 | dlm_is_node_dead(dlm, ndata->node_num) ? | ||
531 | "yes" : "no"); | ||
488 | goto leave; | 532 | goto leave; |
489 | case DLM_RECO_NODE_DATA_RECEIVING: | 533 | case DLM_RECO_NODE_DATA_RECEIVING: |
490 | case DLM_RECO_NODE_DATA_REQUESTED: | 534 | case DLM_RECO_NODE_DATA_REQUESTED: |
@@ -678,11 +722,27 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data) | |||
678 | dlm = item->dlm; | 722 | dlm = item->dlm; |
679 | dead_node = item->u.ral.dead_node; | 723 | dead_node = item->u.ral.dead_node; |
680 | reco_master = item->u.ral.reco_master; | 724 | reco_master = item->u.ral.reco_master; |
725 | mres = (struct dlm_migratable_lockres *)data; | ||
726 | |||
727 | if (dead_node != dlm->reco.dead_node || | ||
728 | reco_master != dlm->reco.new_master) { | ||
729 | /* show extra debug info if the recovery state is messed */ | ||
730 | mlog(ML_ERROR, "%s: bad reco state: reco(dead=%u, master=%u), " | ||
731 | "request(dead=%u, master=%u)\n", | ||
732 | dlm->name, dlm->reco.dead_node, dlm->reco.new_master, | ||
733 | dead_node, reco_master); | ||
734 | mlog(ML_ERROR, "%s: name=%.*s master=%u locks=%u/%u flags=%u " | ||
735 | "entry[0]={c=%"MLFu64",l=%u,f=%u,t=%d,ct=%d,hb=%d,n=%u}\n", | ||
736 | dlm->name, mres->lockname_len, mres->lockname, mres->master, | ||
737 | mres->num_locks, mres->total_locks, mres->flags, | ||
738 | mres->ml[0].cookie, mres->ml[0].list, mres->ml[0].flags, | ||
739 | mres->ml[0].type, mres->ml[0].convert_type, | ||
740 | mres->ml[0].highest_blocked, mres->ml[0].node); | ||
741 | BUG(); | ||
742 | } | ||
681 | BUG_ON(dead_node != dlm->reco.dead_node); | 743 | BUG_ON(dead_node != dlm->reco.dead_node); |
682 | BUG_ON(reco_master != dlm->reco.new_master); | 744 | BUG_ON(reco_master != dlm->reco.new_master); |
683 | 745 | ||
684 | mres = (struct dlm_migratable_lockres *)data; | ||
685 | |||
686 | /* lock resources should have already been moved to the | 746 | /* lock resources should have already been moved to the |
687 | * dlm->reco.resources list. now move items from that list | 747 | * dlm->reco.resources list. now move items from that list |
688 | * to a temp list if the dead owner matches. note that the | 748 | * to a temp list if the dead owner matches. note that the |
@@ -757,15 +817,18 @@ int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data) | |||
757 | continue; | 817 | continue; |
758 | 818 | ||
759 | switch (ndata->state) { | 819 | switch (ndata->state) { |
820 | /* should have moved beyond INIT but not to FINALIZE yet */ | ||
760 | case DLM_RECO_NODE_DATA_INIT: | 821 | case DLM_RECO_NODE_DATA_INIT: |
761 | case DLM_RECO_NODE_DATA_DEAD: | 822 | case DLM_RECO_NODE_DATA_DEAD: |
762 | case DLM_RECO_NODE_DATA_DONE: | ||
763 | case DLM_RECO_NODE_DATA_FINALIZE_SENT: | 823 | case DLM_RECO_NODE_DATA_FINALIZE_SENT: |
764 | mlog(ML_ERROR, "bad ndata state for node %u:" | 824 | mlog(ML_ERROR, "bad ndata state for node %u:" |
765 | " state=%d\n", ndata->node_num, | 825 | " state=%d\n", ndata->node_num, |
766 | ndata->state); | 826 | ndata->state); |
767 | BUG(); | 827 | BUG(); |
768 | break; | 828 | break; |
829 | /* these states are possible at this point, anywhere along | ||
830 | * the line of recovery */ | ||
831 | case DLM_RECO_NODE_DATA_DONE: | ||
769 | case DLM_RECO_NODE_DATA_RECEIVING: | 832 | case DLM_RECO_NODE_DATA_RECEIVING: |
770 | case DLM_RECO_NODE_DATA_REQUESTED: | 833 | case DLM_RECO_NODE_DATA_REQUESTED: |
771 | case DLM_RECO_NODE_DATA_REQUESTING: | 834 | case DLM_RECO_NODE_DATA_REQUESTING: |
@@ -799,13 +862,31 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm, | |||
799 | { | 862 | { |
800 | struct dlm_lock_resource *res; | 863 | struct dlm_lock_resource *res; |
801 | struct list_head *iter, *iter2; | 864 | struct list_head *iter, *iter2; |
865 | struct dlm_lock *lock; | ||
802 | 866 | ||
803 | spin_lock(&dlm->spinlock); | 867 | spin_lock(&dlm->spinlock); |
804 | list_for_each_safe(iter, iter2, &dlm->reco.resources) { | 868 | list_for_each_safe(iter, iter2, &dlm->reco.resources) { |
805 | res = list_entry (iter, struct dlm_lock_resource, recovering); | 869 | res = list_entry (iter, struct dlm_lock_resource, recovering); |
870 | /* always prune any $RECOVERY entries for dead nodes, | ||
871 | * otherwise hangs can occur during later recovery */ | ||
806 | if (dlm_is_recovery_lock(res->lockname.name, | 872 | if (dlm_is_recovery_lock(res->lockname.name, |
807 | res->lockname.len)) | 873 | res->lockname.len)) { |
874 | spin_lock(&res->spinlock); | ||
875 | list_for_each_entry(lock, &res->granted, list) { | ||
876 | if (lock->ml.node == dead_node) { | ||
877 | mlog(0, "AHA! there was " | ||
878 | "a $RECOVERY lock for dead " | ||
879 | "node %u (%s)!\n", | ||
880 | dead_node, dlm->name); | ||
881 | list_del_init(&lock->list); | ||
882 | dlm_lock_put(lock); | ||
883 | break; | ||
884 | } | ||
885 | } | ||
886 | spin_unlock(&res->spinlock); | ||
808 | continue; | 887 | continue; |
888 | } | ||
889 | |||
809 | if (res->owner == dead_node) { | 890 | if (res->owner == dead_node) { |
810 | mlog(0, "found lockres owned by dead node while " | 891 | mlog(0, "found lockres owned by dead node while " |
811 | "doing recovery for node %u. sending it.\n", | 892 | "doing recovery for node %u. sending it.\n", |
@@ -1179,7 +1260,7 @@ static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data) | |||
1179 | again: | 1260 | again: |
1180 | ret = dlm_lockres_master_requery(dlm, res, &real_master); | 1261 | ret = dlm_lockres_master_requery(dlm, res, &real_master); |
1181 | if (ret < 0) { | 1262 | if (ret < 0) { |
1182 | mlog(0, "dlm_lockres_master_requery failure: %d\n", | 1263 | mlog(0, "dlm_lockres_master_requery ret=%d\n", |
1183 | ret); | 1264 | ret); |
1184 | goto again; | 1265 | goto again; |
1185 | } | 1266 | } |
@@ -1757,6 +1838,7 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) | |||
1757 | struct dlm_lock_resource *res; | 1838 | struct dlm_lock_resource *res; |
1758 | int i; | 1839 | int i; |
1759 | struct list_head *bucket; | 1840 | struct list_head *bucket; |
1841 | struct dlm_lock *lock; | ||
1760 | 1842 | ||
1761 | 1843 | ||
1762 | /* purge any stale mles */ | 1844 | /* purge any stale mles */ |
@@ -1780,10 +1862,25 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) | |||
1780 | bucket = &(dlm->resources[i]); | 1862 | bucket = &(dlm->resources[i]); |
1781 | list_for_each(iter, bucket) { | 1863 | list_for_each(iter, bucket) { |
1782 | res = list_entry (iter, struct dlm_lock_resource, list); | 1864 | res = list_entry (iter, struct dlm_lock_resource, list); |
1865 | /* always prune any $RECOVERY entries for dead nodes, | ||
1866 | * otherwise hangs can occur during later recovery */ | ||
1783 | if (dlm_is_recovery_lock(res->lockname.name, | 1867 | if (dlm_is_recovery_lock(res->lockname.name, |
1784 | res->lockname.len)) | 1868 | res->lockname.len)) { |
1869 | spin_lock(&res->spinlock); | ||
1870 | list_for_each_entry(lock, &res->granted, list) { | ||
1871 | if (lock->ml.node == dead_node) { | ||
1872 | mlog(0, "AHA! there was " | ||
1873 | "a $RECOVERY lock for dead " | ||
1874 | "node %u (%s)!\n", | ||
1875 | dead_node, dlm->name); | ||
1876 | list_del_init(&lock->list); | ||
1877 | dlm_lock_put(lock); | ||
1878 | break; | ||
1879 | } | ||
1880 | } | ||
1881 | spin_unlock(&res->spinlock); | ||
1785 | continue; | 1882 | continue; |
1786 | 1883 | } | |
1787 | spin_lock(&res->spinlock); | 1884 | spin_lock(&res->spinlock); |
1788 | /* zero the lvb if necessary */ | 1885 | /* zero the lvb if necessary */ |
1789 | dlm_revalidate_lvb(dlm, res, dead_node); | 1886 | dlm_revalidate_lvb(dlm, res, dead_node); |
@@ -1869,12 +1966,9 @@ void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data) | |||
1869 | return; | 1966 | return; |
1870 | 1967 | ||
1871 | spin_lock(&dlm->spinlock); | 1968 | spin_lock(&dlm->spinlock); |
1872 | |||
1873 | set_bit(idx, dlm->live_nodes_map); | 1969 | set_bit(idx, dlm->live_nodes_map); |
1874 | 1970 | /* do NOT notify mle attached to the heartbeat events. | |
1875 | /* notify any mles attached to the heartbeat events */ | 1971 | * new nodes are not interesting in mastery until joined. */ |
1876 | dlm_hb_event_notify_attached(dlm, idx, 1); | ||
1877 | |||
1878 | spin_unlock(&dlm->spinlock); | 1972 | spin_unlock(&dlm->spinlock); |
1879 | 1973 | ||
1880 | dlm_put(dlm); | 1974 | dlm_put(dlm); |
@@ -1897,7 +1991,18 @@ static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st) | |||
1897 | mlog(0, "unlockast for recovery lock fired!\n"); | 1991 | mlog(0, "unlockast for recovery lock fired!\n"); |
1898 | } | 1992 | } |
1899 | 1993 | ||
1900 | 1994 | /* | |
1995 | * dlm_pick_recovery_master will continually attempt to use | ||
1996 | * dlmlock() on the special "$RECOVERY" lockres with the | ||
1997 | * LKM_NOQUEUE flag to get an EX. every thread that enters | ||
1998 | * this function on each node racing to become the recovery | ||
1999 | * master will not stop attempting this until either: | ||
2000 | * a) this node gets the EX (and becomes the recovery master), | ||
2001 | * or b) dlm->reco.new_master gets set to some nodenum | ||
2002 | * != O2NM_INVALID_NODE_NUM (another node will do the reco). | ||
2003 | * so each time a recovery master is needed, the entire cluster | ||
2004 | * will sync at this point. if the new master dies, that will | ||
2005 | * be detected in dlm_do_recovery */ | ||
1901 | static int dlm_pick_recovery_master(struct dlm_ctxt *dlm) | 2006 | static int dlm_pick_recovery_master(struct dlm_ctxt *dlm) |
1902 | { | 2007 | { |
1903 | enum dlm_status ret; | 2008 | enum dlm_status ret; |
@@ -1906,23 +2011,45 @@ static int dlm_pick_recovery_master(struct dlm_ctxt *dlm) | |||
1906 | 2011 | ||
1907 | mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n", | 2012 | mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n", |
1908 | dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num); | 2013 | dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num); |
1909 | retry: | 2014 | again: |
1910 | memset(&lksb, 0, sizeof(lksb)); | 2015 | memset(&lksb, 0, sizeof(lksb)); |
1911 | 2016 | ||
1912 | ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY, | 2017 | ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY, |
1913 | DLM_RECOVERY_LOCK_NAME, dlm_reco_ast, dlm, dlm_reco_bast); | 2018 | DLM_RECOVERY_LOCK_NAME, dlm_reco_ast, dlm, dlm_reco_bast); |
1914 | 2019 | ||
2020 | mlog(0, "%s: dlmlock($RECOVERY) returned %d, lksb=%d\n", | ||
2021 | dlm->name, ret, lksb.status); | ||
2022 | |||
1915 | if (ret == DLM_NORMAL) { | 2023 | if (ret == DLM_NORMAL) { |
1916 | mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n", | 2024 | mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n", |
1917 | dlm->name, dlm->node_num); | 2025 | dlm->name, dlm->node_num); |
1918 | /* I am master, send message to all nodes saying | 2026 | |
1919 | * that I am beginning a recovery session */ | 2027 | /* got the EX lock. check to see if another node |
1920 | status = dlm_send_begin_reco_message(dlm, | 2028 | * just became the reco master */ |
1921 | dlm->reco.dead_node); | 2029 | if (dlm_reco_master_ready(dlm)) { |
2030 | mlog(0, "%s: got reco EX lock, but %u will " | ||
2031 | "do the recovery\n", dlm->name, | ||
2032 | dlm->reco.new_master); | ||
2033 | status = -EEXIST; | ||
2034 | } else { | ||
2035 | status = dlm_send_begin_reco_message(dlm, | ||
2036 | dlm->reco.dead_node); | ||
2037 | /* this always succeeds */ | ||
2038 | BUG_ON(status); | ||
2039 | |||
2040 | /* set the new_master to this node */ | ||
2041 | spin_lock(&dlm->spinlock); | ||
2042 | dlm->reco.new_master = dlm->node_num; | ||
2043 | spin_unlock(&dlm->spinlock); | ||
2044 | } | ||
1922 | 2045 | ||
1923 | /* recovery lock is a special case. ast will not get fired, | 2046 | /* recovery lock is a special case. ast will not get fired, |
1924 | * so just go ahead and unlock it. */ | 2047 | * so just go ahead and unlock it. */ |
1925 | ret = dlmunlock(dlm, &lksb, 0, dlm_reco_unlock_ast, dlm); | 2048 | ret = dlmunlock(dlm, &lksb, 0, dlm_reco_unlock_ast, dlm); |
2049 | if (ret == DLM_DENIED) { | ||
2050 | mlog(0, "got DLM_DENIED, trying LKM_CANCEL\n"); | ||
2051 | ret = dlmunlock(dlm, &lksb, LKM_CANCEL, dlm_reco_unlock_ast, dlm); | ||
2052 | } | ||
1926 | if (ret != DLM_NORMAL) { | 2053 | if (ret != DLM_NORMAL) { |
1927 | /* this would really suck. this could only happen | 2054 | /* this would really suck. this could only happen |
1928 | * if there was a network error during the unlock | 2055 | * if there was a network error during the unlock |
@@ -1930,20 +2057,42 @@ retry: | |||
1930 | * is actually "done" and the lock structure is | 2057 | * is actually "done" and the lock structure is |
1931 | * even freed. we can continue, but only | 2058 | * even freed. we can continue, but only |
1932 | * because this specific lock name is special. */ | 2059 | * because this specific lock name is special. */ |
1933 | mlog(0, "dlmunlock returned %d\n", ret); | 2060 | mlog(ML_ERROR, "dlmunlock returned %d\n", ret); |
1934 | } | ||
1935 | |||
1936 | if (status < 0) { | ||
1937 | mlog(0, "failed to send recovery message. " | ||
1938 | "must retry with new node map.\n"); | ||
1939 | goto retry; | ||
1940 | } | 2061 | } |
1941 | } else if (ret == DLM_NOTQUEUED) { | 2062 | } else if (ret == DLM_NOTQUEUED) { |
1942 | mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n", | 2063 | mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n", |
1943 | dlm->name, dlm->node_num); | 2064 | dlm->name, dlm->node_num); |
1944 | /* another node is master. wait on | 2065 | /* another node is master. wait on |
1945 | * reco.new_master != O2NM_INVALID_NODE_NUM */ | 2066 | * reco.new_master != O2NM_INVALID_NODE_NUM |
2067 | * for at most one second */ | ||
2068 | wait_event_timeout(dlm->dlm_reco_thread_wq, | ||
2069 | dlm_reco_master_ready(dlm), | ||
2070 | msecs_to_jiffies(1000)); | ||
2071 | if (!dlm_reco_master_ready(dlm)) { | ||
2072 | mlog(0, "%s: reco master taking awhile\n", | ||
2073 | dlm->name); | ||
2074 | goto again; | ||
2075 | } | ||
2076 | /* another node has informed this one that it is reco master */ | ||
2077 | mlog(0, "%s: reco master %u is ready to recover %u\n", | ||
2078 | dlm->name, dlm->reco.new_master, dlm->reco.dead_node); | ||
1946 | status = -EEXIST; | 2079 | status = -EEXIST; |
2080 | } else { | ||
2081 | struct dlm_lock_resource *res; | ||
2082 | |||
2083 | /* dlmlock returned something other than NOTQUEUED or NORMAL */ | ||
2084 | mlog(ML_ERROR, "%s: got %s from dlmlock($RECOVERY), " | ||
2085 | "lksb.status=%s\n", dlm->name, dlm_errname(ret), | ||
2086 | dlm_errname(lksb.status)); | ||
2087 | res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME, | ||
2088 | DLM_RECOVERY_LOCK_NAME_LEN); | ||
2089 | if (res) { | ||
2090 | dlm_print_one_lock_resource(res); | ||
2091 | dlm_lockres_put(res); | ||
2092 | } else { | ||
2093 | mlog(ML_ERROR, "recovery lock not found\n"); | ||
2094 | } | ||
2095 | BUG(); | ||
1947 | } | 2096 | } |
1948 | 2097 | ||
1949 | return status; | 2098 | return status; |
@@ -1982,7 +2131,7 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node) | |||
1982 | mlog(0, "not sending begin reco to self\n"); | 2131 | mlog(0, "not sending begin reco to self\n"); |
1983 | continue; | 2132 | continue; |
1984 | } | 2133 | } |
1985 | 2134 | retry: | |
1986 | ret = -EINVAL; | 2135 | ret = -EINVAL; |
1987 | mlog(0, "attempting to send begin reco msg to %d\n", | 2136 | mlog(0, "attempting to send begin reco msg to %d\n", |
1988 | nodenum); | 2137 | nodenum); |
@@ -1991,8 +2140,17 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node) | |||
1991 | /* negative status is handled ok by caller here */ | 2140 | /* negative status is handled ok by caller here */ |
1992 | if (ret >= 0) | 2141 | if (ret >= 0) |
1993 | ret = status; | 2142 | ret = status; |
2143 | if (dlm_is_host_down(ret)) { | ||
2144 | /* node is down. not involved in recovery | ||
2145 | * so just keep going */ | ||
2146 | mlog(0, "%s: node %u was down when sending " | ||
2147 | "begin reco msg (%d)\n", dlm->name, nodenum, ret); | ||
2148 | ret = 0; | ||
2149 | } | ||
1994 | if (ret < 0) { | 2150 | if (ret < 0) { |
1995 | struct dlm_lock_resource *res; | 2151 | struct dlm_lock_resource *res; |
2152 | /* this is now a serious problem, possibly ENOMEM | ||
2153 | * in the network stack. must retry */ | ||
1996 | mlog_errno(ret); | 2154 | mlog_errno(ret); |
1997 | mlog(ML_ERROR, "begin reco of dlm %s to node %u " | 2155 | mlog(ML_ERROR, "begin reco of dlm %s to node %u " |
1998 | " returned %d\n", dlm->name, nodenum, ret); | 2156 | " returned %d\n", dlm->name, nodenum, ret); |
@@ -2004,7 +2162,10 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node) | |||
2004 | } else { | 2162 | } else { |
2005 | mlog(ML_ERROR, "recovery lock not found\n"); | 2163 | mlog(ML_ERROR, "recovery lock not found\n"); |
2006 | } | 2164 | } |
2007 | break; | 2165 | /* sleep for a bit in hopes that we can avoid |
2166 | * another ENOMEM */ | ||
2167 | msleep(100); | ||
2168 | goto retry; | ||
2008 | } | 2169 | } |
2009 | } | 2170 | } |
2010 | 2171 | ||
@@ -2027,19 +2188,34 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data) | |||
2027 | 2188 | ||
2028 | spin_lock(&dlm->spinlock); | 2189 | spin_lock(&dlm->spinlock); |
2029 | if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) { | 2190 | if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) { |
2030 | mlog(0, "new_master already set to %u!\n", | 2191 | if (test_bit(dlm->reco.new_master, dlm->recovery_map)) { |
2031 | dlm->reco.new_master); | 2192 | mlog(0, "%s: new_master %u died, changing " |
2193 | "to %u\n", dlm->name, dlm->reco.new_master, | ||
2194 | br->node_idx); | ||
2195 | } else { | ||
2196 | mlog(0, "%s: new_master %u NOT DEAD, changing " | ||
2197 | "to %u\n", dlm->name, dlm->reco.new_master, | ||
2198 | br->node_idx); | ||
2199 | /* may not have seen the new master as dead yet */ | ||
2200 | } | ||
2032 | } | 2201 | } |
2033 | if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) { | 2202 | if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) { |
2034 | mlog(0, "dead_node already set to %u!\n", | 2203 | mlog(ML_NOTICE, "%s: dead_node previously set to %u, " |
2035 | dlm->reco.dead_node); | 2204 | "node %u changing it to %u\n", dlm->name, |
2205 | dlm->reco.dead_node, br->node_idx, br->dead_node); | ||
2036 | } | 2206 | } |
2037 | dlm->reco.new_master = br->node_idx; | 2207 | dlm->reco.new_master = br->node_idx; |
2038 | dlm->reco.dead_node = br->dead_node; | 2208 | dlm->reco.dead_node = br->dead_node; |
2039 | if (!test_bit(br->dead_node, dlm->recovery_map)) { | 2209 | if (!test_bit(br->dead_node, dlm->recovery_map)) { |
2040 | mlog(ML_ERROR, "recovery master %u sees %u as dead, but this " | 2210 | mlog(0, "recovery master %u sees %u as dead, but this " |
2041 | "node has not yet. marking %u as dead\n", | 2211 | "node has not yet. marking %u as dead\n", |
2042 | br->node_idx, br->dead_node, br->dead_node); | 2212 | br->node_idx, br->dead_node, br->dead_node); |
2213 | if (!test_bit(br->dead_node, dlm->domain_map) || | ||
2214 | !test_bit(br->dead_node, dlm->live_nodes_map)) | ||
2215 | mlog(0, "%u not in domain/live_nodes map " | ||
2216 | "so setting it in reco map manually\n", | ||
2217 | br->dead_node); | ||
2218 | set_bit(br->dead_node, dlm->recovery_map); | ||
2043 | __dlm_hb_node_down(dlm, br->dead_node); | 2219 | __dlm_hb_node_down(dlm, br->dead_node); |
2044 | } | 2220 | } |
2045 | spin_unlock(&dlm->spinlock); | 2221 | spin_unlock(&dlm->spinlock); |