about summary refs log tree commit diff stats
path: root/fs/ocfs2/dlm/dlmrecovery.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/ocfs2/dlm/dlmrecovery.c')
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c315
1 files changed, 267 insertions, 48 deletions
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 0c8eb1093f00..1e232000f3f7 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -39,6 +39,7 @@
39#include <linux/inet.h> 39#include <linux/inet.h>
40#include <linux/timer.h> 40#include <linux/timer.h>
41#include <linux/kthread.h> 41#include <linux/kthread.h>
42#include <linux/delay.h>
42 43
43 44
44#include "cluster/heartbeat.h" 45#include "cluster/heartbeat.h"
@@ -256,6 +257,45 @@ static int dlm_recovery_thread(void *data)
256 return 0; 257 return 0;
257} 258}
258 259
260/* returns true when the recovery master has contacted us */
261static int dlm_reco_master_ready(struct dlm_ctxt *dlm)
262{
263 int ready;
264 spin_lock(&dlm->spinlock);
265 ready = (dlm->reco.new_master != O2NM_INVALID_NODE_NUM);
266 spin_unlock(&dlm->spinlock);
267 return ready;
268}
269
270/* returns true if node is no longer in the domain
271 * could be dead or just not joined */
272int dlm_is_node_dead(struct dlm_ctxt *dlm, u8 node)
273{
274 int dead;
275 spin_lock(&dlm->spinlock);
276 dead = test_bit(node, dlm->domain_map);
277 spin_unlock(&dlm->spinlock);
278 return dead;
279}
280
281int dlm_wait_for_node_death(struct dlm_ctxt *dlm, u8 node, int timeout)
282{
283 if (timeout) {
284 mlog(ML_NOTICE, "%s: waiting %dms for notification of "
285 "death of node %u\n", dlm->name, timeout, node);
286 wait_event_timeout(dlm->dlm_reco_thread_wq,
287 dlm_is_node_dead(dlm, node),
288 msecs_to_jiffies(timeout));
289 } else {
290 mlog(ML_NOTICE, "%s: waiting indefinitely for notification "
291 "of death of node %u\n", dlm->name, node);
292 wait_event(dlm->dlm_reco_thread_wq,
293 dlm_is_node_dead(dlm, node));
294 }
295 /* for now, return 0 */
296 return 0;
297}
298
259/* callers of the top-level api calls (dlmlock/dlmunlock) should 299/* callers of the top-level api calls (dlmlock/dlmunlock) should
260 * block on the dlm->reco.event when recovery is in progress. 300 * block on the dlm->reco.event when recovery is in progress.
261 * the dlm recovery thread will set this state when it begins 301 * the dlm recovery thread will set this state when it begins
@@ -297,6 +337,7 @@ static void dlm_end_recovery(struct dlm_ctxt *dlm)
297static int dlm_do_recovery(struct dlm_ctxt *dlm) 337static int dlm_do_recovery(struct dlm_ctxt *dlm)
298{ 338{
299 int status = 0; 339 int status = 0;
340 int ret;
300 341
301 spin_lock(&dlm->spinlock); 342 spin_lock(&dlm->spinlock);
302 343
@@ -343,10 +384,13 @@ static int dlm_do_recovery(struct dlm_ctxt *dlm)
343 goto master_here; 384 goto master_here;
344 385
345 if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) { 386 if (dlm->reco.new_master == O2NM_INVALID_NODE_NUM) {
346 /* choose a new master */ 387 /* choose a new master, returns 0 if this node
347 if (!dlm_pick_recovery_master(dlm)) { 388 * is the master, -EEXIST if it's another node.
389 * this does not return until a new master is chosen
390 * or recovery completes entirely. */
391 ret = dlm_pick_recovery_master(dlm);
392 if (!ret) {
348 /* already notified everyone. go. */ 393 /* already notified everyone. go. */
349 dlm->reco.new_master = dlm->node_num;
350 goto master_here; 394 goto master_here;
351 } 395 }
352 mlog(0, "another node will master this recovery session.\n"); 396 mlog(0, "another node will master this recovery session.\n");
@@ -371,8 +415,13 @@ master_here:
371 if (status < 0) { 415 if (status < 0) {
372 mlog(ML_ERROR, "error %d remastering locks for node %u, " 416 mlog(ML_ERROR, "error %d remastering locks for node %u, "
373 "retrying.\n", status, dlm->reco.dead_node); 417 "retrying.\n", status, dlm->reco.dead_node);
418 /* yield a bit to allow any final network messages
419 * to get handled on remaining nodes */
420 msleep(100);
374 } else { 421 } else {
375 /* success! see if any other nodes need recovery */ 422 /* success! see if any other nodes need recovery */
423 mlog(0, "DONE mastering recovery of %s:%u here(this=%u)!\n",
424 dlm->name, dlm->reco.dead_node, dlm->node_num);
376 dlm_reset_recovery(dlm); 425 dlm_reset_recovery(dlm);
377 } 426 }
378 dlm_end_recovery(dlm); 427 dlm_end_recovery(dlm);
@@ -477,7 +526,7 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
477 BUG(); 526 BUG();
478 break; 527 break;
479 case DLM_RECO_NODE_DATA_DEAD: 528 case DLM_RECO_NODE_DATA_DEAD:
480 mlog(0, "node %u died after " 529 mlog(ML_NOTICE, "node %u died after "
481 "requesting recovery info for " 530 "requesting recovery info for "
482 "node %u\n", ndata->node_num, 531 "node %u\n", ndata->node_num,
483 dead_node); 532 dead_node);
@@ -485,6 +534,19 @@ static int dlm_remaster_locks(struct dlm_ctxt *dlm, u8 dead_node)
485 // start all over 534 // start all over
486 destroy = 1; 535 destroy = 1;
487 status = -EAGAIN; 536 status = -EAGAIN;
537 /* instead of spinning like crazy here,
538 * wait for the domain map to catch up
539 * with the network state. otherwise this
540 * can be hit hundreds of times before
541 * the node is really seen as dead. */
542 wait_event_timeout(dlm->dlm_reco_thread_wq,
543 dlm_is_node_dead(dlm,
544 ndata->node_num),
545 msecs_to_jiffies(1000));
546 mlog(0, "waited 1 sec for %u, "
547 "dead? %s\n", ndata->node_num,
548 dlm_is_node_dead(dlm, ndata->node_num) ?
549 "yes" : "no");
488 goto leave; 550 goto leave;
489 case DLM_RECO_NODE_DATA_RECEIVING: 551 case DLM_RECO_NODE_DATA_RECEIVING:
490 case DLM_RECO_NODE_DATA_REQUESTED: 552 case DLM_RECO_NODE_DATA_REQUESTED:
@@ -678,11 +740,27 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
678 dlm = item->dlm; 740 dlm = item->dlm;
679 dead_node = item->u.ral.dead_node; 741 dead_node = item->u.ral.dead_node;
680 reco_master = item->u.ral.reco_master; 742 reco_master = item->u.ral.reco_master;
743 mres = (struct dlm_migratable_lockres *)data;
744
745 if (dead_node != dlm->reco.dead_node ||
746 reco_master != dlm->reco.new_master) {
747 /* show extra debug info if the recovery state is messed */
748 mlog(ML_ERROR, "%s: bad reco state: reco(dead=%u, master=%u), "
749 "request(dead=%u, master=%u)\n",
750 dlm->name, dlm->reco.dead_node, dlm->reco.new_master,
751 dead_node, reco_master);
752 mlog(ML_ERROR, "%s: name=%.*s master=%u locks=%u/%u flags=%u "
753 "entry[0]={c=%"MLFu64",l=%u,f=%u,t=%d,ct=%d,hb=%d,n=%u}\n",
754 dlm->name, mres->lockname_len, mres->lockname, mres->master,
755 mres->num_locks, mres->total_locks, mres->flags,
756 mres->ml[0].cookie, mres->ml[0].list, mres->ml[0].flags,
757 mres->ml[0].type, mres->ml[0].convert_type,
758 mres->ml[0].highest_blocked, mres->ml[0].node);
759 BUG();
760 }
681 BUG_ON(dead_node != dlm->reco.dead_node); 761 BUG_ON(dead_node != dlm->reco.dead_node);
682 BUG_ON(reco_master != dlm->reco.new_master); 762 BUG_ON(reco_master != dlm->reco.new_master);
683 763
684 mres = (struct dlm_migratable_lockres *)data;
685
686 /* lock resources should have already been moved to the 764 /* lock resources should have already been moved to the
687 * dlm->reco.resources list. now move items from that list 765 * dlm->reco.resources list. now move items from that list
688 * to a temp list if the dead owner matches. note that the 766 * to a temp list if the dead owner matches. note that the
@@ -757,15 +835,18 @@ int dlm_reco_data_done_handler(struct o2net_msg *msg, u32 len, void *data)
757 continue; 835 continue;
758 836
759 switch (ndata->state) { 837 switch (ndata->state) {
838 /* should have moved beyond INIT but not to FINALIZE yet */
760 case DLM_RECO_NODE_DATA_INIT: 839 case DLM_RECO_NODE_DATA_INIT:
761 case DLM_RECO_NODE_DATA_DEAD: 840 case DLM_RECO_NODE_DATA_DEAD:
762 case DLM_RECO_NODE_DATA_DONE:
763 case DLM_RECO_NODE_DATA_FINALIZE_SENT: 841 case DLM_RECO_NODE_DATA_FINALIZE_SENT:
764 mlog(ML_ERROR, "bad ndata state for node %u:" 842 mlog(ML_ERROR, "bad ndata state for node %u:"
765 " state=%d\n", ndata->node_num, 843 " state=%d\n", ndata->node_num,
766 ndata->state); 844 ndata->state);
767 BUG(); 845 BUG();
768 break; 846 break;
847 /* these states are possible at this point, anywhere along
848 * the line of recovery */
849 case DLM_RECO_NODE_DATA_DONE:
769 case DLM_RECO_NODE_DATA_RECEIVING: 850 case DLM_RECO_NODE_DATA_RECEIVING:
770 case DLM_RECO_NODE_DATA_REQUESTED: 851 case DLM_RECO_NODE_DATA_REQUESTED:
771 case DLM_RECO_NODE_DATA_REQUESTING: 852 case DLM_RECO_NODE_DATA_REQUESTING:
@@ -799,13 +880,31 @@ static void dlm_move_reco_locks_to_list(struct dlm_ctxt *dlm,
799{ 880{
800 struct dlm_lock_resource *res; 881 struct dlm_lock_resource *res;
801 struct list_head *iter, *iter2; 882 struct list_head *iter, *iter2;
883 struct dlm_lock *lock;
802 884
803 spin_lock(&dlm->spinlock); 885 spin_lock(&dlm->spinlock);
804 list_for_each_safe(iter, iter2, &dlm->reco.resources) { 886 list_for_each_safe(iter, iter2, &dlm->reco.resources) {
805 res = list_entry (iter, struct dlm_lock_resource, recovering); 887 res = list_entry (iter, struct dlm_lock_resource, recovering);
888 /* always prune any $RECOVERY entries for dead nodes,
889 * otherwise hangs can occur during later recovery */
806 if (dlm_is_recovery_lock(res->lockname.name, 890 if (dlm_is_recovery_lock(res->lockname.name,
807 res->lockname.len)) 891 res->lockname.len)) {
892 spin_lock(&res->spinlock);
893 list_for_each_entry(lock, &res->granted, list) {
894 if (lock->ml.node == dead_node) {
895 mlog(0, "AHA! there was "
896 "a $RECOVERY lock for dead "
897 "node %u (%s)!\n",
898 dead_node, dlm->name);
899 list_del_init(&lock->list);
900 dlm_lock_put(lock);
901 break;
902 }
903 }
904 spin_unlock(&res->spinlock);
808 continue; 905 continue;
906 }
907
809 if (res->owner == dead_node) { 908 if (res->owner == dead_node) {
810 mlog(0, "found lockres owned by dead node while " 909 mlog(0, "found lockres owned by dead node while "
811 "doing recovery for node %u. sending it.\n", 910 "doing recovery for node %u. sending it.\n",
@@ -1179,7 +1278,7 @@ static void dlm_mig_lockres_worker(struct dlm_work_item *item, void *data)
1179again: 1278again:
1180 ret = dlm_lockres_master_requery(dlm, res, &real_master); 1279 ret = dlm_lockres_master_requery(dlm, res, &real_master);
1181 if (ret < 0) { 1280 if (ret < 0) {
1182 mlog(0, "dlm_lockres_master_requery failure: %d\n", 1281 mlog(0, "dlm_lockres_master_requery ret=%d\n",
1183 ret); 1282 ret);
1184 goto again; 1283 goto again;
1185 } 1284 }
@@ -1594,7 +1693,10 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
1594 u8 dead_node, u8 new_master) 1693 u8 dead_node, u8 new_master)
1595{ 1694{
1596 int i; 1695 int i;
1597 struct list_head *iter, *iter2, *bucket; 1696 struct list_head *iter, *iter2;
1697 struct hlist_node *hash_iter;
1698 struct hlist_head *bucket;
1699
1598 struct dlm_lock_resource *res; 1700 struct dlm_lock_resource *res;
1599 1701
1600 mlog_entry_void(); 1702 mlog_entry_void();
@@ -1618,10 +1720,9 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
1618 * for now we need to run the whole hash, clear 1720 * for now we need to run the whole hash, clear
1619 * the RECOVERING state and set the owner 1721 * the RECOVERING state and set the owner
1620 * if necessary */ 1722 * if necessary */
1621 for (i=0; i<DLM_HASH_SIZE; i++) { 1723 for (i = 0; i < DLM_HASH_BUCKETS; i++) {
1622 bucket = &(dlm->resources[i]); 1724 bucket = &(dlm->lockres_hash[i]);
1623 list_for_each(iter, bucket) { 1725 hlist_for_each_entry(res, hash_iter, bucket, hash_node) {
1624 res = list_entry (iter, struct dlm_lock_resource, list);
1625 if (res->state & DLM_LOCK_RES_RECOVERING) { 1726 if (res->state & DLM_LOCK_RES_RECOVERING) {
1626 if (res->owner == dead_node) { 1727 if (res->owner == dead_node) {
1627 mlog(0, "(this=%u) res %.*s owner=%u " 1728 mlog(0, "(this=%u) res %.*s owner=%u "
@@ -1753,10 +1854,11 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
1753 1854
1754static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node) 1855static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
1755{ 1856{
1756 struct list_head *iter; 1857 struct hlist_node *iter;
1757 struct dlm_lock_resource *res; 1858 struct dlm_lock_resource *res;
1758 int i; 1859 int i;
1759 struct list_head *bucket; 1860 struct hlist_head *bucket;
1861 struct dlm_lock *lock;
1760 1862
1761 1863
1762 /* purge any stale mles */ 1864 /* purge any stale mles */
@@ -1776,14 +1878,28 @@ static void dlm_do_local_recovery_cleanup(struct dlm_ctxt *dlm, u8 dead_node)
1776 * can be kicked again to see if any ASTs or BASTs 1878 * can be kicked again to see if any ASTs or BASTs
1777 * need to be fired as a result. 1879 * need to be fired as a result.
1778 */ 1880 */
1779 for (i=0; i<DLM_HASH_SIZE; i++) { 1881 for (i = 0; i < DLM_HASH_BUCKETS; i++) {
1780 bucket = &(dlm->resources[i]); 1882 bucket = &(dlm->lockres_hash[i]);
1781 list_for_each(iter, bucket) { 1883 hlist_for_each_entry(res, iter, bucket, hash_node) {
1782 res = list_entry (iter, struct dlm_lock_resource, list); 1884 /* always prune any $RECOVERY entries for dead nodes,
1885 * otherwise hangs can occur during later recovery */
1783 if (dlm_is_recovery_lock(res->lockname.name, 1886 if (dlm_is_recovery_lock(res->lockname.name,
1784 res->lockname.len)) 1887 res->lockname.len)) {
1888 spin_lock(&res->spinlock);
1889 list_for_each_entry(lock, &res->granted, list) {
1890 if (lock->ml.node == dead_node) {
1891 mlog(0, "AHA! there was "
1892 "a $RECOVERY lock for dead "
1893 "node %u (%s)!\n",
1894 dead_node, dlm->name);
1895 list_del_init(&lock->list);
1896 dlm_lock_put(lock);
1897 break;
1898 }
1899 }
1900 spin_unlock(&res->spinlock);
1785 continue; 1901 continue;
1786 1902 }
1787 spin_lock(&res->spinlock); 1903 spin_lock(&res->spinlock);
1788 /* zero the lvb if necessary */ 1904 /* zero the lvb if necessary */
1789 dlm_revalidate_lvb(dlm, res, dead_node); 1905 dlm_revalidate_lvb(dlm, res, dead_node);
@@ -1869,12 +1985,9 @@ void dlm_hb_node_up_cb(struct o2nm_node *node, int idx, void *data)
1869 return; 1985 return;
1870 1986
1871 spin_lock(&dlm->spinlock); 1987 spin_lock(&dlm->spinlock);
1872
1873 set_bit(idx, dlm->live_nodes_map); 1988 set_bit(idx, dlm->live_nodes_map);
1874 1989 /* do NOT notify mle attached to the heartbeat events.
1875 /* notify any mles attached to the heartbeat events */ 1990 * new nodes are not interesting in mastery until joined. */
1876 dlm_hb_event_notify_attached(dlm, idx, 1);
1877
1878 spin_unlock(&dlm->spinlock); 1991 spin_unlock(&dlm->spinlock);
1879 1992
1880 dlm_put(dlm); 1993 dlm_put(dlm);
@@ -1897,7 +2010,18 @@ static void dlm_reco_unlock_ast(void *astdata, enum dlm_status st)
1897 mlog(0, "unlockast for recovery lock fired!\n"); 2010 mlog(0, "unlockast for recovery lock fired!\n");
1898} 2011}
1899 2012
1900 2013/*
2014 * dlm_pick_recovery_master will continually attempt to use
2015 * dlmlock() on the special "$RECOVERY" lockres with the
2016 * LKM_NOQUEUE flag to get an EX. every thread that enters
2017 * this function on each node racing to become the recovery
2018 * master will not stop attempting this until either:
2019 * a) this node gets the EX (and becomes the recovery master),
2020 * or b) dlm->reco.new_master gets set to some nodenum
2021 * != O2NM_INVALID_NODE_NUM (another node will do the reco).
2022 * so each time a recovery master is needed, the entire cluster
2023 * will sync at this point. if the new master dies, that will
2024 * be detected in dlm_do_recovery */
1901static int dlm_pick_recovery_master(struct dlm_ctxt *dlm) 2025static int dlm_pick_recovery_master(struct dlm_ctxt *dlm)
1902{ 2026{
1903 enum dlm_status ret; 2027 enum dlm_status ret;
@@ -1906,23 +2030,69 @@ static int dlm_pick_recovery_master(struct dlm_ctxt *dlm)
1906 2030
1907 mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n", 2031 mlog(0, "starting recovery of %s at %lu, dead=%u, this=%u\n",
1908 dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num); 2032 dlm->name, jiffies, dlm->reco.dead_node, dlm->node_num);
1909retry: 2033again:
1910 memset(&lksb, 0, sizeof(lksb)); 2034 memset(&lksb, 0, sizeof(lksb));
1911 2035
1912 ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY, 2036 ret = dlmlock(dlm, LKM_EXMODE, &lksb, LKM_NOQUEUE|LKM_RECOVERY,
1913 DLM_RECOVERY_LOCK_NAME, dlm_reco_ast, dlm, dlm_reco_bast); 2037 DLM_RECOVERY_LOCK_NAME, dlm_reco_ast, dlm, dlm_reco_bast);
1914 2038
2039 mlog(0, "%s: dlmlock($RECOVERY) returned %d, lksb=%d\n",
2040 dlm->name, ret, lksb.status);
2041
1915 if (ret == DLM_NORMAL) { 2042 if (ret == DLM_NORMAL) {
1916 mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n", 2043 mlog(0, "dlm=%s dlmlock says I got it (this=%u)\n",
1917 dlm->name, dlm->node_num); 2044 dlm->name, dlm->node_num);
1918 /* I am master, send message to all nodes saying 2045
1919 * that I am beginning a recovery session */ 2046 /* got the EX lock. check to see if another node
1920 status = dlm_send_begin_reco_message(dlm, 2047 * just became the reco master */
1921 dlm->reco.dead_node); 2048 if (dlm_reco_master_ready(dlm)) {
2049 mlog(0, "%s: got reco EX lock, but %u will "
2050 "do the recovery\n", dlm->name,
2051 dlm->reco.new_master);
2052 status = -EEXIST;
2053 } else {
2054 status = 0;
2055
2056 /* see if recovery was already finished elsewhere */
2057 spin_lock(&dlm->spinlock);
2058 if (dlm->reco.dead_node == O2NM_INVALID_NODE_NUM) {
2059 status = -EINVAL;
2060 mlog(0, "%s: got reco EX lock, but "
2061 "node got recovered already\n", dlm->name);
2062 if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) {
2063 mlog(ML_ERROR, "%s: new master is %u "
2064 "but no dead node!\n",
2065 dlm->name, dlm->reco.new_master);
2066 BUG();
2067 }
2068 }
2069 spin_unlock(&dlm->spinlock);
2070 }
2071
2072 /* if this node has actually become the recovery master,
2073 * set the master and send the messages to begin recovery */
2074 if (!status) {
2075 mlog(0, "%s: dead=%u, this=%u, sending "
2076 "begin_reco now\n", dlm->name,
2077 dlm->reco.dead_node, dlm->node_num);
2078 status = dlm_send_begin_reco_message(dlm,
2079 dlm->reco.dead_node);
2080 /* this always succeeds */
2081 BUG_ON(status);
2082
2083 /* set the new_master to this node */
2084 spin_lock(&dlm->spinlock);
2085 dlm->reco.new_master = dlm->node_num;
2086 spin_unlock(&dlm->spinlock);
2087 }
1922 2088
1923 /* recovery lock is a special case. ast will not get fired, 2089 /* recovery lock is a special case. ast will not get fired,
1924 * so just go ahead and unlock it. */ 2090 * so just go ahead and unlock it. */
1925 ret = dlmunlock(dlm, &lksb, 0, dlm_reco_unlock_ast, dlm); 2091 ret = dlmunlock(dlm, &lksb, 0, dlm_reco_unlock_ast, dlm);
2092 if (ret == DLM_DENIED) {
2093 mlog(0, "got DLM_DENIED, trying LKM_CANCEL\n");
2094 ret = dlmunlock(dlm, &lksb, LKM_CANCEL, dlm_reco_unlock_ast, dlm);
2095 }
1926 if (ret != DLM_NORMAL) { 2096 if (ret != DLM_NORMAL) {
1927 /* this would really suck. this could only happen 2097 /* this would really suck. this could only happen
1928 * if there was a network error during the unlock 2098 * if there was a network error during the unlock
@@ -1930,20 +2100,42 @@ retry:
1930 * is actually "done" and the lock structure is 2100 * is actually "done" and the lock structure is
1931 * even freed. we can continue, but only 2101 * even freed. we can continue, but only
1932 * because this specific lock name is special. */ 2102 * because this specific lock name is special. */
1933 mlog(0, "dlmunlock returned %d\n", ret); 2103 mlog(ML_ERROR, "dlmunlock returned %d\n", ret);
1934 }
1935
1936 if (status < 0) {
1937 mlog(0, "failed to send recovery message. "
1938 "must retry with new node map.\n");
1939 goto retry;
1940 } 2104 }
1941 } else if (ret == DLM_NOTQUEUED) { 2105 } else if (ret == DLM_NOTQUEUED) {
1942 mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n", 2106 mlog(0, "dlm=%s dlmlock says another node got it (this=%u)\n",
1943 dlm->name, dlm->node_num); 2107 dlm->name, dlm->node_num);
1944 /* another node is master. wait on 2108 /* another node is master. wait on
1945 * reco.new_master != O2NM_INVALID_NODE_NUM */ 2109 * reco.new_master != O2NM_INVALID_NODE_NUM
2110 * for at most one second */
2111 wait_event_timeout(dlm->dlm_reco_thread_wq,
2112 dlm_reco_master_ready(dlm),
2113 msecs_to_jiffies(1000));
2114 if (!dlm_reco_master_ready(dlm)) {
2115 mlog(0, "%s: reco master taking awhile\n",
2116 dlm->name);
2117 goto again;
2118 }
2119 /* another node has informed this one that it is reco master */
2120 mlog(0, "%s: reco master %u is ready to recover %u\n",
2121 dlm->name, dlm->reco.new_master, dlm->reco.dead_node);
1946 status = -EEXIST; 2122 status = -EEXIST;
2123 } else {
2124 struct dlm_lock_resource *res;
2125
2126 /* dlmlock returned something other than NOTQUEUED or NORMAL */
2127 mlog(ML_ERROR, "%s: got %s from dlmlock($RECOVERY), "
2128 "lksb.status=%s\n", dlm->name, dlm_errname(ret),
2129 dlm_errname(lksb.status));
2130 res = dlm_lookup_lockres(dlm, DLM_RECOVERY_LOCK_NAME,
2131 DLM_RECOVERY_LOCK_NAME_LEN);
2132 if (res) {
2133 dlm_print_one_lock_resource(res);
2134 dlm_lockres_put(res);
2135 } else {
2136 mlog(ML_ERROR, "recovery lock not found\n");
2137 }
2138 BUG();
1947 } 2139 }
1948 2140
1949 return status; 2141 return status;
@@ -1982,7 +2174,7 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
1982 mlog(0, "not sending begin reco to self\n"); 2174 mlog(0, "not sending begin reco to self\n");
1983 continue; 2175 continue;
1984 } 2176 }
1985 2177retry:
1986 ret = -EINVAL; 2178 ret = -EINVAL;
1987 mlog(0, "attempting to send begin reco msg to %d\n", 2179 mlog(0, "attempting to send begin reco msg to %d\n",
1988 nodenum); 2180 nodenum);
@@ -1991,8 +2183,17 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
1991 /* negative status is handled ok by caller here */ 2183 /* negative status is handled ok by caller here */
1992 if (ret >= 0) 2184 if (ret >= 0)
1993 ret = status; 2185 ret = status;
2186 if (dlm_is_host_down(ret)) {
2187 /* node is down. not involved in recovery
2188 * so just keep going */
2189 mlog(0, "%s: node %u was down when sending "
2190 "begin reco msg (%d)\n", dlm->name, nodenum, ret);
2191 ret = 0;
2192 }
1994 if (ret < 0) { 2193 if (ret < 0) {
1995 struct dlm_lock_resource *res; 2194 struct dlm_lock_resource *res;
2195 /* this is now a serious problem, possibly ENOMEM
2196 * in the network stack. must retry */
1996 mlog_errno(ret); 2197 mlog_errno(ret);
1997 mlog(ML_ERROR, "begin reco of dlm %s to node %u " 2198 mlog(ML_ERROR, "begin reco of dlm %s to node %u "
1998 " returned %d\n", dlm->name, nodenum, ret); 2199 " returned %d\n", dlm->name, nodenum, ret);
@@ -2004,7 +2205,10 @@ static int dlm_send_begin_reco_message(struct dlm_ctxt *dlm, u8 dead_node)
2004 } else { 2205 } else {
2005 mlog(ML_ERROR, "recovery lock not found\n"); 2206 mlog(ML_ERROR, "recovery lock not found\n");
2006 } 2207 }
2007 break; 2208 /* sleep for a bit in hopes that we can avoid
2209 * another ENOMEM */
2210 msleep(100);
2211 goto retry;
2008 } 2212 }
2009 } 2213 }
2010 2214
@@ -2027,19 +2231,34 @@ int dlm_begin_reco_handler(struct o2net_msg *msg, u32 len, void *data)
2027 2231
2028 spin_lock(&dlm->spinlock); 2232 spin_lock(&dlm->spinlock);
2029 if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) { 2233 if (dlm->reco.new_master != O2NM_INVALID_NODE_NUM) {
2030 mlog(0, "new_master already set to %u!\n", 2234 if (test_bit(dlm->reco.new_master, dlm->recovery_map)) {
2031 dlm->reco.new_master); 2235 mlog(0, "%s: new_master %u died, changing "
2236 "to %u\n", dlm->name, dlm->reco.new_master,
2237 br->node_idx);
2238 } else {
2239 mlog(0, "%s: new_master %u NOT DEAD, changing "
2240 "to %u\n", dlm->name, dlm->reco.new_master,
2241 br->node_idx);
2242 /* may not have seen the new master as dead yet */
2243 }
2032 } 2244 }
2033 if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) { 2245 if (dlm->reco.dead_node != O2NM_INVALID_NODE_NUM) {
2034 mlog(0, "dead_node already set to %u!\n", 2246 mlog(ML_NOTICE, "%s: dead_node previously set to %u, "
2035 dlm->reco.dead_node); 2247 "node %u changing it to %u\n", dlm->name,
2248 dlm->reco.dead_node, br->node_idx, br->dead_node);
2036 } 2249 }
2037 dlm->reco.new_master = br->node_idx; 2250 dlm->reco.new_master = br->node_idx;
2038 dlm->reco.dead_node = br->dead_node; 2251 dlm->reco.dead_node = br->dead_node;
2039 if (!test_bit(br->dead_node, dlm->recovery_map)) { 2252 if (!test_bit(br->dead_node, dlm->recovery_map)) {
2040 mlog(ML_ERROR, "recovery master %u sees %u as dead, but this " 2253 mlog(0, "recovery master %u sees %u as dead, but this "
2041 "node has not yet. marking %u as dead\n", 2254 "node has not yet. marking %u as dead\n",
2042 br->node_idx, br->dead_node, br->dead_node); 2255 br->node_idx, br->dead_node, br->dead_node);
2256 if (!test_bit(br->dead_node, dlm->domain_map) ||
2257 !test_bit(br->dead_node, dlm->live_nodes_map))
2258 mlog(0, "%u not in domain/live_nodes map "
2259 "so setting it in reco map manually\n",
2260 br->dead_node);
2261 set_bit(br->dead_node, dlm->recovery_map);
2043 __dlm_hb_node_down(dlm, br->dead_node); 2262 __dlm_hb_node_down(dlm, br->dead_node);
2044 } 2263 }
2045 spin_unlock(&dlm->spinlock); 2264 spin_unlock(&dlm->spinlock);