diff options
Diffstat (limited to 'fs/ocfs2/dlm/dlmmaster.c')
-rw-r--r-- | fs/ocfs2/dlm/dlmmaster.c | 103 |
1 files changed, 103 insertions, 0 deletions
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 78ac3a00eb54..940be4c13b1f 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c | |||
@@ -239,6 +239,8 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, | |||
239 | static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm, | 239 | static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm, |
240 | struct dlm_lock_resource *res, | 240 | struct dlm_lock_resource *res, |
241 | u8 target); | 241 | u8 target); |
242 | static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm, | ||
243 | struct dlm_lock_resource *res); | ||
242 | 244 | ||
243 | 245 | ||
244 | int dlm_is_host_down(int errno) | 246 | int dlm_is_host_down(int errno) |
@@ -677,6 +679,7 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, | |||
677 | struct dlm_node_iter iter; | 679 | struct dlm_node_iter iter; |
678 | unsigned int namelen; | 680 | unsigned int namelen; |
679 | int tries = 0; | 681 | int tries = 0; |
682 | int bit, wait_on_recovery = 0; | ||
680 | 683 | ||
681 | BUG_ON(!lockid); | 684 | BUG_ON(!lockid); |
682 | 685 | ||
@@ -762,6 +765,18 @@ lookup: | |||
762 | dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0); | 765 | dlm_init_mle(mle, DLM_MLE_MASTER, dlm, res, NULL, 0); |
763 | set_bit(dlm->node_num, mle->maybe_map); | 766 | set_bit(dlm->node_num, mle->maybe_map); |
764 | list_add(&mle->list, &dlm->master_list); | 767 | list_add(&mle->list, &dlm->master_list); |
768 | |||
769 | /* still holding the dlm spinlock, check the recovery map | ||
770 | * to see if there are any nodes that still need to be | ||
771 | * considered. these will not appear in the mle nodemap | ||
772 | * but they might own this lockres. wait on them. */ | ||
773 | bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); | ||
774 | if (bit < O2NM_MAX_NODES) { | ||
775 | mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to" | ||
776 | "recover before lock mastery can begin\n", | ||
777 | dlm->name, namelen, (char *)lockid, bit); | ||
778 | wait_on_recovery = 1; | ||
779 | } | ||
765 | } | 780 | } |
766 | 781 | ||
767 | /* at this point there is either a DLM_MLE_BLOCK or a | 782 | /* at this point there is either a DLM_MLE_BLOCK or a |
@@ -779,6 +794,39 @@ lookup: | |||
779 | spin_unlock(&dlm->master_lock); | 794 | spin_unlock(&dlm->master_lock); |
780 | spin_unlock(&dlm->spinlock); | 795 | spin_unlock(&dlm->spinlock); |
781 | 796 | ||
797 | while (wait_on_recovery) { | ||
798 | /* any cluster changes that occurred after dropping the | ||
799 | * dlm spinlock would be detectable by a change on the mle, | ||
800 | * so we only need to clear out the recovery map once. */ | ||
801 | if (dlm_is_recovery_lock(lockid, namelen)) { | ||
802 | mlog(ML_NOTICE, "%s: recovery map is not empty, but " | ||
803 | "must master $RECOVERY lock now\n", dlm->name); | ||
804 | if (!dlm_pre_master_reco_lockres(dlm, res)) | ||
805 | wait_on_recovery = 0; | ||
806 | else { | ||
807 | mlog(0, "%s: waiting 500ms for heartbeat state " | ||
808 | "change\n", dlm->name); | ||
809 | msleep(500); | ||
810 | } | ||
811 | continue; | ||
812 | } | ||
813 | |||
814 | dlm_kick_recovery_thread(dlm); | ||
815 | msleep(100); | ||
816 | dlm_wait_for_recovery(dlm); | ||
817 | |||
818 | spin_lock(&dlm->spinlock); | ||
819 | bit = find_next_bit(dlm->recovery_map, O2NM_MAX_NODES, 0); | ||
820 | if (bit < O2NM_MAX_NODES) { | ||
821 | mlog(ML_NOTICE, "%s:%.*s: at least one node (%d) to" | ||
822 | "recover before lock mastery can begin\n", | ||
823 | dlm->name, namelen, (char *)lockid, bit); | ||
824 | wait_on_recovery = 1; | ||
825 | } else | ||
826 | wait_on_recovery = 0; | ||
827 | spin_unlock(&dlm->spinlock); | ||
828 | } | ||
829 | |||
782 | /* must wait for lock to be mastered elsewhere */ | 830 | /* must wait for lock to be mastered elsewhere */ |
783 | if (blocked) | 831 | if (blocked) |
784 | goto wait; | 832 | goto wait; |
@@ -1835,6 +1883,61 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data) | |||
1835 | mlog(0, "finished with dlm_assert_master_worker\n"); | 1883 | mlog(0, "finished with dlm_assert_master_worker\n"); |
1836 | } | 1884 | } |
1837 | 1885 | ||
1886 | /* SPECIAL CASE for the $RECOVERY lock used by the recovery thread. | ||
1887 | * We cannot wait for node recovery to complete before mastering this | ||
1888 | * lockres, because this lockres is what kicks off recovery. So ask | ||
1889 | * every other node in the domain map who it thinks masters $RECOVERY. | ||
1890 | * If the first known master reported is in our recovery map, that | ||
1891 | * node has not yet heard from its own heartbeat stack; the caller | ||
1892 | * should wait a short time and check again. $RECOVERY lock resources | ||
1893 | * mastered by dead nodes are purged when the heartbeat callback fires. | ||
1894 | * Returns 0 when it is safe to master the lock now, -EAGAIN when the | ||
1895 | * caller must retry. A node found to be down counts as "no master". */ | ||
1896 | static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm, | ||
1897 | struct dlm_lock_resource *res) | ||
1898 | { | ||
1899 | struct dlm_node_iter iter; | ||
1900 | int nodenum; | ||
1901 | int ret = 0; | ||
1902 | u8 master = DLM_LOCK_RES_OWNER_UNKNOWN; | ||
1903 | |||
1904 | spin_lock(&dlm->spinlock); | ||
1905 | dlm_node_iter_init(dlm->domain_map, &iter); | ||
1906 | spin_unlock(&dlm->spinlock); | ||
1907 | |||
1908 | while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { | ||
1909 | /* do not send to self */ | ||
1910 | if (nodenum == dlm->node_num) | ||
1911 | continue; | ||
1912 | ret = dlm_do_master_requery(dlm, res, nodenum, &master); | ||
1913 | if (ret < 0) { | ||
1914 | mlog_errno(ret); | ||
1915 | if (!dlm_is_host_down(ret)) | ||
1916 | BUG(); | ||
1917 | /* host is down: its answer counts as OWNER_UNKNOWN */ | ||
1918 | ret = 0; | ||
1919 | } | ||
1920 | |||
1921 | if (master != DLM_LOCK_RES_OWNER_UNKNOWN) { | ||
1922 | /* check to see if this master is in the recovery map */ | ||
1923 | spin_lock(&dlm->spinlock); | ||
1924 | if (test_bit(master, dlm->recovery_map)) { | ||
1925 | mlog(ML_NOTICE, "%s: node %u has not seen " | ||
1926 | "node %u go down yet, and thinks the " | ||
1927 | "dead node is mastering the recovery " | ||
1928 | "lock. must wait.\n", dlm->name, | ||
1929 | nodenum, master); | ||
1930 | ret = -EAGAIN; | ||
1931 | } | ||
1932 | spin_unlock(&dlm->spinlock); | ||
1933 | mlog(0, "%s: reco lock master is %u\n", dlm->name, | ||
1934 | master); | ||
1935 | break; | ||
1936 | } | ||
1937 | } | ||
1938 | return ret; | ||
1939 | } | ||
1940 | |||
1838 | 1941 | ||
1839 | /* | 1942 | /* |
1840 | * DLM_MIGRATE_LOCKRES | 1943 | * DLM_MIGRATE_LOCKRES |