diff options
author | Kurt Hackel <kurt.hackel@oracle.com> | 2006-04-27 21:06:58 -0400 |
---|---|---|
committer | Mark Fasheh <mark.fasheh@oracle.com> | 2006-06-26 17:42:49 -0400 |
commit | 29c0fa0f56f20b4512f65b0f3e55bc8af50485b7 (patch) | |
tree | da16efa4c6c70f6ea01f84f2eb3c2899cf00654c /fs/ocfs2/dlm/dlmrecovery.c | |
parent | c3187ce5e335cf8e06391236cc1ad7d1b1e193ed (diff) |
ocfs2: handle network errors during recovery
Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Diffstat (limited to 'fs/ocfs2/dlm/dlmrecovery.c')
-rw-r--r-- | fs/ocfs2/dlm/dlmrecovery.c | 53 |
1 file changed, 36 insertions, 17 deletions
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 394887637289..59c8976915a9 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c | |||
@@ -757,6 +757,7 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data) | |||
757 | struct list_head *iter; | 757 | struct list_head *iter; |
758 | int ret; | 758 | int ret; |
759 | u8 dead_node, reco_master; | 759 | u8 dead_node, reco_master; |
760 | int skip_all_done = 0; | ||
760 | 761 | ||
761 | dlm = item->dlm; | 762 | dlm = item->dlm; |
762 | dead_node = item->u.ral.dead_node; | 763 | dead_node = item->u.ral.dead_node; |
@@ -793,12 +794,18 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data) | |||
793 | dlm_move_reco_locks_to_list(dlm, &resources, dead_node); | 794 | dlm_move_reco_locks_to_list(dlm, &resources, dead_node); |
794 | 795 | ||
795 | /* now we can begin blasting lockreses without the dlm lock */ | 796 | /* now we can begin blasting lockreses without the dlm lock */ |
797 | |||
798 | /* any errors returned will be due to the new_master dying, | ||
799 | * the dlm_reco_thread should detect this */ | ||
796 | list_for_each(iter, &resources) { | 800 | list_for_each(iter, &resources) { |
797 | res = list_entry (iter, struct dlm_lock_resource, recovering); | 801 | res = list_entry (iter, struct dlm_lock_resource, recovering); |
798 | ret = dlm_send_one_lockres(dlm, res, mres, reco_master, | 802 | ret = dlm_send_one_lockres(dlm, res, mres, reco_master, |
799 | DLM_MRES_RECOVERY); | 803 | DLM_MRES_RECOVERY); |
800 | if (ret < 0) | 804 | if (ret < 0) { |
801 | mlog_errno(ret); | 805 | mlog_errno(ret); |
806 | skip_all_done = 1; | ||
807 | break; | ||
808 | } | ||
802 | } | 809 | } |
803 | 810 | ||
804 | /* move the resources back to the list */ | 811 | /* move the resources back to the list */ |
@@ -806,9 +813,12 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data) | |||
806 | list_splice_init(&resources, &dlm->reco.resources); | 813 | list_splice_init(&resources, &dlm->reco.resources); |
807 | spin_unlock(&dlm->spinlock); | 814 | spin_unlock(&dlm->spinlock); |
808 | 815 | ||
809 | ret = dlm_send_all_done_msg(dlm, dead_node, reco_master); | 816 | if (!skip_all_done) { |
810 | if (ret < 0) | 817 | ret = dlm_send_all_done_msg(dlm, dead_node, reco_master); |
811 | mlog_errno(ret); | 818 | if (ret < 0) { |
819 | mlog_errno(ret); | ||
820 | } | ||
821 | } | ||
812 | 822 | ||
813 | free_page((unsigned long)data); | 823 | free_page((unsigned long)data); |
814 | } | 824 | } |
@@ -828,8 +838,14 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to) | |||
828 | 838 | ||
829 | ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg, | 839 | ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg, |
830 | sizeof(done_msg), send_to, &tmpret); | 840 | sizeof(done_msg), send_to, &tmpret); |
831 | /* negative status is ignored by the caller */ | 841 | if (ret < 0) { |
832 | if (ret >= 0) | 842 | if (!dlm_is_host_down(ret)) { |
843 | mlog_errno(ret); | ||
844 | mlog(ML_ERROR, "%s: unknown error sending data-done " | ||
845 | "to %u\n", dlm->name, send_to); | ||
846 | BUG(); | ||
847 | } | ||
848 | } else | ||
833 | ret = tmpret; | 849 | ret = tmpret; |
834 | return ret; | 850 | return ret; |
835 | } | 851 | } |
@@ -1109,22 +1125,25 @@ int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, | |||
1109 | * we must send it immediately. */ | 1125 | * we must send it immediately. */ |
1110 | ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, | 1126 | ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, |
1111 | res, total_locks); | 1127 | res, total_locks); |
1112 | if (ret < 0) { | 1128 | if (ret < 0) |
1113 | // TODO | 1129 | goto error; |
1114 | mlog(ML_ERROR, "dlm_send_mig_lockres_msg " | ||
1115 | "returned %d, TODO\n", ret); | ||
1116 | BUG(); | ||
1117 | } | ||
1118 | } | 1130 | } |
1119 | } | 1131 | } |
1120 | /* flush any remaining locks */ | 1132 | /* flush any remaining locks */ |
1121 | ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks); | 1133 | ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks); |
1122 | if (ret < 0) { | 1134 | if (ret < 0) |
1123 | // TODO | 1135 | goto error; |
1124 | mlog(ML_ERROR, "dlm_send_mig_lockres_msg returned %d, " | 1136 | return ret; |
1125 | "TODO\n", ret); | 1137 | |
1138 | error: | ||
1139 | mlog(ML_ERROR, "%s: dlm_send_mig_lockres_msg returned %d\n", | ||
1140 | dlm->name, ret); | ||
1141 | if (!dlm_is_host_down(ret)) | ||
1126 | BUG(); | 1142 | BUG(); |
1127 | } | 1143 | mlog(0, "%s: node %u went down while sending %s " |
1144 | "lockres %.*s\n", dlm->name, send_to, | ||
1145 | flags & DLM_MRES_RECOVERY ? "recovery" : "migration", | ||
1146 | res->lockname.len, res->lockname.name); | ||
1128 | return ret; | 1147 | return ret; |
1129 | } | 1148 | } |
1130 | 1149 | ||