aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ocfs2/dlm
diff options
context:
space:
mode:
author Kurt Hackel <kurt.hackel@oracle.com> 2006-04-27 21:06:58 -0400
committer Mark Fasheh <mark.fasheh@oracle.com> 2006-06-26 17:42:49 -0400
commit 29c0fa0f56f20b4512f65b0f3e55bc8af50485b7 (patch)
tree da16efa4c6c70f6ea01f84f2eb3c2899cf00654c /fs/ocfs2/dlm
parent c3187ce5e335cf8e06391236cc1ad7d1b1e193ed (diff)
ocfs2: handle network errors during recovery
Signed-off-by: Kurt Hackel <kurt.hackel@oracle.com>
Signed-off-by: Mark Fasheh <mark.fasheh@oracle.com>
Diffstat (limited to 'fs/ocfs2/dlm')
-rw-r--r-- fs/ocfs2/dlm/dlmrecovery.c 53
1 files changed, 36 insertions, 17 deletions
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 394887637289..59c8976915a9 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -757,6 +757,7 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
757 struct list_head *iter; 757 struct list_head *iter;
758 int ret; 758 int ret;
759 u8 dead_node, reco_master; 759 u8 dead_node, reco_master;
760 int skip_all_done = 0;
760 761
761 dlm = item->dlm; 762 dlm = item->dlm;
762 dead_node = item->u.ral.dead_node; 763 dead_node = item->u.ral.dead_node;
@@ -793,12 +794,18 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
793 dlm_move_reco_locks_to_list(dlm, &resources, dead_node); 794 dlm_move_reco_locks_to_list(dlm, &resources, dead_node);
794 795
795 /* now we can begin blasting lockreses without the dlm lock */ 796 /* now we can begin blasting lockreses without the dlm lock */
797
798 /* any errors returned will be due to the new_master dying,
799 * the dlm_reco_thread should detect this */
796 list_for_each(iter, &resources) { 800 list_for_each(iter, &resources) {
797 res = list_entry (iter, struct dlm_lock_resource, recovering); 801 res = list_entry (iter, struct dlm_lock_resource, recovering);
798 ret = dlm_send_one_lockres(dlm, res, mres, reco_master, 802 ret = dlm_send_one_lockres(dlm, res, mres, reco_master,
799 DLM_MRES_RECOVERY); 803 DLM_MRES_RECOVERY);
800 if (ret < 0) 804 if (ret < 0) {
801 mlog_errno(ret); 805 mlog_errno(ret);
806 skip_all_done = 1;
807 break;
808 }
802 } 809 }
803 810
804 /* move the resources back to the list */ 811 /* move the resources back to the list */
@@ -806,9 +813,12 @@ static void dlm_request_all_locks_worker(struct dlm_work_item *item, void *data)
806 list_splice_init(&resources, &dlm->reco.resources); 813 list_splice_init(&resources, &dlm->reco.resources);
807 spin_unlock(&dlm->spinlock); 814 spin_unlock(&dlm->spinlock);
808 815
809 ret = dlm_send_all_done_msg(dlm, dead_node, reco_master); 816 if (!skip_all_done) {
810 if (ret < 0) 817 ret = dlm_send_all_done_msg(dlm, dead_node, reco_master);
811 mlog_errno(ret); 818 if (ret < 0) {
819 mlog_errno(ret);
820 }
821 }
812 822
813 free_page((unsigned long)data); 823 free_page((unsigned long)data);
814} 824}
@@ -828,8 +838,14 @@ static int dlm_send_all_done_msg(struct dlm_ctxt *dlm, u8 dead_node, u8 send_to)
828 838
829 ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg, 839 ret = o2net_send_message(DLM_RECO_DATA_DONE_MSG, dlm->key, &done_msg,
830 sizeof(done_msg), send_to, &tmpret); 840 sizeof(done_msg), send_to, &tmpret);
831 /* negative status is ignored by the caller */ 841 if (ret < 0) {
832 if (ret >= 0) 842 if (!dlm_is_host_down(ret)) {
843 mlog_errno(ret);
844 mlog(ML_ERROR, "%s: unknown error sending data-done "
845 "to %u\n", dlm->name, send_to);
846 BUG();
847 }
848 } else
833 ret = tmpret; 849 ret = tmpret;
834 return ret; 850 return ret;
835} 851}
@@ -1109,22 +1125,25 @@ int dlm_send_one_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res,
1109 * we must send it immediately. */ 1125 * we must send it immediately. */
1110 ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, 1126 ret = dlm_send_mig_lockres_msg(dlm, mres, send_to,
1111 res, total_locks); 1127 res, total_locks);
1112 if (ret < 0) { 1128 if (ret < 0)
1113 // TODO 1129 goto error;
1114 mlog(ML_ERROR, "dlm_send_mig_lockres_msg "
1115 "returned %d, TODO\n", ret);
1116 BUG();
1117 }
1118 } 1130 }
1119 } 1131 }
1120 /* flush any remaining locks */ 1132 /* flush any remaining locks */
1121 ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks); 1133 ret = dlm_send_mig_lockres_msg(dlm, mres, send_to, res, total_locks);
1122 if (ret < 0) { 1134 if (ret < 0)
1123 // TODO 1135 goto error;
1124 mlog(ML_ERROR, "dlm_send_mig_lockres_msg returned %d, " 1136 return ret;
1125 "TODO\n", ret); 1137
1138error:
1139 mlog(ML_ERROR, "%s: dlm_send_mig_lockres_msg returned %d\n",
1140 dlm->name, ret);
1141 if (!dlm_is_host_down(ret))
1126 BUG(); 1142 BUG();
1127 } 1143 mlog(0, "%s: node %u went down while sending %s "
1144 "lockres %.*s\n", dlm->name, send_to,
1145 flags & DLM_MRES_RECOVERY ? "recovery" : "migration",
1146 res->lockname.len, res->lockname.name);
1128 return ret; 1147 return ret;
1129} 1148}
1130 1149