diff options
Diffstat (limited to 'fs/ocfs2/dlm/dlmmaster.c')
-rw-r--r-- | fs/ocfs2/dlm/dlmmaster.c | 579 |
1 files changed, 510 insertions, 69 deletions
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 0ad872055cb3..77e4e6169a0d 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c | |||
@@ -99,9 +99,10 @@ static void dlm_mle_node_up(struct dlm_ctxt *dlm, | |||
99 | int idx); | 99 | int idx); |
100 | 100 | ||
101 | static void dlm_assert_master_worker(struct dlm_work_item *item, void *data); | 101 | static void dlm_assert_master_worker(struct dlm_work_item *item, void *data); |
102 | static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname, | 102 | static int dlm_do_assert_master(struct dlm_ctxt *dlm, |
103 | unsigned int namelen, void *nodemap, | 103 | struct dlm_lock_resource *res, |
104 | u32 flags); | 104 | void *nodemap, u32 flags); |
105 | static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data); | ||
105 | 106 | ||
106 | static inline int dlm_mle_equal(struct dlm_ctxt *dlm, | 107 | static inline int dlm_mle_equal(struct dlm_ctxt *dlm, |
107 | struct dlm_master_list_entry *mle, | 108 | struct dlm_master_list_entry *mle, |
@@ -237,7 +238,8 @@ static int dlm_find_mle(struct dlm_ctxt *dlm, | |||
237 | struct dlm_master_list_entry **mle, | 238 | struct dlm_master_list_entry **mle, |
238 | char *name, unsigned int namelen); | 239 | char *name, unsigned int namelen); |
239 | 240 | ||
240 | static int dlm_do_master_request(struct dlm_master_list_entry *mle, int to); | 241 | static int dlm_do_master_request(struct dlm_lock_resource *res, |
242 | struct dlm_master_list_entry *mle, int to); | ||
241 | 243 | ||
242 | 244 | ||
243 | static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm, | 245 | static int dlm_wait_for_lock_mastery(struct dlm_ctxt *dlm, |
@@ -687,6 +689,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, | |||
687 | INIT_LIST_HEAD(&res->purge); | 689 | INIT_LIST_HEAD(&res->purge); |
688 | atomic_set(&res->asts_reserved, 0); | 690 | atomic_set(&res->asts_reserved, 0); |
689 | res->migration_pending = 0; | 691 | res->migration_pending = 0; |
692 | res->inflight_locks = 0; | ||
690 | 693 | ||
691 | kref_init(&res->refs); | 694 | kref_init(&res->refs); |
692 | 695 | ||
@@ -700,6 +703,7 @@ static void dlm_init_lockres(struct dlm_ctxt *dlm, | |||
700 | res->last_used = 0; | 703 | res->last_used = 0; |
701 | 704 | ||
702 | memset(res->lvb, 0, DLM_LVB_LEN); | 705 | memset(res->lvb, 0, DLM_LVB_LEN); |
706 | memset(res->refmap, 0, sizeof(res->refmap)); | ||
703 | } | 707 | } |
704 | 708 | ||
705 | struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, | 709 | struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, |
@@ -722,6 +726,42 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, | |||
722 | return res; | 726 | return res; |
723 | } | 727 | } |
724 | 728 | ||
729 | void __dlm_lockres_grab_inflight_ref(struct dlm_ctxt *dlm, | ||
730 | struct dlm_lock_resource *res, | ||
731 | int new_lockres, | ||
732 | const char *file, | ||
733 | int line) | ||
734 | { | ||
735 | if (!new_lockres) | ||
736 | assert_spin_locked(&res->spinlock); | ||
737 | |||
738 | if (!test_bit(dlm->node_num, res->refmap)) { | ||
739 | BUG_ON(res->inflight_locks != 0); | ||
740 | dlm_lockres_set_refmap_bit(dlm->node_num, res); | ||
741 | } | ||
742 | res->inflight_locks++; | ||
743 | mlog(0, "%s:%.*s: inflight++: now %u\n", | ||
744 | dlm->name, res->lockname.len, res->lockname.name, | ||
745 | res->inflight_locks); | ||
746 | } | ||
747 | |||
748 | void __dlm_lockres_drop_inflight_ref(struct dlm_ctxt *dlm, | ||
749 | struct dlm_lock_resource *res, | ||
750 | const char *file, | ||
751 | int line) | ||
752 | { | ||
753 | assert_spin_locked(&res->spinlock); | ||
754 | |||
755 | BUG_ON(res->inflight_locks == 0); | ||
756 | res->inflight_locks--; | ||
757 | mlog(0, "%s:%.*s: inflight--: now %u\n", | ||
758 | dlm->name, res->lockname.len, res->lockname.name, | ||
759 | res->inflight_locks); | ||
760 | if (res->inflight_locks == 0) | ||
761 | dlm_lockres_clear_refmap_bit(dlm->node_num, res); | ||
762 | wake_up(&res->wq); | ||
763 | } | ||
764 | |||
725 | /* | 765 | /* |
726 | * lookup a lock resource by name. | 766 | * lookup a lock resource by name. |
727 | * may already exist in the hashtable. | 767 | * may already exist in the hashtable. |
@@ -752,6 +792,7 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, | |||
752 | unsigned int hash; | 792 | unsigned int hash; |
753 | int tries = 0; | 793 | int tries = 0; |
754 | int bit, wait_on_recovery = 0; | 794 | int bit, wait_on_recovery = 0; |
795 | int drop_inflight_if_nonlocal = 0; | ||
755 | 796 | ||
756 | BUG_ON(!lockid); | 797 | BUG_ON(!lockid); |
757 | 798 | ||
@@ -761,9 +802,30 @@ struct dlm_lock_resource * dlm_get_lock_resource(struct dlm_ctxt *dlm, | |||
761 | 802 | ||
762 | lookup: | 803 | lookup: |
763 | spin_lock(&dlm->spinlock); | 804 | spin_lock(&dlm->spinlock); |
764 | tmpres = __dlm_lookup_lockres(dlm, lockid, namelen, hash); | 805 | tmpres = __dlm_lookup_lockres_full(dlm, lockid, namelen, hash); |
765 | if (tmpres) { | 806 | if (tmpres) { |
807 | int dropping_ref = 0; | ||
808 | |||
809 | spin_lock(&tmpres->spinlock); | ||
810 | if (tmpres->owner == dlm->node_num) { | ||
811 | BUG_ON(tmpres->state & DLM_LOCK_RES_DROPPING_REF); | ||
812 | dlm_lockres_grab_inflight_ref(dlm, tmpres); | ||
813 | } else if (tmpres->state & DLM_LOCK_RES_DROPPING_REF) | ||
814 | dropping_ref = 1; | ||
815 | spin_unlock(&tmpres->spinlock); | ||
766 | spin_unlock(&dlm->spinlock); | 816 | spin_unlock(&dlm->spinlock); |
817 | |||
818 | /* wait until done messaging the master, drop our ref to allow | ||
819 | * the lockres to be purged, start over. */ | ||
820 | if (dropping_ref) { | ||
821 | spin_lock(&tmpres->spinlock); | ||
822 | __dlm_wait_on_lockres_flags(tmpres, DLM_LOCK_RES_DROPPING_REF); | ||
823 | spin_unlock(&tmpres->spinlock); | ||
824 | dlm_lockres_put(tmpres); | ||
825 | tmpres = NULL; | ||
826 | goto lookup; | ||
827 | } | ||
828 | |||
767 | mlog(0, "found in hash!\n"); | 829 | mlog(0, "found in hash!\n"); |
768 | if (res) | 830 | if (res) |
769 | dlm_lockres_put(res); | 831 | dlm_lockres_put(res); |
@@ -793,6 +855,7 @@ lookup: | |||
793 | spin_lock(&res->spinlock); | 855 | spin_lock(&res->spinlock); |
794 | dlm_change_lockres_owner(dlm, res, dlm->node_num); | 856 | dlm_change_lockres_owner(dlm, res, dlm->node_num); |
795 | __dlm_insert_lockres(dlm, res); | 857 | __dlm_insert_lockres(dlm, res); |
858 | dlm_lockres_grab_inflight_ref(dlm, res); | ||
796 | spin_unlock(&res->spinlock); | 859 | spin_unlock(&res->spinlock); |
797 | spin_unlock(&dlm->spinlock); | 860 | spin_unlock(&dlm->spinlock); |
798 | /* lockres still marked IN_PROGRESS */ | 861 | /* lockres still marked IN_PROGRESS */ |
@@ -805,29 +868,40 @@ lookup: | |||
805 | /* if we found a block, wait for lock to be mastered by another node */ | 868 | /* if we found a block, wait for lock to be mastered by another node */ |
806 | blocked = dlm_find_mle(dlm, &mle, (char *)lockid, namelen); | 869 | blocked = dlm_find_mle(dlm, &mle, (char *)lockid, namelen); |
807 | if (blocked) { | 870 | if (blocked) { |
871 | int mig; | ||
808 | if (mle->type == DLM_MLE_MASTER) { | 872 | if (mle->type == DLM_MLE_MASTER) { |
809 | mlog(ML_ERROR, "master entry for nonexistent lock!\n"); | 873 | mlog(ML_ERROR, "master entry for nonexistent lock!\n"); |
810 | BUG(); | 874 | BUG(); |
811 | } else if (mle->type == DLM_MLE_MIGRATION) { | 875 | } |
812 | /* migration is in progress! */ | 876 | mig = (mle->type == DLM_MLE_MIGRATION); |
813 | /* the good news is that we now know the | 877 | /* if there is a migration in progress, let the migration |
814 | * "current" master (mle->master). */ | 878 | * finish before continuing. we can wait for the absence |
815 | 879 | * of the MIGRATION mle: either the migrate finished or | |
880 | * one of the nodes died and the mle was cleaned up. | ||
881 | * if there is a BLOCK here, but it already has a master | ||
882 | * set, we are too late. the master does not have a ref | ||
883 | * for us in the refmap. detach the mle and drop it. | ||
884 | * either way, go back to the top and start over. */ | ||
885 | if (mig || mle->master != O2NM_MAX_NODES) { | ||
886 | BUG_ON(mig && mle->master == dlm->node_num); | ||
887 | /* we arrived too late. the master does not | ||
888 | * have a ref for us. retry. */ | ||
889 | mlog(0, "%s:%.*s: late on %s\n", | ||
890 | dlm->name, namelen, lockid, | ||
891 | mig ? "MIGRATION" : "BLOCK"); | ||
816 | spin_unlock(&dlm->master_lock); | 892 | spin_unlock(&dlm->master_lock); |
817 | assert_spin_locked(&dlm->spinlock); | ||
818 | |||
819 | /* set the lockres owner and hash it */ | ||
820 | spin_lock(&res->spinlock); | ||
821 | dlm_set_lockres_owner(dlm, res, mle->master); | ||
822 | __dlm_insert_lockres(dlm, res); | ||
823 | spin_unlock(&res->spinlock); | ||
824 | spin_unlock(&dlm->spinlock); | 893 | spin_unlock(&dlm->spinlock); |
825 | 894 | ||
826 | /* master is known, detach */ | 895 | /* master is known, detach */ |
827 | dlm_mle_detach_hb_events(dlm, mle); | 896 | if (!mig) |
897 | dlm_mle_detach_hb_events(dlm, mle); | ||
828 | dlm_put_mle(mle); | 898 | dlm_put_mle(mle); |
829 | mle = NULL; | 899 | mle = NULL; |
830 | goto wake_waiters; | 900 | /* this is lame, but we can't wait on either |
901 | * the mle or lockres waitqueue here */ | ||
902 | if (mig) | ||
903 | msleep(100); | ||
904 | goto lookup; | ||
831 | } | 905 | } |
832 | } else { | 906 | } else { |
833 | /* go ahead and try to master lock on this node */ | 907 | /* go ahead and try to master lock on this node */ |
@@ -858,6 +932,13 @@ lookup: | |||
858 | 932 | ||
859 | /* finally add the lockres to its hash bucket */ | 933 | /* finally add the lockres to its hash bucket */ |
860 | __dlm_insert_lockres(dlm, res); | 934 | __dlm_insert_lockres(dlm, res); |
935 | /* since this lockres is new it does not require the spinlock */ | ||
936 | dlm_lockres_grab_inflight_ref_new(dlm, res); | ||
937 | |||
938 | /* if this node does not become the master make sure to drop | ||
939 | * this inflight reference below */ | ||
940 | drop_inflight_if_nonlocal = 1; | ||
941 | |||
861 | /* get an extra ref on the mle in case this is a BLOCK | 942 | /* get an extra ref on the mle in case this is a BLOCK |
862 | * if so, the creator of the BLOCK may try to put the last | 943 | * if so, the creator of the BLOCK may try to put the last |
863 | * ref at this time in the assert master handler, so we | 944 | * ref at this time in the assert master handler, so we |
@@ -910,7 +991,7 @@ redo_request: | |||
910 | ret = -EINVAL; | 991 | ret = -EINVAL; |
911 | dlm_node_iter_init(mle->vote_map, &iter); | 992 | dlm_node_iter_init(mle->vote_map, &iter); |
912 | while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { | 993 | while ((nodenum = dlm_node_iter_next(&iter)) >= 0) { |
913 | ret = dlm_do_master_request(mle, nodenum); | 994 | ret = dlm_do_master_request(res, mle, nodenum); |
914 | if (ret < 0) | 995 | if (ret < 0) |
915 | mlog_errno(ret); | 996 | mlog_errno(ret); |
916 | if (mle->master != O2NM_MAX_NODES) { | 997 | if (mle->master != O2NM_MAX_NODES) { |
@@ -960,6 +1041,8 @@ wait: | |||
960 | 1041 | ||
961 | wake_waiters: | 1042 | wake_waiters: |
962 | spin_lock(&res->spinlock); | 1043 | spin_lock(&res->spinlock); |
1044 | if (res->owner != dlm->node_num && drop_inflight_if_nonlocal) | ||
1045 | dlm_lockres_drop_inflight_ref(dlm, res); | ||
963 | res->state &= ~DLM_LOCK_RES_IN_PROGRESS; | 1046 | res->state &= ~DLM_LOCK_RES_IN_PROGRESS; |
964 | spin_unlock(&res->spinlock); | 1047 | spin_unlock(&res->spinlock); |
965 | wake_up(&res->wq); | 1048 | wake_up(&res->wq); |
@@ -998,7 +1081,7 @@ recheck: | |||
998 | /* this will cause the master to re-assert across | 1081 | /* this will cause the master to re-assert across |
999 | * the whole cluster, freeing up mles */ | 1082 | * the whole cluster, freeing up mles */ |
1000 | if (res->owner != dlm->node_num) { | 1083 | if (res->owner != dlm->node_num) { |
1001 | ret = dlm_do_master_request(mle, res->owner); | 1084 | ret = dlm_do_master_request(res, mle, res->owner); |
1002 | if (ret < 0) { | 1085 | if (ret < 0) { |
1003 | /* give recovery a chance to run */ | 1086 | /* give recovery a chance to run */ |
1004 | mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret); | 1087 | mlog(ML_ERROR, "link to %u went down?: %d\n", res->owner, ret); |
@@ -1062,6 +1145,8 @@ recheck: | |||
1062 | * now tell other nodes that I am | 1145 | * now tell other nodes that I am |
1063 | * mastering this. */ | 1146 | * mastering this. */ |
1064 | mle->master = dlm->node_num; | 1147 | mle->master = dlm->node_num; |
1148 | /* ref was grabbed in get_lock_resource; | ||
1149 | * will be dropped in dlmlock_master */ | ||
1065 | assert = 1; | 1150 | assert = 1; |
1066 | sleep = 0; | 1151 | sleep = 0; |
1067 | } | 1152 | } |
@@ -1087,7 +1172,8 @@ recheck: | |||
1087 | (atomic_read(&mle->woken) == 1), | 1172 | (atomic_read(&mle->woken) == 1), |
1088 | timeo); | 1173 | timeo); |
1089 | if (res->owner == O2NM_MAX_NODES) { | 1174 | if (res->owner == O2NM_MAX_NODES) { |
1090 | mlog(0, "waiting again\n"); | 1175 | mlog(0, "%s:%.*s: waiting again\n", dlm->name, |
1176 | res->lockname.len, res->lockname.name); | ||
1091 | goto recheck; | 1177 | goto recheck; |
1092 | } | 1178 | } |
1093 | mlog(0, "done waiting, master is %u\n", res->owner); | 1179 | mlog(0, "done waiting, master is %u\n", res->owner); |
@@ -1100,8 +1186,7 @@ recheck: | |||
1100 | m = dlm->node_num; | 1186 | m = dlm->node_num; |
1101 | mlog(0, "about to master %.*s here, this=%u\n", | 1187 | mlog(0, "about to master %.*s here, this=%u\n", |
1102 | res->lockname.len, res->lockname.name, m); | 1188 | res->lockname.len, res->lockname.name, m); |
1103 | ret = dlm_do_assert_master(dlm, res->lockname.name, | 1189 | ret = dlm_do_assert_master(dlm, res, mle->vote_map, 0); |
1104 | res->lockname.len, mle->vote_map, 0); | ||
1105 | if (ret) { | 1190 | if (ret) { |
1106 | /* This is a failure in the network path, | 1191 | /* This is a failure in the network path, |
1107 | * not in the response to the assert_master | 1192 | * not in the response to the assert_master |
@@ -1117,6 +1202,8 @@ recheck: | |||
1117 | 1202 | ||
1118 | /* set the lockres owner */ | 1203 | /* set the lockres owner */ |
1119 | spin_lock(&res->spinlock); | 1204 | spin_lock(&res->spinlock); |
1205 | /* mastery reference obtained either during | ||
1206 | * assert_master_handler or in get_lock_resource */ | ||
1120 | dlm_change_lockres_owner(dlm, res, m); | 1207 | dlm_change_lockres_owner(dlm, res, m); |
1121 | spin_unlock(&res->spinlock); | 1208 | spin_unlock(&res->spinlock); |
1122 | 1209 | ||
@@ -1283,7 +1370,8 @@ static int dlm_restart_lock_mastery(struct dlm_ctxt *dlm, | |||
1283 | * | 1370 | * |
1284 | */ | 1371 | */ |
1285 | 1372 | ||
1286 | static int dlm_do_master_request(struct dlm_master_list_entry *mle, int to) | 1373 | static int dlm_do_master_request(struct dlm_lock_resource *res, |
1374 | struct dlm_master_list_entry *mle, int to) | ||
1287 | { | 1375 | { |
1288 | struct dlm_ctxt *dlm = mle->dlm; | 1376 | struct dlm_ctxt *dlm = mle->dlm; |
1289 | struct dlm_master_request request; | 1377 | struct dlm_master_request request; |
@@ -1339,6 +1427,9 @@ again: | |||
1339 | case DLM_MASTER_RESP_YES: | 1427 | case DLM_MASTER_RESP_YES: |
1340 | set_bit(to, mle->response_map); | 1428 | set_bit(to, mle->response_map); |
1341 | mlog(0, "node %u is the master, response=YES\n", to); | 1429 | mlog(0, "node %u is the master, response=YES\n", to); |
1430 | mlog(0, "%s:%.*s: master node %u now knows I have a " | ||
1431 | "reference\n", dlm->name, res->lockname.len, | ||
1432 | res->lockname.name, to); | ||
1342 | mle->master = to; | 1433 | mle->master = to; |
1343 | break; | 1434 | break; |
1344 | case DLM_MASTER_RESP_NO: | 1435 | case DLM_MASTER_RESP_NO: |
@@ -1379,7 +1470,8 @@ out: | |||
1379 | * | 1470 | * |
1380 | * if possible, TRIM THIS DOWN!!! | 1471 | * if possible, TRIM THIS DOWN!!! |
1381 | */ | 1472 | */ |
1382 | int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data) | 1473 | int dlm_master_request_handler(struct o2net_msg *msg, u32 len, void *data, |
1474 | void **ret_data) | ||
1383 | { | 1475 | { |
1384 | u8 response = DLM_MASTER_RESP_MAYBE; | 1476 | u8 response = DLM_MASTER_RESP_MAYBE; |
1385 | struct dlm_ctxt *dlm = data; | 1477 | struct dlm_ctxt *dlm = data; |
@@ -1417,10 +1509,11 @@ way_up_top: | |||
1417 | 1509 | ||
1418 | /* take care of the easy cases up front */ | 1510 | /* take care of the easy cases up front */ |
1419 | spin_lock(&res->spinlock); | 1511 | spin_lock(&res->spinlock); |
1420 | if (res->state & DLM_LOCK_RES_RECOVERING) { | 1512 | if (res->state & (DLM_LOCK_RES_RECOVERING| |
1513 | DLM_LOCK_RES_MIGRATING)) { | ||
1421 | spin_unlock(&res->spinlock); | 1514 | spin_unlock(&res->spinlock); |
1422 | mlog(0, "returning DLM_MASTER_RESP_ERROR since res is " | 1515 | mlog(0, "returning DLM_MASTER_RESP_ERROR since res is " |
1423 | "being recovered\n"); | 1516 | "being recovered/migrated\n"); |
1424 | response = DLM_MASTER_RESP_ERROR; | 1517 | response = DLM_MASTER_RESP_ERROR; |
1425 | if (mle) | 1518 | if (mle) |
1426 | kmem_cache_free(dlm_mle_cache, mle); | 1519 | kmem_cache_free(dlm_mle_cache, mle); |
@@ -1428,8 +1521,10 @@ way_up_top: | |||
1428 | } | 1521 | } |
1429 | 1522 | ||
1430 | if (res->owner == dlm->node_num) { | 1523 | if (res->owner == dlm->node_num) { |
1524 | mlog(0, "%s:%.*s: setting bit %u in refmap\n", | ||
1525 | dlm->name, namelen, name, request->node_idx); | ||
1526 | dlm_lockres_set_refmap_bit(request->node_idx, res); | ||
1431 | spin_unlock(&res->spinlock); | 1527 | spin_unlock(&res->spinlock); |
1432 | // mlog(0, "this node is the master\n"); | ||
1433 | response = DLM_MASTER_RESP_YES; | 1528 | response = DLM_MASTER_RESP_YES; |
1434 | if (mle) | 1529 | if (mle) |
1435 | kmem_cache_free(dlm_mle_cache, mle); | 1530 | kmem_cache_free(dlm_mle_cache, mle); |
@@ -1477,7 +1572,6 @@ way_up_top: | |||
1477 | mlog(0, "node %u is master, but trying to migrate to " | 1572 | mlog(0, "node %u is master, but trying to migrate to " |
1478 | "node %u.\n", tmpmle->master, tmpmle->new_master); | 1573 | "node %u.\n", tmpmle->master, tmpmle->new_master); |
1479 | if (tmpmle->master == dlm->node_num) { | 1574 | if (tmpmle->master == dlm->node_num) { |
1480 | response = DLM_MASTER_RESP_YES; | ||
1481 | mlog(ML_ERROR, "no owner on lockres, but this " | 1575 | mlog(ML_ERROR, "no owner on lockres, but this " |
1482 | "node is trying to migrate it to %u?!\n", | 1576 | "node is trying to migrate it to %u?!\n", |
1483 | tmpmle->new_master); | 1577 | tmpmle->new_master); |
@@ -1494,6 +1588,10 @@ way_up_top: | |||
1494 | * go back and clean the mles on any | 1588 | * go back and clean the mles on any |
1495 | * other nodes */ | 1589 | * other nodes */ |
1496 | dispatch_assert = 1; | 1590 | dispatch_assert = 1; |
1591 | dlm_lockres_set_refmap_bit(request->node_idx, res); | ||
1592 | mlog(0, "%s:%.*s: setting bit %u in refmap\n", | ||
1593 | dlm->name, namelen, name, | ||
1594 | request->node_idx); | ||
1497 | } else | 1595 | } else |
1498 | response = DLM_MASTER_RESP_NO; | 1596 | response = DLM_MASTER_RESP_NO; |
1499 | } else { | 1597 | } else { |
@@ -1607,17 +1705,24 @@ send_response: | |||
1607 | * can periodically run all locks owned by this node | 1705 | * can periodically run all locks owned by this node |
1608 | * and re-assert across the cluster... | 1706 | * and re-assert across the cluster... |
1609 | */ | 1707 | */ |
1610 | static int dlm_do_assert_master(struct dlm_ctxt *dlm, const char *lockname, | 1708 | int dlm_do_assert_master(struct dlm_ctxt *dlm, |
1611 | unsigned int namelen, void *nodemap, | 1709 | struct dlm_lock_resource *res, |
1612 | u32 flags) | 1710 | void *nodemap, u32 flags) |
1613 | { | 1711 | { |
1614 | struct dlm_assert_master assert; | 1712 | struct dlm_assert_master assert; |
1615 | int to, tmpret; | 1713 | int to, tmpret; |
1616 | struct dlm_node_iter iter; | 1714 | struct dlm_node_iter iter; |
1617 | int ret = 0; | 1715 | int ret = 0; |
1618 | int reassert; | 1716 | int reassert; |
1717 | const char *lockname = res->lockname.name; | ||
1718 | unsigned int namelen = res->lockname.len; | ||
1619 | 1719 | ||
1620 | BUG_ON(namelen > O2NM_MAX_NAME_LEN); | 1720 | BUG_ON(namelen > O2NM_MAX_NAME_LEN); |
1721 | |||
1722 | spin_lock(&res->spinlock); | ||
1723 | res->state |= DLM_LOCK_RES_SETREF_INPROG; | ||
1724 | spin_unlock(&res->spinlock); | ||
1725 | |||
1621 | again: | 1726 | again: |
1622 | reassert = 0; | 1727 | reassert = 0; |
1623 | 1728 | ||
@@ -1647,6 +1752,7 @@ again: | |||
1647 | mlog(0, "link to %d went down!\n", to); | 1752 | mlog(0, "link to %d went down!\n", to); |
1648 | /* any nonzero status return will do */ | 1753 | /* any nonzero status return will do */ |
1649 | ret = tmpret; | 1754 | ret = tmpret; |
1755 | r = 0; | ||
1650 | } else if (r < 0) { | 1756 | } else if (r < 0) { |
1651 | /* ok, something horribly messed. kill thyself. */ | 1757 | /* ok, something horribly messed. kill thyself. */ |
1652 | mlog(ML_ERROR,"during assert master of %.*s to %u, " | 1758 | mlog(ML_ERROR,"during assert master of %.*s to %u, " |
@@ -1661,17 +1767,39 @@ again: | |||
1661 | spin_unlock(&dlm->master_lock); | 1767 | spin_unlock(&dlm->master_lock); |
1662 | spin_unlock(&dlm->spinlock); | 1768 | spin_unlock(&dlm->spinlock); |
1663 | BUG(); | 1769 | BUG(); |
1664 | } else if (r == EAGAIN) { | 1770 | } |
1771 | |||
1772 | if (r & DLM_ASSERT_RESPONSE_REASSERT && | ||
1773 | !(r & DLM_ASSERT_RESPONSE_MASTERY_REF)) { | ||
1774 | mlog(ML_ERROR, "%.*s: very strange, " | ||
1775 | "master MLE but no lockres on %u\n", | ||
1776 | namelen, lockname, to); | ||
1777 | } | ||
1778 | |||
1779 | if (r & DLM_ASSERT_RESPONSE_REASSERT) { | ||
1665 | mlog(0, "%.*s: node %u create mles on other " | 1780 | mlog(0, "%.*s: node %u create mles on other " |
1666 | "nodes and requests a re-assert\n", | 1781 | "nodes and requests a re-assert\n", |
1667 | namelen, lockname, to); | 1782 | namelen, lockname, to); |
1668 | reassert = 1; | 1783 | reassert = 1; |
1669 | } | 1784 | } |
1785 | if (r & DLM_ASSERT_RESPONSE_MASTERY_REF) { | ||
1786 | mlog(0, "%.*s: node %u has a reference to this " | ||
1787 | "lockres, set the bit in the refmap\n", | ||
1788 | namelen, lockname, to); | ||
1789 | spin_lock(&res->spinlock); | ||
1790 | dlm_lockres_set_refmap_bit(to, res); | ||
1791 | spin_unlock(&res->spinlock); | ||
1792 | } | ||
1670 | } | 1793 | } |
1671 | 1794 | ||
1672 | if (reassert) | 1795 | if (reassert) |
1673 | goto again; | 1796 | goto again; |
1674 | 1797 | ||
1798 | spin_lock(&res->spinlock); | ||
1799 | res->state &= ~DLM_LOCK_RES_SETREF_INPROG; | ||
1800 | spin_unlock(&res->spinlock); | ||
1801 | wake_up(&res->wq); | ||
1802 | |||
1675 | return ret; | 1803 | return ret; |
1676 | } | 1804 | } |
1677 | 1805 | ||
@@ -1684,7 +1812,8 @@ again: | |||
1684 | * | 1812 | * |
1685 | * if possible, TRIM THIS DOWN!!! | 1813 | * if possible, TRIM THIS DOWN!!! |
1686 | */ | 1814 | */ |
1687 | int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data) | 1815 | int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data, |
1816 | void **ret_data) | ||
1688 | { | 1817 | { |
1689 | struct dlm_ctxt *dlm = data; | 1818 | struct dlm_ctxt *dlm = data; |
1690 | struct dlm_master_list_entry *mle = NULL; | 1819 | struct dlm_master_list_entry *mle = NULL; |
@@ -1693,7 +1822,7 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data) | |||
1693 | char *name; | 1822 | char *name; |
1694 | unsigned int namelen, hash; | 1823 | unsigned int namelen, hash; |
1695 | u32 flags; | 1824 | u32 flags; |
1696 | int master_request = 0; | 1825 | int master_request = 0, have_lockres_ref = 0; |
1697 | int ret = 0; | 1826 | int ret = 0; |
1698 | 1827 | ||
1699 | if (!dlm_grab(dlm)) | 1828 | if (!dlm_grab(dlm)) |
@@ -1851,6 +1980,7 @@ ok: | |||
1851 | spin_unlock(&mle->spinlock); | 1980 | spin_unlock(&mle->spinlock); |
1852 | 1981 | ||
1853 | if (res) { | 1982 | if (res) { |
1983 | int wake = 0; | ||
1854 | spin_lock(&res->spinlock); | 1984 | spin_lock(&res->spinlock); |
1855 | if (mle->type == DLM_MLE_MIGRATION) { | 1985 | if (mle->type == DLM_MLE_MIGRATION) { |
1856 | mlog(0, "finishing off migration of lockres %.*s, " | 1986 | mlog(0, "finishing off migration of lockres %.*s, " |
@@ -1858,12 +1988,16 @@ ok: | |||
1858 | res->lockname.len, res->lockname.name, | 1988 | res->lockname.len, res->lockname.name, |
1859 | dlm->node_num, mle->new_master); | 1989 | dlm->node_num, mle->new_master); |
1860 | res->state &= ~DLM_LOCK_RES_MIGRATING; | 1990 | res->state &= ~DLM_LOCK_RES_MIGRATING; |
1991 | wake = 1; | ||
1861 | dlm_change_lockres_owner(dlm, res, mle->new_master); | 1992 | dlm_change_lockres_owner(dlm, res, mle->new_master); |
1862 | BUG_ON(res->state & DLM_LOCK_RES_DIRTY); | 1993 | BUG_ON(res->state & DLM_LOCK_RES_DIRTY); |
1863 | } else { | 1994 | } else { |
1864 | dlm_change_lockres_owner(dlm, res, mle->master); | 1995 | dlm_change_lockres_owner(dlm, res, mle->master); |
1865 | } | 1996 | } |
1866 | spin_unlock(&res->spinlock); | 1997 | spin_unlock(&res->spinlock); |
1998 | have_lockres_ref = 1; | ||
1999 | if (wake) | ||
2000 | wake_up(&res->wq); | ||
1867 | } | 2001 | } |
1868 | 2002 | ||
1869 | /* master is known, detach if not already detached. | 2003 | /* master is known, detach if not already detached. |
@@ -1913,12 +2047,28 @@ ok: | |||
1913 | 2047 | ||
1914 | done: | 2048 | done: |
1915 | ret = 0; | 2049 | ret = 0; |
1916 | if (res) | 2050 | if (res) { |
1917 | dlm_lockres_put(res); | 2051 | spin_lock(&res->spinlock); |
2052 | res->state |= DLM_LOCK_RES_SETREF_INPROG; | ||
2053 | spin_unlock(&res->spinlock); | ||
2054 | *ret_data = (void *)res; | ||
2055 | } | ||
1918 | dlm_put(dlm); | 2056 | dlm_put(dlm); |
1919 | if (master_request) { | 2057 | if (master_request) { |
1920 | mlog(0, "need to tell master to reassert\n"); | 2058 | mlog(0, "need to tell master to reassert\n"); |
1921 | ret = EAGAIN; // positive. negative would shoot down the node. | 2059 | /* positive. negative would shoot down the node. */ |
2060 | ret |= DLM_ASSERT_RESPONSE_REASSERT; | ||
2061 | if (!have_lockres_ref) { | ||
2062 | mlog(ML_ERROR, "strange, got assert from %u, MASTER " | ||
2063 | "mle present here for %s:%.*s, but no lockres!\n", | ||
2064 | assert->node_idx, dlm->name, namelen, name); | ||
2065 | } | ||
2066 | } | ||
2067 | if (have_lockres_ref) { | ||
2068 | /* let the master know we have a reference to the lockres */ | ||
2069 | ret |= DLM_ASSERT_RESPONSE_MASTERY_REF; | ||
2070 | mlog(0, "%s:%.*s: got assert from %u, need a ref\n", | ||
2071 | dlm->name, namelen, name, assert->node_idx); | ||
1922 | } | 2072 | } |
1923 | return ret; | 2073 | return ret; |
1924 | 2074 | ||
@@ -1929,11 +2079,25 @@ kill: | |||
1929 | __dlm_print_one_lock_resource(res); | 2079 | __dlm_print_one_lock_resource(res); |
1930 | spin_unlock(&res->spinlock); | 2080 | spin_unlock(&res->spinlock); |
1931 | spin_unlock(&dlm->spinlock); | 2081 | spin_unlock(&dlm->spinlock); |
1932 | dlm_lockres_put(res); | 2082 | *ret_data = (void *)res; |
1933 | dlm_put(dlm); | 2083 | dlm_put(dlm); |
1934 | return -EINVAL; | 2084 | return -EINVAL; |
1935 | } | 2085 | } |
1936 | 2086 | ||
2087 | void dlm_assert_master_post_handler(int status, void *data, void *ret_data) | ||
2088 | { | ||
2089 | struct dlm_lock_resource *res = (struct dlm_lock_resource *)ret_data; | ||
2090 | |||
2091 | if (ret_data) { | ||
2092 | spin_lock(&res->spinlock); | ||
2093 | res->state &= ~DLM_LOCK_RES_SETREF_INPROG; | ||
2094 | spin_unlock(&res->spinlock); | ||
2095 | wake_up(&res->wq); | ||
2096 | dlm_lockres_put(res); | ||
2097 | } | ||
2098 | return; | ||
2099 | } | ||
2100 | |||
1937 | int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, | 2101 | int dlm_dispatch_assert_master(struct dlm_ctxt *dlm, |
1938 | struct dlm_lock_resource *res, | 2102 | struct dlm_lock_resource *res, |
1939 | int ignore_higher, u8 request_from, u32 flags) | 2103 | int ignore_higher, u8 request_from, u32 flags) |
@@ -2023,9 +2187,7 @@ static void dlm_assert_master_worker(struct dlm_work_item *item, void *data) | |||
2023 | * even if one or more nodes die */ | 2187 | * even if one or more nodes die */ |
2024 | mlog(0, "worker about to master %.*s here, this=%u\n", | 2188 | mlog(0, "worker about to master %.*s here, this=%u\n", |
2025 | res->lockname.len, res->lockname.name, dlm->node_num); | 2189 | res->lockname.len, res->lockname.name, dlm->node_num); |
2026 | ret = dlm_do_assert_master(dlm, res->lockname.name, | 2190 | ret = dlm_do_assert_master(dlm, res, nodemap, flags); |
2027 | res->lockname.len, | ||
2028 | nodemap, flags); | ||
2029 | if (ret < 0) { | 2191 | if (ret < 0) { |
2030 | /* no need to restart, we are done */ | 2192 | /* no need to restart, we are done */ |
2031 | if (!dlm_is_host_down(ret)) | 2193 | if (!dlm_is_host_down(ret)) |
@@ -2097,14 +2259,180 @@ static int dlm_pre_master_reco_lockres(struct dlm_ctxt *dlm, | |||
2097 | return ret; | 2259 | return ret; |
2098 | } | 2260 | } |
2099 | 2261 | ||
2262 | /* | ||
2263 | * DLM_DEREF_LOCKRES_MSG | ||
2264 | */ | ||
2265 | |||
2266 | int dlm_drop_lockres_ref(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) | ||
2267 | { | ||
2268 | struct dlm_deref_lockres deref; | ||
2269 | int ret = 0, r; | ||
2270 | const char *lockname; | ||
2271 | unsigned int namelen; | ||
2272 | |||
2273 | lockname = res->lockname.name; | ||
2274 | namelen = res->lockname.len; | ||
2275 | BUG_ON(namelen > O2NM_MAX_NAME_LEN); | ||
2276 | |||
2277 | mlog(0, "%s:%.*s: sending deref to %d\n", | ||
2278 | dlm->name, namelen, lockname, res->owner); | ||
2279 | memset(&deref, 0, sizeof(deref)); | ||
2280 | deref.node_idx = dlm->node_num; | ||
2281 | deref.namelen = namelen; | ||
2282 | memcpy(deref.name, lockname, namelen); | ||
2283 | |||
2284 | ret = o2net_send_message(DLM_DEREF_LOCKRES_MSG, dlm->key, | ||
2285 | &deref, sizeof(deref), res->owner, &r); | ||
2286 | if (ret < 0) | ||
2287 | mlog_errno(ret); | ||
2288 | else if (r < 0) { | ||
2289 | /* BAD. other node says I did not have a ref. */ | ||
2290 | mlog(ML_ERROR,"while dropping ref on %s:%.*s " | ||
2291 | "(master=%u) got %d.\n", dlm->name, namelen, | ||
2292 | lockname, res->owner, r); | ||
2293 | dlm_print_one_lock_resource(res); | ||
2294 | BUG(); | ||
2295 | } | ||
2296 | return ret; | ||
2297 | } | ||
2298 | |||
2299 | int dlm_deref_lockres_handler(struct o2net_msg *msg, u32 len, void *data, | ||
2300 | void **ret_data) | ||
2301 | { | ||
2302 | struct dlm_ctxt *dlm = data; | ||
2303 | struct dlm_deref_lockres *deref = (struct dlm_deref_lockres *)msg->buf; | ||
2304 | struct dlm_lock_resource *res = NULL; | ||
2305 | char *name; | ||
2306 | unsigned int namelen; | ||
2307 | int ret = -EINVAL; | ||
2308 | u8 node; | ||
2309 | unsigned int hash; | ||
2310 | struct dlm_work_item *item; | ||
2311 | int cleared = 0; | ||
2312 | int dispatch = 0; | ||
2313 | |||
2314 | if (!dlm_grab(dlm)) | ||
2315 | return 0; | ||
2316 | |||
2317 | name = deref->name; | ||
2318 | namelen = deref->namelen; | ||
2319 | node = deref->node_idx; | ||
2320 | |||
2321 | if (namelen > DLM_LOCKID_NAME_MAX) { | ||
2322 | mlog(ML_ERROR, "Invalid name length!"); | ||
2323 | goto done; | ||
2324 | } | ||
2325 | if (deref->node_idx >= O2NM_MAX_NODES) { | ||
2326 | mlog(ML_ERROR, "Invalid node number: %u\n", node); | ||
2327 | goto done; | ||
2328 | } | ||
2329 | |||
2330 | hash = dlm_lockid_hash(name, namelen); | ||
2331 | |||
2332 | spin_lock(&dlm->spinlock); | ||
2333 | res = __dlm_lookup_lockres_full(dlm, name, namelen, hash); | ||
2334 | if (!res) { | ||
2335 | spin_unlock(&dlm->spinlock); | ||
2336 | mlog(ML_ERROR, "%s:%.*s: bad lockres name\n", | ||
2337 | dlm->name, namelen, name); | ||
2338 | goto done; | ||
2339 | } | ||
2340 | spin_unlock(&dlm->spinlock); | ||
2341 | |||
2342 | spin_lock(&res->spinlock); | ||
2343 | if (res->state & DLM_LOCK_RES_SETREF_INPROG) | ||
2344 | dispatch = 1; | ||
2345 | else { | ||
2346 | BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); | ||
2347 | if (test_bit(node, res->refmap)) { | ||
2348 | dlm_lockres_clear_refmap_bit(node, res); | ||
2349 | cleared = 1; | ||
2350 | } | ||
2351 | } | ||
2352 | spin_unlock(&res->spinlock); | ||
2353 | |||
2354 | if (!dispatch) { | ||
2355 | if (cleared) | ||
2356 | dlm_lockres_calc_usage(dlm, res); | ||
2357 | else { | ||
2358 | mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref " | ||
2359 | "but it is already dropped!\n", dlm->name, | ||
2360 | res->lockname.len, res->lockname.name, node); | ||
2361 | __dlm_print_one_lock_resource(res); | ||
2362 | } | ||
2363 | ret = 0; | ||
2364 | goto done; | ||
2365 | } | ||
2366 | |||
2367 | item = kzalloc(sizeof(*item), GFP_NOFS); | ||
2368 | if (!item) { | ||
2369 | ret = -ENOMEM; | ||
2370 | mlog_errno(ret); | ||
2371 | goto done; | ||
2372 | } | ||
2373 | |||
2374 | dlm_init_work_item(dlm, item, dlm_deref_lockres_worker, NULL); | ||
2375 | item->u.dl.deref_res = res; | ||
2376 | item->u.dl.deref_node = node; | ||
2377 | |||
2378 | spin_lock(&dlm->work_lock); | ||
2379 | list_add_tail(&item->list, &dlm->work_list); | ||
2380 | spin_unlock(&dlm->work_lock); | ||
2381 | |||
2382 | queue_work(dlm->dlm_worker, &dlm->dispatched_work); | ||
2383 | return 0; | ||
2384 | |||
2385 | done: | ||
2386 | if (res) | ||
2387 | dlm_lockres_put(res); | ||
2388 | dlm_put(dlm); | ||
2389 | |||
2390 | return ret; | ||
2391 | } | ||
2392 | |||
2393 | static void dlm_deref_lockres_worker(struct dlm_work_item *item, void *data) | ||
2394 | { | ||
2395 | struct dlm_ctxt *dlm; | ||
2396 | struct dlm_lock_resource *res; | ||
2397 | u8 node; | ||
2398 | u8 cleared = 0; | ||
2399 | |||
2400 | dlm = item->dlm; | ||
2401 | res = item->u.dl.deref_res; | ||
2402 | node = item->u.dl.deref_node; | ||
2403 | |||
2404 | spin_lock(&res->spinlock); | ||
2405 | BUG_ON(res->state & DLM_LOCK_RES_DROPPING_REF); | ||
2406 | if (test_bit(node, res->refmap)) { | ||
2407 | __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); | ||
2408 | dlm_lockres_clear_refmap_bit(node, res); | ||
2409 | cleared = 1; | ||
2410 | } | ||
2411 | spin_unlock(&res->spinlock); | ||
2412 | |||
2413 | if (cleared) { | ||
2414 | mlog(0, "%s:%.*s node %u ref dropped in dispatch\n", | ||
2415 | dlm->name, res->lockname.len, res->lockname.name, node); | ||
2416 | dlm_lockres_calc_usage(dlm, res); | ||
2417 | } else { | ||
2418 | mlog(ML_ERROR, "%s:%.*s: node %u trying to drop ref " | ||
2419 | "but it is already dropped!\n", dlm->name, | ||
2420 | res->lockname.len, res->lockname.name, node); | ||
2421 | __dlm_print_one_lock_resource(res); | ||
2422 | } | ||
2423 | |||
2424 | dlm_lockres_put(res); | ||
2425 | } | ||
2426 | |||
2100 | 2427 | ||
2101 | /* | 2428 | /* |
2102 | * DLM_MIGRATE_LOCKRES | 2429 | * DLM_MIGRATE_LOCKRES |
2103 | */ | 2430 | */ |
2104 | 2431 | ||
2105 | 2432 | ||
2106 | int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, | 2433 | static int dlm_migrate_lockres(struct dlm_ctxt *dlm, |
2107 | u8 target) | 2434 | struct dlm_lock_resource *res, |
2435 | u8 target) | ||
2108 | { | 2436 | { |
2109 | struct dlm_master_list_entry *mle = NULL; | 2437 | struct dlm_master_list_entry *mle = NULL; |
2110 | struct dlm_master_list_entry *oldmle = NULL; | 2438 | struct dlm_master_list_entry *oldmle = NULL; |
@@ -2116,7 +2444,7 @@ int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, | |||
2116 | struct list_head *queue, *iter; | 2444 | struct list_head *queue, *iter; |
2117 | int i; | 2445 | int i; |
2118 | struct dlm_lock *lock; | 2446 | struct dlm_lock *lock; |
2119 | int empty = 1; | 2447 | int empty = 1, wake = 0; |
2120 | 2448 | ||
2121 | if (!dlm_grab(dlm)) | 2449 | if (!dlm_grab(dlm)) |
2122 | return -EINVAL; | 2450 | return -EINVAL; |
@@ -2241,6 +2569,7 @@ int dlm_migrate_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, | |||
2241 | res->lockname.name, target); | 2569 | res->lockname.name, target); |
2242 | spin_lock(&res->spinlock); | 2570 | spin_lock(&res->spinlock); |
2243 | res->state &= ~DLM_LOCK_RES_MIGRATING; | 2571 | res->state &= ~DLM_LOCK_RES_MIGRATING; |
2572 | wake = 1; | ||
2244 | spin_unlock(&res->spinlock); | 2573 | spin_unlock(&res->spinlock); |
2245 | ret = -EINVAL; | 2574 | ret = -EINVAL; |
2246 | } | 2575 | } |
@@ -2268,6 +2597,9 @@ fail: | |||
2268 | * the lockres | 2597 | * the lockres |
2269 | */ | 2598 | */ |
2270 | 2599 | ||
2600 | /* now that remote nodes are spinning on the MIGRATING flag, | ||
2601 | * ensure that all assert_master work is flushed. */ | ||
2602 | flush_workqueue(dlm->dlm_worker); | ||
2271 | 2603 | ||
2272 | /* get an extra reference on the mle. | 2604 | /* get an extra reference on the mle. |
2273 | * otherwise the assert_master from the new | 2605 | * otherwise the assert_master from the new |
@@ -2296,6 +2628,7 @@ fail: | |||
2296 | dlm_put_mle_inuse(mle); | 2628 | dlm_put_mle_inuse(mle); |
2297 | spin_lock(&res->spinlock); | 2629 | spin_lock(&res->spinlock); |
2298 | res->state &= ~DLM_LOCK_RES_MIGRATING; | 2630 | res->state &= ~DLM_LOCK_RES_MIGRATING; |
2631 | wake = 1; | ||
2299 | spin_unlock(&res->spinlock); | 2632 | spin_unlock(&res->spinlock); |
2300 | goto leave; | 2633 | goto leave; |
2301 | } | 2634 | } |
@@ -2322,7 +2655,8 @@ fail: | |||
2322 | res->owner == target) | 2655 | res->owner == target) |
2323 | break; | 2656 | break; |
2324 | 2657 | ||
2325 | mlog(0, "timed out during migration\n"); | 2658 | mlog(0, "%s:%.*s: timed out during migration\n", |
2659 | dlm->name, res->lockname.len, res->lockname.name); | ||
2326 | /* avoid hang during shutdown when migrating lockres | 2660 | /* avoid hang during shutdown when migrating lockres |
2327 | * to a node which also goes down */ | 2661 | * to a node which also goes down */ |
2328 | if (dlm_is_node_dead(dlm, target)) { | 2662 | if (dlm_is_node_dead(dlm, target)) { |
@@ -2330,20 +2664,20 @@ fail: | |||
2330 | "target %u is no longer up, restarting\n", | 2664 | "target %u is no longer up, restarting\n", |
2331 | dlm->name, res->lockname.len, | 2665 | dlm->name, res->lockname.len, |
2332 | res->lockname.name, target); | 2666 | res->lockname.name, target); |
2333 | ret = -ERESTARTSYS; | 2667 | ret = -EINVAL; |
2668 | /* migration failed, detach and clean up mle */ | ||
2669 | dlm_mle_detach_hb_events(dlm, mle); | ||
2670 | dlm_put_mle(mle); | ||
2671 | dlm_put_mle_inuse(mle); | ||
2672 | spin_lock(&res->spinlock); | ||
2673 | res->state &= ~DLM_LOCK_RES_MIGRATING; | ||
2674 | wake = 1; | ||
2675 | spin_unlock(&res->spinlock); | ||
2676 | goto leave; | ||
2334 | } | 2677 | } |
2335 | } | 2678 | } else |
2336 | if (ret == -ERESTARTSYS) { | 2679 | mlog(0, "%s:%.*s: caught signal during migration\n", |
2337 | /* migration failed, detach and clean up mle */ | 2680 | dlm->name, res->lockname.len, res->lockname.name); |
2338 | dlm_mle_detach_hb_events(dlm, mle); | ||
2339 | dlm_put_mle(mle); | ||
2340 | dlm_put_mle_inuse(mle); | ||
2341 | spin_lock(&res->spinlock); | ||
2342 | res->state &= ~DLM_LOCK_RES_MIGRATING; | ||
2343 | spin_unlock(&res->spinlock); | ||
2344 | goto leave; | ||
2345 | } | ||
2346 | /* TODO: if node died: stop, clean up, return error */ | ||
2347 | } | 2681 | } |
2348 | 2682 | ||
2349 | /* all done, set the owner, clear the flag */ | 2683 | /* all done, set the owner, clear the flag */ |
@@ -2366,6 +2700,11 @@ leave: | |||
2366 | if (ret < 0) | 2700 | if (ret < 0) |
2367 | dlm_kick_thread(dlm, res); | 2701 | dlm_kick_thread(dlm, res); |
2368 | 2702 | ||
2703 | /* wake up waiters if the MIGRATING flag got set | ||
2704 | * but migration failed */ | ||
2705 | if (wake) | ||
2706 | wake_up(&res->wq); | ||
2707 | |||
2369 | /* TODO: cleanup */ | 2708 | /* TODO: cleanup */ |
2370 | if (mres) | 2709 | if (mres) |
2371 | free_page((unsigned long)mres); | 2710 | free_page((unsigned long)mres); |
@@ -2376,6 +2715,53 @@ leave: | |||
2376 | return ret; | 2715 | return ret; |
2377 | } | 2716 | } |
2378 | 2717 | ||
2718 | #define DLM_MIGRATION_RETRY_MS 100 | ||
2719 | |||
2720 | /* Should be called only after beginning the domain leave process. | ||
2721 | * There should not be any remaining locks on nonlocal lock resources, | ||
2722 | * and there should be no local locks left on locally mastered resources. | ||
2723 | * | ||
2724 | * Called with the dlm spinlock held, may drop it to do migration, but | ||
2725 | * will re-acquire before exit. | ||
2726 | * | ||
2727 | * Returns: 1 if dlm->spinlock was dropped/retaken, 0 if never dropped */ | ||
2728 | int dlm_empty_lockres(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) | ||
2729 | { | ||
2730 | int ret; | ||
2731 | int lock_dropped = 0; | ||
2732 | |||
2733 | if (res->owner != dlm->node_num) { | ||
2734 | if (!__dlm_lockres_unused(res)) { | ||
2735 | mlog(ML_ERROR, "%s:%.*s: this node is not master, " | ||
2736 | "trying to free this but locks remain\n", | ||
2737 | dlm->name, res->lockname.len, res->lockname.name); | ||
2738 | } | ||
2739 | goto leave; | ||
2740 | } | ||
2741 | |||
2742 | /* Wheee! Migrate lockres here! Will sleep so drop spinlock. */ | ||
2743 | spin_unlock(&dlm->spinlock); | ||
2744 | lock_dropped = 1; | ||
2745 | while (1) { | ||
2746 | ret = dlm_migrate_lockres(dlm, res, O2NM_MAX_NODES); | ||
2747 | if (ret >= 0) | ||
2748 | break; | ||
2749 | if (ret == -ENOTEMPTY) { | ||
2750 | mlog(ML_ERROR, "lockres %.*s still has local locks!\n", | ||
2751 | res->lockname.len, res->lockname.name); | ||
2752 | BUG(); | ||
2753 | } | ||
2754 | |||
2755 | mlog(0, "lockres %.*s: migrate failed, " | ||
2756 | "retrying\n", res->lockname.len, | ||
2757 | res->lockname.name); | ||
2758 | msleep(DLM_MIGRATION_RETRY_MS); | ||
2759 | } | ||
2760 | spin_lock(&dlm->spinlock); | ||
2761 | leave: | ||
2762 | return lock_dropped; | ||
2763 | } | ||
2764 | |||
2379 | int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock) | 2765 | int dlm_lock_basts_flushed(struct dlm_ctxt *dlm, struct dlm_lock *lock) |
2380 | { | 2766 | { |
2381 | int ret; | 2767 | int ret; |
@@ -2405,7 +2791,8 @@ static int dlm_migration_can_proceed(struct dlm_ctxt *dlm, | |||
2405 | return can_proceed; | 2791 | return can_proceed; |
2406 | } | 2792 | } |
2407 | 2793 | ||
2408 | int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, struct dlm_lock_resource *res) | 2794 | static int dlm_lockres_is_dirty(struct dlm_ctxt *dlm, |
2795 | struct dlm_lock_resource *res) | ||
2409 | { | 2796 | { |
2410 | int ret; | 2797 | int ret; |
2411 | spin_lock(&res->spinlock); | 2798 | spin_lock(&res->spinlock); |
@@ -2434,8 +2821,15 @@ static int dlm_mark_lockres_migrating(struct dlm_ctxt *dlm, | |||
2434 | __dlm_lockres_reserve_ast(res); | 2821 | __dlm_lockres_reserve_ast(res); |
2435 | spin_unlock(&res->spinlock); | 2822 | spin_unlock(&res->spinlock); |
2436 | 2823 | ||
2437 | /* now flush all the pending asts.. hang out for a bit */ | 2824 | /* now flush all the pending asts */ |
2438 | dlm_kick_thread(dlm, res); | 2825 | dlm_kick_thread(dlm, res); |
2826 | /* before waiting on DIRTY, block processes which may | ||
2827 | * try to dirty the lockres before MIGRATING is set */ | ||
2828 | spin_lock(&res->spinlock); | ||
2829 | BUG_ON(res->state & DLM_LOCK_RES_BLOCK_DIRTY); | ||
2830 | res->state |= DLM_LOCK_RES_BLOCK_DIRTY; | ||
2831 | spin_unlock(&res->spinlock); | ||
2832 | /* now wait on any pending asts and the DIRTY state */ | ||
2439 | wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res)); | 2833 | wait_event(dlm->ast_wq, !dlm_lockres_is_dirty(dlm, res)); |
2440 | dlm_lockres_release_ast(dlm, res); | 2834 | dlm_lockres_release_ast(dlm, res); |
2441 | 2835 | ||
@@ -2461,6 +2855,13 @@ again: | |||
2461 | mlog(0, "trying again...\n"); | 2855 | mlog(0, "trying again...\n"); |
2462 | goto again; | 2856 | goto again; |
2463 | } | 2857 | } |
2858 | /* now that we are sure the MIGRATING state is there, drop | ||
2859 | * the unneeded state which blocked threads trying to DIRTY */ | ||
2860 | spin_lock(&res->spinlock); | ||
2861 | BUG_ON(!(res->state & DLM_LOCK_RES_BLOCK_DIRTY)); | ||
2862 | BUG_ON(!(res->state & DLM_LOCK_RES_MIGRATING)); | ||
2863 | res->state &= ~DLM_LOCK_RES_BLOCK_DIRTY; | ||
2864 | spin_unlock(&res->spinlock); | ||
2464 | 2865 | ||
2465 | /* did the target go down or die? */ | 2866 | /* did the target go down or die? */ |
2466 | spin_lock(&dlm->spinlock); | 2867 | spin_lock(&dlm->spinlock); |
@@ -2490,7 +2891,7 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, | |||
2490 | { | 2891 | { |
2491 | struct list_head *iter, *iter2; | 2892 | struct list_head *iter, *iter2; |
2492 | struct list_head *queue = &res->granted; | 2893 | struct list_head *queue = &res->granted; |
2493 | int i; | 2894 | int i, bit; |
2494 | struct dlm_lock *lock; | 2895 | struct dlm_lock *lock; |
2495 | 2896 | ||
2496 | assert_spin_locked(&res->spinlock); | 2897 | assert_spin_locked(&res->spinlock); |
@@ -2508,12 +2909,28 @@ static void dlm_remove_nonlocal_locks(struct dlm_ctxt *dlm, | |||
2508 | BUG_ON(!list_empty(&lock->bast_list)); | 2909 | BUG_ON(!list_empty(&lock->bast_list)); |
2509 | BUG_ON(lock->ast_pending); | 2910 | BUG_ON(lock->ast_pending); |
2510 | BUG_ON(lock->bast_pending); | 2911 | BUG_ON(lock->bast_pending); |
2912 | dlm_lockres_clear_refmap_bit(lock->ml.node, res); | ||
2511 | list_del_init(&lock->list); | 2913 | list_del_init(&lock->list); |
2512 | dlm_lock_put(lock); | 2914 | dlm_lock_put(lock); |
2513 | } | 2915 | } |
2514 | } | 2916 | } |
2515 | queue++; | 2917 | queue++; |
2516 | } | 2918 | } |
2919 | bit = 0; | ||
2920 | while (1) { | ||
2921 | bit = find_next_bit(res->refmap, O2NM_MAX_NODES, bit); | ||
2922 | if (bit >= O2NM_MAX_NODES) | ||
2923 | break; | ||
2924 | /* do not clear the local node reference, if there is a | ||
2925 | * process holding this, let it drop the ref itself */ | ||
2926 | if (bit != dlm->node_num) { | ||
2927 | mlog(0, "%s:%.*s: node %u had a ref to this " | ||
2928 | "migrating lockres, clearing\n", dlm->name, | ||
2929 | res->lockname.len, res->lockname.name, bit); | ||
2930 | dlm_lockres_clear_refmap_bit(bit, res); | ||
2931 | } | ||
2932 | bit++; | ||
2933 | } | ||
2517 | } | 2934 | } |
2518 | 2935 | ||
2519 | /* for now this is not too intelligent. we will | 2936 | /* for now this is not too intelligent. we will |
@@ -2601,6 +3018,16 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm, | |||
2601 | mlog(0, "migrate request (node %u) returned %d!\n", | 3018 | mlog(0, "migrate request (node %u) returned %d!\n", |
2602 | nodenum, status); | 3019 | nodenum, status); |
2603 | ret = status; | 3020 | ret = status; |
3021 | } else if (status == DLM_MIGRATE_RESPONSE_MASTERY_REF) { | ||
3022 | /* during the migration request we short-circuited | ||
3023 | * the mastery of the lockres. make sure we have | ||
3024 | * a mastery ref for nodenum */ | ||
3025 | mlog(0, "%s:%.*s: need ref for node %u\n", | ||
3026 | dlm->name, res->lockname.len, res->lockname.name, | ||
3027 | nodenum); | ||
3028 | spin_lock(&res->spinlock); | ||
3029 | dlm_lockres_set_refmap_bit(nodenum, res); | ||
3030 | spin_unlock(&res->spinlock); | ||
2604 | } | 3031 | } |
2605 | } | 3032 | } |
2606 | 3033 | ||
@@ -2619,7 +3046,8 @@ static int dlm_do_migrate_request(struct dlm_ctxt *dlm, | |||
2619 | * we will have no mle in the list to start with. now we can add an mle for | 3046 | * we will have no mle in the list to start with. now we can add an mle for |
2620 | * the migration and this should be the only one found for those scanning the | 3047 | * the migration and this should be the only one found for those scanning the |
2621 | * list. */ | 3048 | * list. */ |
2622 | int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data) | 3049 | int dlm_migrate_request_handler(struct o2net_msg *msg, u32 len, void *data, |
3050 | void **ret_data) | ||
2623 | { | 3051 | { |
2624 | struct dlm_ctxt *dlm = data; | 3052 | struct dlm_ctxt *dlm = data; |
2625 | struct dlm_lock_resource *res = NULL; | 3053 | struct dlm_lock_resource *res = NULL; |
@@ -2745,7 +3173,13 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm, | |||
2745 | /* remove it from the list so that only one | 3173 | /* remove it from the list so that only one |
2746 | * mle will be found */ | 3174 | * mle will be found */ |
2747 | list_del_init(&tmp->list); | 3175 | list_del_init(&tmp->list); |
2748 | __dlm_mle_detach_hb_events(dlm, mle); | 3176 | /* this was obviously WRONG. mle is uninited here. should be tmp. */ |
3177 | __dlm_mle_detach_hb_events(dlm, tmp); | ||
3178 | ret = DLM_MIGRATE_RESPONSE_MASTERY_REF; | ||
3179 | mlog(0, "%s:%.*s: master=%u, newmaster=%u, " | ||
3180 | "telling master to get ref for cleared out mle " | ||
3181 | "during migration\n", dlm->name, namelen, name, | ||
3182 | master, new_master); | ||
2749 | } | 3183 | } |
2750 | spin_unlock(&tmp->spinlock); | 3184 | spin_unlock(&tmp->spinlock); |
2751 | } | 3185 | } |
@@ -2753,6 +3187,8 @@ static int dlm_add_migration_mle(struct dlm_ctxt *dlm, | |||
2753 | /* now add a migration mle to the tail of the list */ | 3187 | /* now add a migration mle to the tail of the list */ |
2754 | dlm_init_mle(mle, DLM_MLE_MIGRATION, dlm, res, name, namelen); | 3188 | dlm_init_mle(mle, DLM_MLE_MIGRATION, dlm, res, name, namelen); |
2755 | mle->new_master = new_master; | 3189 | mle->new_master = new_master; |
3190 | /* the new master will be sending an assert master for this. | ||
3191 | * at that point we will get the refmap reference */ | ||
2756 | mle->master = master; | 3192 | mle->master = master; |
2757 | /* do this for consistency with other mle types */ | 3193 | /* do this for consistency with other mle types */ |
2758 | set_bit(new_master, mle->maybe_map); | 3194 | set_bit(new_master, mle->maybe_map); |
@@ -2902,6 +3338,13 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, | |||
2902 | clear_bit(dlm->node_num, iter.node_map); | 3338 | clear_bit(dlm->node_num, iter.node_map); |
2903 | spin_unlock(&dlm->spinlock); | 3339 | spin_unlock(&dlm->spinlock); |
2904 | 3340 | ||
3341 | /* ownership of the lockres is changing. account for the | ||
3342 | * mastery reference here since old_master will briefly have | ||
3343 | * a reference after the migration completes */ | ||
3344 | spin_lock(&res->spinlock); | ||
3345 | dlm_lockres_set_refmap_bit(old_master, res); | ||
3346 | spin_unlock(&res->spinlock); | ||
3347 | |||
2905 | mlog(0, "now time to do a migrate request to other nodes\n"); | 3348 | mlog(0, "now time to do a migrate request to other nodes\n"); |
2906 | ret = dlm_do_migrate_request(dlm, res, old_master, | 3349 | ret = dlm_do_migrate_request(dlm, res, old_master, |
2907 | dlm->node_num, &iter); | 3350 | dlm->node_num, &iter); |
@@ -2914,8 +3357,7 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, | |||
2914 | res->lockname.len, res->lockname.name); | 3357 | res->lockname.len, res->lockname.name); |
2915 | /* this call now finishes out the nodemap | 3358 | /* this call now finishes out the nodemap |
2916 | * even if one or more nodes die */ | 3359 | * even if one or more nodes die */ |
2917 | ret = dlm_do_assert_master(dlm, res->lockname.name, | 3360 | ret = dlm_do_assert_master(dlm, res, iter.node_map, |
2918 | res->lockname.len, iter.node_map, | ||
2919 | DLM_ASSERT_MASTER_FINISH_MIGRATION); | 3361 | DLM_ASSERT_MASTER_FINISH_MIGRATION); |
2920 | if (ret < 0) { | 3362 | if (ret < 0) { |
2921 | /* no longer need to retry. all living nodes contacted. */ | 3363 | /* no longer need to retry. all living nodes contacted. */ |
@@ -2927,8 +3369,7 @@ int dlm_finish_migration(struct dlm_ctxt *dlm, struct dlm_lock_resource *res, | |||
2927 | set_bit(old_master, iter.node_map); | 3369 | set_bit(old_master, iter.node_map); |
2928 | mlog(0, "doing assert master of %.*s back to %u\n", | 3370 | mlog(0, "doing assert master of %.*s back to %u\n", |
2929 | res->lockname.len, res->lockname.name, old_master); | 3371 | res->lockname.len, res->lockname.name, old_master); |
2930 | ret = dlm_do_assert_master(dlm, res->lockname.name, | 3372 | ret = dlm_do_assert_master(dlm, res, iter.node_map, |
2931 | res->lockname.len, iter.node_map, | ||
2932 | DLM_ASSERT_MASTER_FINISH_MIGRATION); | 3373 | DLM_ASSERT_MASTER_FINISH_MIGRATION); |
2933 | if (ret < 0) { | 3374 | if (ret < 0) { |
2934 | mlog(0, "assert master to original master failed " | 3375 | mlog(0, "assert master to original master failed " |