aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorJiufei Xue <xuejiufei@huawei.com>2016-03-15 17:53:20 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2016-03-15 19:55:16 -0400
commit814ce69432bffdd0533fda28deea5dcfba153d17 (patch)
tree9cc76feac9406d03644c030354566a6ef4970276 /fs
parentd277f33eda000ca03b1497fcf1c9e2ec33adf4c6 (diff)
ocfs2: fix a tiny race that leads to a read-only file system
When o2hb detects that a node is down, it first sets the dead node in the recovery map and creates ocfs2rec, which will replay the journal for the dead node. The o2hb thread then calls dlm_do_local_recovery_cleanup() to delete the locks of the dead node. Once the locks of the dead node are gone, locks for other nodes can be granted, and those nodes may modify the metadata without the journal of the dead node having been replayed.

The details are as follows, for three nodes N1, N2 and N3 (N3 is the master of the lock resource):

1. N1: modifies the extent tree of an inode, commits the dirty metadata to its journal, and then goes down.
2. N3 (master): the o2hb thread detects that N1 has gone down, sets the recovery map and deletes the lock of N1.
3. N3 (master): dlm_thread flushes the AST for the lock of N2.
4. N2: has not yet detected the death of N1, so its recovery map is empty.
5. N2: reads the inode from disk without the journal of N1 having been replayed, and modifies the extent tree of the inode that N1 had modified.
6. ocfs2rec recovers the journal of N1. The modification made by N2 is lost.

The modifications of N1 and N2 are not serialized, and this leads to a read-only file system.

We can set the recovery_waiting flag (DLM_LOCK_RES_RECOVERY_WAITING) on the lock resource after deleting the lock of the dead node, to prevent other nodes from getting the lock before dlm recovery. After dlm recovery, the recovery map on N2 is not empty, so ocfs2_inode_lock_full_nested() will wait for ocfs2 recovery.

Signed-off-by: Jiufei Xue <xuejiufei@huawei.com>
Reviewed-by: Joseph Qi <joseph.qi@huawei.com>
Cc: Mark Fasheh <mfasheh@suse.de>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'fs')
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h5
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c3
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c8
-rw-r--r--fs/ocfs2/dlm/dlmthread.c6
4 files changed, 18 insertions, 4 deletions
diff --git a/fs/ocfs2/dlm/dlmcommon.h b/fs/ocfs2/dlm/dlmcommon.h
index 3b77862fc85d..004f2cbe8f71 100644
--- a/fs/ocfs2/dlm/dlmcommon.h
+++ b/fs/ocfs2/dlm/dlmcommon.h
@@ -282,6 +282,7 @@ static inline void __dlm_set_joining_node(struct dlm_ctxt *dlm,
282#define DLM_LOCK_RES_DROPPING_REF 0x00000040 282#define DLM_LOCK_RES_DROPPING_REF 0x00000040
283#define DLM_LOCK_RES_BLOCK_DIRTY 0x00001000 283#define DLM_LOCK_RES_BLOCK_DIRTY 0x00001000
284#define DLM_LOCK_RES_SETREF_INPROG 0x00002000 284#define DLM_LOCK_RES_SETREF_INPROG 0x00002000
285#define DLM_LOCK_RES_RECOVERY_WAITING 0x00004000
285 286
286/* max milliseconds to wait to sync up a network failure with a node death */ 287/* max milliseconds to wait to sync up a network failure with a node death */
287#define DLM_NODE_DEATH_WAIT_MAX (5 * 1000) 288#define DLM_NODE_DEATH_WAIT_MAX (5 * 1000)
@@ -804,7 +805,8 @@ __dlm_lockres_state_to_status(struct dlm_lock_resource *res)
804 805
805 assert_spin_locked(&res->spinlock); 806 assert_spin_locked(&res->spinlock);
806 807
807 if (res->state & DLM_LOCK_RES_RECOVERING) 808 if (res->state & (DLM_LOCK_RES_RECOVERING|
809 DLM_LOCK_RES_RECOVERY_WAITING))
808 status = DLM_RECOVERING; 810 status = DLM_RECOVERING;
809 else if (res->state & DLM_LOCK_RES_MIGRATING) 811 else if (res->state & DLM_LOCK_RES_MIGRATING)
810 status = DLM_MIGRATING; 812 status = DLM_MIGRATING;
@@ -1026,6 +1028,7 @@ static inline void __dlm_wait_on_lockres(struct dlm_lock_resource *res)
1026{ 1028{
1027 __dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_IN_PROGRESS| 1029 __dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_IN_PROGRESS|
1028 DLM_LOCK_RES_RECOVERING| 1030 DLM_LOCK_RES_RECOVERING|
1031 DLM_LOCK_RES_RECOVERY_WAITING|
1029 DLM_LOCK_RES_MIGRATING)); 1032 DLM_LOCK_RES_MIGRATING));
1030} 1033}
1031 1034
diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c
index 87e22541850e..9aed6e202201 100644
--- a/fs/ocfs2/dlm/dlmmaster.c
+++ b/fs/ocfs2/dlm/dlmmaster.c
@@ -2550,7 +2550,8 @@ static int dlm_is_lockres_migrateable(struct dlm_ctxt *dlm,
2550 return 0; 2550 return 0;
2551 2551
2552 /* delay migration when the lockres is in RECOCERING state */ 2552 /* delay migration when the lockres is in RECOCERING state */
2553 if (res->state & DLM_LOCK_RES_RECOVERING) 2553 if (res->state & (DLM_LOCK_RES_RECOVERING|
2554 DLM_LOCK_RES_RECOVERY_WAITING))
2554 return 0; 2555 return 0;
2555 2556
2556 if (res->owner != dlm->node_num) 2557 if (res->owner != dlm->node_num)
diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c
index 213279db3f28..cd38488a10fc 100644
--- a/fs/ocfs2/dlm/dlmrecovery.c
+++ b/fs/ocfs2/dlm/dlmrecovery.c
@@ -2175,6 +2175,13 @@ static void dlm_finish_local_lockres_recovery(struct dlm_ctxt *dlm,
2175 for (i = 0; i < DLM_HASH_BUCKETS; i++) { 2175 for (i = 0; i < DLM_HASH_BUCKETS; i++) {
2176 bucket = dlm_lockres_hash(dlm, i); 2176 bucket = dlm_lockres_hash(dlm, i);
2177 hlist_for_each_entry(res, bucket, hash_node) { 2177 hlist_for_each_entry(res, bucket, hash_node) {
2178 if (res->state & DLM_LOCK_RES_RECOVERY_WAITING) {
2179 spin_lock(&res->spinlock);
2180 res->state &= ~DLM_LOCK_RES_RECOVERY_WAITING;
2181 spin_unlock(&res->spinlock);
2182 wake_up(&res->wq);
2183 }
2184
2178 if (!(res->state & DLM_LOCK_RES_RECOVERING)) 2185 if (!(res->state & DLM_LOCK_RES_RECOVERING))
2179 continue; 2186 continue;
2180 2187
@@ -2312,6 +2319,7 @@ static void dlm_free_dead_locks(struct dlm_ctxt *dlm,
2312 res->lockname.len, res->lockname.name, freed, dead_node); 2319 res->lockname.len, res->lockname.name, freed, dead_node);
2313 __dlm_print_one_lock_resource(res); 2320 __dlm_print_one_lock_resource(res);
2314 } 2321 }
2322 res->state |= DLM_LOCK_RES_RECOVERY_WAITING;
2315 dlm_lockres_clear_refmap_bit(dlm, res, dead_node); 2323 dlm_lockres_clear_refmap_bit(dlm, res, dead_node);
2316 } else if (test_bit(dead_node, res->refmap)) { 2324 } else if (test_bit(dead_node, res->refmap)) {
2317 mlog(0, "%s:%.*s: dead node %u had a ref, but had " 2325 mlog(0, "%s:%.*s: dead node %u had a ref, but had "
diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c
index 22e6eb8b8d22..68d239ba0c63 100644
--- a/fs/ocfs2/dlm/dlmthread.c
+++ b/fs/ocfs2/dlm/dlmthread.c
@@ -106,7 +106,8 @@ int __dlm_lockres_unused(struct dlm_lock_resource *res)
106 if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY) 106 if (!list_empty(&res->dirty) || res->state & DLM_LOCK_RES_DIRTY)
107 return 0; 107 return 0;
108 108
109 if (res->state & DLM_LOCK_RES_RECOVERING) 109 if (res->state & (DLM_LOCK_RES_RECOVERING|
110 DLM_LOCK_RES_RECOVERY_WAITING))
110 return 0; 111 return 0;
111 112
112 /* Another node has this resource with this node as the master */ 113 /* Another node has this resource with this node as the master */
@@ -707,7 +708,8 @@ static int dlm_thread(void *data)
707 * dirty for a short while. */ 708 * dirty for a short while. */
708 BUG_ON(res->state & DLM_LOCK_RES_MIGRATING); 709 BUG_ON(res->state & DLM_LOCK_RES_MIGRATING);
709 if (res->state & (DLM_LOCK_RES_IN_PROGRESS | 710 if (res->state & (DLM_LOCK_RES_IN_PROGRESS |
710 DLM_LOCK_RES_RECOVERING)) { 711 DLM_LOCK_RES_RECOVERING |
712 DLM_LOCK_RES_RECOVERY_WAITING)) {
711 /* move it to the tail and keep going */ 713 /* move it to the tail and keep going */
712 res->state &= ~DLM_LOCK_RES_DIRTY; 714 res->state &= ~DLM_LOCK_RES_DIRTY;
713 spin_unlock(&res->spinlock); 715 spin_unlock(&res->spinlock);