diff options
author | Yiwen Jiang <jiangyiwen@huawei.com> | 2014-06-23 16:22:09 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2014-06-23 19:47:45 -0400 |
commit | f7a14f32e7e1e7e025d88e7b4c8e3cc99770f756 (patch) | |
tree | 001fd6f29f90bb7a7bd8d7533b836e781bf6e11c | |
parent | a270c6d3c0d7ba914bd82da34152d1102920d805 (diff) |
ocfs2: fix a tiny race when running dirop_fileop_racer
When running dirop_fileop_racer we found a dead lock case.
2 nodes, say Node A and Node B, mount the same ocfs2 volume. Create
/race/16/1 in the filesystem, and let the inode number of dir 16 is less
than the inode number of dir race.
Node A Node B
mv /race/16/1 /race/
right after Node A has got the
EX mode of /race/16/, and tries to
get EX mode of /race
ls /race/16/
In this case, Node A has got the EX mode of /race/16/, and wants to get EX
mode of /race/. Node B has got the PR mode of /race/, and wants to get
the PR mode of /race/16/. Since EX and PR are mutually exclusive, dead
lock happens.
This patch fixes this case by locking in ancestor order before trying
inode number order.
Signed-off-by: Yiwen Jiang <jiangyiwen@huawei.com>
Signed-off-by: Joseph Qi <joseph.qi@huawei.com>
Cc: Joel Becker <jlbec@evilplan.org>
Reviewed-by: Mark Fasheh <mfasheh@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r-- | fs/ocfs2/namei.c | 96 | ||||
-rw-r--r-- | fs/ocfs2/ocfs2_trace.h | 2 |
2 files changed, 96 insertions, 2 deletions
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 5819bb5d6483..879f9f36b607 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c | |||
@@ -991,6 +991,65 @@ leave: | |||
991 | return status; | 991 | return status; |
992 | } | 992 | } |
993 | 993 | ||
994 | static int ocfs2_check_if_ancestor(struct ocfs2_super *osb, | ||
995 | u64 src_inode_no, u64 dest_inode_no) | ||
996 | { | ||
997 | int ret = 0, i = 0; | ||
998 | u64 parent_inode_no = 0; | ||
999 | u64 child_inode_no = src_inode_no; | ||
1000 | struct inode *child_inode; | ||
1001 | |||
1002 | #define MAX_LOOKUP_TIMES 32 | ||
1003 | while (1) { | ||
1004 | child_inode = ocfs2_iget(osb, child_inode_no, 0, 0); | ||
1005 | if (IS_ERR(child_inode)) { | ||
1006 | ret = PTR_ERR(child_inode); | ||
1007 | break; | ||
1008 | } | ||
1009 | |||
1010 | ret = ocfs2_inode_lock(child_inode, NULL, 0); | ||
1011 | if (ret < 0) { | ||
1012 | iput(child_inode); | ||
1013 | if (ret != -ENOENT) | ||
1014 | mlog_errno(ret); | ||
1015 | break; | ||
1016 | } | ||
1017 | |||
1018 | ret = ocfs2_lookup_ino_from_name(child_inode, "..", 2, | ||
1019 | &parent_inode_no); | ||
1020 | ocfs2_inode_unlock(child_inode, 0); | ||
1021 | iput(child_inode); | ||
1022 | if (ret < 0) { | ||
1023 | ret = -ENOENT; | ||
1024 | break; | ||
1025 | } | ||
1026 | |||
1027 | if (parent_inode_no == dest_inode_no) { | ||
1028 | ret = 1; | ||
1029 | break; | ||
1030 | } | ||
1031 | |||
1032 | if (parent_inode_no == osb->root_inode->i_ino) { | ||
1033 | ret = 0; | ||
1034 | break; | ||
1035 | } | ||
1036 | |||
1037 | child_inode_no = parent_inode_no; | ||
1038 | |||
1039 | if (++i >= MAX_LOOKUP_TIMES) { | ||
1040 | mlog(ML_NOTICE, "max lookup times reached, filesystem " | ||
1041 | "may have nested directories, " | ||
1042 | "src inode: %llu, dest inode: %llu.\n", | ||
1043 | (unsigned long long)src_inode_no, | ||
1044 | (unsigned long long)dest_inode_no); | ||
1045 | ret = 0; | ||
1046 | break; | ||
1047 | } | ||
1048 | } | ||
1049 | |||
1050 | return ret; | ||
1051 | } | ||
1052 | |||
994 | /* | 1053 | /* |
995 | * The only place this should be used is rename! | 1054 | * The only place this should be used is rename! |
996 | * if they have the same id, then the 1st one is the only one locked. | 1055 | * if they have the same id, then the 1st one is the only one locked. |
@@ -1002,6 +1061,7 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, | |||
1002 | struct inode *inode2) | 1061 | struct inode *inode2) |
1003 | { | 1062 | { |
1004 | int status; | 1063 | int status; |
1064 | int inode1_is_ancestor, inode2_is_ancestor; | ||
1005 | struct ocfs2_inode_info *oi1 = OCFS2_I(inode1); | 1065 | struct ocfs2_inode_info *oi1 = OCFS2_I(inode1); |
1006 | struct ocfs2_inode_info *oi2 = OCFS2_I(inode2); | 1066 | struct ocfs2_inode_info *oi2 = OCFS2_I(inode2); |
1007 | struct buffer_head **tmpbh; | 1067 | struct buffer_head **tmpbh; |
@@ -1015,9 +1075,26 @@ static int ocfs2_double_lock(struct ocfs2_super *osb, | |||
1015 | if (*bh2) | 1075 | if (*bh2) |
1016 | *bh2 = NULL; | 1076 | *bh2 = NULL; |
1017 | 1077 | ||
1018 | /* we always want to lock the one with the lower lockid first. */ | 1078 | /* we always want to lock the one with the lower lockid first. |
1079 | * and if they are nested, we lock ancestor first */ | ||
1019 | if (oi1->ip_blkno != oi2->ip_blkno) { | 1080 | if (oi1->ip_blkno != oi2->ip_blkno) { |
1020 | if (oi1->ip_blkno < oi2->ip_blkno) { | 1081 | inode1_is_ancestor = ocfs2_check_if_ancestor(osb, oi2->ip_blkno, |
1082 | oi1->ip_blkno); | ||
1083 | if (inode1_is_ancestor < 0) { | ||
1084 | status = inode1_is_ancestor; | ||
1085 | goto bail; | ||
1086 | } | ||
1087 | |||
1088 | inode2_is_ancestor = ocfs2_check_if_ancestor(osb, oi1->ip_blkno, | ||
1089 | oi2->ip_blkno); | ||
1090 | if (inode2_is_ancestor < 0) { | ||
1091 | status = inode2_is_ancestor; | ||
1092 | goto bail; | ||
1093 | } | ||
1094 | |||
1095 | if ((inode1_is_ancestor == 1) || | ||
1096 | (oi1->ip_blkno < oi2->ip_blkno && | ||
1097 | inode2_is_ancestor == 0)) { | ||
1021 | /* switch id1 and id2 around */ | 1098 | /* switch id1 and id2 around */ |
1022 | tmpbh = bh2; | 1099 | tmpbh = bh2; |
1023 | bh2 = bh1; | 1100 | bh2 = bh1; |
@@ -1135,6 +1212,21 @@ static int ocfs2_rename(struct inode *old_dir, | |||
1135 | goto bail; | 1212 | goto bail; |
1136 | } | 1213 | } |
1137 | rename_lock = 1; | 1214 | rename_lock = 1; |
1215 | |||
1216 | /* here we cannot guarantee the inodes haven't just been | ||
1217 | * changed, so check if they are nested again */ | ||
1218 | status = ocfs2_check_if_ancestor(osb, new_dir->i_ino, | ||
1219 | old_inode->i_ino); | ||
1220 | if (status < 0) { | ||
1221 | mlog_errno(status); | ||
1222 | goto bail; | ||
1223 | } else if (status == 1) { | ||
1224 | status = -EPERM; | ||
1225 | trace_ocfs2_rename_not_permitted( | ||
1226 | (unsigned long long)old_inode->i_ino, | ||
1227 | (unsigned long long)new_dir->i_ino); | ||
1228 | goto bail; | ||
1229 | } | ||
1138 | } | 1230 | } |
1139 | 1231 | ||
1140 | /* if old and new are the same, this'll just do one lock. */ | 1232 | /* if old and new are the same, this'll just do one lock. */ |
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index 1b60c62aa9d6..6cb019b7c6a8 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h | |||
@@ -2292,6 +2292,8 @@ TRACE_EVENT(ocfs2_rename, | |||
2292 | __entry->new_len, __get_str(new_name)) | 2292 | __entry->new_len, __get_str(new_name)) |
2293 | ); | 2293 | ); |
2294 | 2294 | ||
2295 | DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_rename_not_permitted); | ||
2296 | |||
2295 | TRACE_EVENT(ocfs2_rename_target_exists, | 2297 | TRACE_EVENT(ocfs2_rename_target_exists, |
2296 | TP_PROTO(int new_len, const char *new_name), | 2298 | TP_PROTO(int new_len, const char *new_name), |
2297 | TP_ARGS(new_len, new_name), | 2299 | TP_ARGS(new_len, new_name), |