aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorYiwen Jiang <jiangyiwen@huawei.com>2014-06-23 16:22:09 -0400
committerLinus Torvalds <torvalds@linux-foundation.org>2014-06-23 19:47:45 -0400
commitf7a14f32e7e1e7e025d88e7b4c8e3cc99770f756 (patch)
tree001fd6f29f90bb7a7bd8d7533b836e781bf6e11c
parenta270c6d3c0d7ba914bd82da34152d1102920d805 (diff)
ocfs2: fix a tiny race when running dirop_fileop_racer
When running dirop_fileop_racer we found a dead lock case. 2 nodes, say Node A and Node B, mount the same ocfs2 volume. Create /race/16/1 in the filesystem, and let the inode number of dir 16 is less than the inode number of dir race. Node A Node B mv /race/16/1 /race/ right after Node A has got the EX mode of /race/16/, and tries to get EX mode of /race ls /race/16/ In this case, Node A has got the EX mode of /race/16/, and wants to get EX mode of /race/. Node B has got the PR mode of /race/, and wants to get the PR mode of /race/16/. Since EX and PR are mutually exclusive, dead lock happens. This patch fixes this case by locking in ancestor order before trying inode number order. Signed-off-by: Yiwen Jiang <jiangyiwen@huawei.com> Signed-off-by: Joseph Qi <joseph.qi@huawei.com> Cc: Joel Becker <jlbec@evilplan.org> Reviewed-by: Mark Fasheh <mfasheh@suse.de> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
-rw-r--r--fs/ocfs2/namei.c96
-rw-r--r--fs/ocfs2/ocfs2_trace.h2
2 files changed, 96 insertions, 2 deletions
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index 5819bb5d6483..879f9f36b607 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -991,6 +991,65 @@ leave:
991 return status; 991 return status;
992} 992}
993 993
994static int ocfs2_check_if_ancestor(struct ocfs2_super *osb,
995 u64 src_inode_no, u64 dest_inode_no)
996{
997 int ret = 0, i = 0;
998 u64 parent_inode_no = 0;
999 u64 child_inode_no = src_inode_no;
1000 struct inode *child_inode;
1001
1002#define MAX_LOOKUP_TIMES 32
1003 while (1) {
1004 child_inode = ocfs2_iget(osb, child_inode_no, 0, 0);
1005 if (IS_ERR(child_inode)) {
1006 ret = PTR_ERR(child_inode);
1007 break;
1008 }
1009
1010 ret = ocfs2_inode_lock(child_inode, NULL, 0);
1011 if (ret < 0) {
1012 iput(child_inode);
1013 if (ret != -ENOENT)
1014 mlog_errno(ret);
1015 break;
1016 }
1017
1018 ret = ocfs2_lookup_ino_from_name(child_inode, "..", 2,
1019 &parent_inode_no);
1020 ocfs2_inode_unlock(child_inode, 0);
1021 iput(child_inode);
1022 if (ret < 0) {
1023 ret = -ENOENT;
1024 break;
1025 }
1026
1027 if (parent_inode_no == dest_inode_no) {
1028 ret = 1;
1029 break;
1030 }
1031
1032 if (parent_inode_no == osb->root_inode->i_ino) {
1033 ret = 0;
1034 break;
1035 }
1036
1037 child_inode_no = parent_inode_no;
1038
1039 if (++i >= MAX_LOOKUP_TIMES) {
1040 mlog(ML_NOTICE, "max lookup times reached, filesystem "
1041 "may have nested directories, "
1042 "src inode: %llu, dest inode: %llu.\n",
1043 (unsigned long long)src_inode_no,
1044 (unsigned long long)dest_inode_no);
1045 ret = 0;
1046 break;
1047 }
1048 }
1049
1050 return ret;
1051}
1052
994/* 1053/*
995 * The only place this should be used is rename! 1054 * The only place this should be used is rename!
996 * if they have the same id, then the 1st one is the only one locked. 1055 * if they have the same id, then the 1st one is the only one locked.
@@ -1002,6 +1061,7 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
1002 struct inode *inode2) 1061 struct inode *inode2)
1003{ 1062{
1004 int status; 1063 int status;
1064 int inode1_is_ancestor, inode2_is_ancestor;
1005 struct ocfs2_inode_info *oi1 = OCFS2_I(inode1); 1065 struct ocfs2_inode_info *oi1 = OCFS2_I(inode1);
1006 struct ocfs2_inode_info *oi2 = OCFS2_I(inode2); 1066 struct ocfs2_inode_info *oi2 = OCFS2_I(inode2);
1007 struct buffer_head **tmpbh; 1067 struct buffer_head **tmpbh;
@@ -1015,9 +1075,26 @@ static int ocfs2_double_lock(struct ocfs2_super *osb,
1015 if (*bh2) 1075 if (*bh2)
1016 *bh2 = NULL; 1076 *bh2 = NULL;
1017 1077
1018 /* we always want to lock the one with the lower lockid first. */ 1078 /* we always want to lock the one with the lower lockid first.
1079 * and if they are nested, we lock ancestor first */
1019 if (oi1->ip_blkno != oi2->ip_blkno) { 1080 if (oi1->ip_blkno != oi2->ip_blkno) {
1020 if (oi1->ip_blkno < oi2->ip_blkno) { 1081 inode1_is_ancestor = ocfs2_check_if_ancestor(osb, oi2->ip_blkno,
1082 oi1->ip_blkno);
1083 if (inode1_is_ancestor < 0) {
1084 status = inode1_is_ancestor;
1085 goto bail;
1086 }
1087
1088 inode2_is_ancestor = ocfs2_check_if_ancestor(osb, oi1->ip_blkno,
1089 oi2->ip_blkno);
1090 if (inode2_is_ancestor < 0) {
1091 status = inode2_is_ancestor;
1092 goto bail;
1093 }
1094
1095 if ((inode1_is_ancestor == 1) ||
1096 (oi1->ip_blkno < oi2->ip_blkno &&
1097 inode2_is_ancestor == 0)) {
1021 /* switch id1 and id2 around */ 1098 /* switch id1 and id2 around */
1022 tmpbh = bh2; 1099 tmpbh = bh2;
1023 bh2 = bh1; 1100 bh2 = bh1;
@@ -1135,6 +1212,21 @@ static int ocfs2_rename(struct inode *old_dir,
1135 goto bail; 1212 goto bail;
1136 } 1213 }
1137 rename_lock = 1; 1214 rename_lock = 1;
1215
1216 /* here we cannot guarantee the inodes haven't just been
1217 * changed, so check if they are nested again */
1218 status = ocfs2_check_if_ancestor(osb, new_dir->i_ino,
1219 old_inode->i_ino);
1220 if (status < 0) {
1221 mlog_errno(status);
1222 goto bail;
1223 } else if (status == 1) {
1224 status = -EPERM;
1225 trace_ocfs2_rename_not_permitted(
1226 (unsigned long long)old_inode->i_ino,
1227 (unsigned long long)new_dir->i_ino);
1228 goto bail;
1229 }
1138 } 1230 }
1139 1231
1140 /* if old and new are the same, this'll just do one lock. */ 1232 /* if old and new are the same, this'll just do one lock. */
diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h
index 1b60c62aa9d6..6cb019b7c6a8 100644
--- a/fs/ocfs2/ocfs2_trace.h
+++ b/fs/ocfs2/ocfs2_trace.h
@@ -2292,6 +2292,8 @@ TRACE_EVENT(ocfs2_rename,
2292 __entry->new_len, __get_str(new_name)) 2292 __entry->new_len, __get_str(new_name))
2293); 2293);
2294 2294
2295DEFINE_OCFS2_ULL_ULL_EVENT(ocfs2_rename_not_permitted);
2296
2295TRACE_EVENT(ocfs2_rename_target_exists, 2297TRACE_EVENT(ocfs2_rename_target_exists,
2296 TP_PROTO(int new_len, const char *new_name), 2298 TP_PROTO(int new_len, const char *new_name),
2297 TP_ARGS(new_len, new_name), 2299 TP_ARGS(new_len, new_name),