aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/filesystems/directory-locking31
-rw-r--r--fs/namei.c10
2 files changed, 27 insertions, 14 deletions
diff --git a/Documentation/filesystems/directory-locking b/Documentation/filesystems/directory-locking
index ff7b611abf33..09bbf9a54f80 100644
--- a/Documentation/filesystems/directory-locking
+++ b/Documentation/filesystems/directory-locking
@@ -2,6 +2,10 @@
2kinds of locks - per-inode (->i_mutex) and per-filesystem 2kinds of locks - per-inode (->i_mutex) and per-filesystem
3(->s_vfs_rename_mutex). 3(->s_vfs_rename_mutex).
4 4
5 When taking the i_mutex on multiple non-directory objects, we
6always acquire the locks in order by increasing address. We'll call
7that "inode pointer" order in the following.
8
5 For our purposes all operations fall in 5 classes: 9 For our purposes all operations fall in 5 classes:
6 10
71) read access. Locking rules: caller locks directory we are accessing. 111) read access. Locking rules: caller locks directory we are accessing.
@@ -12,8 +16,9 @@ kinds of locks - per-inode (->i_mutex) and per-filesystem
12locks victim and calls the method. 16locks victim and calls the method.
13 17
144) rename() that is _not_ cross-directory. Locking rules: caller locks 184) rename() that is _not_ cross-directory. Locking rules: caller locks
15the parent, finds source and target, if target already exists - locks it 19the parent and finds source and target. If target already exists, lock
16and then calls the method. 20it. If source is a non-directory, lock it. If that means we need to
21lock both, lock them in inode pointer order.
17 22
185) link creation. Locking rules: 235) link creation. Locking rules:
19 * lock parent 24 * lock parent
@@ -30,7 +35,9 @@ rules:
30 fail with -ENOTEMPTY 35 fail with -ENOTEMPTY
31 * if new parent is equal to or is a descendent of source 36 * if new parent is equal to or is a descendent of source
32 fail with -ELOOP 37 fail with -ELOOP
33 * if target exists - lock it. 38 * If target exists, lock it. If source is a non-directory, lock
39 it. In case that means we need to lock both source and target,
40 do so in inode pointer order.
34 * call the method. 41 * call the method.
35 42
36 43
@@ -56,9 +63,11 @@ objects - A < B iff A is an ancestor of B.
56 renames will be blocked on filesystem lock and we don't start changing 63 renames will be blocked on filesystem lock and we don't start changing
57 the order until we had acquired all locks). 64 the order until we had acquired all locks).
58 65
59(3) any operation holds at most one lock on non-directory object and 66(3) locks on non-directory objects are acquired only after locks on
60 that lock is acquired after all other locks. (Proof: see descriptions 67 directory objects, and are acquired in inode pointer order.
61 of operations). 68 (Proof: all operations take a lock on at most one
69 non-directory object, except renames, which take locks on source and
70 target in inode pointer order in the case they are not directories.)
62 71
63 Now consider the minimal deadlock. Each process is blocked on 72 Now consider the minimal deadlock. Each process is blocked on
64attempt to acquire some lock and already holds at least one lock. Let's 73attempt to acquire some lock and already holds at least one lock. Let's
@@ -66,9 +75,13 @@ consider the set of contended locks. First of all, filesystem lock is
66not contended, since any process blocked on it is not holding any locks. 75not contended, since any process blocked on it is not holding any locks.
67Thus all processes are blocked on ->i_mutex. 76Thus all processes are blocked on ->i_mutex.
68 77
69 Non-directory objects are not contended due to (3). Thus link 78 By (3), any process holding a non-directory lock can only be
70creation can't be a part of deadlock - it can't be blocked on source 79waiting on another non-directory lock with a larger address. Therefore
71and it means that it doesn't hold any locks. 80the process holding the "largest" such lock can always make progress, and
81non-directory objects are not included in the set of contended locks.
82
83 Thus link creation can't be a part of deadlock - it can't be
84blocked on source, so if it is blocked at all it is blocked on the parent, which means that it doesn't hold any locks.
72 85
73 Any contended object is either held by cross-directory rename or 86 Any contended object is either held by cross-directory rename or
74has a child that is also contended. Indeed, suppose that it is held by 87has a child that is also contended. Indeed, suppose that it is held by
diff --git a/fs/namei.c b/fs/namei.c
index 2a5a7aa9f43f..88cec0330bf7 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -3918,7 +3918,8 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
3918 * That's where 4.4 screws up. Current fix: serialization on 3918 * That's where 4.4 screws up. Current fix: serialization on
3919 * sb->s_vfs_rename_mutex. We might be more accurate, but that's another 3919 * sb->s_vfs_rename_mutex. We might be more accurate, but that's another
3920 * story. 3920 * story.
3921 * c) we have to lock _three_ objects - parents and victim (if it exists). 3921 * c) we have to lock _four_ objects - parents and victim (if it exists),
3922 * and source (if it is not a directory).
3922 * And that - after we got ->i_mutex on parents (until then we don't know 3923 * And that - after we got ->i_mutex on parents (until then we don't know
3923 * whether the target exists). Solution: try to be smart with locking 3924 * whether the target exists). Solution: try to be smart with locking
3924 * order for inodes. We rely on the fact that tree topology may change 3925 * order for inodes. We rely on the fact that tree topology may change
@@ -3994,6 +3995,7 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
3994 struct inode *new_dir, struct dentry *new_dentry) 3995 struct inode *new_dir, struct dentry *new_dentry)
3995{ 3996{
3996 struct inode *target = new_dentry->d_inode; 3997 struct inode *target = new_dentry->d_inode;
3998 struct inode *source = old_dentry->d_inode;
3997 int error; 3999 int error;
3998 4000
3999 error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry); 4001 error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry);
@@ -4001,8 +4003,7 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
4001 return error; 4003 return error;
4002 4004
4003 dget(new_dentry); 4005 dget(new_dentry);
4004 if (target) 4006 lock_two_nondirectories(source, target);
4005 mutex_lock(&target->i_mutex);
4006 4007
4007 error = -EBUSY; 4008 error = -EBUSY;
4008 if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) 4009 if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry))
@@ -4017,8 +4018,7 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry,
4017 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) 4018 if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE))
4018 d_move(old_dentry, new_dentry); 4019 d_move(old_dentry, new_dentry);
4019out: 4020out:
4020 if (target) 4021 unlock_two_nondirectories(source, target);
4021 mutex_unlock(&target->i_mutex);
4022 dput(new_dentry); 4022 dput(new_dentry);
4023 return error; 4023 return error;
4024} 4024}