diff options
-rw-r--r-- | Documentation/filesystems/directory-locking | 31 | ||||
-rw-r--r-- | fs/namei.c | 10 |
2 files changed, 27 insertions, 14 deletions
diff --git a/Documentation/filesystems/directory-locking b/Documentation/filesystems/directory-locking index ff7b611abf33..09bbf9a54f80 100644 --- a/Documentation/filesystems/directory-locking +++ b/Documentation/filesystems/directory-locking | |||
@@ -2,6 +2,10 @@ | |||
2 | kinds of locks - per-inode (->i_mutex) and per-filesystem | 2 | kinds of locks - per-inode (->i_mutex) and per-filesystem |
3 | (->s_vfs_rename_mutex). | 3 | (->s_vfs_rename_mutex). |
4 | 4 | ||
5 | When taking the i_mutex on multiple non-directory objects, we | ||
6 | always acquire the locks in order by increasing address. We'll call | ||
7 | that "inode pointer" order in the following. | ||
8 | |||
5 | For our purposes all operations fall in 5 classes: | 9 | For our purposes all operations fall in 5 classes: |
6 | 10 | ||
7 | 1) read access. Locking rules: caller locks directory we are accessing. | 11 | 1) read access. Locking rules: caller locks directory we are accessing. |
@@ -12,8 +16,9 @@ kinds of locks - per-inode (->i_mutex) and per-filesystem | |||
12 | locks victim and calls the method. | 16 | locks victim and calls the method. |
13 | 17 | ||
14 | 4) rename() that is _not_ cross-directory. Locking rules: caller locks | 18 | 4) rename() that is _not_ cross-directory. Locking rules: caller locks |
15 | the parent, finds source and target, if target already exists - locks it | 19 | the parent and finds source and target. If target already exists, lock |
16 | and then calls the method. | 20 | it. If source is a non-directory, lock it. If that means we need to |
21 | lock both, lock them in inode pointer order. | ||
17 | 22 | ||
18 | 5) link creation. Locking rules: | 23 | 5) link creation. Locking rules: |
19 | * lock parent | 24 | * lock parent |
@@ -30,7 +35,9 @@ rules: | |||
30 | fail with -ENOTEMPTY | 35 | fail with -ENOTEMPTY |
31 | * if new parent is equal to or is a descendent of source | 36 | * if new parent is equal to or is a descendent of source |
32 | fail with -ELOOP | 37 | fail with -ELOOP |
33 | * if target exists - lock it. | 38 | * If target exists, lock it. If source is a non-directory, lock |
39 | it. In case that means we need to lock both source and target, | ||
40 | do so in inode pointer order. | ||
34 | * call the method. | 41 | * call the method. |
35 | 42 | ||
36 | 43 | ||
@@ -56,9 +63,11 @@ objects - A < B iff A is an ancestor of B. | |||
56 | renames will be blocked on filesystem lock and we don't start changing | 63 | renames will be blocked on filesystem lock and we don't start changing |
57 | the order until we had acquired all locks). | 64 | the order until we had acquired all locks). |
58 | 65 | ||
59 | (3) any operation holds at most one lock on non-directory object and | 66 | (3) locks on non-directory objects are acquired only after locks on |
60 | that lock is acquired after all other locks. (Proof: see descriptions | 67 | directory objects, and are acquired in inode pointer order. |
61 | of operations). | 68 | (Proof: all operations but renames take lock on at most one |
69 | non-directory object, except renames, which take locks on source and | ||
70 | target in inode pointer order in the case they are not directories.) | ||
62 | 71 | ||
63 | Now consider the minimal deadlock. Each process is blocked on | 72 | Now consider the minimal deadlock. Each process is blocked on |
64 | attempt to acquire some lock and already holds at least one lock. Let's | 73 | attempt to acquire some lock and already holds at least one lock. Let's |
@@ -66,9 +75,13 @@ consider the set of contended locks. First of all, filesystem lock is | |||
66 | not contended, since any process blocked on it is not holding any locks. | 75 | not contended, since any process blocked on it is not holding any locks. |
67 | Thus all processes are blocked on ->i_mutex. | 76 | Thus all processes are blocked on ->i_mutex. |
68 | 77 | ||
69 | Non-directory objects are not contended due to (3). Thus link | 78 | By (3), any process holding a non-directory lock can only be |
70 | creation can't be a part of deadlock - it can't be blocked on source | 79 | waiting on another non-directory lock with a larger address. Therefore |
71 | and it means that it doesn't hold any locks. | 80 | the process holding the "largest" such lock can always make progress, and |
81 | non-directory objects are not included in the set of contended locks. | ||
82 | |||
83 | Thus link creation can't be a part of deadlock - it can't be | ||
84 | blocked on source and it means that it doesn't hold any locks. | ||
72 | 85 | ||
73 | Any contended object is either held by cross-directory rename or | 86 | Any contended object is either held by cross-directory rename or |
74 | has a child that is also contended. Indeed, suppose that it is held by | 87 | has a child that is also contended. Indeed, suppose that it is held by |
diff --git a/fs/namei.c b/fs/namei.c index 2a5a7aa9f43f..88cec0330bf7 100644 --- a/fs/namei.c +++ b/fs/namei.c | |||
@@ -3918,7 +3918,8 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname | |||
3918 | * That's where 4.4 screws up. Current fix: serialization on | 3918 | * That's where 4.4 screws up. Current fix: serialization on |
3919 | * sb->s_vfs_rename_mutex. We might be more accurate, but that's another | 3919 | * sb->s_vfs_rename_mutex. We might be more accurate, but that's another |
3920 | * story. | 3920 | * story. |
3921 | * c) we have to lock _three_ objects - parents and victim (if it exists). | 3921 | * c) we have to lock _four_ objects - parents and victim (if it exists), |
3922 | * and source (if it is not a directory). | ||
3922 | * And that - after we got ->i_mutex on parents (until then we don't know | 3923 | * And that - after we got ->i_mutex on parents (until then we don't know |
3923 | * whether the target exists). Solution: try to be smart with locking | 3924 | * whether the target exists). Solution: try to be smart with locking |
3924 | * order for inodes. We rely on the fact that tree topology may change | 3925 | * order for inodes. We rely on the fact that tree topology may change |
@@ -3994,6 +3995,7 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, | |||
3994 | struct inode *new_dir, struct dentry *new_dentry) | 3995 | struct inode *new_dir, struct dentry *new_dentry) |
3995 | { | 3996 | { |
3996 | struct inode *target = new_dentry->d_inode; | 3997 | struct inode *target = new_dentry->d_inode; |
3998 | struct inode *source = old_dentry->d_inode; | ||
3997 | int error; | 3999 | int error; |
3998 | 4000 | ||
3999 | error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry); | 4001 | error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry); |
@@ -4001,8 +4003,7 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, | |||
4001 | return error; | 4003 | return error; |
4002 | 4004 | ||
4003 | dget(new_dentry); | 4005 | dget(new_dentry); |
4004 | if (target) | 4006 | lock_two_nondirectories(source, target); |
4005 | mutex_lock(&target->i_mutex); | ||
4006 | 4007 | ||
4007 | error = -EBUSY; | 4008 | error = -EBUSY; |
4008 | if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) | 4009 | if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) |
@@ -4017,8 +4018,7 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, | |||
4017 | if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) | 4018 | if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) |
4018 | d_move(old_dentry, new_dentry); | 4019 | d_move(old_dentry, new_dentry); |
4019 | out: | 4020 | out: |
4020 | if (target) | 4021 | unlock_two_nondirectories(source, target); |
4021 | mutex_unlock(&target->i_mutex); | ||
4022 | dput(new_dentry); | 4022 | dput(new_dentry); |
4023 | return error; | 4023 | return error; |
4024 | } | 4024 | } |