diff options
| -rw-r--r-- | Documentation/filesystems/directory-locking | 31 | ||||
| -rw-r--r-- | fs/namei.c | 10 |
2 files changed, 27 insertions, 14 deletions
diff --git a/Documentation/filesystems/directory-locking b/Documentation/filesystems/directory-locking index ff7b611abf33..09bbf9a54f80 100644 --- a/Documentation/filesystems/directory-locking +++ b/Documentation/filesystems/directory-locking | |||
| @@ -2,6 +2,10 @@ | |||
| 2 | kinds of locks - per-inode (->i_mutex) and per-filesystem | 2 | kinds of locks - per-inode (->i_mutex) and per-filesystem |
| 3 | (->s_vfs_rename_mutex). | 3 | (->s_vfs_rename_mutex). |
| 4 | 4 | ||
| 5 | When taking the i_mutex on multiple non-directory objects, we | ||
| 6 | always acquire the locks in order by increasing address. We'll call | ||
| 7 | that "inode pointer" order in the following. | ||
| 8 | |||
| 5 | For our purposes all operations fall in 5 classes: | 9 | For our purposes all operations fall in 5 classes: |
| 6 | 10 | ||
| 7 | 1) read access. Locking rules: caller locks directory we are accessing. | 11 | 1) read access. Locking rules: caller locks directory we are accessing. |
| @@ -12,8 +16,9 @@ kinds of locks - per-inode (->i_mutex) and per-filesystem | |||
| 12 | locks victim and calls the method. | 16 | locks victim and calls the method. |
| 13 | 17 | ||
| 14 | 4) rename() that is _not_ cross-directory. Locking rules: caller locks | 18 | 4) rename() that is _not_ cross-directory. Locking rules: caller locks |
| 15 | the parent, finds source and target, if target already exists - locks it | 19 | the parent and finds source and target. If target already exists, lock |
| 16 | and then calls the method. | 20 | it. If source is a non-directory, lock it. If that means we need to |
| 21 | lock both, lock them in inode pointer order. | ||
| 17 | 22 | ||
| 18 | 5) link creation. Locking rules: | 23 | 5) link creation. Locking rules: |
| 19 | * lock parent | 24 | * lock parent |
| @@ -30,7 +35,9 @@ rules: | |||
| 30 | fail with -ENOTEMPTY | 35 | fail with -ENOTEMPTY |
| 31 | * if new parent is equal to or is a descendent of source | 36 | * if new parent is equal to or is a descendent of source |
| 32 | fail with -ELOOP | 37 | fail with -ELOOP |
| 33 | * if target exists - lock it. | 38 | * If target exists, lock it. If source is a non-directory, lock |
| 39 | it. In case that means we need to lock both source and target, | ||
| 40 | do so in inode pointer order. | ||
| 34 | * call the method. | 41 | * call the method. |
| 35 | 42 | ||
| 36 | 43 | ||
| @@ -56,9 +63,11 @@ objects - A < B iff A is an ancestor of B. | |||
| 56 | renames will be blocked on filesystem lock and we don't start changing | 63 | renames will be blocked on filesystem lock and we don't start changing |
| 57 | the order until we had acquired all locks). | 64 | the order until we had acquired all locks). |
| 58 | 65 | ||
| 59 | (3) any operation holds at most one lock on non-directory object and | 66 | (3) locks on non-directory objects are acquired only after locks on |
| 60 | that lock is acquired after all other locks. (Proof: see descriptions | 67 | directory objects, and are acquired in inode pointer order. |
| 61 | of operations). | 68 | (Proof: all operations except renames take a lock on at most one |
| 69 | non-directory object; renames take locks on source and | ||
| 70 | target in inode pointer order in the case they are not directories.) | ||
| 62 | 71 | ||
| 63 | Now consider the minimal deadlock. Each process is blocked on | 72 | Now consider the minimal deadlock. Each process is blocked on |
| 64 | attempt to acquire some lock and already holds at least one lock. Let's | 73 | attempt to acquire some lock and already holds at least one lock. Let's |
| @@ -66,9 +75,13 @@ consider the set of contended locks. First of all, filesystem lock is | |||
| 66 | not contended, since any process blocked on it is not holding any locks. | 75 | not contended, since any process blocked on it is not holding any locks. |
| 67 | Thus all processes are blocked on ->i_mutex. | 76 | Thus all processes are blocked on ->i_mutex. |
| 68 | 77 | ||
| 69 | Non-directory objects are not contended due to (3). Thus link | 78 | By (3), any process holding a non-directory lock can only be |
| 70 | creation can't be a part of deadlock - it can't be blocked on source | 79 | waiting on another non-directory lock with a larger address. Therefore |
| 71 | and it means that it doesn't hold any locks. | 80 | the process holding the "largest" such lock can always make progress, and |
| 81 | non-directory objects are not included in the set of contended locks. | ||
| 82 | |||
| 83 | Thus link creation can't be a part of a deadlock - it can't be | ||
| 84 | blocked on source, which means it holds no locks while it is blocked. | ||
| 72 | 85 | ||
| 73 | Any contended object is either held by cross-directory rename or | 86 | Any contended object is either held by cross-directory rename or |
| 74 | has a child that is also contended. Indeed, suppose that it is held by | 87 | has a child that is also contended. Indeed, suppose that it is held by |
diff --git a/fs/namei.c b/fs/namei.c index 2a5a7aa9f43f..88cec0330bf7 100644 --- a/fs/namei.c +++ b/fs/namei.c | |||
| @@ -3918,7 +3918,8 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname | |||
| 3918 | * That's where 4.4 screws up. Current fix: serialization on | 3918 | * That's where 4.4 screws up. Current fix: serialization on |
| 3919 | * sb->s_vfs_rename_mutex. We might be more accurate, but that's another | 3919 | * sb->s_vfs_rename_mutex. We might be more accurate, but that's another |
| 3920 | * story. | 3920 | * story. |
| 3921 | * c) we have to lock _three_ objects - parents and victim (if it exists). | 3921 | * c) we have to lock _four_ objects - parents and victim (if it exists), |
| 3922 | * and source (if it is not a directory). | ||
| 3922 | * And that - after we got ->i_mutex on parents (until then we don't know | 3923 | * And that - after we got ->i_mutex on parents (until then we don't know |
| 3923 | * whether the target exists). Solution: try to be smart with locking | 3924 | * whether the target exists). Solution: try to be smart with locking |
| 3924 | * order for inodes. We rely on the fact that tree topology may change | 3925 | * order for inodes. We rely on the fact that tree topology may change |
| @@ -3994,6 +3995,7 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, | |||
| 3994 | struct inode *new_dir, struct dentry *new_dentry) | 3995 | struct inode *new_dir, struct dentry *new_dentry) |
| 3995 | { | 3996 | { |
| 3996 | struct inode *target = new_dentry->d_inode; | 3997 | struct inode *target = new_dentry->d_inode; |
| 3998 | struct inode *source = old_dentry->d_inode; | ||
| 3997 | int error; | 3999 | int error; |
| 3998 | 4000 | ||
| 3999 | error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry); | 4001 | error = security_inode_rename(old_dir, old_dentry, new_dir, new_dentry); |
| @@ -4001,8 +4003,7 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, | |||
| 4001 | return error; | 4003 | return error; |
| 4002 | 4004 | ||
| 4003 | dget(new_dentry); | 4005 | dget(new_dentry); |
| 4004 | if (target) | 4006 | lock_two_nondirectories(source, target); |
| 4005 | mutex_lock(&target->i_mutex); | ||
| 4006 | 4007 | ||
| 4007 | error = -EBUSY; | 4008 | error = -EBUSY; |
| 4008 | if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) | 4009 | if (d_mountpoint(old_dentry)||d_mountpoint(new_dentry)) |
| @@ -4017,8 +4018,7 @@ static int vfs_rename_other(struct inode *old_dir, struct dentry *old_dentry, | |||
| 4017 | if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) | 4018 | if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) |
| 4018 | d_move(old_dentry, new_dentry); | 4019 | d_move(old_dentry, new_dentry); |
| 4019 | out: | 4020 | out: |
| 4020 | if (target) | 4021 | unlock_two_nondirectories(source, target); |
| 4021 | mutex_unlock(&target->i_mutex); | ||
| 4022 | dput(new_dentry); | 4022 | dput(new_dentry); |
| 4023 | return error; | 4023 | return error; |
| 4024 | } | 4024 | } |
