aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorAnton Altaparmakov <aia21@cantab.net>2005-06-26 17:12:02 -0400
committerAnton Altaparmakov <aia21@cantab.net>2005-06-26 17:12:02 -0400
commitba6d2377c85c9b8a793f455d8c9b6cf31985d70f (patch)
tree21e65c76db693869c84864af02e91c4b997a6ba5
parentaf859a42d798f047fbfe198ed315a942662c39d2 (diff)
NTFS: Fix a nasty deadlock that appeared in recent kernels.
The situation: VFS inode X on a mounted ntfs volume is dirty. For same inode X, the ntfs_inode is dirty and thus corresponding on-disk inode, i.e. mft record, which is in a dirty PAGE_CACHE_PAGE belonging to the table of inodes, i.e. $MFT, inode 0. What happens: Process 1: sys_sync()/umount()/whatever... calls __sync_single_inode() for $MFT -> do_writepages() -> write_page for the dirty page containing the on-disk inode X, the page is now locked -> ntfs_write_mst_block() which clears PageUptodate() on the page to prevent anyone else getting hold of it whilst it does the write out. This is necessary as the on-disk inode needs "fixups" applied before the write to disk which are removed again after the write and PageUptodate is then set again. It then analyses the page looking for dirty on-disk inodes and when it finds one it calls ntfs_may_write_mft_record() to see if it is safe to write this on-disk inode. This then calls ilookup5() to check if the corresponding VFS inode is in icache(). This in turn calls ifind() which waits on the inode lock via wait_on_inode whilst holding the global inode_lock. Process 2: pdflush results in a call to __sync_single_inode for the same VFS inode X on the ntfs volume. This locks the inode (I_LOCK) then calls write-inode -> ntfs_write_inode -> map_mft_record() -> read_cache_page() for the page (in page cache of table of inodes $MFT, inode 0) containing the on-disk inode. This page has PageUptodate() clear because of Process 1 (see above) so read_cache_page() blocks when it tries to take the page lock for the page so it can call ntfs_read_page(). Thus Process 1 is holding the page lock on the page containing the on-disk inode X and it is waiting on the inode X to be unlocked in ifind() so it can write the page out and then unlock the page. And Process 2 is holding the inode lock on inode X and is waiting for the page to be unlocked so it can call ntfs_readpage() or discover that Process 1 set PageUptodate() again and use the page. Thus we have a deadlock due to ifind() waiting on the inode lock. The solution: The fix is to use the newly introduced ilookup5_nowait() which does not wait on the inode's lock and hence avoids the deadlock. This is safe as we do not care about the VFS inode and only use the fact that it is in the VFS inode cache and the fact that the vfs and ntfs inodes are one struct in memory to find the ntfs inode in memory if present. Also, the ntfs inode has its own locking so it does not matter if the vfs inode is locked. Signed-off-by: Anton Altaparmakov <aia21@cantab.net>
-rw-r--r--Documentation/filesystems/ntfs.txt5
-rw-r--r--fs/ntfs/ChangeLog42
-rw-r--r--fs/ntfs/mft.c29
3 files changed, 65 insertions, 11 deletions
diff --git a/Documentation/filesystems/ntfs.txt b/Documentation/filesystems/ntfs.txt
index 1415b96ed491..eef4aca0c753 100644
--- a/Documentation/filesystems/ntfs.txt
+++ b/Documentation/filesystems/ntfs.txt
@@ -451,9 +451,12 @@ Note, a technical ChangeLog aimed at kernel hackers is in fs/ntfs/ChangeLog.
451 - Implement extension of resident files using the normal file write 451 - Implement extension of resident files using the normal file write
452 code paths, i.e. most very small files can be extended to be a little 452 code paths, i.e. most very small files can be extended to be a little
453 bit bigger but not by much. 453 bit bigger but not by much.
454 - Add new mount option "disable_sparse". (See list of mount options
455 above for details.)
454 - Improve handling of ntfs volumes with errors and strange boot sectors 456 - Improve handling of ntfs volumes with errors and strange boot sectors
455 in particular. 457 in particular.
456 - Fix various bugs. 458 - Fix various bugs including a nasty deadlock that appeared in recent
459 kernels (around 2.6.11-2.6.12 timeframe).
4572.1.22: 4602.1.22:
458 - Improve handling of ntfs volumes with errors. 461 - Improve handling of ntfs volumes with errors.
459 - Fix various bugs and race conditions. 462 - Fix various bugs and race conditions.
diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog
index 3d2cac4061d6..9709fac6531d 100644
--- a/fs/ntfs/ChangeLog
+++ b/fs/ntfs/ChangeLog
@@ -132,6 +132,48 @@ ToDo/Notes:
132 the already mapped runlist fragment which causes 132 the already mapped runlist fragment which causes
133 ntfs_mapping_pairs_decompress() to fail and return error. Update 133 ntfs_mapping_pairs_decompress() to fail and return error. Update
134 ntfs_attr_find_vcn_nolock() accordingly. 134 ntfs_attr_find_vcn_nolock() accordingly.
135 - Fix a nasty deadlock that appeared in recent kernels.
136 The situation: VFS inode X on a mounted ntfs volume is dirty. For
137 same inode X, the ntfs_inode is dirty and thus corresponding on-disk
138 inode, i.e. mft record, which is in a dirty PAGE_CACHE_PAGE belonging
139 to the table of inodes, i.e. $MFT, inode 0.
140 What happens:
141 Process 1: sys_sync()/umount()/whatever... calls
142 __sync_single_inode() for $MFT -> do_writepages() -> write_page for
143 the dirty page containing the on-disk inode X, the page is now locked
144 -> ntfs_write_mst_block() which clears PageUptodate() on the page to
145 prevent anyone else getting hold of it whilst it does the write out.
146 This is necessary as the on-disk inode needs "fixups" applied before
147 the write to disk which are removed again after the write and
148 PageUptodate is then set again. It then analyses the page looking
149 for dirty on-disk inodes and when it finds one it calls
150 ntfs_may_write_mft_record() to see if it is safe to write this
151 on-disk inode. This then calls ilookup5() to check if the
152 corresponding VFS inode is in icache(). This in turn calls ifind()
153 which waits on the inode lock via wait_on_inode whilst holding the
154 global inode_lock.
155 Process 2: pdflush results in a call to __sync_single_inode for the
156 same VFS inode X on the ntfs volume. This locks the inode (I_LOCK)
157 then calls write-inode -> ntfs_write_inode -> map_mft_record() ->
158 read_cache_page() for the page (in page cache of table of inodes
159 $MFT, inode 0) containing the on-disk inode. This page has
160 PageUptodate() clear because of Process 1 (see above) so
161 read_cache_page() blocks when it tries to take the page lock for the
162 page so it can call ntfs_read_page().
163 Thus Process 1 is holding the page lock on the page containing the
164 on-disk inode X and it is waiting on the inode X to be unlocked in
165 ifind() so it can write the page out and then unlock the page.
166 And Process 2 is holding the inode lock on inode X and is waiting for
167 the page to be unlocked so it can call ntfs_readpage() or discover
168 that Process 1 set PageUptodate() again and use the page.
169 Thus we have a deadlock due to ifind() waiting on the inode lock.
170 The solution: The fix is to use the newly introduced
171 ilookup5_nowait() which does not wait on the inode's lock and hence
172 avoids the deadlock. This is safe as we do not care about the VFS
173 inode and only use the fact that it is in the VFS inode cache and the
174 fact that the vfs and ntfs inodes are one struct in memory to find
175 the ntfs inode in memory if present. Also, the ntfs inode has its
176 own locking so it does not matter if the vfs inode is locked.
135 177
1362.1.22 - Many bug and race fixes and error handling improvements. 1782.1.22 - Many bug and race fixes and error handling improvements.
137 179
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c
index 3d0ba8e60adc..ac9ff39aa834 100644
--- a/fs/ntfs/mft.c
+++ b/fs/ntfs/mft.c
@@ -948,20 +948,23 @@ BOOL ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
948 na.name_len = 0; 948 na.name_len = 0;
949 na.type = AT_UNUSED; 949 na.type = AT_UNUSED;
950 /* 950 /*
951 * For inode 0, i.e. $MFT itself, we cannot use ilookup5() from here or 951 * Optimize inode 0, i.e. $MFT itself, since we have it in memory and
952 * we deadlock because the inode is already locked by the kernel 952 * we get here for it rather often.
953 * (fs/fs-writeback.c::__sync_single_inode()) and ilookup5() waits
954 * until the inode is unlocked before returning it and it never gets
955 * unlocked because ntfs_should_write_mft_record() never returns. )-:
956 * Fortunately, we have inode 0 pinned in icache for the duration of
957 * the mount so we can access it directly.
958 */ 953 */
959 if (!mft_no) { 954 if (!mft_no) {
960 /* Balance the below iput(). */ 955 /* Balance the below iput(). */
961 vi = igrab(mft_vi); 956 vi = igrab(mft_vi);
962 BUG_ON(vi != mft_vi); 957 BUG_ON(vi != mft_vi);
963 } else 958 } else {
964 vi = ilookup5(sb, mft_no, (test_t)ntfs_test_inode, &na); 959 /*
960 * Have to use ilookup5_nowait() since ilookup5() waits for the
961 * inode lock which causes ntfs to deadlock when a concurrent
962 * inode write via the inode dirty code paths and the page
963 * dirty code path of the inode dirty code path when writing
964 * $MFT occurs.
965 */
966 vi = ilookup5_nowait(sb, mft_no, (test_t)ntfs_test_inode, &na);
967 }
965 if (vi) { 968 if (vi) {
966 ntfs_debug("Base inode 0x%lx is in icache.", mft_no); 969 ntfs_debug("Base inode 0x%lx is in icache.", mft_no);
967 /* The inode is in icache. */ 970 /* The inode is in icache. */
@@ -1016,7 +1019,13 @@ BOOL ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no,
1016 na.mft_no = MREF_LE(m->base_mft_record); 1019 na.mft_no = MREF_LE(m->base_mft_record);
1017 ntfs_debug("Mft record 0x%lx is an extent record. Looking for base " 1020 ntfs_debug("Mft record 0x%lx is an extent record. Looking for base "
1018 "inode 0x%lx in icache.", mft_no, na.mft_no); 1021 "inode 0x%lx in icache.", mft_no, na.mft_no);
1019 vi = ilookup5(sb, na.mft_no, (test_t)ntfs_test_inode, &na); 1022 if (!na.mft_no) {
1023 /* Balance the below iput(). */
1024 vi = igrab(mft_vi);
1025 BUG_ON(vi != mft_vi);
1026 } else
1027 vi = ilookup5_nowait(sb, na.mft_no, (test_t)ntfs_test_inode,
1028 &na);
1020 if (!vi) { 1029 if (!vi) {
1021 /* 1030 /*
1022 * The base inode is not in icache, write this extent mft 1031 * The base inode is not in icache, write this extent mft