diff options
-rw-r--r-- | Documentation/filesystems/ntfs.txt | 5 | ||||
-rw-r--r-- | fs/ntfs/ChangeLog | 42 | ||||
-rw-r--r-- | fs/ntfs/mft.c | 29 |
3 files changed, 65 insertions, 11 deletions
diff --git a/Documentation/filesystems/ntfs.txt b/Documentation/filesystems/ntfs.txt index 1415b96ed491..eef4aca0c753 100644 --- a/Documentation/filesystems/ntfs.txt +++ b/Documentation/filesystems/ntfs.txt | |||
@@ -451,9 +451,12 @@ Note, a technical ChangeLog aimed at kernel hackers is in fs/ntfs/ChangeLog. | |||
451 | - Implement extension of resident files using the normal file write | 451 | - Implement extension of resident files using the normal file write |
452 | code paths, i.e. most very small files can be extended to be a little | 452 | code paths, i.e. most very small files can be extended to be a little |
453 | bit bigger but not by much. | 453 | bit bigger but not by much. |
454 | - Add new mount option "disable_sparse". (See list of mount options | ||
455 | above for details.) | ||
454 | - Improve handling of ntfs volumes with errors and strange boot sectors | 456 | - Improve handling of ntfs volumes with errors and strange boot sectors |
455 | in particular. | 457 | in particular. |
456 | - Fix various bugs. | 458 | - Fix various bugs including a nasty deadlock that appeared in recent |
459 | kernels (around 2.6.11-2.6.12 timeframe). | ||
457 | 2.1.22: | 460 | 2.1.22: |
458 | - Improve handling of ntfs volumes with errors. | 461 | - Improve handling of ntfs volumes with errors. |
459 | - Fix various bugs and race conditions. | 462 | - Fix various bugs and race conditions. |
diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog index 3d2cac4061d6..9709fac6531d 100644 --- a/fs/ntfs/ChangeLog +++ b/fs/ntfs/ChangeLog | |||
@@ -132,6 +132,48 @@ ToDo/Notes: | |||
132 | the already mapped runlist fragment which causes | 132 | the already mapped runlist fragment which causes |
133 | ntfs_mapping_pairs_decompress() to fail and return error. Update | 133 | ntfs_mapping_pairs_decompress() to fail and return error. Update |
134 | ntfs_attr_find_vcn_nolock() accordingly. | 134 | ntfs_attr_find_vcn_nolock() accordingly. |
135 | - Fix a nasty deadlock that appeared in recent kernels. | ||
136 | The situation: VFS inode X on a mounted ntfs volume is dirty. For | ||
137 | same inode X, the ntfs_inode is dirty and thus corresponding on-disk | ||
138 | inode, i.e. mft record, which is in a dirty PAGE_CACHE_PAGE belonging | ||
139 | to the table of inodes, i.e. $MFT, inode 0. | ||
140 | What happens: | ||
141 | Process 1: sys_sync()/umount()/whatever... calls | ||
142 | __sync_single_inode() for $MFT -> do_writepages() -> write_page for | ||
143 | the dirty page containing the on-disk inode X, the page is now locked | ||
144 | -> ntfs_write_mst_block() which clears PageUptodate() on the page to | ||
145 | prevent anyone else getting hold of it whilst it does the write out. | ||
146 | This is necessary as the on-disk inode needs "fixups" applied before | ||
147 | the write to disk which are removed again after the write and | ||
148 | PageUptodate is then set again. It then analyses the page looking | ||
149 | for dirty on-disk inodes and when it finds one it calls | ||
150 | ntfs_may_write_mft_record() to see if it is safe to write this | ||
151 | on-disk inode. This then calls ilookup5() to check if the | ||
152 | corresponding VFS inode is in icache(). This in turn calls ifind() | ||
153 | which waits on the inode lock via wait_on_inode whilst holding the | ||
154 | global inode_lock. | ||
155 | Process 2: pdflush results in a call to __sync_single_inode for the | ||
156 | same VFS inode X on the ntfs volume. This locks the inode (I_LOCK) | ||
157 | then calls write-inode -> ntfs_write_inode -> map_mft_record() -> | ||
158 | read_cache_page() for the page (in page cache of table of inodes | ||
159 | $MFT, inode 0) containing the on-disk inode. This page has | ||
160 | PageUptodate() clear because of Process 1 (see above) so | ||
161 | read_cache_page() blocks when it tries to take the page lock for the | ||
162 | page so it can call ntfs_read_page(). | ||
163 | Thus Process 1 is holding the page lock on the page containing the | ||
164 | on-disk inode X and it is waiting on the inode X to be unlocked in | ||
165 | ifind() so it can write the page out and then unlock the page. | ||
166 | And Process 2 is holding the inode lock on inode X and is waiting for | ||
167 | the page to be unlocked so it can call ntfs_readpage() or discover | ||
168 | that Process 1 set PageUptodate() again and use the page. | ||
169 | Thus we have a deadlock due to ifind() waiting on the inode lock. | ||
170 | The solution: The fix is to use the newly introduced | ||
171 | ilookup5_nowait() which does not wait on the inode's lock and hence | ||
172 | avoids the deadlock. This is safe as we do not care about the VFS | ||
173 | inode and only use the fact that it is in the VFS inode cache and the | ||
174 | fact that the vfs and ntfs inodes are one struct in memory to find | ||
175 | the ntfs inode in memory if present. Also, the ntfs inode has its | ||
176 | own locking so it does not matter if the vfs inode is locked. | ||
135 | 177 | ||
136 | 2.1.22 - Many bug and race fixes and error handling improvements. | 178 | 2.1.22 - Many bug and race fixes and error handling improvements. |
137 | 179 | ||
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 3d0ba8e60adc..ac9ff39aa834 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c | |||
@@ -948,20 +948,23 @@ BOOL ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no, | |||
948 | na.name_len = 0; | 948 | na.name_len = 0; |
949 | na.type = AT_UNUSED; | 949 | na.type = AT_UNUSED; |
950 | /* | 950 | /* |
951 | * For inode 0, i.e. $MFT itself, we cannot use ilookup5() from here or | 951 | * Optimize inode 0, i.e. $MFT itself, since we have it in memory and |
952 | * we deadlock because the inode is already locked by the kernel | 952 | * we get here for it rather often. |
953 | * (fs/fs-writeback.c::__sync_single_inode()) and ilookup5() waits | ||
954 | * until the inode is unlocked before returning it and it never gets | ||
955 | * unlocked because ntfs_should_write_mft_record() never returns. )-: | ||
956 | * Fortunately, we have inode 0 pinned in icache for the duration of | ||
957 | * the mount so we can access it directly. | ||
958 | */ | 953 | */ |
959 | if (!mft_no) { | 954 | if (!mft_no) { |
960 | /* Balance the below iput(). */ | 955 | /* Balance the below iput(). */ |
961 | vi = igrab(mft_vi); | 956 | vi = igrab(mft_vi); |
962 | BUG_ON(vi != mft_vi); | 957 | BUG_ON(vi != mft_vi); |
963 | } else | 958 | } else { |
964 | vi = ilookup5(sb, mft_no, (test_t)ntfs_test_inode, &na); | 959 | /* |
960 | * Have to use ilookup5_nowait() since ilookup5() waits for the | ||
961 | * inode lock which causes ntfs to deadlock when a concurrent | ||
962 | * inode write via the inode dirty code paths and the page | ||
963 | * dirty code path of the inode dirty code path when writing | ||
964 | * $MFT occurs. | ||
965 | */ | ||
966 | vi = ilookup5_nowait(sb, mft_no, (test_t)ntfs_test_inode, &na); | ||
967 | } | ||
965 | if (vi) { | 968 | if (vi) { |
966 | ntfs_debug("Base inode 0x%lx is in icache.", mft_no); | 969 | ntfs_debug("Base inode 0x%lx is in icache.", mft_no); |
967 | /* The inode is in icache. */ | 970 | /* The inode is in icache. */ |
@@ -1016,7 +1019,13 @@ BOOL ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no, | |||
1016 | na.mft_no = MREF_LE(m->base_mft_record); | 1019 | na.mft_no = MREF_LE(m->base_mft_record); |
1017 | ntfs_debug("Mft record 0x%lx is an extent record. Looking for base " | 1020 | ntfs_debug("Mft record 0x%lx is an extent record. Looking for base " |
1018 | "inode 0x%lx in icache.", mft_no, na.mft_no); | 1021 | "inode 0x%lx in icache.", mft_no, na.mft_no); |
1019 | vi = ilookup5(sb, na.mft_no, (test_t)ntfs_test_inode, &na); | 1022 | if (!na.mft_no) { |
1023 | /* Balance the below iput(). */ | ||
1024 | vi = igrab(mft_vi); | ||
1025 | BUG_ON(vi != mft_vi); | ||
1026 | } else | ||
1027 | vi = ilookup5_nowait(sb, na.mft_no, (test_t)ntfs_test_inode, | ||
1028 | &na); | ||
1020 | if (!vi) { | 1029 | if (!vi) { |
1021 | /* | 1030 | /* |
1022 | * The base inode is not in icache, write this extent mft | 1031 | * The base inode is not in icache, write this extent mft |