diff options
| -rw-r--r-- | Documentation/filesystems/ntfs.txt | 5 | ||||
| -rw-r--r-- | fs/ntfs/ChangeLog | 42 | ||||
| -rw-r--r-- | fs/ntfs/mft.c | 29 |
3 files changed, 65 insertions, 11 deletions
diff --git a/Documentation/filesystems/ntfs.txt b/Documentation/filesystems/ntfs.txt index 1415b96ed491..eef4aca0c753 100644 --- a/Documentation/filesystems/ntfs.txt +++ b/Documentation/filesystems/ntfs.txt | |||
| @@ -451,9 +451,12 @@ Note, a technical ChangeLog aimed at kernel hackers is in fs/ntfs/ChangeLog. | |||
| 451 | - Implement extension of resident files using the normal file write | 451 | - Implement extension of resident files using the normal file write |
| 452 | code paths, i.e. most very small files can be extended to be a little | 452 | code paths, i.e. most very small files can be extended to be a little |
| 453 | bit bigger but not by much. | 453 | bit bigger but not by much. |
| 454 | - Add new mount option "disable_sparse". (See list of mount options | ||
| 455 | above for details.) | ||
| 454 | - Improve handling of ntfs volumes with errors and strange boot sectors | 456 | - Improve handling of ntfs volumes with errors and strange boot sectors |
| 455 | in particular. | 457 | in particular. |
| 456 | - Fix various bugs. | 458 | - Fix various bugs including a nasty deadlock that appeared in recent |
| 459 | kernels (around 2.6.11-2.6.12 timeframe). | ||
| 457 | 2.1.22: | 460 | 2.1.22: |
| 458 | - Improve handling of ntfs volumes with errors. | 461 | - Improve handling of ntfs volumes with errors. |
| 459 | - Fix various bugs and race conditions. | 462 | - Fix various bugs and race conditions. |
diff --git a/fs/ntfs/ChangeLog b/fs/ntfs/ChangeLog index 3d2cac4061d6..9709fac6531d 100644 --- a/fs/ntfs/ChangeLog +++ b/fs/ntfs/ChangeLog | |||
| @@ -132,6 +132,48 @@ ToDo/Notes: | |||
| 132 | the already mapped runlist fragment which causes | 132 | the already mapped runlist fragment which causes |
| 133 | ntfs_mapping_pairs_decompress() to fail and return error. Update | 133 | ntfs_mapping_pairs_decompress() to fail and return error. Update |
| 134 | ntfs_attr_find_vcn_nolock() accordingly. | 134 | ntfs_attr_find_vcn_nolock() accordingly. |
| 135 | - Fix a nasty deadlock that appeared in recent kernels. | ||
| 136 | The situation: VFS inode X on a mounted ntfs volume is dirty. For | ||
| 137 | same inode X, the ntfs_inode is dirty and thus corresponding on-disk | ||
| 138 | inode, i.e. mft record, which is in a dirty PAGE_CACHE_PAGE belonging | ||
| 139 | to the table of inodes, i.e. $MFT, inode 0. | ||
| 140 | What happens: | ||
| 141 | Process 1: sys_sync()/umount()/whatever... calls | ||
| 142 | __sync_single_inode() for $MFT -> do_writepages() -> write_page for | ||
| 143 | the dirty page containing the on-disk inode X, the page is now locked | ||
| 144 | -> ntfs_write_mst_block() which clears PageUptodate() on the page to | ||
| 145 | prevent anyone else getting hold of it whilst it does the write out. | ||
| 146 | This is necessary as the on-disk inode needs "fixups" applied before | ||
| 147 | the write to disk which are removed again after the write and | ||
| 148 | PageUptodate is then set again. It then analyses the page looking | ||
| 149 | for dirty on-disk inodes and when it finds one it calls | ||
| 150 | ntfs_may_write_mft_record() to see if it is safe to write this | ||
| 151 | on-disk inode. This then calls ilookup5() to check if the | ||
| 152 | corresponding VFS inode is in icache(). This in turn calls ifind() | ||
| 153 | which waits on the inode lock via wait_on_inode whilst holding the | ||
| 154 | global inode_lock. | ||
| 155 | Process 2: pdflush results in a call to __sync_single_inode for the | ||
| 156 | same VFS inode X on the ntfs volume. This locks the inode (I_LOCK) | ||
| 157 | then calls write-inode -> ntfs_write_inode -> map_mft_record() -> | ||
| 158 | read_cache_page() for the page (in page cache of table of inodes | ||
| 159 | $MFT, inode 0) containing the on-disk inode. This page has | ||
| 160 | PageUptodate() clear because of Process 1 (see above) so | ||
| 161 | read_cache_page() blocks when it tries to take the page lock for the | ||
| 162 | page so it can call ntfs_read_page(). | ||
| 163 | Thus Process 1 is holding the page lock on the page containing the | ||
| 164 | on-disk inode X and it is waiting on the inode X to be unlocked in | ||
| 165 | ifind() so it can write the page out and then unlock the page. | ||
| 166 | And Process 2 is holding the inode lock on inode X and is waiting for | ||
| 167 | the page to be unlocked so it can call ntfs_readpage() or discover | ||
| 168 | that Process 1 set PageUptodate() again and use the page. | ||
| 169 | Thus we have a deadlock due to ifind() waiting on the inode lock. | ||
| 170 | The solution: The fix is to use the newly introduced | ||
| 171 | ilookup5_nowait() which does not wait on the inode's lock and hence | ||
| 172 | avoids the deadlock. This is safe as we do not care about the VFS | ||
| 173 | inode and only use the fact that it is in the VFS inode cache and the | ||
| 174 | fact that the vfs and ntfs inodes are one struct in memory to find | ||
| 175 | the ntfs inode in memory if present. Also, the ntfs inode has its | ||
| 176 | own locking so it does not matter if the vfs inode is locked. | ||
| 135 | 177 | ||
| 136 | 2.1.22 - Many bug and race fixes and error handling improvements. | 178 | 2.1.22 - Many bug and race fixes and error handling improvements. |
| 137 | 179 | ||
diff --git a/fs/ntfs/mft.c b/fs/ntfs/mft.c index 3d0ba8e60adc..ac9ff39aa834 100644 --- a/fs/ntfs/mft.c +++ b/fs/ntfs/mft.c | |||
| @@ -948,20 +948,23 @@ BOOL ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no, | |||
| 948 | na.name_len = 0; | 948 | na.name_len = 0; |
| 949 | na.type = AT_UNUSED; | 949 | na.type = AT_UNUSED; |
| 950 | /* | 950 | /* |
| 951 | * For inode 0, i.e. $MFT itself, we cannot use ilookup5() from here or | 951 | * Optimize inode 0, i.e. $MFT itself, since we have it in memory and |
| 952 | * we deadlock because the inode is already locked by the kernel | 952 | * we get here for it rather often. |
| 953 | * (fs/fs-writeback.c::__sync_single_inode()) and ilookup5() waits | ||
| 954 | * until the inode is unlocked before returning it and it never gets | ||
| 955 | * unlocked because ntfs_should_write_mft_record() never returns. )-: | ||
| 956 | * Fortunately, we have inode 0 pinned in icache for the duration of | ||
| 957 | * the mount so we can access it directly. | ||
| 958 | */ | 953 | */ |
| 959 | if (!mft_no) { | 954 | if (!mft_no) { |
| 960 | /* Balance the below iput(). */ | 955 | /* Balance the below iput(). */ |
| 961 | vi = igrab(mft_vi); | 956 | vi = igrab(mft_vi); |
| 962 | BUG_ON(vi != mft_vi); | 957 | BUG_ON(vi != mft_vi); |
| 963 | } else | 958 | } else { |
| 964 | vi = ilookup5(sb, mft_no, (test_t)ntfs_test_inode, &na); | 959 | /* |
| 960 | * Have to use ilookup5_nowait() since ilookup5() waits for the | ||
| 961 | * inode lock which causes ntfs to deadlock when a concurrent | ||
| 962 | * inode write via the inode dirty code paths and the page | ||
| 963 | * dirty code path of the inode dirty code path when writing | ||
| 964 | * $MFT occurs. | ||
| 965 | */ | ||
| 966 | vi = ilookup5_nowait(sb, mft_no, (test_t)ntfs_test_inode, &na); | ||
| 967 | } | ||
| 965 | if (vi) { | 968 | if (vi) { |
| 966 | ntfs_debug("Base inode 0x%lx is in icache.", mft_no); | 969 | ntfs_debug("Base inode 0x%lx is in icache.", mft_no); |
| 967 | /* The inode is in icache. */ | 970 | /* The inode is in icache. */ |
| @@ -1016,7 +1019,13 @@ BOOL ntfs_may_write_mft_record(ntfs_volume *vol, const unsigned long mft_no, | |||
| 1016 | na.mft_no = MREF_LE(m->base_mft_record); | 1019 | na.mft_no = MREF_LE(m->base_mft_record); |
| 1017 | ntfs_debug("Mft record 0x%lx is an extent record. Looking for base " | 1020 | ntfs_debug("Mft record 0x%lx is an extent record. Looking for base " |
| 1018 | "inode 0x%lx in icache.", mft_no, na.mft_no); | 1021 | "inode 0x%lx in icache.", mft_no, na.mft_no); |
| 1019 | vi = ilookup5(sb, na.mft_no, (test_t)ntfs_test_inode, &na); | 1022 | if (!na.mft_no) { |
| 1023 | /* Balance the below iput(). */ | ||
| 1024 | vi = igrab(mft_vi); | ||
| 1025 | BUG_ON(vi != mft_vi); | ||
| 1026 | } else | ||
| 1027 | vi = ilookup5_nowait(sb, na.mft_no, (test_t)ntfs_test_inode, | ||
| 1028 | &na); | ||
| 1020 | if (!vi) { | 1029 | if (!vi) { |
| 1021 | /* | 1030 | /* |
| 1022 | * The base inode is not in icache, write this extent mft | 1031 | * The base inode is not in icache, write this extent mft |
