aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorAnton Altaparmakov <aia21@cam.ac.uk>2005-07-13 04:10:44 -0400
committerLinus Torvalds <torvalds@g5.osdl.org>2005-07-13 14:25:24 -0400
commit88bd5121d635136e01369141367f315665534b3c (patch)
tree33de8dda3d77ebf7bf48864c464a36665f66497c /fs
parent9a556e89081b0c1c2f83cee915363b15a68a6f2d (diff)
[PATCH] Fix soft lockup due to NTFS: VFS part and explanation
Something has changed in the core kernel such that we now get concurrent inode write outs, one e.g via pdflush and one via sys_sync or whatever. This causes a nasty deadlock in ntfs. The only clean solution unfortunately requires a minor vfs api extension. First the deadlock analysis: Prerequisive knowledge: NTFS has a file $MFT (inode 0) loaded at mount time. The NTFS driver uses the page cache for storing the file contents as usual. More interestingly this file contains the table of on-disk inodes as a sequence of MFT_RECORDs. Thus NTFS driver accesses the on-disk inodes by accessing the MFT_RECORDs in the page cache pages of the loaded inode $MFT. The situation: VFS inode X on a mounted ntfs volume is dirty. For same inode X, the ntfs_inode is dirty and thus corresponding on-disk inode, which is as explained above in a dirty PAGE_CACHE_PAGE belonging to the table of inodes ($MFT, inode 0). What happens: Process 1: sys_sync()/umount()/whatever... calls __sync_single_inode() for $MFT -> do_writepages() -> write_page for the dirty page containing the on-disk inode X, the page is now locked -> ntfs_write_mst_block() which clears PageUptodate() on the page to prevent anyone else getting hold of it whilst it does the write out (this is necessary as the on-disk inode needs "fixups" applied before the write to disk which are removed again after the write and PageUptodate is then set again). It then analyses the page looking for dirty on-disk inodes and when it finds one it calls ntfs_may_write_mft_record() to see if it is safe to write this on-disk inode. This then calls ilookup5() to check if the corresponding VFS inode is in icache(). This in turn calls ifind() which waits on the inode lock via wait_on_inode whilst holding the global inode_lock. Process 2: pdflush results in a call to __sync_single_inode for the same VFS inode X on the ntfs volume. This locks the inode (I_LOCK) then calls write-inode -> ntfs_write_inode -> map_mft_record() -> read_cache_page() of the page (in page cache of table of inodes $MFT, inode 0) containing the on-disk inode. This page has PageUptodate() clear because of Process 1 (see above) so read_cache_page() blocks when tries to take the page lock for the page so it can call ntfs_read_page(). Thus Process 1 is holding the page lock on the page containing the on-disk inode X and it is waiting on the inode X to be unlocked in ifind() so it can write the page out and then unlock the page. And Process 2 is holding the inode lock on inode X and is waiting for the page to be unlocked so it can call ntfs_readpage() or discover that Process 1 set PageUptodate() again and use the page. Thus we have a deadlock due to ifind() waiting on the inode lock. The only sensible solution: NTFS does not care whether the VFS inode is locked or not when it calls ilookup5() (it doesn't use the VFS inode at all, it just uses it to find the corresponding ntfs_inode which is of course attached to the VFS inode (both are one single struct); and it uses the ntfs_inode which is subject to its own locking so I_LOCK is irrelevant) hence we want a modified ilookup5_nowait() which is the same as ilookup5() but it does not wait on the inode lock. Without such functionality I would have to keep my own ntfs_inode cache in the NTFS driver just so I can find ntfs_inodes independent of their VFS inodes which would be slow, memory and cpu cycle wasting, and incredibly stupid given the icache already exists in the VFS. Below is a patch that does the ilookup5_nowait() implementation in fs/inode.c and exports it. ilookup5_nowait.diff: Introduce ilookup5_nowait() which is basically the same as ilookup5() but it does not wait on the inode's lock (i.e. it omits the wait_on_inode() done in ifind()). This is needed to avoid a nasty deadlock in NTFS. Signed-off-by: Anton Altaparmakov <aia21@cantab.net> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'fs')
-rw-r--r--fs/inode.c45
1 files changed, 39 insertions, 6 deletions
diff --git a/fs/inode.c b/fs/inode.c
index 96364fae0844..e57f1724db3e 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -757,6 +757,7 @@ EXPORT_SYMBOL(igrab);
757 * @head: the head of the list to search 757 * @head: the head of the list to search
758 * @test: callback used for comparisons between inodes 758 * @test: callback used for comparisons between inodes
759 * @data: opaque data pointer to pass to @test 759 * @data: opaque data pointer to pass to @test
760 * @wait: if true wait for the inode to be unlocked, if false do not
760 * 761 *
761 * ifind() searches for the inode specified by @data in the inode 762 * ifind() searches for the inode specified by @data in the inode
762 * cache. This is a generalized version of ifind_fast() for file systems where 763 * cache. This is a generalized version of ifind_fast() for file systems where
@@ -771,7 +772,7 @@ EXPORT_SYMBOL(igrab);
771 */ 772 */
772static inline struct inode *ifind(struct super_block *sb, 773static inline struct inode *ifind(struct super_block *sb,
773 struct hlist_head *head, int (*test)(struct inode *, void *), 774 struct hlist_head *head, int (*test)(struct inode *, void *),
774 void *data) 775 void *data, const int wait)
775{ 776{
776 struct inode *inode; 777 struct inode *inode;
777 778
@@ -780,7 +781,8 @@ static inline struct inode *ifind(struct super_block *sb,
780 if (inode) { 781 if (inode) {
781 __iget(inode); 782 __iget(inode);
782 spin_unlock(&inode_lock); 783 spin_unlock(&inode_lock);
783 wait_on_inode(inode); 784 if (likely(wait))
785 wait_on_inode(inode);
784 return inode; 786 return inode;
785 } 787 }
786 spin_unlock(&inode_lock); 788 spin_unlock(&inode_lock);
@@ -820,7 +822,7 @@ static inline struct inode *ifind_fast(struct super_block *sb,
820} 822}
821 823
822/** 824/**
823 * ilookup5 - search for an inode in the inode cache 825 * ilookup5_nowait - search for an inode in the inode cache
824 * @sb: super block of file system to search 826 * @sb: super block of file system to search
825 * @hashval: hash value (usually inode number) to search for 827 * @hashval: hash value (usually inode number) to search for
826 * @test: callback used for comparisons between inodes 828 * @test: callback used for comparisons between inodes
@@ -832,7 +834,38 @@ static inline struct inode *ifind_fast(struct super_block *sb,
832 * identification of an inode. 834 * identification of an inode.
833 * 835 *
834 * If the inode is in the cache, the inode is returned with an incremented 836 * If the inode is in the cache, the inode is returned with an incremented
835 * reference count. 837 * reference count. Note, the inode lock is not waited upon so you have to be
838 * very careful what you do with the returned inode. You probably should be
839 * using ilookup5() instead.
840 *
841 * Otherwise NULL is returned.
842 *
843 * Note, @test is called with the inode_lock held, so can't sleep.
844 */
845struct inode *ilookup5_nowait(struct super_block *sb, unsigned long hashval,
846 int (*test)(struct inode *, void *), void *data)
847{
848 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
849
850 return ifind(sb, head, test, data, 0);
851}
852
853EXPORT_SYMBOL(ilookup5_nowait);
854
855/**
856 * ilookup5 - search for an inode in the inode cache
857 * @sb: super block of file system to search
858 * @hashval: hash value (usually inode number) to search for
859 * @test: callback used for comparisons between inodes
860 * @data: opaque data pointer to pass to @test
861 *
862 * ilookup5() uses ifind() to search for the inode specified by @hashval and
863 * @data in the inode cache. This is a generalized version of ilookup() for
864 * file systems where the inode number is not sufficient for unique
865 * identification of an inode.
866 *
867 * If the inode is in the cache, the inode lock is waited upon and the inode is
868 * returned with an incremented reference count.
836 * 869 *
837 * Otherwise NULL is returned. 870 * Otherwise NULL is returned.
838 * 871 *
@@ -843,7 +876,7 @@ struct inode *ilookup5(struct super_block *sb, unsigned long hashval,
843{ 876{
844 struct hlist_head *head = inode_hashtable + hash(sb, hashval); 877 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
845 878
846 return ifind(sb, head, test, data); 879 return ifind(sb, head, test, data, 1);
847} 880}
848 881
849EXPORT_SYMBOL(ilookup5); 882EXPORT_SYMBOL(ilookup5);
@@ -900,7 +933,7 @@ struct inode *iget5_locked(struct super_block *sb, unsigned long hashval,
900 struct hlist_head *head = inode_hashtable + hash(sb, hashval); 933 struct hlist_head *head = inode_hashtable + hash(sb, hashval);
901 struct inode *inode; 934 struct inode *inode;
902 935
903 inode = ifind(sb, head, test, data); 936 inode = ifind(sb, head, test, data, 1);
904 if (inode) 937 if (inode)
905 return inode; 938 return inode;
906 /* 939 /*