aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ext4/inode.c
diff options
context:
space:
mode:
authorDmitry Monakhov <dmonakhov@openvz.org>2012-09-29 00:14:55 -0400
committerTheodore Ts'o <tytso@mit.edu>2012-09-29 00:14:55 -0400
commit28a535f9a0df060569dcc786e5bc2e1de43d7dc7 (patch)
tree07db5cecf251794492ae92c0714d400423feb5cc /fs/ext4/inode.c
parent82e54229118785badffb4ef5ba4803df25fe007f (diff)
ext4: completed_io locking cleanup
Current unwritten extent conversion state-machine is very fuzzy. - For unknown reason it performs conversion under i_mutex. What for? My diagnosis: We already protect extent tree with i_data_sem, truncate and punch_hole should wait for DIO, so the only data we have to protect is end_io->flags modification, but only flush_completed_IO and end_io_work modified this flags and we can serialize them via i_completed_io_lock. Currently all these games with mutex_trylock result in the following deadlock truncate: kworker: ext4_setattr ext4_end_io_work mutex_lock(i_mutex) inode_dio_wait(inode) ->BLOCK DEADLOCK<- mutex_trylock() inode_dio_done() #TEST_CASE1_BEGIN MNT=/mnt_scrach unlink $MNT/file fallocate -l $((1024*1024*1024)) $MNT/file aio-stress -I 100000 -O -s 100m -n -t 1 -c 10 -o 2 -o 3 $MNT/file sleep 2 truncate -s 0 $MNT/file #TEST_CASE1_END Or use 286's xfstests https://github.com/dmonakhov/xfstests/blob/devel/286 This patch makes state machine simple and clean: (1) xxx_end_io schedule final extent conversion simply by calling ext4_add_complete_io(), which append it to ei->i_completed_io_list NOTE1: because of (2A) work should be queued only if ->i_completed_io_list was empty, otherwise the work is scheduled already. (2) ext4_flush_completed_IO is responsible for handling all pending end_io from ei->i_completed_io_list Flushing sequence consists of following stages: A) LOCKED: Atomically drain completed_io_list to local_list B) Perform extents conversion C) LOCKED: move converted io's to to_free list for final deletion This logic depends on context which we was called from. D) Final end_io context destruction NOTE1: i_mutex is no longer required because end_io->flags modification is protected by ei->ext4_complete_io_lock Full list of changes: - Move all completion end_io related routines to page-io.c in order to improve logic locality - Move open coded logic from various xx_end_xx routines to ext4_add_complete_io() - remove EXT4_IO_END_FSYNC - Improve SMP scalability by removing useless i_mutex which does not protect io->flags anymore. - Reduce lock contention on i_completed_io_lock by optimizing list walk. - Rename ext4_end_io_nolock to end4_end_io and make it static - Check flush completion status to ext4_ext_punch_hole(). Because it is not good idea to punch blocks from corrupted inode. Changes since V3 (in request to Jan's comments): Fall back to active flush_completed_IO() approach in order to prevent performance issues with nolocked DIO reads. Changes since V2: Fix use-after-free caused by race truncate vs end_io_work Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Diffstat (limited to 'fs/ext4/inode.c')
-rw-r--r--fs/ext4/inode.c25
1 files changed, 2 insertions, 23 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index a99588673566..09d0488e9a15 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -2881,9 +2881,6 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset,
2881{ 2881{
2882 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; 2882 struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode;
2883 ext4_io_end_t *io_end = iocb->private; 2883 ext4_io_end_t *io_end = iocb->private;
2884 struct workqueue_struct *wq;
2885 unsigned long flags;
2886 struct ext4_inode_info *ei;
2887 2884
2888 /* if not async direct IO or dio with 0 bytes write, just return */ 2885 /* if not async direct IO or dio with 0 bytes write, just return */
2889 if (!io_end || !size) 2886 if (!io_end || !size)
@@ -2912,24 +2909,14 @@ out:
2912 io_end->iocb = iocb; 2909 io_end->iocb = iocb;
2913 io_end->result = ret; 2910 io_end->result = ret;
2914 } 2911 }
2915 wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq;
2916
2917 /* Add the io_end to per-inode completed aio dio list*/
2918 ei = EXT4_I(io_end->inode);
2919 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
2920 list_add_tail(&io_end->list, &ei->i_completed_io_list);
2921 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
2922 2912
2923 /* queue the work to convert unwritten extents to written */ 2913 ext4_add_complete_io(io_end);
2924 queue_work(wq, &io_end->work);
2925} 2914}
2926 2915
2927static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) 2916static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
2928{ 2917{
2929 ext4_io_end_t *io_end = bh->b_private; 2918 ext4_io_end_t *io_end = bh->b_private;
2930 struct workqueue_struct *wq;
2931 struct inode *inode; 2919 struct inode *inode;
2932 unsigned long flags;
2933 2920
2934 if (!test_clear_buffer_uninit(bh) || !io_end) 2921 if (!test_clear_buffer_uninit(bh) || !io_end)
2935 goto out; 2922 goto out;
@@ -2948,15 +2935,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
2948 */ 2935 */
2949 inode = io_end->inode; 2936 inode = io_end->inode;
2950 ext4_set_io_unwritten_flag(inode, io_end); 2937 ext4_set_io_unwritten_flag(inode, io_end);
2951 2938 ext4_add_complete_io(io_end);
2952 /* Add the io_end to per-inode completed io list*/
2953 spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags);
2954 list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list);
2955 spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags);
2956
2957 wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq;
2958 /* queue the work to convert unwritten extents to written */
2959 queue_work(wq, &io_end->work);
2960out: 2939out:
2961 bh->b_private = NULL; 2940 bh->b_private = NULL;
2962 bh->b_end_io = NULL; 2941 bh->b_end_io = NULL;