diff options
author | Dmitry Monakhov <dmonakhov@openvz.org> | 2012-09-29 00:14:55 -0400 |
---|---|---|
committer | Theodore Ts'o <tytso@mit.edu> | 2012-09-29 00:14:55 -0400 |
commit | 28a535f9a0df060569dcc786e5bc2e1de43d7dc7 (patch) | |
tree | 07db5cecf251794492ae92c0714d400423feb5cc /fs/ext4/inode.c | |
parent | 82e54229118785badffb4ef5ba4803df25fe007f (diff) |
ext4: completed_io locking cleanup
Current unwritten extent conversion state-machine is very fuzzy.
- For unknown reason it performs conversion under i_mutex. What for?
My diagnosis:
We already protect extent tree with i_data_sem, truncate and punch_hole
should wait for DIO, so the only data we have to protect is end_io->flags
modification, but only flush_completed_IO and end_io_work modified this
flags and we can serialize them via i_completed_io_lock.
Currently all these games with mutex_trylock result in the following deadlock
truncate: kworker:
ext4_setattr ext4_end_io_work
mutex_lock(i_mutex)
inode_dio_wait(inode) ->BLOCK
DEADLOCK<- mutex_trylock()
inode_dio_done()
#TEST_CASE1_BEGIN
MNT=/mnt_scrach
unlink $MNT/file
fallocate -l $((1024*1024*1024)) $MNT/file
aio-stress -I 100000 -O -s 100m -n -t 1 -c 10 -o 2 -o 3 $MNT/file
sleep 2
truncate -s 0 $MNT/file
#TEST_CASE1_END
Or use 286's xfstests https://github.com/dmonakhov/xfstests/blob/devel/286
This patch makes state machine simple and clean:
(1) xxx_end_io schedule final extent conversion simply by calling
ext4_add_complete_io(), which append it to ei->i_completed_io_list
NOTE1: because of (2A) work should be queued only if
->i_completed_io_list was empty, otherwise the work is scheduled already.
(2) ext4_flush_completed_IO is responsible for handling all pending
end_io from ei->i_completed_io_list
Flushing sequence consists of following stages:
A) LOCKED: Atomically drain completed_io_list to local_list
B) Perform extents conversion
C) LOCKED: move converted io's to to_free list for final deletion
This logic depends on context which we was called from.
D) Final end_io context destruction
NOTE1: i_mutex is no longer required because end_io->flags modification
is protected by ei->ext4_complete_io_lock
Full list of changes:
- Move all completion end_io related routines to page-io.c in order to improve
logic locality
- Move open coded logic from various xx_end_xx routines to ext4_add_complete_io()
- remove EXT4_IO_END_FSYNC
- Improve SMP scalability by removing useless i_mutex which does not
protect io->flags anymore.
- Reduce lock contention on i_completed_io_lock by optimizing list walk.
- Rename ext4_end_io_nolock to end4_end_io and make it static
- Check flush completion status to ext4_ext_punch_hole(). Because it is
not good idea to punch blocks from corrupted inode.
Changes since V3 (in request to Jan's comments):
Fall back to active flush_completed_IO() approach in order to prevent
performance issues with nolocked DIO reads.
Changes since V2:
Fix use-after-free caused by race truncate vs end_io_work
Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Diffstat (limited to 'fs/ext4/inode.c')
-rw-r--r-- | fs/ext4/inode.c | 25 |
1 files changed, 2 insertions, 23 deletions
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a99588673566..09d0488e9a15 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
@@ -2881,9 +2881,6 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, | |||
2881 | { | 2881 | { |
2882 | struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; | 2882 | struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; |
2883 | ext4_io_end_t *io_end = iocb->private; | 2883 | ext4_io_end_t *io_end = iocb->private; |
2884 | struct workqueue_struct *wq; | ||
2885 | unsigned long flags; | ||
2886 | struct ext4_inode_info *ei; | ||
2887 | 2884 | ||
2888 | /* if not async direct IO or dio with 0 bytes write, just return */ | 2885 | /* if not async direct IO or dio with 0 bytes write, just return */ |
2889 | if (!io_end || !size) | 2886 | if (!io_end || !size) |
@@ -2912,24 +2909,14 @@ out: | |||
2912 | io_end->iocb = iocb; | 2909 | io_end->iocb = iocb; |
2913 | io_end->result = ret; | 2910 | io_end->result = ret; |
2914 | } | 2911 | } |
2915 | wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; | ||
2916 | |||
2917 | /* Add the io_end to per-inode completed aio dio list*/ | ||
2918 | ei = EXT4_I(io_end->inode); | ||
2919 | spin_lock_irqsave(&ei->i_completed_io_lock, flags); | ||
2920 | list_add_tail(&io_end->list, &ei->i_completed_io_list); | ||
2921 | spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); | ||
2922 | 2912 | ||
2923 | /* queue the work to convert unwritten extents to written */ | 2913 | ext4_add_complete_io(io_end); |
2924 | queue_work(wq, &io_end->work); | ||
2925 | } | 2914 | } |
2926 | 2915 | ||
2927 | static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) | 2916 | static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) |
2928 | { | 2917 | { |
2929 | ext4_io_end_t *io_end = bh->b_private; | 2918 | ext4_io_end_t *io_end = bh->b_private; |
2930 | struct workqueue_struct *wq; | ||
2931 | struct inode *inode; | 2919 | struct inode *inode; |
2932 | unsigned long flags; | ||
2933 | 2920 | ||
2934 | if (!test_clear_buffer_uninit(bh) || !io_end) | 2921 | if (!test_clear_buffer_uninit(bh) || !io_end) |
2935 | goto out; | 2922 | goto out; |
@@ -2948,15 +2935,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) | |||
2948 | */ | 2935 | */ |
2949 | inode = io_end->inode; | 2936 | inode = io_end->inode; |
2950 | ext4_set_io_unwritten_flag(inode, io_end); | 2937 | ext4_set_io_unwritten_flag(inode, io_end); |
2951 | 2938 | ext4_add_complete_io(io_end); | |
2952 | /* Add the io_end to per-inode completed io list*/ | ||
2953 | spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); | ||
2954 | list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list); | ||
2955 | spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); | ||
2956 | |||
2957 | wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq; | ||
2958 | /* queue the work to convert unwritten extents to written */ | ||
2959 | queue_work(wq, &io_end->work); | ||
2960 | out: | 2939 | out: |
2961 | bh->b_private = NULL; | 2940 | bh->b_private = NULL; |
2962 | bh->b_end_io = NULL; | 2941 | bh->b_end_io = NULL; |