diff options
author | Dmitry Monakhov <dmonakhov@openvz.org> | 2012-09-29 00:14:55 -0400 |
---|---|---|
committer | Theodore Ts'o <tytso@mit.edu> | 2012-09-29 00:14:55 -0400 |
commit | 28a535f9a0df060569dcc786e5bc2e1de43d7dc7 (patch) | |
tree | 07db5cecf251794492ae92c0714d400423feb5cc /fs/ext4 | |
parent | 82e54229118785badffb4ef5ba4803df25fe007f (diff) |
ext4: completed_io locking cleanup
Current unwritten extent conversion state-machine is very fuzzy.
- For unknown reason it performs conversion under i_mutex. What for?
My diagnosis:
We already protect extent tree with i_data_sem, truncate and punch_hole
should wait for DIO, so the only data we have to protect is end_io->flags
modification, but only flush_completed_IO and end_io_work modified this
flags and we can serialize them via i_completed_io_lock.
Currently all these games with mutex_trylock result in the following deadlock
truncate: kworker:
ext4_setattr ext4_end_io_work
mutex_lock(i_mutex)
inode_dio_wait(inode) ->BLOCK
DEADLOCK<- mutex_trylock()
inode_dio_done()
#TEST_CASE1_BEGIN
MNT=/mnt_scrach
unlink $MNT/file
fallocate -l $((1024*1024*1024)) $MNT/file
aio-stress -I 100000 -O -s 100m -n -t 1 -c 10 -o 2 -o 3 $MNT/file
sleep 2
truncate -s 0 $MNT/file
#TEST_CASE1_END
Or use 286's xfstests https://github.com/dmonakhov/xfstests/blob/devel/286
This patch makes state machine simple and clean:
(1) xxx_end_io schedule final extent conversion simply by calling
ext4_add_complete_io(), which append it to ei->i_completed_io_list
NOTE1: because of (2A) work should be queued only if
->i_completed_io_list was empty, otherwise the work is scheduled already.
(2) ext4_flush_completed_IO is responsible for handling all pending
end_io from ei->i_completed_io_list
Flushing sequence consists of following stages:
A) LOCKED: Atomically drain completed_io_list to local_list
B) Perform extents conversion
C) LOCKED: move converted io's to to_free list for final deletion
This logic depends on context which we was called from.
D) Final end_io context destruction
NOTE1: i_mutex is no longer required because end_io->flags modification
is protected by ei->ext4_complete_io_lock
Full list of changes:
- Move all completion end_io related routines to page-io.c in order to improve
logic locality
- Move open coded logic from various xx_end_xx routines to ext4_add_complete_io()
- remove EXT4_IO_END_FSYNC
- Improve SMP scalability by removing useless i_mutex which does not
protect io->flags anymore.
- Reduce lock contention on i_completed_io_lock by optimizing list walk.
- Rename ext4_end_io_nolock to end4_end_io and make it static
- Check flush completion status to ext4_ext_punch_hole(). Because it is
not good idea to punch blocks from corrupted inode.
Changes since V3 (in request to Jan's comments):
Fall back to active flush_completed_IO() approach in order to prevent
performance issues with nolocked DIO reads.
Changes since V2:
Fix use-after-free caused by race truncate vs end_io_work
Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
Diffstat (limited to 'fs/ext4')
-rw-r--r-- | fs/ext4/ext4.h | 3 | ||||
-rw-r--r-- | fs/ext4/extents.c | 4 | ||||
-rw-r--r-- | fs/ext4/fsync.c | 81 | ||||
-rw-r--r-- | fs/ext4/indirect.c | 6 | ||||
-rw-r--r-- | fs/ext4/inode.c | 25 | ||||
-rw-r--r-- | fs/ext4/page-io.c | 171 |
6 files changed, 121 insertions, 169 deletions
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6ec9856065d6..edb049579420 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h | |||
@@ -186,7 +186,6 @@ struct mpage_da_data { | |||
186 | #define EXT4_IO_END_ERROR 0x0002 | 186 | #define EXT4_IO_END_ERROR 0x0002 |
187 | #define EXT4_IO_END_QUEUED 0x0004 | 187 | #define EXT4_IO_END_QUEUED 0x0004 |
188 | #define EXT4_IO_END_DIRECT 0x0008 | 188 | #define EXT4_IO_END_DIRECT 0x0008 |
189 | #define EXT4_IO_END_IN_FSYNC 0x0010 | ||
190 | 189 | ||
191 | struct ext4_io_page { | 190 | struct ext4_io_page { |
192 | struct page *p_page; | 191 | struct page *p_page; |
@@ -2418,11 +2417,11 @@ extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, | |||
2418 | 2417 | ||
2419 | /* page-io.c */ | 2418 | /* page-io.c */ |
2420 | extern int __init ext4_init_pageio(void); | 2419 | extern int __init ext4_init_pageio(void); |
2420 | extern void ext4_add_complete_io(ext4_io_end_t *io_end); | ||
2421 | extern void ext4_exit_pageio(void); | 2421 | extern void ext4_exit_pageio(void); |
2422 | extern void ext4_ioend_wait(struct inode *); | 2422 | extern void ext4_ioend_wait(struct inode *); |
2423 | extern void ext4_free_io_end(ext4_io_end_t *io); | 2423 | extern void ext4_free_io_end(ext4_io_end_t *io); |
2424 | extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); | 2424 | extern ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags); |
2425 | extern int ext4_end_io_nolock(ext4_io_end_t *io); | ||
2426 | extern void ext4_io_submit(struct ext4_io_submit *io); | 2425 | extern void ext4_io_submit(struct ext4_io_submit *io); |
2427 | extern int ext4_bio_write_page(struct ext4_io_submit *io, | 2426 | extern int ext4_bio_write_page(struct ext4_io_submit *io, |
2428 | struct page *page, | 2427 | struct page *page, |
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 54a94426ef7b..232077439aa8 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c | |||
@@ -4833,7 +4833,9 @@ int ext4_ext_punch_hole(struct file *file, loff_t offset, loff_t length) | |||
4833 | } | 4833 | } |
4834 | 4834 | ||
4835 | /* finish any pending end_io work */ | 4835 | /* finish any pending end_io work */ |
4836 | ext4_flush_completed_IO(inode); | 4836 | err = ext4_flush_completed_IO(inode); |
4837 | if (err) | ||
4838 | return err; | ||
4837 | 4839 | ||
4838 | credits = ext4_writepage_trans_blocks(inode); | 4840 | credits = ext4_writepage_trans_blocks(inode); |
4839 | handle = ext4_journal_start(inode, credits); | 4841 | handle = ext4_journal_start(inode, credits); |
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index 323eb15c2d94..460000868b8e 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c | |||
@@ -34,87 +34,6 @@ | |||
34 | 34 | ||
35 | #include <trace/events/ext4.h> | 35 | #include <trace/events/ext4.h> |
36 | 36 | ||
37 | static void dump_completed_IO(struct inode * inode) | ||
38 | { | ||
39 | #ifdef EXT4FS_DEBUG | ||
40 | struct list_head *cur, *before, *after; | ||
41 | ext4_io_end_t *io, *io0, *io1; | ||
42 | unsigned long flags; | ||
43 | |||
44 | if (list_empty(&EXT4_I(inode)->i_completed_io_list)){ | ||
45 | ext4_debug("inode %lu completed_io list is empty\n", inode->i_ino); | ||
46 | return; | ||
47 | } | ||
48 | |||
49 | ext4_debug("Dump inode %lu completed_io list \n", inode->i_ino); | ||
50 | spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); | ||
51 | list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list){ | ||
52 | cur = &io->list; | ||
53 | before = cur->prev; | ||
54 | io0 = container_of(before, ext4_io_end_t, list); | ||
55 | after = cur->next; | ||
56 | io1 = container_of(after, ext4_io_end_t, list); | ||
57 | |||
58 | ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", | ||
59 | io, inode->i_ino, io0, io1); | ||
60 | } | ||
61 | spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); | ||
62 | #endif | ||
63 | } | ||
64 | |||
65 | /* | ||
66 | * This function is called from ext4_sync_file(). | ||
67 | * | ||
68 | * When IO is completed, the work to convert unwritten extents to | ||
69 | * written is queued on workqueue but may not get immediately | ||
70 | * scheduled. When fsync is called, we need to ensure the | ||
71 | * conversion is complete before fsync returns. | ||
72 | * The inode keeps track of a list of pending/completed IO that | ||
73 | * might needs to do the conversion. This function walks through | ||
74 | * the list and convert the related unwritten extents for completed IO | ||
75 | * to written. | ||
76 | * The function return the number of pending IOs on success. | ||
77 | */ | ||
78 | int ext4_flush_completed_IO(struct inode *inode) | ||
79 | { | ||
80 | ext4_io_end_t *io; | ||
81 | struct ext4_inode_info *ei = EXT4_I(inode); | ||
82 | unsigned long flags; | ||
83 | int ret = 0; | ||
84 | int ret2 = 0; | ||
85 | |||
86 | dump_completed_IO(inode); | ||
87 | spin_lock_irqsave(&ei->i_completed_io_lock, flags); | ||
88 | while (!list_empty(&ei->i_completed_io_list)){ | ||
89 | io = list_entry(ei->i_completed_io_list.next, | ||
90 | ext4_io_end_t, list); | ||
91 | list_del_init(&io->list); | ||
92 | io->flag |= EXT4_IO_END_IN_FSYNC; | ||
93 | /* | ||
94 | * Calling ext4_end_io_nolock() to convert completed | ||
95 | * IO to written. | ||
96 | * | ||
97 | * When ext4_sync_file() is called, run_queue() may already | ||
98 | * about to flush the work corresponding to this io structure. | ||
99 | * It will be upset if it founds the io structure related | ||
100 | * to the work-to-be schedule is freed. | ||
101 | * | ||
102 | * Thus we need to keep the io structure still valid here after | ||
103 | * conversion finished. The io structure has a flag to | ||
104 | * avoid double converting from both fsync and background work | ||
105 | * queue work. | ||
106 | */ | ||
107 | spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); | ||
108 | ret = ext4_end_io_nolock(io); | ||
109 | if (ret < 0) | ||
110 | ret2 = ret; | ||
111 | spin_lock_irqsave(&ei->i_completed_io_lock, flags); | ||
112 | io->flag &= ~EXT4_IO_END_IN_FSYNC; | ||
113 | } | ||
114 | spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); | ||
115 | return (ret2 < 0) ? ret2 : 0; | ||
116 | } | ||
117 | |||
118 | /* | 37 | /* |
119 | * If we're not journaling and this is a just-created file, we have to | 38 | * If we're not journaling and this is a just-created file, we have to |
120 | * sync our parent directory (if it was freshly created) since | 39 | * sync our parent directory (if it was freshly created) since |
diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 830e1b2bf145..61f13e57975e 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c | |||
@@ -807,11 +807,9 @@ ssize_t ext4_ind_direct_IO(int rw, struct kiocb *iocb, | |||
807 | 807 | ||
808 | retry: | 808 | retry: |
809 | if (rw == READ && ext4_should_dioread_nolock(inode)) { | 809 | if (rw == READ && ext4_should_dioread_nolock(inode)) { |
810 | if (unlikely(!list_empty(&ei->i_completed_io_list))) { | 810 | if (unlikely(!list_empty(&ei->i_completed_io_list))) |
811 | mutex_lock(&inode->i_mutex); | ||
812 | ext4_flush_completed_IO(inode); | 811 | ext4_flush_completed_IO(inode); |
813 | mutex_unlock(&inode->i_mutex); | 812 | |
814 | } | ||
815 | ret = __blockdev_direct_IO(rw, iocb, inode, | 813 | ret = __blockdev_direct_IO(rw, iocb, inode, |
816 | inode->i_sb->s_bdev, iov, | 814 | inode->i_sb->s_bdev, iov, |
817 | offset, nr_segs, | 815 | offset, nr_segs, |
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a99588673566..09d0488e9a15 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c | |||
@@ -2881,9 +2881,6 @@ static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, | |||
2881 | { | 2881 | { |
2882 | struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; | 2882 | struct inode *inode = iocb->ki_filp->f_path.dentry->d_inode; |
2883 | ext4_io_end_t *io_end = iocb->private; | 2883 | ext4_io_end_t *io_end = iocb->private; |
2884 | struct workqueue_struct *wq; | ||
2885 | unsigned long flags; | ||
2886 | struct ext4_inode_info *ei; | ||
2887 | 2884 | ||
2888 | /* if not async direct IO or dio with 0 bytes write, just return */ | 2885 | /* if not async direct IO or dio with 0 bytes write, just return */ |
2889 | if (!io_end || !size) | 2886 | if (!io_end || !size) |
@@ -2912,24 +2909,14 @@ out: | |||
2912 | io_end->iocb = iocb; | 2909 | io_end->iocb = iocb; |
2913 | io_end->result = ret; | 2910 | io_end->result = ret; |
2914 | } | 2911 | } |
2915 | wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; | ||
2916 | |||
2917 | /* Add the io_end to per-inode completed aio dio list*/ | ||
2918 | ei = EXT4_I(io_end->inode); | ||
2919 | spin_lock_irqsave(&ei->i_completed_io_lock, flags); | ||
2920 | list_add_tail(&io_end->list, &ei->i_completed_io_list); | ||
2921 | spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); | ||
2922 | 2912 | ||
2923 | /* queue the work to convert unwritten extents to written */ | 2913 | ext4_add_complete_io(io_end); |
2924 | queue_work(wq, &io_end->work); | ||
2925 | } | 2914 | } |
2926 | 2915 | ||
2927 | static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) | 2916 | static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) |
2928 | { | 2917 | { |
2929 | ext4_io_end_t *io_end = bh->b_private; | 2918 | ext4_io_end_t *io_end = bh->b_private; |
2930 | struct workqueue_struct *wq; | ||
2931 | struct inode *inode; | 2919 | struct inode *inode; |
2932 | unsigned long flags; | ||
2933 | 2920 | ||
2934 | if (!test_clear_buffer_uninit(bh) || !io_end) | 2921 | if (!test_clear_buffer_uninit(bh) || !io_end) |
2935 | goto out; | 2922 | goto out; |
@@ -2948,15 +2935,7 @@ static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate) | |||
2948 | */ | 2935 | */ |
2949 | inode = io_end->inode; | 2936 | inode = io_end->inode; |
2950 | ext4_set_io_unwritten_flag(inode, io_end); | 2937 | ext4_set_io_unwritten_flag(inode, io_end); |
2951 | 2938 | ext4_add_complete_io(io_end); | |
2952 | /* Add the io_end to per-inode completed io list*/ | ||
2953 | spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); | ||
2954 | list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list); | ||
2955 | spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); | ||
2956 | |||
2957 | wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq; | ||
2958 | /* queue the work to convert unwritten extents to written */ | ||
2959 | queue_work(wq, &io_end->work); | ||
2960 | out: | 2939 | out: |
2961 | bh->b_private = NULL; | 2940 | bh->b_private = NULL; |
2962 | bh->b_end_io = NULL; | 2941 | bh->b_end_io = NULL; |
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 997002218228..5b24c407701b 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c | |||
@@ -71,6 +71,7 @@ void ext4_free_io_end(ext4_io_end_t *io) | |||
71 | int i; | 71 | int i; |
72 | 72 | ||
73 | BUG_ON(!io); | 73 | BUG_ON(!io); |
74 | BUG_ON(!list_empty(&io->list)); | ||
74 | BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN); | 75 | BUG_ON(io->flag & EXT4_IO_END_UNWRITTEN); |
75 | 76 | ||
76 | if (io->page) | 77 | if (io->page) |
@@ -83,21 +84,14 @@ void ext4_free_io_end(ext4_io_end_t *io) | |||
83 | kmem_cache_free(io_end_cachep, io); | 84 | kmem_cache_free(io_end_cachep, io); |
84 | } | 85 | } |
85 | 86 | ||
86 | /* | 87 | /* check a range of space and convert unwritten extents to written. */ |
87 | * check a range of space and convert unwritten extents to written. | 88 | static int ext4_end_io(ext4_io_end_t *io) |
88 | * | ||
89 | * Called with inode->i_mutex; we depend on this when we manipulate | ||
90 | * io->flag, since we could otherwise race with ext4_flush_completed_IO() | ||
91 | */ | ||
92 | int ext4_end_io_nolock(ext4_io_end_t *io) | ||
93 | { | 89 | { |
94 | struct inode *inode = io->inode; | 90 | struct inode *inode = io->inode; |
95 | loff_t offset = io->offset; | 91 | loff_t offset = io->offset; |
96 | ssize_t size = io->size; | 92 | ssize_t size = io->size; |
97 | int ret = 0; | 93 | int ret = 0; |
98 | 94 | ||
99 | BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN)); | ||
100 | |||
101 | ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," | 95 | ext4_debug("ext4_end_io_nolock: io 0x%p from inode %lu,list->next 0x%p," |
102 | "list->prev 0x%p\n", | 96 | "list->prev 0x%p\n", |
103 | io, inode->i_ino, io->list.next, io->list.prev); | 97 | io, inode->i_ino, io->list.next, io->list.prev); |
@@ -110,7 +104,6 @@ int ext4_end_io_nolock(ext4_io_end_t *io) | |||
110 | "(inode %lu, offset %llu, size %zd, error %d)", | 104 | "(inode %lu, offset %llu, size %zd, error %d)", |
111 | inode->i_ino, offset, size, ret); | 105 | inode->i_ino, offset, size, ret); |
112 | } | 106 | } |
113 | io->flag &= ~EXT4_IO_END_UNWRITTEN; | ||
114 | if (io->iocb) | 107 | if (io->iocb) |
115 | aio_complete(io->iocb, io->result, 0); | 108 | aio_complete(io->iocb, io->result, 0); |
116 | 109 | ||
@@ -122,51 +115,122 @@ int ext4_end_io_nolock(ext4_io_end_t *io) | |||
122 | return ret; | 115 | return ret; |
123 | } | 116 | } |
124 | 117 | ||
125 | /* | 118 | static void dump_completed_IO(struct inode *inode) |
126 | * work on completed aio dio IO, to convert unwritten extents to extents | 119 | { |
127 | */ | 120 | #ifdef EXT4FS_DEBUG |
128 | static void ext4_end_io_work(struct work_struct *work) | 121 | struct list_head *cur, *before, *after; |
122 | ext4_io_end_t *io, *io0, *io1; | ||
123 | unsigned long flags; | ||
124 | |||
125 | if (list_empty(&EXT4_I(inode)->i_completed_io_list)) { | ||
126 | ext4_debug("inode %lu completed_io list is empty\n", | ||
127 | inode->i_ino); | ||
128 | return; | ||
129 | } | ||
130 | |||
131 | ext4_debug("Dump inode %lu completed_io list\n", inode->i_ino); | ||
132 | list_for_each_entry(io, &EXT4_I(inode)->i_completed_io_list, list) { | ||
133 | cur = &io->list; | ||
134 | before = cur->prev; | ||
135 | io0 = container_of(before, ext4_io_end_t, list); | ||
136 | after = cur->next; | ||
137 | io1 = container_of(after, ext4_io_end_t, list); | ||
138 | |||
139 | ext4_debug("io 0x%p from inode %lu,prev 0x%p,next 0x%p\n", | ||
140 | io, inode->i_ino, io0, io1); | ||
141 | } | ||
142 | #endif | ||
143 | } | ||
144 | |||
145 | /* Add the io_end to per-inode completed end_io list. */ | ||
146 | void ext4_add_complete_io(ext4_io_end_t *io_end) | ||
129 | { | 147 | { |
130 | ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); | 148 | struct ext4_inode_info *ei = EXT4_I(io_end->inode); |
131 | struct inode *inode = io->inode; | 149 | struct workqueue_struct *wq; |
132 | struct ext4_inode_info *ei = EXT4_I(inode); | 150 | unsigned long flags; |
133 | unsigned long flags; | 151 | |
152 | BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); | ||
153 | wq = EXT4_SB(io_end->inode->i_sb)->dio_unwritten_wq; | ||
134 | 154 | ||
135 | spin_lock_irqsave(&ei->i_completed_io_lock, flags); | 155 | spin_lock_irqsave(&ei->i_completed_io_lock, flags); |
136 | if (io->flag & EXT4_IO_END_IN_FSYNC) | 156 | if (list_empty(&ei->i_completed_io_list)) { |
137 | goto requeue; | 157 | io_end->flag |= EXT4_IO_END_QUEUED; |
138 | if (list_empty(&io->list)) { | 158 | queue_work(wq, &io_end->work); |
139 | spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); | ||
140 | goto free; | ||
141 | } | 159 | } |
160 | list_add_tail(&io_end->list, &ei->i_completed_io_list); | ||
161 | spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); | ||
162 | } | ||
142 | 163 | ||
143 | if (!mutex_trylock(&inode->i_mutex)) { | 164 | static int ext4_do_flush_completed_IO(struct inode *inode, |
144 | bool was_queued; | 165 | ext4_io_end_t *work_io) |
145 | requeue: | 166 | { |
146 | was_queued = !!(io->flag & EXT4_IO_END_QUEUED); | 167 | ext4_io_end_t *io; |
147 | io->flag |= EXT4_IO_END_QUEUED; | 168 | struct list_head unwritten, complete, to_free; |
148 | spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); | 169 | unsigned long flags; |
149 | /* | 170 | struct ext4_inode_info *ei = EXT4_I(inode); |
150 | * Requeue the work instead of waiting so that the work | 171 | int err, ret = 0; |
151 | * items queued after this can be processed. | 172 | |
152 | */ | 173 | INIT_LIST_HEAD(&complete); |
153 | queue_work(EXT4_SB(inode->i_sb)->dio_unwritten_wq, &io->work); | 174 | INIT_LIST_HEAD(&to_free); |
154 | /* | 175 | |
155 | * To prevent the ext4-dio-unwritten thread from keeping | 176 | spin_lock_irqsave(&ei->i_completed_io_lock, flags); |
156 | * requeueing end_io requests and occupying cpu for too long, | 177 | dump_completed_IO(inode); |
157 | * yield the cpu if it sees an end_io request that has already | 178 | list_replace_init(&ei->i_completed_io_list, &unwritten); |
158 | * been requeued. | 179 | spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); |
159 | */ | 180 | |
160 | if (was_queued) | 181 | while (!list_empty(&unwritten)) { |
161 | yield(); | 182 | io = list_entry(unwritten.next, ext4_io_end_t, list); |
162 | return; | 183 | BUG_ON(!(io->flag & EXT4_IO_END_UNWRITTEN)); |
184 | list_del_init(&io->list); | ||
185 | |||
186 | err = ext4_end_io(io); | ||
187 | if (unlikely(!ret && err)) | ||
188 | ret = err; | ||
189 | |||
190 | list_add_tail(&io->list, &complete); | ||
191 | } | ||
192 | /* It is important to update all flags for all end_io in one shot w/o | ||
193 | * dropping the lock.*/ | ||
194 | spin_lock_irqsave(&ei->i_completed_io_lock, flags); | ||
195 | while (!list_empty(&complete)) { | ||
196 | io = list_entry(complete.next, ext4_io_end_t, list); | ||
197 | io->flag &= ~EXT4_IO_END_UNWRITTEN; | ||
198 | /* end_io context can not be destroyed now because it still | ||
199 | * used by queued worker. Worker thread will destroy it later */ | ||
200 | if (io->flag & EXT4_IO_END_QUEUED) | ||
201 | list_del_init(&io->list); | ||
202 | else | ||
203 | list_move(&io->list, &to_free); | ||
204 | } | ||
205 | /* If we are called from worker context, it is time to clear queued | ||
206 | * flag, and destroy it's end_io if it was converted already */ | ||
207 | if (work_io) { | ||
208 | work_io->flag &= ~EXT4_IO_END_QUEUED; | ||
209 | if (!(work_io->flag & EXT4_IO_END_UNWRITTEN)) | ||
210 | list_add_tail(&work_io->list, &to_free); | ||
163 | } | 211 | } |
164 | list_del_init(&io->list); | ||
165 | spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); | 212 | spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); |
166 | (void) ext4_end_io_nolock(io); | 213 | |
167 | mutex_unlock(&inode->i_mutex); | 214 | while (!list_empty(&to_free)) { |
168 | free: | 215 | io = list_entry(to_free.next, ext4_io_end_t, list); |
169 | ext4_free_io_end(io); | 216 | list_del_init(&io->list); |
217 | ext4_free_io_end(io); | ||
218 | } | ||
219 | return ret; | ||
220 | } | ||
221 | |||
222 | /* | ||
223 | * work on completed aio dio IO, to convert unwritten extents to extents | ||
224 | */ | ||
225 | static void ext4_end_io_work(struct work_struct *work) | ||
226 | { | ||
227 | ext4_io_end_t *io = container_of(work, ext4_io_end_t, work); | ||
228 | ext4_do_flush_completed_IO(io->inode, io); | ||
229 | } | ||
230 | |||
231 | int ext4_flush_completed_IO(struct inode *inode) | ||
232 | { | ||
233 | return ext4_do_flush_completed_IO(inode, NULL); | ||
170 | } | 234 | } |
171 | 235 | ||
172 | ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) | 236 | ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) |
@@ -199,9 +263,7 @@ static void buffer_io_error(struct buffer_head *bh) | |||
199 | static void ext4_end_bio(struct bio *bio, int error) | 263 | static void ext4_end_bio(struct bio *bio, int error) |
200 | { | 264 | { |
201 | ext4_io_end_t *io_end = bio->bi_private; | 265 | ext4_io_end_t *io_end = bio->bi_private; |
202 | struct workqueue_struct *wq; | ||
203 | struct inode *inode; | 266 | struct inode *inode; |
204 | unsigned long flags; | ||
205 | int i; | 267 | int i; |
206 | sector_t bi_sector = bio->bi_sector; | 268 | sector_t bi_sector = bio->bi_sector; |
207 | 269 | ||
@@ -259,14 +321,7 @@ static void ext4_end_bio(struct bio *bio, int error) | |||
259 | return; | 321 | return; |
260 | } | 322 | } |
261 | 323 | ||
262 | /* Add the io_end to per-inode completed io list*/ | 324 | ext4_add_complete_io(io_end); |
263 | spin_lock_irqsave(&EXT4_I(inode)->i_completed_io_lock, flags); | ||
264 | list_add_tail(&io_end->list, &EXT4_I(inode)->i_completed_io_list); | ||
265 | spin_unlock_irqrestore(&EXT4_I(inode)->i_completed_io_lock, flags); | ||
266 | |||
267 | wq = EXT4_SB(inode->i_sb)->dio_unwritten_wq; | ||
268 | /* queue the work to convert unwritten extents to written */ | ||
269 | queue_work(wq, &io_end->work); | ||
270 | } | 325 | } |
271 | 326 | ||
272 | void ext4_io_submit(struct ext4_io_submit *io) | 327 | void ext4_io_submit(struct ext4_io_submit *io) |