aboutsummaryrefslogtreecommitdiffstats
path: root/fs/ext4/page-io.c
diff options
context:
space:
mode:
authorJeff Moyer <jmoyer@redhat.com>2012-03-05 10:29:52 -0500
committerTheodore Ts'o <tytso@mit.edu>2012-03-05 10:29:52 -0500
commit491caa43639abcffaa645fbab372a7ef4ce2975c (patch)
tree361ce8690a50681765a21fb6c0ed579034ae4b4e /fs/ext4/page-io.c
parent93ef8541d5c3ad1a73057ff358a49d0ee7146d6f (diff)
ext4: fix race between sync and completed io work
The following command line will leave the aio-stress process unkillable on an ext4 file system (in my case, mounted on /mnt/test): aio-stress -t 20 -s 10 -O -S -o 2 -I 1000 /mnt/test/aiostress.3561.4 /mnt/test/aiostress.3561.4.20 /mnt/test/aiostress.3561.4.19 /mnt/test/aiostress.3561.4.18 /mnt/test/aiostress.3561.4.17 /mnt/test/aiostress.3561.4.16 /mnt/test/aiostress.3561.4.15 /mnt/test/aiostress.3561.4.14 /mnt/test/aiostress.3561.4.13 /mnt/test/aiostress.3561.4.12 /mnt/test/aiostress.3561.4.11 /mnt/test/aiostress.3561.4.10 /mnt/test/aiostress.3561.4.9 /mnt/test/aiostress.3561.4.8 /mnt/test/aiostress.3561.4.7 /mnt/test/aiostress.3561.4.6 /mnt/test/aiostress.3561.4.5 /mnt/test/aiostress.3561.4.4 /mnt/test/aiostress.3561.4.3 /mnt/test/aiostress.3561.4.2 This is using the aio-stress program from the xfstests test suite. That particular command line tells aio-stress to do random writes to 20 files from 20 threads (one thread per file). The files are NOT preallocated, so you will get writes to random offsets within the file, thus creating holes and extending i_size. It also opens the file with O_DIRECT and O_SYNC. On to the problem. When an I/O requires unwritten extent conversion, it is queued onto the completed_io_list for the ext4 inode. Two code paths will pull work items from this list. The first is the ext4_end_io_work routine, and the second is ext4_flush_completed_IO, which is called via the fsync path (and O_SYNC handling, as well). There are two issues I've found in these code paths. First, if the fsync path beats the work routine to a particular I/O, the work routine will free the io_end structure! It does not take into account the fact that the io_end may still be in use by the fsync path. I've fixed this issue by adding yet another IO_END flag, indicating that the io_end is being processed by the fsync path. The second problem is that the work routine will make an assignment to io->flag outside of the lock. I have witnessed this result in a hang at umount. Moving the flag setting inside the lock resolved that problem. The problem was introduced by commit b82e384c7b ("ext4: optimize locking for end_io extent conversion"), which first appeared in 3.2. As such, the fix should be backported to that release (probably along with the unwritten extent conversion race fix). Signed-off-by: Jeff Moyer <jmoyer@redhat.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> CC: stable@kernel.org
Diffstat (limited to 'fs/ext4/page-io.c')
-rw-r--r--fs/ext4/page-io.c9
1 files changed, 7 insertions, 2 deletions
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 9e1b8eb1e7ac..dcdeef169a69 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -129,12 +129,18 @@ static void ext4_end_io_work(struct work_struct *work)
129 unsigned long flags; 129 unsigned long flags;
130 130
131 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 131 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
132 if (io->flag & EXT4_IO_END_IN_FSYNC)
133 goto requeue;
132 if (list_empty(&io->list)) { 134 if (list_empty(&io->list)) {
133 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 135 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
134 goto free; 136 goto free;
135 } 137 }
136 138
137 if (!mutex_trylock(&inode->i_mutex)) { 139 if (!mutex_trylock(&inode->i_mutex)) {
140 bool was_queued;
141requeue:
142 was_queued = !!(io->flag & EXT4_IO_END_QUEUED);
143 io->flag |= EXT4_IO_END_QUEUED;
138 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 144 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
139 /* 145 /*
140 * Requeue the work instead of waiting so that the work 146 * Requeue the work instead of waiting so that the work
@@ -147,9 +153,8 @@ static void ext4_end_io_work(struct work_struct *work)
147 * yield the cpu if it sees an end_io request that has already 153 * yield the cpu if it sees an end_io request that has already
148 * been requeued. 154 * been requeued.
149 */ 155 */
150 if (io->flag & EXT4_IO_END_QUEUED) 156 if (was_queued)
151 yield(); 157 yield();
152 io->flag |= EXT4_IO_END_QUEUED;
153 return; 158 return;
154 } 159 }
155 list_del_init(&io->list); 160 list_del_init(&io->list);