aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorJeff Moyer <jmoyer@redhat.com>2012-03-05 10:29:52 -0500
committerTheodore Ts'o <tytso@mit.edu>2012-03-05 10:29:52 -0500
commit491caa43639abcffaa645fbab372a7ef4ce2975c (patch)
tree361ce8690a50681765a21fb6c0ed579034ae4b4e /fs
parent93ef8541d5c3ad1a73057ff358a49d0ee7146d6f (diff)
ext4: fix race between sync and completed io work
The following command line will leave the aio-stress process unkillable on an ext4 file system (in my case, mounted on /mnt/test): aio-stress -t 20 -s 10 -O -S -o 2 -I 1000 /mnt/test/aiostress.3561.4 /mnt/test/aiostress.3561.4.20 /mnt/test/aiostress.3561.4.19 /mnt/test/aiostress.3561.4.18 /mnt/test/aiostress.3561.4.17 /mnt/test/aiostress.3561.4.16 /mnt/test/aiostress.3561.4.15 /mnt/test/aiostress.3561.4.14 /mnt/test/aiostress.3561.4.13 /mnt/test/aiostress.3561.4.12 /mnt/test/aiostress.3561.4.11 /mnt/test/aiostress.3561.4.10 /mnt/test/aiostress.3561.4.9 /mnt/test/aiostress.3561.4.8 /mnt/test/aiostress.3561.4.7 /mnt/test/aiostress.3561.4.6 /mnt/test/aiostress.3561.4.5 /mnt/test/aiostress.3561.4.4 /mnt/test/aiostress.3561.4.3 /mnt/test/aiostress.3561.4.2 This is using the aio-stress program from the xfstests test suite. That particular command line tells aio-stress to do random writes to 20 files from 20 threads (one thread per file). The files are NOT preallocated, so you will get writes to random offsets within the file, thus creating holes and extending i_size. It also opens the file with O_DIRECT and O_SYNC. On to the problem. When an I/O requires unwritten extent conversion, it is queued onto the completed_io_list for the ext4 inode. Two code paths will pull work items from this list. The first is the ext4_end_io_work routine, and the second is ext4_flush_completed_IO, which is called via the fsync path (and O_SYNC handling, as well). There are two issues I've found in these code paths. First, if the fsync path beats the work routine to a particular I/O, the work routine will free the io_end structure! It does not take into account the fact that the io_end may still be in use by the fsync path. I've fixed this issue by adding yet another IO_END flag, indicating that the io_end is being processed by the fsync path. The second problem is that the work routine will make an assignment to io->flag outside of the lock. I have witnessed this result in a hang at umount. Moving the flag setting inside the lock resolved that problem. The problem was introduced by commit b82e384c7b ("ext4: optimize locking for end_io extent conversion"), which first appeared in 3.2. As such, the fix should be backported to that release (probably along with the unwritten extent conversion race fix). Signed-off-by: Jeff Moyer <jmoyer@redhat.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> CC: stable@kernel.org
Diffstat (limited to 'fs')
-rw-r--r--fs/ext4/ext4.h1
-rw-r--r--fs/ext4/fsync.c2
-rw-r--r--fs/ext4/page-io.c9
3 files changed, 10 insertions, 2 deletions
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6481e3ca3528..37e7d8b66c99 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -185,6 +185,7 @@ struct mpage_da_data {
185#define EXT4_IO_END_ERROR 0x0002 185#define EXT4_IO_END_ERROR 0x0002
186#define EXT4_IO_END_QUEUED 0x0004 186#define EXT4_IO_END_QUEUED 0x0004
187#define EXT4_IO_END_DIRECT 0x0008 187#define EXT4_IO_END_DIRECT 0x0008
188#define EXT4_IO_END_IN_FSYNC 0x0010
188 189
189struct ext4_io_page { 190struct ext4_io_page {
190 struct page *p_page; 191 struct page *p_page;
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 00a2cb753efd..bb6c7d811313 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -89,6 +89,7 @@ int ext4_flush_completed_IO(struct inode *inode)
89 io = list_entry(ei->i_completed_io_list.next, 89 io = list_entry(ei->i_completed_io_list.next,
90 ext4_io_end_t, list); 90 ext4_io_end_t, list);
91 list_del_init(&io->list); 91 list_del_init(&io->list);
92 io->flag |= EXT4_IO_END_IN_FSYNC;
92 /* 93 /*
93 * Calling ext4_end_io_nolock() to convert completed 94 * Calling ext4_end_io_nolock() to convert completed
94 * IO to written. 95 * IO to written.
@@ -108,6 +109,7 @@ int ext4_flush_completed_IO(struct inode *inode)
108 if (ret < 0) 109 if (ret < 0)
109 ret2 = ret; 110 ret2 = ret;
110 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 111 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
112 io->flag &= ~EXT4_IO_END_IN_FSYNC;
111 } 113 }
112 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 114 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
113 return (ret2 < 0) ? ret2 : 0; 115 return (ret2 < 0) ? ret2 : 0;
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 9e1b8eb1e7ac..dcdeef169a69 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -129,12 +129,18 @@ static void ext4_end_io_work(struct work_struct *work)
129 unsigned long flags; 129 unsigned long flags;
130 130
131 spin_lock_irqsave(&ei->i_completed_io_lock, flags); 131 spin_lock_irqsave(&ei->i_completed_io_lock, flags);
132 if (io->flag & EXT4_IO_END_IN_FSYNC)
133 goto requeue;
132 if (list_empty(&io->list)) { 134 if (list_empty(&io->list)) {
133 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 135 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
134 goto free; 136 goto free;
135 } 137 }
136 138
137 if (!mutex_trylock(&inode->i_mutex)) { 139 if (!mutex_trylock(&inode->i_mutex)) {
140 bool was_queued;
141requeue:
142 was_queued = !!(io->flag & EXT4_IO_END_QUEUED);
143 io->flag |= EXT4_IO_END_QUEUED;
138 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags); 144 spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
139 /* 145 /*
140 * Requeue the work instead of waiting so that the work 146 * Requeue the work instead of waiting so that the work
@@ -147,9 +153,8 @@ static void ext4_end_io_work(struct work_struct *work)
147 * yield the cpu if it sees an end_io request that has already 153 * yield the cpu if it sees an end_io request that has already
148 * been requeued. 154 * been requeued.
149 */ 155 */
150 if (io->flag & EXT4_IO_END_QUEUED) 156 if (was_queued)
151 yield(); 157 yield();
152 io->flag |= EXT4_IO_END_QUEUED;
153 return; 158 return;
154 } 159 }
155 list_del_init(&io->list); 160 list_del_init(&io->list);