aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Lukas Czerner <lczerner@redhat.com> 2017-09-21 10:16:29 -0400
committer Jens Axboe <axboe@kernel.dk> 2017-09-25 10:56:05 -0400
commit 332391a9935da939319e473b4680e173df75afcf (patch)
tree 52609917ecaadeea19dab63feaa4229af5a88561
parent bb1cc74790eb51f52d23c6e5fd9a3bb16030c3d8 (diff)
fs: Fix page cache inconsistency when mixing buffered and AIO DIO
Currently when mixing buffered reads and asynchronous direct writes it is possible to end up with the situation where we have stale data in the page cache while the new data is already written to disk. This is permanent until the affected pages are flushed away. Despite the fact that mixing buffered and direct IO is ill-advised, it does pose a threat to data integrity, is unexpected and should be fixed. Fix this by deferring completion of asynchronous direct writes to a process context in the case that there are mapped pages to be found in the inode. Later, before the completion in dio_complete(), invalidate the pages in question. This ensures that after the completion the pages in the written area are either unmapped, or populated with up-to-date data. Also do the same for the iomap case which uses iomap_dio_complete() instead. This has a side effect of deferring the completion to a process context for every AIO DIO that happens on an inode that has pages mapped. However, since the consensus is that this is an ill-advised practice, the performance implication should not be a problem. This was based on a proposal from Jeff Moyer, thanks! Reviewed-by: Jan Kara <jack@suse.cz> Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com> Reviewed-by: Jeff Moyer <jmoyer@redhat.com> Signed-off-by: Lukas Czerner <lczerner@redhat.com> Signed-off-by: Jens Axboe <axboe@kernel.dk>
-rw-r--r-- fs/direct-io.c 49
-rw-r--r-- fs/iomap.c 29
-rw-r--r-- mm/filemap.c 10
3 files changed, 67 insertions, 21 deletions
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 5fa2211e49ae..62cf812ed0e5 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -229,6 +229,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
229{ 229{
230 loff_t offset = dio->iocb->ki_pos; 230 loff_t offset = dio->iocb->ki_pos;
231 ssize_t transferred = 0; 231 ssize_t transferred = 0;
232 int err;
232 233
233 /* 234 /*
234 * AIO submission can race with bio completion to get here while 235 * AIO submission can race with bio completion to get here while
@@ -258,8 +259,22 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
258 if (ret == 0) 259 if (ret == 0)
259 ret = transferred; 260 ret = transferred;
260 261
262 /*
263 * Try again to invalidate clean pages which might have been cached by
264 * non-direct readahead, or faulted in by get_user_pages() if the source
265 * of the write was an mmap'ed region of the file we're writing. Either
266 * one is a pretty crazy thing to do, so we don't support it 100%. If
267 * this invalidation fails, tough, the write still worked...
268 */
269 if (ret > 0 && dio->op == REQ_OP_WRITE &&
270 dio->inode->i_mapping->nrpages) {
271 err = invalidate_inode_pages2_range(dio->inode->i_mapping,
272 offset >> PAGE_SHIFT,
273 (offset + ret - 1) >> PAGE_SHIFT);
274 WARN_ON_ONCE(err);
275 }
276
261 if (dio->end_io) { 277 if (dio->end_io) {
262 int err;
263 278
264 // XXX: ki_pos?? 279 // XXX: ki_pos??
265 err = dio->end_io(dio->iocb, offset, ret, dio->private); 280 err = dio->end_io(dio->iocb, offset, ret, dio->private);
@@ -304,6 +319,7 @@ static void dio_bio_end_aio(struct bio *bio)
304 struct dio *dio = bio->bi_private; 319 struct dio *dio = bio->bi_private;
305 unsigned long remaining; 320 unsigned long remaining;
306 unsigned long flags; 321 unsigned long flags;
322 bool defer_completion = false;
307 323
308 /* cleanup the bio */ 324 /* cleanup the bio */
309 dio_bio_complete(dio, bio); 325 dio_bio_complete(dio, bio);
@@ -315,7 +331,19 @@ static void dio_bio_end_aio(struct bio *bio)
315 spin_unlock_irqrestore(&dio->bio_lock, flags); 331 spin_unlock_irqrestore(&dio->bio_lock, flags);
316 332
317 if (remaining == 0) { 333 if (remaining == 0) {
318 if (dio->result && dio->defer_completion) { 334 /*
335 * Defer completion when defer_completion is set or
336 * when the inode has pages mapped and this is AIO write.
337 * We need to invalidate those pages because there is a
338 * chance they contain stale data in the case buffered IO
339 * went in between AIO submission and completion into the
340 * same region.
341 */
342 if (dio->result)
343 defer_completion = dio->defer_completion ||
344 (dio->op == REQ_OP_WRITE &&
345 dio->inode->i_mapping->nrpages);
346 if (defer_completion) {
319 INIT_WORK(&dio->complete_work, dio_aio_complete_work); 347 INIT_WORK(&dio->complete_work, dio_aio_complete_work);
320 queue_work(dio->inode->i_sb->s_dio_done_wq, 348 queue_work(dio->inode->i_sb->s_dio_done_wq,
321 &dio->complete_work); 349 &dio->complete_work);
@@ -1210,10 +1238,19 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
1210 * For AIO O_(D)SYNC writes we need to defer completions to a workqueue 1238 * For AIO O_(D)SYNC writes we need to defer completions to a workqueue
1211 * so that we can call ->fsync. 1239 * so that we can call ->fsync.
1212 */ 1240 */
1213 if (dio->is_async && iov_iter_rw(iter) == WRITE && 1241 if (dio->is_async && iov_iter_rw(iter) == WRITE) {
1214 ((iocb->ki_filp->f_flags & O_DSYNC) || 1242 retval = 0;
1215 IS_SYNC(iocb->ki_filp->f_mapping->host))) { 1243 if ((iocb->ki_filp->f_flags & O_DSYNC) ||
1216 retval = dio_set_defer_completion(dio); 1244 IS_SYNC(iocb->ki_filp->f_mapping->host))
1245 retval = dio_set_defer_completion(dio);
1246 else if (!dio->inode->i_sb->s_dio_done_wq) {
1247 /*
1248 * In case of AIO write racing with buffered read we
1249 * need to defer completion. We can't decide this now,
1250 * however the workqueue needs to be initialized here.
1251 */
1252 retval = sb_init_dio_done_wq(dio->inode->i_sb);
1253 }
1217 if (retval) { 1254 if (retval) {
1218 /* 1255 /*
1219 * We grab i_mutex only for reads so we don't have 1256 * We grab i_mutex only for reads so we don't have
diff --git a/fs/iomap.c b/fs/iomap.c
index 269b24a01f32..8194d30bdca0 100644
--- a/fs/iomap.c
+++ b/fs/iomap.c
@@ -713,8 +713,24 @@ struct iomap_dio {
713static ssize_t iomap_dio_complete(struct iomap_dio *dio) 713static ssize_t iomap_dio_complete(struct iomap_dio *dio)
714{ 714{
715 struct kiocb *iocb = dio->iocb; 715 struct kiocb *iocb = dio->iocb;
716 struct inode *inode = file_inode(iocb->ki_filp);
716 ssize_t ret; 717 ssize_t ret;
717 718
719 /*
720 * Try again to invalidate clean pages which might have been cached by
721 * non-direct readahead, or faulted in by get_user_pages() if the source
722 * of the write was an mmap'ed region of the file we're writing. Either
723 * one is a pretty crazy thing to do, so we don't support it 100%. If
724 * this invalidation fails, tough, the write still worked...
725 */
726 if (!dio->error &&
727 (dio->flags & IOMAP_DIO_WRITE) && inode->i_mapping->nrpages) {
728 ret = invalidate_inode_pages2_range(inode->i_mapping,
729 iocb->ki_pos >> PAGE_SHIFT,
730 (iocb->ki_pos + dio->size - 1) >> PAGE_SHIFT);
731 WARN_ON_ONCE(ret);
732 }
733
718 if (dio->end_io) { 734 if (dio->end_io) {
719 ret = dio->end_io(iocb, 735 ret = dio->end_io(iocb,
720 dio->error ? dio->error : dio->size, 736 dio->error ? dio->error : dio->size,
@@ -1042,19 +1058,6 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
1042 1058
1043 ret = iomap_dio_complete(dio); 1059 ret = iomap_dio_complete(dio);
1044 1060
1045 /*
1046 * Try again to invalidate clean pages which might have been cached by
1047 * non-direct readahead, or faulted in by get_user_pages() if the source
1048 * of the write was an mmap'ed region of the file we're writing. Either
1049 * one is a pretty crazy thing to do, so we don't support it 100%. If
1050 * this invalidation fails, tough, the write still worked...
1051 */
1052 if (iov_iter_rw(iter) == WRITE) {
1053 int err = invalidate_inode_pages2_range(mapping,
1054 start >> PAGE_SHIFT, end >> PAGE_SHIFT);
1055 WARN_ON_ONCE(err);
1056 }
1057
1058 return ret; 1061 return ret;
1059 1062
1060out_free_dio: 1063out_free_dio:
diff --git a/mm/filemap.c b/mm/filemap.c
index 870971e20967..db250d0e0565 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -2926,9 +2926,15 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
2926 * we're writing. Either one is a pretty crazy thing to do, 2926 * we're writing. Either one is a pretty crazy thing to do,
2927 * so we don't support it 100%. If this invalidation 2927 * so we don't support it 100%. If this invalidation
2928 * fails, tough, the write still worked... 2928 * fails, tough, the write still worked...
2929 *
2930 * Most of the time we do not need this since dio_complete() will do
2931 * the invalidation for us. However there are some file systems that
2932 * do not end up with dio_complete() being called, so let's not break
2933 * them by removing it completely
2929 */ 2934 */
2930 invalidate_inode_pages2_range(mapping, 2935 if (mapping->nrpages)
2931 pos >> PAGE_SHIFT, end); 2936 invalidate_inode_pages2_range(mapping,
2937 pos >> PAGE_SHIFT, end);
2932 2938
2933 if (written > 0) { 2939 if (written > 0) {
2934 pos += written; 2940 pos += written;