author    Lukas Czerner <lczerner@redhat.com>  2017-09-21 10:16:29 -0400
committer Jens Axboe <axboe@kernel.dk>         2017-09-25 10:56:05 -0400
commit    332391a9935da939319e473b4680e173df75afcf (patch)
tree      52609917ecaadeea19dab63feaa4229af5a88561 /fs/direct-io.c
parent    bb1cc74790eb51f52d23c6e5fd9a3bb16030c3d8 (diff)
fs: Fix page cache inconsistency when mixing buffered and AIO DIO
Currently, when mixing buffered reads and asynchronous direct writes it is possible to end up with a situation where we have stale data in the page cache while the new data is already written to disk. This is permanent until the affected pages are flushed away. Despite the fact that mixing buffered and direct IO is ill-advised, this does pose a threat to data integrity, is unexpected and should be fixed.

Fix this by deferring completion of asynchronous direct writes to a process context in the case that there are mapped pages to be found in the inode. Later, before the completion in dio_complete(), invalidate the pages in question. This ensures that after the completion the pages in the written area are either unmapped, or populated with up-to-date data. Also do the same for the iomap case, which uses iomap_dio_complete() instead.

This has a side effect of deferring the completion to a process context for every AIO DIO that happens on an inode that has pages mapped. However, since the consensus is that this is an ill-advised practice, the performance implication should not be a problem.

This was based on a proposal from Jeff Moyer, thanks!

Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Jeff Moyer <jmoyer@redhat.com>
Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
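To make the race concrete, here is a minimal userspace sketch of the access pattern the commit message describes. It is not part of the patch: the file path and program name are hypothetical, it assumes libaio (link with -laio), and since it exercises a race it will not reproduce the stale-cache result deterministically. The point is the ordering: a buffered read lands between AIO DIO submission and completion on the same page.

/* dio-race.c: sketch of the racy pattern; build with: gcc dio-race.c -laio */
#define _GNU_SOURCE
#include <fcntl.h>
#include <libaio.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#define SZ 4096			/* one page; O_DIRECT wants aligned I/O */

int main(void)
{
	const char *path = "/tmp/dio-race-demo";	/* hypothetical scratch file */
	char cached[SZ], after[SZ];
	void *dbuf;
	io_context_t ctx = 0;
	struct iocb cb, *cbs[1] = { &cb };
	struct io_event ev;

	int dfd = open(path, O_CREAT | O_RDWR | O_DIRECT, 0600);
	int bfd = open(path, O_RDWR);		/* buffered fd, same file */

	if (dfd < 0 || bfd < 0 || io_setup(1, &ctx) ||
	    posix_memalign(&dbuf, SZ, SZ))
		return 1;

	/* Lay down known data ('A') so the page has defined contents. */
	memset(dbuf, 'A', SZ);
	if (pwrite(dfd, dbuf, SZ, 0) != SZ)
		return 1;

	/* Submit an AIO O_DIRECT write of new data ('B') to the same page. */
	memset(dbuf, 'B', SZ);
	io_prep_pwrite(&cb, dfd, dbuf, SZ, 0);
	if (io_submit(ctx, 1, cbs) != 1)
		return 1;

	/*
	 * Buffered read while the write is in flight: this may populate the
	 * page cache with the old 'A' data. Before the fix, the AIO DIO
	 * completion (running in interrupt context) could not invalidate
	 * that page, so the stale copy survived.
	 */
	pread(bfd, cached, SZ, 0);

	if (io_getevents(ctx, 1, 1, &ev, NULL) != 1)
		return 1;

	/* On a fixed kernel this must see 'B'; a buggy one may still see 'A'. */
	if (pread(bfd, after, SZ, 0) != SZ)
		return 1;
	printf("in-flight read saw '%c', read after completion sees '%c'\n",
	       cached[0], after[0]);

	io_destroy(ctx);
	close(dfd);
	close(bfd);
	unlink(path);
	return 0;
}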
Diffstat (limited to 'fs/direct-io.c')
-rw-r--r--  fs/direct-io.c | 49
1 file changed, 43 insertions(+), 6 deletions(-)
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 5fa2211e49ae..62cf812ed0e5 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -229,6 +229,7 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
 {
 	loff_t offset = dio->iocb->ki_pos;
 	ssize_t transferred = 0;
+	int err;
 
 	/*
 	 * AIO submission can race with bio completion to get here while
@@ -258,8 +259,22 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, bool is_async)
 	if (ret == 0)
 		ret = transferred;
 
+	/*
+	 * Try again to invalidate clean pages which might have been cached by
+	 * non-direct readahead, or faulted in by get_user_pages() if the source
+	 * of the write was an mmap'ed region of the file we're writing. Either
+	 * one is a pretty crazy thing to do, so we don't support it 100%. If
+	 * this invalidation fails, tough, the write still worked...
+	 */
+	if (ret > 0 && dio->op == REQ_OP_WRITE &&
+	    dio->inode->i_mapping->nrpages) {
+		err = invalidate_inode_pages2_range(dio->inode->i_mapping,
+					offset >> PAGE_SHIFT,
+					(offset + ret - 1) >> PAGE_SHIFT);
+		WARN_ON_ONCE(err);
+	}
+
 	if (dio->end_io) {
-		int err;
 
 		// XXX: ki_pos??
 		err = dio->end_io(dio->iocb, offset, ret, dio->private);
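For reference, the invalidation above maps the written byte range [offset, offset + ret) to inclusive page indices; subtracting one before the shift gives the index of the last byte actually written, so a write ending exactly on a page boundary does not invalidate one page too many. A tiny standalone sketch of the arithmetic (illustration only; it assumes 4 KiB pages, i.e. PAGE_SHIFT == 12):

#include <stdio.h>

#define PAGE_SHIFT 12	/* assume 4 KiB pages for this illustration */

int main(void)
{
	long long offset = 5000;	/* write began mid-page 1 */
	long long ret = 10000;		/* bytes actually written */

	/* Inclusive page range covering bytes [offset, offset + ret) */
	long long first = offset >> PAGE_SHIFT;		    /*  5000 >> 12 -> 1 */
	long long last = (offset + ret - 1) >> PAGE_SHIFT;  /* 14999 >> 12 -> 3 */

	printf("invalidate page indices %lld..%lld\n", first, last);
	return 0;
}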
@@ -304,6 +319,7 @@ static void dio_bio_end_aio(struct bio *bio)
 	struct dio *dio = bio->bi_private;
 	unsigned long remaining;
 	unsigned long flags;
+	bool defer_completion = false;
 
 	/* cleanup the bio */
 	dio_bio_complete(dio, bio);
@@ -315,7 +331,19 @@ static void dio_bio_end_aio(struct bio *bio)
 	spin_unlock_irqrestore(&dio->bio_lock, flags);
 
 	if (remaining == 0) {
-		if (dio->result && dio->defer_completion) {
+		/*
+		 * Defer completion when defer_completion is set or
+		 * when the inode has pages mapped and this is AIO write.
+		 * We need to invalidate those pages because there is a
+		 * chance they contain stale data in the case buffered IO
+		 * went in between AIO submission and completion into the
+		 * same region.
+		 */
+		if (dio->result)
+			defer_completion = dio->defer_completion ||
+					   (dio->op == REQ_OP_WRITE &&
+					    dio->inode->i_mapping->nrpages);
+		if (defer_completion) {
 			INIT_WORK(&dio->complete_work, dio_aio_complete_work);
 			queue_work(dio->inode->i_sb->s_dio_done_wq,
 				   &dio->complete_work);
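The workqueue hand-off in this hunk exists because of execution context: dio_bio_end_aio() runs in bio completion (interrupt/softirq) context, where invalidate_inode_pages2_range() cannot be called since it may sleep, so the completion is packaged into a work_struct and run later in process context. A minimal, hypothetical module sketch of that same defer-to-process-context pattern (not from the patch; the patch queues to the per-sb s_dio_done_wq rather than the system workqueue used here):

/* defer_demo.c: hypothetical module showing the deferral pattern */
#include <linux/module.h>
#include <linux/workqueue.h>

static void deferred_fn(struct work_struct *work)
{
	/* Runs in process context: sleeping (e.g. page invalidation) is OK. */
	pr_info("deferred completion ran in process context\n");
}

static DECLARE_WORK(deferred_work, deferred_fn);

static int __init defer_demo_init(void)
{
	/* The patch queues work like this from dio_bio_end_aio() instead. */
	schedule_work(&deferred_work);
	return 0;
}

static void __exit defer_demo_exit(void)
{
	flush_work(&deferred_work);
}

module_init(defer_demo_init);
module_exit(defer_demo_exit);
MODULE_LICENSE("GPL");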
@@ -1210,10 +1238,19 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
 	 * For AIO O_(D)SYNC writes we need to defer completions to a workqueue
 	 * so that we can call ->fsync.
 	 */
-	if (dio->is_async && iov_iter_rw(iter) == WRITE &&
-	    ((iocb->ki_filp->f_flags & O_DSYNC) ||
-	    IS_SYNC(iocb->ki_filp->f_mapping->host))) {
-		retval = dio_set_defer_completion(dio);
+	if (dio->is_async && iov_iter_rw(iter) == WRITE) {
+		retval = 0;
+		if ((iocb->ki_filp->f_flags & O_DSYNC) ||
+		    IS_SYNC(iocb->ki_filp->f_mapping->host))
+			retval = dio_set_defer_completion(dio);
+		else if (!dio->inode->i_sb->s_dio_done_wq) {
+			/*
+			 * In case of AIO write racing with buffered read we
+			 * need to defer completion. We can't decide this now,
+			 * however the workqueue needs to be initialized here.
+			 */
+			retval = sb_init_dio_done_wq(dio->inode->i_sb);
+		}
 		if (retval) {
 			/*
 			 * We grab i_mutex only for reads so we don't have