Diffstat (limited to 'fs/direct-io.c')

-rw-r--r--	fs/direct-io.c	126

1 file changed, 103 insertions(+), 23 deletions(-)

diff --git a/fs/direct-io.c b/fs/direct-io.c
index 7ab90f5081ee..1782023bd68a 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -127,6 +127,7 @@ struct dio {
 	spinlock_t bio_lock;		/* protects BIO fields below */
 	int page_errors;		/* errno from get_user_pages() */
 	int is_async;			/* is IO async ? */
+	bool defer_completion;		/* defer AIO completion to workqueue? */
 	int io_error;			/* IO error in completion path */
 	unsigned long refcount;		/* direct_io_worker() and bios */
 	struct bio *bio_list;		/* singly linked via bi_private */
@@ -141,7 +142,10 @@ struct dio {
 	 * allocation time. Don't add new fields after pages[] unless you
 	 * wish that they not be zeroed.
 	 */
-	struct page *pages[DIO_PAGES];	/* page buffer */
+	union {
+		struct page *pages[DIO_PAGES];	/* page buffer */
+		struct work_struct complete_work;/* deferred AIO completion */
+	};
 } ____cacheline_aligned_in_smp;
 
 static struct kmem_cache *dio_cache __read_mostly;
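The union here is a space trick: pages[] is only needed while bios are being built and submitted, and complete_work is only needed after every bio has finished, so the two members are never live at the same time and can share storage. A compilable userspace sketch of the idea; struct dio_tail and the stand-in types are illustrative, not kernel definitions:

#include <stdio.h>

#define DIO_PAGES	64

/* Userspace stand-ins so the layout compiles outside the kernel. */
struct page;
struct work_struct { void (*func)(void *); };

struct dio_tail {
	union {
		struct page *pages[DIO_PAGES];	  /* live while bios are built */
		struct work_struct complete_work; /* live after all bios end */
	};
};

int main(void)
{
	/* The union adds no size: it is as large as its largest member. */
	printf("pages[] alone: %zu bytes\n", sizeof(struct page *) * DIO_PAGES);
	printf("whole union:   %zu bytes\n", sizeof(struct dio_tail));
	return 0;
}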
@@ -221,16 +225,16 @@ static inline struct page *dio_get_page(struct dio *dio,
  * dio_complete() - called when all DIO BIO I/O has been completed
  * @offset: the byte offset in the file of the completed operation
  *
- * This releases locks as dictated by the locking type, lets interested parties
- * know that a DIO operation has completed, and calculates the resulting return
- * code for the operation.
+ * This drops i_dio_count, lets interested parties know that a DIO operation
+ * has completed, and calculates the resulting return code for the operation.
  *
  * It lets the filesystem know if it registered an interest earlier via
  * get_block.  Pass the private field of the map buffer_head so that
  * filesystems can use it to hold additional state between get_block calls and
  * dio_complete.
  */
-static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is_async)
+static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret,
+	bool is_async)
 {
 	ssize_t transferred = 0;
 
@@ -258,19 +262,36 @@ static ssize_t dio_complete(struct dio *dio, loff_t offset, ssize_t ret, bool is
 	if (ret == 0)
 		ret = transferred;
 
-	if (dio->end_io && dio->result) {
-		dio->end_io(dio->iocb, offset, transferred,
-			    dio->private, ret, is_async);
-	} else {
-		inode_dio_done(dio->inode);
-		if (is_async)
-			aio_complete(dio->iocb, ret, 0);
+	if (dio->end_io && dio->result)
+		dio->end_io(dio->iocb, offset, transferred, dio->private);
+
+	inode_dio_done(dio->inode);
+	if (is_async) {
+		if (dio->rw & WRITE) {
+			int err;
+
+			err = generic_write_sync(dio->iocb->ki_filp, offset,
+						 transferred);
+			if (err < 0 && ret > 0)
+				ret = err;
+		}
+
+		aio_complete(dio->iocb, ret, 0);
 	}
 
+	kmem_cache_free(dio_cache, dio);
 	return ret;
 }
 
+static void dio_aio_complete_work(struct work_struct *work)
+{
+	struct dio *dio = container_of(work, struct dio, complete_work);
+
+	dio_complete(dio, dio->iocb->ki_pos, 0, true);
+}
+
 static int dio_bio_complete(struct dio *dio, struct bio *bio);
+
 /*
  * Asynchronous IO callback.
  */
@@ -290,8 +311,13 @@ static void dio_bio_end_aio(struct bio *bio, int error)
 	spin_unlock_irqrestore(&dio->bio_lock, flags);
 
 	if (remaining == 0) {
-		dio_complete(dio, dio->iocb->ki_pos, 0, true);
-		kmem_cache_free(dio_cache, dio);
+		if (dio->result && dio->defer_completion) {
+			INIT_WORK(&dio->complete_work, dio_aio_complete_work);
+			queue_work(dio->inode->i_sb->s_dio_done_wq,
+				   &dio->complete_work);
+		} else {
+			dio_complete(dio, dio->iocb->ki_pos, 0, true);
+		}
 	}
 }
 
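The handoff above exists because dio_bio_end_aio() runs in bio completion (interrupt) context, where generic_write_sync() and filesystem ->end_io work that may sleep cannot run; queue_work() moves the completion to process context. A minimal sketch of that punt pattern; struct my_req, my_finish() and my_irq_done() are hypothetical names, not part of this patch, while system_wq is the kernel's default workqueue:

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct my_req {
	struct work_struct work;
	/* ... per-request state ... */
};

/* Runs on a kworker thread: may sleep, take mutexes, call ->fsync. */
static void my_finish(struct work_struct *work)
{
	struct my_req *req = container_of(work, struct my_req, work);

	/* ... heavyweight completion work goes here ... */
	kfree(req);
}

/* Runs in interrupt context: must not sleep, so hand the request off. */
static void my_irq_done(struct my_req *req)
{
	INIT_WORK(&req->work, my_finish);
	queue_work(system_wq, &req->work);
}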
@@ -511,6 +537,41 @@ static inline int dio_bio_reap(struct dio *dio, struct dio_submit *sdio)
 }
 
 /*
+ * Create workqueue for deferred direct IO completions. We allocate the
+ * workqueue when it's first needed. This avoids creating workqueue for
+ * filesystems that don't need it and also allows us to create the workqueue
+ * late enough so that we can include s_id in the name of the workqueue.
+ */
+static int sb_init_dio_done_wq(struct super_block *sb)
+{
+	struct workqueue_struct *wq = alloc_workqueue("dio/%s",
+						      WQ_MEM_RECLAIM, 0,
+						      sb->s_id);
+	if (!wq)
+		return -ENOMEM;
+	/*
+	 * This has to be atomic as more DIOs can race to create the workqueue
+	 */
+	cmpxchg(&sb->s_dio_done_wq, NULL, wq);
+	/* Someone created workqueue before us? Free ours... */
+	if (wq != sb->s_dio_done_wq)
+		destroy_workqueue(wq);
+	return 0;
+}
+
+static int dio_set_defer_completion(struct dio *dio)
+{
+	struct super_block *sb = dio->inode->i_sb;
+
+	if (dio->defer_completion)
+		return 0;
+	dio->defer_completion = true;
+	if (!sb->s_dio_done_wq)
+		return sb_init_dio_done_wq(sb);
+	return 0;
+}
+
+/*
  * Call into the fs to map some more disk blocks.  We record the current number
  * of available blocks at sdio->blocks_available.  These are in units of the
  * fs blocksize, (1 << inode->i_blkbits).
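sb_init_dio_done_wq() uses the lock-free "create, try to publish, discard the loser" idiom: every racer allocates its own workqueue, exactly one cmpxchg() succeeds in installing it, and the losers destroy their copy and proceed with the winner's. The same idiom reduced to its core; struct object, make_object() and free_object() are hypothetical stand-ins:

static struct object *shared;

static int init_shared(void)
{
	struct object *obj = make_object();

	if (!obj)
		return -ENOMEM;
	/*
	 * cmpxchg() installs obj only if shared is still NULL and returns
	 * the old value; a non-NULL old value means another racer won, so
	 * we free our copy and everyone proceeds with the published one.
	 */
	if (cmpxchg(&shared, NULL, obj) != NULL)
		free_object(obj);
	return 0;
}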
@@ -581,6 +642,9 @@ static int get_more_blocks(struct dio *dio, struct dio_submit *sdio,
 
 		/* Store for completion */
 		dio->private = map_bh->b_private;
+
+		if (ret == 0 && buffer_defer_completion(map_bh))
+			ret = dio_set_defer_completion(dio);
 	}
 	return ret;
 }
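This is where a filesystem opts in: it marks the mapping buffer_head in its get_block callback, and direct-io then routes the AIO completion through the per-sb workqueue. A hedged sketch of such a callback; myfs_get_block() and the mapping details are hypothetical, and set_buffer_defer_completion() is the setter generated alongside the buffer_defer_completion() test used above:

#include <linux/buffer_head.h>

static int myfs_get_block(struct inode *inode, sector_t iblock,
			  struct buffer_head *bh, int create)
{
	/* ... map iblock and set bh->b_blocknr, bh->b_state as usual ... */

	/*
	 * If completing this write needs work that can sleep (e.g.
	 * unwritten extent conversion), ask direct-io to call ->end_io
	 * from the per-sb workqueue instead of bio completion context.
	 */
	if (create)
		set_buffer_defer_completion(bh);
	return 0;
}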
@@ -1129,11 +1193,6 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	}
 
 	/*
-	 * Will be decremented at I/O completion time.
-	 */
-	atomic_inc(&inode->i_dio_count);
-
-	/*
 	 * For file extending writes updating i_size before data
 	 * writeouts complete can expose uninitialized blocks. So
 	 * even for AIO, we need to wait for i/o to complete before
@@ -1141,11 +1200,33 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	 */
 	dio->is_async = !is_sync_kiocb(iocb) && !((rw & WRITE) &&
 		(end > i_size_read(inode)));
-
-	retval = 0;
-
 	dio->inode = inode;
 	dio->rw = rw;
+
+	/*
+	 * For AIO O_(D)SYNC writes we need to defer completions to a workqueue
+	 * so that we can call ->fsync.
+	 */
+	if (dio->is_async && (rw & WRITE) &&
+	    ((iocb->ki_filp->f_flags & O_DSYNC) ||
+	     IS_SYNC(iocb->ki_filp->f_mapping->host))) {
+		retval = dio_set_defer_completion(dio);
+		if (retval) {
+			/*
+			 * We grab i_mutex only for reads so we don't have
+			 * to release it here
+			 */
+			kmem_cache_free(dio_cache, dio);
+			goto out;
+		}
+	}
+
+	/*
+	 * Will be decremented at I/O completion time.
+	 */
+	atomic_inc(&inode->i_dio_count);
+
+	retval = 0;
 	sdio.blkbits = blkbits;
 	sdio.blkfactor = i_blkbits - blkbits;
 	sdio.block_in_file = offset >> blkbits;
@@ -1269,7 +1350,6 @@ do_blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 
 	if (drop_refcount(dio) == 0) {
 		retval = dio_complete(dio, offset, retval, false);
-		kmem_cache_free(dio_cache, dio);
 	} else
 		BUG_ON(retval != -EIOCBQUEUED);
 
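From userspace, the behavior this enables is visible through libaio: an AIO write to a file opened with O_DIRECT|O_DSYNC now has generic_write_sync() run from the workqueue before aio_complete(), so the completion event is only delivered once the data is durable. A small libaio sketch of that case (build with -laio; the file name and the 4096-byte size/alignment are arbitrary):

#define _GNU_SOURCE
#include <fcntl.h>
#include <libaio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	io_context_t ctx = 0;
	struct iocb cb, *cbs[1] = { &cb };
	struct io_event ev;
	void *buf;
	int fd;

	/* O_DSYNC + AIO is exactly the case the workqueue deferral handles. */
	fd = open("testfile", O_WRONLY | O_CREAT | O_DIRECT | O_DSYNC, 0644);
	if (fd < 0 || io_setup(1, &ctx) < 0)
		return 1;
	if (posix_memalign(&buf, 4096, 4096))	/* O_DIRECT needs alignment */
		return 1;
	memset(buf, 'x', 4096);

	io_prep_pwrite(&cb, fd, buf, 4096, 0);
	if (io_submit(ctx, 1, cbs) != 1)
		return 1;
	/* The event arrives only after the write is completed and synced. */
	if (io_getevents(ctx, 1, 1, &ev, NULL) != 1)
		return 1;
	io_destroy(ctx);
	return 0;
}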