Diffstat (limited to 'fs/direct-io.c')
-rw-r--r--   fs/direct-io.c   323
1 file changed, 144 insertions(+), 179 deletions(-)
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 5981e17f46f0..d9d0833444f5 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -27,6 +27,7 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
+#include <linux/task_io_accounting_ops.h>
 #include <linux/bio.h>
 #include <linux/wait.h>
 #include <linux/err.h>
@@ -121,8 +122,7 @@ struct dio {
 
         /* BIO completion state */
         spinlock_t bio_lock;            /* protects BIO fields below */
-        int bio_count;                  /* nr bios to be completed */
-        int bios_in_flight;             /* nr bios in flight */
+        unsigned long refcount;         /* direct_io_worker() and bios */
         struct bio *bio_list;           /* singly linked via bi_private */
         struct task_struct *waiter;     /* waiting task (NULL if none) */
 
@@ -209,76 +209,55 @@ static struct page *dio_get_page(struct dio *dio)
         return dio->pages[dio->head++];
 }
 
-/*
- * Called when all DIO BIO I/O has been completed - let the filesystem
- * know, if it registered an interest earlier via get_block. Pass the
- * private field of the map buffer_head so that filesystems can use it
- * to hold additional state between get_block calls and dio_complete.
- */
-static void dio_complete(struct dio *dio, loff_t offset, ssize_t bytes)
-{
-        if (dio->end_io && dio->result)
-                dio->end_io(dio->iocb, offset, bytes, dio->map_bh.b_private);
-        if (dio->lock_type == DIO_LOCKING)
-                /* lockdep: non-owner release */
-                up_read_non_owner(&dio->inode->i_alloc_sem);
-}
-
-/*
- * Called when a BIO has been processed. If the count goes to zero then IO is
- * complete and we can signal this to the AIO layer.
+/**
+ * dio_complete() - called when all DIO BIO I/O has been completed
+ * @offset: the byte offset in the file of the completed operation
+ *
+ * This releases locks as dictated by the locking type, lets interested parties
+ * know that a DIO operation has completed, and calculates the resulting return
+ * code for the operation.
+ *
+ * It lets the filesystem know if it registered an interest earlier via
+ * get_block. Pass the private field of the map buffer_head so that
+ * filesystems can use it to hold additional state between get_block calls and
+ * dio_complete.
  */
-static void finished_one_bio(struct dio *dio)
+static int dio_complete(struct dio *dio, loff_t offset, int ret)
 {
-        unsigned long flags;
+        ssize_t transferred = 0;
 
-        spin_lock_irqsave(&dio->bio_lock, flags);
-        if (dio->bio_count == 1) {
-                if (dio->is_async) {
-                        ssize_t transferred;
-                        loff_t offset;
-
-                        /*
-                         * Last reference to the dio is going away.
-                         * Drop spinlock and complete the DIO.
-                         */
-                        spin_unlock_irqrestore(&dio->bio_lock, flags);
+        /*
+         * AIO submission can race with bio completion to get here while
+         * expecting to have the last io completed by bio completion.
+         * In that case -EIOCBQUEUED is in fact not an error we want
+         * to preserve through this call.
+         */
+        if (ret == -EIOCBQUEUED)
+                ret = 0;
 
-                        /* Check for short read case */
-                        transferred = dio->result;
-                        offset = dio->iocb->ki_pos;
+        if (dio->result) {
+                transferred = dio->result;
 
-                        if ((dio->rw == READ) &&
-                            ((offset + transferred) > dio->i_size))
-                                transferred = dio->i_size - offset;
+                /* Check for short read case */
+                if ((dio->rw == READ) && ((offset + transferred) > dio->i_size))
+                        transferred = dio->i_size - offset;
+        }
 
-                        /* check for error in completion path */
-                        if (dio->io_error)
-                                transferred = dio->io_error;
+        if (dio->end_io && dio->result)
+                dio->end_io(dio->iocb, offset, transferred,
+                        dio->map_bh.b_private);
+        if (dio->lock_type == DIO_LOCKING)
+                /* lockdep: non-owner release */
+                up_read_non_owner(&dio->inode->i_alloc_sem);
 
-                        dio_complete(dio, offset, transferred);
+        if (ret == 0)
+                ret = dio->page_errors;
+        if (ret == 0)
+                ret = dio->io_error;
+        if (ret == 0)
+                ret = transferred;
 
-                        /* Complete AIO later if falling back to buffered i/o */
-                        if (dio->result == dio->size ||
-                            ((dio->rw == READ) && dio->result)) {
-                                aio_complete(dio->iocb, transferred, 0);
-                                kfree(dio);
-                                return;
-                        } else {
-                                /*
-                                 * Falling back to buffered
-                                 */
-                                spin_lock_irqsave(&dio->bio_lock, flags);
-                                dio->bio_count--;
-                                if (dio->waiter)
-                                        wake_up_process(dio->waiter);
-                                spin_unlock_irqrestore(&dio->bio_lock, flags);
-                                return;
-                        }
-                }
-        }
-        dio->bio_count--;
-        spin_unlock_irqrestore(&dio->bio_lock, flags);
+        return ret;
 }
 
 static int dio_bio_complete(struct dio *dio, struct bio *bio);
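The new dio_complete() collapses the old per-path return handling into one precedence chain: an explicit error passed in by the caller wins, then page faults recorded while pinning user pages, then bio-level I/O errors, and only then the byte count that was actually transferred. A minimal userspace sketch of that chain follows; pick_return_code, page_errors, io_error and transferred are illustrative stand-ins for the dio fields, not kernel API.

#include <stdio.h>

/* Illustrative only: mirrors the "if (ret == 0) ret = ..." chain above. */
static long pick_return_code(long ret, long page_errors, long io_error,
                             long transferred)
{
        if (ret == 0)
                ret = page_errors;      /* faults while pinning user pages */
        if (ret == 0)
                ret = io_error;         /* errors reported by bio completion */
        if (ret == 0)
                ret = transferred;      /* success: bytes actually moved */
        return ret;
}

int main(void)
{
        printf("%ld\n", pick_return_code(0, 0, 0, 4096));      /* 4096 */
        printf("%ld\n", pick_return_code(0, 0, -5, 4096));     /* -5, i.e. -EIO */
        printf("%ld\n", pick_return_code(0, -14, -5, 4096));   /* -14, i.e. -EFAULT */
        return 0;
}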
@@ -288,12 +267,27 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio);
 static int dio_bio_end_aio(struct bio *bio, unsigned int bytes_done, int error)
 {
         struct dio *dio = bio->bi_private;
+        unsigned long remaining;
+        unsigned long flags;
 
         if (bio->bi_size)
                 return 1;
 
         /* cleanup the bio */
         dio_bio_complete(dio, bio);
+
+        spin_lock_irqsave(&dio->bio_lock, flags);
+        remaining = --dio->refcount;
+        if (remaining == 1 && dio->waiter)
+                wake_up_process(dio->waiter);
+        spin_unlock_irqrestore(&dio->bio_lock, flags);
+
+        if (remaining == 0) {
+                int ret = dio_complete(dio, dio->iocb->ki_pos, 0);
+                aio_complete(dio->iocb, ret, 0);
+                kfree(dio);
+        }
+
         return 0;
 }
 
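The AIO completion path above is the classic last-reference-wins pattern: each bio drops one reference under bio_lock, a drop to 1 means only the submitting task's reference is left (so wake it if it is sleeping), and a drop to 0 means nobody else can reach the dio and the dropper must complete and free it. A small pthread sketch of the same pattern, with a mutex standing in for bio_lock and a condition variable standing in for wake_up_process(); all names here are illustrative, not kernel API.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct fake_dio {
        pthread_mutex_t lock;           /* stands in for dio->bio_lock */
        pthread_cond_t  waiter;         /* stands in for wake_up_process(dio->waiter) */
        unsigned long   refcount;       /* submitter holds 1, each "bio" holds 1 */
};

/* Called once per completed "bio": drop a reference, wake or complete. */
static void fake_bio_end(struct fake_dio *dio)
{
        unsigned long remaining;

        pthread_mutex_lock(&dio->lock);
        remaining = --dio->refcount;
        if (remaining == 1)
                pthread_cond_signal(&dio->waiter);  /* only the submitter is left */
        pthread_mutex_unlock(&dio->lock);

        if (remaining == 0) {
                /* last reference: nobody else can see the dio, complete and free */
                printf("completing and freeing dio\n");
                pthread_cond_destroy(&dio->waiter);
                pthread_mutex_destroy(&dio->lock);
                free(dio);
        }
}

int main(void)
{
        struct fake_dio *dio = malloc(sizeof(*dio));

        pthread_mutex_init(&dio->lock, NULL);
        pthread_cond_init(&dio->waiter, NULL);
        dio->refcount = 3;              /* submitter plus two "bios" */

        fake_bio_end(dio);              /* 3 -> 2 */
        fake_bio_end(dio);              /* 2 -> 1, would wake a sleeping submitter */
        fake_bio_end(dio);              /* 1 -> 0: the submitter's own drop, completes and frees */
        return 0;
}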
@@ -315,8 +309,7 @@ static int dio_bio_end_io(struct bio *bio, unsigned int bytes_done, int error)
         spin_lock_irqsave(&dio->bio_lock, flags);
         bio->bi_private = dio->bio_list;
         dio->bio_list = bio;
-        dio->bios_in_flight--;
-        if (dio->waiter && dio->bios_in_flight == 0)
+        if (--dio->refcount == 1 && dio->waiter)
                 wake_up_process(dio->waiter);
         spin_unlock_irqrestore(&dio->bio_lock, flags);
         return 0;
@@ -347,6 +340,8 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
  * In the AIO read case we speculatively dirty the pages before starting IO.
  * During IO completion, any of these pages which happen to have been written
  * back will be redirtied by bio_check_pages_dirty().
+ *
+ * bios hold a dio reference between submit_bio and ->end_io.
  */
 static void dio_bio_submit(struct dio *dio)
 {
@@ -354,12 +349,14 @@ static void dio_bio_submit(struct dio *dio)
         unsigned long flags;
 
         bio->bi_private = dio;
+
         spin_lock_irqsave(&dio->bio_lock, flags);
-        dio->bio_count++;
-        dio->bios_in_flight++;
+        dio->refcount++;
         spin_unlock_irqrestore(&dio->bio_lock, flags);
+
         if (dio->is_async && dio->rw == READ)
                 bio_set_pages_dirty(bio);
+
         submit_bio(dio->rw, bio);
 
         dio->bio = NULL;
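dio_bio_submit() takes the reference that the comment above describes: the count is bumped under bio_lock before submit_bio() hands the bio to the block layer, so even a device that completes the bio before submit_bio() returns cannot drop the dio's count to zero while submission is still in progress. The same idea in a userspace sketch, where a worker thread plays the role of the interrupt-time completion handler; the names are illustrative, not kernel API.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long refcount = 1;      /* the submitter's own reference */

/* Worker thread: stands in for bio completion dropping its reference. */
static void *complete_one(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&lock);
        refcount--;
        pthread_mutex_unlock(&lock);
        return NULL;
}

/* Take a reference *before* the work is handed off, as dio_bio_submit() does. */
static pthread_t submit_one(void)
{
        pthread_t t;

        pthread_mutex_lock(&lock);
        refcount++;
        pthread_mutex_unlock(&lock);
        pthread_create(&t, NULL, complete_one, NULL);
        return t;
}

int main(void)
{
        pthread_t a = submit_one();
        pthread_t b = submit_one();

        pthread_join(a, NULL);
        pthread_join(b, NULL);
        /* Both "bios" finished; only the submitter's reference remains. */
        printf("refcount = %lu\n", refcount);   /* prints 1 */
        return 0;
}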
@@ -376,28 +373,37 @@ static void dio_cleanup(struct dio *dio)
 }
 
 /*
- * Wait for the next BIO to complete.  Remove it and return it.
+ * Wait for the next BIO to complete.  Remove it and return it.  NULL is
+ * returned once all BIOs have been completed.  This must only be called once
+ * all bios have been issued so that dio->refcount can only decrease.  This
+ * requires that that the caller hold a reference on the dio.
  */
 static struct bio *dio_await_one(struct dio *dio)
 {
         unsigned long flags;
-        struct bio *bio;
+        struct bio *bio = NULL;
 
         spin_lock_irqsave(&dio->bio_lock, flags);
-        while (dio->bio_list == NULL) {
-                set_current_state(TASK_UNINTERRUPTIBLE);
-                if (dio->bio_list == NULL) {
-                        dio->waiter = current;
-                        spin_unlock_irqrestore(&dio->bio_lock, flags);
-                        blk_run_address_space(dio->inode->i_mapping);
-                        io_schedule();
-                        spin_lock_irqsave(&dio->bio_lock, flags);
-                        dio->waiter = NULL;
-                }
-                set_current_state(TASK_RUNNING);
+
+        /*
+         * Wait as long as the list is empty and there are bios in flight.  bio
+         * completion drops the count, maybe adds to the list, and wakes while
+         * holding the bio_lock so we don't need set_current_state()'s barrier
+         * and can call it after testing our condition.
+         */
+        while (dio->refcount > 1 && dio->bio_list == NULL) {
+                __set_current_state(TASK_UNINTERRUPTIBLE);
+                dio->waiter = current;
+                spin_unlock_irqrestore(&dio->bio_lock, flags);
+                io_schedule();
+                /* wake up sets us TASK_RUNNING */
+                spin_lock_irqsave(&dio->bio_lock, flags);
+                dio->waiter = NULL;
+        }
+        if (dio->bio_list) {
+                bio = dio->bio_list;
+                dio->bio_list = bio->bi_private;
         }
-        bio = dio->bio_list;
-        dio->bio_list = bio->bi_private;
         spin_unlock_irqrestore(&dio->bio_lock, flags);
         return bio;
 }
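The rewritten dio_await_one() is a standard sleep/wakeup loop: the condition (refcount above 1 and an empty list) is tested with bio_lock held, the task registers itself as the waiter and only then drops the lock and sleeps, and the completion side changes the count, appends to the list and wakes the waiter under the same lock, so a wakeup cannot slip in between the test and the sleep. In userspace the same discipline falls out of pthread_cond_wait(), which atomically releases the mutex while blocking. A sketch of the equivalent consumer loop, with illustrative names rather than kernel API:

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static unsigned long refcount = 2;      /* submitter plus one outstanding "bio" */
static int bio_list = 0;                /* 0 = empty, 1 = one completed bio queued */

/* Completion side: queue the "bio", drop the ref, wake the waiter, all locked. */
static void *completer(void *arg)
{
        (void)arg;
        sleep(1);                       /* pretend the device took a while */
        pthread_mutex_lock(&lock);
        bio_list = 1;
        refcount--;
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, completer, NULL);

        /* Submission side: wait while bios are in flight and nothing is queued. */
        pthread_mutex_lock(&lock);
        while (refcount > 1 && bio_list == 0)
                pthread_cond_wait(&cond, &lock);        /* drops the lock while asleep */
        printf("woke with bio_list=%d refcount=%lu\n", bio_list, refcount);
        pthread_mutex_unlock(&lock);

        pthread_join(t, NULL);
        return 0;
}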
@@ -426,34 +432,24 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio)
                 }
                 bio_put(bio);
         }
-        finished_one_bio(dio);
         return uptodate ? 0 : -EIO;
 }
 
 /*
- * Wait on and process all in-flight BIOs.
+ * Wait on and process all in-flight BIOs.  This must only be called once
+ * all bios have been issued so that the refcount can only decrease.
+ * This just waits for all bios to make it through dio_bio_complete.  IO
+ * errors are propogated through dio->io_error and should be propogated via
+ * dio_complete().
  */
-static int dio_await_completion(struct dio *dio)
+static void dio_await_completion(struct dio *dio)
 {
-        int ret = 0;
-
-        if (dio->bio)
-                dio_bio_submit(dio);
-
-        /*
-         * The bio_lock is not held for the read of bio_count.
-         * This is ok since it is the dio_bio_complete() that changes
-         * bio_count.
-         */
-        while (dio->bio_count) {
-                struct bio *bio = dio_await_one(dio);
-                int ret2;
-
-                ret2 = dio_bio_complete(dio, bio);
-                if (ret == 0)
-                        ret = ret2;
-        }
-        return ret;
+        struct bio *bio;
+        do {
+                bio = dio_await_one(dio);
+                if (bio)
+                        dio_bio_complete(dio, bio);
+        } while (bio);
 }
 
 /*
@@ -675,6 +671,13 @@ submit_page_section(struct dio *dio, struct page *page,
 {
         int ret = 0;
 
+        if (dio->rw & WRITE) {
+                /*
+                 * Read accounting is performed in submit_bio()
+                 */
+                task_io_account_write(len);
+        }
+
         /*
          * Can we just grow the current page's presence in the dio?
          */
@@ -953,6 +956,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
         struct dio *dio)
 {
         unsigned long user_addr;
+        unsigned long flags;
         int seg;
         ssize_t ret = 0;
         ssize_t ret2;
@@ -983,17 +987,8 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
         dio->iocb = iocb;
         dio->i_size = i_size_read(inode);
 
-        /*
-         * BIO completion state.
-         *
-         * ->bio_count starts out at one, and we decrement it to zero after all
-         * BIOs are submitted.  This to avoid the situation where a really fast
-         * (or synchronous) device could take the count to zero while we're
-         * still submitting BIOs.
-         */
-        dio->bio_count = 1;
-        dio->bios_in_flight = 0;
         spin_lock_init(&dio->bio_lock);
+        dio->refcount = 1;
         dio->bio_list = NULL;
         dio->waiter = NULL;
 
@@ -1069,6 +1064,9 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
         if (dio->bio)
                 dio_bio_submit(dio);
 
+        /* All IO is now issued, send it on its way */
+        blk_run_address_space(inode->i_mapping);
+
         /*
          * It is possible that, we return short IO due to end of file.
          * In that case, we need to release all the pages we got hold on.
@@ -1084,74 +1082,41 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
                 mutex_unlock(&dio->inode->i_mutex);
 
         /*
-         * OK, all BIOs are submitted, so we can decrement bio_count to truly
-         * reflect the number of to-be-processed BIOs.
+         * The only time we want to leave bios in flight is when a successful
+         * partial aio read or full aio write have been setup.  In that case
+         * bio completion will call aio_complete.  The only time it's safe to
+         * call aio_complete is when we return -EIOCBQUEUED, so we key on that.
+         * This had *better* be the only place that raises -EIOCBQUEUED.
          */
-        if (dio->is_async) {
-                int should_wait = 0;
+        BUG_ON(ret == -EIOCBQUEUED);
+        if (dio->is_async && ret == 0 && dio->result &&
+            ((rw & READ) || (dio->result == dio->size)))
+                ret = -EIOCBQUEUED;
 
-                if (dio->result < dio->size && (rw & WRITE)) {
-                        dio->waiter = current;
-                        should_wait = 1;
-                }
-                if (ret == 0)
-                        ret = dio->result;
-                finished_one_bio(dio);          /* This can free the dio */
-                blk_run_address_space(inode->i_mapping);
-                if (should_wait) {
-                        unsigned long flags;
-                        /*
-                         * Wait for already issued I/O to drain out and
-                         * release its references to user-space pages
-                         * before returning to fallback on buffered I/O
-                         */
-
-                        spin_lock_irqsave(&dio->bio_lock, flags);
-                        set_current_state(TASK_UNINTERRUPTIBLE);
-                        while (dio->bio_count) {
-                                spin_unlock_irqrestore(&dio->bio_lock, flags);
-                                io_schedule();
-                                spin_lock_irqsave(&dio->bio_lock, flags);
-                                set_current_state(TASK_UNINTERRUPTIBLE);
-                        }
-                        spin_unlock_irqrestore(&dio->bio_lock, flags);
-                        set_current_state(TASK_RUNNING);
-                        kfree(dio);
-                }
-        } else {
-                ssize_t transferred = 0;
-
-                finished_one_bio(dio);
-                ret2 = dio_await_completion(dio);
-                if (ret == 0)
-                        ret = ret2;
-                if (ret == 0)
-                        ret = dio->page_errors;
-                if (dio->result) {
-                        loff_t i_size = i_size_read(inode);
-
-                        transferred = dio->result;
-                        /*
-                         * Adjust the return value if the read crossed a
-                         * non-block-aligned EOF.
-                         */
-                        if (rw == READ && (offset + transferred > i_size))
-                                transferred = i_size - offset;
-                }
-                dio_complete(dio, offset, transferred);
-                if (ret == 0)
-                        ret = transferred;
+        if (ret != -EIOCBQUEUED)
+                dio_await_completion(dio);
 
-                /* We could have also come here on an AIO file extend */
-                if (!is_sync_kiocb(iocb) && (rw & WRITE) &&
-                        ret >= 0 && dio->result == dio->size)
-                        /*
-                         * For AIO writes where we have completed the
-                         * i/o, we have to mark the the aio complete.
-                         */
-                        aio_complete(iocb, ret, 0);
+        /*
+         * Sync will always be dropping the final ref and completing the
+         * operation.  AIO can if it was a broken operation described above or
+         * in fact if all the bios race to complete before we get here.  In
+         * that case dio_complete() translates the EIOCBQUEUED into the proper
+         * return code that the caller will hand to aio_complete().
+         *
+         * This is managed by the bio_lock instead of being an atomic_t so that
+         * completion paths can drop their ref and use the remaining count to
+         * decide to wake the submission path atomically.
+         */
+        spin_lock_irqsave(&dio->bio_lock, flags);
+        ret2 = --dio->refcount;
+        spin_unlock_irqrestore(&dio->bio_lock, flags);
+        BUG_ON(!dio->is_async && ret2 != 0);
+        if (ret2 == 0) {
+                ret = dio_complete(dio, offset, ret);
                 kfree(dio);
-        }
+        } else
+                BUG_ON(ret != -EIOCBQUEUED);
+
         return ret;
 }
 
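The tail of direct_io_worker() now makes the ownership hand-off explicit: the submitter drops the reference it has held since dio->refcount was initialised to 1, and whoever observes the count reach zero, whether that is this drop for synchronous I/O or a racing bio completion for AIO, calls dio_complete() and frees the dio; -EIOCBQUEUED is raised exactly when bios are being left in flight for dio_bio_end_aio() to finish. A compressed userspace sketch of that decision, with invented helper names standing in for the kernel calls:

#include <stdio.h>

#define EIOCBQUEUED 529         /* numeric value borrowed for illustration only */

/*
 * Illustrative only: finish_submission() stands in for the locked
 * "--dio->refcount" at the end of direct_io_worker(), and the printf
 * calls stand in for dio_complete() + kfree().
 */
static long finish_submission(int is_async, int fully_submitted,
                              unsigned long *refcount, long ret)
{
        unsigned long remaining;

        /* Successfully set up AIO keys its return code on -EIOCBQUEUED. */
        if (is_async && ret == 0 && fully_submitted)
                ret = -EIOCBQUEUED;

        remaining = --(*refcount);      /* the submitter's own reference */
        if (remaining == 0) {
                printf("submitter completes the dio\n");
                /* dio_complete() would translate -EIOCBQUEUED back to a byte count */
                if (ret == -EIOCBQUEUED)
                        ret = 0;
        } else {
                printf("bios still in flight, completion will finish the dio\n");
        }
        return ret;
}

int main(void)
{
        unsigned long sync_ref = 1;     /* sync: dio_await_completion() already drained the bios */
        unsigned long aio_ref = 3;      /* aio: two bios still hold references */

        finish_submission(0, 1, &sync_ref, 0);  /* sync always drops the last ref */
        finish_submission(1, 1, &aio_ref, 0);   /* aio returns -EIOCBQUEUED here */
        return 0;
}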