Diffstat (limited to 'fs/direct-io.c')
-rw-r--r--	fs/direct-io.c	323
1 file changed, 144 insertions, 179 deletions
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 5981e17f46f0..d9d0833444f5 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -27,6 +27,7 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
+#include <linux/task_io_accounting_ops.h>
 #include <linux/bio.h>
 #include <linux/wait.h>
 #include <linux/err.h>
@@ -121,8 +122,7 @@ struct dio {
 
 	/* BIO completion state */
 	spinlock_t bio_lock;		/* protects BIO fields below */
-	int bio_count;			/* nr bios to be completed */
-	int bios_in_flight;		/* nr bios in flight */
+	unsigned long refcount;		/* direct_io_worker() and bios */
 	struct bio *bio_list;		/* singly linked via bi_private */
 	struct task_struct *waiter;	/* waiting task (NULL if none) */
 
@@ -209,76 +209,55 @@ static struct page *dio_get_page(struct dio *dio)
 	return dio->pages[dio->head++];
 }
 
-/*
- * Called when all DIO BIO I/O has been completed - let the filesystem
- * know, if it registered an interest earlier via get_block. Pass the
- * private field of the map buffer_head so that filesystems can use it
- * to hold additional state between get_block calls and dio_complete.
- */
-static void dio_complete(struct dio *dio, loff_t offset, ssize_t bytes)
-{
-	if (dio->end_io && dio->result)
-		dio->end_io(dio->iocb, offset, bytes, dio->map_bh.b_private);
-	if (dio->lock_type == DIO_LOCKING)
-		/* lockdep: non-owner release */
-		up_read_non_owner(&dio->inode->i_alloc_sem);
-}
-
-/*
- * Called when a BIO has been processed. If the count goes to zero then IO is
- * complete and we can signal this to the AIO layer.
+/**
+ * dio_complete() - called when all DIO BIO I/O has been completed
+ * @offset: the byte offset in the file of the completed operation
+ *
+ * This releases locks as dictated by the locking type, lets interested parties
+ * know that a DIO operation has completed, and calculates the resulting return
+ * code for the operation.
+ *
+ * It lets the filesystem know if it registered an interest earlier via
+ * get_block. Pass the private field of the map buffer_head so that
+ * filesystems can use it to hold additional state between get_block calls and
+ * dio_complete.
  */
-static void finished_one_bio(struct dio *dio)
+static int dio_complete(struct dio *dio, loff_t offset, int ret)
 {
-	unsigned long flags;
+	ssize_t transferred = 0;
 
-	spin_lock_irqsave(&dio->bio_lock, flags);
-	if (dio->bio_count == 1) {
-		if (dio->is_async) {
-			ssize_t transferred;
-			loff_t offset;
-
-			/*
-			 * Last reference to the dio is going away.
-			 * Drop spinlock and complete the DIO.
-			 */
-			spin_unlock_irqrestore(&dio->bio_lock, flags);
+	/*
+	 * AIO submission can race with bio completion to get here while
+	 * expecting to have the last io completed by bio completion.
+	 * In that case -EIOCBQUEUED is in fact not an error we want
+	 * to preserve through this call.
+	 */
+	if (ret == -EIOCBQUEUED)
+		ret = 0;
 
-			/* Check for short read case */
-			transferred = dio->result;
-			offset = dio->iocb->ki_pos;
+	if (dio->result) {
+		transferred = dio->result;
 
-			if ((dio->rw == READ) &&
-			    ((offset + transferred) > dio->i_size))
-				transferred = dio->i_size - offset;
+		/* Check for short read case */
+		if ((dio->rw == READ) && ((offset + transferred) > dio->i_size))
+			transferred = dio->i_size - offset;
+	}
 
-			/* check for error in completion path */
-			if (dio->io_error)
-				transferred = dio->io_error;
+	if (dio->end_io && dio->result)
+		dio->end_io(dio->iocb, offset, transferred,
+			dio->map_bh.b_private);
+	if (dio->lock_type == DIO_LOCKING)
+		/* lockdep: non-owner release */
+		up_read_non_owner(&dio->inode->i_alloc_sem);
 
-			dio_complete(dio, offset, transferred);
+	if (ret == 0)
+		ret = dio->page_errors;
+	if (ret == 0)
+		ret = dio->io_error;
+	if (ret == 0)
+		ret = transferred;
 
-			/* Complete AIO later if falling back to buffered i/o */
-			if (dio->result == dio->size ||
-			    ((dio->rw == READ) && dio->result)) {
-				aio_complete(dio->iocb, transferred, 0);
-				kfree(dio);
-				return;
-			} else {
-				/*
-				 * Falling back to buffered
-				 */
-				spin_lock_irqsave(&dio->bio_lock, flags);
-				dio->bio_count--;
-				if (dio->waiter)
-					wake_up_process(dio->waiter);
-				spin_unlock_irqrestore(&dio->bio_lock, flags);
-				return;
-			}
-		}
-	}
-	dio->bio_count--;
-	spin_unlock_irqrestore(&dio->bio_lock, flags);
+	return ret;
 }
 
 static int dio_bio_complete(struct dio *dio, struct bio *bio);
@@ -288,12 +267,27 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio);
 static int dio_bio_end_aio(struct bio *bio, unsigned int bytes_done, int error)
 {
 	struct dio *dio = bio->bi_private;
+	unsigned long remaining;
+	unsigned long flags;
 
 	if (bio->bi_size)
 		return 1;
 
 	/* cleanup the bio */
 	dio_bio_complete(dio, bio);
+
+	spin_lock_irqsave(&dio->bio_lock, flags);
+	remaining = --dio->refcount;
+	if (remaining == 1 && dio->waiter)
+		wake_up_process(dio->waiter);
+	spin_unlock_irqrestore(&dio->bio_lock, flags);
+
+	if (remaining == 0) {
+		int ret = dio_complete(dio, dio->iocb->ki_pos, 0);
+		aio_complete(dio->iocb, ret, 0);
+		kfree(dio);
+	}
+
 	return 0;
 }
 
@@ -315,8 +309,7 @@ static int dio_bio_end_io(struct bio *bio, unsigned int bytes_done, int error)
 	spin_lock_irqsave(&dio->bio_lock, flags);
 	bio->bi_private = dio->bio_list;
 	dio->bio_list = bio;
-	dio->bios_in_flight--;
-	if (dio->waiter && dio->bios_in_flight == 0)
+	if (--dio->refcount == 1 && dio->waiter)
 		wake_up_process(dio->waiter);
 	spin_unlock_irqrestore(&dio->bio_lock, flags);
 	return 0;
@@ -347,6 +340,8 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
  * In the AIO read case we speculatively dirty the pages before starting IO.
  * During IO completion, any of these pages which happen to have been written
  * back will be redirtied by bio_check_pages_dirty().
+ *
+ * bios hold a dio reference between submit_bio and ->end_io.
  */
 static void dio_bio_submit(struct dio *dio)
 {
@@ -354,12 +349,14 @@ static void dio_bio_submit(struct dio *dio)
 	unsigned long flags;
 
 	bio->bi_private = dio;
+
 	spin_lock_irqsave(&dio->bio_lock, flags);
-	dio->bio_count++;
-	dio->bios_in_flight++;
+	dio->refcount++;
 	spin_unlock_irqrestore(&dio->bio_lock, flags);
+
 	if (dio->is_async && dio->rw == READ)
 		bio_set_pages_dirty(bio);
+
 	submit_bio(dio->rw, bio);
 
 	dio->bio = NULL;
@@ -376,28 +373,37 @@ static void dio_cleanup(struct dio *dio)
 }
 
 /*
- * Wait for the next BIO to complete. Remove it and return it.
+ * Wait for the next BIO to complete. Remove it and return it. NULL is
+ * returned once all BIOs have been completed. This must only be called once
+ * all bios have been issued so that dio->refcount can only decrease. This
+ * requires that the caller hold a reference on the dio.
  */
 static struct bio *dio_await_one(struct dio *dio)
 {
 	unsigned long flags;
-	struct bio *bio;
+	struct bio *bio = NULL;
 
 	spin_lock_irqsave(&dio->bio_lock, flags);
-	while (dio->bio_list == NULL) {
-		set_current_state(TASK_UNINTERRUPTIBLE);
-		if (dio->bio_list == NULL) {
-			dio->waiter = current;
-			spin_unlock_irqrestore(&dio->bio_lock, flags);
-			blk_run_address_space(dio->inode->i_mapping);
-			io_schedule();
-			spin_lock_irqsave(&dio->bio_lock, flags);
-			dio->waiter = NULL;
-		}
-		set_current_state(TASK_RUNNING);
+
+	/*
+	 * Wait as long as the list is empty and there are bios in flight. bio
+	 * completion drops the count, maybe adds to the list, and wakes while
+	 * holding the bio_lock so we don't need set_current_state()'s barrier
+	 * and can call it after testing our condition.
+	 */
+	while (dio->refcount > 1 && dio->bio_list == NULL) {
+		__set_current_state(TASK_UNINTERRUPTIBLE);
+		dio->waiter = current;
+		spin_unlock_irqrestore(&dio->bio_lock, flags);
+		io_schedule();
+		/* wake up sets us TASK_RUNNING */
+		spin_lock_irqsave(&dio->bio_lock, flags);
+		dio->waiter = NULL;
+	}
+	if (dio->bio_list) {
+		bio = dio->bio_list;
+		dio->bio_list = bio->bi_private;
 	}
-	bio = dio->bio_list;
-	dio->bio_list = bio->bi_private;
 	spin_unlock_irqrestore(&dio->bio_lock, flags);
 	return bio;
 }
@@ -426,34 +432,24 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio)
 		}
 		bio_put(bio);
 	}
-	finished_one_bio(dio);
 	return uptodate ? 0 : -EIO;
 }
 
 /*
- * Wait on and process all in-flight BIOs.
+ * Wait on and process all in-flight BIOs. This must only be called once
+ * all bios have been issued so that the refcount can only decrease.
+ * This just waits for all bios to make it through dio_bio_complete. IO
+ * errors are propagated through dio->io_error and should be propagated via
+ * dio_complete().
  */
-static int dio_await_completion(struct dio *dio)
+static void dio_await_completion(struct dio *dio)
 {
-	int ret = 0;
-
-	if (dio->bio)
-		dio_bio_submit(dio);
-
-	/*
-	 * The bio_lock is not held for the read of bio_count.
-	 * This is ok since it is the dio_bio_complete() that changes
-	 * bio_count.
-	 */
-	while (dio->bio_count) {
-		struct bio *bio = dio_await_one(dio);
-		int ret2;
-
-		ret2 = dio_bio_complete(dio, bio);
-		if (ret == 0)
-			ret = ret2;
-	}
-	return ret;
+	struct bio *bio;
+	do {
+		bio = dio_await_one(dio);
+		if (bio)
+			dio_bio_complete(dio, bio);
+	} while (bio);
 }
 
 /*
@@ -675,6 +671,13 @@ submit_page_section(struct dio *dio, struct page *page,
 {
 	int ret = 0;
 
+	if (dio->rw & WRITE) {
+		/*
+		 * Read accounting is performed in submit_bio()
+		 */
+		task_io_account_write(len);
+	}
+
 	/*
 	 * Can we just grow the current page's presence in the dio?
 	 */
@@ -953,6 +956,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	struct dio *dio)
 {
 	unsigned long user_addr;
+	unsigned long flags;
 	int seg;
 	ssize_t ret = 0;
 	ssize_t ret2;
@@ -983,17 +987,8 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	dio->iocb = iocb;
 	dio->i_size = i_size_read(inode);
 
-	/*
-	 * BIO completion state.
-	 *
-	 * ->bio_count starts out at one, and we decrement it to zero after all
-	 * BIOs are submitted. This to avoid the situation where a really fast
-	 * (or synchronous) device could take the count to zero while we're
-	 * still submitting BIOs.
-	 */
-	dio->bio_count = 1;
-	dio->bios_in_flight = 0;
 	spin_lock_init(&dio->bio_lock);
+	dio->refcount = 1;
 	dio->bio_list = NULL;
 	dio->waiter = NULL;
 
@@ -1069,6 +1064,9 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 	if (dio->bio)
 		dio_bio_submit(dio);
 
+	/* All IO is now issued, send it on its way */
+	blk_run_address_space(inode->i_mapping);
+
 	/*
 	 * It is possible that, we return short IO due to end of file.
 	 * In that case, we need to release all the pages we got hold on.
@@ -1084,74 +1082,41 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 		mutex_unlock(&dio->inode->i_mutex);
 
 	/*
-	 * OK, all BIOs are submitted, so we can decrement bio_count to truly
-	 * reflect the number of to-be-processed BIOs.
+	 * The only time we want to leave bios in flight is when a successful
+	 * partial aio read or full aio write has been set up. In that case
+	 * bio completion will call aio_complete. The only time it's safe to
+	 * call aio_complete is when we return -EIOCBQUEUED, so we key on that.
+	 * This had *better* be the only place that raises -EIOCBQUEUED.
 	 */
-	if (dio->is_async) {
-		int should_wait = 0;
+	BUG_ON(ret == -EIOCBQUEUED);
+	if (dio->is_async && ret == 0 && dio->result &&
+	    ((rw & READ) || (dio->result == dio->size)))
+		ret = -EIOCBQUEUED;
 
-		if (dio->result < dio->size && (rw & WRITE)) {
-			dio->waiter = current;
-			should_wait = 1;
-		}
-		if (ret == 0)
-			ret = dio->result;
-		finished_one_bio(dio);		/* This can free the dio */
-		blk_run_address_space(inode->i_mapping);
-		if (should_wait) {
-			unsigned long flags;
-			/*
-			 * Wait for already issued I/O to drain out and
-			 * release its references to user-space pages
-			 * before returning to fallback on buffered I/O
-			 */
-
-			spin_lock_irqsave(&dio->bio_lock, flags);
-			set_current_state(TASK_UNINTERRUPTIBLE);
-			while (dio->bio_count) {
-				spin_unlock_irqrestore(&dio->bio_lock, flags);
-				io_schedule();
-				spin_lock_irqsave(&dio->bio_lock, flags);
-				set_current_state(TASK_UNINTERRUPTIBLE);
-			}
-			spin_unlock_irqrestore(&dio->bio_lock, flags);
-			set_current_state(TASK_RUNNING);
-			kfree(dio);
-		}
-	} else {
-		ssize_t transferred = 0;
-
-		finished_one_bio(dio);
-		ret2 = dio_await_completion(dio);
-		if (ret == 0)
-			ret = ret2;
-		if (ret == 0)
-			ret = dio->page_errors;
-		if (dio->result) {
-			loff_t i_size = i_size_read(inode);
-
-			transferred = dio->result;
-			/*
-			 * Adjust the return value if the read crossed a
-			 * non-block-aligned EOF.
-			 */
-			if (rw == READ && (offset + transferred > i_size))
-				transferred = i_size - offset;
-		}
-		dio_complete(dio, offset, transferred);
-		if (ret == 0)
-			ret = transferred;
+	if (ret != -EIOCBQUEUED)
+		dio_await_completion(dio);
 
-		/* We could have also come here on an AIO file extend */
-		if (!is_sync_kiocb(iocb) && (rw & WRITE) &&
-		    ret >= 0 && dio->result == dio->size)
-			/*
-			 * For AIO writes where we have completed the
-			 * i/o, we have to mark the the aio complete.
-			 */
-			aio_complete(iocb, ret, 0);
+	/*
+	 * Sync will always be dropping the final ref and completing the
+	 * operation. AIO can if it was a broken operation described above or
+	 * in fact if all the bios race to complete before we get here. In
+	 * that case dio_complete() translates the EIOCBQUEUED into the proper
+	 * return code that the caller will hand to aio_complete().
+	 *
+	 * This is managed by the bio_lock instead of being an atomic_t so that
+	 * completion paths can drop their ref and use the remaining count to
+	 * decide to wake the submission path atomically.
+	 */
+	spin_lock_irqsave(&dio->bio_lock, flags);
+	ret2 = --dio->refcount;
+	spin_unlock_irqrestore(&dio->bio_lock, flags);
+	BUG_ON(!dio->is_async && ret2 != 0);
+	if (ret2 == 0) {
+		ret = dio_complete(dio, offset, ret);
 		kfree(dio);
-	}
+	} else
+		BUG_ON(ret != -EIOCBQUEUED);
+
 	return ret;
 }
 
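Note on the completion scheme this patch adopts: the submitter holds one reference on the dio, each submitted bio holds another, and whichever path drops the count to zero performs completion and frees the structure. The following user-space sketch is not part of the patch; names such as fake_dio_get/fake_dio_put are invented for illustration, and a pthread mutex stands in for dio->bio_lock.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

/* Illustrative only: a lock-protected refcount shared by a submitter
 * and its completion callbacks, mirroring dio->refcount in the patch. */
struct fake_dio {
	pthread_mutex_t lock;		/* stands in for dio->bio_lock */
	unsigned long refcount;		/* submitter + in-flight I/Os */
};

static struct fake_dio *fake_dio_alloc(void)
{
	struct fake_dio *dio = malloc(sizeof(*dio));
	pthread_mutex_init(&dio->lock, NULL);
	dio->refcount = 1;		/* the submitter's reference */
	return dio;
}

static void fake_dio_get(struct fake_dio *dio)
{
	pthread_mutex_lock(&dio->lock);
	dio->refcount++;		/* taken before each "submit_bio" */
	pthread_mutex_unlock(&dio->lock);
}

/* Returns nonzero if the caller dropped the last reference and therefore
 * completed and freed the dio, as dio_bio_end_aio() does for AIO. */
static int fake_dio_put(struct fake_dio *dio)
{
	unsigned long remaining;

	pthread_mutex_lock(&dio->lock);
	remaining = --dio->refcount;
	pthread_mutex_unlock(&dio->lock);
	if (remaining == 0) {
		pthread_mutex_destroy(&dio->lock);
		free(dio);
		return 1;
	}
	return 0;
}

int main(void)
{
	struct fake_dio *dio = fake_dio_alloc();

	fake_dio_get(dio);		/* a "bio" is submitted */
	fake_dio_put(dio);		/* the "bio" completes */
	if (fake_dio_put(dio))		/* submitter drops its ref last */
		printf("last reference dropped, completed\n");
	return 0;
}

Keeping the count under the spinlock rather than in an atomic_t matches the patch's reasoning: completion paths can drop their reference and, in the same critical section, use the remaining count to decide whether to wake the waiting submitter.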