Diffstat (limited to 'fs/direct-io.c')
-rw-r--r--   fs/direct-io.c   323
1 file changed, 144 insertions(+), 179 deletions(-)
diff --git a/fs/direct-io.c b/fs/direct-io.c
index 5981e17f46f0..d9d0833444f5 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -27,6 +27,7 @@
 #include <linux/slab.h>
 #include <linux/highmem.h>
 #include <linux/pagemap.h>
+#include <linux/task_io_accounting_ops.h>
 #include <linux/bio.h>
 #include <linux/wait.h>
 #include <linux/err.h>
@@ -121,8 +122,7 @@ struct dio {
 
         /* BIO completion state */
         spinlock_t bio_lock;            /* protects BIO fields below */
-        int bio_count;                  /* nr bios to be completed */
-        int bios_in_flight;             /* nr bios in flight */
+        unsigned long refcount;         /* direct_io_worker() and bios */
         struct bio *bio_list;           /* singly linked via bi_private */
         struct task_struct *waiter;     /* waiting task (NULL if none) */
 
@@ -209,76 +209,55 @@ static struct page *dio_get_page(struct dio *dio)
         return dio->pages[dio->head++];
 }
 
-/*
- * Called when all DIO BIO I/O has been completed - let the filesystem
- * know, if it registered an interest earlier via get_block. Pass the
- * private field of the map buffer_head so that filesystems can use it
- * to hold additional state between get_block calls and dio_complete.
- */
-static void dio_complete(struct dio *dio, loff_t offset, ssize_t bytes)
-{
-        if (dio->end_io && dio->result)
-                dio->end_io(dio->iocb, offset, bytes, dio->map_bh.b_private);
-        if (dio->lock_type == DIO_LOCKING)
-                /* lockdep: non-owner release */
-                up_read_non_owner(&dio->inode->i_alloc_sem);
-}
-
-/*
- * Called when a BIO has been processed. If the count goes to zero then IO is
- * complete and we can signal this to the AIO layer.
+/**
+ * dio_complete() - called when all DIO BIO I/O has been completed
+ * @offset: the byte offset in the file of the completed operation
+ *
+ * This releases locks as dictated by the locking type, lets interested parties
+ * know that a DIO operation has completed, and calculates the resulting return
+ * code for the operation.
+ *
+ * It lets the filesystem know if it registered an interest earlier via
+ * get_block. Pass the private field of the map buffer_head so that
+ * filesystems can use it to hold additional state between get_block calls and
+ * dio_complete.
  */
-static void finished_one_bio(struct dio *dio)
+static int dio_complete(struct dio *dio, loff_t offset, int ret)
 {
-        unsigned long flags;
+        ssize_t transferred = 0;
 
-        spin_lock_irqsave(&dio->bio_lock, flags);
-        if (dio->bio_count == 1) {
-                if (dio->is_async) {
-                        ssize_t transferred;
-                        loff_t offset;
-
-                        /*
-                         * Last reference to the dio is going away.
-                         * Drop spinlock and complete the DIO.
-                         */
-                        spin_unlock_irqrestore(&dio->bio_lock, flags);
+        /*
+         * AIO submission can race with bio completion to get here while
+         * expecting to have the last io completed by bio completion.
+         * In that case -EIOCBQUEUED is in fact not an error we want
+         * to preserve through this call.
+         */
+        if (ret == -EIOCBQUEUED)
+                ret = 0;
 
-                        /* Check for short read case */
-                        transferred = dio->result;
-                        offset = dio->iocb->ki_pos;
+        if (dio->result) {
+                transferred = dio->result;
 
-                        if ((dio->rw == READ) &&
-                            ((offset + transferred) > dio->i_size))
-                                transferred = dio->i_size - offset;
+                /* Check for short read case */
+                if ((dio->rw == READ) && ((offset + transferred) > dio->i_size))
+                        transferred = dio->i_size - offset;
+        }
 
-                        /* check for error in completion path */
-                        if (dio->io_error)
-                                transferred = dio->io_error;
+        if (dio->end_io && dio->result)
+                dio->end_io(dio->iocb, offset, transferred,
+                        dio->map_bh.b_private);
+        if (dio->lock_type == DIO_LOCKING)
+                /* lockdep: non-owner release */
+                up_read_non_owner(&dio->inode->i_alloc_sem);
 
-                        dio_complete(dio, offset, transferred);
+        if (ret == 0)
+                ret = dio->page_errors;
+        if (ret == 0)
+                ret = dio->io_error;
+        if (ret == 0)
+                ret = transferred;
 
-                        /* Complete AIO later if falling back to buffered i/o */
-                        if (dio->result == dio->size ||
-                            ((dio->rw == READ) && dio->result)) {
-                                aio_complete(dio->iocb, transferred, 0);
-                                kfree(dio);
-                                return;
-                        } else {
-                                /*
-                                 * Falling back to buffered
-                                 */
-                                spin_lock_irqsave(&dio->bio_lock, flags);
-                                dio->bio_count--;
-                                if (dio->waiter)
-                                        wake_up_process(dio->waiter);
-                                spin_unlock_irqrestore(&dio->bio_lock, flags);
-                                return;
-                        }
-                }
-        }
-        dio->bio_count--;
-        spin_unlock_irqrestore(&dio->bio_lock, flags);
+        return ret;
 }
 
 static int dio_bio_complete(struct dio *dio, struct bio *bio);
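The new dio_complete() collapses the old per-path return handling into one precedence chain: an explicit error passed in by the caller wins, then page faults recorded while pinning user pages, then bio-level I/O errors, and only then the byte count that was actually transferred. A minimal userspace sketch of that chain follows; pick_return_code, page_errors, io_error and transferred are illustrative stand-ins for the dio fields, not kernel API.

#include <stdio.h>

/* Illustrative only: mirrors the "if (ret == 0) ret = ..." chain above. */
static long pick_return_code(long ret, long page_errors, long io_error,
                             long transferred)
{
        if (ret == 0)
                ret = page_errors;      /* faults while pinning user pages */
        if (ret == 0)
                ret = io_error;         /* errors reported by bio completion */
        if (ret == 0)
                ret = transferred;      /* success: bytes actually moved */
        return ret;
}

int main(void)
{
        printf("%ld\n", pick_return_code(0, 0, 0, 4096));      /* 4096 */
        printf("%ld\n", pick_return_code(0, 0, -5, 4096));     /* -5, i.e. -EIO */
        printf("%ld\n", pick_return_code(0, -14, -5, 4096));   /* -14, i.e. -EFAULT */
        return 0;
}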
@@ -288,12 +267,27 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio);
 static int dio_bio_end_aio(struct bio *bio, unsigned int bytes_done, int error)
 {
         struct dio *dio = bio->bi_private;
+        unsigned long remaining;
+        unsigned long flags;
 
         if (bio->bi_size)
                 return 1;
 
         /* cleanup the bio */
         dio_bio_complete(dio, bio);
+
+        spin_lock_irqsave(&dio->bio_lock, flags);
+        remaining = --dio->refcount;
+        if (remaining == 1 && dio->waiter)
+                wake_up_process(dio->waiter);
+        spin_unlock_irqrestore(&dio->bio_lock, flags);
+
+        if (remaining == 0) {
+                int ret = dio_complete(dio, dio->iocb->ki_pos, 0);
+                aio_complete(dio->iocb, ret, 0);
+                kfree(dio);
+        }
+
         return 0;
 }
 
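The AIO completion path above is the classic last-reference-wins pattern: each bio drops one reference under bio_lock, a drop to 1 means only the submitting task's reference is left (so wake it if it is sleeping), and a drop to 0 means nobody else can reach the dio and the dropper must complete and free it. A small pthread sketch of the same pattern, with a mutex standing in for bio_lock and a condition variable standing in for wake_up_process(); all names here are illustrative, not kernel API.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct fake_dio {
        pthread_mutex_t lock;           /* stands in for dio->bio_lock */
        pthread_cond_t  waiter;         /* stands in for wake_up_process(dio->waiter) */
        unsigned long   refcount;       /* submitter holds 1, each "bio" holds 1 */
};

/* Called once per completed "bio": drop a reference, wake or complete. */
static void fake_bio_end(struct fake_dio *dio)
{
        unsigned long remaining;

        pthread_mutex_lock(&dio->lock);
        remaining = --dio->refcount;
        if (remaining == 1)
                pthread_cond_signal(&dio->waiter);  /* only the submitter is left */
        pthread_mutex_unlock(&dio->lock);

        if (remaining == 0) {
                /* last reference: nobody else can see the dio, complete and free */
                printf("completing and freeing dio\n");
                pthread_cond_destroy(&dio->waiter);
                pthread_mutex_destroy(&dio->lock);
                free(dio);
        }
}

int main(void)
{
        struct fake_dio *dio = malloc(sizeof(*dio));

        pthread_mutex_init(&dio->lock, NULL);
        pthread_cond_init(&dio->waiter, NULL);
        dio->refcount = 3;              /* submitter plus two "bios" */

        fake_bio_end(dio);              /* 3 -> 2 */
        fake_bio_end(dio);              /* 2 -> 1, would wake a sleeping submitter */
        fake_bio_end(dio);              /* 1 -> 0: the submitter's own drop, completes and frees */
        return 0;
}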
@@ -315,8 +309,7 @@ static int dio_bio_end_io(struct bio *bio, unsigned int bytes_done, int error)
         spin_lock_irqsave(&dio->bio_lock, flags);
         bio->bi_private = dio->bio_list;
         dio->bio_list = bio;
-        dio->bios_in_flight--;
-        if (dio->waiter && dio->bios_in_flight == 0)
+        if (--dio->refcount == 1 && dio->waiter)
                 wake_up_process(dio->waiter);
         spin_unlock_irqrestore(&dio->bio_lock, flags);
         return 0;
@@ -347,6 +340,8 @@ dio_bio_alloc(struct dio *dio, struct block_device *bdev,
  * In the AIO read case we speculatively dirty the pages before starting IO.
  * During IO completion, any of these pages which happen to have been written
  * back will be redirtied by bio_check_pages_dirty().
+ *
+ * bios hold a dio reference between submit_bio and ->end_io.
  */
 static void dio_bio_submit(struct dio *dio)
 {
@@ -354,12 +349,14 @@ static void dio_bio_submit(struct dio *dio)
         unsigned long flags;
 
         bio->bi_private = dio;
+
         spin_lock_irqsave(&dio->bio_lock, flags);
-        dio->bio_count++;
-        dio->bios_in_flight++;
+        dio->refcount++;
         spin_unlock_irqrestore(&dio->bio_lock, flags);
+
         if (dio->is_async && dio->rw == READ)
                 bio_set_pages_dirty(bio);
+
         submit_bio(dio->rw, bio);
 
         dio->bio = NULL;
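dio_bio_submit() takes the reference that the comment above describes: the count is bumped under bio_lock before submit_bio() hands the bio to the block layer, so even a device that completes the bio before submit_bio() returns cannot drop the dio's count to zero while submission is still in progress. The same idea in a userspace sketch, where a worker thread plays the role of the interrupt-time completion handler; the names are illustrative, not kernel API.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long refcount = 1;      /* the submitter's own reference */

/* Worker thread: stands in for bio completion dropping its reference. */
static void *complete_one(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&lock);
        refcount--;
        pthread_mutex_unlock(&lock);
        return NULL;
}

/* Take a reference *before* the work is handed off, as dio_bio_submit() does. */
static pthread_t submit_one(void)
{
        pthread_t t;

        pthread_mutex_lock(&lock);
        refcount++;
        pthread_mutex_unlock(&lock);
        pthread_create(&t, NULL, complete_one, NULL);
        return t;
}

int main(void)
{
        pthread_t a = submit_one();
        pthread_t b = submit_one();

        pthread_join(a, NULL);
        pthread_join(b, NULL);
        /* Both "bios" finished; only the submitter's reference remains. */
        printf("refcount = %lu\n", refcount);   /* prints 1 */
        return 0;
}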
@@ -376,28 +373,37 @@ static void dio_cleanup(struct dio *dio)
 }
 
 /*
- * Wait for the next BIO to complete.  Remove it and return it.
+ * Wait for the next BIO to complete.  Remove it and return it.  NULL is
+ * returned once all BIOs have been completed.  This must only be called once
+ * all bios have been issued so that dio->refcount can only decrease.  This
+ * requires that that the caller hold a reference on the dio.
  */
 static struct bio *dio_await_one(struct dio *dio)
 {
         unsigned long flags;
-        struct bio *bio;
+        struct bio *bio = NULL;
 
         spin_lock_irqsave(&dio->bio_lock, flags);
-        while (dio->bio_list == NULL) {
-                set_current_state(TASK_UNINTERRUPTIBLE);
-                if (dio->bio_list == NULL) {
-                        dio->waiter = current;
-                        spin_unlock_irqrestore(&dio->bio_lock, flags);
-                        blk_run_address_space(dio->inode->i_mapping);
-                        io_schedule();
-                        spin_lock_irqsave(&dio->bio_lock, flags);
-                        dio->waiter = NULL;
-                }
-                set_current_state(TASK_RUNNING);
+
+        /*
+         * Wait as long as the list is empty and there are bios in flight.  bio
+         * completion drops the count, maybe adds to the list, and wakes while
+         * holding the bio_lock so we don't need set_current_state()'s barrier
+         * and can call it after testing our condition.
+         */
+        while (dio->refcount > 1 && dio->bio_list == NULL) {
+                __set_current_state(TASK_UNINTERRUPTIBLE);
+                dio->waiter = current;
+                spin_unlock_irqrestore(&dio->bio_lock, flags);
+                io_schedule();
+                /* wake up sets us TASK_RUNNING */
+                spin_lock_irqsave(&dio->bio_lock, flags);
+                dio->waiter = NULL;
+        }
+        if (dio->bio_list) {
+                bio = dio->bio_list;
+                dio->bio_list = bio->bi_private;
         }
-        bio = dio->bio_list;
-        dio->bio_list = bio->bi_private;
         spin_unlock_irqrestore(&dio->bio_lock, flags);
         return bio;
 }
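The rewritten dio_await_one() is a standard sleep/wakeup loop: the condition (refcount above 1 and an empty list) is tested with bio_lock held, the task registers itself as the waiter and only then drops the lock and sleeps, and the completion side changes the count, appends to the list and wakes the waiter under the same lock, so a wakeup cannot slip in between the test and the sleep. In userspace the same discipline falls out of pthread_cond_wait(), which atomically releases the mutex while blocking. A sketch of the equivalent consumer loop, with illustrative names rather than kernel API:

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  cond = PTHREAD_COND_INITIALIZER;
static unsigned long refcount = 2;      /* submitter plus one outstanding "bio" */
static int bio_list = 0;                /* 0 = empty, 1 = one completed bio queued */

/* Completion side: queue the "bio", drop the ref, wake the waiter, all locked. */
static void *completer(void *arg)
{
        (void)arg;
        sleep(1);                       /* pretend the device took a while */
        pthread_mutex_lock(&lock);
        bio_list = 1;
        refcount--;
        pthread_cond_signal(&cond);
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, completer, NULL);

        /* Submission side: wait while bios are in flight and nothing is queued. */
        pthread_mutex_lock(&lock);
        while (refcount > 1 && bio_list == 0)
                pthread_cond_wait(&cond, &lock);        /* drops the lock while asleep */
        printf("woke with bio_list=%d refcount=%lu\n", bio_list, refcount);
        pthread_mutex_unlock(&lock);

        pthread_join(t, NULL);
        return 0;
}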
@@ -426,34 +432,24 @@ static int dio_bio_complete(struct dio *dio, struct bio *bio)
                 }
                 bio_put(bio);
         }
-        finished_one_bio(dio);
         return uptodate ? 0 : -EIO;
 }
 
 /*
- * Wait on and process all in-flight BIOs.
+ * Wait on and process all in-flight BIOs.  This must only be called once
+ * all bios have been issued so that the refcount can only decrease.
+ * This just waits for all bios to make it through dio_bio_complete.  IO
+ * errors are propogated through dio->io_error and should be propogated via
+ * dio_complete().
  */
-static int dio_await_completion(struct dio *dio)
+static void dio_await_completion(struct dio *dio)
 {
-        int ret = 0;
-
-        if (dio->bio)
-                dio_bio_submit(dio);
-
-        /*
-         * The bio_lock is not held for the read of bio_count.
-         * This is ok since it is the dio_bio_complete() that changes
-         * bio_count.
-         */
-        while (dio->bio_count) {
-                struct bio *bio = dio_await_one(dio);
-                int ret2;
-
-                ret2 = dio_bio_complete(dio, bio);
-                if (ret == 0)
-                        ret = ret2;
-        }
-        return ret;
+        struct bio *bio;
+        do {
+                bio = dio_await_one(dio);
+                if (bio)
+                        dio_bio_complete(dio, bio);
+        } while (bio);
 }
 
 /*
@@ -675,6 +671,13 @@ submit_page_section(struct dio *dio, struct page *page,
 {
         int ret = 0;
 
+        if (dio->rw & WRITE) {
+                /*
+                 * Read accounting is performed in submit_bio()
+                 */
+                task_io_account_write(len);
+        }
+
         /*
          * Can we just grow the current page's presence in the dio?
          */
@@ -953,6 +956,7 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
         struct dio *dio)
 {
         unsigned long user_addr;
+        unsigned long flags;
         int seg;
         ssize_t ret = 0;
         ssize_t ret2;
@@ -983,17 +987,8 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
         dio->iocb = iocb;
         dio->i_size = i_size_read(inode);
 
-        /*
-         * BIO completion state.
-         *
-         * ->bio_count starts out at one, and we decrement it to zero after all
-         * BIOs are submitted.  This to avoid the situation where a really fast
-         * (or synchronous) device could take the count to zero while we're
-         * still submitting BIOs.
-         */
-        dio->bio_count = 1;
-        dio->bios_in_flight = 0;
         spin_lock_init(&dio->bio_lock);
+        dio->refcount = 1;
         dio->bio_list = NULL;
         dio->waiter = NULL;
 
@@ -1069,6 +1064,9 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
         if (dio->bio)
                 dio_bio_submit(dio);
 
+        /* All IO is now issued, send it on its way */
+        blk_run_address_space(inode->i_mapping);
+
         /*
          * It is possible that, we return short IO due to end of file.
          * In that case, we need to release all the pages we got hold on.
@@ -1084,74 +1082,41 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
                 mutex_unlock(&dio->inode->i_mutex);
 
         /*
-         * OK, all BIOs are submitted, so we can decrement bio_count to truly
-         * reflect the number of to-be-processed BIOs.
+         * The only time we want to leave bios in flight is when a successful
+         * partial aio read or full aio write have been setup.  In that case
+         * bio completion will call aio_complete.  The only time it's safe to
+         * call aio_complete is when we return -EIOCBQUEUED, so we key on that.
+         * This had *better* be the only place that raises -EIOCBQUEUED.
          */
-        if (dio->is_async) {
-                int should_wait = 0;
+        BUG_ON(ret == -EIOCBQUEUED);
+        if (dio->is_async && ret == 0 && dio->result &&
+            ((rw & READ) || (dio->result == dio->size)))
+                ret = -EIOCBQUEUED;
 
-                if (dio->result < dio->size && (rw & WRITE)) {
-                        dio->waiter = current;
-                        should_wait = 1;
-                }
-                if (ret == 0)
-                        ret = dio->result;
-                finished_one_bio(dio);          /* This can free the dio */
-                blk_run_address_space(inode->i_mapping);
-                if (should_wait) {
-                        unsigned long flags;
-                        /*
-                         * Wait for already issued I/O to drain out and
-                         * release its references to user-space pages
-                         * before returning to fallback on buffered I/O
-                         */
-
-                        spin_lock_irqsave(&dio->bio_lock, flags);
-                        set_current_state(TASK_UNINTERRUPTIBLE);
-                        while (dio->bio_count) {
-                                spin_unlock_irqrestore(&dio->bio_lock, flags);
-                                io_schedule();
-                                spin_lock_irqsave(&dio->bio_lock, flags);
-                                set_current_state(TASK_UNINTERRUPTIBLE);
-                        }
-                        spin_unlock_irqrestore(&dio->bio_lock, flags);
-                        set_current_state(TASK_RUNNING);
-                        kfree(dio);
-                }
-        } else {
-                ssize_t transferred = 0;
-
-                finished_one_bio(dio);
-                ret2 = dio_await_completion(dio);
-                if (ret == 0)
-                        ret = ret2;
-                if (ret == 0)
-                        ret = dio->page_errors;
-                if (dio->result) {
-                        loff_t i_size = i_size_read(inode);
-
-                        transferred = dio->result;
-                        /*
-                         * Adjust the return value if the read crossed a
-                         * non-block-aligned EOF.
-                         */
-                        if (rw == READ && (offset + transferred > i_size))
-                                transferred = i_size - offset;
-                }
-                dio_complete(dio, offset, transferred);
-                if (ret == 0)
-                        ret = transferred;
+        if (ret != -EIOCBQUEUED)
+                dio_await_completion(dio);
 
-                /* We could have also come here on an AIO file extend */
-                if (!is_sync_kiocb(iocb) && (rw & WRITE) &&
-                        ret >= 0 && dio->result == dio->size)
-                        /*
-                         * For AIO writes where we have completed the
-                         * i/o, we have to mark the the aio complete.
-                         */
-                        aio_complete(iocb, ret, 0);
+        /*
+         * Sync will always be dropping the final ref and completing the
+         * operation.  AIO can if it was a broken operation described above or
+         * in fact if all the bios race to complete before we get here.  In
+         * that case dio_complete() translates the EIOCBQUEUED into the proper
+         * return code that the caller will hand to aio_complete().
+         *
+         * This is managed by the bio_lock instead of being an atomic_t so that
+         * completion paths can drop their ref and use the remaining count to
+         * decide to wake the submission path atomically.
+         */
+        spin_lock_irqsave(&dio->bio_lock, flags);
+        ret2 = --dio->refcount;
+        spin_unlock_irqrestore(&dio->bio_lock, flags);
+        BUG_ON(!dio->is_async && ret2 != 0);
+        if (ret2 == 0) {
+                ret = dio_complete(dio, offset, ret);
                 kfree(dio);
-        }
+        } else
+                BUG_ON(ret != -EIOCBQUEUED);
+
         return ret;
 }
 
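The tail of direct_io_worker() now makes the ownership hand-off explicit: the submitter drops the reference it has held since dio->refcount was initialised to 1, and whoever observes the count reach zero, whether that is this drop for synchronous I/O or a racing bio completion for AIO, calls dio_complete() and frees the dio; -EIOCBQUEUED is raised exactly when bios are being left in flight for dio_bio_end_aio() to finish. A compressed userspace sketch of that decision, with invented helper names standing in for the kernel calls:

#include <stdio.h>

#define EIOCBQUEUED 529         /* numeric value borrowed for illustration only */

/*
 * Illustrative only: finish_submission() stands in for the locked
 * "--dio->refcount" at the end of direct_io_worker(), and the printf
 * calls stand in for dio_complete() + kfree().
 */
static long finish_submission(int is_async, int fully_submitted,
                              unsigned long *refcount, long ret)
{
        unsigned long remaining;

        /* Successfully set up AIO keys its return code on -EIOCBQUEUED. */
        if (is_async && ret == 0 && fully_submitted)
                ret = -EIOCBQUEUED;

        remaining = --(*refcount);      /* the submitter's own reference */
        if (remaining == 0) {
                printf("submitter completes the dio\n");
                /* dio_complete() would translate -EIOCBQUEUED back to a byte count */
                if (ret == -EIOCBQUEUED)
                        ret = 0;
        } else {
                printf("bios still in flight, completion will finish the dio\n");
        }
        return ret;
}

int main(void)
{
        unsigned long sync_ref = 1;     /* sync: dio_await_completion() already drained the bios */
        unsigned long aio_ref = 3;      /* aio: two bios still hold references */

        finish_submission(0, 1, &sync_ref, 0);  /* sync always drops the last ref */
        finish_submission(1, 1, &aio_ref, 0);   /* aio returns -EIOCBQUEUED here */
        return 0;
}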