author	Zach Brown <zach.brown@oracle.com>	2006-12-10 05:21:05 -0500
committer	Linus Torvalds <torvalds@woody.osdl.org>	2006-12-10 12:57:21 -0500
commit	8459d86aff04fa53c2ab6a6b9f355b3063cc8014 (patch)
tree	c0584c4907f0d63a18998b7cbffdf7900609606b
parent	20258b2b397031649e4a41922fe803d57017df84 (diff)
[PATCH] dio: only call aio_complete() after returning -EIOCBQUEUED
The only time it is safe to call aio_complete() is when the ->ki_retry function returns -EIOCBQUEUED to the AIO core.  direct_io_worker() has historically done this by relying on its caller to translate positive return codes into -EIOCBQUEUED for the aio case.  It did this by trying to keep conditionals in sync.  direct_io_worker() knew when finished_one_bio() was going to call aio_complete().  It would reverse the test and wait and free the dio in the cases it thought that finished_one_bio() wasn't going to.

Not surprisingly, it ended up getting it wrong.  'ret' could be a negative errno from the submission path but it failed to communicate this to finished_one_bio().  direct_io_worker() would return < 0, its callers wouldn't raise -EIOCBQUEUED, and aio_complete() would be called.  In the future finished_one_bio()'s tests wouldn't reflect this and aio_complete() would be called for a second time, which can manifest as an oops.

The previous cleanups have whittled the sync and async completion paths down to the point where we can collapse them and clearly reassert the invariant that we must only call aio_complete() after returning -EIOCBQUEUED.  direct_io_worker() will only return -EIOCBQUEUED when it is not the last to drop the dio refcount, and the aio bio completion path will only call aio_complete() when it is the last to drop the dio refcount.  direct_io_worker() can ensure that it is the last to drop the reference count by waiting for bios to drain.  It does this for sync ops, of course, for partial dio writes that must fall back to buffered, and for aio ops that saw errors during submission.

This means that operations that end up waiting, even if they were issued as aio ops, will not call aio_complete() from dio.  Instead we return the return code of the operation and let the aio core call aio_complete().  This is purposely done to fix a bug where AIO DIO file extensions would call aio_complete() before their callers had a chance to update i_size.

Now that direct_io_worker() is explicitly returning -EIOCBQUEUED, its callers no longer have to translate for it.  XFS needs to be careful not to free resources that will be used during AIO completion if -EIOCBQUEUED is returned.

We maintain the previous behaviour of trying to write fs metadata for O_SYNC aio+dio writes.

Signed-off-by: Zach Brown <zach.brown@oracle.com>
Cc: Badari Pulavarty <pbadari@us.ibm.com>
Cc: Suparna Bhattacharya <suparna@in.ibm.com>
Acked-by: Jeff Moyer <jmoyer@redhat.com>
Cc: <xfs-masters@oss.sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
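The refcount handoff this patch asserts can be sketched in isolation.  What follows is a minimal userspace C sketch, not kernel code: struct dio is reduced to three fields, and submit_side_finish() and bio_side_finish() are hypothetical stand-ins for the tail of direct_io_worker() and for dio_bio_end_aio().  It shows the invariant only: whoever drops the last reference completes the op, and the submit side reports -EIOCBQUEUED only when it was not the last to drop.

/* Minimal userspace sketch of the refcount handoff; see caveats above. */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

#define EIOCBQUEUED 529	/* mirrors the kernel's EIOCBQUEUED value */

struct dio {
	atomic_int refcount;	/* one ref for the submitter, one per bio */
	int result;		/* bytes transferred so far */
	int size;		/* bytes requested */
};

/* Stand-in for dio_complete(): fold -EIOCBQUEUED back into a byte count. */
static int dio_complete(struct dio *dio, int ret)
{
	if (ret == -EIOCBQUEUED)
		ret = 0;
	return ret ? ret : dio->result;
}

/*
 * Stand-in for the tail of direct_io_worker().  This is the only place
 * that raises -EIOCBQUEUED, and only for a fully queued async op.
 */
static int submit_side_finish(struct dio *dio, int ret, int is_async)
{
	if (is_async && ret == 0 && dio->result == dio->size)
		ret = -EIOCBQUEUED;

	/* a sync op would wait for bios to drain here (dio_await_completion) */

	if (atomic_fetch_sub(&dio->refcount, 1) == 1) {
		ret = dio_complete(dio, ret);	/* last ref: complete inline */
		free(dio);
	}
	/* otherwise a bio holds the last ref and completion happens there */
	return ret;
}

/* Stand-in for dio_bio_end_aio(): completes the iocb only on the last ref. */
static void bio_side_finish(struct dio *dio)
{
	if (atomic_fetch_sub(&dio->refcount, 1) == 1) {
		printf("aio_complete(%d)\n", dio_complete(dio, 0));
		free(dio);
	}
}

int main(void)
{
	struct dio *dio = calloc(1, sizeof(*dio));

	atomic_init(&dio->refcount, 2);		/* submitter + one bio */
	dio->result = dio->size = 4096;		/* pretend the full write queued */

	int ret = submit_side_finish(dio, 0, 1);
	if (ret == -EIOCBQUEUED)
		bio_side_finish(dio);		/* bio completion runs later */
	else
		printf("completed inline: %d\n", ret);
	return 0;
}

If the bios race to completion before the submit side drops its reference, the submit side is the last one down: dio_complete() folds -EIOCBQUEUED back into the real byte count and the aio core, not dio, calls aio_complete().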
-rw-r--r--	fs/direct-io.c	90
-rw-r--r--	fs/xfs/linux-2.6/xfs_aops.c	2
-rw-r--r--	mm/filemap.c	9
3 files changed, 39 insertions, 62 deletions
diff --git a/fs/direct-io.c b/fs/direct-io.c
index f11f05dc9e61..71f4aeac7632 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -226,6 +226,15 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret)
 {
 	ssize_t transferred = 0;
 
+	/*
+	 * AIO submission can race with bio completion to get here while
+	 * expecting to have the last io completed by bio completion.
+	 * In that case -EIOCBQUEUED is in fact not an error we want
+	 * to preserve through this call.
+	 */
+	if (ret == -EIOCBQUEUED)
+		ret = 0;
+
 	if (dio->result) {
 		transferred = dio->result;
 
@@ -251,24 +260,6 @@ static int dio_complete(struct dio *dio, loff_t offset, int ret)
 	return ret;
 }
 
-/*
- * Called when a BIO has been processed.  If the count goes to zero then IO is
- * complete and we can signal this to the AIO layer.
- */
-static void dio_complete_aio(struct dio *dio)
-{
-	int ret;
-
-	ret = dio_complete(dio, dio->iocb->ki_pos, 0);
-
-	/* Complete AIO later if falling back to buffered i/o */
-	if (dio->result == dio->size ||
-	    ((dio->rw == READ) && dio->result)) {
-		aio_complete(dio->iocb, ret, 0);
-		kfree(dio);
-	}
-}
-
 static int dio_bio_complete(struct dio *dio, struct bio *bio);
 /*
  * Asynchronous IO callback.
@@ -290,8 +281,11 @@ static int dio_bio_end_aio(struct bio *bio, unsigned int bytes_done, int error)
 	if (remaining == 1 && waiter_holds_ref)
 		wake_up_process(dio->waiter);
 
-	if (remaining == 0)
-		dio_complete_aio(dio);
+	if (remaining == 0) {
+		int ret = dio_complete(dio, dio->iocb->ki_pos, 0);
+		aio_complete(dio->iocb, ret, 0);
+		kfree(dio);
+	}
 
 	return 0;
 }
@@ -1082,47 +1076,33 @@ direct_io_worker(int rw, struct kiocb *iocb, struct inode *inode,
 		mutex_unlock(&dio->inode->i_mutex);
 
 	/*
-	 * OK, all BIOs are submitted, so we can decrement bio_count to truly
-	 * reflect the number of to-be-processed BIOs.
+	 * The only time we want to leave bios in flight is when a successful
+	 * partial aio read or full aio write have been setup.  In that case
+	 * bio completion will call aio_complete.  The only time it's safe to
+	 * call aio_complete is when we return -EIOCBQUEUED, so we key on that.
+	 * This had *better* be the only place that raises -EIOCBQUEUED.
 	 */
-	if (dio->is_async) {
-		int should_wait = 0;
-
-		if (dio->result < dio->size && (rw & WRITE)) {
-			dio->waiter = current;
-			should_wait = 1;
-		}
-		if (ret == 0)
-			ret = dio->result;
-
-		if (should_wait)
-			dio_await_completion(dio);
-
-		/* this can free the dio */
-		if (atomic_dec_and_test(&dio->refcount))
-			dio_complete_aio(dio);
+	BUG_ON(ret == -EIOCBQUEUED);
+	if (dio->is_async && ret == 0 && dio->result &&
+	    ((rw & READ) || (dio->result == dio->size)))
+		ret = -EIOCBQUEUED;
 
-		if (should_wait)
-			kfree(dio);
-	} else {
+	if (ret != -EIOCBQUEUED)
 		dio_await_completion(dio);
 
+	/*
+	 * Sync will always be dropping the final ref and completing the
+	 * operation.  AIO can if it was a broken operation described above
+	 * or in fact if all the bios race to complete before we get here.
+	 * In that case dio_complete() translates the EIOCBQUEUED into
+	 * the proper return code that the caller will hand to aio_complete().
+	 */
+	if (atomic_dec_and_test(&dio->refcount)) {
 		ret = dio_complete(dio, offset, ret);
+		kfree(dio);
+	} else
+		BUG_ON(ret != -EIOCBQUEUED);
 
-		/* We could have also come here on an AIO file extend */
-		if (!is_sync_kiocb(iocb) && (rw & WRITE) &&
-		    ret >= 0 && dio->result == dio->size)
-			/*
-			 * For AIO writes where we have completed the
-			 * i/o, we have to mark the the aio complete.
-			 */
-			aio_complete(iocb, ret, 0);
-
-		if (atomic_dec_and_test(&dio->refcount))
-			kfree(dio);
-		else
-			BUG();
-	}
 	return ret;
 }
 
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 8e6b56fc1cad..b56eb754e2d2 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -1406,7 +1406,7 @@ xfs_vm_direct_IO(
 			xfs_end_io_direct);
 	}
 
-	if (unlikely(ret <= 0 && iocb->private))
+	if (unlikely(ret != -EIOCBQUEUED && iocb->private))
 		xfs_destroy_ioend(iocb->private);
 	return ret;
 }
diff --git a/mm/filemap.c b/mm/filemap.c
index 606432f71b3a..8332c77b1bd1 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1181,8 +1181,6 @@ generic_file_aio_read(struct kiocb *iocb, const struct iovec *iov,
 		if (pos < size) {
 			retval = generic_file_direct_IO(READ, iocb,
 						iov, pos, nr_segs);
-			if (retval > 0 && !is_sync_kiocb(iocb))
-				retval = -EIOCBQUEUED;
 			if (retval > 0)
 				*ppos = pos + retval;
 		}
@@ -2047,15 +2045,14 @@ generic_file_direct_write(struct kiocb *iocb, const struct iovec *iov,
 	 * Sync the fs metadata but not the minor inode changes and
 	 * of course not the data as we did direct DMA for the IO.
 	 * i_mutex is held, which protects generic_osync_inode() from
-	 * livelocking.
+	 * livelocking.  AIO O_DIRECT ops attempt to sync metadata here.
 	 */
-	if (written >= 0 && ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
+	if ((written >= 0 || written == -EIOCBQUEUED) &&
+	    ((file->f_flags & O_SYNC) || IS_SYNC(inode))) {
 		int err = generic_osync_inode(inode, mapping, OSYNC_METADATA);
 		if (err < 0)
 			written = err;
 	}
-	if (written == count && !is_sync_kiocb(iocb))
-		written = -EIOCBQUEUED;
 	return written;
 }
 EXPORT_SYMBOL(generic_file_direct_write);