diff options
author | Christoph Hellwig <hch@infradead.org> | 2010-07-18 17:17:11 -0400 |
---|---|---|
committer | Alex Elder <aelder@sgi.com> | 2010-07-26 17:09:19 -0400 |
commit | 209fb87a259ead17e966627b7f053d16a96898da (patch) | |
tree | 2cb348a32e2c1488cfb14189e975bcbcb6a0695f /fs | |
parent | fb511f2150174b18b28ad54708c1adda0df39b17 (diff) |
xfs: simplify and speed up direct I/O completions
Our current handling of direct I/O completions is rather suboptimal,
because we defer it to a workqueue more often than needed, and we
perform a much too aggressive flush of the workqueue in case unwritten
extent conversions happen.
This patch changes the direct I/O reads to not even use a completion
handler, as we don't bother to use it at all, and to perform the unwritten
extent conversions in caller context for synchronous direct I/O.
For a small I/O size direct I/O workload on a consumer grade SSD, such as
the untar of a kernel tree inside qemu, this patch gives speedups of
about 5%, getting us much closer to the speed of a native block device
or a fully allocated XFS file.
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Signed-off-by: Alex Elder <aelder@sgi.com>
Diffstat (limited to 'fs')
-rw-r--r-- | fs/xfs/linux-2.6/xfs_aops.c | 158 |
1 files changed, 76 insertions, 82 deletions
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index 13622d5ba068..d24e78f32f3e 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c | |||
@@ -202,23 +202,17 @@ xfs_setfilesize( | |||
202 | } | 202 | } |
203 | 203 | ||
204 | /* | 204 | /* |
205 | * Schedule IO completion handling on a xfsdatad if this was | 205 | * Schedule IO completion handling on the final put of an ioend. |
206 | * the final hold on this ioend. If we are asked to wait, | ||
207 | * flush the workqueue. | ||
208 | */ | 206 | */ |
209 | STATIC void | 207 | STATIC void |
210 | xfs_finish_ioend( | 208 | xfs_finish_ioend( |
211 | xfs_ioend_t *ioend, | 209 | struct xfs_ioend *ioend) |
212 | int wait) | ||
213 | { | 210 | { |
214 | if (atomic_dec_and_test(&ioend->io_remaining)) { | 211 | if (atomic_dec_and_test(&ioend->io_remaining)) { |
215 | struct workqueue_struct *wq; | 212 | if (ioend->io_type == IO_UNWRITTEN) |
216 | 213 | queue_work(xfsconvertd_workqueue, &ioend->io_work); | |
217 | wq = (ioend->io_type == IO_UNWRITTEN) ? | 214 | else |
218 | xfsconvertd_workqueue : xfsdatad_workqueue; | 215 | queue_work(xfsdatad_workqueue, &ioend->io_work); |
219 | queue_work(wq, &ioend->io_work); | ||
220 | if (wait) | ||
221 | flush_workqueue(wq); | ||
222 | } | 216 | } |
223 | } | 217 | } |
224 | 218 | ||
@@ -262,7 +256,7 @@ xfs_end_io( | |||
262 | */ | 256 | */ |
263 | if (error == EAGAIN) { | 257 | if (error == EAGAIN) { |
264 | atomic_inc(&ioend->io_remaining); | 258 | atomic_inc(&ioend->io_remaining); |
265 | xfs_finish_ioend(ioend, 0); | 259 | xfs_finish_ioend(ioend); |
266 | /* ensure we don't spin on blocked ioends */ | 260 | /* ensure we don't spin on blocked ioends */ |
267 | delay(1); | 261 | delay(1); |
268 | } else { | 262 | } else { |
@@ -273,6 +267,17 @@ xfs_end_io( | |||
273 | } | 267 | } |
274 | 268 | ||
275 | /* | 269 | /* |
270 | * Call IO completion handling in caller context on the final put of an ioend. | ||
271 | */ | ||
272 | STATIC void | ||
273 | xfs_finish_ioend_sync( | ||
274 | struct xfs_ioend *ioend) | ||
275 | { | ||
276 | if (atomic_dec_and_test(&ioend->io_remaining)) | ||
277 | xfs_end_io(&ioend->io_work); | ||
278 | } | ||
279 | |||
280 | /* | ||
276 | * Allocate and initialise an IO completion structure. | 281 | * Allocate and initialise an IO completion structure. |
277 | * We need to track unwritten extent write completion here initially. | 282 | * We need to track unwritten extent write completion here initially. |
278 | * We'll need to extend this for updating the ondisk inode size later | 283 | * We'll need to extend this for updating the ondisk inode size later |
@@ -353,7 +358,7 @@ xfs_end_bio( | |||
353 | bio->bi_end_io = NULL; | 358 | bio->bi_end_io = NULL; |
354 | bio_put(bio); | 359 | bio_put(bio); |
355 | 360 | ||
356 | xfs_finish_ioend(ioend, 0); | 361 | xfs_finish_ioend(ioend); |
357 | } | 362 | } |
358 | 363 | ||
359 | STATIC void | 364 | STATIC void |
@@ -495,7 +500,7 @@ xfs_submit_ioend( | |||
495 | } | 500 | } |
496 | if (bio) | 501 | if (bio) |
497 | xfs_submit_ioend_bio(wbc, ioend, bio); | 502 | xfs_submit_ioend_bio(wbc, ioend, bio); |
498 | xfs_finish_ioend(ioend, 0); | 503 | xfs_finish_ioend(ioend); |
499 | } while ((ioend = next) != NULL); | 504 | } while ((ioend = next) != NULL); |
500 | } | 505 | } |
501 | 506 | ||
@@ -1406,70 +1411,56 @@ xfs_get_blocks_direct( | |||
1406 | return __xfs_get_blocks(inode, iblock, bh_result, create, 1); | 1411 | return __xfs_get_blocks(inode, iblock, bh_result, create, 1); |
1407 | } | 1412 | } |
1408 | 1413 | ||
1414 | /* | ||
1415 | * Complete a direct I/O write request. | ||
1416 | * | ||
1417 | * If the private argument is non-NULL __xfs_get_blocks signals us that we | ||
1418 | * need to issue a transaction to convert the range from unwritten to written | ||
1419 | * extents. In case this is regular synchronous I/O we just call xfs_end_io | ||
1420 | * to do this and we are done. But in case this was a successfull AIO | ||
1421 | * request this handler is called from interrupt context, from which we | ||
1422 | * can't start transactions. In that case offload the I/O completion to | ||
1423 | * the workqueues we also use for buffered I/O completion. | ||
1424 | */ | ||
1409 | STATIC void | 1425 | STATIC void |
1410 | xfs_end_io_direct( | 1426 | xfs_end_io_direct_write( |
1411 | struct kiocb *iocb, | 1427 | struct kiocb *iocb, |
1412 | loff_t offset, | 1428 | loff_t offset, |
1413 | ssize_t size, | 1429 | ssize_t size, |
1414 | void *private, | 1430 | void *private, |
1415 | int ret, | 1431 | int ret, |
1416 | bool is_async) | 1432 | bool is_async) |
1417 | { | 1433 | { |
1418 | xfs_ioend_t *ioend = iocb->private; | 1434 | struct xfs_ioend *ioend = iocb->private; |
1419 | bool complete_aio = is_async; | ||
1420 | 1435 | ||
1421 | /* | 1436 | /* |
1422 | * Non-NULL private data means we need to issue a transaction to | 1437 | * blockdev_direct_IO can return an error even after the I/O |
1423 | * convert a range from unwritten to written extents. This needs | 1438 | * completion handler was called. Thus we need to protect |
1424 | * to happen from process context but aio+dio I/O completion | 1439 | * against double-freeing. |
1425 | * happens from irq context so we need to defer it to a workqueue. | ||
1426 | * This is not necessary for synchronous direct I/O, but we do | ||
1427 | * it anyway to keep the code uniform and simpler. | ||
1428 | * | ||
1429 | * Well, if only it were that simple. Because synchronous direct I/O | ||
1430 | * requires extent conversion to occur *before* we return to userspace, | ||
1431 | * we have to wait for extent conversion to complete. Look at the | ||
1432 | * iocb that has been passed to us to determine if this is AIO or | ||
1433 | * not. If it is synchronous, tell xfs_finish_ioend() to kick the | ||
1434 | * workqueue and wait for it to complete. | ||
1435 | * | ||
1436 | * The core direct I/O code might be changed to always call the | ||
1437 | * completion handler in the future, in which case all this can | ||
1438 | * go away. | ||
1439 | */ | 1440 | */ |
1441 | iocb->private = NULL; | ||
1442 | |||
1440 | ioend->io_offset = offset; | 1443 | ioend->io_offset = offset; |
1441 | ioend->io_size = size; | 1444 | ioend->io_size = size; |
1442 | if (ioend->io_type == IO_READ) { | 1445 | if (private && size > 0) |
1443 | xfs_finish_ioend(ioend, 0); | 1446 | ioend->io_type = IO_UNWRITTEN; |
1444 | } else if (private && size > 0) { | 1447 | |
1445 | if (is_async) { | 1448 | if (is_async) { |
1449 | /* | ||
1450 | * If we are converting an unwritten extent we need to delay | ||
1451 | * the AIO completion until after the unwrittent extent | ||
1452 | * conversion has completed, otherwise do it ASAP. | ||
1453 | */ | ||
1454 | if (ioend->io_type == IO_UNWRITTEN) { | ||
1446 | ioend->io_iocb = iocb; | 1455 | ioend->io_iocb = iocb; |
1447 | ioend->io_result = ret; | 1456 | ioend->io_result = ret; |
1448 | complete_aio = false; | ||
1449 | xfs_finish_ioend(ioend, 0); | ||
1450 | } else { | 1457 | } else { |
1451 | xfs_finish_ioend(ioend, 1); | 1458 | aio_complete(iocb, ret, 0); |
1452 | } | 1459 | } |
1460 | xfs_finish_ioend(ioend); | ||
1453 | } else { | 1461 | } else { |
1454 | /* | 1462 | xfs_finish_ioend_sync(ioend); |
1455 | * A direct I/O write ioend starts it's life in unwritten | ||
1456 | * state in case they map an unwritten extent. This write | ||
1457 | * didn't map an unwritten extent so switch it's completion | ||
1458 | * handler. | ||
1459 | */ | ||
1460 | ioend->io_type = IO_NEW; | ||
1461 | xfs_finish_ioend(ioend, 0); | ||
1462 | } | 1463 | } |
1463 | |||
1464 | /* | ||
1465 | * blockdev_direct_IO can return an error even after the I/O | ||
1466 | * completion handler was called. Thus we need to protect | ||
1467 | * against double-freeing. | ||
1468 | */ | ||
1469 | iocb->private = NULL; | ||
1470 | |||
1471 | if (complete_aio) | ||
1472 | aio_complete(iocb, ret, 0); | ||
1473 | } | 1464 | } |
1474 | 1465 | ||
1475 | STATIC ssize_t | 1466 | STATIC ssize_t |
@@ -1480,23 +1471,26 @@ xfs_vm_direct_IO( | |||
1480 | loff_t offset, | 1471 | loff_t offset, |
1481 | unsigned long nr_segs) | 1472 | unsigned long nr_segs) |
1482 | { | 1473 | { |
1483 | struct file *file = iocb->ki_filp; | 1474 | struct inode *inode = iocb->ki_filp->f_mapping->host; |
1484 | struct inode *inode = file->f_mapping->host; | 1475 | struct block_device *bdev = xfs_find_bdev_for_inode(inode); |
1485 | struct block_device *bdev; | 1476 | ssize_t ret; |
1486 | ssize_t ret; | 1477 | |
1487 | 1478 | if (rw & WRITE) { | |
1488 | bdev = xfs_find_bdev_for_inode(inode); | 1479 | iocb->private = xfs_alloc_ioend(inode, IO_NEW); |
1489 | 1480 | ||
1490 | iocb->private = xfs_alloc_ioend(inode, rw == WRITE ? | 1481 | ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov, |
1491 | IO_UNWRITTEN : IO_READ); | 1482 | offset, nr_segs, |
1492 | 1483 | xfs_get_blocks_direct, | |
1493 | ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov, | 1484 | xfs_end_io_direct_write); |
1494 | offset, nr_segs, | 1485 | if (ret != -EIOCBQUEUED && iocb->private) |
1495 | xfs_get_blocks_direct, | 1486 | xfs_destroy_ioend(iocb->private); |
1496 | xfs_end_io_direct); | 1487 | } else { |
1488 | ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov, | ||
1489 | offset, nr_segs, | ||
1490 | xfs_get_blocks_direct, | ||
1491 | NULL); | ||
1492 | } | ||
1497 | 1493 | ||
1498 | if (unlikely(ret != -EIOCBQUEUED && iocb->private)) | ||
1499 | xfs_destroy_ioend(iocb->private); | ||
1500 | return ret; | 1494 | return ret; |
1501 | } | 1495 | } |
1502 | 1496 | ||