aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorChristoph Hellwig <hch@infradead.org>2010-07-18 17:17:11 -0400
committerAlex Elder <aelder@sgi.com>2010-07-26 17:09:19 -0400
commit209fb87a259ead17e966627b7f053d16a96898da (patch)
tree2cb348a32e2c1488cfb14189e975bcbcb6a0695f /fs
parentfb511f2150174b18b28ad54708c1adda0df39b17 (diff)
xfs: simplify and speed up direct I/O completions
Our current handling of direct I/O completions is rather suboptimal, because we defer it to a workqueue more often than needed, and we perform a much too aggressive flush of the workqueue in case unwritten extent conversions happen. This patch changes the direct I/O reads to not even use a completion handler, as we don't bother to use it at all, and to perform the unwritten extent conversions in caller context for synchronous direct I/O. For a small I/O size direct I/O workload on a consumer grade SSD, such as the untar of a kernel tree inside qemu, this patch gives speedups of about 5%, getting us much closer to the speed of a native block device, or a fully allocated XFS file. Signed-off-by: Christoph Hellwig <hch@lst.de> Reviewed-by: Dave Chinner <dchinner@redhat.com> Signed-off-by: Alex Elder <aelder@sgi.com>
Diffstat (limited to 'fs')
-rw-r--r--fs/xfs/linux-2.6/xfs_aops.c158
1 files changed, 76 insertions, 82 deletions
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index 13622d5ba068..d24e78f32f3e 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c
@@ -202,23 +202,17 @@ xfs_setfilesize(
202} 202}
203 203
204/* 204/*
205 * Schedule IO completion handling on a xfsdatad if this was 205 * Schedule IO completion handling on the final put of an ioend.
206 * the final hold on this ioend. If we are asked to wait,
207 * flush the workqueue.
208 */ 206 */
209STATIC void 207STATIC void
210xfs_finish_ioend( 208xfs_finish_ioend(
211 xfs_ioend_t *ioend, 209 struct xfs_ioend *ioend)
212 int wait)
213{ 210{
214 if (atomic_dec_and_test(&ioend->io_remaining)) { 211 if (atomic_dec_and_test(&ioend->io_remaining)) {
215 struct workqueue_struct *wq; 212 if (ioend->io_type == IO_UNWRITTEN)
216 213 queue_work(xfsconvertd_workqueue, &ioend->io_work);
217 wq = (ioend->io_type == IO_UNWRITTEN) ? 214 else
218 xfsconvertd_workqueue : xfsdatad_workqueue; 215 queue_work(xfsdatad_workqueue, &ioend->io_work);
219 queue_work(wq, &ioend->io_work);
220 if (wait)
221 flush_workqueue(wq);
222 } 216 }
223} 217}
224 218
@@ -262,7 +256,7 @@ xfs_end_io(
262 */ 256 */
263 if (error == EAGAIN) { 257 if (error == EAGAIN) {
264 atomic_inc(&ioend->io_remaining); 258 atomic_inc(&ioend->io_remaining);
265 xfs_finish_ioend(ioend, 0); 259 xfs_finish_ioend(ioend);
266 /* ensure we don't spin on blocked ioends */ 260 /* ensure we don't spin on blocked ioends */
267 delay(1); 261 delay(1);
268 } else { 262 } else {
@@ -273,6 +267,17 @@ xfs_end_io(
273} 267}
274 268
275/* 269/*
270 * Call IO completion handling in caller context on the final put of an ioend.
271 */
272STATIC void
273xfs_finish_ioend_sync(
274 struct xfs_ioend *ioend)
275{
276 if (atomic_dec_and_test(&ioend->io_remaining))
277 xfs_end_io(&ioend->io_work);
278}
279
280/*
276 * Allocate and initialise an IO completion structure. 281 * Allocate and initialise an IO completion structure.
277 * We need to track unwritten extent write completion here initially. 282 * We need to track unwritten extent write completion here initially.
278 * We'll need to extend this for updating the ondisk inode size later 283 * We'll need to extend this for updating the ondisk inode size later
@@ -353,7 +358,7 @@ xfs_end_bio(
353 bio->bi_end_io = NULL; 358 bio->bi_end_io = NULL;
354 bio_put(bio); 359 bio_put(bio);
355 360
356 xfs_finish_ioend(ioend, 0); 361 xfs_finish_ioend(ioend);
357} 362}
358 363
359STATIC void 364STATIC void
@@ -495,7 +500,7 @@ xfs_submit_ioend(
495 } 500 }
496 if (bio) 501 if (bio)
497 xfs_submit_ioend_bio(wbc, ioend, bio); 502 xfs_submit_ioend_bio(wbc, ioend, bio);
498 xfs_finish_ioend(ioend, 0); 503 xfs_finish_ioend(ioend);
499 } while ((ioend = next) != NULL); 504 } while ((ioend = next) != NULL);
500} 505}
501 506
@@ -1406,70 +1411,56 @@ xfs_get_blocks_direct(
1406 return __xfs_get_blocks(inode, iblock, bh_result, create, 1); 1411 return __xfs_get_blocks(inode, iblock, bh_result, create, 1);
1407} 1412}
1408 1413
1414/*
1415 * Complete a direct I/O write request.
1416 *
1417 * If the private argument is non-NULL __xfs_get_blocks signals us that we
1418 * need to issue a transaction to convert the range from unwritten to written
1419 * extents. In case this is regular synchronous I/O we just call xfs_end_io
1420 * to do this and we are done. But in case this was a successful AIO
1421 * request this handler is called from interrupt context, from which we
1422 * can't start transactions. In that case offload the I/O completion to
1423 * the workqueues we also use for buffered I/O completion.
1424 */
1409STATIC void 1425STATIC void
1410xfs_end_io_direct( 1426xfs_end_io_direct_write(
1411 struct kiocb *iocb, 1427 struct kiocb *iocb,
1412 loff_t offset, 1428 loff_t offset,
1413 ssize_t size, 1429 ssize_t size,
1414 void *private, 1430 void *private,
1415 int ret, 1431 int ret,
1416 bool is_async) 1432 bool is_async)
1417{ 1433{
1418 xfs_ioend_t *ioend = iocb->private; 1434 struct xfs_ioend *ioend = iocb->private;
1419 bool complete_aio = is_async;
1420 1435
1421 /* 1436 /*
1422 * Non-NULL private data means we need to issue a transaction to 1437 * blockdev_direct_IO can return an error even after the I/O
1423 * convert a range from unwritten to written extents. This needs 1438 * completion handler was called. Thus we need to protect
1424 * to happen from process context but aio+dio I/O completion 1439 * against double-freeing.
1425 * happens from irq context so we need to defer it to a workqueue.
1426 * This is not necessary for synchronous direct I/O, but we do
1427 * it anyway to keep the code uniform and simpler.
1428 *
1429 * Well, if only it were that simple. Because synchronous direct I/O
1430 * requires extent conversion to occur *before* we return to userspace,
1431 * we have to wait for extent conversion to complete. Look at the
1432 * iocb that has been passed to us to determine if this is AIO or
1433 * not. If it is synchronous, tell xfs_finish_ioend() to kick the
1434 * workqueue and wait for it to complete.
1435 *
1436 * The core direct I/O code might be changed to always call the
1437 * completion handler in the future, in which case all this can
1438 * go away.
1439 */ 1440 */
1441 iocb->private = NULL;
1442
1440 ioend->io_offset = offset; 1443 ioend->io_offset = offset;
1441 ioend->io_size = size; 1444 ioend->io_size = size;
1442 if (ioend->io_type == IO_READ) { 1445 if (private && size > 0)
1443 xfs_finish_ioend(ioend, 0); 1446 ioend->io_type = IO_UNWRITTEN;
1444 } else if (private && size > 0) { 1447
1445 if (is_async) { 1448 if (is_async) {
1449 /*
1450 * If we are converting an unwritten extent we need to delay
1451 * the AIO completion until after the unwritten extent
1452 * conversion has completed, otherwise do it ASAP.
1453 */
1454 if (ioend->io_type == IO_UNWRITTEN) {
1446 ioend->io_iocb = iocb; 1455 ioend->io_iocb = iocb;
1447 ioend->io_result = ret; 1456 ioend->io_result = ret;
1448 complete_aio = false;
1449 xfs_finish_ioend(ioend, 0);
1450 } else { 1457 } else {
1451 xfs_finish_ioend(ioend, 1); 1458 aio_complete(iocb, ret, 0);
1452 } 1459 }
1460 xfs_finish_ioend(ioend);
1453 } else { 1461 } else {
1454 /* 1462 xfs_finish_ioend_sync(ioend);
1455 * A direct I/O write ioend starts it's life in unwritten
1456 * state in case they map an unwritten extent. This write
1457 * didn't map an unwritten extent so switch it's completion
1458 * handler.
1459 */
1460 ioend->io_type = IO_NEW;
1461 xfs_finish_ioend(ioend, 0);
1462 } 1463 }
1463
1464 /*
1465 * blockdev_direct_IO can return an error even after the I/O
1466 * completion handler was called. Thus we need to protect
1467 * against double-freeing.
1468 */
1469 iocb->private = NULL;
1470
1471 if (complete_aio)
1472 aio_complete(iocb, ret, 0);
1473} 1464}
1474 1465
1475STATIC ssize_t 1466STATIC ssize_t
@@ -1480,23 +1471,26 @@ xfs_vm_direct_IO(
1480 loff_t offset, 1471 loff_t offset,
1481 unsigned long nr_segs) 1472 unsigned long nr_segs)
1482{ 1473{
1483 struct file *file = iocb->ki_filp; 1474 struct inode *inode = iocb->ki_filp->f_mapping->host;
1484 struct inode *inode = file->f_mapping->host; 1475 struct block_device *bdev = xfs_find_bdev_for_inode(inode);
1485 struct block_device *bdev; 1476 ssize_t ret;
1486 ssize_t ret; 1477
1487 1478 if (rw & WRITE) {
1488 bdev = xfs_find_bdev_for_inode(inode); 1479 iocb->private = xfs_alloc_ioend(inode, IO_NEW);
1489 1480
1490 iocb->private = xfs_alloc_ioend(inode, rw == WRITE ? 1481 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov,
1491 IO_UNWRITTEN : IO_READ); 1482 offset, nr_segs,
1492 1483 xfs_get_blocks_direct,
1493 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov, 1484 xfs_end_io_direct_write);
1494 offset, nr_segs, 1485 if (ret != -EIOCBQUEUED && iocb->private)
1495 xfs_get_blocks_direct, 1486 xfs_destroy_ioend(iocb->private);
1496 xfs_end_io_direct); 1487 } else {
1488 ret = blockdev_direct_IO_no_locking(rw, iocb, inode, bdev, iov,
1489 offset, nr_segs,
1490 xfs_get_blocks_direct,
1491 NULL);
1492 }
1497 1493
1498 if (unlikely(ret != -EIOCBQUEUED && iocb->private))
1499 xfs_destroy_ioend(iocb->private);
1500 return ret; 1494 return ret;
1501} 1495}
1502 1496