author    Christoph Hellwig <hch@infradead.org>      2012-04-23 01:58:39 -0400
committer Ben Myers <bpm@sgi.com>                    2012-05-14 17:20:31 -0400
commit    43ff2122e6492bcc88b065c433453dce88223b30 (patch)
tree      0f762cfb753edd73402b8830e0927d9efba30c61 /fs/xfs/xfs_buf_item.c
parent    960c60af8b9481595e68875e79b2602e73169c29 (diff)
xfs: on-stack delayed write buffer lists
Queue delwri buffers on a local on-stack list instead of a per-buftarg one,
and write back the buffers per-process instead of by waking up xfsbufd.

This is now easily doable given that we have very few places left that write
delwri buffers:

 - log recovery:
   Only done at mount time, and already forcing out the buffers
   synchronously using xfs_flush_buftarg.

 - quotacheck:
   Same story.

 - dquot reclaim:
   Writes out dirty dquots on the LRU under memory pressure. We might want
   to look into doing more of this via xfsaild, but it's already more
   optimal than the synchronous inode reclaim that writes each buffer
   synchronously.

 - xfsaild:
   This is the main beneficiary of the change. By keeping a local list
   of buffers to write we reduce latency of writing out buffers, and more
   importantly we can remove all the delwri list promotions which were
   hitting the buffer cache hard under sustained metadata loads.

The implementation is very straightforward: xfs_buf_delwri_queue now gets a
new list_head pointer that it adds the delwri buffers to, and all callers
need to eventually submit the list using xfs_buf_delwri_submit or
xfs_buf_delwri_submit_nowait. Buffers that are already on a delwri list are
skipped in xfs_buf_delwri_queue, on the assumption that they are queued on
another delwri list.

The biggest change to pass down the buffer list was done to the AIL
pushing. Now that we operate on buffers, the trylock, push and pushbuf log
item methods are merged into a single push routine, which tries to lock the
item and, if possible, adds the buffer that needs writeback to the buffer
list. This leads to much simpler code than the previous split, but requires
the individual IOP_PUSH instances to unlock and reacquire the AIL lock
around calls to blocking routines.

Given that xfsaild now also handles writing out buffers, the conditions for
log forcing and the sleep times needed some small changes. The most
important one is that we consider the AIL busy as long as we still have
buffers to push, and the other one is that we do increment the pushed LSN
for buffers that are under flushing at this moment, but still count them
towards the stuck items for restart purposes. Without this we could hammer
on stuck items without ever forcing the log and not make progress under
heavy random delete workloads on fast flash storage devices.

[ Dave Chinner:
  - rebase on previous patches.
  - improved comments for XBF_DELWRI_Q handling
  - fix XBF_ASYNC handling in queue submission (test 106 failure)
  - rename delwri submit function buffer list parameters for clarity
  - xfs_efd_item_push() should return XFS_ITEM_PINNED ]

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
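To make the new calling convention concrete, here is a minimal caller
sketch using the API described above. xfs_buf_delwri_queue,
xfs_buf_delwri_submit and xfs_buf_delwri_submit_nowait are the functions
introduced by this series; the wrapper function itself is hypothetical and
elides buffer lookup, locking and error handling.

	/*
	 * Hypothetical caller: queue a set of dirty buffers on a local,
	 * on-stack delwri list, then write them back from this process.
	 */
	STATIC int
	xfs_example_writeback(
		struct xfs_buf		**bufs,
		int			nbufs)
	{
		LIST_HEAD(buffer_list);		/* local on-stack delwri list */
		int			i;

		for (i = 0; i < nbufs; i++) {
			/*
			 * Returns false (buffer skipped) if the buffer is
			 * already queued on another delwri list.
			 */
			xfs_buf_delwri_queue(bufs[i], &buffer_list);
		}

		/*
		 * Write back everything queued above and wait for
		 * completion; xfs_buf_delwri_submit_nowait() would skip
		 * the wait.
		 */
		return xfs_buf_delwri_submit(&buffer_list);
	}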
Diffstat (limited to 'fs/xfs/xfs_buf_item.c')
-rw-r--r--   fs/xfs/xfs_buf_item.c   96
1 file changed, 26 insertions(+), 70 deletions(-)
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 3a0bc38f1859..fb20f384b566 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -418,7 +418,6 @@ xfs_buf_item_unpin(
 	if (freed && stale) {
 		ASSERT(bip->bli_flags & XFS_BLI_STALE);
 		ASSERT(xfs_buf_islocked(bp));
-		ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
 		ASSERT(XFS_BUF_ISSTALE(bp));
 		ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
 
@@ -469,34 +468,28 @@ xfs_buf_item_unpin(
 	}
 }
 
-/*
- * This is called to attempt to lock the buffer associated with this
- * buf log item.  Don't sleep on the buffer lock.  If we can't get
- * the lock right away, return 0.  If we can get the lock, take a
- * reference to the buffer.  If this is a delayed write buffer that
- * needs AIL help to be written back, invoke the pushbuf routine
- * rather than the normal success path.
- */
 STATIC uint
-xfs_buf_item_trylock(
-	struct xfs_log_item	*lip)
+xfs_buf_item_push(
+	struct xfs_log_item	*lip,
+	struct list_head	*buffer_list)
 {
 	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
 	struct xfs_buf		*bp = bip->bli_buf;
+	uint			rval = XFS_ITEM_SUCCESS;
 
 	if (xfs_buf_ispinned(bp))
 		return XFS_ITEM_PINNED;
 	if (!xfs_buf_trylock(bp))
 		return XFS_ITEM_LOCKED;
 
-	/* take a reference to the buffer.  */
-	xfs_buf_hold(bp);
-
 	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
-	trace_xfs_buf_item_trylock(bip);
-	if (XFS_BUF_ISDELAYWRITE(bp))
-		return XFS_ITEM_PUSHBUF;
-	return XFS_ITEM_SUCCESS;
+
+	trace_xfs_buf_item_push(bip);
+
+	if (!xfs_buf_delwri_queue(bp, buffer_list))
+		rval = XFS_ITEM_FLUSHING;
+	xfs_buf_unlock(bp);
+	return rval;
 }
 
 /*
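The merged push routine above reports one of four states to its caller.
As a rough illustration only (this is not the actual xfsaild code, and the
xa_buf_list member name is an assumption about the rest of this series), an
AIL pusher might consume the return codes along these lines:

	/*
	 * Simplified sketch of how an AIL push loop could interpret the
	 * merged IOP_PUSH return codes; the real xfsaild additionally
	 * tracks target LSNs, stuck-item counts and log forcing.
	 */
	switch (IOP_PUSH(lip, &ailp->xa_buf_list)) {
	case XFS_ITEM_SUCCESS:
		/* buffer queued on our local list for later submission */
		break;
	case XFS_ITEM_PINNED:
		/* pinned in the log; a log force is needed first */
		break;
	case XFS_ITEM_LOCKED:
		/* someone else holds the buffer lock; retry later */
		break;
	case XFS_ITEM_FLUSHING:
		/* already under I/O: bump the pushed LSN, but count as stuck */
		break;
	}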
@@ -609,48 +602,6 @@ xfs_buf_item_committed(
 	return lsn;
 }
 
-/*
- * The buffer is locked, but is not a delayed write buffer.
- */
-STATIC void
-xfs_buf_item_push(
-	struct xfs_log_item	*lip)
-{
-	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
-	struct xfs_buf		*bp = bip->bli_buf;
-
-	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
-	ASSERT(!XFS_BUF_ISDELAYWRITE(bp));
-
-	trace_xfs_buf_item_push(bip);
-
-	xfs_buf_delwri_queue(bp);
-	xfs_buf_relse(bp);
-}
-
-/*
- * The buffer is locked and is a delayed write buffer.  Promote the buffer
- * in the delayed write queue as the caller knows that they must invoke
- * the xfsbufd to get this buffer written.  We have to unlock the buffer
- * to allow the xfsbufd to write it, too.
- */
-STATIC bool
-xfs_buf_item_pushbuf(
-	struct xfs_log_item	*lip)
-{
-	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
-	struct xfs_buf		*bp = bip->bli_buf;
-
-	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
-	ASSERT(XFS_BUF_ISDELAYWRITE(bp));
-
-	trace_xfs_buf_item_pushbuf(bip);
-
-	xfs_buf_delwri_promote(bp);
-	xfs_buf_relse(bp);
-	return true;
-}
-
 STATIC void
 xfs_buf_item_committing(
 	struct xfs_log_item	*lip,
@@ -666,11 +617,9 @@ static const struct xfs_item_ops xfs_buf_item_ops = {
 	.iop_format	= xfs_buf_item_format,
 	.iop_pin	= xfs_buf_item_pin,
 	.iop_unpin	= xfs_buf_item_unpin,
-	.iop_trylock	= xfs_buf_item_trylock,
 	.iop_unlock	= xfs_buf_item_unlock,
 	.iop_committed	= xfs_buf_item_committed,
 	.iop_push	= xfs_buf_item_push,
-	.iop_pushbuf	= xfs_buf_item_pushbuf,
 	.iop_committing = xfs_buf_item_committing
 };
 
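For context, the operations table shrinks because the method signatures
change in the log item ops definition (presumably xfs_trans.h, outside this
file's diff). Reconstructed from the function definitions above, the old
and new prototypes are roughly:

	/* before: three methods split the locking, queuing and promotion */
	uint	(*iop_trylock)(struct xfs_log_item *);
	void	(*iop_push)(struct xfs_log_item *);
	bool	(*iop_pushbuf)(struct xfs_log_item *);

	/* after: one method that takes the caller's on-stack buffer list */
	uint	(*iop_push)(struct xfs_log_item *, struct list_head *);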
@@ -989,20 +938,27 @@ xfs_buf_iodone_callbacks(
 	 * If the write was asynchronous then no one will be looking for the
 	 * error.  Clear the error state and write the buffer out again.
 	 *
-	 * During sync or umount we'll write all pending buffers again
-	 * synchronous, which will catch these errors if they keep hanging
-	 * around.
+	 * XXX: This helps against transient write errors, but we need to find
+	 * a way to shut the filesystem down if the writes keep failing.
+	 *
+	 * In practice we'll shut the filesystem down soon, as non-transient
+	 * errors tend to affect the whole device and a failing log write
+	 * will make us give up.  But we really ought to do better here.
 	 */
 	if (XFS_BUF_ISASYNC(bp)) {
+		ASSERT(bp->b_iodone != NULL);
+
+		trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
+
 		xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */
 
 		if (!XFS_BUF_ISSTALE(bp)) {
-			xfs_buf_delwri_queue(bp);
-			XFS_BUF_DONE(bp);
+			bp->b_flags |= XBF_WRITE | XBF_ASYNC | XBF_DONE;
+			xfs_bdstrat_cb(bp);
+		} else {
+			xfs_buf_relse(bp);
 		}
-		ASSERT(bp->b_iodone != NULL);
-		trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
-		xfs_buf_relse(bp);
+
 		return;
 	}
 
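Condensed, the retry logic above reduces to two branches: a failed async
write to a live buffer is resubmitted immediately via xfs_bdstrat_cb()
rather than re-queued on the now-removed per-buftarg delwri list, while a
stale buffer is simply released. A sketch of the new shape, with the
surrounding callback context elided:

	/* condensed view of the new async write-error handling */
	if (XFS_BUF_ISASYNC(bp)) {
		xfs_buf_ioerror(bp, 0);		/* clear the transient error */
		if (!XFS_BUF_ISSTALE(bp)) {
			bp->b_flags |= XBF_WRITE | XBF_ASYNC | XBF_DONE;
			xfs_bdstrat_cb(bp);	/* resubmit the write directly */
		} else {
			xfs_buf_relse(bp);	/* stale buffer: just release it */
		}
		return;
	}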