author		Christoph Hellwig <hch@infradead.org>	2012-04-23 01:58:39 -0400
committer	Ben Myers <bpm@sgi.com>			2012-05-14 17:20:31 -0400
commit		43ff2122e6492bcc88b065c433453dce88223b30
tree		0f762cfb753edd73402b8830e0927d9efba30c61
parent		960c60af8b9481595e68875e79b2602e73169c29
xfs: on-stack delayed write buffer lists
Queue delwri buffers on a local on-stack list instead of a per-buftarg one,
and write back the buffers per-process instead of by waking up xfsbufd.

This is now easily doable given that we have very few places left that write
delwri buffers:

 - log recovery: only done at mount time, and already forcing out the
   buffers synchronously using xfs_flush_buftarg

 - quotacheck: same story

 - dquot reclaim: writes out dirty dquots on the LRU under memory pressure.
   We might want to look into doing more of this via xfsaild, but it's
   already more optimal than the synchronous inode reclaim that writes each
   buffer synchronously.

 - xfsaild: this is the main beneficiary of the change. By keeping a local
   list of buffers to write we reduce the latency of writing out buffers,
   and more importantly we can remove all the delwri list promotions which
   were hitting the buffer cache hard under sustained metadata loads.

The implementation is very straightforward: xfs_buf_delwri_queue now gets a
new list_head pointer that it adds the delwri buffers to, and all callers
need to eventually submit the list using xfs_buf_delwri_submit or
xfs_buf_delwri_submit_nowait. Buffers that are already on a delwri list are
skipped in xfs_buf_delwri_queue, on the assumption that they remain queued
on that other list.

The biggest change needed to pass down the buffer list was to the AIL
pushing code. Now that we operate on buffers, the trylock, push and pushbuf
log item methods are merged into a single push routine, which tries to lock
the item and, if possible, adds the buffer that needs writeback to the
buffer list. This leads to much simpler code than the previous split, but
requires the individual IOP_PUSH instances to unlock and reacquire the AIL
lock around calls to blocking routines.

Given that the xfsailds now also handle writing out buffers, the conditions
for log forcing and the sleep times needed some small changes. The most
important one is that we consider an AIL busy as long as we still have
buffers to push, and the other one is that we do increment the pushed LSN
for buffers that are currently being flushed, but still count them towards
the stuck items for restart purposes. Without this we could hammer on stuck
items without ever forcing the log and make no progress under heavy random
delete workloads on fast flash storage devices.

[ Dave Chinner:
  - rebase on previous patches
  - improved comments for XBF_DELWRI_Q handling
  - fix XBF_ASYNC handling in queue submission (test 106 failure)
  - rename delwri submit function buffer list parameters for clarity
  - xfs_efd_item_push() should return XFS_ITEM_PINNED ]

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Mark Tinguely <tinguely@sgi.com>
Signed-off-by: Ben Myers <bpm@sgi.com>
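A minimal sketch of the new calling convention described above, using only the
interfaces this patch introduces (xfs_buf_delwri_queue, xfs_buf_delwri_submit);
the function name and the bps/nbufs parameters are hypothetical and only serve
to illustrate the pattern:

STATIC int
example_delwri_writeback(
	struct xfs_buf		**bps,		/* hypothetical set of dirty buffers */
	int			nbufs)
{
	LIST_HEAD		(buffer_list);	/* local, on-stack delwri list */
	int			i;

	for (i = 0; i < nbufs; i++) {
		struct xfs_buf	*bp = bps[i];

		xfs_buf_lock(bp);
		/*
		 * Takes an extra hold on the buffer and sets _XBF_DELWRI_Q;
		 * returns false if the buffer already was on a delwri list.
		 */
		xfs_buf_delwri_queue(bp, &buffer_list);
		xfs_buf_relse(bp);	/* the list's reference keeps bp alive */
	}

	/* Write out the whole list and wait for I/O completion. */
	return xfs_buf_delwri_submit(&buffer_list);
}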
-rw-r--r--	fs/xfs/xfs_buf.c	341
-rw-r--r--	fs/xfs/xfs_buf.h	28
-rw-r--r--	fs/xfs/xfs_buf_item.c	96
-rw-r--r--	fs/xfs/xfs_dquot.c	33
-rw-r--r--	fs/xfs/xfs_dquot.h	1
-rw-r--r--	fs/xfs/xfs_dquot_item.c	161
-rw-r--r--	fs/xfs/xfs_extfree_item.c	55
-rw-r--r--	fs/xfs/xfs_inode.c	25
-rw-r--r--	fs/xfs/xfs_inode.h	1
-rw-r--r--	fs/xfs/xfs_inode_item.c	152
-rw-r--r--	fs/xfs/xfs_log_recover.c	46
-rw-r--r--	fs/xfs/xfs_qm.c	148
-rw-r--r--	fs/xfs/xfs_super.c	16
-rw-r--r--	fs/xfs/xfs_sync.c	18
-rw-r--r--	fs/xfs/xfs_trace.h	7
-rw-r--r--	fs/xfs/xfs_trans.h	18
-rw-r--r--	fs/xfs/xfs_trans_ail.c	129
-rw-r--r--	fs/xfs/xfs_trans_buf.c	84
-rw-r--r--	fs/xfs/xfs_trans_priv.h	1
19 files changed, 442 insertions, 918 deletions
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 6819b5163e33..b82fc5c67fed 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -42,7 +42,6 @@
42#include "xfs_trace.h" 42#include "xfs_trace.h"
43 43
44static kmem_zone_t *xfs_buf_zone; 44static kmem_zone_t *xfs_buf_zone;
45STATIC int xfsbufd(void *);
46 45
47static struct workqueue_struct *xfslogd_workqueue; 46static struct workqueue_struct *xfslogd_workqueue;
48 47
@@ -144,8 +143,17 @@ void
144xfs_buf_stale( 143xfs_buf_stale(
145 struct xfs_buf *bp) 144 struct xfs_buf *bp)
146{ 145{
146 ASSERT(xfs_buf_islocked(bp));
147
147 bp->b_flags |= XBF_STALE; 148 bp->b_flags |= XBF_STALE;
148 xfs_buf_delwri_dequeue(bp); 149
150 /*
151 * Clear the delwri status so that a delwri queue walker will not
152 * flush this buffer to disk now that it is stale. The delwri queue has
153 * a reference to the buffer, so this is safe to do.
154 */
155 bp->b_flags &= ~_XBF_DELWRI_Q;
156
149 atomic_set(&(bp)->b_lru_ref, 0); 157 atomic_set(&(bp)->b_lru_ref, 0);
150 if (!list_empty(&bp->b_lru)) { 158 if (!list_empty(&bp->b_lru)) {
151 struct xfs_buftarg *btp = bp->b_target; 159 struct xfs_buftarg *btp = bp->b_target;
@@ -592,10 +600,10 @@ _xfs_buf_read(
592{ 600{
593 int status; 601 int status;
594 602
595 ASSERT(!(flags & (XBF_DELWRI|XBF_WRITE))); 603 ASSERT(!(flags & XBF_WRITE));
596 ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL); 604 ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
597 605
598 bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_DELWRI | XBF_READ_AHEAD); 606 bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD);
599 bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); 607 bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
600 608
601 status = xfs_buf_iorequest(bp); 609 status = xfs_buf_iorequest(bp);
@@ -855,7 +863,7 @@ xfs_buf_rele(
855 spin_unlock(&pag->pag_buf_lock); 863 spin_unlock(&pag->pag_buf_lock);
856 } else { 864 } else {
857 xfs_buf_lru_del(bp); 865 xfs_buf_lru_del(bp);
858 ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q))); 866 ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
859 rb_erase(&bp->b_rbnode, &pag->pag_buf_tree); 867 rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
860 spin_unlock(&pag->pag_buf_lock); 868 spin_unlock(&pag->pag_buf_lock);
861 xfs_perag_put(pag); 869 xfs_perag_put(pag);
@@ -915,13 +923,6 @@ xfs_buf_lock(
915 trace_xfs_buf_lock_done(bp, _RET_IP_); 923 trace_xfs_buf_lock_done(bp, _RET_IP_);
916} 924}
917 925
918/*
919 * Releases the lock on the buffer object.
920 * If the buffer is marked delwri but is not queued, do so before we
921 * unlock the buffer as we need to set flags correctly. We also need to
922 * take a reference for the delwri queue because the unlocker is going to
923 * drop their's and they don't know we just queued it.
924 */
925void 926void
926xfs_buf_unlock( 927xfs_buf_unlock(
927 struct xfs_buf *bp) 928 struct xfs_buf *bp)
@@ -1019,10 +1020,11 @@ xfs_bwrite(
1019{ 1020{
1020 int error; 1021 int error;
1021 1022
1023 ASSERT(xfs_buf_islocked(bp));
1024
1022 bp->b_flags |= XBF_WRITE; 1025 bp->b_flags |= XBF_WRITE;
1023 bp->b_flags &= ~(XBF_ASYNC | XBF_READ); 1026 bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q);
1024 1027
1025 xfs_buf_delwri_dequeue(bp);
1026 xfs_bdstrat_cb(bp); 1028 xfs_bdstrat_cb(bp);
1027 1029
1028 error = xfs_buf_iowait(bp); 1030 error = xfs_buf_iowait(bp);
@@ -1254,7 +1256,7 @@ xfs_buf_iorequest(
1254{ 1256{
1255 trace_xfs_buf_iorequest(bp, _RET_IP_); 1257 trace_xfs_buf_iorequest(bp, _RET_IP_);
1256 1258
1257 ASSERT(!(bp->b_flags & XBF_DELWRI)); 1259 ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
1258 1260
1259 if (bp->b_flags & XBF_WRITE) 1261 if (bp->b_flags & XBF_WRITE)
1260 xfs_buf_wait_unpin(bp); 1262 xfs_buf_wait_unpin(bp);
@@ -1435,11 +1437,9 @@ xfs_free_buftarg(
1435{ 1437{
1436 unregister_shrinker(&btp->bt_shrinker); 1438 unregister_shrinker(&btp->bt_shrinker);
1437 1439
1438 xfs_flush_buftarg(btp, 1);
1439 if (mp->m_flags & XFS_MOUNT_BARRIER) 1440 if (mp->m_flags & XFS_MOUNT_BARRIER)
1440 xfs_blkdev_issue_flush(btp); 1441 xfs_blkdev_issue_flush(btp);
1441 1442
1442 kthread_stop(btp->bt_task);
1443 kmem_free(btp); 1443 kmem_free(btp);
1444} 1444}
1445 1445
@@ -1491,20 +1491,6 @@ xfs_setsize_buftarg(
1491 return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1); 1491 return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
1492} 1492}
1493 1493
1494STATIC int
1495xfs_alloc_delwri_queue(
1496 xfs_buftarg_t *btp,
1497 const char *fsname)
1498{
1499 INIT_LIST_HEAD(&btp->bt_delwri_queue);
1500 spin_lock_init(&btp->bt_delwri_lock);
1501 btp->bt_flags = 0;
1502 btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
1503 if (IS_ERR(btp->bt_task))
1504 return PTR_ERR(btp->bt_task);
1505 return 0;
1506}
1507
1508xfs_buftarg_t * 1494xfs_buftarg_t *
1509xfs_alloc_buftarg( 1495xfs_alloc_buftarg(
1510 struct xfs_mount *mp, 1496 struct xfs_mount *mp,
@@ -1527,8 +1513,6 @@ xfs_alloc_buftarg(
1527 spin_lock_init(&btp->bt_lru_lock); 1513 spin_lock_init(&btp->bt_lru_lock);
1528 if (xfs_setsize_buftarg_early(btp, bdev)) 1514 if (xfs_setsize_buftarg_early(btp, bdev))
1529 goto error; 1515 goto error;
1530 if (xfs_alloc_delwri_queue(btp, fsname))
1531 goto error;
1532 btp->bt_shrinker.shrink = xfs_buftarg_shrink; 1516 btp->bt_shrinker.shrink = xfs_buftarg_shrink;
1533 btp->bt_shrinker.seeks = DEFAULT_SEEKS; 1517 btp->bt_shrinker.seeks = DEFAULT_SEEKS;
1534 register_shrinker(&btp->bt_shrinker); 1518 register_shrinker(&btp->bt_shrinker);
@@ -1539,125 +1523,52 @@ error:
1539 return NULL; 1523 return NULL;
1540} 1524}
1541 1525
1542
1543/* 1526/*
1544 * Delayed write buffer handling 1527 * Add a buffer to the delayed write list.
1528 *
1529 * This queues a buffer for writeout if it hasn't already been. Note that
1530 * neither this routine nor the buffer list submission functions perform
1531 * any internal synchronization. It is expected that the lists are thread-local
1532 * to the callers.
1533 *
1534 * Returns true if we queued up the buffer, or false if it already had
1535 * been on the buffer list.
1545 */ 1536 */
1546void 1537bool
1547xfs_buf_delwri_queue( 1538xfs_buf_delwri_queue(
1548 xfs_buf_t *bp) 1539 struct xfs_buf *bp,
1540 struct list_head *list)
1549{ 1541{
1550 struct xfs_buftarg *btp = bp->b_target; 1542 ASSERT(xfs_buf_islocked(bp));
1551
1552 trace_xfs_buf_delwri_queue(bp, _RET_IP_);
1553
1554 ASSERT(!(bp->b_flags & XBF_READ)); 1543 ASSERT(!(bp->b_flags & XBF_READ));
1555 1544
1556 spin_lock(&btp->bt_delwri_lock); 1545 /*
1557 if (!list_empty(&bp->b_list)) { 1546 * If the buffer is already marked delwri it already is queued up
1558 /* if already in the queue, move it to the tail */ 1547 * by someone else for imediate writeout. Just ignore it in that
1559 ASSERT(bp->b_flags & _XBF_DELWRI_Q); 1548 * case.
1560 list_move_tail(&bp->b_list, &btp->bt_delwri_queue); 1549 */
1561 } else { 1550 if (bp->b_flags & _XBF_DELWRI_Q) {
1562 /* start xfsbufd as it is about to have something to do */ 1551 trace_xfs_buf_delwri_queued(bp, _RET_IP_);
1563 if (list_empty(&btp->bt_delwri_queue)) 1552 return false;
1564 wake_up_process(bp->b_target->bt_task);
1565
1566 atomic_inc(&bp->b_hold);
1567 bp->b_flags |= XBF_DELWRI | _XBF_DELWRI_Q | XBF_ASYNC;
1568 list_add_tail(&bp->b_list, &btp->bt_delwri_queue);
1569 }
1570 bp->b_queuetime = jiffies;
1571 spin_unlock(&btp->bt_delwri_lock);
1572}
1573
1574void
1575xfs_buf_delwri_dequeue(
1576 xfs_buf_t *bp)
1577{
1578 int dequeued = 0;
1579
1580 spin_lock(&bp->b_target->bt_delwri_lock);
1581 if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) {
1582 ASSERT(bp->b_flags & _XBF_DELWRI_Q);
1583 list_del_init(&bp->b_list);
1584 dequeued = 1;
1585 } 1553 }
1586 bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q);
1587 spin_unlock(&bp->b_target->bt_delwri_lock);
1588
1589 if (dequeued)
1590 xfs_buf_rele(bp);
1591
1592 trace_xfs_buf_delwri_dequeue(bp, _RET_IP_);
1593}
1594 1554
1595/* 1555 trace_xfs_buf_delwri_queue(bp, _RET_IP_);
1596 * If a delwri buffer needs to be pushed before it has aged out, then promote
1597 * it to the head of the delwri queue so that it will be flushed on the next
1598 * xfsbufd run. We do this by resetting the queuetime of the buffer to be older
1599 * than the age currently needed to flush the buffer. Hence the next time the
1600 * xfsbufd sees it is guaranteed to be considered old enough to flush.
1601 */
1602void
1603xfs_buf_delwri_promote(
1604 struct xfs_buf *bp)
1605{
1606 struct xfs_buftarg *btp = bp->b_target;
1607 long age = xfs_buf_age_centisecs * msecs_to_jiffies(10) + 1;
1608
1609 ASSERT(bp->b_flags & XBF_DELWRI);
1610 ASSERT(bp->b_flags & _XBF_DELWRI_Q);
1611 1556
1612 /* 1557 /*
1613 * Check the buffer age before locking the delayed write queue as we 1558 * If a buffer gets written out synchronously or marked stale while it
1614 * don't need to promote buffers that are already past the flush age. 1559 * is on a delwri list we lazily remove it. To do this, the other party
1560 * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone.
1561 * It remains referenced and on the list. In a rare corner case it
1562 * might get readded to a delwri list after the synchronous writeout, in
1563 * which case we need just need to re-add the flag here.
1615 */ 1564 */
1616 if (bp->b_queuetime < jiffies - age) 1565 bp->b_flags |= _XBF_DELWRI_Q;
1617 return; 1566 if (list_empty(&bp->b_list)) {
1618 bp->b_queuetime = jiffies - age; 1567 atomic_inc(&bp->b_hold);
1619 spin_lock(&btp->bt_delwri_lock); 1568 list_add_tail(&bp->b_list, list);
1620 list_move(&bp->b_list, &btp->bt_delwri_queue);
1621 spin_unlock(&btp->bt_delwri_lock);
1622}
1623
1624/*
1625 * Move as many buffers as specified to the supplied list
1626 * idicating if we skipped any buffers to prevent deadlocks.
1627 */
1628STATIC int
1629xfs_buf_delwri_split(
1630 xfs_buftarg_t *target,
1631 struct list_head *list,
1632 unsigned long age)
1633{
1634 xfs_buf_t *bp, *n;
1635 int skipped = 0;
1636 int force;
1637
1638 force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags);
1639 INIT_LIST_HEAD(list);
1640 spin_lock(&target->bt_delwri_lock);
1641 list_for_each_entry_safe(bp, n, &target->bt_delwri_queue, b_list) {
1642 ASSERT(bp->b_flags & XBF_DELWRI);
1643
1644 if (!xfs_buf_ispinned(bp) && xfs_buf_trylock(bp)) {
1645 if (!force &&
1646 time_before(jiffies, bp->b_queuetime + age)) {
1647 xfs_buf_unlock(bp);
1648 break;
1649 }
1650
1651 bp->b_flags &= ~(XBF_DELWRI | _XBF_DELWRI_Q);
1652 bp->b_flags |= XBF_WRITE;
1653 list_move_tail(&bp->b_list, list);
1654 trace_xfs_buf_delwri_split(bp, _RET_IP_);
1655 } else
1656 skipped++;
1657 } 1569 }
1658 1570
1659 spin_unlock(&target->bt_delwri_lock); 1571 return true;
1660 return skipped;
1661} 1572}
1662 1573
1663/* 1574/*
@@ -1683,99 +1594,109 @@ xfs_buf_cmp(
1683 return 0; 1594 return 0;
1684} 1595}
1685 1596
1686STATIC int 1597static int
1687xfsbufd( 1598__xfs_buf_delwri_submit(
1688 void *data) 1599 struct list_head *buffer_list,
1600 struct list_head *io_list,
1601 bool wait)
1689{ 1602{
1690 xfs_buftarg_t *target = (xfs_buftarg_t *)data; 1603 struct blk_plug plug;
1691 1604 struct xfs_buf *bp, *n;
1692 current->flags |= PF_MEMALLOC; 1605 int pinned = 0;
1693 1606
1694 set_freezable(); 1607 list_for_each_entry_safe(bp, n, buffer_list, b_list) {
1608 if (!wait) {
1609 if (xfs_buf_ispinned(bp)) {
1610 pinned++;
1611 continue;
1612 }
1613 if (!xfs_buf_trylock(bp))
1614 continue;
1615 } else {
1616 xfs_buf_lock(bp);
1617 }
1695 1618
1696 do { 1619 /*
1697 long age = xfs_buf_age_centisecs * msecs_to_jiffies(10); 1620 * Someone else might have written the buffer synchronously or
1698 long tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10); 1621 * marked it stale in the meantime. In that case only the
1699 struct list_head tmp; 1622 * _XBF_DELWRI_Q flag got cleared, and we have to drop the
1700 struct blk_plug plug; 1623 * reference and remove it from the list here.
1624 */
1625 if (!(bp->b_flags & _XBF_DELWRI_Q)) {
1626 list_del_init(&bp->b_list);
1627 xfs_buf_relse(bp);
1628 continue;
1629 }
1701 1630
1702 if (unlikely(freezing(current))) 1631 list_move_tail(&bp->b_list, io_list);
1703 try_to_freeze(); 1632 trace_xfs_buf_delwri_split(bp, _RET_IP_);
1633 }
1704 1634
1705 /* sleep for a long time if there is nothing to do. */ 1635 list_sort(NULL, io_list, xfs_buf_cmp);
1706 if (list_empty(&target->bt_delwri_queue))
1707 tout = MAX_SCHEDULE_TIMEOUT;
1708 schedule_timeout_interruptible(tout);
1709 1636
1710 xfs_buf_delwri_split(target, &tmp, age); 1637 blk_start_plug(&plug);
1711 list_sort(NULL, &tmp, xfs_buf_cmp); 1638 list_for_each_entry_safe(bp, n, io_list, b_list) {
1639 bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC);
1640 bp->b_flags |= XBF_WRITE;
1712 1641
1713 blk_start_plug(&plug); 1642 if (!wait) {
1714 while (!list_empty(&tmp)) { 1643 bp->b_flags |= XBF_ASYNC;
1715 struct xfs_buf *bp;
1716 bp = list_first_entry(&tmp, struct xfs_buf, b_list);
1717 list_del_init(&bp->b_list); 1644 list_del_init(&bp->b_list);
1718 xfs_bdstrat_cb(bp);
1719 } 1645 }
1720 blk_finish_plug(&plug); 1646 xfs_bdstrat_cb(bp);
1721 } while (!kthread_should_stop()); 1647 }
1648 blk_finish_plug(&plug);
1722 1649
1723 return 0; 1650 return pinned;
1724} 1651}
1725 1652
1726/* 1653/*
1727 * Go through all incore buffers, and release buffers if they belong to 1654 * Write out a buffer list asynchronously.
1728 * the given device. This is used in filesystem error handling to 1655 *
1729 * preserve the consistency of its metadata. 1656 * This will take the @buffer_list, write all non-locked and non-pinned buffers
1657 * out and not wait for I/O completion on any of the buffers. This interface
1658 * is only safely useable for callers that can track I/O completion by higher
1659 * level means, e.g. AIL pushing as the @buffer_list is consumed in this
1660 * function.
1730 */ 1661 */
1731int 1662int
1732xfs_flush_buftarg( 1663xfs_buf_delwri_submit_nowait(
1733 xfs_buftarg_t *target, 1664 struct list_head *buffer_list)
1734 int wait)
1735{ 1665{
1736 xfs_buf_t *bp; 1666 LIST_HEAD (io_list);
1737 int pincount = 0; 1667 return __xfs_buf_delwri_submit(buffer_list, &io_list, false);
1738 LIST_HEAD(tmp_list); 1668}
1739 LIST_HEAD(wait_list);
1740 struct blk_plug plug;
1741 1669
1742 flush_workqueue(xfslogd_workqueue); 1670/*
1671 * Write out a buffer list synchronously.
1672 *
1673 * This will take the @buffer_list, write all buffers out and wait for I/O
1674 * completion on all of the buffers. @buffer_list is consumed by the function,
1675 * so callers must have some other way of tracking buffers if they require such
1676 * functionality.
1677 */
1678int
1679xfs_buf_delwri_submit(
1680 struct list_head *buffer_list)
1681{
1682 LIST_HEAD (io_list);
1683 int error = 0, error2;
1684 struct xfs_buf *bp;
1743 1685
1744 set_bit(XBT_FORCE_FLUSH, &target->bt_flags); 1686 __xfs_buf_delwri_submit(buffer_list, &io_list, true);
1745 pincount = xfs_buf_delwri_split(target, &tmp_list, 0);
1746 1687
1747 /* 1688 /* Wait for IO to complete. */
1748 * Dropped the delayed write list lock, now walk the temporary list. 1689 while (!list_empty(&io_list)) {
1749 * All I/O is issued async and then if we need to wait for completion 1690 bp = list_first_entry(&io_list, struct xfs_buf, b_list);
1750 * we do that after issuing all the IO.
1751 */
1752 list_sort(NULL, &tmp_list, xfs_buf_cmp);
1753 1691
1754 blk_start_plug(&plug);
1755 while (!list_empty(&tmp_list)) {
1756 bp = list_first_entry(&tmp_list, struct xfs_buf, b_list);
1757 ASSERT(target == bp->b_target);
1758 list_del_init(&bp->b_list); 1692 list_del_init(&bp->b_list);
1759 if (wait) { 1693 error2 = xfs_buf_iowait(bp);
1760 bp->b_flags &= ~XBF_ASYNC; 1694 xfs_buf_relse(bp);
1761 list_add(&bp->b_list, &wait_list); 1695 if (!error)
1762 } 1696 error = error2;
1763 xfs_bdstrat_cb(bp);
1764 }
1765 blk_finish_plug(&plug);
1766
1767 if (wait) {
1768 /* Wait for IO to complete. */
1769 while (!list_empty(&wait_list)) {
1770 bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
1771
1772 list_del_init(&bp->b_list);
1773 xfs_buf_iowait(bp);
1774 xfs_buf_relse(bp);
1775 }
1776 } 1697 }
1777 1698
1778 return pincount; 1699 return error;
1779} 1700}
1780 1701
1781int __init 1702int __init
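The asynchronous variant defined in the hunks above, xfs_buf_delwri_submit_nowait,
is meant for callers such as the AIL push worker that track I/O completion by
other means. A hedged fragment of how such a caller might drive it; the function
name and the mp/buffer_list parameters are again only illustrative, the real
consumer being xfsaild in xfs_trans_ail.c:

STATIC void
example_push_buffers(
	struct xfs_mount	*mp,
	struct list_head	*buffer_list)	/* caller's local delwri list */
{
	/*
	 * Issue asynchronous writeback for the list.  Buffers that are pinned
	 * or cannot be trylocked are skipped and remain on buffer_list for a
	 * later pass; the number of pinned buffers skipped is returned.
	 */
	if (xfs_buf_delwri_submit_nowait(buffer_list)) {
		/*
		 * Pinned buffers cannot be written until the log is forced.
		 * The real AIL code folds this into its log force and sleep
		 * time decisions; a simple caller could just force the log
		 * and retry on a later pass.
		 */
		xfs_log_force(mp, 0);
	}
}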
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 5bf3be45f543..7083cf44d95f 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -49,8 +49,7 @@ typedef enum {
49#define XBF_MAPPED (1 << 3) /* buffer mapped (b_addr valid) */ 49#define XBF_MAPPED (1 << 3) /* buffer mapped (b_addr valid) */
50#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */ 50#define XBF_ASYNC (1 << 4) /* initiator will not wait for completion */
51#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */ 51#define XBF_DONE (1 << 5) /* all pages in the buffer uptodate */
52#define XBF_DELWRI (1 << 6) /* buffer has dirty pages */ 52#define XBF_STALE (1 << 6) /* buffer has been staled, do not find it */
53#define XBF_STALE (1 << 7) /* buffer has been staled, do not find it */
54 53
55/* I/O hints for the BIO layer */ 54/* I/O hints for the BIO layer */
56#define XBF_SYNCIO (1 << 10)/* treat this buffer as synchronous I/O */ 55#define XBF_SYNCIO (1 << 10)/* treat this buffer as synchronous I/O */
@@ -65,7 +64,7 @@ typedef enum {
65/* flags used only internally */ 64/* flags used only internally */
66#define _XBF_PAGES (1 << 20)/* backed by refcounted pages */ 65#define _XBF_PAGES (1 << 20)/* backed by refcounted pages */
67#define _XBF_KMEM (1 << 21)/* backed by heap memory */ 66#define _XBF_KMEM (1 << 21)/* backed by heap memory */
68#define _XBF_DELWRI_Q (1 << 22)/* buffer on delwri queue */ 67#define _XBF_DELWRI_Q (1 << 22)/* buffer on a delwri queue */
69 68
70typedef unsigned int xfs_buf_flags_t; 69typedef unsigned int xfs_buf_flags_t;
71 70
@@ -76,7 +75,6 @@ typedef unsigned int xfs_buf_flags_t;
76 { XBF_MAPPED, "MAPPED" }, \ 75 { XBF_MAPPED, "MAPPED" }, \
77 { XBF_ASYNC, "ASYNC" }, \ 76 { XBF_ASYNC, "ASYNC" }, \
78 { XBF_DONE, "DONE" }, \ 77 { XBF_DONE, "DONE" }, \
79 { XBF_DELWRI, "DELWRI" }, \
80 { XBF_STALE, "STALE" }, \ 78 { XBF_STALE, "STALE" }, \
81 { XBF_SYNCIO, "SYNCIO" }, \ 79 { XBF_SYNCIO, "SYNCIO" }, \
82 { XBF_FUA, "FUA" }, \ 80 { XBF_FUA, "FUA" }, \
@@ -88,10 +86,6 @@ typedef unsigned int xfs_buf_flags_t;
88 { _XBF_KMEM, "KMEM" }, \ 86 { _XBF_KMEM, "KMEM" }, \
89 { _XBF_DELWRI_Q, "DELWRI_Q" } 87 { _XBF_DELWRI_Q, "DELWRI_Q" }
90 88
91typedef enum {
92 XBT_FORCE_FLUSH = 0,
93} xfs_buftarg_flags_t;
94
95typedef struct xfs_buftarg { 89typedef struct xfs_buftarg {
96 dev_t bt_dev; 90 dev_t bt_dev;
97 struct block_device *bt_bdev; 91 struct block_device *bt_bdev;
@@ -101,12 +95,6 @@ typedef struct xfs_buftarg {
101 unsigned int bt_sshift; 95 unsigned int bt_sshift;
102 size_t bt_smask; 96 size_t bt_smask;
103 97
104 /* per device delwri queue */
105 struct task_struct *bt_task;
106 struct list_head bt_delwri_queue;
107 spinlock_t bt_delwri_lock;
108 unsigned long bt_flags;
109
110 /* LRU control structures */ 98 /* LRU control structures */
111 struct shrinker bt_shrinker; 99 struct shrinker bt_shrinker;
112 struct list_head bt_lru; 100 struct list_head bt_lru;
@@ -150,7 +138,6 @@ typedef struct xfs_buf {
150 struct xfs_trans *b_transp; 138 struct xfs_trans *b_transp;
151 struct page **b_pages; /* array of page pointers */ 139 struct page **b_pages; /* array of page pointers */
152 struct page *b_page_array[XB_PAGES]; /* inline pages */ 140 struct page *b_page_array[XB_PAGES]; /* inline pages */
153 unsigned long b_queuetime; /* time buffer was queued */
154 atomic_t b_pin_count; /* pin count */ 141 atomic_t b_pin_count; /* pin count */
155 atomic_t b_io_remaining; /* #outstanding I/O requests */ 142 atomic_t b_io_remaining; /* #outstanding I/O requests */
156 unsigned int b_page_count; /* size of page array */ 143 unsigned int b_page_count; /* size of page array */
@@ -220,24 +207,22 @@ static inline int xfs_buf_geterror(xfs_buf_t *bp)
220extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t); 207extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t);
221 208
222/* Delayed Write Buffer Routines */ 209/* Delayed Write Buffer Routines */
223extern void xfs_buf_delwri_queue(struct xfs_buf *); 210extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *);
224extern void xfs_buf_delwri_dequeue(struct xfs_buf *); 211extern int xfs_buf_delwri_submit(struct list_head *);
225extern void xfs_buf_delwri_promote(struct xfs_buf *); 212extern int xfs_buf_delwri_submit_nowait(struct list_head *);
226 213
227/* Buffer Daemon Setup Routines */ 214/* Buffer Daemon Setup Routines */
228extern int xfs_buf_init(void); 215extern int xfs_buf_init(void);
229extern void xfs_buf_terminate(void); 216extern void xfs_buf_terminate(void);
230 217
231#define XFS_BUF_ZEROFLAGS(bp) \ 218#define XFS_BUF_ZEROFLAGS(bp) \
232 ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI| \ 219 ((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC| \
233 XBF_SYNCIO|XBF_FUA|XBF_FLUSH)) 220 XBF_SYNCIO|XBF_FUA|XBF_FLUSH))
234 221
235void xfs_buf_stale(struct xfs_buf *bp); 222void xfs_buf_stale(struct xfs_buf *bp);
236#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE) 223#define XFS_BUF_UNSTALE(bp) ((bp)->b_flags &= ~XBF_STALE)
237#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE) 224#define XFS_BUF_ISSTALE(bp) ((bp)->b_flags & XBF_STALE)
238 225
239#define XFS_BUF_ISDELAYWRITE(bp) ((bp)->b_flags & XBF_DELWRI)
240
241#define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE) 226#define XFS_BUF_DONE(bp) ((bp)->b_flags |= XBF_DONE)
242#define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE) 227#define XFS_BUF_UNDONE(bp) ((bp)->b_flags &= ~XBF_DONE)
243#define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE) 228#define XFS_BUF_ISDONE(bp) ((bp)->b_flags & XBF_DONE)
@@ -287,7 +272,6 @@ extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *,
287extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); 272extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
288extern void xfs_wait_buftarg(xfs_buftarg_t *); 273extern void xfs_wait_buftarg(xfs_buftarg_t *);
289extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); 274extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
290extern int xfs_flush_buftarg(xfs_buftarg_t *, int);
291 275
292#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) 276#define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev)
293#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) 277#define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev)
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 3a0bc38f1859..fb20f384b566 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -418,7 +418,6 @@ xfs_buf_item_unpin(
418 if (freed && stale) { 418 if (freed && stale) {
419 ASSERT(bip->bli_flags & XFS_BLI_STALE); 419 ASSERT(bip->bli_flags & XFS_BLI_STALE);
420 ASSERT(xfs_buf_islocked(bp)); 420 ASSERT(xfs_buf_islocked(bp));
421 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
422 ASSERT(XFS_BUF_ISSTALE(bp)); 421 ASSERT(XFS_BUF_ISSTALE(bp));
423 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL); 422 ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
424 423
@@ -469,34 +468,28 @@ xfs_buf_item_unpin(
469 } 468 }
470} 469}
471 470
472/*
473 * This is called to attempt to lock the buffer associated with this
474 * buf log item. Don't sleep on the buffer lock. If we can't get
475 * the lock right away, return 0. If we can get the lock, take a
476 * reference to the buffer. If this is a delayed write buffer that
477 * needs AIL help to be written back, invoke the pushbuf routine
478 * rather than the normal success path.
479 */
480STATIC uint 471STATIC uint
481xfs_buf_item_trylock( 472xfs_buf_item_push(
482 struct xfs_log_item *lip) 473 struct xfs_log_item *lip,
474 struct list_head *buffer_list)
483{ 475{
484 struct xfs_buf_log_item *bip = BUF_ITEM(lip); 476 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
485 struct xfs_buf *bp = bip->bli_buf; 477 struct xfs_buf *bp = bip->bli_buf;
478 uint rval = XFS_ITEM_SUCCESS;
486 479
487 if (xfs_buf_ispinned(bp)) 480 if (xfs_buf_ispinned(bp))
488 return XFS_ITEM_PINNED; 481 return XFS_ITEM_PINNED;
489 if (!xfs_buf_trylock(bp)) 482 if (!xfs_buf_trylock(bp))
490 return XFS_ITEM_LOCKED; 483 return XFS_ITEM_LOCKED;
491 484
492 /* take a reference to the buffer. */
493 xfs_buf_hold(bp);
494
495 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 485 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
496 trace_xfs_buf_item_trylock(bip); 486
497 if (XFS_BUF_ISDELAYWRITE(bp)) 487 trace_xfs_buf_item_push(bip);
498 return XFS_ITEM_PUSHBUF; 488
499 return XFS_ITEM_SUCCESS; 489 if (!xfs_buf_delwri_queue(bp, buffer_list))
490 rval = XFS_ITEM_FLUSHING;
491 xfs_buf_unlock(bp);
492 return rval;
500} 493}
501 494
502/* 495/*
@@ -609,48 +602,6 @@ xfs_buf_item_committed(
609 return lsn; 602 return lsn;
610} 603}
611 604
612/*
613 * The buffer is locked, but is not a delayed write buffer.
614 */
615STATIC void
616xfs_buf_item_push(
617 struct xfs_log_item *lip)
618{
619 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
620 struct xfs_buf *bp = bip->bli_buf;
621
622 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
623 ASSERT(!XFS_BUF_ISDELAYWRITE(bp));
624
625 trace_xfs_buf_item_push(bip);
626
627 xfs_buf_delwri_queue(bp);
628 xfs_buf_relse(bp);
629}
630
631/*
632 * The buffer is locked and is a delayed write buffer. Promote the buffer
633 * in the delayed write queue as the caller knows that they must invoke
634 * the xfsbufd to get this buffer written. We have to unlock the buffer
635 * to allow the xfsbufd to write it, too.
636 */
637STATIC bool
638xfs_buf_item_pushbuf(
639 struct xfs_log_item *lip)
640{
641 struct xfs_buf_log_item *bip = BUF_ITEM(lip);
642 struct xfs_buf *bp = bip->bli_buf;
643
644 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
645 ASSERT(XFS_BUF_ISDELAYWRITE(bp));
646
647 trace_xfs_buf_item_pushbuf(bip);
648
649 xfs_buf_delwri_promote(bp);
650 xfs_buf_relse(bp);
651 return true;
652}
653
654STATIC void 605STATIC void
655xfs_buf_item_committing( 606xfs_buf_item_committing(
656 struct xfs_log_item *lip, 607 struct xfs_log_item *lip,
@@ -666,11 +617,9 @@ static const struct xfs_item_ops xfs_buf_item_ops = {
666 .iop_format = xfs_buf_item_format, 617 .iop_format = xfs_buf_item_format,
667 .iop_pin = xfs_buf_item_pin, 618 .iop_pin = xfs_buf_item_pin,
668 .iop_unpin = xfs_buf_item_unpin, 619 .iop_unpin = xfs_buf_item_unpin,
669 .iop_trylock = xfs_buf_item_trylock,
670 .iop_unlock = xfs_buf_item_unlock, 620 .iop_unlock = xfs_buf_item_unlock,
671 .iop_committed = xfs_buf_item_committed, 621 .iop_committed = xfs_buf_item_committed,
672 .iop_push = xfs_buf_item_push, 622 .iop_push = xfs_buf_item_push,
673 .iop_pushbuf = xfs_buf_item_pushbuf,
674 .iop_committing = xfs_buf_item_committing 623 .iop_committing = xfs_buf_item_committing
675}; 624};
676 625
@@ -989,20 +938,27 @@ xfs_buf_iodone_callbacks(
989 * If the write was asynchronous then no one will be looking for the 938 * If the write was asynchronous then no one will be looking for the
990 * error. Clear the error state and write the buffer out again. 939 * error. Clear the error state and write the buffer out again.
991 * 940 *
992 * During sync or umount we'll write all pending buffers again 941 * XXX: This helps against transient write errors, but we need to find
993 * synchronous, which will catch these errors if they keep hanging 942 * a way to shut the filesystem down if the writes keep failing.
994 * around. 943 *
944 * In practice we'll shut the filesystem down soon as non-transient
945 * erorrs tend to affect the whole device and a failing log write
946 * will make us give up. But we really ought to do better here.
995 */ 947 */
996 if (XFS_BUF_ISASYNC(bp)) { 948 if (XFS_BUF_ISASYNC(bp)) {
949 ASSERT(bp->b_iodone != NULL);
950
951 trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
952
997 xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */ 953 xfs_buf_ioerror(bp, 0); /* errno of 0 unsets the flag */
998 954
999 if (!XFS_BUF_ISSTALE(bp)) { 955 if (!XFS_BUF_ISSTALE(bp)) {
1000 xfs_buf_delwri_queue(bp); 956 bp->b_flags |= XBF_WRITE | XBF_ASYNC | XBF_DONE;
1001 XFS_BUF_DONE(bp); 957 xfs_bdstrat_cb(bp);
958 } else {
959 xfs_buf_relse(bp);
1002 } 960 }
1003 ASSERT(bp->b_iodone != NULL); 961
1004 trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
1005 xfs_buf_relse(bp);
1006 return; 962 return;
1007 } 963 }
1008 964
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 53757d83e4f6..65b8aa37622e 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -1005,39 +1005,6 @@ xfs_dqlock2(
1005 } 1005 }
1006} 1006}
1007 1007
1008/*
1009 * Give the buffer a little push if it is incore and
1010 * wait on the flush lock.
1011 */
1012void
1013xfs_dqflock_pushbuf_wait(
1014 xfs_dquot_t *dqp)
1015{
1016 xfs_mount_t *mp = dqp->q_mount;
1017 xfs_buf_t *bp;
1018
1019 /*
1020 * Check to see if the dquot has been flushed delayed
1021 * write. If so, grab its buffer and send it
1022 * out immediately. We'll be able to acquire
1023 * the flush lock when the I/O completes.
1024 */
1025 bp = xfs_incore(mp->m_ddev_targp, dqp->q_blkno,
1026 mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
1027 if (!bp)
1028 goto out_lock;
1029
1030 if (XFS_BUF_ISDELAYWRITE(bp)) {
1031 if (xfs_buf_ispinned(bp))
1032 xfs_log_force(mp, 0);
1033 xfs_buf_delwri_promote(bp);
1034 wake_up_process(bp->b_target->bt_task);
1035 }
1036 xfs_buf_relse(bp);
1037out_lock:
1038 xfs_dqflock(dqp);
1039}
1040
1041int __init 1008int __init
1042xfs_qm_init(void) 1009xfs_qm_init(void)
1043{ 1010{
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 5f2a2f2c0c5b..7d20af27346d 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -152,7 +152,6 @@ extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *,
152extern void xfs_qm_dqput(xfs_dquot_t *); 152extern void xfs_qm_dqput(xfs_dquot_t *);
153 153
154extern void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *); 154extern void xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *);
155extern void xfs_dqflock_pushbuf_wait(struct xfs_dquot *dqp);
156 155
157static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp) 156static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
158{ 157{
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 8d8295814272..9c5d58d24e54 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -108,46 +108,6 @@ xfs_qm_dquot_logitem_unpin(
108 wake_up(&dqp->q_pinwait); 108 wake_up(&dqp->q_pinwait);
109} 109}
110 110
111/*
112 * Given the logitem, this writes the corresponding dquot entry to disk
113 * asynchronously. This is called with the dquot entry securely locked;
114 * we simply get xfs_qm_dqflush() to do the work, and unlock the dquot
115 * at the end.
116 */
117STATIC void
118xfs_qm_dquot_logitem_push(
119 struct xfs_log_item *lip)
120{
121 struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
122 struct xfs_buf *bp = NULL;
123 int error;
124
125 ASSERT(XFS_DQ_IS_LOCKED(dqp));
126 ASSERT(!completion_done(&dqp->q_flush));
127 ASSERT(atomic_read(&dqp->q_pincount) == 0);
128
129 /*
130 * Since we were able to lock the dquot's flush lock and
131 * we found it on the AIL, the dquot must be dirty. This
132 * is because the dquot is removed from the AIL while still
133 * holding the flush lock in xfs_dqflush_done(). Thus, if
134 * we found it in the AIL and were able to obtain the flush
135 * lock without sleeping, then there must not have been
136 * anyone in the process of flushing the dquot.
137 */
138 error = xfs_qm_dqflush(dqp, &bp);
139 if (error) {
140 xfs_warn(dqp->q_mount, "%s: push error %d on dqp %p",
141 __func__, error, dqp);
142 goto out_unlock;
143 }
144
145 xfs_buf_delwri_queue(bp);
146 xfs_buf_relse(bp);
147out_unlock:
148 xfs_dqunlock(dqp);
149}
150
151STATIC xfs_lsn_t 111STATIC xfs_lsn_t
152xfs_qm_dquot_logitem_committed( 112xfs_qm_dquot_logitem_committed(
153 struct xfs_log_item *lip, 113 struct xfs_log_item *lip,
@@ -179,67 +139,15 @@ xfs_qm_dqunpin_wait(
179 wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0)); 139 wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0));
180} 140}
181 141
182/*
183 * This is called when IOP_TRYLOCK returns XFS_ITEM_PUSHBUF to indicate that
184 * the dquot is locked by us, but the flush lock isn't. So, here we are
185 * going to see if the relevant dquot buffer is incore, waiting on DELWRI.
186 * If so, we want to push it out to help us take this item off the AIL as soon
187 * as possible.
188 *
189 * We must not be holding the AIL lock at this point. Calling incore() to
190 * search the buffer cache can be a time consuming thing, and AIL lock is a
191 * spinlock.
192 */
193STATIC bool
194xfs_qm_dquot_logitem_pushbuf(
195 struct xfs_log_item *lip)
196{
197 struct xfs_dq_logitem *qlip = DQUOT_ITEM(lip);
198 struct xfs_dquot *dqp = qlip->qli_dquot;
199 struct xfs_buf *bp;
200 bool ret = true;
201
202 ASSERT(XFS_DQ_IS_LOCKED(dqp));
203
204 /*
205 * If flushlock isn't locked anymore, chances are that the
206 * inode flush completed and the inode was taken off the AIL.
207 * So, just get out.
208 */
209 if (completion_done(&dqp->q_flush) ||
210 !(lip->li_flags & XFS_LI_IN_AIL)) {
211 xfs_dqunlock(dqp);
212 return true;
213 }
214
215 bp = xfs_incore(dqp->q_mount->m_ddev_targp, qlip->qli_format.qlf_blkno,
216 dqp->q_mount->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
217 xfs_dqunlock(dqp);
218 if (!bp)
219 return true;
220 if (XFS_BUF_ISDELAYWRITE(bp))
221 xfs_buf_delwri_promote(bp);
222 if (xfs_buf_ispinned(bp))
223 ret = false;
224 xfs_buf_relse(bp);
225 return ret;
226}
227
228/*
229 * This is called to attempt to lock the dquot associated with this
230 * dquot log item. Don't sleep on the dquot lock or the flush lock.
231 * If the flush lock is already held, indicating that the dquot has
232 * been or is in the process of being flushed, then see if we can
233 * find the dquot's buffer in the buffer cache without sleeping. If
234 * we can and it is marked delayed write, then we want to send it out.
235 * We delay doing so until the push routine, though, to avoid sleeping
236 * in any device strategy routines.
237 */
238STATIC uint 142STATIC uint
239xfs_qm_dquot_logitem_trylock( 143xfs_qm_dquot_logitem_push(
240 struct xfs_log_item *lip) 144 struct xfs_log_item *lip,
145 struct list_head *buffer_list)
241{ 146{
242 struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot; 147 struct xfs_dquot *dqp = DQUOT_ITEM(lip)->qli_dquot;
148 struct xfs_buf *bp = NULL;
149 uint rval = XFS_ITEM_SUCCESS;
150 int error;
243 151
244 if (atomic_read(&dqp->q_pincount) > 0) 152 if (atomic_read(&dqp->q_pincount) > 0)
245 return XFS_ITEM_PINNED; 153 return XFS_ITEM_PINNED;
@@ -252,20 +160,36 @@ xfs_qm_dquot_logitem_trylock(
252 * taking the quota lock. 160 * taking the quota lock.
253 */ 161 */
254 if (atomic_read(&dqp->q_pincount) > 0) { 162 if (atomic_read(&dqp->q_pincount) > 0) {
255 xfs_dqunlock(dqp); 163 rval = XFS_ITEM_PINNED;
256 return XFS_ITEM_PINNED; 164 goto out_unlock;
257 } 165 }
258 166
167 /*
168 * Someone else is already flushing the dquot. Nothing we can do
169 * here but wait for the flush to finish and remove the item from
170 * the AIL.
171 */
259 if (!xfs_dqflock_nowait(dqp)) { 172 if (!xfs_dqflock_nowait(dqp)) {
260 /* 173 rval = XFS_ITEM_FLUSHING;
261 * dquot has already been flushed to the backing buffer, 174 goto out_unlock;
262 * leave it locked, pushbuf routine will unlock it. 175 }
263 */ 176
264 return XFS_ITEM_PUSHBUF; 177 spin_unlock(&lip->li_ailp->xa_lock);
178
179 error = xfs_qm_dqflush(dqp, &bp);
180 if (error) {
181 xfs_warn(dqp->q_mount, "%s: push error %d on dqp %p",
182 __func__, error, dqp);
183 } else {
184 if (!xfs_buf_delwri_queue(bp, buffer_list))
185 rval = XFS_ITEM_FLUSHING;
186 xfs_buf_relse(bp);
265 } 187 }
266 188
267 ASSERT(lip->li_flags & XFS_LI_IN_AIL); 189 spin_lock(&lip->li_ailp->xa_lock);
268 return XFS_ITEM_SUCCESS; 190out_unlock:
191 xfs_dqunlock(dqp);
192 return rval;
269} 193}
270 194
271/* 195/*
@@ -316,11 +240,9 @@ static const struct xfs_item_ops xfs_dquot_item_ops = {
316 .iop_format = xfs_qm_dquot_logitem_format, 240 .iop_format = xfs_qm_dquot_logitem_format,
317 .iop_pin = xfs_qm_dquot_logitem_pin, 241 .iop_pin = xfs_qm_dquot_logitem_pin,
318 .iop_unpin = xfs_qm_dquot_logitem_unpin, 242 .iop_unpin = xfs_qm_dquot_logitem_unpin,
319 .iop_trylock = xfs_qm_dquot_logitem_trylock,
320 .iop_unlock = xfs_qm_dquot_logitem_unlock, 243 .iop_unlock = xfs_qm_dquot_logitem_unlock,
321 .iop_committed = xfs_qm_dquot_logitem_committed, 244 .iop_committed = xfs_qm_dquot_logitem_committed,
322 .iop_push = xfs_qm_dquot_logitem_push, 245 .iop_push = xfs_qm_dquot_logitem_push,
323 .iop_pushbuf = xfs_qm_dquot_logitem_pushbuf,
324 .iop_committing = xfs_qm_dquot_logitem_committing 246 .iop_committing = xfs_qm_dquot_logitem_committing
325}; 247};
326 248
@@ -415,11 +337,13 @@ xfs_qm_qoff_logitem_unpin(
415} 337}
416 338
417/* 339/*
418 * Quotaoff items have no locking, so just return success. 340 * There isn't much you can do to push a quotaoff item. It is simply
341 * stuck waiting for the log to be flushed to disk.
419 */ 342 */
420STATIC uint 343STATIC uint
421xfs_qm_qoff_logitem_trylock( 344xfs_qm_qoff_logitem_push(
422 struct xfs_log_item *lip) 345 struct xfs_log_item *lip,
346 struct list_head *buffer_list)
423{ 347{
424 return XFS_ITEM_LOCKED; 348 return XFS_ITEM_LOCKED;
425} 349}
@@ -446,17 +370,6 @@ xfs_qm_qoff_logitem_committed(
446 return lsn; 370 return lsn;
447} 371}
448 372
449/*
450 * There isn't much you can do to push on an quotaoff item. It is simply
451 * stuck waiting for the log to be flushed to disk.
452 */
453STATIC void
454xfs_qm_qoff_logitem_push(
455 struct xfs_log_item *lip)
456{
457}
458
459
460STATIC xfs_lsn_t 373STATIC xfs_lsn_t
461xfs_qm_qoffend_logitem_committed( 374xfs_qm_qoffend_logitem_committed(
462 struct xfs_log_item *lip, 375 struct xfs_log_item *lip,
@@ -504,7 +417,6 @@ static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
504 .iop_format = xfs_qm_qoff_logitem_format, 417 .iop_format = xfs_qm_qoff_logitem_format,
505 .iop_pin = xfs_qm_qoff_logitem_pin, 418 .iop_pin = xfs_qm_qoff_logitem_pin,
506 .iop_unpin = xfs_qm_qoff_logitem_unpin, 419 .iop_unpin = xfs_qm_qoff_logitem_unpin,
507 .iop_trylock = xfs_qm_qoff_logitem_trylock,
508 .iop_unlock = xfs_qm_qoff_logitem_unlock, 420 .iop_unlock = xfs_qm_qoff_logitem_unlock,
509 .iop_committed = xfs_qm_qoffend_logitem_committed, 421 .iop_committed = xfs_qm_qoffend_logitem_committed,
510 .iop_push = xfs_qm_qoff_logitem_push, 422 .iop_push = xfs_qm_qoff_logitem_push,
@@ -519,7 +431,6 @@ static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
519 .iop_format = xfs_qm_qoff_logitem_format, 431 .iop_format = xfs_qm_qoff_logitem_format,
520 .iop_pin = xfs_qm_qoff_logitem_pin, 432 .iop_pin = xfs_qm_qoff_logitem_pin,
521 .iop_unpin = xfs_qm_qoff_logitem_unpin, 433 .iop_unpin = xfs_qm_qoff_logitem_unpin,
522 .iop_trylock = xfs_qm_qoff_logitem_trylock,
523 .iop_unlock = xfs_qm_qoff_logitem_unlock, 434 .iop_unlock = xfs_qm_qoff_logitem_unlock,
524 .iop_committed = xfs_qm_qoff_logitem_committed, 435 .iop_committed = xfs_qm_qoff_logitem_committed,
525 .iop_push = xfs_qm_qoff_logitem_push, 436 .iop_push = xfs_qm_qoff_logitem_push,
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 35c2aff38b20..9549ef179e06 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -147,22 +147,20 @@ xfs_efi_item_unpin(
147} 147}
148 148
149/* 149/*
150 * Efi items have no locking or pushing. However, since EFIs are 150 * Efi items have no locking or pushing. However, since EFIs are pulled from
151 * pulled from the AIL when their corresponding EFDs are committed 151 * the AIL when their corresponding EFDs are committed to disk, their situation
152 * to disk, their situation is very similar to being pinned. Return 152 * is very similar to being pinned. Return XFS_ITEM_PINNED so that the caller
153 * XFS_ITEM_PINNED so that the caller will eventually flush the log. 153 * will eventually flush the log. This should help in getting the EFI out of
154 * This should help in getting the EFI out of the AIL. 154 * the AIL.
155 */ 155 */
156STATIC uint 156STATIC uint
157xfs_efi_item_trylock( 157xfs_efi_item_push(
158 struct xfs_log_item *lip) 158 struct xfs_log_item *lip,
159 struct list_head *buffer_list)
159{ 160{
160 return XFS_ITEM_PINNED; 161 return XFS_ITEM_PINNED;
161} 162}
162 163
163/*
164 * Efi items have no locking, so just return.
165 */
166STATIC void 164STATIC void
167xfs_efi_item_unlock( 165xfs_efi_item_unlock(
168 struct xfs_log_item *lip) 166 struct xfs_log_item *lip)
@@ -190,17 +188,6 @@ xfs_efi_item_committed(
190} 188}
191 189
192/* 190/*
193 * There isn't much you can do to push on an efi item. It is simply
194 * stuck waiting for all of its corresponding efd items to be
195 * committed to disk.
196 */
197STATIC void
198xfs_efi_item_push(
199 struct xfs_log_item *lip)
200{
201}
202
203/*
204 * The EFI dependency tracking op doesn't do squat. It can't because 191 * The EFI dependency tracking op doesn't do squat. It can't because
205 * it doesn't know where the free extent is coming from. The dependency 192 * it doesn't know where the free extent is coming from. The dependency
206 * tracking has to be handled by the "enclosing" metadata object. For 193 * tracking has to be handled by the "enclosing" metadata object. For
@@ -222,7 +209,6 @@ static const struct xfs_item_ops xfs_efi_item_ops = {
222 .iop_format = xfs_efi_item_format, 209 .iop_format = xfs_efi_item_format,
223 .iop_pin = xfs_efi_item_pin, 210 .iop_pin = xfs_efi_item_pin,
224 .iop_unpin = xfs_efi_item_unpin, 211 .iop_unpin = xfs_efi_item_unpin,
225 .iop_trylock = xfs_efi_item_trylock,
226 .iop_unlock = xfs_efi_item_unlock, 212 .iop_unlock = xfs_efi_item_unlock,
227 .iop_committed = xfs_efi_item_committed, 213 .iop_committed = xfs_efi_item_committed,
228 .iop_push = xfs_efi_item_push, 214 .iop_push = xfs_efi_item_push,
@@ -404,19 +390,17 @@ xfs_efd_item_unpin(
404} 390}
405 391
406/* 392/*
407 * Efd items have no locking, so just return success. 393 * There isn't much you can do to push on an efd item. It is simply stuck
394 * waiting for the log to be flushed to disk.
408 */ 395 */
409STATIC uint 396STATIC uint
410xfs_efd_item_trylock( 397xfs_efd_item_push(
411 struct xfs_log_item *lip) 398 struct xfs_log_item *lip,
399 struct list_head *buffer_list)
412{ 400{
413 return XFS_ITEM_LOCKED; 401 return XFS_ITEM_PINNED;
414} 402}
415 403
416/*
417 * Efd items have no locking or pushing, so return failure
418 * so that the caller doesn't bother with us.
419 */
420STATIC void 404STATIC void
421xfs_efd_item_unlock( 405xfs_efd_item_unlock(
422 struct xfs_log_item *lip) 406 struct xfs_log_item *lip)
@@ -451,16 +435,6 @@ xfs_efd_item_committed(
451} 435}
452 436
453/* 437/*
454 * There isn't much you can do to push on an efd item. It is simply
455 * stuck waiting for the log to be flushed to disk.
456 */
457STATIC void
458xfs_efd_item_push(
459 struct xfs_log_item *lip)
460{
461}
462
463/*
464 * The EFD dependency tracking op doesn't do squat. It can't because 438 * The EFD dependency tracking op doesn't do squat. It can't because
465 * it doesn't know where the free extent is coming from. The dependency 439 * it doesn't know where the free extent is coming from. The dependency
466 * tracking has to be handled by the "enclosing" metadata object. For 440 * tracking has to be handled by the "enclosing" metadata object. For
@@ -482,7 +456,6 @@ static const struct xfs_item_ops xfs_efd_item_ops = {
482 .iop_format = xfs_efd_item_format, 456 .iop_format = xfs_efd_item_format,
483 .iop_pin = xfs_efd_item_pin, 457 .iop_pin = xfs_efd_item_pin,
484 .iop_unpin = xfs_efd_item_unpin, 458 .iop_unpin = xfs_efd_item_unpin,
485 .iop_trylock = xfs_efd_item_trylock,
486 .iop_unlock = xfs_efd_item_unlock, 459 .iop_unlock = xfs_efd_item_unlock,
487 .iop_committed = xfs_efd_item_committed, 460 .iop_committed = xfs_efd_item_committed,
488 .iop_push = xfs_efd_item_push, 461 .iop_push = xfs_efd_item_push,
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 0fa987dea242..acd846d808b2 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2347,11 +2347,11 @@ cluster_corrupt_out:
2347 */ 2347 */
2348 rcu_read_unlock(); 2348 rcu_read_unlock();
2349 /* 2349 /*
2350 * Clean up the buffer. If it was B_DELWRI, just release it -- 2350 * Clean up the buffer. If it was delwri, just release it --
2351 * brelse can handle it with no problems. If not, shut down the 2351 * brelse can handle it with no problems. If not, shut down the
2352 * filesystem before releasing the buffer. 2352 * filesystem before releasing the buffer.
2353 */ 2353 */
2354 bufwasdelwri = XFS_BUF_ISDELAYWRITE(bp); 2354 bufwasdelwri = (bp->b_flags & _XBF_DELWRI_Q);
2355 if (bufwasdelwri) 2355 if (bufwasdelwri)
2356 xfs_buf_relse(bp); 2356 xfs_buf_relse(bp);
2357 2357
@@ -2685,27 +2685,6 @@ corrupt_out:
2685 return XFS_ERROR(EFSCORRUPTED); 2685 return XFS_ERROR(EFSCORRUPTED);
2686} 2686}
2687 2687
2688void
2689xfs_promote_inode(
2690 struct xfs_inode *ip)
2691{
2692 struct xfs_buf *bp;
2693
2694 ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
2695
2696 bp = xfs_incore(ip->i_mount->m_ddev_targp, ip->i_imap.im_blkno,
2697 ip->i_imap.im_len, XBF_TRYLOCK);
2698 if (!bp)
2699 return;
2700
2701 if (XFS_BUF_ISDELAYWRITE(bp)) {
2702 xfs_buf_delwri_promote(bp);
2703 wake_up_process(ip->i_mount->m_ddev_targp->bt_task);
2704 }
2705
2706 xfs_buf_relse(bp);
2707}
2708
2709/* 2688/*
2710 * Return a pointer to the extent record at file index idx. 2689 * Return a pointer to the extent record at file index idx.
2711 */ 2690 */
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index a2fa79ae410f..f0e252f384f9 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -530,7 +530,6 @@ int xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
530void xfs_iext_realloc(xfs_inode_t *, int, int); 530void xfs_iext_realloc(xfs_inode_t *, int, int);
531void xfs_iunpin_wait(xfs_inode_t *); 531void xfs_iunpin_wait(xfs_inode_t *);
532int xfs_iflush(struct xfs_inode *, struct xfs_buf **); 532int xfs_iflush(struct xfs_inode *, struct xfs_buf **);
533void xfs_promote_inode(struct xfs_inode *);
534void xfs_lock_inodes(xfs_inode_t **, int, uint); 533void xfs_lock_inodes(xfs_inode_t **, int, uint);
535void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint); 534void xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
536 535
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index d3601ab75dd3..8aaebb2f9efa 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -480,25 +480,16 @@ xfs_inode_item_unpin(
480 wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT); 480 wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT);
481} 481}
482 482
483/*
484 * This is called to attempt to lock the inode associated with this
485 * inode log item, in preparation for the push routine which does the actual
486 * iflush. Don't sleep on the inode lock or the flush lock.
487 *
488 * If the flush lock is already held, indicating that the inode has
489 * been or is in the process of being flushed, then (ideally) we'd like to
490 * see if the inode's buffer is still incore, and if so give it a nudge.
491 * We delay doing so until the pushbuf routine, though, to avoid holding
492 * the AIL lock across a call to the blackhole which is the buffer cache.
493 * Also we don't want to sleep in any device strategy routines, which can happen
494 * if we do the subsequent bawrite in here.
495 */
496STATIC uint 483STATIC uint
497xfs_inode_item_trylock( 484xfs_inode_item_push(
498 struct xfs_log_item *lip) 485 struct xfs_log_item *lip,
486 struct list_head *buffer_list)
499{ 487{
500 struct xfs_inode_log_item *iip = INODE_ITEM(lip); 488 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
501 struct xfs_inode *ip = iip->ili_inode; 489 struct xfs_inode *ip = iip->ili_inode;
490 struct xfs_buf *bp = NULL;
491 uint rval = XFS_ITEM_SUCCESS;
492 int error;
502 493
503 if (xfs_ipincount(ip) > 0) 494 if (xfs_ipincount(ip) > 0)
504 return XFS_ITEM_PINNED; 495 return XFS_ITEM_PINNED;
@@ -511,34 +502,45 @@ xfs_inode_item_trylock(
511 * taking the ilock. 502 * taking the ilock.
512 */ 503 */
513 if (xfs_ipincount(ip) > 0) { 504 if (xfs_ipincount(ip) > 0) {
514 xfs_iunlock(ip, XFS_ILOCK_SHARED); 505 rval = XFS_ITEM_PINNED;
515 return XFS_ITEM_PINNED; 506 goto out_unlock;
516 } 507 }
517 508
509 /*
510 * Someone else is already flushing the inode. Nothing we can do
511 * here but wait for the flush to finish and remove the item from
512 * the AIL.
513 */
518 if (!xfs_iflock_nowait(ip)) { 514 if (!xfs_iflock_nowait(ip)) {
519 /* 515 rval = XFS_ITEM_FLUSHING;
520 * inode has already been flushed to the backing buffer, 516 goto out_unlock;
521 * leave it locked in shared mode, pushbuf routine will
522 * unlock it.
523 */
524 return XFS_ITEM_PUSHBUF;
525 } 517 }
526 518
527 /* Stale items should force out the iclog */ 519 /*
520 * Stale inode items should force out the iclog.
521 */
528 if (ip->i_flags & XFS_ISTALE) { 522 if (ip->i_flags & XFS_ISTALE) {
529 xfs_ifunlock(ip); 523 xfs_ifunlock(ip);
530 xfs_iunlock(ip, XFS_ILOCK_SHARED); 524 xfs_iunlock(ip, XFS_ILOCK_SHARED);
531 return XFS_ITEM_PINNED; 525 return XFS_ITEM_PINNED;
532 } 526 }
533 527
534#ifdef DEBUG 528 ASSERT(iip->ili_fields != 0 || XFS_FORCED_SHUTDOWN(ip->i_mount));
535 if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) { 529 ASSERT(iip->ili_logged == 0 || XFS_FORCED_SHUTDOWN(ip->i_mount));
536 ASSERT(iip->ili_fields != 0); 530
537 ASSERT(iip->ili_logged == 0); 531 spin_unlock(&lip->li_ailp->xa_lock);
538 ASSERT(lip->li_flags & XFS_LI_IN_AIL); 532
533 error = xfs_iflush(ip, &bp);
534 if (!error) {
535 if (!xfs_buf_delwri_queue(bp, buffer_list))
536 rval = XFS_ITEM_FLUSHING;
537 xfs_buf_relse(bp);
539 } 538 }
540#endif 539
541 return XFS_ITEM_SUCCESS; 540 spin_lock(&lip->li_ailp->xa_lock);
541out_unlock:
542 xfs_iunlock(ip, XFS_ILOCK_SHARED);
543 return rval;
542} 544}
543 545
544/* 546/*
@@ -623,92 +625,6 @@ xfs_inode_item_committed(
623} 625}
624 626
625/* 627/*
626 * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK
627 * failed to get the inode flush lock but did get the inode locked SHARED.
628 * Here we're trying to see if the inode buffer is incore, and if so whether it's
629 * marked delayed write. If that's the case, we'll promote it and that will
630 * allow the caller to write the buffer by triggering the xfsbufd to run.
631 */
632STATIC bool
633xfs_inode_item_pushbuf(
634 struct xfs_log_item *lip)
635{
636 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
637 struct xfs_inode *ip = iip->ili_inode;
638 struct xfs_buf *bp;
639 bool ret = true;
640
641 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
642
643 /*
644 * If a flush is not in progress anymore, chances are that the
645 * inode was taken off the AIL. So, just get out.
646 */
647 if (!xfs_isiflocked(ip) ||
648 !(lip->li_flags & XFS_LI_IN_AIL)) {
649 xfs_iunlock(ip, XFS_ILOCK_SHARED);
650 return true;
651 }
652
653 bp = xfs_incore(ip->i_mount->m_ddev_targp, iip->ili_format.ilf_blkno,
654 iip->ili_format.ilf_len, XBF_TRYLOCK);
655
656 xfs_iunlock(ip, XFS_ILOCK_SHARED);
657 if (!bp)
658 return true;
659 if (XFS_BUF_ISDELAYWRITE(bp))
660 xfs_buf_delwri_promote(bp);
661 if (xfs_buf_ispinned(bp))
662 ret = false;
663 xfs_buf_relse(bp);
664 return ret;
665}
666
667/*
668 * This is called to asynchronously write the inode associated with this
669 * inode log item out to disk. The inode will already have been locked by
670 * a successful call to xfs_inode_item_trylock().
671 */
672STATIC void
673xfs_inode_item_push(
674 struct xfs_log_item *lip)
675{
676 struct xfs_inode_log_item *iip = INODE_ITEM(lip);
677 struct xfs_inode *ip = iip->ili_inode;
678 struct xfs_buf *bp = NULL;
679 int error;
680
681 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
682 ASSERT(xfs_isiflocked(ip));
683
684 /*
685 * Since we were able to lock the inode's flush lock and
686 * we found it on the AIL, the inode must be dirty. This
687 * is because the inode is removed from the AIL while still
688 * holding the flush lock in xfs_iflush_done(). Thus, if
689 * we found it in the AIL and were able to obtain the flush
690 * lock without sleeping, then there must not have been
691 * anyone in the process of flushing the inode.
692 */
693 ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || iip->ili_fields != 0);
694
695 /*
696 * Push the inode to it's backing buffer. This will not remove the
697 * inode from the AIL - a further push will be required to trigger a
698 * buffer push. However, this allows all the dirty inodes to be pushed
699 * to the buffer before it is pushed to disk. The buffer IO completion
700 * will pull the inode from the AIL, mark it clean and unlock the flush
701 * lock.
702 */
703 error = xfs_iflush(ip, &bp);
704 if (!error) {
705 xfs_buf_delwri_queue(bp);
706 xfs_buf_relse(bp);
707 }
708 xfs_iunlock(ip, XFS_ILOCK_SHARED);
709}
710
711/*
712 * XXX rcc - this one really has to do something. Probably needs 628 * XXX rcc - this one really has to do something. Probably needs
713 * to stamp in a new field in the incore inode. 629 * to stamp in a new field in the incore inode.
714 */ 630 */
@@ -728,11 +644,9 @@ static const struct xfs_item_ops xfs_inode_item_ops = {
728 .iop_format = xfs_inode_item_format, 644 .iop_format = xfs_inode_item_format,
729 .iop_pin = xfs_inode_item_pin, 645 .iop_pin = xfs_inode_item_pin,
730 .iop_unpin = xfs_inode_item_unpin, 646 .iop_unpin = xfs_inode_item_unpin,
731 .iop_trylock = xfs_inode_item_trylock,
732 .iop_unlock = xfs_inode_item_unlock, 647 .iop_unlock = xfs_inode_item_unlock,
733 .iop_committed = xfs_inode_item_committed, 648 .iop_committed = xfs_inode_item_committed,
734 .iop_push = xfs_inode_item_push, 649 .iop_push = xfs_inode_item_push,
735 .iop_pushbuf = xfs_inode_item_pushbuf,
736 .iop_committing = xfs_inode_item_committing 650 .iop_committing = xfs_inode_item_committing
737}; 651};
738 652
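To make the new calling convention easier to follow, here is a condensed sketch of the shape an ->iop_push handler now takes, pieced together from the xfs_inode_item_push() hunk above. It keeps the patch's names but abridges the body and leaves out the pin-count checks the real function performs:

STATIC uint
xfs_inode_item_push(
	struct xfs_log_item	*lip,
	struct list_head	*buffer_list)
{
	struct xfs_inode_log_item *iip = INODE_ITEM(lip);
	struct xfs_inode	*ip = iip->ili_inode;
	struct xfs_buf		*bp = NULL;
	uint			rval = XFS_ITEM_SUCCESS;
	int			error;

	if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED))
		return XFS_ITEM_LOCKED;
	if (!xfs_iflock_nowait(ip)) {
		/* Someone else already holds the flush lock. */
		rval = XFS_ITEM_FLUSHING;
		goto out_unlock;
	}

	/* Drop the AIL lock across the potentially blocking flush. */
	spin_unlock(&lip->li_ailp->xa_lock);

	error = xfs_iflush(ip, &bp);
	if (!error) {
		/* Returns false if the buffer was already queued elsewhere. */
		if (!xfs_buf_delwri_queue(bp, buffer_list))
			rval = XFS_ITEM_FLUSHING;
		xfs_buf_relse(bp);
	}

	spin_lock(&lip->li_ailp->xa_lock);
out_unlock:
	xfs_iunlock(ip, XFS_ILOCK_SHARED);
	return rval;
}

The handler never blocks on the flush lock and never issues I/O itself; it only hands the backing buffer to the caller's list and reports how far it got.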
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 8ecad5bad66c..5e864a9c0ccf 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2103,6 +2103,7 @@ xlog_recover_do_dquot_buffer(
2103STATIC int 2103STATIC int
2104xlog_recover_buffer_pass2( 2104xlog_recover_buffer_pass2(
2105 xlog_t *log, 2105 xlog_t *log,
2106 struct list_head *buffer_list,
2106 xlog_recover_item_t *item) 2107 xlog_recover_item_t *item)
2107{ 2108{
2108 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; 2109 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
@@ -2173,7 +2174,7 @@ xlog_recover_buffer_pass2(
2173 } else { 2174 } else {
2174 ASSERT(bp->b_target->bt_mount == mp); 2175 ASSERT(bp->b_target->bt_mount == mp);
2175 bp->b_iodone = xlog_recover_iodone; 2176 bp->b_iodone = xlog_recover_iodone;
2176 xfs_buf_delwri_queue(bp); 2177 xfs_buf_delwri_queue(bp, buffer_list);
2177 } 2178 }
2178 2179
2179 xfs_buf_relse(bp); 2180 xfs_buf_relse(bp);
@@ -2183,6 +2184,7 @@ xlog_recover_buffer_pass2(
2183STATIC int 2184STATIC int
2184xlog_recover_inode_pass2( 2185xlog_recover_inode_pass2(
2185 xlog_t *log, 2186 xlog_t *log,
2187 struct list_head *buffer_list,
2186 xlog_recover_item_t *item) 2188 xlog_recover_item_t *item)
2187{ 2189{
2188 xfs_inode_log_format_t *in_f; 2190 xfs_inode_log_format_t *in_f;
@@ -2436,7 +2438,7 @@ xlog_recover_inode_pass2(
2436write_inode_buffer: 2438write_inode_buffer:
2437 ASSERT(bp->b_target->bt_mount == mp); 2439 ASSERT(bp->b_target->bt_mount == mp);
2438 bp->b_iodone = xlog_recover_iodone; 2440 bp->b_iodone = xlog_recover_iodone;
2439 xfs_buf_delwri_queue(bp); 2441 xfs_buf_delwri_queue(bp, buffer_list);
2440 xfs_buf_relse(bp); 2442 xfs_buf_relse(bp);
2441error: 2443error:
2442 if (need_free) 2444 if (need_free)
@@ -2477,6 +2479,7 @@ xlog_recover_quotaoff_pass1(
2477STATIC int 2479STATIC int
2478xlog_recover_dquot_pass2( 2480xlog_recover_dquot_pass2(
2479 xlog_t *log, 2481 xlog_t *log,
2482 struct list_head *buffer_list,
2480 xlog_recover_item_t *item) 2483 xlog_recover_item_t *item)
2481{ 2484{
2482 xfs_mount_t *mp = log->l_mp; 2485 xfs_mount_t *mp = log->l_mp;
@@ -2558,7 +2561,7 @@ xlog_recover_dquot_pass2(
2558 ASSERT(dq_f->qlf_size == 2); 2561 ASSERT(dq_f->qlf_size == 2);
2559 ASSERT(bp->b_target->bt_mount == mp); 2562 ASSERT(bp->b_target->bt_mount == mp);
2560 bp->b_iodone = xlog_recover_iodone; 2563 bp->b_iodone = xlog_recover_iodone;
2561 xfs_buf_delwri_queue(bp); 2564 xfs_buf_delwri_queue(bp, buffer_list);
2562 xfs_buf_relse(bp); 2565 xfs_buf_relse(bp);
2563 2566
2564 return (0); 2567 return (0);
@@ -2712,21 +2715,22 @@ STATIC int
2712xlog_recover_commit_pass2( 2715xlog_recover_commit_pass2(
2713 struct log *log, 2716 struct log *log,
2714 struct xlog_recover *trans, 2717 struct xlog_recover *trans,
2718 struct list_head *buffer_list,
2715 xlog_recover_item_t *item) 2719 xlog_recover_item_t *item)
2716{ 2720{
2717 trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2); 2721 trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2);
2718 2722
2719 switch (ITEM_TYPE(item)) { 2723 switch (ITEM_TYPE(item)) {
2720 case XFS_LI_BUF: 2724 case XFS_LI_BUF:
2721 return xlog_recover_buffer_pass2(log, item); 2725 return xlog_recover_buffer_pass2(log, buffer_list, item);
2722 case XFS_LI_INODE: 2726 case XFS_LI_INODE:
2723 return xlog_recover_inode_pass2(log, item); 2727 return xlog_recover_inode_pass2(log, buffer_list, item);
2724 case XFS_LI_EFI: 2728 case XFS_LI_EFI:
2725 return xlog_recover_efi_pass2(log, item, trans->r_lsn); 2729 return xlog_recover_efi_pass2(log, item, trans->r_lsn);
2726 case XFS_LI_EFD: 2730 case XFS_LI_EFD:
2727 return xlog_recover_efd_pass2(log, item); 2731 return xlog_recover_efd_pass2(log, item);
2728 case XFS_LI_DQUOT: 2732 case XFS_LI_DQUOT:
2729 return xlog_recover_dquot_pass2(log, item); 2733 return xlog_recover_dquot_pass2(log, buffer_list, item);
2730 case XFS_LI_QUOTAOFF: 2734 case XFS_LI_QUOTAOFF:
2731 /* nothing to do in pass2 */ 2735 /* nothing to do in pass2 */
2732 return 0; 2736 return 0;
@@ -2750,8 +2754,9 @@ xlog_recover_commit_trans(
2750 struct xlog_recover *trans, 2754 struct xlog_recover *trans,
2751 int pass) 2755 int pass)
2752{ 2756{
2753 int error = 0; 2757 int error = 0, error2;
2754 xlog_recover_item_t *item; 2758 xlog_recover_item_t *item;
2759 LIST_HEAD (buffer_list);
2755 2760
2756 hlist_del(&trans->r_list); 2761 hlist_del(&trans->r_list);
2757 2762
@@ -2760,16 +2765,27 @@ xlog_recover_commit_trans(
2760 return error; 2765 return error;
2761 2766
2762 list_for_each_entry(item, &trans->r_itemq, ri_list) { 2767 list_for_each_entry(item, &trans->r_itemq, ri_list) {
2763 if (pass == XLOG_RECOVER_PASS1) 2768 switch (pass) {
2769 case XLOG_RECOVER_PASS1:
2764 error = xlog_recover_commit_pass1(log, trans, item); 2770 error = xlog_recover_commit_pass1(log, trans, item);
2765 else 2771 break;
2766 error = xlog_recover_commit_pass2(log, trans, item); 2772 case XLOG_RECOVER_PASS2:
2773 error = xlog_recover_commit_pass2(log, trans,
2774 &buffer_list, item);
2775 break;
2776 default:
2777 ASSERT(0);
2778 }
2779
2767 if (error) 2780 if (error)
2768 return error; 2781 goto out;
2769 } 2782 }
2770 2783
2771 xlog_recover_free_trans(trans); 2784 xlog_recover_free_trans(trans);
2772 return 0; 2785
2786out:
2787 error2 = xfs_buf_delwri_submit(&buffer_list);
2788 return error ? error : error2;
2773} 2789}
2774 2790
2775STATIC int 2791STATIC int
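Stripped of the pass-1 branch, the loop above is the template every converted caller follows: queue buffers into an on-stack list as you go, submit once at the end, and fold the submission error into the first error seen so queued buffers are never abandoned. Condensed, reusing the local names of the function above:

	LIST_HEAD	(buffer_list);
	int		error = 0, error2;

	list_for_each_entry(item, &trans->r_itemq, ri_list) {
		error = xlog_recover_commit_pass2(log, trans,
						  &buffer_list, item);
		if (error)
			break;		/* still submit what was queued */
	}

	error2 = xfs_buf_delwri_submit(&buffer_list);
	return error ? error : error2;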
@@ -3639,11 +3655,8 @@ xlog_do_recover(
3639 * First replay the images in the log. 3655 * First replay the images in the log.
3640 */ 3656 */
3641 error = xlog_do_log_recovery(log, head_blk, tail_blk); 3657 error = xlog_do_log_recovery(log, head_blk, tail_blk);
3642 if (error) { 3658 if (error)
3643 return error; 3659 return error;
3644 }
3645
3646 xfs_flush_buftarg(log->l_mp->m_ddev_targp, 1);
3647 3660
3648 /* 3661 /*
3649 * If IO errors happened during recovery, bail out. 3662 * If IO errors happened during recovery, bail out.
@@ -3670,7 +3683,6 @@ xlog_do_recover(
3670 bp = xfs_getsb(log->l_mp, 0); 3683 bp = xfs_getsb(log->l_mp, 0);
3671 XFS_BUF_UNDONE(bp); 3684 XFS_BUF_UNDONE(bp);
3672 ASSERT(!(XFS_BUF_ISWRITE(bp))); 3685 ASSERT(!(XFS_BUF_ISWRITE(bp)));
3673 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
3674 XFS_BUF_READ(bp); 3686 XFS_BUF_READ(bp);
3675 XFS_BUF_UNASYNC(bp); 3687 XFS_BUF_UNASYNC(bp);
3676 xfsbdstrat(log->l_mp, bp); 3688 xfsbdstrat(log->l_mp, bp);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 95aecf52475d..755a9bd749d0 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -65,7 +65,8 @@ STATIC int
65xfs_qm_dquot_walk( 65xfs_qm_dquot_walk(
66 struct xfs_mount *mp, 66 struct xfs_mount *mp,
67 int type, 67 int type,
68 int (*execute)(struct xfs_dquot *dqp)) 68 int (*execute)(struct xfs_dquot *dqp, void *data),
69 void *data)
69{ 70{
70 struct xfs_quotainfo *qi = mp->m_quotainfo; 71 struct xfs_quotainfo *qi = mp->m_quotainfo;
71 struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type); 72 struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type);
@@ -97,7 +98,7 @@ restart:
97 98
98 next_index = be32_to_cpu(dqp->q_core.d_id) + 1; 99 next_index = be32_to_cpu(dqp->q_core.d_id) + 1;
99 100
100 error = execute(batch[i]); 101 error = execute(batch[i], data);
101 if (error == EAGAIN) { 102 if (error == EAGAIN) {
102 skipped++; 103 skipped++;
103 continue; 104 continue;
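The only purpose of the new void *data cookie is to let a caller thread its on-stack buffer list (or nothing at all) through to the per-dquot callback. A compressed illustration of the two calling styles used later in this file, with error handling omitted:

	LIST_HEAD	(buffer_list);
	int		error;

	/* Callbacks that never write buffers simply ignore the cookie: */
	xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge, NULL);

	/* Callbacks that flush dquots queue the buffers for the caller: */
	xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_flush_one, &buffer_list);
	error = xfs_buf_delwri_submit(&buffer_list);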
@@ -129,7 +130,8 @@ restart:
129 */ 130 */
130STATIC int 131STATIC int
131xfs_qm_dqpurge( 132xfs_qm_dqpurge(
132 struct xfs_dquot *dqp) 133 struct xfs_dquot *dqp,
134 void *data)
133{ 135{
134 struct xfs_mount *mp = dqp->q_mount; 136 struct xfs_mount *mp = dqp->q_mount;
135 struct xfs_quotainfo *qi = mp->m_quotainfo; 137 struct xfs_quotainfo *qi = mp->m_quotainfo;
@@ -153,21 +155,7 @@ xfs_qm_dqpurge(
153 155
154 dqp->dq_flags |= XFS_DQ_FREEING; 156 dqp->dq_flags |= XFS_DQ_FREEING;
155 157
156 /* 158 xfs_dqflock(dqp);
157 * If we're turning off quotas, we have to make sure that, for
158 * example, we don't delete quota disk blocks while dquots are
159 * in the process of getting written to those disk blocks.
160 * This dquot might well be on AIL, and we can't leave it there
161 * if we're turning off quotas. Basically, we need this flush
162 * lock, and are willing to block on it.
163 */
164 if (!xfs_dqflock_nowait(dqp)) {
165 /*
166 * Block on the flush lock after nudging dquot buffer,
167 * if it is incore.
168 */
169 xfs_dqflock_pushbuf_wait(dqp);
170 }
171 159
172 /* 160 /*
173 * If we are turning this type of quotas off, we don't care 161 * If we are turning this type of quotas off, we don't care
@@ -231,11 +219,11 @@ xfs_qm_dqpurge_all(
231 uint flags) 219 uint flags)
232{ 220{
233 if (flags & XFS_QMOPT_UQUOTA) 221 if (flags & XFS_QMOPT_UQUOTA)
234 xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge); 222 xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge, NULL);
235 if (flags & XFS_QMOPT_GQUOTA) 223 if (flags & XFS_QMOPT_GQUOTA)
236 xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge); 224 xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge, NULL);
237 if (flags & XFS_QMOPT_PQUOTA) 225 if (flags & XFS_QMOPT_PQUOTA)
238 xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_dqpurge); 226 xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_dqpurge, NULL);
239} 227}
240 228
241/* 229/*
@@ -876,15 +864,16 @@ xfs_qm_reset_dqcounts(
876 864
877STATIC int 865STATIC int
878xfs_qm_dqiter_bufs( 866xfs_qm_dqiter_bufs(
879 xfs_mount_t *mp, 867 struct xfs_mount *mp,
880 xfs_dqid_t firstid, 868 xfs_dqid_t firstid,
881 xfs_fsblock_t bno, 869 xfs_fsblock_t bno,
882 xfs_filblks_t blkcnt, 870 xfs_filblks_t blkcnt,
883 uint flags) 871 uint flags,
872 struct list_head *buffer_list)
884{ 873{
885 xfs_buf_t *bp; 874 struct xfs_buf *bp;
886 int error; 875 int error;
887 int type; 876 int type;
888 877
889 ASSERT(blkcnt > 0); 878 ASSERT(blkcnt > 0);
890 type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER : 879 type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER :
@@ -908,7 +897,7 @@ xfs_qm_dqiter_bufs(
908 break; 897 break;
909 898
910 xfs_qm_reset_dqcounts(mp, bp, firstid, type); 899 xfs_qm_reset_dqcounts(mp, bp, firstid, type);
911 xfs_buf_delwri_queue(bp); 900 xfs_buf_delwri_queue(bp, buffer_list);
912 xfs_buf_relse(bp); 901 xfs_buf_relse(bp);
913 /* 902 /*
914 * goto the next block. 903 * goto the next block.
@@ -916,6 +905,7 @@ xfs_qm_dqiter_bufs(
916 bno++; 905 bno++;
917 firstid += mp->m_quotainfo->qi_dqperchunk; 906 firstid += mp->m_quotainfo->qi_dqperchunk;
918 } 907 }
908
919 return error; 909 return error;
920} 910}
921 911
@@ -925,11 +915,12 @@ xfs_qm_dqiter_bufs(
925 */ 915 */
926STATIC int 916STATIC int
927xfs_qm_dqiterate( 917xfs_qm_dqiterate(
928 xfs_mount_t *mp, 918 struct xfs_mount *mp,
929 xfs_inode_t *qip, 919 struct xfs_inode *qip,
930 uint flags) 920 uint flags,
921 struct list_head *buffer_list)
931{ 922{
932 xfs_bmbt_irec_t *map; 923 struct xfs_bmbt_irec *map;
933 int i, nmaps; /* number of map entries */ 924 int i, nmaps; /* number of map entries */
934 int error; /* return value */ 925 int error; /* return value */
935 xfs_fileoff_t lblkno; 926 xfs_fileoff_t lblkno;
@@ -996,21 +987,17 @@ xfs_qm_dqiterate(
996 * Iterate thru all the blks in the extent and 987 * Iterate thru all the blks in the extent and
997 * reset the counters of all the dquots inside them. 988 * reset the counters of all the dquots inside them.
998 */ 989 */
999 if ((error = xfs_qm_dqiter_bufs(mp, 990 error = xfs_qm_dqiter_bufs(mp, firstid,
1000 firstid, 991 map[i].br_startblock,
1001 map[i].br_startblock, 992 map[i].br_blockcount,
1002 map[i].br_blockcount, 993 flags, buffer_list);
1003 flags))) { 994 if (error)
1004 break; 995 goto out;
1005 }
1006 } 996 }
1007
1008 if (error)
1009 break;
1010 } while (nmaps > 0); 997 } while (nmaps > 0);
1011 998
999out:
1012 kmem_free(map); 1000 kmem_free(map);
1013
1014 return error; 1001 return error;
1015} 1002}
1016 1003
@@ -1203,8 +1190,10 @@ error0:
1203 1190
1204STATIC int 1191STATIC int
1205xfs_qm_flush_one( 1192xfs_qm_flush_one(
1206 struct xfs_dquot *dqp) 1193 struct xfs_dquot *dqp,
1194 void *data)
1207{ 1195{
1196 struct list_head *buffer_list = data;
1208 struct xfs_buf *bp = NULL; 1197 struct xfs_buf *bp = NULL;
1209 int error = 0; 1198 int error = 0;
1210 1199
@@ -1214,14 +1203,12 @@ xfs_qm_flush_one(
1214 if (!XFS_DQ_IS_DIRTY(dqp)) 1203 if (!XFS_DQ_IS_DIRTY(dqp))
1215 goto out_unlock; 1204 goto out_unlock;
1216 1205
1217 if (!xfs_dqflock_nowait(dqp)) 1206 xfs_dqflock(dqp);
1218 xfs_dqflock_pushbuf_wait(dqp);
1219
1220 error = xfs_qm_dqflush(dqp, &bp); 1207 error = xfs_qm_dqflush(dqp, &bp);
1221 if (error) 1208 if (error)
1222 goto out_unlock; 1209 goto out_unlock;
1223 1210
1224 xfs_buf_delwri_queue(bp); 1211 xfs_buf_delwri_queue(bp, buffer_list);
1225 xfs_buf_relse(bp); 1212 xfs_buf_relse(bp);
1226out_unlock: 1213out_unlock:
1227 xfs_dqunlock(dqp); 1214 xfs_dqunlock(dqp);
@@ -1241,6 +1228,7 @@ xfs_qm_quotacheck(
1241 size_t structsz; 1228 size_t structsz;
1242 xfs_inode_t *uip, *gip; 1229 xfs_inode_t *uip, *gip;
1243 uint flags; 1230 uint flags;
1231 LIST_HEAD (buffer_list);
1244 1232
1245 count = INT_MAX; 1233 count = INT_MAX;
1246 structsz = 1; 1234 structsz = 1;
@@ -1259,7 +1247,8 @@ xfs_qm_quotacheck(
1259 */ 1247 */
1260 uip = mp->m_quotainfo->qi_uquotaip; 1248 uip = mp->m_quotainfo->qi_uquotaip;
1261 if (uip) { 1249 if (uip) {
1262 error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA); 1250 error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA,
1251 &buffer_list);
1263 if (error) 1252 if (error)
1264 goto error_return; 1253 goto error_return;
1265 flags |= XFS_UQUOTA_CHKD; 1254 flags |= XFS_UQUOTA_CHKD;
@@ -1268,7 +1257,8 @@ xfs_qm_quotacheck(
1268 gip = mp->m_quotainfo->qi_gquotaip; 1257 gip = mp->m_quotainfo->qi_gquotaip;
1269 if (gip) { 1258 if (gip) {
1270 error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ? 1259 error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ?
1271 XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA); 1260 XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA,
1261 &buffer_list);
1272 if (error) 1262 if (error)
1273 goto error_return; 1263 goto error_return;
1274 flags |= XFS_OQUOTA_CHKD; 1264 flags |= XFS_OQUOTA_CHKD;
@@ -1291,19 +1281,27 @@ xfs_qm_quotacheck(
1291 * We've made all the changes that we need to make incore. Flush them 1281 * We've made all the changes that we need to make incore. Flush them
1292 * down to disk buffers if everything was updated successfully. 1282 * down to disk buffers if everything was updated successfully.
1293 */ 1283 */
1294 if (XFS_IS_UQUOTA_ON(mp)) 1284 if (XFS_IS_UQUOTA_ON(mp)) {
1295 error = xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_flush_one); 1285 error = xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_flush_one,
1286 &buffer_list);
1287 }
1296 if (XFS_IS_GQUOTA_ON(mp)) { 1288 if (XFS_IS_GQUOTA_ON(mp)) {
1297 error2 = xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_flush_one); 1289 error2 = xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_flush_one,
1290 &buffer_list);
1298 if (!error) 1291 if (!error)
1299 error = error2; 1292 error = error2;
1300 } 1293 }
1301 if (XFS_IS_PQUOTA_ON(mp)) { 1294 if (XFS_IS_PQUOTA_ON(mp)) {
1302 error2 = xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_flush_one); 1295 error2 = xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_flush_one,
1296 &buffer_list);
1303 if (!error) 1297 if (!error)
1304 error = error2; 1298 error = error2;
1305 } 1299 }
1306 1300
1301 error2 = xfs_buf_delwri_submit(&buffer_list);
1302 if (!error)
1303 error = error2;
1304
1307 /* 1305 /*
1308 * We can get this error if we couldn't do a dquot allocation inside 1306 * We can get this error if we couldn't do a dquot allocation inside
1309 * xfs_qm_dqusage_adjust (via bulkstat). We don't care about the 1307 * xfs_qm_dqusage_adjust (via bulkstat). We don't care about the
@@ -1317,15 +1315,6 @@ xfs_qm_quotacheck(
1317 } 1315 }
1318 1316
1319 /* 1317 /*
1320 * We didn't log anything, because if we crashed, we'll have to
1321 * start the quotacheck from scratch anyway. However, we must make
1322 * sure that our dquot changes are secure before we put the
1323 * quotacheck'd stamp on the superblock. So, here we do a synchronous
1324 * flush.
1325 */
1326 xfs_flush_buftarg(mp->m_ddev_targp, 1);
1327
1328 /*
1329 * If one type of quotas is off, then it will lose its 1318 * If one type of quotas is off, then it will lose its
1330 * quotachecked status, since we won't be doing accounting for 1319 * quotachecked status, since we won't be doing accounting for
1331 * that type anymore. 1320 * that type anymore.
@@ -1334,6 +1323,13 @@ xfs_qm_quotacheck(
1334 mp->m_qflags |= flags; 1323 mp->m_qflags |= flags;
1335 1324
1336 error_return: 1325 error_return:
1326 while (!list_empty(&buffer_list)) {
1327 struct xfs_buf *bp =
1328 list_first_entry(&buffer_list, struct xfs_buf, b_list);
1329 list_del_init(&bp->b_list);
1330 xfs_buf_relse(bp);
1331 }
1332
1337 if (error) { 1333 if (error) {
1338 xfs_warn(mp, 1334 xfs_warn(mp,
1339 "Quotacheck: Unsuccessful (Error %d): Disabling quotas.", 1335 "Quotacheck: Unsuccessful (Error %d): Disabling quotas.",
@@ -1450,6 +1446,7 @@ xfs_qm_dqfree_one(
1450STATIC void 1446STATIC void
1451xfs_qm_dqreclaim_one( 1447xfs_qm_dqreclaim_one(
1452 struct xfs_dquot *dqp, 1448 struct xfs_dquot *dqp,
1449 struct list_head *buffer_list,
1453 struct list_head *dispose_list) 1450 struct list_head *dispose_list)
1454{ 1451{
1455 struct xfs_mount *mp = dqp->q_mount; 1452 struct xfs_mount *mp = dqp->q_mount;
@@ -1482,21 +1479,11 @@ xfs_qm_dqreclaim_one(
1482 if (!xfs_dqflock_nowait(dqp)) 1479 if (!xfs_dqflock_nowait(dqp))
1483 goto out_busy; 1480 goto out_busy;
1484 1481
1485 /*
1486 * We have the flush lock so we know that this is not in the
1487 * process of being flushed. So, if this is dirty, flush it
1488 * DELWRI so that we don't get a freelist infested with
1489 * dirty dquots.
1490 */
1491 if (XFS_DQ_IS_DIRTY(dqp)) { 1482 if (XFS_DQ_IS_DIRTY(dqp)) {
1492 struct xfs_buf *bp = NULL; 1483 struct xfs_buf *bp = NULL;
1493 1484
1494 trace_xfs_dqreclaim_dirty(dqp); 1485 trace_xfs_dqreclaim_dirty(dqp);
1495 1486
1496 /*
1497 * We flush it delayed write, so don't bother releasing the
1498 * freelist lock.
1499 */
1500 error = xfs_qm_dqflush(dqp, &bp); 1487 error = xfs_qm_dqflush(dqp, &bp);
1501 if (error) { 1488 if (error) {
1502 xfs_warn(mp, "%s: dquot %p flush failed", 1489 xfs_warn(mp, "%s: dquot %p flush failed",
@@ -1504,7 +1491,7 @@ xfs_qm_dqreclaim_one(
1504 goto out_busy; 1491 goto out_busy;
1505 } 1492 }
1506 1493
1507 xfs_buf_delwri_queue(bp); 1494 xfs_buf_delwri_queue(bp, buffer_list);
1508 xfs_buf_relse(bp); 1495 xfs_buf_relse(bp);
1509 /* 1496 /*
1510 * Give the dquot another try on the freelist, as the 1497 * Give the dquot another try on the freelist, as the
@@ -1549,8 +1536,10 @@ xfs_qm_shake(
1549 struct xfs_quotainfo *qi = 1536 struct xfs_quotainfo *qi =
1550 container_of(shrink, struct xfs_quotainfo, qi_shrinker); 1537 container_of(shrink, struct xfs_quotainfo, qi_shrinker);
1551 int nr_to_scan = sc->nr_to_scan; 1538 int nr_to_scan = sc->nr_to_scan;
1539 LIST_HEAD (buffer_list);
1552 LIST_HEAD (dispose_list); 1540 LIST_HEAD (dispose_list);
1553 struct xfs_dquot *dqp; 1541 struct xfs_dquot *dqp;
1542 int error;
1554 1543
1555 if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT)) 1544 if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
1556 return 0; 1545 return 0;
@@ -1563,15 +1552,20 @@ xfs_qm_shake(
1563 break; 1552 break;
1564 dqp = list_first_entry(&qi->qi_lru_list, struct xfs_dquot, 1553 dqp = list_first_entry(&qi->qi_lru_list, struct xfs_dquot,
1565 q_lru); 1554 q_lru);
1566 xfs_qm_dqreclaim_one(dqp, &dispose_list); 1555 xfs_qm_dqreclaim_one(dqp, &buffer_list, &dispose_list);
1567 } 1556 }
1568 mutex_unlock(&qi->qi_lru_lock); 1557 mutex_unlock(&qi->qi_lru_lock);
1569 1558
1559 error = xfs_buf_delwri_submit(&buffer_list);
1560 if (error)
1561 xfs_warn(NULL, "%s: dquot reclaim failed", __func__);
1562
1570 while (!list_empty(&dispose_list)) { 1563 while (!list_empty(&dispose_list)) {
1571 dqp = list_first_entry(&dispose_list, struct xfs_dquot, q_lru); 1564 dqp = list_first_entry(&dispose_list, struct xfs_dquot, q_lru);
1572 list_del_init(&dqp->q_lru); 1565 list_del_init(&dqp->q_lru);
1573 xfs_qm_dqfree_one(dqp); 1566 xfs_qm_dqfree_one(dqp);
1574 } 1567 }
1568
1575out: 1569out:
1576 return (qi->qi_lru_count / 100) * sysctl_vfs_cache_pressure; 1570 return (qi->qi_lru_count / 100) * sysctl_vfs_cache_pressure;
1577} 1571}
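The shrinker conversion shows why the local list matters for reclaim: dirty dquot buffers are queued while the LRU lock is held, but the blocking writeback and the actual freeing only happen after it is dropped. A condensed sketch of xfs_qm_shake() as changed above, with the early-exit checks trimmed:

	LIST_HEAD	(buffer_list);
	LIST_HEAD	(dispose_list);
	struct xfs_dquot *dqp;
	int		error;

	mutex_lock(&qi->qi_lru_lock);
	while (!list_empty(&qi->qi_lru_list) && nr_to_scan-- > 0) {
		dqp = list_first_entry(&qi->qi_lru_list,
				       struct xfs_dquot, q_lru);
		xfs_qm_dqreclaim_one(dqp, &buffer_list, &dispose_list);
	}
	mutex_unlock(&qi->qi_lru_lock);

	/* Blocking buffer writeback happens outside the LRU lock. */
	error = xfs_buf_delwri_submit(&buffer_list);
	if (error)
		xfs_warn(NULL, "%s: dquot reclaim failed", __func__);

	while (!list_empty(&dispose_list)) {
		dqp = list_first_entry(&dispose_list, struct xfs_dquot, q_lru);
		list_del_init(&dqp->q_lru);
		xfs_qm_dqfree_one(dqp);
	}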
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 28d1f508b578..fa07b7731cf2 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -981,15 +981,7 @@ xfs_fs_put_super(
981{ 981{
982 struct xfs_mount *mp = XFS_M(sb); 982 struct xfs_mount *mp = XFS_M(sb);
983 983
984 /*
985 * Blow away any referenced inode in the filestreams cache.
986 * This can and will cause log traffic as inodes go inactive
987 * here.
988 */
989 xfs_filestream_unmount(mp); 984 xfs_filestream_unmount(mp);
990
991 xfs_flush_buftarg(mp->m_ddev_targp, 1);
992
993 xfs_unmountfs(mp); 985 xfs_unmountfs(mp);
994 xfs_syncd_stop(mp); 986 xfs_syncd_stop(mp);
995 xfs_freesb(mp); 987 xfs_freesb(mp);
@@ -1404,15 +1396,7 @@ out_destroy_workqueues:
1404 return -error; 1396 return -error;
1405 1397
1406 out_unmount: 1398 out_unmount:
1407 /*
1408 * Blow away any referenced inode in the filestreams cache.
1409 * This can and will cause log traffic as inodes go inactive
1410 * here.
1411 */
1412 xfs_filestream_unmount(mp); 1399 xfs_filestream_unmount(mp);
1413
1414 xfs_flush_buftarg(mp->m_ddev_targp, 1);
1415
1416 xfs_unmountfs(mp); 1400 xfs_unmountfs(mp);
1417 xfs_syncd_stop(mp); 1401 xfs_syncd_stop(mp);
1418 goto out_free_sb; 1402 goto out_free_sb;
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index 468c3c0a4f9f..cdb644fd0bd1 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -313,17 +313,10 @@ xfs_quiesce_data(
313 /* write superblock and hoover up shutdown errors */ 313 /* write superblock and hoover up shutdown errors */
314 error = xfs_sync_fsdata(mp); 314 error = xfs_sync_fsdata(mp);
315 315
316 /* make sure all delwri buffers are written out */
317 xfs_flush_buftarg(mp->m_ddev_targp, 1);
318
319 /* mark the log as covered if needed */ 316 /* mark the log as covered if needed */
320 if (xfs_log_need_covered(mp)) 317 if (xfs_log_need_covered(mp))
321 error2 = xfs_fs_log_dummy(mp); 318 error2 = xfs_fs_log_dummy(mp);
322 319
323 /* flush data-only devices */
324 if (mp->m_rtdev_targp)
325 xfs_flush_buftarg(mp->m_rtdev_targp, 1);
326
327 return error ? error : error2; 320 return error ? error : error2;
328} 321}
329 322
@@ -684,17 +677,6 @@ restart:
684 if (!xfs_iflock_nowait(ip)) { 677 if (!xfs_iflock_nowait(ip)) {
685 if (!(sync_mode & SYNC_WAIT)) 678 if (!(sync_mode & SYNC_WAIT))
686 goto out; 679 goto out;
687
688 /*
689 * If we only have a single dirty inode in a cluster there is
690 * a fair chance that the AIL push may have pushed it into
691 * the buffer, but xfsbufd won't touch it until 30 seconds
692 * from now, and thus we will lock up here.
693 *
694 * Promote the inode buffer to the front of the delwri list
695 * and wake up xfsbufd now.
696 */
697 xfs_promote_inode(ip);
698 xfs_iflock(ip); 680 xfs_iflock(ip);
699 } 681 }
700 682
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 06838c42b2a0..2e41756e263a 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -328,7 +328,7 @@ DEFINE_BUF_EVENT(xfs_buf_unlock);
328DEFINE_BUF_EVENT(xfs_buf_iowait); 328DEFINE_BUF_EVENT(xfs_buf_iowait);
329DEFINE_BUF_EVENT(xfs_buf_iowait_done); 329DEFINE_BUF_EVENT(xfs_buf_iowait_done);
330DEFINE_BUF_EVENT(xfs_buf_delwri_queue); 330DEFINE_BUF_EVENT(xfs_buf_delwri_queue);
331DEFINE_BUF_EVENT(xfs_buf_delwri_dequeue); 331DEFINE_BUF_EVENT(xfs_buf_delwri_queued);
332DEFINE_BUF_EVENT(xfs_buf_delwri_split); 332DEFINE_BUF_EVENT(xfs_buf_delwri_split);
333DEFINE_BUF_EVENT(xfs_buf_get_uncached); 333DEFINE_BUF_EVENT(xfs_buf_get_uncached);
334DEFINE_BUF_EVENT(xfs_bdstrat_shut); 334DEFINE_BUF_EVENT(xfs_bdstrat_shut);
@@ -486,12 +486,10 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale);
486DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin); 486DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin);
487DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin); 487DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin);
488DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale); 488DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale);
489DEFINE_BUF_ITEM_EVENT(xfs_buf_item_trylock);
490DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock); 489DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock);
491DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale); 490DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale);
492DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed); 491DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed);
493DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push); 492DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push);
494DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pushbuf);
495DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf); 493DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf);
496DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf_recur); 494DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf_recur);
497DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb); 495DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb);
@@ -881,10 +879,9 @@ DEFINE_EVENT(xfs_log_item_class, name, \
881 TP_PROTO(struct xfs_log_item *lip), \ 879 TP_PROTO(struct xfs_log_item *lip), \
882 TP_ARGS(lip)) 880 TP_ARGS(lip))
883DEFINE_LOG_ITEM_EVENT(xfs_ail_push); 881DEFINE_LOG_ITEM_EVENT(xfs_ail_push);
884DEFINE_LOG_ITEM_EVENT(xfs_ail_pushbuf);
885DEFINE_LOG_ITEM_EVENT(xfs_ail_pushbuf_pinned);
886DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned); 882DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned);
887DEFINE_LOG_ITEM_EVENT(xfs_ail_locked); 883DEFINE_LOG_ITEM_EVENT(xfs_ail_locked);
884DEFINE_LOG_ITEM_EVENT(xfs_ail_flushing);
888 885
889 886
890DECLARE_EVENT_CLASS(xfs_file_class, 887DECLARE_EVENT_CLASS(xfs_file_class,
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index f6118703f20d..7ab99e1898c8 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -345,11 +345,9 @@ struct xfs_item_ops {
345 void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *); 345 void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
346 void (*iop_pin)(xfs_log_item_t *); 346 void (*iop_pin)(xfs_log_item_t *);
347 void (*iop_unpin)(xfs_log_item_t *, int remove); 347 void (*iop_unpin)(xfs_log_item_t *, int remove);
348 uint (*iop_trylock)(xfs_log_item_t *); 348 uint (*iop_push)(struct xfs_log_item *, struct list_head *);
349 void (*iop_unlock)(xfs_log_item_t *); 349 void (*iop_unlock)(xfs_log_item_t *);
350 xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t); 350 xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
351 void (*iop_push)(xfs_log_item_t *);
352 bool (*iop_pushbuf)(xfs_log_item_t *);
353 void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); 351 void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
354}; 352};
355 353
@@ -357,20 +355,18 @@ struct xfs_item_ops {
357#define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp) 355#define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp)
358#define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip) 356#define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip)
359#define IOP_UNPIN(ip, remove) (*(ip)->li_ops->iop_unpin)(ip, remove) 357#define IOP_UNPIN(ip, remove) (*(ip)->li_ops->iop_unpin)(ip, remove)
360#define IOP_TRYLOCK(ip) (*(ip)->li_ops->iop_trylock)(ip) 358#define IOP_PUSH(ip, list) (*(ip)->li_ops->iop_push)(ip, list)
361#define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip) 359#define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip)
362#define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn) 360#define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn)
363#define IOP_PUSH(ip) (*(ip)->li_ops->iop_push)(ip)
364#define IOP_PUSHBUF(ip) (*(ip)->li_ops->iop_pushbuf)(ip)
365#define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn) 361#define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn)
366 362
367/* 363/*
368 * Return values for the IOP_TRYLOCK() routines. 364 * Return values for the IOP_PUSH() routines.
369 */ 365 */
370#define XFS_ITEM_SUCCESS 0 366#define XFS_ITEM_SUCCESS 0
371#define XFS_ITEM_PINNED 1 367#define XFS_ITEM_PINNED 1
372#define XFS_ITEM_LOCKED 2 368#define XFS_ITEM_LOCKED 2
373#define XFS_ITEM_PUSHBUF 3 369#define XFS_ITEM_FLUSHING 3
374 370
375/* 371/*
376 * This is the type of function which can be given to xfs_trans_callback() 372 * This is the type of function which can be given to xfs_trans_callback()
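With XFS_ITEM_PUSHBUF gone, the remaining IOP_PUSH() return codes carry slightly different meanings. The comments below are inferred from how xfsaild_push() (next file) reacts to each value; they are annotations, not text from the patch:

#define XFS_ITEM_SUCCESS	0	/* flushed; backing buffer queued on the caller's list */
#define XFS_ITEM_PINNED		1	/* pinned in the log; counted as stuck, may trigger a log force */
#define XFS_ITEM_LOCKED		2	/* item busy elsewhere; counted as stuck */
#define XFS_ITEM_FLUSHING	3	/* already under writeback; re-push again soon */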
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 0425ca16738b..49d9cde33bb3 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -364,29 +364,31 @@ xfsaild_push(
364 xfs_log_item_t *lip; 364 xfs_log_item_t *lip;
365 xfs_lsn_t lsn; 365 xfs_lsn_t lsn;
366 xfs_lsn_t target; 366 xfs_lsn_t target;
367 long tout = 10; 367 long tout;
368 int stuck = 0; 368 int stuck = 0;
369 int flushing = 0;
369 int count = 0; 370 int count = 0;
370 int push_xfsbufd = 0;
371 371
372 /* 372 /*
373 * If last time we ran we encountered pinned items, force the log first 373 * If we encountered pinned items or did not finish writing out all
374 * and wait for it before pushing again. 374 * buffers the last time we ran, force the log first and wait for it
375 * before pushing again.
375 */ 376 */
376 spin_lock(&ailp->xa_lock); 377 if (ailp->xa_log_flush && ailp->xa_last_pushed_lsn == 0 &&
377 if (ailp->xa_last_pushed_lsn == 0 && ailp->xa_log_flush && 378 (!list_empty_careful(&ailp->xa_buf_list) ||
378 !list_empty(&ailp->xa_ail)) { 379 xfs_ail_min_lsn(ailp))) {
379 ailp->xa_log_flush = 0; 380 ailp->xa_log_flush = 0;
380 spin_unlock(&ailp->xa_lock); 381
381 XFS_STATS_INC(xs_push_ail_flush); 382 XFS_STATS_INC(xs_push_ail_flush);
382 xfs_log_force(mp, XFS_LOG_SYNC); 383 xfs_log_force(mp, XFS_LOG_SYNC);
383 spin_lock(&ailp->xa_lock);
384 } 384 }
385 385
386 spin_lock(&ailp->xa_lock);
386 lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->xa_last_pushed_lsn); 387 lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->xa_last_pushed_lsn);
387 if (!lip) { 388 if (!lip) {
388 /* 389 /*
389 * AIL is empty or our push has reached the end. 390 * If the AIL is empty or our push has reached the end we are
391 * done now.
390 */ 392 */
391 xfs_trans_ail_cursor_done(ailp, &cur); 393 xfs_trans_ail_cursor_done(ailp, &cur);
392 spin_unlock(&ailp->xa_lock); 394 spin_unlock(&ailp->xa_lock);
@@ -395,55 +397,42 @@ xfsaild_push(
395 397
396 XFS_STATS_INC(xs_push_ail); 398 XFS_STATS_INC(xs_push_ail);
397 399
398 /*
399 * While the item we are looking at is below the given threshold
400 * try to flush it out. We'd like not to stop until we've at least
401 * tried to push on everything in the AIL with an LSN less than
402 * the given threshold.
403 *
404 * However, we will stop after a certain number of pushes and wait
405 * for a reduced timeout to fire before pushing further. This
406 * prevents us from spinning when we can't do anything or there is
407 * lots of contention on the AIL lists.
408 */
409 lsn = lip->li_lsn; 400 lsn = lip->li_lsn;
410 target = ailp->xa_target; 401 target = ailp->xa_target;
411 while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) { 402 while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) {
412 int lock_result; 403 int lock_result;
404
413 /* 405 /*
414 * If we can lock the item without sleeping, unlock the AIL 406 * Note that IOP_PUSH may unlock and reacquire the AIL lock. We
415 * lock and flush the item. Then re-grab the AIL lock so we 407 * rely on the AIL cursor implementation to be able to deal with
416 * can look for the next item on the AIL. List changes are 408 * the dropped lock.
417 * handled by the AIL lookup functions internally
418 *
419 * If we can't lock the item, either its holder will flush it
420 * or it is already being flushed or it is being relogged. In
421 * any of these case it is being taken care of and we can just
422 * skip to the next item in the list.
423 */ 409 */
424 lock_result = IOP_TRYLOCK(lip); 410 lock_result = IOP_PUSH(lip, &ailp->xa_buf_list);
425 spin_unlock(&ailp->xa_lock);
426 switch (lock_result) { 411 switch (lock_result) {
427 case XFS_ITEM_SUCCESS: 412 case XFS_ITEM_SUCCESS:
428 XFS_STATS_INC(xs_push_ail_success); 413 XFS_STATS_INC(xs_push_ail_success);
429 trace_xfs_ail_push(lip); 414 trace_xfs_ail_push(lip);
430 415
431 IOP_PUSH(lip);
432 ailp->xa_last_pushed_lsn = lsn; 416 ailp->xa_last_pushed_lsn = lsn;
433 break; 417 break;
434 418
435 case XFS_ITEM_PUSHBUF: 419 case XFS_ITEM_FLUSHING:
436 XFS_STATS_INC(xs_push_ail_pushbuf); 420 /*
437 trace_xfs_ail_pushbuf(lip); 421 * The item or its backing buffer is already being
438 422 * flushed. The typical reason for that is that an
439 if (!IOP_PUSHBUF(lip)) { 423 * inode buffer is locked because we already pushed the
440 trace_xfs_ail_pushbuf_pinned(lip); 424 * updates to it as part of inode clustering.
441 stuck++; 425 *
442 ailp->xa_log_flush++; 426 * We do not want to stop flushing just because lots
443 } else { 427 * of items are already being flushed, but we need to
444 ailp->xa_last_pushed_lsn = lsn; 428 * re-try the flushing relatively soon if most of the
445 } 429 * AIL is being flushed.
446 push_xfsbufd = 1; 430 */
431 XFS_STATS_INC(xs_push_ail_flushing);
432 trace_xfs_ail_flushing(lip);
433
434 flushing++;
435 ailp->xa_last_pushed_lsn = lsn;
447 break; 436 break;
448 437
449 case XFS_ITEM_PINNED: 438 case XFS_ITEM_PINNED:
@@ -453,23 +442,22 @@ xfsaild_push(
453 stuck++; 442 stuck++;
454 ailp->xa_log_flush++; 443 ailp->xa_log_flush++;
455 break; 444 break;
456
457 case XFS_ITEM_LOCKED: 445 case XFS_ITEM_LOCKED:
458 XFS_STATS_INC(xs_push_ail_locked); 446 XFS_STATS_INC(xs_push_ail_locked);
459 trace_xfs_ail_locked(lip); 447 trace_xfs_ail_locked(lip);
448
460 stuck++; 449 stuck++;
461 break; 450 break;
462
463 default: 451 default:
464 ASSERT(0); 452 ASSERT(0);
465 break; 453 break;
466 } 454 }
467 455
468 spin_lock(&ailp->xa_lock);
469 count++; 456 count++;
470 457
471 /* 458 /*
472 * Are there too many items we can't do anything with? 459 * Are there too many items we can't do anything with?
460 *
473 * If we are skipping too many items because we can't flush 461 * If we are skipping too many items because we can't flush
474 * them or they are already being flushed, we back off and 462 * them or they are already being flushed, we back off and
475 * give them time to complete whatever operation is being 463 * give them time to complete whatever operation is being
@@ -491,42 +479,36 @@ xfsaild_push(
491 xfs_trans_ail_cursor_done(ailp, &cur); 479 xfs_trans_ail_cursor_done(ailp, &cur);
492 spin_unlock(&ailp->xa_lock); 480 spin_unlock(&ailp->xa_lock);
493 481
494 if (push_xfsbufd) { 482 if (xfs_buf_delwri_submit_nowait(&ailp->xa_buf_list))
495 /* we've got delayed write buffers to flush */ 483 ailp->xa_log_flush++;
496 wake_up_process(mp->m_ddev_targp->bt_task);
497 }
498 484
499 /* assume we have more work to do in a short while */ 485 if (!count || XFS_LSN_CMP(lsn, target) >= 0) {
500out_done: 486out_done:
501 if (!count) {
502 /* We're past our target or empty, so idle */
503 ailp->xa_last_pushed_lsn = 0;
504 ailp->xa_log_flush = 0;
505
506 tout = 50;
507 } else if (XFS_LSN_CMP(lsn, target) >= 0) {
508 /* 487 /*
509 * We reached the target so wait a bit longer for I/O to 488 * We reached the target or the AIL is empty, so wait a bit
510 * complete and remove pushed items from the AIL before we 489 * longer for I/O to complete and remove pushed items from the
511 * start the next scan from the start of the AIL. 490 * AIL before we start the next scan from the start of the AIL.
512 */ 491 */
513 tout = 50; 492 tout = 50;
514 ailp->xa_last_pushed_lsn = 0; 493 ailp->xa_last_pushed_lsn = 0;
515 } else if ((stuck * 100) / count > 90) { 494 } else if (((stuck + flushing) * 100) / count > 90) {
516 /* 495 /*
517 * Either there is a lot of contention on the AIL or we 496 * Either there is a lot of contention on the AIL or we are
518 * are stuck due to operations in progress. "Stuck" in this 497 * stuck due to operations in progress. "Stuck" in this case
519 * case is defined as >90% of the items we tried to push 498 * is defined as >90% of the items we tried to push were stuck.
520 * were stuck.
521 * 499 *
522 * Backoff a bit more to allow some I/O to complete before 500 * Backoff a bit more to allow some I/O to complete before
523 * restarting from the start of the AIL. This prevents us 501 * restarting from the start of the AIL. This prevents us from
524 * from spinning on the same items, and if they are pinned will 502 * spinning on the same items, and if they are pinned will allow
525 * allow the restart to issue a log force to unpin the stuck 503 * the restart to issue a log force to unpin the stuck items.
526 * items.
527 */ 504 */
528 tout = 20; 505 tout = 20;
529 ailp->xa_last_pushed_lsn = 0; 506 ailp->xa_last_pushed_lsn = 0;
507 } else {
508 /*
509 * Assume we have more work to do in a short while.
510 */
511 tout = 10;
530 } 512 }
531 513
532 return tout; 514 return tout;
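The new sleep-time policy is easier to read pulled out of the loop. The helper below is hypothetical (the patch keeps this logic inline at the end of xfsaild_push()), but it expresses the same three cases:

static long
xfsaild_push_timeout(
	struct xfs_ail	*ailp,
	xfs_lsn_t	lsn,
	xfs_lsn_t	target,
	int		count,
	int		stuck,
	int		flushing)
{
	if (!count || XFS_LSN_CMP(lsn, target) >= 0) {
		/* AIL empty or target reached: restart from scratch later. */
		ailp->xa_last_pushed_lsn = 0;
		return 50;
	}
	if (((stuck + flushing) * 100) / count > 90) {
		/* Mostly stuck or in flight: back off before restarting. */
		ailp->xa_last_pushed_lsn = 0;
		return 20;
	}
	/* Otherwise assume more work is due shortly. */
	return 10;
}

Counting flushing items together with stuck ones is what makes the AIL back off, rather than spin, while buffers it has already queued are still in flight.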
@@ -539,6 +521,8 @@ xfsaild(
539 struct xfs_ail *ailp = data; 521 struct xfs_ail *ailp = data;
540 long tout = 0; /* milliseconds */ 522 long tout = 0; /* milliseconds */
541 523
524 current->flags |= PF_MEMALLOC;
525
542 while (!kthread_should_stop()) { 526 while (!kthread_should_stop()) {
543 if (tout && tout <= 20) 527 if (tout && tout <= 20)
544 __set_current_state(TASK_KILLABLE); 528 __set_current_state(TASK_KILLABLE);
@@ -794,6 +778,7 @@ xfs_trans_ail_init(
794 INIT_LIST_HEAD(&ailp->xa_ail); 778 INIT_LIST_HEAD(&ailp->xa_ail);
795 INIT_LIST_HEAD(&ailp->xa_cursors); 779 INIT_LIST_HEAD(&ailp->xa_cursors);
796 spin_lock_init(&ailp->xa_lock); 780 spin_lock_init(&ailp->xa_lock);
781 INIT_LIST_HEAD(&ailp->xa_buf_list);
797 init_waitqueue_head(&ailp->xa_empty); 782 init_waitqueue_head(&ailp->xa_empty);
798 783
799 ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s", 784 ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s",
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 296a7995a007..9132d162c4b8 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -165,14 +165,6 @@ xfs_trans_get_buf(xfs_trans_t *tp,
165 XFS_BUF_DONE(bp); 165 XFS_BUF_DONE(bp);
166 } 166 }
167 167
168 /*
169 * If the buffer is stale then it was binval'ed
170 * since last read. This doesn't matter since the
171 * caller isn't allowed to use the data anyway.
172 */
173 else if (XFS_BUF_ISSTALE(bp))
174 ASSERT(!XFS_BUF_ISDELAYWRITE(bp));
175
176 ASSERT(bp->b_transp == tp); 168 ASSERT(bp->b_transp == tp);
177 bip = bp->b_fspriv; 169 bip = bp->b_fspriv;
178 ASSERT(bip != NULL); 170 ASSERT(bip != NULL);
@@ -418,19 +410,6 @@ xfs_trans_read_buf(
418 return 0; 410 return 0;
419 411
420shutdown_abort: 412shutdown_abort:
421 /*
422 * the theory here is that buffer is good but we're
423 * bailing out because the filesystem is being forcibly
424 * shut down. So we should leave the b_flags alone since
425 * the buffer's not staled and just get out.
426 */
427#if defined(DEBUG)
428 if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp))
429 xfs_notice(mp, "about to pop assert, bp == 0x%p", bp);
430#endif
431 ASSERT((bp->b_flags & (XBF_STALE|XBF_DELWRI)) !=
432 (XBF_STALE|XBF_DELWRI));
433
434 trace_xfs_trans_read_buf_shut(bp, _RET_IP_); 413 trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
435 xfs_buf_relse(bp); 414 xfs_buf_relse(bp);
436 *bpp = NULL; 415 *bpp = NULL;
@@ -649,22 +628,33 @@ xfs_trans_log_buf(xfs_trans_t *tp,
649 628
650 629
651/* 630/*
652 * This called to invalidate a buffer that is being used within 631 * Invalidate a buffer that is being used within a transaction.
653 * a transaction. Typically this is because the blocks in the 632 *
654 * buffer are being freed, so we need to prevent it from being 633 * Typically this is because the blocks in the buffer are being freed, so we
655 * written out when we're done. Allowing it to be written again 634 * need to prevent it from being written out when we're done. Allowing it
656 * might overwrite data in the free blocks if they are reallocated 635 * to be written again might overwrite data in the free blocks if they are
657 * to a file. 636 * reallocated to a file.
658 * 637 *
659 * We prevent the buffer from being written out by clearing the 638 * We prevent the buffer from being written out by marking it stale. We can't
660 * B_DELWRI flag. We can't always 639 * get rid of the buf log item at this point because the buffer may still be
661 * get rid of the buf log item at this point, though, because 640 * pinned by another transaction. If that is the case, then we'll wait until
662 * the buffer may still be pinned by another transaction. If that 641 * the buffer is committed to disk for the last time (we can tell by the ref
663 * is the case, then we'll wait until the buffer is committed to 642 * count) and free it in xfs_buf_item_unpin(). Until that happens we will
664 * disk for the last time (we can tell by the ref count) and 643 * keep the buffer locked so that the buffer and buf log item are not reused.
665 * free it in xfs_buf_item_unpin(). Until it is cleaned up we 644 *
666 * will keep the buffer locked so that the buffer and buf log item 645 * We also set the XFS_BLF_CANCEL flag in the buf log format structure and log
667 * are not reused. 646 * the buf item. This will be used at recovery time to determine that copies
647 * of the buffer in the log before this should not be replayed.
648 *
649 * We mark the item descriptor and the transaction dirty so that we'll hold
650 * the buffer until after the commit.
651 *
652 * Since we're invalidating the buffer, we also clear the state about which
653 * parts of the buffer have been logged. We also clear the flag indicating
654 * that this is an inode buffer since the data in the buffer will no longer
655 * be valid.
656 *
657 * We set the stale bit in the buffer as well since we're getting rid of it.
668 */ 658 */
669void 659void
670xfs_trans_binval( 660xfs_trans_binval(
@@ -684,7 +674,6 @@ xfs_trans_binval(
684 * If the buffer is already invalidated, then 674 * If the buffer is already invalidated, then
685 * just return. 675 * just return.
686 */ 676 */
687 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
688 ASSERT(XFS_BUF_ISSTALE(bp)); 677 ASSERT(XFS_BUF_ISSTALE(bp));
689 ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); 678 ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY)));
690 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF)); 679 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF));
@@ -694,27 +683,8 @@ xfs_trans_binval(
694 return; 683 return;
695 } 684 }
696 685
697 /*
698 * Clear the dirty bit in the buffer and set the STALE flag
699 * in the buf log item. The STALE flag will be used in
700 * xfs_buf_item_unpin() to determine if it should clean up
701 * when the last reference to the buf item is given up.
702 * We set the XFS_BLF_CANCEL flag in the buf log format structure
703 * and log the buf item. This will be used at recovery time
704 * to determine that copies of the buffer in the log before
705 * this should not be replayed.
706 * We mark the item descriptor and the transaction dirty so
707 * that we'll hold the buffer until after the commit.
708 *
709 * Since we're invalidating the buffer, we also clear the state
710 * about which parts of the buffer have been logged. We also
711 * clear the flag indicating that this is an inode buffer since
712 * the data in the buffer will no longer be valid.
713 *
714 * We set the stale bit in the buffer as well since we're getting
715 * rid of it.
716 */
717 xfs_buf_stale(bp); 686 xfs_buf_stale(bp);
687
718 bip->bli_flags |= XFS_BLI_STALE; 688 bip->bli_flags |= XFS_BLI_STALE;
719 bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); 689 bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY);
720 bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; 690 bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF;
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 218304a8cdc7..f72bdd48a5c1 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -71,6 +71,7 @@ struct xfs_ail {
71 spinlock_t xa_lock; 71 spinlock_t xa_lock;
72 xfs_lsn_t xa_last_pushed_lsn; 72 xfs_lsn_t xa_last_pushed_lsn;
73 int xa_log_flush; 73 int xa_log_flush;
74 struct list_head xa_buf_list;
74 wait_queue_head_t xa_empty; 75 wait_queue_head_t xa_empty;
75}; 76};
76 77
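Taken as a whole, the replacement for the per-buftarg delwri queue comes down to three calls. A minimal, hypothetical caller showing the pattern every converted site in this patch follows; the buffers are assumed to be locked and held, as they are when returned by xfs_iflush() or xfs_qm_dqflush():

static int
example_writeback(
	struct xfs_buf		*bp1,
	struct xfs_buf		*bp2)
{
	LIST_HEAD		(buffer_list);	/* lives on this stack frame */

	/* Queue each buffer; one already on a delwri list is skipped. */
	xfs_buf_delwri_queue(bp1, &buffer_list);
	xfs_buf_delwri_queue(bp2, &buffer_list);
	xfs_buf_relse(bp1);
	xfs_buf_relse(bp2);

	/*
	 * Write the whole list and wait for completion here, or use
	 * xfs_buf_delwri_submit_nowait() to issue the I/O without
	 * waiting, as xfsaild does.
	 */
	return xfs_buf_delwri_submit(&buffer_list);
}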