-rw-r--r--  fs/xfs/xfs_buf.c          | 341
-rw-r--r--  fs/xfs/xfs_buf.h          |  28
-rw-r--r--  fs/xfs/xfs_buf_item.c     |  96
-rw-r--r--  fs/xfs/xfs_dquot.c        |  33
-rw-r--r--  fs/xfs/xfs_dquot.h        |   1
-rw-r--r--  fs/xfs/xfs_dquot_item.c   | 161
-rw-r--r--  fs/xfs/xfs_extfree_item.c |  55
-rw-r--r--  fs/xfs/xfs_inode.c        |  25
-rw-r--r--  fs/xfs/xfs_inode.h        |   1
-rw-r--r--  fs/xfs/xfs_inode_item.c   | 152
-rw-r--r--  fs/xfs/xfs_log_recover.c  |  46
-rw-r--r--  fs/xfs/xfs_qm.c           | 148
-rw-r--r--  fs/xfs/xfs_super.c        |  16
-rw-r--r--  fs/xfs/xfs_sync.c         |  18
-rw-r--r--  fs/xfs/xfs_trace.h        |   7
-rw-r--r--  fs/xfs/xfs_trans.h        |  18
-rw-r--r--  fs/xfs/xfs_trans_ail.c    | 129
-rw-r--r--  fs/xfs/xfs_trans_buf.c    |  84
-rw-r--r--  fs/xfs/xfs_trans_priv.h   |   1

19 files changed, 442 insertions(+), 918 deletions(-)
diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c
index 6819b5163e33..b82fc5c67fed 100644
--- a/fs/xfs/xfs_buf.c
+++ b/fs/xfs/xfs_buf.c
@@ -42,7 +42,6 @@
 #include "xfs_trace.h"
 
 static kmem_zone_t *xfs_buf_zone;
-STATIC int xfsbufd(void *);
 
 static struct workqueue_struct *xfslogd_workqueue;
 
@@ -144,8 +143,17 @@ void
 xfs_buf_stale(
 	struct xfs_buf	*bp)
 {
+	ASSERT(xfs_buf_islocked(bp));
+
 	bp->b_flags |= XBF_STALE;
-	xfs_buf_delwri_dequeue(bp);
+
+	/*
+	 * Clear the delwri status so that a delwri queue walker will not
+	 * flush this buffer to disk now that it is stale. The delwri queue has
+	 * a reference to the buffer, so this is safe to do.
+	 */
+	bp->b_flags &= ~_XBF_DELWRI_Q;
+
 	atomic_set(&(bp)->b_lru_ref, 0);
 	if (!list_empty(&bp->b_lru)) {
 		struct xfs_buftarg *btp = bp->b_target;
@@ -592,10 +600,10 @@ _xfs_buf_read(
 {
 	int			status;
 
-	ASSERT(!(flags & (XBF_DELWRI|XBF_WRITE)));
+	ASSERT(!(flags & XBF_WRITE));
 	ASSERT(bp->b_bn != XFS_BUF_DADDR_NULL);
 
-	bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_DELWRI | XBF_READ_AHEAD);
+	bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD);
 	bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD);
 
 	status = xfs_buf_iorequest(bp);
@@ -855,7 +863,7 @@ xfs_buf_rele(
 		spin_unlock(&pag->pag_buf_lock);
 	} else {
 		xfs_buf_lru_del(bp);
-		ASSERT(!(bp->b_flags & (XBF_DELWRI|_XBF_DELWRI_Q)));
+		ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
 		rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
 		spin_unlock(&pag->pag_buf_lock);
 		xfs_perag_put(pag);
@@ -915,13 +923,6 @@ xfs_buf_lock(
 	trace_xfs_buf_lock_done(bp, _RET_IP_);
 }
 
-/*
- * Releases the lock on the buffer object.
- * If the buffer is marked delwri but is not queued, do so before we
- * unlock the buffer as we need to set flags correctly. We also need to
- * take a reference for the delwri queue because the unlocker is going to
- * drop their's and they don't know we just queued it.
- */
 void
 xfs_buf_unlock(
 	struct xfs_buf		*bp)
@@ -1019,10 +1020,11 @@ xfs_bwrite(
 {
 	int			error;
 
+	ASSERT(xfs_buf_islocked(bp));
+
 	bp->b_flags |= XBF_WRITE;
-	bp->b_flags &= ~(XBF_ASYNC | XBF_READ);
+	bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q);
 
-	xfs_buf_delwri_dequeue(bp);
 	xfs_bdstrat_cb(bp);
 
 	error = xfs_buf_iowait(bp);
@@ -1254,7 +1256,7 @@ xfs_buf_iorequest(
 {
 	trace_xfs_buf_iorequest(bp, _RET_IP_);
 
-	ASSERT(!(bp->b_flags & XBF_DELWRI));
+	ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
 
 	if (bp->b_flags & XBF_WRITE)
 		xfs_buf_wait_unpin(bp);
@@ -1435,11 +1437,9 @@ xfs_free_buftarg(
 {
 	unregister_shrinker(&btp->bt_shrinker);
 
-	xfs_flush_buftarg(btp, 1);
 	if (mp->m_flags & XFS_MOUNT_BARRIER)
 		xfs_blkdev_issue_flush(btp);
 
-	kthread_stop(btp->bt_task);
 	kmem_free(btp);
 }
 
@@ -1491,20 +1491,6 @@ xfs_setsize_buftarg(
 	return xfs_setsize_buftarg_flags(btp, blocksize, sectorsize, 1);
 }
 
-STATIC int
-xfs_alloc_delwri_queue(
-	xfs_buftarg_t		*btp,
-	const char		*fsname)
-{
-	INIT_LIST_HEAD(&btp->bt_delwri_queue);
-	spin_lock_init(&btp->bt_delwri_lock);
-	btp->bt_flags = 0;
-	btp->bt_task = kthread_run(xfsbufd, btp, "xfsbufd/%s", fsname);
-	if (IS_ERR(btp->bt_task))
-		return PTR_ERR(btp->bt_task);
-	return 0;
-}
-
 xfs_buftarg_t *
 xfs_alloc_buftarg(
 	struct xfs_mount	*mp,
@@ -1527,8 +1513,6 @@ xfs_alloc_buftarg(
 	spin_lock_init(&btp->bt_lru_lock);
 	if (xfs_setsize_buftarg_early(btp, bdev))
 		goto error;
-	if (xfs_alloc_delwri_queue(btp, fsname))
-		goto error;
 	btp->bt_shrinker.shrink = xfs_buftarg_shrink;
 	btp->bt_shrinker.seeks = DEFAULT_SEEKS;
 	register_shrinker(&btp->bt_shrinker);
@@ -1539,125 +1523,52 @@ error:
 	return NULL;
 }
 
-
 /*
- * Delayed write buffer handling
+ * Add a buffer to the delayed write list.
+ *
+ * This queues a buffer for writeout if it hasn't already been.  Note that
+ * neither this routine nor the buffer list submission functions perform
+ * any internal synchronization.  It is expected that the lists are thread-local
+ * to the callers.
+ *
+ * Returns true if we queued up the buffer, or false if it already had
+ * been on the buffer list.
 */
-void
+bool
 xfs_buf_delwri_queue(
-	xfs_buf_t		*bp)
+	struct xfs_buf		*bp,
+	struct list_head	*list)
 {
-	struct xfs_buftarg	*btp = bp->b_target;
-
-	trace_xfs_buf_delwri_queue(bp, _RET_IP_);
-
+	ASSERT(xfs_buf_islocked(bp));
 	ASSERT(!(bp->b_flags & XBF_READ));
 
-	spin_lock(&btp->bt_delwri_lock);
-	if (!list_empty(&bp->b_list)) {
-		/* if already in the queue, move it to the tail */
-		ASSERT(bp->b_flags & _XBF_DELWRI_Q);
-		list_move_tail(&bp->b_list, &btp->bt_delwri_queue);
-	} else {
-		/* start xfsbufd as it is about to have something to do */
-		if (list_empty(&btp->bt_delwri_queue))
-			wake_up_process(bp->b_target->bt_task);
-
-		atomic_inc(&bp->b_hold);
-		bp->b_flags |= XBF_DELWRI | _XBF_DELWRI_Q | XBF_ASYNC;
-		list_add_tail(&bp->b_list, &btp->bt_delwri_queue);
-	}
-	bp->b_queuetime = jiffies;
-	spin_unlock(&btp->bt_delwri_lock);
-}
-
-void
-xfs_buf_delwri_dequeue(
-	xfs_buf_t		*bp)
-{
-	int			dequeued = 0;
-
-	spin_lock(&bp->b_target->bt_delwri_lock);
-	if ((bp->b_flags & XBF_DELWRI) && !list_empty(&bp->b_list)) {
-		ASSERT(bp->b_flags & _XBF_DELWRI_Q);
-		list_del_init(&bp->b_list);
-		dequeued = 1;
+	/*
+	 * If the buffer is already marked delwri it already is queued up
+	 * by someone else for imediate writeout.  Just ignore it in that
+	 * case.
+	 */
+	if (bp->b_flags & _XBF_DELWRI_Q) {
+		trace_xfs_buf_delwri_queued(bp, _RET_IP_);
+		return false;
 	}
-	bp->b_flags &= ~(XBF_DELWRI|_XBF_DELWRI_Q);
-	spin_unlock(&bp->b_target->bt_delwri_lock);
-
-	if (dequeued)
-		xfs_buf_rele(bp);
-
-	trace_xfs_buf_delwri_dequeue(bp, _RET_IP_);
-}
 
-/*
- * If a delwri buffer needs to be pushed before it has aged out, then promote
- * it to the head of the delwri queue so that it will be flushed on the next
- * xfsbufd run. We do this by resetting the queuetime of the buffer to be older
- * than the age currently needed to flush the buffer. Hence the next time the
- * xfsbufd sees it is guaranteed to be considered old enough to flush.
- */
-void
-xfs_buf_delwri_promote(
-	struct xfs_buf	*bp)
-{
-	struct xfs_buftarg *btp = bp->b_target;
-	long		age = xfs_buf_age_centisecs * msecs_to_jiffies(10) + 1;
-
-	ASSERT(bp->b_flags & XBF_DELWRI);
-	ASSERT(bp->b_flags & _XBF_DELWRI_Q);
+	trace_xfs_buf_delwri_queue(bp, _RET_IP_);
 
 	/*
-	 * Check the buffer age before locking the delayed write queue as we
-	 * don't need to promote buffers that are already past the flush age.
+	 * If a buffer gets written out synchronously or marked stale while it
+	 * is on a delwri list we lazily remove it. To do this, the other party
+	 * clears the _XBF_DELWRI_Q flag but otherwise leaves the buffer alone.
+	 * It remains referenced and on the list. In a rare corner case it
+	 * might get readded to a delwri list after the synchronous writeout, in
+	 * which case we need just need to re-add the flag here.
 	 */
-	if (bp->b_queuetime < jiffies - age)
-		return;
-	bp->b_queuetime = jiffies - age;
-	spin_lock(&btp->bt_delwri_lock);
-	list_move(&bp->b_list, &btp->bt_delwri_queue);
-	spin_unlock(&btp->bt_delwri_lock);
-}
-
-/*
- * Move as many buffers as specified to the supplied list
- * idicating if we skipped any buffers to prevent deadlocks.
- */
-STATIC int
-xfs_buf_delwri_split(
-	xfs_buftarg_t	*target,
-	struct list_head *list,
-	unsigned long	age)
-{
-	xfs_buf_t	*bp, *n;
-	int		skipped = 0;
-	int		force;
-
-	force = test_and_clear_bit(XBT_FORCE_FLUSH, &target->bt_flags);
-	INIT_LIST_HEAD(list);
-	spin_lock(&target->bt_delwri_lock);
-	list_for_each_entry_safe(bp, n, &target->bt_delwri_queue, b_list) {
-		ASSERT(bp->b_flags & XBF_DELWRI);
-
-		if (!xfs_buf_ispinned(bp) && xfs_buf_trylock(bp)) {
-			if (!force &&
-			    time_before(jiffies, bp->b_queuetime + age)) {
-				xfs_buf_unlock(bp);
-				break;
-			}
-
-			bp->b_flags &= ~(XBF_DELWRI | _XBF_DELWRI_Q);
-			bp->b_flags |= XBF_WRITE;
-			list_move_tail(&bp->b_list, list);
-			trace_xfs_buf_delwri_split(bp, _RET_IP_);
-		} else
-			skipped++;
-	}
+	bp->b_flags |= _XBF_DELWRI_Q;
+	if (list_empty(&bp->b_list)) {
+		atomic_inc(&bp->b_hold);
+		list_add_tail(&bp->b_list, list);
+	}
 
-	spin_unlock(&target->bt_delwri_lock);
-	return skipped;
+	return true;
 }
 
 /*
@@ -1683,99 +1594,109 @@ xfs_buf_cmp(
 	return 0;
 }
 
-STATIC int
-xfsbufd(
-	void		*data)
+static int
+__xfs_buf_delwri_submit(
	struct list_head	*buffer_list,
+	struct list_head	*io_list,
+	bool			wait)
 {
-	xfs_buftarg_t	*target = (xfs_buftarg_t *)data;
-
-	current->flags |= PF_MEMALLOC;
-
-	set_freezable();
+	struct blk_plug		plug;
+	struct xfs_buf		*bp, *n;
+	int			pinned = 0;
+
+	list_for_each_entry_safe(bp, n, buffer_list, b_list) {
+		if (!wait) {
+			if (xfs_buf_ispinned(bp)) {
+				pinned++;
+				continue;
+			}
+			if (!xfs_buf_trylock(bp))
+				continue;
+		} else {
+			xfs_buf_lock(bp);
+		}
 
-	do {
-		long	age = xfs_buf_age_centisecs * msecs_to_jiffies(10);
-		long	tout = xfs_buf_timer_centisecs * msecs_to_jiffies(10);
-		struct list_head tmp;
-		struct blk_plug plug;
+		/*
+		 * Someone else might have written the buffer synchronously or
+		 * marked it stale in the meantime.  In that case only the
+		 * _XBF_DELWRI_Q flag got cleared, and we have to drop the
+		 * reference and remove it from the list here.
+		 */
+		if (!(bp->b_flags & _XBF_DELWRI_Q)) {
+			list_del_init(&bp->b_list);
+			xfs_buf_relse(bp);
+			continue;
+		}
 
-		if (unlikely(freezing(current)))
-			try_to_freeze();
+		list_move_tail(&bp->b_list, io_list);
+		trace_xfs_buf_delwri_split(bp, _RET_IP_);
+	}
 
-		/* sleep for a long time if there is nothing to do. */
-		if (list_empty(&target->bt_delwri_queue))
-			tout = MAX_SCHEDULE_TIMEOUT;
-		schedule_timeout_interruptible(tout);
+	list_sort(NULL, io_list, xfs_buf_cmp);
 
-		xfs_buf_delwri_split(target, &tmp, age);
-		list_sort(NULL, &tmp, xfs_buf_cmp);
+	blk_start_plug(&plug);
+	list_for_each_entry_safe(bp, n, io_list, b_list) {
+		bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC);
+		bp->b_flags |= XBF_WRITE;
 
-		blk_start_plug(&plug);
-		while (!list_empty(&tmp)) {
-			struct xfs_buf *bp;
-			bp = list_first_entry(&tmp, struct xfs_buf, b_list);
-			list_del_init(&bp->b_list);
-			xfs_bdstrat_cb(bp);
-		}
-		blk_finish_plug(&plug);
-	} while (!kthread_should_stop());
+		if (!wait) {
+			bp->b_flags |= XBF_ASYNC;
+			list_del_init(&bp->b_list);
+		}
+		xfs_bdstrat_cb(bp);
+	}
+	blk_finish_plug(&plug);
 
-	return 0;
+	return pinned;
 }
 
 /*
- * Go through all incore buffers, and release buffers if they belong to
- * the given device. This is used in filesystem error handling to
- * preserve the consistency of its metadata.
+ * Write out a buffer list asynchronously.
+ *
+ * This will take the @buffer_list, write all non-locked and non-pinned buffers
+ * out and not wait for I/O completion on any of the buffers.  This interface
+ * is only safely useable for callers that can track I/O completion by higher
+ * level means, e.g. AIL pushing as the @buffer_list is consumed in this
+ * function.
 */
 int
-xfs_flush_buftarg(
-	xfs_buftarg_t	*target,
-	int		wait)
+xfs_buf_delwri_submit_nowait(
+	struct list_head	*buffer_list)
 {
-	xfs_buf_t	*bp;
-	int		pincount = 0;
-	LIST_HEAD(tmp_list);
-	LIST_HEAD(wait_list);
-	struct blk_plug plug;
+	LIST_HEAD		(io_list);
+	return __xfs_buf_delwri_submit(buffer_list, &io_list, false);
+}
 
-	flush_workqueue(xfslogd_workqueue);
+/*
+ * Write out a buffer list synchronously.
+ *
+ * This will take the @buffer_list, write all buffers out and wait for I/O
+ * completion on all of the buffers.  @buffer_list is consumed by the function,
+ * so callers must have some other way of tracking buffers if they require such
+ * functionality.
+ */
+int
+xfs_buf_delwri_submit(
+	struct list_head	*buffer_list)
+{
+	LIST_HEAD		(io_list);
+	int			error = 0, error2;
+	struct xfs_buf		*bp;
 
-	set_bit(XBT_FORCE_FLUSH, &target->bt_flags);
-	pincount = xfs_buf_delwri_split(target, &tmp_list, 0);
+	__xfs_buf_delwri_submit(buffer_list, &io_list, true);
 
-	/*
-	 * Dropped the delayed write list lock, now walk the temporary list.
-	 * All I/O is issued async and then if we need to wait for completion
-	 * we do that after issuing all the IO.
-	 */
-	list_sort(NULL, &tmp_list, xfs_buf_cmp);
+	/* Wait for IO to complete. */
+	while (!list_empty(&io_list)) {
+		bp = list_first_entry(&io_list, struct xfs_buf, b_list);
 
-	blk_start_plug(&plug);
-	while (!list_empty(&tmp_list)) {
-		bp = list_first_entry(&tmp_list, struct xfs_buf, b_list);
-		ASSERT(target == bp->b_target);
-		list_del_init(&bp->b_list);
-		if (wait) {
-			bp->b_flags &= ~XBF_ASYNC;
-			list_add(&bp->b_list, &wait_list);
-		}
-		xfs_bdstrat_cb(bp);
-	}
-	blk_finish_plug(&plug);
-
-	if (wait) {
-		/* Wait for IO to complete. */
-		while (!list_empty(&wait_list)) {
-			bp = list_first_entry(&wait_list, struct xfs_buf, b_list);
-
-			list_del_init(&bp->b_list);
-			xfs_buf_iowait(bp);
-			xfs_buf_relse(bp);
-		}
-	}
+		list_del_init(&bp->b_list);
+		error2 = xfs_buf_iowait(bp);
+		xfs_buf_relse(bp);
+		if (!error)
+			error = error2;
+	}
 
-	return pincount;
+	return error;
 }
 
 int __init
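
The section above replaces the per-target delwri queue and the xfsbufd kthread with caller-owned lists: xfs_buf_delwri_queue() now adds a locked buffer to a list the caller supplies, and xfs_buf_delwri_submit()/xfs_buf_delwri_submit_nowait() write that list out. A minimal sketch of the intended calling pattern follows; it is illustrative only and not part of the patch, and get_next_dirty_buffer() is a hypothetical placeholder for however a caller obtains a locked, dirty buffer.

/*
 * Sketch: drive the new delwri API with a thread-local list.  Because the
 * list is owned by the caller (typically on-stack), no locking is needed
 * as long as it is not shared between threads.
 */
static int flush_dirty_buffers(struct xfs_mount *mp)
{
	LIST_HEAD(buffer_list);			/* caller-owned delwri list */
	struct xfs_buf		*bp;

	while ((bp = get_next_dirty_buffer(mp)) != NULL) {
		/*
		 * xfs_buf_delwri_queue() takes a hold on the buffer and
		 * returns false if it was already queued by someone else.
		 */
		xfs_buf_delwri_queue(bp, &buffer_list);
		xfs_buf_unlock(bp);
	}

	/* Sort by disk address, issue all I/O and wait; consumes the list. */
	return xfs_buf_delwri_submit(&buffer_list);
}
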
diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h
index 5bf3be45f543..7083cf44d95f 100644
--- a/fs/xfs/xfs_buf.h
+++ b/fs/xfs/xfs_buf.h
@@ -49,8 +49,7 @@ typedef enum {
 #define XBF_MAPPED	(1 << 3) /* buffer mapped (b_addr valid) */
 #define XBF_ASYNC	(1 << 4) /* initiator will not wait for completion */
 #define XBF_DONE	(1 << 5) /* all pages in the buffer uptodate */
-#define XBF_DELWRI	(1 << 6) /* buffer has dirty pages */
-#define XBF_STALE	(1 << 7) /* buffer has been staled, do not find it */
+#define XBF_STALE	(1 << 6) /* buffer has been staled, do not find it */
 
 /* I/O hints for the BIO layer */
 #define XBF_SYNCIO	(1 << 10)/* treat this buffer as synchronous I/O */
@@ -65,7 +64,7 @@ typedef enum {
 /* flags used only internally */
 #define _XBF_PAGES	(1 << 20)/* backed by refcounted pages */
 #define _XBF_KMEM	(1 << 21)/* backed by heap memory */
-#define _XBF_DELWRI_Q	(1 << 22)/* buffer on delwri queue */
+#define _XBF_DELWRI_Q	(1 << 22)/* buffer on a delwri queue */
 
 typedef unsigned int xfs_buf_flags_t;
 
@@ -76,7 +75,6 @@ typedef unsigned int xfs_buf_flags_t;
 	{ XBF_MAPPED,		"MAPPED" }, \
 	{ XBF_ASYNC,		"ASYNC" }, \
 	{ XBF_DONE,		"DONE" }, \
-	{ XBF_DELWRI,		"DELWRI" }, \
 	{ XBF_STALE,		"STALE" }, \
 	{ XBF_SYNCIO,		"SYNCIO" }, \
 	{ XBF_FUA,		"FUA" }, \
@@ -88,10 +86,6 @@ typedef unsigned int xfs_buf_flags_t;
 	{ _XBF_KMEM,		"KMEM" }, \
 	{ _XBF_DELWRI_Q,	"DELWRI_Q" }
 
-typedef enum {
-	XBT_FORCE_FLUSH = 0,
-} xfs_buftarg_flags_t;
-
 typedef struct xfs_buftarg {
 	dev_t			bt_dev;
 	struct block_device	*bt_bdev;
@@ -101,12 +95,6 @@ typedef struct xfs_buftarg {
 	unsigned int		bt_sshift;
 	size_t			bt_smask;
 
-	/* per device delwri queue */
-	struct task_struct	*bt_task;
-	struct list_head	bt_delwri_queue;
-	spinlock_t		bt_delwri_lock;
-	unsigned long		bt_flags;
-
 	/* LRU control structures */
 	struct shrinker		bt_shrinker;
 	struct list_head	bt_lru;
@@ -150,7 +138,6 @@ typedef struct xfs_buf {
 	struct xfs_trans	*b_transp;
 	struct page		**b_pages;	/* array of page pointers */
 	struct page		*b_page_array[XB_PAGES]; /* inline pages */
-	unsigned long		b_queuetime;	/* time buffer was queued */
 	atomic_t		b_pin_count;	/* pin count */
 	atomic_t		b_io_remaining;	/* #outstanding I/O requests */
 	unsigned int		b_page_count;	/* size of page array */
@@ -220,24 +207,22 @@ static inline int xfs_buf_geterror(xfs_buf_t *bp)
 extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t);
 
 /* Delayed Write Buffer Routines */
-extern void xfs_buf_delwri_queue(struct xfs_buf *);
-extern void xfs_buf_delwri_dequeue(struct xfs_buf *);
-extern void xfs_buf_delwri_promote(struct xfs_buf *);
+extern bool xfs_buf_delwri_queue(struct xfs_buf *, struct list_head *);
+extern int xfs_buf_delwri_submit(struct list_head *);
+extern int xfs_buf_delwri_submit_nowait(struct list_head *);
 
 /* Buffer Daemon Setup Routines */
 extern int xfs_buf_init(void);
 extern void xfs_buf_terminate(void);
 
 #define XFS_BUF_ZEROFLAGS(bp) \
-	((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC|XBF_DELWRI| \
+	((bp)->b_flags &= ~(XBF_READ|XBF_WRITE|XBF_ASYNC| \
			    XBF_SYNCIO|XBF_FUA|XBF_FLUSH))
 
 void xfs_buf_stale(struct xfs_buf *bp);
 #define XFS_BUF_UNSTALE(bp)	((bp)->b_flags &= ~XBF_STALE)
 #define XFS_BUF_ISSTALE(bp)	((bp)->b_flags & XBF_STALE)
 
-#define XFS_BUF_ISDELAYWRITE(bp)	((bp)->b_flags & XBF_DELWRI)
-
 #define XFS_BUF_DONE(bp)	((bp)->b_flags |= XBF_DONE)
 #define XFS_BUF_UNDONE(bp)	((bp)->b_flags &= ~XBF_DONE)
 #define XFS_BUF_ISDONE(bp)	((bp)->b_flags & XBF_DONE)
@@ -287,7 +272,6 @@ extern xfs_buftarg_t *xfs_alloc_buftarg(struct xfs_mount *,
 extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
 extern void xfs_wait_buftarg(xfs_buftarg_t *);
 extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
-extern int xfs_flush_buftarg(xfs_buftarg_t *, int);
 
 #define xfs_getsize_buftarg(buftarg)	block_size((buftarg)->bt_bdev)
 #define xfs_readonly_buftarg(buftarg)	bdev_read_only((buftarg)->bt_bdev)
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 3a0bc38f1859..fb20f384b566 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -418,7 +418,6 @@ xfs_buf_item_unpin(
 	if (freed && stale) {
 		ASSERT(bip->bli_flags & XFS_BLI_STALE);
 		ASSERT(xfs_buf_islocked(bp));
-		ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
 		ASSERT(XFS_BUF_ISSTALE(bp));
 		ASSERT(bip->bli_format.blf_flags & XFS_BLF_CANCEL);
 
@@ -469,34 +468,28 @@ xfs_buf_item_unpin(
 	}
 }
 
-/*
- * This is called to attempt to lock the buffer associated with this
- * buf log item.  Don't sleep on the buffer lock.  If we can't get
- * the lock right away, return 0.  If we can get the lock, take a
- * reference to the buffer. If this is a delayed write buffer that
- * needs AIL help to be written back, invoke the pushbuf routine
- * rather than the normal success path.
- */
 STATIC uint
-xfs_buf_item_trylock(
-	struct xfs_log_item	*lip)
+xfs_buf_item_push(
+	struct xfs_log_item	*lip,
+	struct list_head	*buffer_list)
 {
 	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
 	struct xfs_buf		*bp = bip->bli_buf;
+	uint			rval = XFS_ITEM_SUCCESS;
 
 	if (xfs_buf_ispinned(bp))
 		return XFS_ITEM_PINNED;
 	if (!xfs_buf_trylock(bp))
 		return XFS_ITEM_LOCKED;
 
-	/* take a reference to the buffer.	*/
-	xfs_buf_hold(bp);
-
 	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
-	trace_xfs_buf_item_trylock(bip);
-	if (XFS_BUF_ISDELAYWRITE(bp))
-		return XFS_ITEM_PUSHBUF;
-	return XFS_ITEM_SUCCESS;
+
+	trace_xfs_buf_item_push(bip);
+
+	if (!xfs_buf_delwri_queue(bp, buffer_list))
+		rval = XFS_ITEM_FLUSHING;
+	xfs_buf_unlock(bp);
+	return rval;
 }
 
 /*
@@ -609,48 +602,6 @@ xfs_buf_item_committed(
 	return lsn;
 }
 
-/*
- * The buffer is locked, but is not a delayed write buffer.
- */
-STATIC void
-xfs_buf_item_push(
-	struct xfs_log_item	*lip)
-{
-	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
-	struct xfs_buf		*bp = bip->bli_buf;
-
-	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
-	ASSERT(!XFS_BUF_ISDELAYWRITE(bp));
-
-	trace_xfs_buf_item_push(bip);
-
-	xfs_buf_delwri_queue(bp);
-	xfs_buf_relse(bp);
-}
-
-/*
- * The buffer is locked and is a delayed write buffer. Promote the buffer
- * in the delayed write queue as the caller knows that they must invoke
- * the xfsbufd to get this buffer written. We have to unlock the buffer
- * to allow the xfsbufd to write it, too.
- */
-STATIC bool
-xfs_buf_item_pushbuf(
-	struct xfs_log_item	*lip)
-{
-	struct xfs_buf_log_item	*bip = BUF_ITEM(lip);
-	struct xfs_buf		*bp = bip->bli_buf;
-
-	ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
-	ASSERT(XFS_BUF_ISDELAYWRITE(bp));
-
-	trace_xfs_buf_item_pushbuf(bip);
-
-	xfs_buf_delwri_promote(bp);
-	xfs_buf_relse(bp);
-	return true;
-}
-
 STATIC void
 xfs_buf_item_committing(
 	struct xfs_log_item	*lip,
@@ -666,11 +617,9 @@ static const struct xfs_item_ops xfs_buf_item_ops = {
 	.iop_format	= xfs_buf_item_format,
 	.iop_pin	= xfs_buf_item_pin,
 	.iop_unpin	= xfs_buf_item_unpin,
-	.iop_trylock	= xfs_buf_item_trylock,
 	.iop_unlock	= xfs_buf_item_unlock,
 	.iop_committed	= xfs_buf_item_committed,
 	.iop_push	= xfs_buf_item_push,
-	.iop_pushbuf	= xfs_buf_item_pushbuf,
 	.iop_committing = xfs_buf_item_committing
 };
 
@@ -989,20 +938,27 @@ xfs_buf_iodone_callbacks(
 	 * If the write was asynchronous then no one will be looking for the
 	 * error.  Clear the error state and write the buffer out again.
 	 *
-	 * During sync or umount we'll write all pending buffers again
-	 * synchronous, which will catch these errors if they keep hanging
-	 * around.
+	 * XXX: This helps against transient write errors, but we need to find
+	 * a way to shut the filesystem down if the writes keep failing.
+	 *
+	 * In practice we'll shut the filesystem down soon as non-transient
+	 * erorrs tend to affect the whole device and a failing log write
+	 * will make us give up.  But we really ought to do better here.
 	 */
 	if (XFS_BUF_ISASYNC(bp)) {
+		ASSERT(bp->b_iodone != NULL);
+
+		trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
+
 		xfs_buf_ioerror(bp, 0);	/* errno of 0 unsets the flag */
 
 		if (!XFS_BUF_ISSTALE(bp)) {
-			xfs_buf_delwri_queue(bp);
-			XFS_BUF_DONE(bp);
+			bp->b_flags |= XBF_WRITE | XBF_ASYNC | XBF_DONE;
+			xfs_bdstrat_cb(bp);
+		} else {
+			xfs_buf_relse(bp);
 		}
-		ASSERT(bp->b_iodone != NULL);
-		trace_xfs_buf_item_iodone_async(bp, _RET_IP_);
-		xfs_buf_relse(bp);
+
 		return;
 	}
 
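
With this change ->iop_push locks, flushes and queues in a single call, reporting the outcome through its return value instead of the old trylock/push/pushbuf three-step. The sketch below shows how an AIL pusher is expected to consume that contract; it is a simplified illustration (the real loop lives in xfs_trans_ail.c, which is not shown on this page), and next_ail_item() is a hypothetical iterator.

/*
 * Sketch of the new ->iop_push() contract, not code from this patch:
 *   XFS_ITEM_SUCCESS  - item flushed, its buffer added to buffer_list
 *   XFS_ITEM_FLUSHING - a flush is already in flight; wait for its I/O
 *   XFS_ITEM_PINNED   - a log force is needed before the item can move
 *   XFS_ITEM_LOCKED   - object lock contended; retry on a later pass
 */
static void push_ail_once(struct xfs_ail *ailp)
{
	struct xfs_log_item	*lip;
	bool			force_log = false;
	LIST_HEAD(buffer_list);

	while ((lip = next_ail_item(ailp)) != NULL) {	/* hypothetical */
		switch (lip->li_ops->iop_push(lip, &buffer_list)) {
		case XFS_ITEM_PINNED:
			force_log = true;
			break;
		case XFS_ITEM_LOCKED:
		case XFS_ITEM_FLUSHING:
			break;		/* revisit on a later pass */
		}
	}

	/* Issue all writes collected on this pass without waiting. */
	xfs_buf_delwri_submit_nowait(&buffer_list);

	if (force_log)
		xfs_log_force(ailp->xa_mount, 0);
}
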
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 53757d83e4f6..65b8aa37622e 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -1005,39 +1005,6 @@ xfs_dqlock2(
 	}
 }
 
-/*
- * Give the buffer a little push if it is incore and
- * wait on the flush lock.
- */
-void
-xfs_dqflock_pushbuf_wait(
-	xfs_dquot_t	*dqp)
-{
-	xfs_mount_t	*mp = dqp->q_mount;
-	xfs_buf_t	*bp;
-
-	/*
-	 * Check to see if the dquot has been flushed delayed
-	 * write.  If so, grab its buffer and send it
-	 * out immediately.  We'll be able to acquire
-	 * the flush lock when the I/O completes.
-	 */
-	bp = xfs_incore(mp->m_ddev_targp, dqp->q_blkno,
-			mp->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
-	if (!bp)
-		goto out_lock;
-
-	if (XFS_BUF_ISDELAYWRITE(bp)) {
-		if (xfs_buf_ispinned(bp))
-			xfs_log_force(mp, 0);
-		xfs_buf_delwri_promote(bp);
-		wake_up_process(bp->b_target->bt_task);
-	}
-	xfs_buf_relse(bp);
-out_lock:
-	xfs_dqflock(dqp);
-}
-
 int __init
 xfs_qm_init(void)
 {
diff --git a/fs/xfs/xfs_dquot.h b/fs/xfs/xfs_dquot.h
index 5f2a2f2c0c5b..7d20af27346d 100644
--- a/fs/xfs/xfs_dquot.h
+++ b/fs/xfs/xfs_dquot.h
@@ -152,7 +152,6 @@ extern int xfs_qm_dqget(xfs_mount_t *, xfs_inode_t *,
 extern void		xfs_qm_dqput(xfs_dquot_t *);
 
 extern void		xfs_dqlock2(struct xfs_dquot *, struct xfs_dquot *);
-extern void		xfs_dqflock_pushbuf_wait(struct xfs_dquot *dqp);
 
 static inline struct xfs_dquot *xfs_qm_dqhold(struct xfs_dquot *dqp)
 {
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 8d8295814272..9c5d58d24e54 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -108,46 +108,6 @@ xfs_qm_dquot_logitem_unpin(
 	wake_up(&dqp->q_pinwait);
 }
 
-/*
- * Given the logitem, this writes the corresponding dquot entry to disk
- * asynchronously. This is called with the dquot entry securely locked;
- * we simply get xfs_qm_dqflush() to do the work, and unlock the dquot
- * at the end.
- */
-STATIC void
-xfs_qm_dquot_logitem_push(
-	struct xfs_log_item	*lip)
-{
-	struct xfs_dquot	*dqp = DQUOT_ITEM(lip)->qli_dquot;
-	struct xfs_buf		*bp = NULL;
-	int			error;
-
-	ASSERT(XFS_DQ_IS_LOCKED(dqp));
-	ASSERT(!completion_done(&dqp->q_flush));
-	ASSERT(atomic_read(&dqp->q_pincount) == 0);
-
-	/*
-	 * Since we were able to lock the dquot's flush lock and
-	 * we found it on the AIL, the dquot must be dirty.  This
-	 * is because the dquot is removed from the AIL while still
-	 * holding the flush lock in xfs_dqflush_done().  Thus, if
-	 * we found it in the AIL and were able to obtain the flush
-	 * lock without sleeping, then there must not have been
-	 * anyone in the process of flushing the dquot.
-	 */
-	error = xfs_qm_dqflush(dqp, &bp);
-	if (error) {
-		xfs_warn(dqp->q_mount, "%s: push error %d on dqp %p",
-			__func__, error, dqp);
-		goto out_unlock;
-	}
-
-	xfs_buf_delwri_queue(bp);
-	xfs_buf_relse(bp);
-out_unlock:
-	xfs_dqunlock(dqp);
-}
-
 STATIC xfs_lsn_t
 xfs_qm_dquot_logitem_committed(
 	struct xfs_log_item	*lip,
@@ -179,67 +139,15 @@ xfs_qm_dqunpin_wait(
 	wait_event(dqp->q_pinwait, (atomic_read(&dqp->q_pincount) == 0));
 }
 
-/*
- * This is called when IOP_TRYLOCK returns XFS_ITEM_PUSHBUF to indicate that
- * the dquot is locked by us, but the flush lock isn't. So, here we are
- * going to see if the relevant dquot buffer is incore, waiting on DELWRI.
- * If so, we want to push it out to help us take this item off the AIL as soon
- * as possible.
- *
- * We must not be holding the AIL lock at this point. Calling incore() to
- * search the buffer cache can be a time consuming thing, and AIL lock is a
- * spinlock.
- */
-STATIC bool
-xfs_qm_dquot_logitem_pushbuf(
-	struct xfs_log_item	*lip)
-{
-	struct xfs_dq_logitem	*qlip = DQUOT_ITEM(lip);
-	struct xfs_dquot	*dqp = qlip->qli_dquot;
-	struct xfs_buf		*bp;
-	bool			ret = true;
-
-	ASSERT(XFS_DQ_IS_LOCKED(dqp));
-
-	/*
-	 * If flushlock isn't locked anymore, chances are that the
-	 * inode flush completed and the inode was taken off the AIL.
-	 * So, just get out.
-	 */
-	if (completion_done(&dqp->q_flush) ||
-	    !(lip->li_flags & XFS_LI_IN_AIL)) {
-		xfs_dqunlock(dqp);
-		return true;
-	}
-
-	bp = xfs_incore(dqp->q_mount->m_ddev_targp, qlip->qli_format.qlf_blkno,
-			dqp->q_mount->m_quotainfo->qi_dqchunklen, XBF_TRYLOCK);
-	xfs_dqunlock(dqp);
-	if (!bp)
-		return true;
-	if (XFS_BUF_ISDELAYWRITE(bp))
-		xfs_buf_delwri_promote(bp);
-	if (xfs_buf_ispinned(bp))
-		ret = false;
-	xfs_buf_relse(bp);
-	return ret;
-}
-
-/*
- * This is called to attempt to lock the dquot associated with this
- * dquot log item.  Don't sleep on the dquot lock or the flush lock.
- * If the flush lock is already held, indicating that the dquot has
- * been or is in the process of being flushed, then see if we can
- * find the dquot's buffer in the buffer cache without sleeping.  If
- * we can and it is marked delayed write, then we want to send it out.
- * We delay doing so until the push routine, though, to avoid sleeping
- * in any device strategy routines.
- */
 STATIC uint
-xfs_qm_dquot_logitem_trylock(
-	struct xfs_log_item	*lip)
+xfs_qm_dquot_logitem_push(
+	struct xfs_log_item	*lip,
+	struct list_head	*buffer_list)
 {
 	struct xfs_dquot	*dqp = DQUOT_ITEM(lip)->qli_dquot;
+	struct xfs_buf		*bp = NULL;
+	uint			rval = XFS_ITEM_SUCCESS;
+	int			error;
 
 	if (atomic_read(&dqp->q_pincount) > 0)
 		return XFS_ITEM_PINNED;
@@ -252,20 +160,36 @@ xfs_qm_dquot_logitem_trylock(
 	 * taking the quota lock.
 	 */
 	if (atomic_read(&dqp->q_pincount) > 0) {
-		xfs_dqunlock(dqp);
-		return XFS_ITEM_PINNED;
+		rval = XFS_ITEM_PINNED;
+		goto out_unlock;
 	}
 
+	/*
+	 * Someone else is already flushing the dquot.  Nothing we can do
+	 * here but wait for the flush to finish and remove the item from
+	 * the AIL.
+	 */
 	if (!xfs_dqflock_nowait(dqp)) {
-		/*
-		 * dquot has already been flushed to the backing buffer,
-		 * leave it locked, pushbuf routine will unlock it.
-		 */
-		return XFS_ITEM_PUSHBUF;
+		rval = XFS_ITEM_FLUSHING;
+		goto out_unlock;
+	}
+
+	spin_unlock(&lip->li_ailp->xa_lock);
+
+	error = xfs_qm_dqflush(dqp, &bp);
+	if (error) {
+		xfs_warn(dqp->q_mount, "%s: push error %d on dqp %p",
+			__func__, error, dqp);
+	} else {
+		if (!xfs_buf_delwri_queue(bp, buffer_list))
+			rval = XFS_ITEM_FLUSHING;
+		xfs_buf_relse(bp);
 	}
 
-	ASSERT(lip->li_flags & XFS_LI_IN_AIL);
-	return XFS_ITEM_SUCCESS;
+	spin_lock(&lip->li_ailp->xa_lock);
+out_unlock:
+	xfs_dqunlock(dqp);
+	return rval;
 }
 
 /*
@@ -316,11 +240,9 @@ static const struct xfs_item_ops xfs_dquot_item_ops = {
 	.iop_format	= xfs_qm_dquot_logitem_format,
 	.iop_pin	= xfs_qm_dquot_logitem_pin,
 	.iop_unpin	= xfs_qm_dquot_logitem_unpin,
-	.iop_trylock	= xfs_qm_dquot_logitem_trylock,
 	.iop_unlock	= xfs_qm_dquot_logitem_unlock,
 	.iop_committed	= xfs_qm_dquot_logitem_committed,
 	.iop_push	= xfs_qm_dquot_logitem_push,
-	.iop_pushbuf	= xfs_qm_dquot_logitem_pushbuf,
 	.iop_committing = xfs_qm_dquot_logitem_committing
 };
 
@@ -415,11 +337,13 @@ xfs_qm_qoff_logitem_unpin(
 }
 
 /*
- * Quotaoff items have no locking, so just return success.
+ * There isn't much you can do to push a quotaoff item.  It is simply
+ * stuck waiting for the log to be flushed to disk.
 */
 STATIC uint
-xfs_qm_qoff_logitem_trylock(
-	struct xfs_log_item	*lip)
+xfs_qm_qoff_logitem_push(
+	struct xfs_log_item	*lip,
+	struct list_head	*buffer_list)
 {
 	return XFS_ITEM_LOCKED;
 }
@@ -446,17 +370,6 @@ xfs_qm_qoff_logitem_committed(
 	return lsn;
 }
 
-/*
- * There isn't much you can do to push on an quotaoff item.  It is simply
- * stuck waiting for the log to be flushed to disk.
- */
-STATIC void
-xfs_qm_qoff_logitem_push(
-	struct xfs_log_item	*lip)
-{
-}
-
-
 STATIC xfs_lsn_t
 xfs_qm_qoffend_logitem_committed(
 	struct xfs_log_item	*lip,
@@ -504,7 +417,6 @@ static const struct xfs_item_ops xfs_qm_qoffend_logitem_ops = {
 	.iop_format	= xfs_qm_qoff_logitem_format,
 	.iop_pin	= xfs_qm_qoff_logitem_pin,
 	.iop_unpin	= xfs_qm_qoff_logitem_unpin,
-	.iop_trylock	= xfs_qm_qoff_logitem_trylock,
 	.iop_unlock	= xfs_qm_qoff_logitem_unlock,
 	.iop_committed	= xfs_qm_qoffend_logitem_committed,
 	.iop_push	= xfs_qm_qoff_logitem_push,
@@ -519,7 +431,6 @@ static const struct xfs_item_ops xfs_qm_qoff_logitem_ops = {
 	.iop_format	= xfs_qm_qoff_logitem_format,
 	.iop_pin	= xfs_qm_qoff_logitem_pin,
 	.iop_unpin	= xfs_qm_qoff_logitem_unpin,
-	.iop_trylock	= xfs_qm_qoff_logitem_trylock,
 	.iop_unlock	= xfs_qm_qoff_logitem_unlock,
 	.iop_committed	= xfs_qm_qoff_logitem_committed,
 	.iop_push	= xfs_qm_qoff_logitem_push,
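
One detail of the rewritten dquot push handler deserves a note, since the inode push later in this patch repeats it: ->iop_push() is entered with the AIL lock held, and as the removed pushbuf comment above put it, the "AIL lock is a spinlock", so it must be dropped around xfs_qm_dqflush(), which can sleep. Reduced to its skeleton (illustrative fragment only, using the names from the function above):

	/* In xfs_qm_dquot_logitem_push(), after taking the flush lock: */
	spin_unlock(&lip->li_ailp->xa_lock);	/* may sleep from here on */

	error = xfs_qm_dqflush(dqp, &bp);	/* blocking flush to buffer */
	if (!error) {
		if (!xfs_buf_delwri_queue(bp, buffer_list))
			rval = XFS_ITEM_FLUSHING; /* someone queued it first */
		xfs_buf_relse(bp);
	}

	spin_lock(&lip->li_ailp->xa_lock);	/* retake before returning */
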
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 35c2aff38b20..9549ef179e06 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -147,22 +147,20 @@ xfs_efi_item_unpin(
 }
 
 /*
- * Efi items have no locking or pushing.  However, since EFIs are
- * pulled from the AIL when their corresponding EFDs are committed
- * to disk, their situation is very similar to being pinned.  Return
- * XFS_ITEM_PINNED so that the caller will eventually flush the log.
- * This should help in getting the EFI out of the AIL.
+ * Efi items have no locking or pushing.  However, since EFIs are pulled from
+ * the AIL when their corresponding EFDs are committed to disk, their situation
+ * is very similar to being pinned.  Return XFS_ITEM_PINNED so that the caller
+ * will eventually flush the log.  This should help in getting the EFI out of
+ * the AIL.
 */
 STATIC uint
-xfs_efi_item_trylock(
-	struct xfs_log_item	*lip)
+xfs_efi_item_push(
+	struct xfs_log_item	*lip,
+	struct list_head	*buffer_list)
 {
 	return XFS_ITEM_PINNED;
 }
 
-/*
- * Efi items have no locking, so just return.
- */
 STATIC void
 xfs_efi_item_unlock(
 	struct xfs_log_item	*lip)
@@ -190,17 +188,6 @@ xfs_efi_item_committed(
 }
 
 /*
- * There isn't much you can do to push on an efi item.  It is simply
- * stuck waiting for all of its corresponding efd items to be
- * committed to disk.
- */
-STATIC void
-xfs_efi_item_push(
-	struct xfs_log_item	*lip)
-{
-}
-
-/*
  * The EFI dependency tracking op doesn't do squat.  It can't because
  * it doesn't know where the free extent is coming from.  The dependency
  * tracking has to be handled by the "enclosing" metadata object.  For
@@ -222,7 +209,6 @@ static const struct xfs_item_ops xfs_efi_item_ops = {
 	.iop_format	= xfs_efi_item_format,
 	.iop_pin	= xfs_efi_item_pin,
 	.iop_unpin	= xfs_efi_item_unpin,
-	.iop_trylock	= xfs_efi_item_trylock,
 	.iop_unlock	= xfs_efi_item_unlock,
 	.iop_committed	= xfs_efi_item_committed,
 	.iop_push	= xfs_efi_item_push,
@@ -404,19 +390,17 @@ xfs_efd_item_unpin(
 }
 
 /*
- * Efd items have no locking, so just return success.
+ * There isn't much you can do to push on an efd item.  It is simply stuck
+ * waiting for the log to be flushed to disk.
 */
 STATIC uint
-xfs_efd_item_trylock(
-	struct xfs_log_item	*lip)
+xfs_efd_item_push(
+	struct xfs_log_item	*lip,
+	struct list_head	*buffer_list)
 {
-	return XFS_ITEM_LOCKED;
+	return XFS_ITEM_PINNED;
 }
 
-/*
- * Efd items have no locking or pushing, so return failure
- * so that the caller doesn't bother with us.
- */
 STATIC void
 xfs_efd_item_unlock(
 	struct xfs_log_item	*lip)
@@ -451,16 +435,6 @@ xfs_efd_item_committed(
 }
 
 /*
- * There isn't much you can do to push on an efd item.  It is simply
- * stuck waiting for the log to be flushed to disk.
- */
-STATIC void
-xfs_efd_item_push(
-	struct xfs_log_item	*lip)
-{
-}
-
-/*
  * The EFD dependency tracking op doesn't do squat.  It can't because
  * it doesn't know where the free extent is coming from.  The dependency
  * tracking has to be handled by the "enclosing" metadata object.  For
@@ -482,7 +456,6 @@ static const struct xfs_item_ops xfs_efd_item_ops = {
 	.iop_format	= xfs_efd_item_format,
 	.iop_pin	= xfs_efd_item_pin,
 	.iop_unpin	= xfs_efd_item_unpin,
-	.iop_trylock	= xfs_efd_item_trylock,
 	.iop_unlock	= xfs_efd_item_unlock,
 	.iop_committed	= xfs_efd_item_committed,
 	.iop_push	= xfs_efd_item_push,
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 0fa987dea242..acd846d808b2 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -2347,11 +2347,11 @@ cluster_corrupt_out:
 	 */
 	rcu_read_unlock();
 	/*
-	 * Clean up the buffer.  If it was B_DELWRI, just release it --
+	 * Clean up the buffer.  If it was delwri, just release it --
 	 * brelse can handle it with no problems.  If not, shut down the
 	 * filesystem before releasing the buffer.
 	 */
-	bufwasdelwri = XFS_BUF_ISDELAYWRITE(bp);
+	bufwasdelwri = (bp->b_flags & _XBF_DELWRI_Q);
 	if (bufwasdelwri)
 		xfs_buf_relse(bp);
 
@@ -2685,27 +2685,6 @@ corrupt_out:
 	return XFS_ERROR(EFSCORRUPTED);
 }
 
-void
-xfs_promote_inode(
-	struct xfs_inode	*ip)
-{
-	struct xfs_buf		*bp;
-
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
-
-	bp = xfs_incore(ip->i_mount->m_ddev_targp, ip->i_imap.im_blkno,
-			ip->i_imap.im_len, XBF_TRYLOCK);
-	if (!bp)
-		return;
-
-	if (XFS_BUF_ISDELAYWRITE(bp)) {
-		xfs_buf_delwri_promote(bp);
-		wake_up_process(ip->i_mount->m_ddev_targp->bt_task);
-	}
-
-	xfs_buf_relse(bp);
-}
-
 /*
  * Return a pointer to the extent record at file index idx.
  */
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index a2fa79ae410f..f0e252f384f9 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -530,7 +530,6 @@ int		xfs_iunlink(struct xfs_trans *, xfs_inode_t *);
 void		xfs_iext_realloc(xfs_inode_t *, int, int);
 void		xfs_iunpin_wait(xfs_inode_t *);
 int		xfs_iflush(struct xfs_inode *, struct xfs_buf **);
-void		xfs_promote_inode(struct xfs_inode *);
 void		xfs_lock_inodes(xfs_inode_t **, int, uint);
 void		xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
 
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index d3601ab75dd3..8aaebb2f9efa 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -480,25 +480,16 @@ xfs_inode_item_unpin(
 	wake_up_bit(&ip->i_flags, __XFS_IPINNED_BIT);
 }
 
-/*
- * This is called to attempt to lock the inode associated with this
- * inode log item, in preparation for the push routine which does the actual
- * iflush.  Don't sleep on the inode lock or the flush lock.
- *
- * If the flush lock is already held, indicating that the inode has
- * been or is in the process of being flushed, then (ideally) we'd like to
- * see if the inode's buffer is still incore, and if so give it a nudge.
- * We delay doing so until the pushbuf routine, though, to avoid holding
- * the AIL lock across a call to the blackhole which is the buffer cache.
- * Also we don't want to sleep in any device strategy routines, which can happen
- * if we do the subsequent bawrite in here.
- */
 STATIC uint
-xfs_inode_item_trylock(
-	struct xfs_log_item	*lip)
+xfs_inode_item_push(
+	struct xfs_log_item	*lip,
+	struct list_head	*buffer_list)
 {
 	struct xfs_inode_log_item *iip = INODE_ITEM(lip);
 	struct xfs_inode	*ip = iip->ili_inode;
+	struct xfs_buf		*bp = NULL;
+	uint			rval = XFS_ITEM_SUCCESS;
+	int			error;
 
 	if (xfs_ipincount(ip) > 0)
 		return XFS_ITEM_PINNED;
@@ -511,34 +502,45 @@ xfs_inode_item_trylock(
 	 * taking the ilock.
 	 */
 	if (xfs_ipincount(ip) > 0) {
-		xfs_iunlock(ip, XFS_ILOCK_SHARED);
-		return XFS_ITEM_PINNED;
+		rval = XFS_ITEM_PINNED;
+		goto out_unlock;
 	}
 
+	/*
+	 * Someone else is already flushing the inode.  Nothing we can do
+	 * here but wait for the flush to finish and remove the item from
+	 * the AIL.
+	 */
 	if (!xfs_iflock_nowait(ip)) {
-		/*
-		 * inode has already been flushed to the backing buffer,
-		 * leave it locked in shared mode, pushbuf routine will
-		 * unlock it.
-		 */
-		return XFS_ITEM_PUSHBUF;
+		rval = XFS_ITEM_FLUSHING;
+		goto out_unlock;
 	}
 
-	/* Stale items should force out the iclog */
+	/*
+	 * Stale inode items should force out the iclog.
+	 */
 	if (ip->i_flags & XFS_ISTALE) {
 		xfs_ifunlock(ip);
 		xfs_iunlock(ip, XFS_ILOCK_SHARED);
 		return XFS_ITEM_PINNED;
 	}
 
-#ifdef DEBUG
-	if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
-		ASSERT(iip->ili_fields != 0);
-		ASSERT(iip->ili_logged == 0);
-		ASSERT(lip->li_flags & XFS_LI_IN_AIL);
-	}
-#endif
-	return XFS_ITEM_SUCCESS;
+	ASSERT(iip->ili_fields != 0 || XFS_FORCED_SHUTDOWN(ip->i_mount));
+	ASSERT(iip->ili_logged == 0 || XFS_FORCED_SHUTDOWN(ip->i_mount));
+
+	spin_unlock(&lip->li_ailp->xa_lock);
+
+	error = xfs_iflush(ip, &bp);
+	if (!error) {
+		if (!xfs_buf_delwri_queue(bp, buffer_list))
+			rval = XFS_ITEM_FLUSHING;
+		xfs_buf_relse(bp);
+	}
+
+	spin_lock(&lip->li_ailp->xa_lock);
+out_unlock:
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	return rval;
 }
 
 /*
@@ -623,92 +625,6 @@ xfs_inode_item_committed(
 }
 
 /*
- * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK
- * failed to get the inode flush lock but did get the inode locked SHARED.
- * Here we're trying to see if the inode buffer is incore, and if so whether it's
- * marked delayed write. If that's the case, we'll promote it and that will
- * allow the caller to write the buffer by triggering the xfsbufd to run.
- */
-STATIC bool
-xfs_inode_item_pushbuf(
-	struct xfs_log_item	*lip)
-{
-	struct xfs_inode_log_item *iip = INODE_ITEM(lip);
-	struct xfs_inode	*ip = iip->ili_inode;
-	struct xfs_buf		*bp;
-	bool			ret = true;
-
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
-
-	/*
-	 * If a flush is not in progress anymore, chances are that the
-	 * inode was taken off the AIL. So, just get out.
-	 */
-	if (!xfs_isiflocked(ip) ||
-	    !(lip->li_flags & XFS_LI_IN_AIL)) {
-		xfs_iunlock(ip, XFS_ILOCK_SHARED);
-		return true;
-	}
-
-	bp = xfs_incore(ip->i_mount->m_ddev_targp, iip->ili_format.ilf_blkno,
-			iip->ili_format.ilf_len, XBF_TRYLOCK);
-
-	xfs_iunlock(ip, XFS_ILOCK_SHARED);
-	if (!bp)
-		return true;
-	if (XFS_BUF_ISDELAYWRITE(bp))
-		xfs_buf_delwri_promote(bp);
-	if (xfs_buf_ispinned(bp))
-		ret = false;
-	xfs_buf_relse(bp);
-	return ret;
-}
-
-/*
- * This is called to asynchronously write the inode associated with this
- * inode log item out to disk. The inode will already have been locked by
- * a successful call to xfs_inode_item_trylock().
- */
-STATIC void
-xfs_inode_item_push(
-	struct xfs_log_item	*lip)
-{
-	struct xfs_inode_log_item *iip = INODE_ITEM(lip);
-	struct xfs_inode	*ip = iip->ili_inode;
-	struct xfs_buf		*bp = NULL;
-	int			error;
-
-	ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
-	ASSERT(xfs_isiflocked(ip));
-
-	/*
-	 * Since we were able to lock the inode's flush lock and
-	 * we found it on the AIL, the inode must be dirty.  This
-	 * is because the inode is removed from the AIL while still
-	 * holding the flush lock in xfs_iflush_done().  Thus, if
-	 * we found it in the AIL and were able to obtain the flush
-	 * lock without sleeping, then there must not have been
-	 * anyone in the process of flushing the inode.
-	 */
-	ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || iip->ili_fields != 0);
-
-	/*
-	 * Push the inode to it's backing buffer. This will not remove the
-	 * inode from the AIL - a further push will be required to trigger a
-	 * buffer push. However, this allows all the dirty inodes to be pushed
-	 * to the buffer before it is pushed to disk. The buffer IO completion
-	 * will pull the inode from the AIL, mark it clean and unlock the flush
-	 * lock.
-	 */
-	error = xfs_iflush(ip, &bp);
-	if (!error) {
-		xfs_buf_delwri_queue(bp);
-		xfs_buf_relse(bp);
-	}
-	xfs_iunlock(ip, XFS_ILOCK_SHARED);
-}
-
-/*
  * XXX rcc - this one really has to do something.  Probably needs
  * to stamp in a new field in the incore inode.
  */
@@ -728,11 +644,9 @@ static const struct xfs_item_ops xfs_inode_item_ops = {
 	.iop_format	= xfs_inode_item_format,
 	.iop_pin	= xfs_inode_item_pin,
 	.iop_unpin	= xfs_inode_item_unpin,
-	.iop_trylock	= xfs_inode_item_trylock,
 	.iop_unlock	= xfs_inode_item_unlock,
 	.iop_committed	= xfs_inode_item_committed,
 	.iop_push	= xfs_inode_item_push,
-	.iop_pushbuf	= xfs_inode_item_pushbuf,
 	.iop_committing = xfs_inode_item_committing
 };
 
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 8ecad5bad66c..5e864a9c0ccf 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2103,6 +2103,7 @@ xlog_recover_do_dquot_buffer(
2103STATIC int 2103STATIC int
2104xlog_recover_buffer_pass2( 2104xlog_recover_buffer_pass2(
2105 xlog_t *log, 2105 xlog_t *log,
2106 struct list_head *buffer_list,
2106 xlog_recover_item_t *item) 2107 xlog_recover_item_t *item)
2107{ 2108{
2108 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr; 2109 xfs_buf_log_format_t *buf_f = item->ri_buf[0].i_addr;
@@ -2173,7 +2174,7 @@ xlog_recover_buffer_pass2(
2173 } else { 2174 } else {
2174 ASSERT(bp->b_target->bt_mount == mp); 2175 ASSERT(bp->b_target->bt_mount == mp);
2175 bp->b_iodone = xlog_recover_iodone; 2176 bp->b_iodone = xlog_recover_iodone;
2176 xfs_buf_delwri_queue(bp); 2177 xfs_buf_delwri_queue(bp, buffer_list);
2177 } 2178 }
2178 2179
2179 xfs_buf_relse(bp); 2180 xfs_buf_relse(bp);
@@ -2183,6 +2184,7 @@ xlog_recover_buffer_pass2(
2183STATIC int 2184STATIC int
2184xlog_recover_inode_pass2( 2185xlog_recover_inode_pass2(
2185 xlog_t *log, 2186 xlog_t *log,
2187 struct list_head *buffer_list,
2186 xlog_recover_item_t *item) 2188 xlog_recover_item_t *item)
2187{ 2189{
2188 xfs_inode_log_format_t *in_f; 2190 xfs_inode_log_format_t *in_f;
@@ -2436,7 +2438,7 @@ xlog_recover_inode_pass2(
2436write_inode_buffer: 2438write_inode_buffer:
2437 ASSERT(bp->b_target->bt_mount == mp); 2439 ASSERT(bp->b_target->bt_mount == mp);
2438 bp->b_iodone = xlog_recover_iodone; 2440 bp->b_iodone = xlog_recover_iodone;
2439 xfs_buf_delwri_queue(bp); 2441 xfs_buf_delwri_queue(bp, buffer_list);
2440 xfs_buf_relse(bp); 2442 xfs_buf_relse(bp);
2441error: 2443error:
2442 if (need_free) 2444 if (need_free)
@@ -2477,6 +2479,7 @@ xlog_recover_quotaoff_pass1(
2477STATIC int 2479STATIC int
2478xlog_recover_dquot_pass2( 2480xlog_recover_dquot_pass2(
2479 xlog_t *log, 2481 xlog_t *log,
2482 struct list_head *buffer_list,
2480 xlog_recover_item_t *item) 2483 xlog_recover_item_t *item)
2481{ 2484{
2482 xfs_mount_t *mp = log->l_mp; 2485 xfs_mount_t *mp = log->l_mp;
@@ -2558,7 +2561,7 @@ xlog_recover_dquot_pass2(
2558 ASSERT(dq_f->qlf_size == 2); 2561 ASSERT(dq_f->qlf_size == 2);
2559 ASSERT(bp->b_target->bt_mount == mp); 2562 ASSERT(bp->b_target->bt_mount == mp);
2560 bp->b_iodone = xlog_recover_iodone; 2563 bp->b_iodone = xlog_recover_iodone;
2561 xfs_buf_delwri_queue(bp); 2564 xfs_buf_delwri_queue(bp, buffer_list);
2562 xfs_buf_relse(bp); 2565 xfs_buf_relse(bp);
2563 2566
2564 return (0); 2567 return (0);
@@ -2712,21 +2715,22 @@ STATIC int
2712xlog_recover_commit_pass2( 2715xlog_recover_commit_pass2(
2713 struct log *log, 2716 struct log *log,
2714 struct xlog_recover *trans, 2717 struct xlog_recover *trans,
2718 struct list_head *buffer_list,
2715 xlog_recover_item_t *item) 2719 xlog_recover_item_t *item)
2716{ 2720{
2717 trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2); 2721 trace_xfs_log_recover_item_recover(log, trans, item, XLOG_RECOVER_PASS2);
2718 2722
2719 switch (ITEM_TYPE(item)) { 2723 switch (ITEM_TYPE(item)) {
2720 case XFS_LI_BUF: 2724 case XFS_LI_BUF:
2721 return xlog_recover_buffer_pass2(log, item); 2725 return xlog_recover_buffer_pass2(log, buffer_list, item);
2722 case XFS_LI_INODE: 2726 case XFS_LI_INODE:
2723 return xlog_recover_inode_pass2(log, item); 2727 return xlog_recover_inode_pass2(log, buffer_list, item);
2724 case XFS_LI_EFI: 2728 case XFS_LI_EFI:
2725 return xlog_recover_efi_pass2(log, item, trans->r_lsn); 2729 return xlog_recover_efi_pass2(log, item, trans->r_lsn);
2726 case XFS_LI_EFD: 2730 case XFS_LI_EFD:
2727 return xlog_recover_efd_pass2(log, item); 2731 return xlog_recover_efd_pass2(log, item);
2728 case XFS_LI_DQUOT: 2732 case XFS_LI_DQUOT:
2729 return xlog_recover_dquot_pass2(log, item); 2733 return xlog_recover_dquot_pass2(log, buffer_list, item);
2730 case XFS_LI_QUOTAOFF: 2734 case XFS_LI_QUOTAOFF:
2731 /* nothing to do in pass2 */ 2735 /* nothing to do in pass2 */
2732 return 0; 2736 return 0;
@@ -2750,8 +2754,9 @@ xlog_recover_commit_trans(
2750 struct xlog_recover *trans, 2754 struct xlog_recover *trans,
2751 int pass) 2755 int pass)
2752{ 2756{
2753 int error = 0; 2757 int error = 0, error2;
2754 xlog_recover_item_t *item; 2758 xlog_recover_item_t *item;
2759 LIST_HEAD (buffer_list);
2755 2760
2756 hlist_del(&trans->r_list); 2761 hlist_del(&trans->r_list);
2757 2762
@@ -2760,16 +2765,27 @@ xlog_recover_commit_trans(
2760 return error; 2765 return error;
2761 2766
2762 list_for_each_entry(item, &trans->r_itemq, ri_list) { 2767 list_for_each_entry(item, &trans->r_itemq, ri_list) {
2763 if (pass == XLOG_RECOVER_PASS1) 2768 switch (pass) {
2769 case XLOG_RECOVER_PASS1:
2764 error = xlog_recover_commit_pass1(log, trans, item); 2770 error = xlog_recover_commit_pass1(log, trans, item);
2765 else 2771 break;
2766 error = xlog_recover_commit_pass2(log, trans, item); 2772 case XLOG_RECOVER_PASS2:
2773 error = xlog_recover_commit_pass2(log, trans,
2774 &buffer_list, item);
2775 break;
2776 default:
2777 ASSERT(0);
2778 }
2779
2767 if (error) 2780 if (error)
2768 return error; 2781 goto out;
2769 } 2782 }
2770 2783
2771 xlog_recover_free_trans(trans); 2784 xlog_recover_free_trans(trans);
2772 return 0; 2785
2786out:
2787 error2 = xfs_buf_delwri_submit(&buffer_list);
2788 return error ? error : error2;
2773} 2789}
2774 2790
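The shape of the new pass-2 flow is worth spelling out: each item handler queues its metadata buffer on a transaction-local list instead of the global delwri queue, and that list is drained exactly once per transaction. A condensed sketch (the list holds buffer references, so it must be submitted even when an item handler failed):

	LIST_HEAD	(buffer_list);
	int		error = 0, error2;

	list_for_each_entry(item, &trans->r_itemq, ri_list) {
		error = xlog_recover_commit_pass2(log, trans,
						  &buffer_list, item);
		if (error)
			break;		/* fall through to the submit below */
	}

	error2 = xfs_buf_delwri_submit(&buffer_list);
	return error ? error : error2;	/* the first error wins */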
2775STATIC int 2791STATIC int
@@ -3639,11 +3655,8 @@ xlog_do_recover(
3639 * First replay the images in the log. 3655 * First replay the images in the log.
3640 */ 3656 */
3641 error = xlog_do_log_recovery(log, head_blk, tail_blk); 3657 error = xlog_do_log_recovery(log, head_blk, tail_blk);
3642 if (error) { 3658 if (error)
3643 return error; 3659 return error;
3644 }
3645
3646 xfs_flush_buftarg(log->l_mp->m_ddev_targp, 1);
3647 3660
3648 /* 3661 /*
3649 * If IO errors happened during recovery, bail out. 3662 * If IO errors happened during recovery, bail out.
@@ -3670,7 +3683,6 @@ xlog_do_recover(
3670 bp = xfs_getsb(log->l_mp, 0); 3683 bp = xfs_getsb(log->l_mp, 0);
3671 XFS_BUF_UNDONE(bp); 3684 XFS_BUF_UNDONE(bp);
3672 ASSERT(!(XFS_BUF_ISWRITE(bp))); 3685 ASSERT(!(XFS_BUF_ISWRITE(bp)));
3673 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
3674 XFS_BUF_READ(bp); 3686 XFS_BUF_READ(bp);
3675 XFS_BUF_UNASYNC(bp); 3687 XFS_BUF_UNASYNC(bp);
3676 xfsbdstrat(log->l_mp, bp); 3688 xfsbdstrat(log->l_mp, bp);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index 95aecf52475d..755a9bd749d0 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -65,7 +65,8 @@ STATIC int
65xfs_qm_dquot_walk( 65xfs_qm_dquot_walk(
66 struct xfs_mount *mp, 66 struct xfs_mount *mp,
67 int type, 67 int type,
68 int (*execute)(struct xfs_dquot *dqp)) 68 int (*execute)(struct xfs_dquot *dqp, void *data),
69 void *data)
69{ 70{
70 struct xfs_quotainfo *qi = mp->m_quotainfo; 71 struct xfs_quotainfo *qi = mp->m_quotainfo;
71 struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type); 72 struct radix_tree_root *tree = XFS_DQUOT_TREE(qi, type);
@@ -97,7 +98,7 @@ restart:
97 98
98 next_index = be32_to_cpu(dqp->q_core.d_id) + 1; 99 next_index = be32_to_cpu(dqp->q_core.d_id) + 1;
99 100
100 error = execute(batch[i]); 101 error = execute(batch[i], data);
101 if (error == EAGAIN) { 102 if (error == EAGAIN) {
102 skipped++; 103 skipped++;
103 continue; 104 continue;
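The extra void pointer turns xfs_qm_dquot_walk() into a generic visitor: the caller hands an opaque cookie through to the callback. A hypothetical example (xfs_qm_count_over_soft and its caller are invented for illustration and are not part of the patch):

/* Count dquots whose reserved block usage exceeds the soft limit. */
STATIC int
xfs_qm_count_over_soft(
	struct xfs_dquot	*dqp,
	void			*data)
{
	int			*count = data;

	if (dqp->q_res_bcount > be64_to_cpu(dqp->q_core.d_blk_softlimit))
		(*count)++;
	return 0;
}

	/* caller: */
	int	count = 0;

	xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_count_over_soft, &count);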
@@ -129,7 +130,8 @@ restart:
129 */ 130 */
130STATIC int 131STATIC int
131xfs_qm_dqpurge( 132xfs_qm_dqpurge(
132 struct xfs_dquot *dqp) 133 struct xfs_dquot *dqp,
134 void *data)
133{ 135{
134 struct xfs_mount *mp = dqp->q_mount; 136 struct xfs_mount *mp = dqp->q_mount;
135 struct xfs_quotainfo *qi = mp->m_quotainfo; 137 struct xfs_quotainfo *qi = mp->m_quotainfo;
@@ -153,21 +155,7 @@ xfs_qm_dqpurge(
153 155
154 dqp->dq_flags |= XFS_DQ_FREEING; 156 dqp->dq_flags |= XFS_DQ_FREEING;
155 157
156 /* 158 xfs_dqflock(dqp);
157 * If we're turning off quotas, we have to make sure that, for
158 * example, we don't delete quota disk blocks while dquots are
159 * in the process of getting written to those disk blocks.
160 * This dquot might well be on AIL, and we can't leave it there
161 * if we're turning off quotas. Basically, we need this flush
162 * lock, and are willing to block on it.
163 */
164 if (!xfs_dqflock_nowait(dqp)) {
165 /*
166 * Block on the flush lock after nudging dquot buffer,
167 * if it is incore.
168 */
169 xfs_dqflock_pushbuf_wait(dqp);
170 }
171 159
172 /* 160 /*
173 * If we are turning this type of quotas off, we don't care 161 * If we are turning this type of quotas off, we don't care
@@ -231,11 +219,11 @@ xfs_qm_dqpurge_all(
231 uint flags) 219 uint flags)
232{ 220{
233 if (flags & XFS_QMOPT_UQUOTA) 221 if (flags & XFS_QMOPT_UQUOTA)
234 xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge); 222 xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_dqpurge, NULL);
235 if (flags & XFS_QMOPT_GQUOTA) 223 if (flags & XFS_QMOPT_GQUOTA)
236 xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge); 224 xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_dqpurge, NULL);
237 if (flags & XFS_QMOPT_PQUOTA) 225 if (flags & XFS_QMOPT_PQUOTA)
238 xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_dqpurge); 226 xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_dqpurge, NULL);
239} 227}
240 228
241/* 229/*
@@ -876,15 +864,16 @@ xfs_qm_reset_dqcounts(
876 864
877STATIC int 865STATIC int
878xfs_qm_dqiter_bufs( 866xfs_qm_dqiter_bufs(
879 xfs_mount_t *mp, 867 struct xfs_mount *mp,
880 xfs_dqid_t firstid, 868 xfs_dqid_t firstid,
881 xfs_fsblock_t bno, 869 xfs_fsblock_t bno,
882 xfs_filblks_t blkcnt, 870 xfs_filblks_t blkcnt,
883 uint flags) 871 uint flags,
872 struct list_head *buffer_list)
884{ 873{
885 xfs_buf_t *bp; 874 struct xfs_buf *bp;
886 int error; 875 int error;
887 int type; 876 int type;
888 877
889 ASSERT(blkcnt > 0); 878 ASSERT(blkcnt > 0);
890 type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER : 879 type = flags & XFS_QMOPT_UQUOTA ? XFS_DQ_USER :
@@ -908,7 +897,7 @@ xfs_qm_dqiter_bufs(
908 break; 897 break;
909 898
910 xfs_qm_reset_dqcounts(mp, bp, firstid, type); 899 xfs_qm_reset_dqcounts(mp, bp, firstid, type);
911 xfs_buf_delwri_queue(bp); 900 xfs_buf_delwri_queue(bp, buffer_list);
912 xfs_buf_relse(bp); 901 xfs_buf_relse(bp);
913 /* 902 /*
914 * goto the next block. 903 * goto the next block.
@@ -916,6 +905,7 @@ xfs_qm_dqiter_bufs(
916 bno++; 905 bno++;
917 firstid += mp->m_quotainfo->qi_dqperchunk; 906 firstid += mp->m_quotainfo->qi_dqperchunk;
918 } 907 }
908
919 return error; 909 return error;
920} 910}
921 911
@@ -925,11 +915,12 @@ xfs_qm_dqiter_bufs(
925 */ 915 */
926STATIC int 916STATIC int
927xfs_qm_dqiterate( 917xfs_qm_dqiterate(
928 xfs_mount_t *mp, 918 struct xfs_mount *mp,
929 xfs_inode_t *qip, 919 struct xfs_inode *qip,
930 uint flags) 920 uint flags,
921 struct list_head *buffer_list)
931{ 922{
932 xfs_bmbt_irec_t *map; 923 struct xfs_bmbt_irec *map;
933 int i, nmaps; /* number of map entries */ 924 int i, nmaps; /* number of map entries */
934 int error; /* return value */ 925 int error; /* return value */
935 xfs_fileoff_t lblkno; 926 xfs_fileoff_t lblkno;
@@ -996,21 +987,17 @@ xfs_qm_dqiterate(
996 * Iterate thru all the blks in the extent and 987 * Iterate thru all the blks in the extent and
997 * reset the counters of all the dquots inside them. 988 * reset the counters of all the dquots inside them.
998 */ 989 */
999 if ((error = xfs_qm_dqiter_bufs(mp, 990 error = xfs_qm_dqiter_bufs(mp, firstid,
1000 firstid, 991 map[i].br_startblock,
1001 map[i].br_startblock, 992 map[i].br_blockcount,
1002 map[i].br_blockcount, 993 flags, buffer_list);
1003 flags))) { 994 if (error)
1004 break; 995 goto out;
1005 }
1006 } 996 }
1007
1008 if (error)
1009 break;
1010 } while (nmaps > 0); 997 } while (nmaps > 0);
1011 998
999out:
1012 kmem_free(map); 1000 kmem_free(map);
1013
1014 return error; 1001 return error;
1015} 1002}
1016 1003
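The restructured loop above funnels every failure through a single exit label so the extent map is freed on all paths. In outline (a sketch of the surrounding xfs_qm_dqiterate() body after this hunk, with the bmapi extent lookup elided):

	map = kmem_alloc(XFS_DQITER_MAP_SIZE * sizeof(*map), KM_SLEEP);

	do {
		/* ... map the next batch of extents into map[] ... */
		for (i = 0; i < nmaps; i++) {
			error = xfs_qm_dqiter_bufs(mp, firstid,
						   map[i].br_startblock,
						   map[i].br_blockcount,
						   flags, buffer_list);
			if (error)
				goto out;
		}
	} while (nmaps > 0);

out:
	kmem_free(map);
	return error;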
@@ -1203,8 +1190,10 @@ error0:
1203 1190
1204STATIC int 1191STATIC int
1205xfs_qm_flush_one( 1192xfs_qm_flush_one(
1206 struct xfs_dquot *dqp) 1193 struct xfs_dquot *dqp,
1194 void *data)
1207{ 1195{
1196 struct list_head *buffer_list = data;
1208 struct xfs_buf *bp = NULL; 1197 struct xfs_buf *bp = NULL;
1209 int error = 0; 1198 int error = 0;
1210 1199
@@ -1214,14 +1203,12 @@ xfs_qm_flush_one(
1214 if (!XFS_DQ_IS_DIRTY(dqp)) 1203 if (!XFS_DQ_IS_DIRTY(dqp))
1215 goto out_unlock; 1204 goto out_unlock;
1216 1205
1217 if (!xfs_dqflock_nowait(dqp)) 1206 xfs_dqflock(dqp);
1218 xfs_dqflock_pushbuf_wait(dqp);
1219
1220 error = xfs_qm_dqflush(dqp, &bp); 1207 error = xfs_qm_dqflush(dqp, &bp);
1221 if (error) 1208 if (error)
1222 goto out_unlock; 1209 goto out_unlock;
1223 1210
1224 xfs_buf_delwri_queue(bp); 1211 xfs_buf_delwri_queue(bp, buffer_list);
1225 xfs_buf_relse(bp); 1212 xfs_buf_relse(bp);
1226out_unlock: 1213out_unlock:
1227 xfs_dqunlock(dqp); 1214 xfs_dqunlock(dqp);
@@ -1241,6 +1228,7 @@ xfs_qm_quotacheck(
1241 size_t structsz; 1228 size_t structsz;
1242 xfs_inode_t *uip, *gip; 1229 xfs_inode_t *uip, *gip;
1243 uint flags; 1230 uint flags;
1231 LIST_HEAD (buffer_list);
1244 1232
1245 count = INT_MAX; 1233 count = INT_MAX;
1246 structsz = 1; 1234 structsz = 1;
@@ -1259,7 +1247,8 @@ xfs_qm_quotacheck(
1259 */ 1247 */
1260 uip = mp->m_quotainfo->qi_uquotaip; 1248 uip = mp->m_quotainfo->qi_uquotaip;
1261 if (uip) { 1249 if (uip) {
1262 error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA); 1250 error = xfs_qm_dqiterate(mp, uip, XFS_QMOPT_UQUOTA,
1251 &buffer_list);
1263 if (error) 1252 if (error)
1264 goto error_return; 1253 goto error_return;
1265 flags |= XFS_UQUOTA_CHKD; 1254 flags |= XFS_UQUOTA_CHKD;
@@ -1268,7 +1257,8 @@ xfs_qm_quotacheck(
1268 gip = mp->m_quotainfo->qi_gquotaip; 1257 gip = mp->m_quotainfo->qi_gquotaip;
1269 if (gip) { 1258 if (gip) {
1270 error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ? 1259 error = xfs_qm_dqiterate(mp, gip, XFS_IS_GQUOTA_ON(mp) ?
1271 XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA); 1260 XFS_QMOPT_GQUOTA : XFS_QMOPT_PQUOTA,
1261 &buffer_list);
1272 if (error) 1262 if (error)
1273 goto error_return; 1263 goto error_return;
1274 flags |= XFS_OQUOTA_CHKD; 1264 flags |= XFS_OQUOTA_CHKD;
@@ -1291,19 +1281,27 @@ xfs_qm_quotacheck(
1291 * We've made all the changes that we need to make incore. Flush them 1281 * We've made all the changes that we need to make incore. Flush them
1292 * down to disk buffers if everything was updated successfully. 1282 * down to disk buffers if everything was updated successfully.
1293 */ 1283 */
1294 if (XFS_IS_UQUOTA_ON(mp)) 1284 if (XFS_IS_UQUOTA_ON(mp)) {
1295 error = xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_flush_one); 1285 error = xfs_qm_dquot_walk(mp, XFS_DQ_USER, xfs_qm_flush_one,
1286 &buffer_list);
1287 }
1296 if (XFS_IS_GQUOTA_ON(mp)) { 1288 if (XFS_IS_GQUOTA_ON(mp)) {
1297 error2 = xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_flush_one); 1289 error2 = xfs_qm_dquot_walk(mp, XFS_DQ_GROUP, xfs_qm_flush_one,
1290 &buffer_list);
1298 if (!error) 1291 if (!error)
1299 error = error2; 1292 error = error2;
1300 } 1293 }
1301 if (XFS_IS_PQUOTA_ON(mp)) { 1294 if (XFS_IS_PQUOTA_ON(mp)) {
1302 error2 = xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_flush_one); 1295 error2 = xfs_qm_dquot_walk(mp, XFS_DQ_PROJ, xfs_qm_flush_one,
1296 &buffer_list);
1303 if (!error) 1297 if (!error)
1304 error = error2; 1298 error = error2;
1305 } 1299 }
1306 1300
1301 error2 = xfs_buf_delwri_submit(&buffer_list);
1302 if (!error)
1303 error = error2;
1304
1307 /* 1305 /*
1308 * We can get this error if we couldn't do a dquot allocation inside 1306 * We can get this error if we couldn't do a dquot allocation inside
1309 * xfs_qm_dqusage_adjust (via bulkstat). We don't care about the 1307 * xfs_qm_dqusage_adjust (via bulkstat). We don't care about the
@@ -1317,15 +1315,6 @@ xfs_qm_quotacheck(
1317 } 1315 }
1318 1316
1319 /* 1317 /*
1320 * We didn't log anything, because if we crashed, we'll have to
1321 * start the quotacheck from scratch anyway. However, we must make
1322 * sure that our dquot changes are secure before we put the
1323 * quotacheck'd stamp on the superblock. So, here we do a synchronous
1324 * flush.
1325 */
1326 xfs_flush_buftarg(mp->m_ddev_targp, 1);
1327
1328 /*
1329 * If one type of quotas is off, then it will lose its 1318 * If one type of quotas is off, then it will lose its
1330 * quotachecked status, since we won't be doing accounting for 1319 * quotachecked status, since we won't be doing accounting for
1331 * that type anymore. 1320 * that type anymore.
@@ -1334,6 +1323,13 @@ xfs_qm_quotacheck(
1334 mp->m_qflags |= flags; 1323 mp->m_qflags |= flags;
1335 1324
1336 error_return: 1325 error_return:
1326 while (!list_empty(&buffer_list)) {
1327 struct xfs_buf *bp =
1328 list_first_entry(&buffer_list, struct xfs_buf, b_list);
1329 list_del_init(&bp->b_list);
1330 xfs_buf_relse(bp);
1331 }
1332
1337 if (error) { 1333 if (error) {
1338 xfs_warn(mp, 1334 xfs_warn(mp,
1339 "Quotacheck: Unsuccessful (Error %d): Disabling quotas.", 1335 "Quotacheck: Unsuccessful (Error %d): Disabling quotas.",
@@ -1450,6 +1446,7 @@ xfs_qm_dqfree_one(
1450STATIC void 1446STATIC void
1451xfs_qm_dqreclaim_one( 1447xfs_qm_dqreclaim_one(
1452 struct xfs_dquot *dqp, 1448 struct xfs_dquot *dqp,
1449 struct list_head *buffer_list,
1453 struct list_head *dispose_list) 1450 struct list_head *dispose_list)
1454{ 1451{
1455 struct xfs_mount *mp = dqp->q_mount; 1452 struct xfs_mount *mp = dqp->q_mount;
@@ -1482,21 +1479,11 @@ xfs_qm_dqreclaim_one(
1482 if (!xfs_dqflock_nowait(dqp)) 1479 if (!xfs_dqflock_nowait(dqp))
1483 goto out_busy; 1480 goto out_busy;
1484 1481
1485 /*
1486 * We have the flush lock so we know that this is not in the
1487 * process of being flushed. So, if this is dirty, flush it
1488 * DELWRI so that we don't get a freelist infested with
1489 * dirty dquots.
1490 */
1491 if (XFS_DQ_IS_DIRTY(dqp)) { 1482 if (XFS_DQ_IS_DIRTY(dqp)) {
1492 struct xfs_buf *bp = NULL; 1483 struct xfs_buf *bp = NULL;
1493 1484
1494 trace_xfs_dqreclaim_dirty(dqp); 1485 trace_xfs_dqreclaim_dirty(dqp);
1495 1486
1496 /*
1497 * We flush it delayed write, so don't bother releasing the
1498 * freelist lock.
1499 */
1500 error = xfs_qm_dqflush(dqp, &bp); 1487 error = xfs_qm_dqflush(dqp, &bp);
1501 if (error) { 1488 if (error) {
1502 xfs_warn(mp, "%s: dquot %p flush failed", 1489 xfs_warn(mp, "%s: dquot %p flush failed",
@@ -1504,7 +1491,7 @@ xfs_qm_dqreclaim_one(
1504 goto out_busy; 1491 goto out_busy;
1505 } 1492 }
1506 1493
1507 xfs_buf_delwri_queue(bp); 1494 xfs_buf_delwri_queue(bp, buffer_list);
1508 xfs_buf_relse(bp); 1495 xfs_buf_relse(bp);
1509 /* 1496 /*
1510 * Give the dquot another try on the freelist, as the 1497 * Give the dquot another try on the freelist, as the
@@ -1549,8 +1536,10 @@ xfs_qm_shake(
1549 struct xfs_quotainfo *qi = 1536 struct xfs_quotainfo *qi =
1550 container_of(shrink, struct xfs_quotainfo, qi_shrinker); 1537 container_of(shrink, struct xfs_quotainfo, qi_shrinker);
1551 int nr_to_scan = sc->nr_to_scan; 1538 int nr_to_scan = sc->nr_to_scan;
1539 LIST_HEAD (buffer_list);
1552 LIST_HEAD (dispose_list); 1540 LIST_HEAD (dispose_list);
1553 struct xfs_dquot *dqp; 1541 struct xfs_dquot *dqp;
1542 int error;
1554 1543
1555 if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT)) 1544 if ((sc->gfp_mask & (__GFP_FS|__GFP_WAIT)) != (__GFP_FS|__GFP_WAIT))
1556 return 0; 1545 return 0;
@@ -1563,15 +1552,20 @@ xfs_qm_shake(
1563 break; 1552 break;
1564 dqp = list_first_entry(&qi->qi_lru_list, struct xfs_dquot, 1553 dqp = list_first_entry(&qi->qi_lru_list, struct xfs_dquot,
1565 q_lru); 1554 q_lru);
1566 xfs_qm_dqreclaim_one(dqp, &dispose_list); 1555 xfs_qm_dqreclaim_one(dqp, &buffer_list, &dispose_list);
1567 } 1556 }
1568 mutex_unlock(&qi->qi_lru_lock); 1557 mutex_unlock(&qi->qi_lru_lock);
1569 1558
1559 error = xfs_buf_delwri_submit(&buffer_list);
1560 if (error)
1561 xfs_warn(NULL, "%s: dquot reclaim failed", __func__);
1562
1570 while (!list_empty(&dispose_list)) { 1563 while (!list_empty(&dispose_list)) {
1571 dqp = list_first_entry(&dispose_list, struct xfs_dquot, q_lru); 1564 dqp = list_first_entry(&dispose_list, struct xfs_dquot, q_lru);
1572 list_del_init(&dqp->q_lru); 1565 list_del_init(&dqp->q_lru);
1573 xfs_qm_dqfree_one(dqp); 1566 xfs_qm_dqfree_one(dqp);
1574 } 1567 }
1568
1575out: 1569out:
1576 return (qi->qi_lru_count / 100) * sysctl_vfs_cache_pressure; 1570 return (qi->qi_lru_count / 100) * sysctl_vfs_cache_pressure;
1577} 1571}
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 28d1f508b578..fa07b7731cf2 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -981,15 +981,7 @@ xfs_fs_put_super(
981{ 981{
982 struct xfs_mount *mp = XFS_M(sb); 982 struct xfs_mount *mp = XFS_M(sb);
983 983
984 /*
985 * Blow away any referenced inode in the filestreams cache.
986 * This can and will cause log traffic as inodes go inactive
987 * here.
988 */
989 xfs_filestream_unmount(mp); 984 xfs_filestream_unmount(mp);
990
991 xfs_flush_buftarg(mp->m_ddev_targp, 1);
992
993 xfs_unmountfs(mp); 985 xfs_unmountfs(mp);
994 xfs_syncd_stop(mp); 986 xfs_syncd_stop(mp);
995 xfs_freesb(mp); 987 xfs_freesb(mp);
@@ -1404,15 +1396,7 @@ out_destroy_workqueues:
1404 return -error; 1396 return -error;
1405 1397
1406 out_unmount: 1398 out_unmount:
1407 /*
1408 * Blow away any referenced inode in the filestreams cache.
1409 * This can and will cause log traffic as inodes go inactive
1410 * here.
1411 */
1412 xfs_filestream_unmount(mp); 1399 xfs_filestream_unmount(mp);
1413
1414 xfs_flush_buftarg(mp->m_ddev_targp, 1);
1415
1416 xfs_unmountfs(mp); 1400 xfs_unmountfs(mp);
1417 xfs_syncd_stop(mp); 1401 xfs_syncd_stop(mp);
1418 goto out_free_sb; 1402 goto out_free_sb;
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index 468c3c0a4f9f..cdb644fd0bd1 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -313,17 +313,10 @@ xfs_quiesce_data(
313 /* write superblock and hoover up shutdown errors */ 313 /* write superblock and hoover up shutdown errors */
314 error = xfs_sync_fsdata(mp); 314 error = xfs_sync_fsdata(mp);
315 315
316 /* make sure all delwri buffers are written out */
317 xfs_flush_buftarg(mp->m_ddev_targp, 1);
318
319 /* mark the log as covered if needed */ 316 /* mark the log as covered if needed */
320 if (xfs_log_need_covered(mp)) 317 if (xfs_log_need_covered(mp))
321 error2 = xfs_fs_log_dummy(mp); 318 error2 = xfs_fs_log_dummy(mp);
322 319
323 /* flush data-only devices */
324 if (mp->m_rtdev_targp)
325 xfs_flush_buftarg(mp->m_rtdev_targp, 1);
326
327 return error ? error : error2; 320 return error ? error : error2;
328} 321}
329 322
@@ -684,17 +677,6 @@ restart:
684 if (!xfs_iflock_nowait(ip)) { 677 if (!xfs_iflock_nowait(ip)) {
685 if (!(sync_mode & SYNC_WAIT)) 678 if (!(sync_mode & SYNC_WAIT))
686 goto out; 679 goto out;
687
688 /*
689 * If we only have a single dirty inode in a cluster there is
690 * a fair chance that the AIL push may have pushed it into
691 * the buffer, but xfsbufd won't touch it until 30 seconds
692 * from now, and thus we will lock up here.
693 *
694 * Promote the inode buffer to the front of the delwri list
695 * and wake up xfsbufd now.
696 */
697 xfs_promote_inode(ip);
698 xfs_iflock(ip); 680 xfs_iflock(ip);
699 } 681 }
700 682
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 06838c42b2a0..2e41756e263a 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -328,7 +328,7 @@ DEFINE_BUF_EVENT(xfs_buf_unlock);
328DEFINE_BUF_EVENT(xfs_buf_iowait); 328DEFINE_BUF_EVENT(xfs_buf_iowait);
329DEFINE_BUF_EVENT(xfs_buf_iowait_done); 329DEFINE_BUF_EVENT(xfs_buf_iowait_done);
330DEFINE_BUF_EVENT(xfs_buf_delwri_queue); 330DEFINE_BUF_EVENT(xfs_buf_delwri_queue);
331DEFINE_BUF_EVENT(xfs_buf_delwri_dequeue); 331DEFINE_BUF_EVENT(xfs_buf_delwri_queued);
332DEFINE_BUF_EVENT(xfs_buf_delwri_split); 332DEFINE_BUF_EVENT(xfs_buf_delwri_split);
333DEFINE_BUF_EVENT(xfs_buf_get_uncached); 333DEFINE_BUF_EVENT(xfs_buf_get_uncached);
334DEFINE_BUF_EVENT(xfs_bdstrat_shut); 334DEFINE_BUF_EVENT(xfs_bdstrat_shut);
@@ -486,12 +486,10 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_format_stale);
486DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin); 486DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pin);
487DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin); 487DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin);
488DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale); 488DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unpin_stale);
489DEFINE_BUF_ITEM_EVENT(xfs_buf_item_trylock);
490DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock); 489DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock);
491DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale); 490DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale);
492DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed); 491DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed);
493DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push); 492DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push);
494DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pushbuf);
495DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf); 493DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf);
496DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf_recur); 494DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf_recur);
497DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb); 495DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb);
@@ -881,10 +879,9 @@ DEFINE_EVENT(xfs_log_item_class, name, \
881 TP_PROTO(struct xfs_log_item *lip), \ 879 TP_PROTO(struct xfs_log_item *lip), \
882 TP_ARGS(lip)) 880 TP_ARGS(lip))
883DEFINE_LOG_ITEM_EVENT(xfs_ail_push); 881DEFINE_LOG_ITEM_EVENT(xfs_ail_push);
884DEFINE_LOG_ITEM_EVENT(xfs_ail_pushbuf);
885DEFINE_LOG_ITEM_EVENT(xfs_ail_pushbuf_pinned);
886DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned); 882DEFINE_LOG_ITEM_EVENT(xfs_ail_pinned);
887DEFINE_LOG_ITEM_EVENT(xfs_ail_locked); 883DEFINE_LOG_ITEM_EVENT(xfs_ail_locked);
884DEFINE_LOG_ITEM_EVENT(xfs_ail_flushing);
888 885
889 886
890DECLARE_EVENT_CLASS(xfs_file_class, 887DECLARE_EVENT_CLASS(xfs_file_class,
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index f6118703f20d..7ab99e1898c8 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -345,11 +345,9 @@ struct xfs_item_ops {
345 void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *); 345 void (*iop_format)(xfs_log_item_t *, struct xfs_log_iovec *);
346 void (*iop_pin)(xfs_log_item_t *); 346 void (*iop_pin)(xfs_log_item_t *);
347 void (*iop_unpin)(xfs_log_item_t *, int remove); 347 void (*iop_unpin)(xfs_log_item_t *, int remove);
348 uint (*iop_trylock)(xfs_log_item_t *); 348 uint (*iop_push)(struct xfs_log_item *, struct list_head *);
349 void (*iop_unlock)(xfs_log_item_t *); 349 void (*iop_unlock)(xfs_log_item_t *);
350 xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t); 350 xfs_lsn_t (*iop_committed)(xfs_log_item_t *, xfs_lsn_t);
351 void (*iop_push)(xfs_log_item_t *);
352 bool (*iop_pushbuf)(xfs_log_item_t *);
353 void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t); 351 void (*iop_committing)(xfs_log_item_t *, xfs_lsn_t);
354}; 352};
355 353
@@ -357,20 +355,18 @@ struct xfs_item_ops {
357#define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp) 355#define IOP_FORMAT(ip,vp) (*(ip)->li_ops->iop_format)(ip, vp)
358#define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip) 356#define IOP_PIN(ip) (*(ip)->li_ops->iop_pin)(ip)
359#define IOP_UNPIN(ip, remove) (*(ip)->li_ops->iop_unpin)(ip, remove) 357#define IOP_UNPIN(ip, remove) (*(ip)->li_ops->iop_unpin)(ip, remove)
360#define IOP_TRYLOCK(ip) (*(ip)->li_ops->iop_trylock)(ip) 358#define IOP_PUSH(ip, list) (*(ip)->li_ops->iop_push)(ip, list)
361#define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip) 359#define IOP_UNLOCK(ip) (*(ip)->li_ops->iop_unlock)(ip)
362#define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn) 360#define IOP_COMMITTED(ip, lsn) (*(ip)->li_ops->iop_committed)(ip, lsn)
363#define IOP_PUSH(ip) (*(ip)->li_ops->iop_push)(ip)
364#define IOP_PUSHBUF(ip) (*(ip)->li_ops->iop_pushbuf)(ip)
365#define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn) 361#define IOP_COMMITTING(ip, lsn) (*(ip)->li_ops->iop_committing)(ip, lsn)
366 362
367/* 363/*
368 * Return values for the IOP_TRYLOCK() routines. 364 * Return values for the IOP_PUSH() routines.
369 */ 365 */
370#define XFS_ITEM_SUCCESS 0 366#define XFS_ITEM_SUCCESS 0
371#define XFS_ITEM_PINNED 1 367#define XFS_ITEM_PINNED 1
372#define XFS_ITEM_LOCKED 2 368#define XFS_ITEM_LOCKED 2
373#define XFS_ITEM_PUSHBUF 3 369#define XFS_ITEM_FLUSHING 3
374 370
375/* 371/*
376 * This is the type of function which can be given to xfs_trans_callback() 372 * This is the type of function which can be given to xfs_trans_callback()
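Under the consolidated contract a single ->iop_push() call both attempts the work and reports the item's state via the return codes above. A hedged sketch of the expected shape for a hypothetical item type (FOO_ITEM() and every xfs_foo_* helper are invented for illustration):

STATIC uint
xfs_foo_item_push(
	struct xfs_log_item	*lip,
	struct list_head	*buffer_list)
{
	struct xfs_foo_item	*fip = FOO_ITEM(lip);
	struct xfs_buf		*bp;

	if (xfs_foo_ispinned(fip))
		return XFS_ITEM_PINNED;		/* caller will force the log */
	if (!xfs_foo_trylock(fip))
		return XFS_ITEM_LOCKED;
	if (xfs_foo_isflushing(fip)) {
		xfs_foo_unlock(fip);
		return XFS_ITEM_FLUSHING;	/* I/O already in flight */
	}

	if (!xfs_foo_flush(fip, &bp)) {
		xfs_buf_delwri_queue(bp, buffer_list);
		xfs_buf_relse(bp);
	}
	xfs_foo_unlock(fip);
	return XFS_ITEM_SUCCESS;
}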
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index 0425ca16738b..49d9cde33bb3 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -364,29 +364,31 @@ xfsaild_push(
364 xfs_log_item_t *lip; 364 xfs_log_item_t *lip;
365 xfs_lsn_t lsn; 365 xfs_lsn_t lsn;
366 xfs_lsn_t target; 366 xfs_lsn_t target;
367 long tout = 10; 367 long tout;
368 int stuck = 0; 368 int stuck = 0;
369 int flushing = 0;
369 int count = 0; 370 int count = 0;
370 int push_xfsbufd = 0;
371 371
372 /* 372 /*
373 * If last time we ran we encountered pinned items, force the log first 373 * If we encountered pinned items or did not finish writing out all
374 * and wait for it before pushing again. 374 * buffers the last time we ran, force the log first and wait for it
375 * before pushing again.
375 */ 376 */
376 spin_lock(&ailp->xa_lock); 377 if (ailp->xa_log_flush && ailp->xa_last_pushed_lsn == 0 &&
377 if (ailp->xa_last_pushed_lsn == 0 && ailp->xa_log_flush && 378 (!list_empty_careful(&ailp->xa_buf_list) ||
378 !list_empty(&ailp->xa_ail)) { 379 xfs_ail_min_lsn(ailp))) {
379 ailp->xa_log_flush = 0; 380 ailp->xa_log_flush = 0;
380 spin_unlock(&ailp->xa_lock); 381
381 XFS_STATS_INC(xs_push_ail_flush); 382 XFS_STATS_INC(xs_push_ail_flush);
382 xfs_log_force(mp, XFS_LOG_SYNC); 383 xfs_log_force(mp, XFS_LOG_SYNC);
383 spin_lock(&ailp->xa_lock);
384 } 384 }
385 385
386 spin_lock(&ailp->xa_lock);
386 lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->xa_last_pushed_lsn); 387 lip = xfs_trans_ail_cursor_first(ailp, &cur, ailp->xa_last_pushed_lsn);
387 if (!lip) { 388 if (!lip) {
388 /* 389 /*
389 * AIL is empty or our push has reached the end. 390 * If the AIL is empty or our push has reached the end, we are
391 * done now.
390 */ 392 */
391 xfs_trans_ail_cursor_done(ailp, &cur); 393 xfs_trans_ail_cursor_done(ailp, &cur);
392 spin_unlock(&ailp->xa_lock); 394 spin_unlock(&ailp->xa_lock);
@@ -395,55 +397,42 @@ xfsaild_push(
395 397
396 XFS_STATS_INC(xs_push_ail); 398 XFS_STATS_INC(xs_push_ail);
397 399
398 /*
399 * While the item we are looking at is below the given threshold
400 * try to flush it out. We'd like not to stop until we've at least
401 * tried to push on everything in the AIL with an LSN less than
402 * the given threshold.
403 *
404 * However, we will stop after a certain number of pushes and wait
405 * for a reduced timeout to fire before pushing further. This
406 * prevents use from spinning when we can't do anything or there is
407 * lots of contention on the AIL lists.
408 */
409 lsn = lip->li_lsn; 400 lsn = lip->li_lsn;
410 target = ailp->xa_target; 401 target = ailp->xa_target;
411 while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) { 402 while ((XFS_LSN_CMP(lip->li_lsn, target) <= 0)) {
412 int lock_result; 403 int lock_result;
404
413 /* 405 /*
414 * If we can lock the item without sleeping, unlock the AIL 406 * Note that IOP_PUSH may unlock and reacquire the AIL lock. We
415 * lock and flush the item. Then re-grab the AIL lock so we 407 * rely on the AIL cursor implementation to be able to deal with
416 * can look for the next item on the AIL. List changes are 408 * the dropped lock.
417 * handled by the AIL lookup functions internally
418 *
419 * If we can't lock the item, either its holder will flush it
420 * or it is already being flushed or it is being relogged. In
421 * any of these case it is being taken care of and we can just
422 * skip to the next item in the list.
423 */ 409 */
424 lock_result = IOP_TRYLOCK(lip); 410 lock_result = IOP_PUSH(lip, &ailp->xa_buf_list);
425 spin_unlock(&ailp->xa_lock);
426 switch (lock_result) { 411 switch (lock_result) {
427 case XFS_ITEM_SUCCESS: 412 case XFS_ITEM_SUCCESS:
428 XFS_STATS_INC(xs_push_ail_success); 413 XFS_STATS_INC(xs_push_ail_success);
429 trace_xfs_ail_push(lip); 414 trace_xfs_ail_push(lip);
430 415
431 IOP_PUSH(lip);
432 ailp->xa_last_pushed_lsn = lsn; 416 ailp->xa_last_pushed_lsn = lsn;
433 break; 417 break;
434 418
435 case XFS_ITEM_PUSHBUF: 419 case XFS_ITEM_FLUSHING:
436 XFS_STATS_INC(xs_push_ail_pushbuf); 420 /*
437 trace_xfs_ail_pushbuf(lip); 421 * The item or its backing buffer is already being
438 422 * flushed. The typical reason for that is that an
439 if (!IOP_PUSHBUF(lip)) { 423 * inode buffer is locked because we already pushed the
440 trace_xfs_ail_pushbuf_pinned(lip); 424 * updates to it as part of inode clustering.
441 stuck++; 425 *
442 ailp->xa_log_flush++; 426 * We do not want to stop flushing just because lots
443 } else { 427 * of items are already being flushed, but we need to
444 ailp->xa_last_pushed_lsn = lsn; 428 * re-try the flushing relatively soon if most of the
445 } 429 * AIL is being flushed.
446 push_xfsbufd = 1; 430 */
431 XFS_STATS_INC(xs_push_ail_flushing);
432 trace_xfs_ail_flushing(lip);
433
434 flushing++;
435 ailp->xa_last_pushed_lsn = lsn;
447 break; 436 break;
448 437
449 case XFS_ITEM_PINNED: 438 case XFS_ITEM_PINNED:
@@ -453,23 +442,22 @@ xfsaild_push(
453 stuck++; 442 stuck++;
454 ailp->xa_log_flush++; 443 ailp->xa_log_flush++;
455 break; 444 break;
456
457 case XFS_ITEM_LOCKED: 445 case XFS_ITEM_LOCKED:
458 XFS_STATS_INC(xs_push_ail_locked); 446 XFS_STATS_INC(xs_push_ail_locked);
459 trace_xfs_ail_locked(lip); 447 trace_xfs_ail_locked(lip);
448
460 stuck++; 449 stuck++;
461 break; 450 break;
462
463 default: 451 default:
464 ASSERT(0); 452 ASSERT(0);
465 break; 453 break;
466 } 454 }
467 455
468 spin_lock(&ailp->xa_lock);
469 count++; 456 count++;
470 457
471 /* 458 /*
472 * Are there too many items we can't do anything with? 459 * Are there too many items we can't do anything with?
460 *
473 * If we we are skipping too many items because we can't flush 461 * If we we are skipping too many items because we can't flush
474 * them or they are already being flushed, we back off and 462 * them or they are already being flushed, we back off and
475 * given them time to complete whatever operation is being 463 * given them time to complete whatever operation is being
@@ -491,42 +479,36 @@ xfsaild_push(
491 xfs_trans_ail_cursor_done(ailp, &cur); 479 xfs_trans_ail_cursor_done(ailp, &cur);
492 spin_unlock(&ailp->xa_lock); 480 spin_unlock(&ailp->xa_lock);
493 481
494 if (push_xfsbufd) { 482 if (xfs_buf_delwri_submit_nowait(&ailp->xa_buf_list))
495 /* we've got delayed write buffers to flush */ 483 ailp->xa_log_flush++;
496 wake_up_process(mp->m_ddev_targp->bt_task);
497 }
498 484
499 /* assume we have more work to do in a short while */ 485 if (!count || XFS_LSN_CMP(lsn, target) >= 0) {
500out_done: 486out_done:
501 if (!count) {
502 /* We're past our target or empty, so idle */
503 ailp->xa_last_pushed_lsn = 0;
504 ailp->xa_log_flush = 0;
505
506 tout = 50;
507 } else if (XFS_LSN_CMP(lsn, target) >= 0) {
508 /* 487 /*
509 * We reached the target so wait a bit longer for I/O to 488 * We reached the target or the AIL is empty, so wait a bit
510 * complete and remove pushed items from the AIL before we 489 * longer for I/O to complete and remove pushed items from the
511 * start the next scan from the start of the AIL. 490 * AIL before we start the next scan from the start of the AIL.
512 */ 491 */
513 tout = 50; 492 tout = 50;
514 ailp->xa_last_pushed_lsn = 0; 493 ailp->xa_last_pushed_lsn = 0;
515 } else if ((stuck * 100) / count > 90) { 494 } else if (((stuck + flushing) * 100) / count > 90) {
516 /* 495 /*
517 * Either there is a lot of contention on the AIL or we 496 * Either there is a lot of contention on the AIL or we are
518 * are stuck due to operations in progress. "Stuck" in this 497 * stuck due to operations in progress. "Stuck" in this case
519 * case is defined as >90% of the items we tried to push 498 * is defined as >90% of the items we tried to push were stuck.
520 * were stuck.
521 * 499 *
522 * Backoff a bit more to allow some I/O to complete before 500 * Backoff a bit more to allow some I/O to complete before
523 * restarting from the start of the AIL. This prevents us 501 * restarting from the start of the AIL. This prevents us from
524 * from spinning on the same items, and if they are pinned will 502 * spinning on the same items, and if they are pinned will allow
525 * allow the restart to issue a log force to unpin the stuck 503 * the restart to issue a log force to unpin the stuck items.
526 * items.
527 */ 504 */
528 tout = 20; 505 tout = 20;
529 ailp->xa_last_pushed_lsn = 0; 506 ailp->xa_last_pushed_lsn = 0;
507 } else {
508 /*
509 * Assume we have more work to do in a short while.
510 */
511 tout = 10;
530 } 512 }
531 513
532 return tout; 514 return tout;
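The tail of xfsaild_push() now chooses between three back-off regimes. The same policy, factored into a hypothetical helper purely for readability (not part of the patch):

static long
xfsaild_push_timeout(
	int		count,
	int		stuck,
	int		flushing,
	xfs_lsn_t	lsn,
	xfs_lsn_t	target)
{
	if (!count || XFS_LSN_CMP(lsn, target) >= 0)
		return 50;	/* target reached or AIL empty: let I/O drain */
	if (((stuck + flushing) * 100) / count > 90)
		return 20;	/* >90% of pushes stuck: back off harder */
	return 10;		/* more work expected shortly */
}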
@@ -539,6 +521,8 @@ xfsaild(
539 struct xfs_ail *ailp = data; 521 struct xfs_ail *ailp = data;
540 long tout = 0; /* milliseconds */ 522 long tout = 0; /* milliseconds */
541 523
524 current->flags |= PF_MEMALLOC;
525
542 while (!kthread_should_stop()) { 526 while (!kthread_should_stop()) {
543 if (tout && tout <= 20) 527 if (tout && tout <= 20)
544 __set_current_state(TASK_KILLABLE); 528 __set_current_state(TASK_KILLABLE);
@@ -794,6 +778,7 @@ xfs_trans_ail_init(
794 INIT_LIST_HEAD(&ailp->xa_ail); 778 INIT_LIST_HEAD(&ailp->xa_ail);
795 INIT_LIST_HEAD(&ailp->xa_cursors); 779 INIT_LIST_HEAD(&ailp->xa_cursors);
796 spin_lock_init(&ailp->xa_lock); 780 spin_lock_init(&ailp->xa_lock);
781 INIT_LIST_HEAD(&ailp->xa_buf_list);
797 init_waitqueue_head(&ailp->xa_empty); 782 init_waitqueue_head(&ailp->xa_empty);
798 783
799 ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s", 784 ailp->xa_task = kthread_run(xfsaild, ailp, "xfsaild/%s",
diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c
index 296a7995a007..9132d162c4b8 100644
--- a/fs/xfs/xfs_trans_buf.c
+++ b/fs/xfs/xfs_trans_buf.c
@@ -165,14 +165,6 @@ xfs_trans_get_buf(xfs_trans_t *tp,
165 XFS_BUF_DONE(bp); 165 XFS_BUF_DONE(bp);
166 } 166 }
167 167
168 /*
169 * If the buffer is stale then it was binval'ed
170 * since last read. This doesn't matter since the
171 * caller isn't allowed to use the data anyway.
172 */
173 else if (XFS_BUF_ISSTALE(bp))
174 ASSERT(!XFS_BUF_ISDELAYWRITE(bp));
175
176 ASSERT(bp->b_transp == tp); 168 ASSERT(bp->b_transp == tp);
177 bip = bp->b_fspriv; 169 bip = bp->b_fspriv;
178 ASSERT(bip != NULL); 170 ASSERT(bip != NULL);
@@ -418,19 +410,6 @@ xfs_trans_read_buf(
418 return 0; 410 return 0;
419 411
420shutdown_abort: 412shutdown_abort:
421 /*
422 * the theory here is that buffer is good but we're
423 * bailing out because the filesystem is being forcibly
424 * shut down. So we should leave the b_flags alone since
425 * the buffer's not staled and just get out.
426 */
427#if defined(DEBUG)
428 if (XFS_BUF_ISSTALE(bp) && XFS_BUF_ISDELAYWRITE(bp))
429 xfs_notice(mp, "about to pop assert, bp == 0x%p", bp);
430#endif
431 ASSERT((bp->b_flags & (XBF_STALE|XBF_DELWRI)) !=
432 (XBF_STALE|XBF_DELWRI));
433
434 trace_xfs_trans_read_buf_shut(bp, _RET_IP_); 413 trace_xfs_trans_read_buf_shut(bp, _RET_IP_);
435 xfs_buf_relse(bp); 414 xfs_buf_relse(bp);
436 *bpp = NULL; 415 *bpp = NULL;
@@ -649,22 +628,33 @@ xfs_trans_log_buf(xfs_trans_t *tp,
649 628
650 629
651/* 630/*
652 * This called to invalidate a buffer that is being used within 631 * Invalidate a buffer that is being used within a transaction.
653 * a transaction. Typically this is because the blocks in the 632 *
654 * buffer are being freed, so we need to prevent it from being 633 * Typically this is because the blocks in the buffer are being freed, so we
655 * written out when we're done. Allowing it to be written again 634 * need to prevent it from being written out when we're done. Allowing it
656 * might overwrite data in the free blocks if they are reallocated 635 * to be written again might overwrite data in the free blocks if they are
657 * to a file. 636 * reallocated to a file.
658 * 637 *
659 * We prevent the buffer from being written out by clearing the 638 * We prevent the buffer from being written out by marking it stale. We can't
660 * B_DELWRI flag. We can't always 639 * get rid of the buf log item at this point because the buffer may still be
661 * get rid of the buf log item at this point, though, because 640 * pinned by another transaction. If that is the case, then we'll wait until
662 * the buffer may still be pinned by another transaction. If that 641 * the buffer is committed to disk for the last time (we can tell by the ref
663 * is the case, then we'll wait until the buffer is committed to 642 * count) and free it in xfs_buf_item_unpin(). Until that happens we will
664 * disk for the last time (we can tell by the ref count) and 643 * keep the buffer locked so that the buffer and buf log item are not reused.
665 * free it in xfs_buf_item_unpin(). Until it is cleaned up we 644 *
666 * will keep the buffer locked so that the buffer and buf log item 645 * We also set the XFS_BLF_CANCEL flag in the buf log format structure and log
667 * are not reused. 646 * the buf item. This will be used at recovery time to determine that copies
647 * of the buffer in the log before this should not be replayed.
648 *
649 * We mark the item descriptor and the transaction dirty so that we'll hold
650 * the buffer until after the commit.
651 *
652 * Since we're invalidating the buffer, we also clear the state about which
653 * parts of the buffer have been logged. We also clear the flag indicating
654 * that this is an inode buffer since the data in the buffer will no longer
655 * be valid.
656 *
657 * We set the stale bit in the buffer as well since we're getting rid of it.
668 */ 658 */
669void 659void
670xfs_trans_binval( 660xfs_trans_binval(
@@ -684,7 +674,6 @@ xfs_trans_binval(
684 * If the buffer is already invalidated, then 674 * If the buffer is already invalidated, then
685 * just return. 675 * just return.
686 */ 676 */
687 ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
688 ASSERT(XFS_BUF_ISSTALE(bp)); 677 ASSERT(XFS_BUF_ISSTALE(bp));
689 ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY))); 678 ASSERT(!(bip->bli_flags & (XFS_BLI_LOGGED | XFS_BLI_DIRTY)));
690 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF)); 679 ASSERT(!(bip->bli_format.blf_flags & XFS_BLF_INODE_BUF));
@@ -694,27 +683,8 @@ xfs_trans_binval(
694 return; 683 return;
695 } 684 }
696 685
697 /*
698 * Clear the dirty bit in the buffer and set the STALE flag
699 * in the buf log item. The STALE flag will be used in
700 * xfs_buf_item_unpin() to determine if it should clean up
701 * when the last reference to the buf item is given up.
702 * We set the XFS_BLF_CANCEL flag in the buf log format structure
703 * and log the buf item. This will be used at recovery time
704 * to determine that copies of the buffer in the log before
705 * this should not be replayed.
706 * We mark the item descriptor and the transaction dirty so
707 * that we'll hold the buffer until after the commit.
708 *
709 * Since we're invalidating the buffer, we also clear the state
710 * about which parts of the buffer have been logged. We also
711 * clear the flag indicating that this is an inode buffer since
712 * the data in the buffer will no longer be valid.
713 *
714 * We set the stale bit in the buffer as well since we're getting
715 * rid of it.
716 */
717 xfs_buf_stale(bp); 686 xfs_buf_stale(bp);
687
718 bip->bli_flags |= XFS_BLI_STALE; 688 bip->bli_flags |= XFS_BLI_STALE;
719 bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY); 689 bip->bli_flags &= ~(XFS_BLI_INODE_BUF | XFS_BLI_LOGGED | XFS_BLI_DIRTY);
720 bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF; 690 bip->bli_format.blf_flags &= ~XFS_BLF_INODE_BUF;
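For context, the caller pattern the rewritten comment describes, invalidating a buffer whose blocks the transaction is freeing, looks roughly like this (a sketch with locking and error handling elided; blkno and numblks are placeholders):

	struct xfs_buf	*bp;

	/* the transaction already owns the blocks being freed */
	bp = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, numblks, 0);
	if (bp)
		xfs_trans_binval(tp, bp);	/* stale it: never write it back */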
diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h
index 218304a8cdc7..f72bdd48a5c1 100644
--- a/fs/xfs/xfs_trans_priv.h
+++ b/fs/xfs/xfs_trans_priv.h
@@ -71,6 +71,7 @@ struct xfs_ail {
71 spinlock_t xa_lock; 71 spinlock_t xa_lock;
72 xfs_lsn_t xa_last_pushed_lsn; 72 xfs_lsn_t xa_last_pushed_lsn;
73 int xa_log_flush; 73 int xa_log_flush;
74 struct list_head xa_buf_list;
74 wait_queue_head_t xa_empty; 75 wait_queue_head_t xa_empty;
75}; 76};
76 77