aboutsummaryrefslogtreecommitdiffstats
path: root/fs
diff options
context:
space:
mode:
authorDave Chinner <david@fromorbit.com>2010-02-01 18:13:42 -0500
committerDave Chinner <david@fromorbit.com>2010-02-01 18:13:42 -0500
commitd808f617ad00a413585b806de340feda5ad9a2da (patch)
treeed03d4d019a9d8b566ffd454e112e9fbce70bad8 /fs
parentc854363e80b49dd04a4de18ebc379eb8c8806674 (diff)
xfs: Don't issue buffer IO direct from AIL push V2
All buffers logged into the AIL are marked as delayed write. When the AIL needs to push the buffer out, it issues an async write of the buffer. This means that IO patterns are dependent on the order of buffers in the AIL. Instead of flushing the buffer, promote the buffer in the delayed write list so that the next time the xfsbufd is run the buffer will be flushed by the xfsbufd. Return the state to the xfsaild that the buffer was promoted so that the xfsaild knows that it needs to cause the xfsbufd to run to flush the buffers that were promoted. Using the xfsbufd for issuing the IO allows us to dispatch all buffer IO from the one queue. This means that we can make much more enlightened decisions on what order to flush buffers to disk as we don't have multiple places issuing IO. Optimisations to xfsbufd will be in a future patch. Version 2 - kill XFS_ITEM_FLUSHING as it is now unused. Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de>
Diffstat (limited to 'fs')
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.c29
-rw-r--r--fs/xfs/linux-2.6/xfs_buf.h2
-rw-r--r--fs/xfs/linux-2.6/xfs_trace.h1
-rw-r--r--fs/xfs/quota/xfs_dquot_item.c85
-rw-r--r--fs/xfs/quota/xfs_dquot_item.h4
-rw-r--r--fs/xfs/xfs_buf_item.c64
-rw-r--r--fs/xfs/xfs_inode_item.c98
-rw-r--r--fs/xfs/xfs_inode_item.h6
-rw-r--r--fs/xfs/xfs_trans.h3
-rw-r--r--fs/xfs/xfs_trans_ail.c13
10 files changed, 102 insertions, 203 deletions
diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c
index 44e20e578ba0..b306265caa33 100644
--- a/fs/xfs/linux-2.6/xfs_buf.c
+++ b/fs/xfs/linux-2.6/xfs_buf.c
@@ -1778,6 +1778,35 @@ xfs_buf_delwri_dequeue(
1778 trace_xfs_buf_delwri_dequeue(bp, _RET_IP_); 1778 trace_xfs_buf_delwri_dequeue(bp, _RET_IP_);
1779} 1779}
1780 1780
1781/*
1782 * If a delwri buffer needs to be pushed before it has aged out, then promote
1783 * it to the head of the delwri queue so that it will be flushed on the next
1784 * xfsbufd run. We do this by resetting the queuetime of the buffer to be older
1785 * than the age currently needed to flush the buffer. Hence the next time the
1786 * xfsbufd sees it is guaranteed to be considered old enough to flush.
1787 */
1788void
1789xfs_buf_delwri_promote(
1790 struct xfs_buf *bp)
1791{
1792 struct xfs_buftarg *btp = bp->b_target;
1793 long age = xfs_buf_age_centisecs * msecs_to_jiffies(10) + 1;
1794
1795 ASSERT(bp->b_flags & XBF_DELWRI);
1796 ASSERT(bp->b_flags & _XBF_DELWRI_Q);
1797
1798 /*
1799 * Check the buffer age before locking the delayed write queue as we
1800 * don't need to promote buffers that are already past the flush age.
1801 */
1802 if (bp->b_queuetime < jiffies - age)
1803 return;
1804 bp->b_queuetime = jiffies - age;
1805 spin_lock(&btp->bt_delwrite_lock);
1806 list_move(&bp->b_list, &btp->bt_delwrite_queue);
1807 spin_unlock(&btp->bt_delwrite_lock);
1808}
1809
1781STATIC void 1810STATIC void
1782xfs_buf_runall_queues( 1811xfs_buf_runall_queues(
1783 struct workqueue_struct *queue) 1812 struct workqueue_struct *queue)
diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h
index ea8c198f0c39..be45e8c5768d 100644
--- a/fs/xfs/linux-2.6/xfs_buf.h
+++ b/fs/xfs/linux-2.6/xfs_buf.h
@@ -266,6 +266,7 @@ extern int xfs_buf_ispin(xfs_buf_t *);
266 266
267/* Delayed Write Buffer Routines */ 267/* Delayed Write Buffer Routines */
268extern void xfs_buf_delwri_dequeue(xfs_buf_t *); 268extern void xfs_buf_delwri_dequeue(xfs_buf_t *);
269extern void xfs_buf_delwri_promote(xfs_buf_t *);
269 270
270/* Buffer Daemon Setup Routines */ 271/* Buffer Daemon Setup Routines */
271extern int xfs_buf_init(void); 272extern int xfs_buf_init(void);
@@ -395,6 +396,7 @@ extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *);
395extern void xfs_wait_buftarg(xfs_buftarg_t *); 396extern void xfs_wait_buftarg(xfs_buftarg_t *);
396extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); 397extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int);
397extern int xfs_flush_buftarg(xfs_buftarg_t *, int); 398extern int xfs_flush_buftarg(xfs_buftarg_t *, int);
399
398#ifdef CONFIG_KDB_MODULES 400#ifdef CONFIG_KDB_MODULES
399extern struct list_head *xfs_get_buftarg_list(void); 401extern struct list_head *xfs_get_buftarg_list(void);
400#endif 402#endif
diff --git a/fs/xfs/linux-2.6/xfs_trace.h b/fs/xfs/linux-2.6/xfs_trace.h
index 1bb09e70b2eb..a4574dcf5065 100644
--- a/fs/xfs/linux-2.6/xfs_trace.h
+++ b/fs/xfs/linux-2.6/xfs_trace.h
@@ -483,6 +483,7 @@ DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock);
483DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale); 483DEFINE_BUF_ITEM_EVENT(xfs_buf_item_unlock_stale);
484DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed); 484DEFINE_BUF_ITEM_EVENT(xfs_buf_item_committed);
485DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push); 485DEFINE_BUF_ITEM_EVENT(xfs_buf_item_push);
486DEFINE_BUF_ITEM_EVENT(xfs_buf_item_pushbuf);
486DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf); 487DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf);
487DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf_recur); 488DEFINE_BUF_ITEM_EVENT(xfs_trans_get_buf_recur);
488DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb); 489DEFINE_BUF_ITEM_EVENT(xfs_trans_getsb);
diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c
index 1b564376d50c..dda0fb045c8a 100644
--- a/fs/xfs/quota/xfs_dquot_item.c
+++ b/fs/xfs/quota/xfs_dquot_item.c
@@ -212,66 +212,31 @@ xfs_qm_dquot_logitem_pushbuf(
212 xfs_dquot_t *dqp; 212 xfs_dquot_t *dqp;
213 xfs_mount_t *mp; 213 xfs_mount_t *mp;
214 xfs_buf_t *bp; 214 xfs_buf_t *bp;
215 uint dopush;
216 215
217 dqp = qip->qli_dquot; 216 dqp = qip->qli_dquot;
218 ASSERT(XFS_DQ_IS_LOCKED(dqp)); 217 ASSERT(XFS_DQ_IS_LOCKED(dqp));
219 218
220 /* 219 /*
221 * The qli_pushbuf_flag keeps others from
222 * trying to duplicate our effort.
223 */
224 ASSERT(qip->qli_pushbuf_flag != 0);
225 ASSERT(qip->qli_push_owner == current_pid());
226
227 /*
228 * If flushlock isn't locked anymore, chances are that the 220 * If flushlock isn't locked anymore, chances are that the
229 * inode flush completed and the inode was taken off the AIL. 221 * inode flush completed and the inode was taken off the AIL.
230 * So, just get out. 222 * So, just get out.
231 */ 223 */
232 if (completion_done(&dqp->q_flush) || 224 if (completion_done(&dqp->q_flush) ||
233 ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) { 225 ((qip->qli_item.li_flags & XFS_LI_IN_AIL) == 0)) {
234 qip->qli_pushbuf_flag = 0;
235 xfs_dqunlock(dqp); 226 xfs_dqunlock(dqp);
236 return; 227 return;
237 } 228 }
238 mp = dqp->q_mount; 229 mp = dqp->q_mount;
239 bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno, 230 bp = xfs_incore(mp->m_ddev_targp, qip->qli_format.qlf_blkno,
240 XFS_QI_DQCHUNKLEN(mp), XBF_TRYLOCK); 231 XFS_QI_DQCHUNKLEN(mp), XBF_TRYLOCK);
241 if (bp != NULL) { 232 xfs_dqunlock(dqp);
242 if (XFS_BUF_ISDELAYWRITE(bp)) { 233 if (!bp)
243 dopush = ((qip->qli_item.li_flags & XFS_LI_IN_AIL) &&
244 !completion_done(&dqp->q_flush));
245 qip->qli_pushbuf_flag = 0;
246 xfs_dqunlock(dqp);
247
248 if (XFS_BUF_ISPINNED(bp))
249 xfs_log_force(mp, 0);
250
251 if (dopush) {
252 int error;
253#ifdef XFSRACEDEBUG
254 delay_for_intr();
255 delay(300);
256#endif
257 error = xfs_bawrite(mp, bp);
258 if (error)
259 xfs_fs_cmn_err(CE_WARN, mp,
260 "xfs_qm_dquot_logitem_pushbuf: pushbuf error %d on qip %p, bp %p",
261 error, qip, bp);
262 } else {
263 xfs_buf_relse(bp);
264 }
265 } else {
266 qip->qli_pushbuf_flag = 0;
267 xfs_dqunlock(dqp);
268 xfs_buf_relse(bp);
269 }
270 return; 234 return;
271 } 235 if (XFS_BUF_ISDELAYWRITE(bp))
236 xfs_buf_delwri_promote(bp);
237 xfs_buf_relse(bp);
238 return;
272 239
273 qip->qli_pushbuf_flag = 0;
274 xfs_dqunlock(dqp);
275} 240}
276 241
277/* 242/*
@@ -289,50 +254,24 @@ xfs_qm_dquot_logitem_trylock(
289 xfs_dq_logitem_t *qip) 254 xfs_dq_logitem_t *qip)
290{ 255{
291 xfs_dquot_t *dqp; 256 xfs_dquot_t *dqp;
292 uint retval;
293 257
294 dqp = qip->qli_dquot; 258 dqp = qip->qli_dquot;
295 if (atomic_read(&dqp->q_pincount) > 0) 259 if (atomic_read(&dqp->q_pincount) > 0)
296 return (XFS_ITEM_PINNED); 260 return XFS_ITEM_PINNED;
297 261
298 if (! xfs_qm_dqlock_nowait(dqp)) 262 if (! xfs_qm_dqlock_nowait(dqp))
299 return (XFS_ITEM_LOCKED); 263 return XFS_ITEM_LOCKED;
300 264
301 retval = XFS_ITEM_SUCCESS;
302 if (!xfs_dqflock_nowait(dqp)) { 265 if (!xfs_dqflock_nowait(dqp)) {
303 /* 266 /*
304 * The dquot is already being flushed. It may have been 267 * dquot has already been flushed to the backing buffer,
305 * flushed delayed write, however, and we don't want to 268 * leave it locked, pushbuf routine will unlock it.
306 * get stuck waiting for that to complete. So, we want to check
307 * to see if we can lock the dquot's buffer without sleeping.
308 * If we can and it is marked for delayed write, then we
309 * hold it and send it out from the push routine. We don't
310 * want to do that now since we might sleep in the device
311 * strategy routine. We also don't want to grab the buffer lock
312 * here because we'd like not to call into the buffer cache
313 * while holding the AIL lock.
314 * Make sure to only return PUSHBUF if we set pushbuf_flag
315 * ourselves. If someone else is doing it then we don't
316 * want to go to the push routine and duplicate their efforts.
317 */ 269 */
318 if (qip->qli_pushbuf_flag == 0) { 270 return XFS_ITEM_PUSHBUF;
319 qip->qli_pushbuf_flag = 1;
320 ASSERT(qip->qli_format.qlf_blkno == dqp->q_blkno);
321#ifdef DEBUG
322 qip->qli_push_owner = current_pid();
323#endif
324 /*
325 * The dquot is left locked.
326 */
327 retval = XFS_ITEM_PUSHBUF;
328 } else {
329 retval = XFS_ITEM_FLUSHING;
330 xfs_dqunlock_nonotify(dqp);
331 }
332 } 271 }
333 272
334 ASSERT(qip->qli_item.li_flags & XFS_LI_IN_AIL); 273 ASSERT(qip->qli_item.li_flags & XFS_LI_IN_AIL);
335 return (retval); 274 return XFS_ITEM_SUCCESS;
336} 275}
337 276
338 277
diff --git a/fs/xfs/quota/xfs_dquot_item.h b/fs/xfs/quota/xfs_dquot_item.h
index 5a632531f843..5acae2ada70b 100644
--- a/fs/xfs/quota/xfs_dquot_item.h
+++ b/fs/xfs/quota/xfs_dquot_item.h
@@ -27,10 +27,6 @@ typedef struct xfs_dq_logitem {
27 xfs_log_item_t qli_item; /* common portion */ 27 xfs_log_item_t qli_item; /* common portion */
28 struct xfs_dquot *qli_dquot; /* dquot ptr */ 28 struct xfs_dquot *qli_dquot; /* dquot ptr */
29 xfs_lsn_t qli_flush_lsn; /* lsn at last flush */ 29 xfs_lsn_t qli_flush_lsn; /* lsn at last flush */
30 unsigned short qli_pushbuf_flag; /* 1 bit used in push_ail */
31#ifdef DEBUG
32 uint64_t qli_push_owner;
33#endif
34 xfs_dq_logformat_t qli_format; /* logged structure */ 30 xfs_dq_logformat_t qli_format; /* logged structure */
35} xfs_dq_logitem_t; 31} xfs_dq_logitem_t;
36 32
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index e0a11583ce5a..f3c49e69eab9 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -467,8 +467,10 @@ xfs_buf_item_unpin_remove(
467/* 467/*
468 * This is called to attempt to lock the buffer associated with this 468 * This is called to attempt to lock the buffer associated with this
469 * buf log item. Don't sleep on the buffer lock. If we can't get 469 * buf log item. Don't sleep on the buffer lock. If we can't get
470 * the lock right away, return 0. If we can get the lock, pull the 470 * the lock right away, return 0. If we can get the lock, take a
471 * buffer from the free list, mark it busy, and return 1. 471 * reference to the buffer. If this is a delayed write buffer that
472 * needs AIL help to be written back, invoke the pushbuf routine
473 * rather than the normal success path.
472 */ 474 */
473STATIC uint 475STATIC uint
474xfs_buf_item_trylock( 476xfs_buf_item_trylock(
@@ -477,24 +479,18 @@ xfs_buf_item_trylock(
477 xfs_buf_t *bp; 479 xfs_buf_t *bp;
478 480
479 bp = bip->bli_buf; 481 bp = bip->bli_buf;
480 482 if (XFS_BUF_ISPINNED(bp))
481 if (XFS_BUF_ISPINNED(bp)) {
482 return XFS_ITEM_PINNED; 483 return XFS_ITEM_PINNED;
483 } 484 if (!XFS_BUF_CPSEMA(bp))
484
485 if (!XFS_BUF_CPSEMA(bp)) {
486 return XFS_ITEM_LOCKED; 485 return XFS_ITEM_LOCKED;
487 }
488 486
489 /* 487 /* take a reference to the buffer. */
490 * Remove the buffer from the free list. Only do this
491 * if it's on the free list. Private buffers like the
492 * superblock buffer are not.
493 */
494 XFS_BUF_HOLD(bp); 488 XFS_BUF_HOLD(bp);
495 489
496 ASSERT(!(bip->bli_flags & XFS_BLI_STALE)); 490 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
497 trace_xfs_buf_item_trylock(bip); 491 trace_xfs_buf_item_trylock(bip);
492 if (XFS_BUF_ISDELAYWRITE(bp))
493 return XFS_ITEM_PUSHBUF;
498 return XFS_ITEM_SUCCESS; 494 return XFS_ITEM_SUCCESS;
499} 495}
500 496
@@ -626,11 +622,9 @@ xfs_buf_item_committed(
626} 622}
627 623
628/* 624/*
629 * This is called to asynchronously write the buffer associated with this 625 * The buffer is locked, but is not a delayed write buffer. This happens
630 * buf log item out to disk. The buffer will already have been locked by 626 * if we race with IO completion and hence we don't want to try to write it
631 * a successful call to xfs_buf_item_trylock(). If the buffer still has 627 * again. Just release the buffer.
632 * B_DELWRI set, then get it going out to disk with a call to bawrite().
633 * If not, then just release the buffer.
634 */ 628 */
635STATIC void 629STATIC void
636xfs_buf_item_push( 630xfs_buf_item_push(
@@ -642,17 +636,29 @@ xfs_buf_item_push(
642 trace_xfs_buf_item_push(bip); 636 trace_xfs_buf_item_push(bip);
643 637
644 bp = bip->bli_buf; 638 bp = bip->bli_buf;
639 ASSERT(!XFS_BUF_ISDELAYWRITE(bp));
640 xfs_buf_relse(bp);
641}
645 642
646 if (XFS_BUF_ISDELAYWRITE(bp)) { 643/*
647 int error; 644 * The buffer is locked and is a delayed write buffer. Promote the buffer
648 error = xfs_bawrite(bip->bli_item.li_mountp, bp); 645 * in the delayed write queue as the caller knows that they must invoke
649 if (error) 646 * the xfsbufd to get this buffer written. We have to unlock the buffer
650 xfs_fs_cmn_err(CE_WARN, bip->bli_item.li_mountp, 647 * to allow the xfsbufd to write it, too.
651 "xfs_buf_item_push: pushbuf error %d on bip %p, bp %p", 648 */
652 error, bip, bp); 649STATIC void
653 } else { 650xfs_buf_item_pushbuf(
654 xfs_buf_relse(bp); 651 xfs_buf_log_item_t *bip)
655 } 652{
653 xfs_buf_t *bp;
654
655 ASSERT(!(bip->bli_flags & XFS_BLI_STALE));
656 trace_xfs_buf_item_pushbuf(bip);
657
658 bp = bip->bli_buf;
659 ASSERT(XFS_BUF_ISDELAYWRITE(bp));
660 xfs_buf_delwri_promote(bp);
661 xfs_buf_relse(bp);
656} 662}
657 663
658/* ARGSUSED */ 664/* ARGSUSED */
@@ -677,7 +683,7 @@ static struct xfs_item_ops xfs_buf_item_ops = {
677 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t)) 683 .iop_committed = (xfs_lsn_t(*)(xfs_log_item_t*, xfs_lsn_t))
678 xfs_buf_item_committed, 684 xfs_buf_item_committed,
679 .iop_push = (void(*)(xfs_log_item_t*))xfs_buf_item_push, 685 .iop_push = (void(*)(xfs_log_item_t*))xfs_buf_item_push,
680 .iop_pushbuf = NULL, 686 .iop_pushbuf = (void(*)(xfs_log_item_t*))xfs_buf_item_pushbuf,
681 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t)) 687 .iop_committing = (void(*)(xfs_log_item_t*, xfs_lsn_t))
682 xfs_buf_item_committing 688 xfs_buf_item_committing
683}; 689};
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 207553e82954..d4dc063111f8 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -602,33 +602,20 @@ xfs_inode_item_trylock(
602 602
603 if (!xfs_iflock_nowait(ip)) { 603 if (!xfs_iflock_nowait(ip)) {
604 /* 604 /*
605 * If someone else isn't already trying to push the inode 605 * inode has already been flushed to the backing buffer,
606 * buffer, we get to do it. 606 * leave it locked in shared mode, pushbuf routine will
607 * unlock it.
607 */ 608 */
608 if (iip->ili_pushbuf_flag == 0) { 609 return XFS_ITEM_PUSHBUF;
609 iip->ili_pushbuf_flag = 1;
610#ifdef DEBUG
611 iip->ili_push_owner = current_pid();
612#endif
613 /*
614 * Inode is left locked in shared mode.
615 * Pushbuf routine gets to unlock it.
616 */
617 return XFS_ITEM_PUSHBUF;
618 } else {
619 /*
620 * We hold the AIL lock, so we must specify the
621 * NONOTIFY flag so that we won't double trip.
622 */
623 xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY);
624 return XFS_ITEM_FLUSHING;
625 }
626 /* NOTREACHED */
627 } 610 }
628 611
629 /* Stale items should force out the iclog */ 612 /* Stale items should force out the iclog */
630 if (ip->i_flags & XFS_ISTALE) { 613 if (ip->i_flags & XFS_ISTALE) {
631 xfs_ifunlock(ip); 614 xfs_ifunlock(ip);
615 /*
616 * we hold the AIL lock - notify the unlock routine of this
617 * so it doesn't try to get the lock again.
618 */
632 xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY); 619 xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY);
633 return XFS_ITEM_PINNED; 620 return XFS_ITEM_PINNED;
634 } 621 }
@@ -746,11 +733,8 @@ xfs_inode_item_committed(
746 * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK 733 * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK
747 * failed to get the inode flush lock but did get the inode locked SHARED. 734 * failed to get the inode flush lock but did get the inode locked SHARED.
748 * Here we're trying to see if the inode buffer is incore, and if so whether it's 735 * Here we're trying to see if the inode buffer is incore, and if so whether it's
749 * marked delayed write. If that's the case, we'll initiate a bawrite on that 736 * marked delayed write. If that's the case, we'll promote it and that will
750 * buffer to expedite the process. 737 * allow the caller to write the buffer by triggering the xfsbufd to run.
751 *
752 * We aren't holding the AIL lock (or the flush lock) when this gets called,
753 * so it is inherently race-y.
754 */ 738 */
755STATIC void 739STATIC void
756xfs_inode_item_pushbuf( 740xfs_inode_item_pushbuf(
@@ -759,26 +743,16 @@ xfs_inode_item_pushbuf(
759 xfs_inode_t *ip; 743 xfs_inode_t *ip;
760 xfs_mount_t *mp; 744 xfs_mount_t *mp;
761 xfs_buf_t *bp; 745 xfs_buf_t *bp;
762 uint dopush;
763 746
764 ip = iip->ili_inode; 747 ip = iip->ili_inode;
765
766 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED)); 748 ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
767 749
768 /* 750 /*
769 * The ili_pushbuf_flag keeps others from
770 * trying to duplicate our effort.
771 */
772 ASSERT(iip->ili_pushbuf_flag != 0);
773 ASSERT(iip->ili_push_owner == current_pid());
774
775 /*
776 * If a flush is not in progress anymore, chances are that the 751 * If a flush is not in progress anymore, chances are that the
777 * inode was taken off the AIL. So, just get out. 752 * inode was taken off the AIL. So, just get out.
778 */ 753 */
779 if (completion_done(&ip->i_flush) || 754 if (completion_done(&ip->i_flush) ||
780 ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) { 755 ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) {
781 iip->ili_pushbuf_flag = 0;
782 xfs_iunlock(ip, XFS_ILOCK_SHARED); 756 xfs_iunlock(ip, XFS_ILOCK_SHARED);
783 return; 757 return;
784 } 758 }
@@ -787,53 +761,12 @@ xfs_inode_item_pushbuf(
787 bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno, 761 bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno,
788 iip->ili_format.ilf_len, XBF_TRYLOCK); 762 iip->ili_format.ilf_len, XBF_TRYLOCK);
789 763
790 if (bp != NULL) {
791 if (XFS_BUF_ISDELAYWRITE(bp)) {
792 /*
793 * We were racing with iflush because we don't hold
794 * the AIL lock or the flush lock. However, at this point,
795 * we have the buffer, and we know that it's dirty.
796 * So, it's possible that iflush raced with us, and
797 * this item is already taken off the AIL.
798 * If not, we can flush it async.
799 */
800 dopush = ((iip->ili_item.li_flags & XFS_LI_IN_AIL) &&
801 !completion_done(&ip->i_flush));
802 iip->ili_pushbuf_flag = 0;
803 xfs_iunlock(ip, XFS_ILOCK_SHARED);
804
805 trace_xfs_inode_item_push(bp, _RET_IP_);
806
807 if (XFS_BUF_ISPINNED(bp))
808 xfs_log_force(mp, 0);
809
810 if (dopush) {
811 int error;
812 error = xfs_bawrite(mp, bp);
813 if (error)
814 xfs_fs_cmn_err(CE_WARN, mp,
815 "xfs_inode_item_pushbuf: pushbuf error %d on iip %p, bp %p",
816 error, iip, bp);
817 } else {
818 xfs_buf_relse(bp);
819 }
820 } else {
821 iip->ili_pushbuf_flag = 0;
822 xfs_iunlock(ip, XFS_ILOCK_SHARED);
823 xfs_buf_relse(bp);
824 }
825 return;
826 }
827 /*
828 * We have to be careful about resetting pushbuf flag too early (above).
829 * Even though in theory we can do it as soon as we have the buflock,
830 * we don't want others to be doing work needlessly. They'll come to
831 * this function thinking that pushing the buffer is their
832 * responsibility only to find that the buffer is still locked by
833 * another doing the same thing
834 */
835 iip->ili_pushbuf_flag = 0;
836 xfs_iunlock(ip, XFS_ILOCK_SHARED); 764 xfs_iunlock(ip, XFS_ILOCK_SHARED);
765 if (!bp)
766 return;
767 if (XFS_BUF_ISDELAYWRITE(bp))
768 xfs_buf_delwri_promote(bp);
769 xfs_buf_relse(bp);
837 return; 770 return;
838} 771}
839 772
@@ -937,7 +870,6 @@ xfs_inode_item_init(
937 /* 870 /*
938 We have zeroed memory. No need ... 871 We have zeroed memory. No need ...
939 iip->ili_extents_buf = NULL; 872 iip->ili_extents_buf = NULL;
940 iip->ili_pushbuf_flag = 0;
941 */ 873 */
942 874
943 iip->ili_format.ilf_type = XFS_LI_INODE; 875 iip->ili_format.ilf_type = XFS_LI_INODE;
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index cc8df1ac7783..9a467958ecdd 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -144,12 +144,6 @@ typedef struct xfs_inode_log_item {
144 data exts */ 144 data exts */
145 struct xfs_bmbt_rec *ili_aextents_buf; /* array of logged 145 struct xfs_bmbt_rec *ili_aextents_buf; /* array of logged
146 attr exts */ 146 attr exts */
147 unsigned int ili_pushbuf_flag; /* one bit used in push_ail */
148
149#ifdef DEBUG
150 uint64_t ili_push_owner; /* one who sets pushbuf_flag
151 above gets to push the buf */
152#endif
153#ifdef XFS_TRANS_DEBUG 147#ifdef XFS_TRANS_DEBUG
154 int ili_root_size; 148 int ili_root_size;
155 char *ili_orig_root; 149 char *ili_orig_root;
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index ca64f33c63a3..c93e3a102857 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -861,8 +861,7 @@ typedef struct xfs_item_ops {
861#define XFS_ITEM_SUCCESS 0 861#define XFS_ITEM_SUCCESS 0
862#define XFS_ITEM_PINNED 1 862#define XFS_ITEM_PINNED 1
863#define XFS_ITEM_LOCKED 2 863#define XFS_ITEM_LOCKED 2
864#define XFS_ITEM_FLUSHING 3 864#define XFS_ITEM_PUSHBUF 3
865#define XFS_ITEM_PUSHBUF 4
866 865
867/* 866/*
868 * This structure is used to maintain a list of block ranges that have been 867 * This structure is used to maintain a list of block ranges that have been
diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c
index d7b1af8a832d..e799824f7245 100644
--- a/fs/xfs/xfs_trans_ail.c
+++ b/fs/xfs/xfs_trans_ail.c
@@ -253,6 +253,7 @@ xfsaild_push(
253 int flush_log, count, stuck; 253 int flush_log, count, stuck;
254 xfs_mount_t *mp = ailp->xa_mount; 254 xfs_mount_t *mp = ailp->xa_mount;
255 struct xfs_ail_cursor *cur = &ailp->xa_cursors; 255 struct xfs_ail_cursor *cur = &ailp->xa_cursors;
256 int push_xfsbufd = 0;
256 257
257 spin_lock(&ailp->xa_lock); 258 spin_lock(&ailp->xa_lock);
258 xfs_trans_ail_cursor_init(ailp, cur); 259 xfs_trans_ail_cursor_init(ailp, cur);
@@ -308,6 +309,7 @@ xfsaild_push(
308 XFS_STATS_INC(xs_push_ail_pushbuf); 309 XFS_STATS_INC(xs_push_ail_pushbuf);
309 IOP_PUSHBUF(lip); 310 IOP_PUSHBUF(lip);
310 last_pushed_lsn = lsn; 311 last_pushed_lsn = lsn;
312 push_xfsbufd = 1;
311 break; 313 break;
312 314
313 case XFS_ITEM_PINNED: 315 case XFS_ITEM_PINNED:
@@ -322,12 +324,6 @@ xfsaild_push(
322 stuck++; 324 stuck++;
323 break; 325 break;
324 326
325 case XFS_ITEM_FLUSHING:
326 XFS_STATS_INC(xs_push_ail_flushing);
327 last_pushed_lsn = lsn;
328 stuck++;
329 break;
330
331 default: 327 default:
332 ASSERT(0); 328 ASSERT(0);
333 break; 329 break;
@@ -374,6 +370,11 @@ xfsaild_push(
374 xfs_log_force(mp, 0); 370 xfs_log_force(mp, 0);
375 } 371 }
376 372
373 if (push_xfsbufd) {
374 /* we've got delayed write buffers to flush */
375 wake_up_process(mp->m_ddev_targp->bt_task);
376 }
377
377 if (!count) { 378 if (!count) {
378 /* We're past our target or empty, so idle */ 379 /* We're past our target or empty, so idle */
379 last_pushed_lsn = 0; 380 last_pushed_lsn = 0;