aboutsummaryrefslogtreecommitdiffstats
path: root/fs/xfs/xfs_inode.c
diff options
context:
space:
mode:
author: David Chinner <dgc@sgi.com> 2008-03-05 21:43:42 -0500
committer: Lachlan McIlroy <lachlan@redback.melbourne.sgi.com> 2008-04-17 21:37:32 -0400
commit: a3f74ffb6d1448d9a8f482e593b80ec15f1695d4 (patch)
tree: e7a9ea7ba4032340e771605000002da4349719cb /fs/xfs/xfs_inode.c
parent: 4ae29b4321b99b711bcfde5527c4fbf249eac60f (diff)
[XFS] Don't block pdflush when writing back inodes
When pdflush is writing back inodes, it can get stuck on inode cluster buffers that are currently under I/O. This occurs when we write data to multiple inodes in the same inode cluster at the same time. Effectively, delayed allocation marks the inode dirty during the data writeback. Hence if the inode cluster was flushed during the writeback of the first inode, the writeback of the second inode will block waiting for the inode cluster write to complete before writing it again for the newly dirtied inode. Basically, we want to prevent this from happening so we don't block pdflush and slow down all of writeback. Hence we introduce a non-blocking async inode flush flag that pdflush uses. If this flag is set, we use non-blocking operations (e.g. try locks) wherever we can to avoid blocking or extra I/O being issued. SGI-PV: 970925 SGI-Modid: xfs-linux-melb:xfs-kern:30501a Signed-off-by: David Chinner <dgc@sgi.com> Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
Diffstat (limited to 'fs/xfs/xfs_inode.c')
-rw-r--r-- fs/xfs/xfs_inode.c 135
1 files changed, 82 insertions, 53 deletions
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 6f156faf9d46..3c3e9e3c1da8 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -145,11 +145,16 @@ xfs_imap_to_bp(
145 xfs_buf_t *bp; 145 xfs_buf_t *bp;
146 146
147 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, 147 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno,
148 (int)imap->im_len, XFS_BUF_LOCK, &bp); 148 (int)imap->im_len, buf_flags, &bp);
149 if (error) { 149 if (error) {
150 cmn_err(CE_WARN, "xfs_imap_to_bp: xfs_trans_read_buf()returned " 150 if (error != EAGAIN) {
151 cmn_err(CE_WARN,
152 "xfs_imap_to_bp: xfs_trans_read_buf()returned "
151 "an error %d on %s. Returning error.", 153 "an error %d on %s. Returning error.",
152 error, mp->m_fsname); 154 error, mp->m_fsname);
155 } else {
156 ASSERT(buf_flags & XFS_BUF_TRYLOCK);
157 }
153 return error; 158 return error;
154 } 159 }
155 160
@@ -274,7 +279,8 @@ xfs_itobp(
274 xfs_dinode_t **dipp, 279 xfs_dinode_t **dipp,
275 xfs_buf_t **bpp, 280 xfs_buf_t **bpp,
276 xfs_daddr_t bno, 281 xfs_daddr_t bno,
277 uint imap_flags) 282 uint imap_flags,
283 uint buf_flags)
278{ 284{
279 xfs_imap_t imap; 285 xfs_imap_t imap;
280 xfs_buf_t *bp; 286 xfs_buf_t *bp;
@@ -305,10 +311,17 @@ xfs_itobp(
305 } 311 }
306 ASSERT(bno == 0 || bno == imap.im_blkno); 312 ASSERT(bno == 0 || bno == imap.im_blkno);
307 313
308 error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, imap_flags); 314 error = xfs_imap_to_bp(mp, tp, &imap, &bp, buf_flags, imap_flags);
309 if (error) 315 if (error)
310 return error; 316 return error;
311 317
318 if (!bp) {
319 ASSERT(buf_flags & XFS_BUF_TRYLOCK);
320 ASSERT(tp == NULL);
321 *bpp = NULL;
322 return EAGAIN;
323 }
324
312 *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset); 325 *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset);
313 *bpp = bp; 326 *bpp = bp;
314 return 0; 327 return 0;
@@ -812,7 +825,7 @@ xfs_iread(
812 * return NULL as well. Set i_blkno to 0 so that xfs_itobp() will 825 * return NULL as well. Set i_blkno to 0 so that xfs_itobp() will
813 * know that this is a new incore inode. 826 * know that this is a new incore inode.
814 */ 827 */
815 error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, imap_flags); 828 error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, imap_flags, XFS_BUF_LOCK);
816 if (error) { 829 if (error) {
817 kmem_zone_free(xfs_inode_zone, ip); 830 kmem_zone_free(xfs_inode_zone, ip);
818 return error; 831 return error;
@@ -1901,7 +1914,7 @@ xfs_iunlink(
1901 * Here we put the head pointer into our next pointer, 1914 * Here we put the head pointer into our next pointer,
1902 * and then we fall through to point the head at us. 1915 * and then we fall through to point the head at us.
1903 */ 1916 */
1904 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0); 1917 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
1905 if (error) 1918 if (error)
1906 return error; 1919 return error;
1907 1920
@@ -2009,7 +2022,7 @@ xfs_iunlink_remove(
2009 * of dealing with the buffer when there is no need to 2022 * of dealing with the buffer when there is no need to
2010 * change it. 2023 * change it.
2011 */ 2024 */
2012 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0); 2025 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
2013 if (error) { 2026 if (error) {
2014 cmn_err(CE_WARN, 2027 cmn_err(CE_WARN,
2015 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", 2028 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.",
@@ -2071,7 +2084,7 @@ xfs_iunlink_remove(
2071 * Now last_ibp points to the buffer previous to us on 2084 * Now last_ibp points to the buffer previous to us on
2072 * the unlinked list. Pull us from the list. 2085 * the unlinked list. Pull us from the list.
2073 */ 2086 */
2074 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0); 2087 error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
2075 if (error) { 2088 if (error) {
2076 cmn_err(CE_WARN, 2089 cmn_err(CE_WARN,
2077 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", 2090 "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.",
@@ -2334,7 +2347,7 @@ xfs_ifree(
2334 2347
2335 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); 2348 xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
2336 2349
2337 error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0, 0); 2350 error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK);
2338 if (error) 2351 if (error)
2339 return error; 2352 return error;
2340 2353
@@ -2777,38 +2790,41 @@ xfs_iunpin(
2777} 2790}
2778 2791
2779/* 2792/*
2780 * This is called to wait for the given inode to be unpinned. 2793 * This is called to unpin an inode. It can be directed to wait or to return
2781 * It will sleep until this happens. The caller must have the 2794 * immediately without waiting for the inode to be unpinned. The caller must
2782 * inode locked in at least shared mode so that the buffer cannot 2795 * have the inode locked in at least shared mode so that the buffer cannot be
2783 * be subsequently pinned once someone is waiting for it to be 2796 * subsequently pinned once someone is waiting for it to be unpinned.
2784 * unpinned.
2785 */ 2797 */
2786STATIC void 2798STATIC void
2787xfs_iunpin_wait( 2799__xfs_iunpin_wait(
2788 xfs_inode_t *ip) 2800 xfs_inode_t *ip,
2801 int wait)
2789{ 2802{
2790 xfs_inode_log_item_t *iip; 2803 xfs_inode_log_item_t *iip = ip->i_itemp;
2791 xfs_lsn_t lsn;
2792 2804
2793 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE | MR_ACCESS)); 2805 ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE | MR_ACCESS));
2794 2806 if (atomic_read(&ip->i_pincount) == 0)
2795 if (atomic_read(&ip->i_pincount) == 0) {
2796 return; 2807 return;
2797 }
2798 2808
2799 iip = ip->i_itemp; 2809 /* Give the log a push to start the unpinning I/O */
2800 if (iip && iip->ili_last_lsn) { 2810 xfs_log_force(ip->i_mount, (iip && iip->ili_last_lsn) ?
2801 lsn = iip->ili_last_lsn; 2811 iip->ili_last_lsn : 0, XFS_LOG_FORCE);
2802 } else { 2812 if (wait)
2803 lsn = (xfs_lsn_t)0; 2813 wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0));
2804 } 2814}
2805 2815
2806 /* 2816static inline void
2807 * Give the log a push so we don't wait here too long. 2817xfs_iunpin_wait(
2808 */ 2818 xfs_inode_t *ip)
2809 xfs_log_force(ip->i_mount, lsn, XFS_LOG_FORCE); 2819{
2820 __xfs_iunpin_wait(ip, 1);
2821}
2810 2822
2811 wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0)); 2823static inline void
2824xfs_iunpin_nowait(
2825 xfs_inode_t *ip)
2826{
2827 __xfs_iunpin_wait(ip, 0);
2812} 2828}
2813 2829
2814 2830
@@ -3003,6 +3019,7 @@ xfs_iflush(
3003 int bufwasdelwri; 3019 int bufwasdelwri;
3004 struct hlist_node *entry; 3020 struct hlist_node *entry;
3005 enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) }; 3021 enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) };
3022 int noblock = (flags == XFS_IFLUSH_ASYNC_NOBLOCK);
3006 3023
3007 XFS_STATS_INC(xs_iflush_count); 3024 XFS_STATS_INC(xs_iflush_count);
3008 3025
@@ -3027,11 +3044,21 @@ xfs_iflush(
3027 } 3044 }
3028 3045
3029 /* 3046 /*
3030 * We can't flush the inode until it is unpinned, so 3047 * We can't flush the inode until it is unpinned, so wait for it if we
3031 * wait for it. We know noone new can pin it, because 3048 * are allowed to block. We know noone new can pin it, because we are
3032 * we are holding the inode lock shared and you need 3049 * holding the inode lock shared and you need to hold it exclusively to
3033 * to hold it exclusively to pin the inode. 3050 * pin the inode.
3051 *
3052 * If we are not allowed to block, force the log out asynchronously so
3053 * that when we come back the inode will be unpinned. If other inodes
3054 * in the same cluster are dirty, they will probably write the inode
3055 * out for us if they occur after the log force completes.
3034 */ 3056 */
3057 if (noblock && xfs_ipincount(ip)) {
3058 xfs_iunpin_nowait(ip);
3059 xfs_ifunlock(ip);
3060 return EAGAIN;
3061 }
3035 xfs_iunpin_wait(ip); 3062 xfs_iunpin_wait(ip);
3036 3063
3037 /* 3064 /*
@@ -3048,15 +3075,6 @@ xfs_iflush(
3048 } 3075 }
3049 3076
3050 /* 3077 /*
3051 * Get the buffer containing the on-disk inode.
3052 */
3053 error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0, 0);
3054 if (error) {
3055 xfs_ifunlock(ip);
3056 return error;
3057 }
3058
3059 /*
3060 * Decide how buffer will be flushed out. This is done before 3078 * Decide how buffer will be flushed out. This is done before
3061 * the call to xfs_iflush_int because this field is zeroed by it. 3079 * the call to xfs_iflush_int because this field is zeroed by it.
3062 */ 3080 */
@@ -3072,6 +3090,7 @@ xfs_iflush(
3072 case XFS_IFLUSH_DELWRI_ELSE_SYNC: 3090 case XFS_IFLUSH_DELWRI_ELSE_SYNC:
3073 flags = 0; 3091 flags = 0;
3074 break; 3092 break;
3093 case XFS_IFLUSH_ASYNC_NOBLOCK:
3075 case XFS_IFLUSH_ASYNC: 3094 case XFS_IFLUSH_ASYNC:
3076 case XFS_IFLUSH_DELWRI_ELSE_ASYNC: 3095 case XFS_IFLUSH_DELWRI_ELSE_ASYNC:
3077 flags = INT_ASYNC; 3096 flags = INT_ASYNC;
@@ -3091,6 +3110,7 @@ xfs_iflush(
3091 case XFS_IFLUSH_DELWRI: 3110 case XFS_IFLUSH_DELWRI:
3092 flags = INT_DELWRI; 3111 flags = INT_DELWRI;
3093 break; 3112 break;
3113 case XFS_IFLUSH_ASYNC_NOBLOCK:
3094 case XFS_IFLUSH_ASYNC: 3114 case XFS_IFLUSH_ASYNC:
3095 flags = INT_ASYNC; 3115 flags = INT_ASYNC;
3096 break; 3116 break;
@@ -3105,6 +3125,16 @@ xfs_iflush(
3105 } 3125 }
3106 3126
3107 /* 3127 /*
3128 * Get the buffer containing the on-disk inode.
3129 */
3130 error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0, 0,
3131 noblock ? XFS_BUF_TRYLOCK : XFS_BUF_LOCK);
3132 if (error || !bp) {
3133 xfs_ifunlock(ip);
3134 return error;
3135 }
3136
3137 /*
3108 * First flush out the inode that xfs_iflush was called with. 3138 * First flush out the inode that xfs_iflush was called with.
3109 */ 3139 */
3110 error = xfs_iflush_int(ip, bp); 3140 error = xfs_iflush_int(ip, bp);
@@ -3113,6 +3143,13 @@ xfs_iflush(
3113 } 3143 }
3114 3144
3115 /* 3145 /*
3146 * If the buffer is pinned then push on the log now so we won't
3147 * get stuck waiting in the write for too long.
3148 */
3149 if (XFS_BUF_ISPINNED(bp))
3150 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
3151
3152 /*
3116 * inode clustering: 3153 * inode clustering:
3117 * see if other inodes can be gathered into this write 3154 * see if other inodes can be gathered into this write
3118 */ 3155 */
@@ -3181,14 +3218,6 @@ xfs_iflush(
3181 XFS_STATS_ADD(xs_icluster_flushinode, clcount); 3218 XFS_STATS_ADD(xs_icluster_flushinode, clcount);
3182 } 3219 }
3183 3220
3184 /*
3185 * If the buffer is pinned then push on the log so we won't
3186 * get stuck waiting in the write for too long.
3187 */
3188 if (XFS_BUF_ISPINNED(bp)){
3189 xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE);
3190 }
3191
3192 if (flags & INT_DELWRI) { 3221 if (flags & INT_DELWRI) {
3193 xfs_bdwrite(mp, bp); 3222 xfs_bdwrite(mp, bp);
3194 } else if (flags & INT_ASYNC) { 3223 } else if (flags & INT_ASYNC) {