diff options
author | David Chinner <dgc@sgi.com> | 2008-03-05 21:43:42 -0500 |
---|---|---|
committer | Lachlan McIlroy <lachlan@redback.melbourne.sgi.com> | 2008-04-17 21:37:32 -0400 |
commit | a3f74ffb6d1448d9a8f482e593b80ec15f1695d4 (patch) | |
tree | e7a9ea7ba4032340e771605000002da4349719cb /fs/xfs/xfs_inode.c | |
parent | 4ae29b4321b99b711bcfde5527c4fbf249eac60f (diff) |
[XFS] Don't block pdflush when writing back inodes
When pdflush is writing back inodes, it can get stuck on inode cluster
buffers that are currently under I/O. This occurs when we write data to
multiple inodes in the same inode cluster at the same time.
Effectively, delayed allocation marks the inode dirty during the data
writeback. Hence if the inode cluster was flushed during the writeback of
the first inode, the writeback of the second inode will block waiting for
the inode cluster write to complete before writing it again for the newly
dirtied inode.
Basically, we want to prevent this from happening so we don't block pdflush
and slow down all of writeback. Hence we introduce a non-blocking async
inode flush flag that pdflush uses. If this flag is set, we use
non-blocking operations (e.g. try locks) wherever we can to avoid
blocking or extra I/O being issued.
SGI-PV: 970925
SGI-Modid: xfs-linux-melb:xfs-kern:30501a
Signed-off-by: David Chinner <dgc@sgi.com>
Signed-off-by: Lachlan McIlroy <lachlan@sgi.com>
Diffstat (limited to 'fs/xfs/xfs_inode.c')
-rw-r--r-- | fs/xfs/xfs_inode.c | 135 |
1 files changed, 82 insertions, 53 deletions
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 6f156faf9d46..3c3e9e3c1da8 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c | |||
@@ -145,11 +145,16 @@ xfs_imap_to_bp( | |||
145 | xfs_buf_t *bp; | 145 | xfs_buf_t *bp; |
146 | 146 | ||
147 | error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, | 147 | error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, |
148 | (int)imap->im_len, XFS_BUF_LOCK, &bp); | 148 | (int)imap->im_len, buf_flags, &bp); |
149 | if (error) { | 149 | if (error) { |
150 | cmn_err(CE_WARN, "xfs_imap_to_bp: xfs_trans_read_buf()returned " | 150 | if (error != EAGAIN) { |
151 | cmn_err(CE_WARN, | ||
152 | "xfs_imap_to_bp: xfs_trans_read_buf()returned " | ||
151 | "an error %d on %s. Returning error.", | 153 | "an error %d on %s. Returning error.", |
152 | error, mp->m_fsname); | 154 | error, mp->m_fsname); |
155 | } else { | ||
156 | ASSERT(buf_flags & XFS_BUF_TRYLOCK); | ||
157 | } | ||
153 | return error; | 158 | return error; |
154 | } | 159 | } |
155 | 160 | ||
@@ -274,7 +279,8 @@ xfs_itobp( | |||
274 | xfs_dinode_t **dipp, | 279 | xfs_dinode_t **dipp, |
275 | xfs_buf_t **bpp, | 280 | xfs_buf_t **bpp, |
276 | xfs_daddr_t bno, | 281 | xfs_daddr_t bno, |
277 | uint imap_flags) | 282 | uint imap_flags, |
283 | uint buf_flags) | ||
278 | { | 284 | { |
279 | xfs_imap_t imap; | 285 | xfs_imap_t imap; |
280 | xfs_buf_t *bp; | 286 | xfs_buf_t *bp; |
@@ -305,10 +311,17 @@ xfs_itobp( | |||
305 | } | 311 | } |
306 | ASSERT(bno == 0 || bno == imap.im_blkno); | 312 | ASSERT(bno == 0 || bno == imap.im_blkno); |
307 | 313 | ||
308 | error = xfs_imap_to_bp(mp, tp, &imap, &bp, XFS_BUF_LOCK, imap_flags); | 314 | error = xfs_imap_to_bp(mp, tp, &imap, &bp, buf_flags, imap_flags); |
309 | if (error) | 315 | if (error) |
310 | return error; | 316 | return error; |
311 | 317 | ||
318 | if (!bp) { | ||
319 | ASSERT(buf_flags & XFS_BUF_TRYLOCK); | ||
320 | ASSERT(tp == NULL); | ||
321 | *bpp = NULL; | ||
322 | return EAGAIN; | ||
323 | } | ||
324 | |||
312 | *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset); | 325 | *dipp = (xfs_dinode_t *)xfs_buf_offset(bp, imap.im_boffset); |
313 | *bpp = bp; | 326 | *bpp = bp; |
314 | return 0; | 327 | return 0; |
@@ -812,7 +825,7 @@ xfs_iread( | |||
812 | * return NULL as well. Set i_blkno to 0 so that xfs_itobp() will | 825 | * return NULL as well. Set i_blkno to 0 so that xfs_itobp() will |
813 | * know that this is a new incore inode. | 826 | * know that this is a new incore inode. |
814 | */ | 827 | */ |
815 | error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, imap_flags); | 828 | error = xfs_itobp(mp, tp, ip, &dip, &bp, bno, imap_flags, XFS_BUF_LOCK); |
816 | if (error) { | 829 | if (error) { |
817 | kmem_zone_free(xfs_inode_zone, ip); | 830 | kmem_zone_free(xfs_inode_zone, ip); |
818 | return error; | 831 | return error; |
@@ -1901,7 +1914,7 @@ xfs_iunlink( | |||
1901 | * Here we put the head pointer into our next pointer, | 1914 | * Here we put the head pointer into our next pointer, |
1902 | * and then we fall through to point the head at us. | 1915 | * and then we fall through to point the head at us. |
1903 | */ | 1916 | */ |
1904 | error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0); | 1917 | error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK); |
1905 | if (error) | 1918 | if (error) |
1906 | return error; | 1919 | return error; |
1907 | 1920 | ||
@@ -2009,7 +2022,7 @@ xfs_iunlink_remove( | |||
2009 | * of dealing with the buffer when there is no need to | 2022 | * of dealing with the buffer when there is no need to |
2010 | * change it. | 2023 | * change it. |
2011 | */ | 2024 | */ |
2012 | error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0); | 2025 | error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK); |
2013 | if (error) { | 2026 | if (error) { |
2014 | cmn_err(CE_WARN, | 2027 | cmn_err(CE_WARN, |
2015 | "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", | 2028 | "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", |
@@ -2071,7 +2084,7 @@ xfs_iunlink_remove( | |||
2071 | * Now last_ibp points to the buffer previous to us on | 2084 | * Now last_ibp points to the buffer previous to us on |
2072 | * the unlinked list. Pull us from the list. | 2085 | * the unlinked list. Pull us from the list. |
2073 | */ | 2086 | */ |
2074 | error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0); | 2087 | error = xfs_itobp(mp, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK); |
2075 | if (error) { | 2088 | if (error) { |
2076 | cmn_err(CE_WARN, | 2089 | cmn_err(CE_WARN, |
2077 | "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", | 2090 | "xfs_iunlink_remove: xfs_itobp() returned an error %d on %s. Returning error.", |
@@ -2334,7 +2347,7 @@ xfs_ifree( | |||
2334 | 2347 | ||
2335 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); | 2348 | xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); |
2336 | 2349 | ||
2337 | error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0, 0); | 2350 | error = xfs_itobp(ip->i_mount, tp, ip, &dip, &ibp, 0, 0, XFS_BUF_LOCK); |
2338 | if (error) | 2351 | if (error) |
2339 | return error; | 2352 | return error; |
2340 | 2353 | ||
@@ -2777,38 +2790,41 @@ xfs_iunpin( | |||
2777 | } | 2790 | } |
2778 | 2791 | ||
2779 | /* | 2792 | /* |
2780 | * This is called to wait for the given inode to be unpinned. | 2793 | * This is called to unpin an inode. It can be directed to wait or to return |
2781 | * It will sleep until this happens. The caller must have the | 2794 | * immediately without waiting for the inode to be unpinned. The caller must |
2782 | * inode locked in at least shared mode so that the buffer cannot | 2795 | * have the inode locked in at least shared mode so that the buffer cannot be |
2783 | * be subsequently pinned once someone is waiting for it to be | 2796 | * subsequently pinned once someone is waiting for it to be unpinned. |
2784 | * unpinned. | ||
2785 | */ | 2797 | */ |
2786 | STATIC void | 2798 | STATIC void |
2787 | xfs_iunpin_wait( | 2799 | __xfs_iunpin_wait( |
2788 | xfs_inode_t *ip) | 2800 | xfs_inode_t *ip, |
2801 | int wait) | ||
2789 | { | 2802 | { |
2790 | xfs_inode_log_item_t *iip; | 2803 | xfs_inode_log_item_t *iip = ip->i_itemp; |
2791 | xfs_lsn_t lsn; | ||
2792 | 2804 | ||
2793 | ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE | MR_ACCESS)); | 2805 | ASSERT(ismrlocked(&ip->i_lock, MR_UPDATE | MR_ACCESS)); |
2794 | 2806 | if (atomic_read(&ip->i_pincount) == 0) | |
2795 | if (atomic_read(&ip->i_pincount) == 0) { | ||
2796 | return; | 2807 | return; |
2797 | } | ||
2798 | 2808 | ||
2799 | iip = ip->i_itemp; | 2809 | /* Give the log a push to start the unpinning I/O */ |
2800 | if (iip && iip->ili_last_lsn) { | 2810 | xfs_log_force(ip->i_mount, (iip && iip->ili_last_lsn) ? |
2801 | lsn = iip->ili_last_lsn; | 2811 | iip->ili_last_lsn : 0, XFS_LOG_FORCE); |
2802 | } else { | 2812 | if (wait) |
2803 | lsn = (xfs_lsn_t)0; | 2813 | wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0)); |
2804 | } | 2814 | } |
2805 | 2815 | ||
2806 | /* | 2816 | static inline void |
2807 | * Give the log a push so we don't wait here too long. | 2817 | xfs_iunpin_wait( |
2808 | */ | 2818 | xfs_inode_t *ip) |
2809 | xfs_log_force(ip->i_mount, lsn, XFS_LOG_FORCE); | 2819 | { |
2820 | __xfs_iunpin_wait(ip, 1); | ||
2821 | } | ||
2810 | 2822 | ||
2811 | wait_event(ip->i_ipin_wait, (atomic_read(&ip->i_pincount) == 0)); | 2823 | static inline void |
2824 | xfs_iunpin_nowait( | ||
2825 | xfs_inode_t *ip) | ||
2826 | { | ||
2827 | __xfs_iunpin_wait(ip, 0); | ||
2812 | } | 2828 | } |
2813 | 2829 | ||
2814 | 2830 | ||
@@ -3003,6 +3019,7 @@ xfs_iflush( | |||
3003 | int bufwasdelwri; | 3019 | int bufwasdelwri; |
3004 | struct hlist_node *entry; | 3020 | struct hlist_node *entry; |
3005 | enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) }; | 3021 | enum { INT_DELWRI = (1 << 0), INT_ASYNC = (1 << 1) }; |
3022 | int noblock = (flags == XFS_IFLUSH_ASYNC_NOBLOCK); | ||
3006 | 3023 | ||
3007 | XFS_STATS_INC(xs_iflush_count); | 3024 | XFS_STATS_INC(xs_iflush_count); |
3008 | 3025 | ||
@@ -3027,11 +3044,21 @@ xfs_iflush( | |||
3027 | } | 3044 | } |
3028 | 3045 | ||
3029 | /* | 3046 | /* |
3030 | * We can't flush the inode until it is unpinned, so | 3047 | * We can't flush the inode until it is unpinned, so wait for it if we |
3031 | * wait for it. We know noone new can pin it, because | 3048 | * are allowed to block. We know noone new can pin it, because we are |
3032 | * we are holding the inode lock shared and you need | 3049 | * holding the inode lock shared and you need to hold it exclusively to |
3033 | * to hold it exclusively to pin the inode. | 3050 | * pin the inode. |
3051 | * | ||
3052 | * If we are not allowed to block, force the log out asynchronously so | ||
3053 | * that when we come back the inode will be unpinned. If other inodes | ||
3054 | * in the same cluster are dirty, they will probably write the inode | ||
3055 | * out for us if they occur after the log force completes. | ||
3034 | */ | 3056 | */ |
3057 | if (noblock && xfs_ipincount(ip)) { | ||
3058 | xfs_iunpin_nowait(ip); | ||
3059 | xfs_ifunlock(ip); | ||
3060 | return EAGAIN; | ||
3061 | } | ||
3035 | xfs_iunpin_wait(ip); | 3062 | xfs_iunpin_wait(ip); |
3036 | 3063 | ||
3037 | /* | 3064 | /* |
@@ -3048,15 +3075,6 @@ xfs_iflush( | |||
3048 | } | 3075 | } |
3049 | 3076 | ||
3050 | /* | 3077 | /* |
3051 | * Get the buffer containing the on-disk inode. | ||
3052 | */ | ||
3053 | error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0, 0); | ||
3054 | if (error) { | ||
3055 | xfs_ifunlock(ip); | ||
3056 | return error; | ||
3057 | } | ||
3058 | |||
3059 | /* | ||
3060 | * Decide how buffer will be flushed out. This is done before | 3078 | * Decide how buffer will be flushed out. This is done before |
3061 | * the call to xfs_iflush_int because this field is zeroed by it. | 3079 | * the call to xfs_iflush_int because this field is zeroed by it. |
3062 | */ | 3080 | */ |
@@ -3072,6 +3090,7 @@ xfs_iflush( | |||
3072 | case XFS_IFLUSH_DELWRI_ELSE_SYNC: | 3090 | case XFS_IFLUSH_DELWRI_ELSE_SYNC: |
3073 | flags = 0; | 3091 | flags = 0; |
3074 | break; | 3092 | break; |
3093 | case XFS_IFLUSH_ASYNC_NOBLOCK: | ||
3075 | case XFS_IFLUSH_ASYNC: | 3094 | case XFS_IFLUSH_ASYNC: |
3076 | case XFS_IFLUSH_DELWRI_ELSE_ASYNC: | 3095 | case XFS_IFLUSH_DELWRI_ELSE_ASYNC: |
3077 | flags = INT_ASYNC; | 3096 | flags = INT_ASYNC; |
@@ -3091,6 +3110,7 @@ xfs_iflush( | |||
3091 | case XFS_IFLUSH_DELWRI: | 3110 | case XFS_IFLUSH_DELWRI: |
3092 | flags = INT_DELWRI; | 3111 | flags = INT_DELWRI; |
3093 | break; | 3112 | break; |
3113 | case XFS_IFLUSH_ASYNC_NOBLOCK: | ||
3094 | case XFS_IFLUSH_ASYNC: | 3114 | case XFS_IFLUSH_ASYNC: |
3095 | flags = INT_ASYNC; | 3115 | flags = INT_ASYNC; |
3096 | break; | 3116 | break; |
@@ -3105,6 +3125,16 @@ xfs_iflush( | |||
3105 | } | 3125 | } |
3106 | 3126 | ||
3107 | /* | 3127 | /* |
3128 | * Get the buffer containing the on-disk inode. | ||
3129 | */ | ||
3130 | error = xfs_itobp(mp, NULL, ip, &dip, &bp, 0, 0, | ||
3131 | noblock ? XFS_BUF_TRYLOCK : XFS_BUF_LOCK); | ||
3132 | if (error || !bp) { | ||
3133 | xfs_ifunlock(ip); | ||
3134 | return error; | ||
3135 | } | ||
3136 | |||
3137 | /* | ||
3108 | * First flush out the inode that xfs_iflush was called with. | 3138 | * First flush out the inode that xfs_iflush was called with. |
3109 | */ | 3139 | */ |
3110 | error = xfs_iflush_int(ip, bp); | 3140 | error = xfs_iflush_int(ip, bp); |
@@ -3113,6 +3143,13 @@ xfs_iflush( | |||
3113 | } | 3143 | } |
3114 | 3144 | ||
3115 | /* | 3145 | /* |
3146 | * If the buffer is pinned then push on the log now so we won't | ||
3147 | * get stuck waiting in the write for too long. | ||
3148 | */ | ||
3149 | if (XFS_BUF_ISPINNED(bp)) | ||
3150 | xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); | ||
3151 | |||
3152 | /* | ||
3116 | * inode clustering: | 3153 | * inode clustering: |
3117 | * see if other inodes can be gathered into this write | 3154 | * see if other inodes can be gathered into this write |
3118 | */ | 3155 | */ |
@@ -3181,14 +3218,6 @@ xfs_iflush( | |||
3181 | XFS_STATS_ADD(xs_icluster_flushinode, clcount); | 3218 | XFS_STATS_ADD(xs_icluster_flushinode, clcount); |
3182 | } | 3219 | } |
3183 | 3220 | ||
3184 | /* | ||
3185 | * If the buffer is pinned then push on the log so we won't | ||
3186 | * get stuck waiting in the write for too long. | ||
3187 | */ | ||
3188 | if (XFS_BUF_ISPINNED(bp)){ | ||
3189 | xfs_log_force(mp, (xfs_lsn_t)0, XFS_LOG_FORCE); | ||
3190 | } | ||
3191 | |||
3192 | if (flags & INT_DELWRI) { | 3221 | if (flags & INT_DELWRI) { |
3193 | xfs_bdwrite(mp, bp); | 3222 | xfs_bdwrite(mp, bp); |
3194 | } else if (flags & INT_ASYNC) { | 3223 | } else if (flags & INT_ASYNC) { |