author		Dave Chinner <dchinner@redhat.com>	2012-10-08 06:56:04 -0400
committer	Ben Myers <bpm@sgi.com>	2012-10-17 13:01:25 -0400
commit		9aa05000f2b7cab4be582afba64af10b2d74727e (patch)
tree		530f939b017f5c5e8729edc28da0773c20b1986b
parent		cf2931db2d189ce0583be7ae880d7e3f8c15f623 (diff)
xfs: xfs_sync_data is redundant.
We don't do any data writeback from XFS any more - the VFS is completely
responsible for that, including for freeze. We can replace the remaining
caller with a VFS level function that achieves the same thing, but without
conflicting with current writeback work.

This means we can remove the flush_work and xfs_flush_inodes() - the VFS
functionality completely replaces the internal flush queue for doing this
writeback work in a separate context to avoid stack overruns.

This does have one complication - it cannot be called with page locks held.
Hence move the flushing of delalloc space when ENOSPC occurs back up into
xfs_file_aio_buffered_write when we don't hold any locks that will stall
writeback.

Unfortunately, writeback_inodes_sb_if_idle() is not sufficient to trigger
delalloc conversion fast enough to prevent spurious ENOSPC when there are
hundreds of writers, thousands of small files and GBs of free RAM. Hence we
need to use sync_inodes_sb() to block callers while we wait for writeback
like the previous xfs_flush_inodes implementation did.

That means we have to hold the s_umount lock here, but because this call can
nest inside i_mutex (the parent directory in the create case, held by the
VFS), we have to use down_read_trylock() to avoid potential deadlocks. In
practice, this trylock will succeed on almost every attempt as
unmount/remount type operations are exceedingly rare.

Note: we always need to pass a count of zero to generic_file_buffered_write()
as the previously written byte count. We only did this by accident before
this patch, by virtue of ret always being zero when there are no errors.
Make this explicit rather than needing to specifically zero ret in the
ENOSPC retry case.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Tested-by: Brian Foster <bfoster@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Ben Myers <bpm@sgi.com>
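For reference, a minimal sketch (not part of the patch, helper name is
illustrative) of the trylock-guarded sync_inodes_sb() pattern the message
describes; the authoritative code is the xfs_flush_inodes() hunk added to
fs/xfs/xfs_super.c in the diff below:

/*
 * Illustrative sketch only: block the caller on VFS writeback of every
 * dirty inode on the superblock, but only if s_umount can be taken
 * without sleeping, since the caller may already hold i_mutex (e.g. the
 * parent directory in the create path).
 */
#include <linux/fs.h>
#include <linux/writeback.h>

static void flush_sb_inodes_throttled(struct super_block *sb)
{
	if (down_read_trylock(&sb->s_umount)) {
		sync_inodes_sb(sb);	/* waits for IO, throttling callers */
		up_read(&sb->s_umount);
	}
}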
-rw-r--r--	fs/xfs/xfs_file.c	13
-rw-r--r--	fs/xfs/xfs_iomap.c	23
-rw-r--r--	fs/xfs/xfs_mount.h	1
-rw-r--r--	fs/xfs/xfs_super.c	21
-rw-r--r--	fs/xfs/xfs_super.h	1
-rw-r--r--	fs/xfs/xfs_sync.c	78
-rw-r--r--	fs/xfs/xfs_sync.h	3
-rw-r--r--	fs/xfs/xfs_vnodeops.c	2
8 files changed, 34 insertions, 108 deletions
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index aa473fa640a2..daf4066c24b2 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -728,16 +728,17 @@ xfs_file_buffered_aio_write(
 write_retry:
 	trace_xfs_file_buffered_write(ip, count, iocb->ki_pos, 0);
 	ret = generic_file_buffered_write(iocb, iovp, nr_segs,
-			pos, &iocb->ki_pos, count, ret);
+			pos, &iocb->ki_pos, count, 0);
+
 	/*
-	 * if we just got an ENOSPC, flush the inode now we aren't holding any
-	 * page locks and retry *once*
+	 * If we just got an ENOSPC, try to write back all dirty inodes to
+	 * convert delalloc space to free up some of the excess reserved
+	 * metadata space.
 	 */
 	if (ret == -ENOSPC && !enospc) {
 		enospc = 1;
-		ret = -xfs_flush_pages(ip, 0, -1, 0, FI_NONE);
-		if (!ret)
-			goto write_retry;
+		xfs_flush_inodes(ip->i_mount);
+		goto write_retry;
 	}
 
 	current->backing_dev_info = NULL;
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 973dff6ad935..f858b903678e 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -373,7 +373,7 @@ xfs_iomap_write_delay(
 	xfs_extlen_t	extsz;
 	int		nimaps;
 	xfs_bmbt_irec_t	imap[XFS_WRITE_IMAPS];
-	int		prealloc, flushed = 0;
+	int		prealloc;
 	int		error;
 
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
@@ -434,26 +434,17 @@ retry:
 	}
 
 	/*
-	 * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. For
-	 * ENOSPC, * flush all other inodes with delalloc blocks to free up
-	 * some of the excess reserved metadata space. For both cases, retry
-	 * without EOF preallocation.
+	 * If bmapi returned us nothing, we got either ENOSPC or EDQUOT. Retry
+	 * without EOF preallocation.
 	 */
 	if (nimaps == 0) {
 		trace_xfs_delalloc_enospc(ip, offset, count);
-		if (flushed)
-			return XFS_ERROR(error ? error : ENOSPC);
-
-		if (error == ENOSPC) {
-			xfs_iunlock(ip, XFS_ILOCK_EXCL);
-			xfs_flush_inodes(ip);
-			xfs_ilock(ip, XFS_ILOCK_EXCL);
-		}
-
-		flushed = 1;
-		error = 0;
-		prealloc = 0;
-		goto retry;
+		if (prealloc) {
+			prealloc = 0;
+			error = 0;
+			goto retry;
+		}
+		return XFS_ERROR(error ? error : ENOSPC);
 	}
 
 	if (!(imap[0].br_startblock || XFS_IS_REALTIME_INODE(ip)))
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 26e46aeaa3f1..a54b5aa498d4 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -198,7 +198,6 @@ typedef struct xfs_mount {
 #endif
 	struct xfs_mru_cache	*m_filestream;	/* per-mount filestream data */
 	struct delayed_work	m_reclaim_work;	/* background inode reclaim */
-	struct work_struct	m_flush_work;	/* background inode flush */
 	__int64_t		m_update_flags;	/* sb flags we need to update
 						   on the next remount,rw */
 	struct shrinker		m_inode_shrink;	/* inode reclaim shrinker */
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 37c39a155a58..9468c6878463 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -882,6 +882,24 @@ xfs_destroy_mount_workqueues(
 	destroy_workqueue(mp->m_unwritten_workqueue);
 }
 
+/*
+ * Flush all dirty data to disk. Must not be called while holding an XFS_ILOCK
+ * or a page lock. We use sync_inodes_sb() here to ensure we block while waiting
+ * for IO to complete so that we effectively throttle multiple callers to the
+ * rate at which IO is completing.
+ */
+void
+xfs_flush_inodes(
+	struct xfs_mount	*mp)
+{
+	struct super_block	*sb = mp->m_super;
+
+	if (down_read_trylock(&sb->s_umount)) {
+		sync_inodes_sb(sb);
+		up_read(&sb->s_umount);
+	}
+}
+
 /* Catch misguided souls that try to use this interface on XFS */
 STATIC struct inode *
 xfs_fs_alloc_inode(
@@ -1005,8 +1023,6 @@ xfs_fs_put_super(
 {
 	struct xfs_mount	*mp = XFS_M(sb);
 
-	cancel_work_sync(&mp->m_flush_work);
-
 	xfs_filestream_unmount(mp);
 	xfs_unmountfs(mp);
 
@@ -1324,7 +1340,6 @@ xfs_fs_fill_super(
 	spin_lock_init(&mp->m_sb_lock);
 	mutex_init(&mp->m_growlock);
 	atomic_set(&mp->m_active_trans, 0);
-	INIT_WORK(&mp->m_flush_work, xfs_flush_worker);
 	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
 
 	mp->m_super = sb;
diff --git a/fs/xfs/xfs_super.h b/fs/xfs/xfs_super.h
index 9de4a920ba05..bbe3d15a7904 100644
--- a/fs/xfs/xfs_super.h
+++ b/fs/xfs/xfs_super.h
@@ -74,6 +74,7 @@ struct block_device;
 
 extern __uint64_t xfs_max_file_offset(unsigned int);
 
+extern void xfs_flush_inodes(struct xfs_mount *mp);
 extern void xfs_blkdev_issue_flush(struct xfs_buftarg *);
 extern xfs_agnumber_t xfs_set_inode32(struct xfs_mount *);
 extern xfs_agnumber_t xfs_set_inode64(struct xfs_mount *);
diff --git a/fs/xfs/xfs_sync.c b/fs/xfs/xfs_sync.c
index 2174555aebb2..6a2ada379166 100644
--- a/fs/xfs/xfs_sync.c
+++ b/fs/xfs/xfs_sync.c
@@ -217,51 +217,6 @@ xfs_inode_ag_iterator(
 }
 
 STATIC int
-xfs_sync_inode_data(
-	struct xfs_inode	*ip,
-	struct xfs_perag	*pag,
-	int			flags)
-{
-	struct inode		*inode = VFS_I(ip);
-	struct address_space	*mapping = inode->i_mapping;
-	int			error = 0;
-
-	if (!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
-		return 0;
-
-	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED)) {
-		if (flags & SYNC_TRYLOCK)
-			return 0;
-		xfs_ilock(ip, XFS_IOLOCK_SHARED);
-	}
-
-	error = xfs_flush_pages(ip, 0, -1, (flags & SYNC_WAIT) ?
-				0 : XBF_ASYNC, FI_NONE);
-	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
-	return error;
-}
-
-/*
- * Write out pagecache data for the whole filesystem.
- */
-STATIC int
-xfs_sync_data(
-	struct xfs_mount	*mp,
-	int			flags)
-{
-	int			error;
-
-	ASSERT((flags & ~(SYNC_TRYLOCK|SYNC_WAIT)) == 0);
-
-	error = xfs_inode_ag_iterator(mp, xfs_sync_inode_data, flags);
-	if (error)
-		return XFS_ERROR(error);
-
-	xfs_log_force(mp, (flags & SYNC_WAIT) ? XFS_LOG_SYNC : 0);
-	return 0;
-}
-
-STATIC int
 xfs_sync_fsdata(
 	struct xfs_mount	*mp)
 {
@@ -415,39 +370,6 @@ xfs_reclaim_worker(
 	xfs_syncd_queue_reclaim(mp);
 }
 
-/*
- * Flush delayed allocate data, attempting to free up reserved space
- * from existing allocations. At this point a new allocation attempt
- * has failed with ENOSPC and we are in the process of scratching our
- * heads, looking about for more room.
- *
- * Queue a new data flush if there isn't one already in progress and
- * wait for completion of the flush. This means that we only ever have one
- * inode flush in progress no matter how many ENOSPC events are occurring and
- * so will prevent the system from bogging down due to every concurrent
- * ENOSPC event scanning all the active inodes in the system for writeback.
- */
-void
-xfs_flush_inodes(
-	struct xfs_inode	*ip)
-{
-	struct xfs_mount	*mp = ip->i_mount;
-
-	queue_work(xfs_syncd_wq, &mp->m_flush_work);
-	flush_work(&mp->m_flush_work);
-}
-
-void
-xfs_flush_worker(
-	struct work_struct *work)
-{
-	struct xfs_mount *mp = container_of(work,
-					struct xfs_mount, m_flush_work);
-
-	xfs_sync_data(mp, SYNC_TRYLOCK);
-	xfs_sync_data(mp, SYNC_TRYLOCK | SYNC_WAIT);
-}
-
 void
 __xfs_inode_set_reclaim_tag(
 	struct xfs_perag	*pag,
diff --git a/fs/xfs/xfs_sync.h b/fs/xfs/xfs_sync.h
index 8d58fab72a10..0018e846f0dc 100644
--- a/fs/xfs/xfs_sync.h
+++ b/fs/xfs/xfs_sync.h
@@ -26,14 +26,11 @@ struct xfs_perag;
 
 extern struct workqueue_struct *xfs_syncd_wq;	/* sync workqueue */
 
-void xfs_flush_worker(struct work_struct *work);
 void xfs_reclaim_worker(struct work_struct *work);
 
 int xfs_quiesce_data(struct xfs_mount *mp);
 void xfs_quiesce_attr(struct xfs_mount *mp);
 
-void xfs_flush_inodes(struct xfs_inode *ip);
-
 int xfs_reclaim_inodes(struct xfs_mount *mp, int mode);
 int xfs_reclaim_inodes_count(struct xfs_mount *mp);
 void xfs_reclaim_inodes_nr(struct xfs_mount *mp, int nr_to_scan);
diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c
index 2a5c637344b4..14928564f106 100644
--- a/fs/xfs/xfs_vnodeops.c
+++ b/fs/xfs/xfs_vnodeops.c
@@ -777,7 +777,7 @@ xfs_create(
 					XFS_TRANS_PERM_LOG_RES, log_count);
 	if (error == ENOSPC) {
 		/* flush outstanding delalloc blocks and retry */
-		xfs_flush_inodes(dp);
+		xfs_flush_inodes(mp);
 		error = xfs_trans_reserve(tp, resblks, log_res, 0,
 					XFS_TRANS_PERM_LOG_RES, log_count);
 	}