about summary refs log tree commit diff stats
path: root/fs/xfs
diff options
context:
space:
mode:
authorDarrick J. Wong <darrick.wong@oracle.com>2019-04-15 16:13:20 -0400
committerDarrick J. Wong <darrick.wong@oracle.com>2019-04-16 13:01:57 -0400
commitcb357bf3d105f68ff5a5adcf89f1b285da675e2f (patch)
tree58bc5720f5fcfd8a684e707f7dc257b5321496fb /fs/xfs
parent4fb7951fde64985bad80dcd2d721430ba584f125 (diff)
xfs: implement per-inode writeback completion queues
When scheduling writeback of dirty file data in the page cache, XFS uses IO completion workqueue items to ensure that filesystem metadata only updates after the write completes successfully. This is essential for converting unwritten extents to real extents at the right time and performing COW remappings.

Unfortunately, XFS queues each IO completion work item to an unbounded workqueue, which means that the kernel can spawn dozens of threads to try to handle the items quickly. These threads need to take the ILOCK to update file metadata, which results in heavy ILOCK contention if a large number of the work items target a single file, which is inefficient.

Worse yet, the writeback completion threads get stuck waiting for the ILOCK while holding transaction reservations, which can use up all available log reservation space. When that happens, metadata updates to other parts of the filesystem grind to a halt, even if the filesystem could otherwise have handled it.

Even worse, if one of the things grinding to a halt happens to be a thread in the middle of a defer-ops finish holding the same ILOCK and trying to obtain more log reservation having exhausted the permanent reservation, we now have an ABBA deadlock - writeback completion has a transaction reserved and wants the ILOCK, and someone else has the ILOCK and wants a transaction reservation.

Therefore, we create a per-inode writeback io completion queue + work item. When writeback finishes, it can add the ioend to the per-inode queue and let the single worker item process that queue. This dramatically cuts down on the number of kworkers and ILOCK contention in the system, and seems to have eliminated an occasional deadlock I was seeing while running generic/476.

Testing with a program that simulates a heavy random-write workload to a single file demonstrates that the number of kworkers drops from approximately 120 threads per file to 1, without dramatically changing write bandwidth or pagecache access latency.
Note that we leave the xfs-conv workqueue's max_active alone because we still want to be able to run ioend processing for as many inodes as the system can handle.

Signed-off-by: Darrick J. Wong <darrick.wong@oracle.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Diffstat (limited to 'fs/xfs')
-rw-r--r--fs/xfs/xfs_aops.c49
-rw-r--r--fs/xfs/xfs_aops.h1
-rw-r--r--fs/xfs/xfs_icache.c3
-rw-r--r--fs/xfs/xfs_inode.h7
4 files changed, 48 insertions, 12 deletions
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 3619e9e8d359..f5800620c78c 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -234,11 +234,9 @@ xfs_setfilesize_ioend(
234 * IO write completion. 234 * IO write completion.
235 */ 235 */
236STATIC void 236STATIC void
237xfs_end_io( 237xfs_end_ioend(
238 struct work_struct *work) 238 struct xfs_ioend *ioend)
239{ 239{
240 struct xfs_ioend *ioend =
241 container_of(work, struct xfs_ioend, io_work);
242 struct xfs_inode *ip = XFS_I(ioend->io_inode); 240 struct xfs_inode *ip = XFS_I(ioend->io_inode);
243 xfs_off_t offset = ioend->io_offset; 241 xfs_off_t offset = ioend->io_offset;
244 size_t size = ioend->io_size; 242 size_t size = ioend->io_size;
@@ -278,19 +276,49 @@ done:
278 xfs_destroy_ioend(ioend, error); 276 xfs_destroy_ioend(ioend, error);
279} 277}
280 278
279/* Finish all pending io completions. */
280void
281xfs_end_io(
282 struct work_struct *work)
283{
284 struct xfs_inode *ip;
285 struct xfs_ioend *ioend;
286 struct list_head completion_list;
287 unsigned long flags;
288
289 ip = container_of(work, struct xfs_inode, i_ioend_work);
290
291 spin_lock_irqsave(&ip->i_ioend_lock, flags);
292 list_replace_init(&ip->i_ioend_list, &completion_list);
293 spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
294
295 while (!list_empty(&completion_list)) {
296 ioend = list_first_entry(&completion_list, struct xfs_ioend,
297 io_list);
298 list_del_init(&ioend->io_list);
299 xfs_end_ioend(ioend);
300 }
301}
302
281STATIC void 303STATIC void
282xfs_end_bio( 304xfs_end_bio(
283 struct bio *bio) 305 struct bio *bio)
284{ 306{
285 struct xfs_ioend *ioend = bio->bi_private; 307 struct xfs_ioend *ioend = bio->bi_private;
286 struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; 308 struct xfs_inode *ip = XFS_I(ioend->io_inode);
309 struct xfs_mount *mp = ip->i_mount;
310 unsigned long flags;
287 311
288 if (ioend->io_fork == XFS_COW_FORK || 312 if (ioend->io_fork == XFS_COW_FORK ||
289 ioend->io_state == XFS_EXT_UNWRITTEN) 313 ioend->io_state == XFS_EXT_UNWRITTEN ||
290 queue_work(mp->m_unwritten_workqueue, &ioend->io_work); 314 ioend->io_append_trans != NULL) {
291 else if (ioend->io_append_trans) 315 spin_lock_irqsave(&ip->i_ioend_lock, flags);
292 queue_work(mp->m_data_workqueue, &ioend->io_work); 316 if (list_empty(&ip->i_ioend_list))
293 else 317 WARN_ON_ONCE(!queue_work(mp->m_unwritten_workqueue,
318 &ip->i_ioend_work));
319 list_add_tail(&ioend->io_list, &ip->i_ioend_list);
320 spin_unlock_irqrestore(&ip->i_ioend_lock, flags);
321 } else
294 xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status)); 322 xfs_destroy_ioend(ioend, blk_status_to_errno(bio->bi_status));
295} 323}
296 324
@@ -594,7 +622,6 @@ xfs_alloc_ioend(
594 ioend->io_inode = inode; 622 ioend->io_inode = inode;
595 ioend->io_size = 0; 623 ioend->io_size = 0;
596 ioend->io_offset = offset; 624 ioend->io_offset = offset;
597 INIT_WORK(&ioend->io_work, xfs_end_io);
598 ioend->io_append_trans = NULL; 625 ioend->io_append_trans = NULL;
599 ioend->io_bio = bio; 626 ioend->io_bio = bio;
600 return ioend; 627 return ioend;
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 6c2615b83c5d..f62b03186c62 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -18,7 +18,6 @@ struct xfs_ioend {
18 struct inode *io_inode; /* file being written to */ 18 struct inode *io_inode; /* file being written to */
19 size_t io_size; /* size of the extent */ 19 size_t io_size; /* size of the extent */
20 xfs_off_t io_offset; /* offset in the file */ 20 xfs_off_t io_offset; /* offset in the file */
21 struct work_struct io_work; /* xfsdatad work queue */
22 struct xfs_trans *io_append_trans;/* xact. for size update */ 21 struct xfs_trans *io_append_trans;/* xact. for size update */
23 struct bio *io_bio; /* bio being built */ 22 struct bio *io_bio; /* bio being built */
24 struct bio io_inline_bio; /* MUST BE LAST! */ 23 struct bio io_inline_bio; /* MUST BE LAST! */
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index f93924d26630..69ad1f33b67e 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -72,6 +72,9 @@ xfs_inode_alloc(
72 memset(&ip->i_d, 0, sizeof(ip->i_d)); 72 memset(&ip->i_d, 0, sizeof(ip->i_d));
73 ip->i_sick = 0; 73 ip->i_sick = 0;
74 ip->i_checked = 0; 74 ip->i_checked = 0;
75 INIT_WORK(&ip->i_ioend_work, xfs_end_io);
76 INIT_LIST_HEAD(&ip->i_ioend_list);
77 spin_lock_init(&ip->i_ioend_lock);
75 78
76 return ip; 79 return ip;
77} 80}
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 7bb1961918de..87e701b638ae 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -65,6 +65,11 @@ typedef struct xfs_inode {
65 65
66 /* VFS inode */ 66 /* VFS inode */
67 struct inode i_vnode; /* embedded VFS inode */ 67 struct inode i_vnode; /* embedded VFS inode */
68
69 /* pending io completions */
70 spinlock_t i_ioend_lock;
71 struct work_struct i_ioend_work;
72 struct list_head i_ioend_list;
68} xfs_inode_t; 73} xfs_inode_t;
69 74
70/* Convert from vfs inode to xfs inode */ 75/* Convert from vfs inode to xfs inode */
@@ -511,4 +516,6 @@ bool xfs_inode_verify_forks(struct xfs_inode *ip);
511int xfs_iunlink_init(struct xfs_perag *pag); 516int xfs_iunlink_init(struct xfs_perag *pag);
512void xfs_iunlink_destroy(struct xfs_perag *pag); 517void xfs_iunlink_destroy(struct xfs_perag *pag);
513 518
519void xfs_end_io(struct work_struct *work);
520
514#endif /* __XFS_INODE_H__ */ 521#endif /* __XFS_INODE_H__ */