aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/inode.c
diff options
context:
space:
mode:
authorLiu Bo <bo.li.liu@oracle.com>2014-08-15 11:36:53 -0400
committerChris Mason <clm@fb.com>2014-08-24 10:17:02 -0400
commit9e0af23764344f7f1b68e4eefbe7dc865018b63d (patch)
treeecea2b1c13165160a5829aec21a91a4b5746b8b8 /fs/btrfs/inode.c
parentf6dc45c7a93a011dff6eb9b2ffda59c390c7705a (diff)
Btrfs: fix task hang under heavy compressed write
This has been reported and discussed for a long time, and this hang occurs in both 3.15 and 3.16. Btrfs now migrates to use kernel workqueue, but it introduces this hang problem. Btrfs has a kind of work queued as an ordered way, which means that its ordered_func() must be processed in the way of FIFO, so it usually looks like -- normal_work_helper(arg) work = container_of(arg, struct btrfs_work, normal_work); work->func() <---- (we name it work X) for ordered_work in wq->ordered_list ordered_work->ordered_func() ordered_work->ordered_free() The hang is a rare case, first when we find free space, we get an uncached block group, then we go to read its free space cache inode for free space information, so it will file a readahead request btrfs_readpages() for page that is not in page cache __do_readpage() submit_extent_page() btrfs_submit_bio_hook() btrfs_bio_wq_end_io() submit_bio() end_workqueue_bio() <--(ret by the 1st endio) queue a work(named work Y) for the 2nd also the real endio() So the hang occurs when work Y's work_struct and work X's work_struct happens to share the same address. A bit more explanation, A,B,C -- struct btrfs_work arg -- struct work_struct kthread: worker_thread() pick up a work_struct from @worklist process_one_work(arg) worker->current_work = arg; <-- arg is A->normal_work worker->current_func(arg) normal_work_helper(arg) A = container_of(arg, struct btrfs_work, normal_work); A->func() A->ordered_func() A->ordered_free() <-- A gets freed B->ordered_func() submit_compressed_extents() find_free_extent() load_free_space_inode() ... <-- (the above readhead stack) end_workqueue_bio() btrfs_queue_work(work C) B->ordered_free() As if work A has a high priority in wq->ordered_list and there are more ordered works queued after it, such as B->ordered_func(), its memory could have been freed before normal_work_helper() returns, which means that kernel workqueue code worker_thread() still has worker->current_work pointer to be work A->normal_work's, ie. arg's address. Meanwhile, work C is allocated after work A is freed, work C->normal_work and work A->normal_work are likely to share the same address(I confirmed this with ftrace output, so I'm not just guessing, it's rare though). When another kthread picks up work C->normal_work to process, and finds our kthread is processing it(see find_worker_executing_work()), it'll think work C as a collision and skip then, which ends up nobody processing work C. So the situation is that our kthread is waiting forever on work C. Besides, there're other cases that can lead to deadlock, but the real problem is that all btrfs workqueue shares one work->func, -- normal_work_helper, so this makes each workqueue to have its own helper function, but only a wraper pf normal_work_helper. With this patch, I no long hit the above hang. Signed-off-by: Liu Bo <bo.li.liu@oracle.com> Signed-off-by: Chris Mason <clm@fb.com>
Diffstat (limited to 'fs/btrfs/inode.c')
-rw-r--r--fs/btrfs/inode.c35
1 files changed, 23 insertions, 12 deletions
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index ae98df67950f..3d020d6d9ace 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1096,8 +1096,10 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
1096 async_cow->end = cur_end; 1096 async_cow->end = cur_end;
1097 INIT_LIST_HEAD(&async_cow->extents); 1097 INIT_LIST_HEAD(&async_cow->extents);
1098 1098
1099 btrfs_init_work(&async_cow->work, async_cow_start, 1099 btrfs_init_work(&async_cow->work,
1100 async_cow_submit, async_cow_free); 1100 btrfs_delalloc_helper,
1101 async_cow_start, async_cow_submit,
1102 async_cow_free);
1101 1103
1102 nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >> 1104 nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >>
1103 PAGE_CACHE_SHIFT; 1105 PAGE_CACHE_SHIFT;
@@ -1881,7 +1883,8 @@ static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end)
1881 1883
1882 SetPageChecked(page); 1884 SetPageChecked(page);
1883 page_cache_get(page); 1885 page_cache_get(page);
1884 btrfs_init_work(&fixup->work, btrfs_writepage_fixup_worker, NULL, NULL); 1886 btrfs_init_work(&fixup->work, btrfs_fixup_helper,
1887 btrfs_writepage_fixup_worker, NULL, NULL);
1885 fixup->page = page; 1888 fixup->page = page;
1886 btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work); 1889 btrfs_queue_work(root->fs_info->fixup_workers, &fixup->work);
1887 return -EBUSY; 1890 return -EBUSY;
@@ -2822,7 +2825,8 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
2822 struct inode *inode = page->mapping->host; 2825 struct inode *inode = page->mapping->host;
2823 struct btrfs_root *root = BTRFS_I(inode)->root; 2826 struct btrfs_root *root = BTRFS_I(inode)->root;
2824 struct btrfs_ordered_extent *ordered_extent = NULL; 2827 struct btrfs_ordered_extent *ordered_extent = NULL;
2825 struct btrfs_workqueue *workers; 2828 struct btrfs_workqueue *wq;
2829 btrfs_work_func_t func;
2826 2830
2827 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); 2831 trace_btrfs_writepage_end_io_hook(page, start, end, uptodate);
2828 2832
@@ -2831,13 +2835,17 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
2831 end - start + 1, uptodate)) 2835 end - start + 1, uptodate))
2832 return 0; 2836 return 0;
2833 2837
2834 btrfs_init_work(&ordered_extent->work, finish_ordered_fn, NULL, NULL); 2838 if (btrfs_is_free_space_inode(inode)) {
2839 wq = root->fs_info->endio_freespace_worker;
2840 func = btrfs_freespace_write_helper;
2841 } else {
2842 wq = root->fs_info->endio_write_workers;
2843 func = btrfs_endio_write_helper;
2844 }
2835 2845
2836 if (btrfs_is_free_space_inode(inode)) 2846 btrfs_init_work(&ordered_extent->work, func, finish_ordered_fn, NULL,
2837 workers = root->fs_info->endio_freespace_worker; 2847 NULL);
2838 else 2848 btrfs_queue_work(wq, &ordered_extent->work);
2839 workers = root->fs_info->endio_write_workers;
2840 btrfs_queue_work(workers, &ordered_extent->work);
2841 2849
2842 return 0; 2850 return 0;
2843} 2851}
@@ -7208,7 +7216,8 @@ again:
7208 if (!ret) 7216 if (!ret)
7209 goto out_test; 7217 goto out_test;
7210 7218
7211 btrfs_init_work(&ordered->work, finish_ordered_fn, NULL, NULL); 7219 btrfs_init_work(&ordered->work, btrfs_endio_write_helper,
7220 finish_ordered_fn, NULL, NULL);
7212 btrfs_queue_work(root->fs_info->endio_write_workers, 7221 btrfs_queue_work(root->fs_info->endio_write_workers,
7213 &ordered->work); 7222 &ordered->work);
7214out_test: 7223out_test:
@@ -8535,7 +8544,9 @@ struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode,
8535 work->inode = inode; 8544 work->inode = inode;
8536 work->wait = wait; 8545 work->wait = wait;
8537 work->delay_iput = delay_iput; 8546 work->delay_iput = delay_iput;
8538 btrfs_init_work(&work->work, btrfs_run_delalloc_work, NULL, NULL); 8547 WARN_ON_ONCE(!inode);
8548 btrfs_init_work(&work->work, btrfs_flush_delalloc_helper,
8549 btrfs_run_delalloc_work, NULL, NULL);
8539 8550
8540 return work; 8551 return work;
8541} 8552}