diff options
author | Filipe Manana <fdmanana@gmail.com> | 2014-05-09 12:17:40 -0400 |
---|---|---|
committer | Chris Mason <clm@fb.com> | 2014-06-09 20:20:20 -0400 |
commit | 61391d562229ed94899ed4b4973dc2f0c015292a (patch) | |
tree | d6bc4f162a64f6f1de0d2be96c911e1f1c37621a | |
parent | 1860e379875dfe7271c649058aeddffe5afd9d0d (diff) |
Btrfs: fix hang on error (such as ENOSPC) when writing extent pages
When running low on available disk space and having several processes
doing buffered file IO, I got the following trace in dmesg:
[ 4202.720152] INFO: task kworker/u8:1:5450 blocked for more than 120 seconds.
[ 4202.720401] Not tainted 3.13.0-fdm-btrfs-next-26+ #1
[ 4202.720596] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 4202.720874] kworker/u8:1 D 0000000000000001 0 5450 2 0x00000000
[ 4202.720904] Workqueue: btrfs-flush_delalloc normal_work_helper [btrfs]
[ 4202.720908] ffff8801f62ddc38 0000000000000082 ffff880203ac2490 00000000001d3f40
[ 4202.720913] ffff8801f62ddfd8 00000000001d3f40 ffff8800c4f0c920 ffff880203ac2490
[ 4202.720918] 00000000001d4a40 ffff88020fe85a40 ffff88020fe85ab8 0000000000000001
[ 4202.720922] Call Trace:
[ 4202.720931] [<ffffffff816a3cb9>] schedule+0x29/0x70
[ 4202.720950] [<ffffffffa01ec48d>] btrfs_start_ordered_extent+0x6d/0x110 [btrfs]
[ 4202.720956] [<ffffffff8108e620>] ? bit_waitqueue+0xc0/0xc0
[ 4202.720972] [<ffffffffa01ec559>] btrfs_run_ordered_extent_work+0x29/0x40 [btrfs]
[ 4202.720988] [<ffffffffa0201987>] normal_work_helper+0x137/0x2c0 [btrfs]
[ 4202.720994] [<ffffffff810680e5>] process_one_work+0x1f5/0x530
(...)
[ 4202.721027] 2 locks held by kworker/u8:1/5450:
[ 4202.721028] #0: (%s-%s){++++..}, at: [<ffffffff81068083>] process_one_work+0x193/0x530
[ 4202.721037] #1: ((&work->normal_work)){+.+...}, at: [<ffffffff81068083>] process_one_work+0x193/0x530
[ 4202.721054] INFO: task btrfs:7891 blocked for more than 120 seconds.
[ 4202.721258] Not tainted 3.13.0-fdm-btrfs-next-26+ #1
[ 4202.721444] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message.
[ 4202.721699] btrfs D 0000000000000001 0 7891 7890 0x00000001
[ 4202.721704] ffff88018c2119e8 0000000000000086 ffff8800a33d2490 00000000001d3f40
[ 4202.721710] ffff88018c211fd8 00000000001d3f40 ffff8802144b0000 ffff8800a33d2490
[ 4202.721714] ffff8800d8576640 ffff88020fe85bc0 ffff88020fe85bc8 7fffffffffffffff
[ 4202.721718] Call Trace:
[ 4202.721723] [<ffffffff816a3cb9>] schedule+0x29/0x70
[ 4202.721727] [<ffffffff816a2ebc>] schedule_timeout+0x1dc/0x270
[ 4202.721732] [<ffffffff8109bd79>] ? mark_held_locks+0xb9/0x140
[ 4202.721736] [<ffffffff816a90c0>] ? _raw_spin_unlock_irq+0x30/0x40
[ 4202.721740] [<ffffffff8109bf0d>] ? trace_hardirqs_on_caller+0x10d/0x1d0
[ 4202.721744] [<ffffffff816a488f>] wait_for_completion+0xdf/0x120
[ 4202.721749] [<ffffffff8107fa90>] ? try_to_wake_up+0x310/0x310
[ 4202.721765] [<ffffffffa01ebee4>] btrfs_wait_ordered_extents+0x1f4/0x280 [btrfs]
[ 4202.721781] [<ffffffffa020526e>] btrfs_mksubvol.isra.62+0x30e/0x5a0 [btrfs]
[ 4202.721786] [<ffffffff8108e620>] ? bit_waitqueue+0xc0/0xc0
[ 4202.721799] [<ffffffffa02056a9>] btrfs_ioctl_snap_create_transid+0x1a9/0x1b0 [btrfs]
[ 4202.721813] [<ffffffffa020583a>] btrfs_ioctl_snap_create_v2+0x10a/0x170 [btrfs]
(...)
It turns out that extent_io.c:__extent_writepage(), which ends up being called
through filemap_fdatawrite_range() in btrfs_start_ordered_extent(), was getting
-ENOSPC when calling the fill_delalloc callback. In this situation, it returned
without the writepage_end_io_hook callback (inode.c:btrfs_writepage_end_io_hook)
ever being called for the respective page, which prevents the ordered extent's
bytes_left count from ever reaching 0, and therefore a finish_ordered_fn work
is never queued into the endio_write_workers queue. This makes the task that
called btrfs_start_ordered_extent() hang forever on the wait queue of the ordered
extent.
This is fairly easy to reproduce using a small filesystem and fsstress on
a quad-core VM:
mkfs.btrfs -f -b `expr 2100 \* 1024 \* 1024` /dev/sdd
mount /dev/sdd /mnt
fsstress -p 6 -d /mnt -n 100000 -x \
"btrfs subvolume snapshot -r /mnt /mnt/mysnap" \
-f allocsp=0 \
-f bulkstat=0 \
-f bulkstat1=0 \
-f chown=0 \
-f creat=1 \
-f dread=0 \
-f dwrite=0 \
-f fallocate=1 \
-f fdatasync=0 \
-f fiemap=0 \
-f freesp=0 \
-f fsync=0 \
-f getattr=0 \
-f getdents=0 \
-f link=0 \
-f mkdir=0 \
-f mknod=0 \
-f punch=1 \
-f read=0 \
-f readlink=0 \
-f rename=0 \
-f resvsp=0 \
-f rmdir=0 \
-f setxattr=0 \
-f stat=0 \
-f symlink=0 \
-f sync=0 \
-f truncate=1 \
-f unlink=0 \
-f unresvsp=0 \
-f write=4
So just ensure that if an error happens while writing the extent page
we call the writepage_end_io_hook callback. Also make __extent_writepage()
return the error code and ensure the caller (extent_write_cache_pages)
processes all pages in the page vector even if an error happens only for
some of them, so that ordered extents end up released.
Signed-off-by: Filipe David Borba Manana <fdmanana@gmail.com>
Signed-off-by: Chris Mason <clm@fb.com>
-rw-r--r-- | fs/btrfs/extent_io.c | 16 |
1 files changed, 11 insertions, 5 deletions
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 3955e475ceec..fa31c8d2c095 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c | |||
@@ -3278,6 +3278,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, | |||
3278 | end - cur + 1, 1); | 3278 | end - cur + 1, 1); |
3279 | if (IS_ERR_OR_NULL(em)) { | 3279 | if (IS_ERR_OR_NULL(em)) { |
3280 | SetPageError(page); | 3280 | SetPageError(page); |
3281 | ret = PTR_ERR_OR_ZERO(em); | ||
3281 | break; | 3282 | break; |
3282 | } | 3283 | } |
3283 | 3284 | ||
@@ -3364,13 +3365,17 @@ done: | |||
3364 | set_page_writeback(page); | 3365 | set_page_writeback(page); |
3365 | end_page_writeback(page); | 3366 | end_page_writeback(page); |
3366 | } | 3367 | } |
3368 | if (PageError(page)) { | ||
3369 | ret = ret < 0 ? ret : -EIO; | ||
3370 | end_extent_writepage(page, ret, start, page_end); | ||
3371 | } | ||
3367 | unlock_page(page); | 3372 | unlock_page(page); |
3368 | 3373 | ||
3369 | done_unlocked: | 3374 | done_unlocked: |
3370 | 3375 | ||
3371 | /* drop our reference on any cached states */ | 3376 | /* drop our reference on any cached states */ |
3372 | free_extent_state(cached_state); | 3377 | free_extent_state(cached_state); |
3373 | return 0; | 3378 | return ret; |
3374 | } | 3379 | } |
3375 | 3380 | ||
3376 | static int eb_wait(void *word) | 3381 | static int eb_wait(void *word) |
@@ -3690,6 +3695,7 @@ static int extent_write_cache_pages(struct extent_io_tree *tree, | |||
3690 | struct inode *inode = mapping->host; | 3695 | struct inode *inode = mapping->host; |
3691 | int ret = 0; | 3696 | int ret = 0; |
3692 | int done = 0; | 3697 | int done = 0; |
3698 | int err = 0; | ||
3693 | int nr_to_write_done = 0; | 3699 | int nr_to_write_done = 0; |
3694 | struct pagevec pvec; | 3700 | struct pagevec pvec; |
3695 | int nr_pages; | 3701 | int nr_pages; |
@@ -3776,8 +3782,8 @@ retry: | |||
3776 | unlock_page(page); | 3782 | unlock_page(page); |
3777 | ret = 0; | 3783 | ret = 0; |
3778 | } | 3784 | } |
3779 | if (ret) | 3785 | if (!err && ret < 0) |
3780 | done = 1; | 3786 | err = ret; |
3781 | 3787 | ||
3782 | /* | 3788 | /* |
3783 | * the filesystem may choose to bump up nr_to_write. | 3789 | * the filesystem may choose to bump up nr_to_write. |
@@ -3789,7 +3795,7 @@ retry: | |||
3789 | pagevec_release(&pvec); | 3795 | pagevec_release(&pvec); |
3790 | cond_resched(); | 3796 | cond_resched(); |
3791 | } | 3797 | } |
3792 | if (!scanned && !done) { | 3798 | if (!scanned && !done && !err) { |
3793 | /* | 3799 | /* |
3794 | * We hit the last page and there is more work to be done: wrap | 3800 | * We hit the last page and there is more work to be done: wrap |
3795 | * back to the start of the file | 3801 | * back to the start of the file |
@@ -3799,7 +3805,7 @@ retry: | |||
3799 | goto retry; | 3805 | goto retry; |
3800 | } | 3806 | } |
3801 | btrfs_add_delayed_iput(inode); | 3807 | btrfs_add_delayed_iput(inode); |
3802 | return ret; | 3808 | return err; |
3803 | } | 3809 | } |
3804 | 3810 | ||
3805 | static void flush_epd_write_bio(struct extent_page_data *epd) | 3811 | static void flush_epd_write_bio(struct extent_page_data *epd) |