author     Filipe Manana <fdmanana@suse.com>    2015-09-14 04:09:31 -0400
committer  Filipe Manana <fdmanana@suse.com>    2015-09-14 19:59:31 -0400
commit     005efedf2c7d0a270ffbe28d8997b03844f3e3e7
tree       bcbe8285bb18cf376c7988a13f4b35c207d58cd2
parent     85e0a0f21a14bfd9145422a6a627c3df47101bd8
Btrfs: fix read corruption of compressed and shared extents
If a file has a range pointing to a compressed extent, followed by another
range that points to the same compressed extent, and a read operation
attempts to read both ranges (either completely or part of them), the pages
that correspond to the second range are incorrectly filled with zeroes.

Consider the following example:

      File layout
      [0 - 8K]                       [8K - 24K]
        |                                |
        |                                |
      points to extent X,            points to extent X,
      offset 4K, length of 8K        offset 0, length 16K

      [extent X, compressed length = 4K uncompressed length = 16K]

If a readpages() call spans the 2 ranges, a single bio to read the extent is
submitted - extent_io.c:submit_extent_page() would only create a new bio to
cover the second range pointing to the extent if the extent it points to had
a different logical address than the extent associated with the first range.
As a consequence, the compressed read end io handler
(compression.c:end_compressed_bio_read()) finishes once the extent is
decompressed into the pages covering the first range, leaving the remaining
pages (belonging to the second range) filled with zeroes (done by
compression.c:btrfs_clear_biovec_end()).

So fix this by submitting the current bio whenever we find a range pointing
to a compressed extent that was preceded by a range with a different extent
map. This is the simplest solution for this corner case. Making the end io
callback populate both ranges (or more, if we have multiple ranges pointing
to the same extent) is a much more complex solution, since each bio is
tightly coupled with a single extent map and the extent maps associated with
the ranges pointing to the shared extent can have different offsets and
lengths.

The following test case for fstests triggers the issue:

  seq=`basename $0`
  seqres=$RESULT_DIR/$seq
  echo "QA output created by $seq"
  tmp=/tmp/$$
  status=1	# failure is the default!
  trap "_cleanup; exit \$status" 0 1 2 3 15

  _cleanup()
  {
      rm -f $tmp.*
  }

  # get standard environment, filters and checks
  . ./common/rc
  . ./common/filter

  # real QA test starts here
  _need_to_be_root
  _supported_fs btrfs
  _supported_os Linux
  _require_scratch
  _require_cloner

  rm -f $seqres.full

  test_clone_and_read_compressed_extent()
  {
      local mount_opts=$1

      _scratch_mkfs >>$seqres.full 2>&1
      _scratch_mount $mount_opts

      # Create a test file with a single extent that is compressed (the
      # data we write into it is highly compressible no matter which
      # compression algorithm is used, zlib or lzo).
      $XFS_IO_PROG -f -c "pwrite -S 0xaa 0K 4K" \
                      -c "pwrite -S 0xbb 4K 8K" \
                      -c "pwrite -S 0xcc 12K 4K" \
                      $SCRATCH_MNT/foo | _filter_xfs_io

      # Now clone our extent into an adjacent offset.
      $CLONER_PROG -s $((4 * 1024)) -d $((16 * 1024)) -l $((8 * 1024)) \
          $SCRATCH_MNT/foo $SCRATCH_MNT/foo

      # Same as before but for this file we clone the extent into a lower
      # file offset.
      $XFS_IO_PROG -f -c "pwrite -S 0xaa 8K 4K" \
                      -c "pwrite -S 0xbb 12K 8K" \
                      -c "pwrite -S 0xcc 20K 4K" \
                      $SCRATCH_MNT/bar | _filter_xfs_io

      $CLONER_PROG -s $((12 * 1024)) -d 0 -l $((8 * 1024)) \
          $SCRATCH_MNT/bar $SCRATCH_MNT/bar

      echo "File digests before unmounting filesystem:"
      md5sum $SCRATCH_MNT/foo | _filter_scratch
      md5sum $SCRATCH_MNT/bar | _filter_scratch

      # Evicting the inode or clearing the page cache before reading
      # again the file would also trigger the bug - reads were returning
      # all bytes in the range corresponding to the second reference to
      # the extent with a value of 0, but the correct data was persisted
      # (it was a bug exclusively in the read path). The issue happened
      # only if the same readpages() call targeted pages belonging to the
      # first and second ranges that point to the same compressed extent.
      _scratch_remount

      echo "File digests after mounting filesystem again:"
      # Must match the same digests we got before.
      md5sum $SCRATCH_MNT/foo | _filter_scratch
      md5sum $SCRATCH_MNT/bar | _filter_scratch
  }

  echo -e "\nTesting with zlib compression..."
  test_clone_and_read_compressed_extent "-o compress=zlib"

  _scratch_unmount

  echo -e "\nTesting with lzo compression..."
  test_clone_and_read_compressed_extent "-o compress=lzo"

  status=0
  exit

Cc: stable@vger.kernel.org
Signed-off-by: Filipe Manana <fdmanana@suse.com>
Reviewed-by: Qu Wenruo <quwenruo@cn.fujitsu.com>
Reviewed-by: Liu Bo <bo.li.liu@oracle.com>
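[Editor's note] A rough manual reproduction outside the fstests harness could look like the sketch below. It is not part of the original commit: /dev/sdX and /mnt are placeholders, and it assumes an xfs_io built with the "reflink" command (which performs the same clone operation as the test's $CLONER_PROG).

  mkfs.btrfs -f /dev/sdX
  mount -o compress=zlib /dev/sdX /mnt

  # Write a 16K file backed by a single compressed extent, same layout as
  # the test's "foo" file.
  xfs_io -f -c "pwrite -S 0xaa 0 4096" \
            -c "pwrite -S 0xbb 4096 8192" \
            -c "pwrite -S 0xcc 12288 4096" /mnt/foo

  # Clone 8K starting at offset 4K of the extent to offset 16K, so two
  # consecutive file ranges end up pointing to the same compressed extent.
  xfs_io -c "reflink /mnt/foo 4096 16384 8192" /mnt/foo

  md5sum /mnt/foo                    # data still cached, digest is correct
  echo 3 > /proc/sys/vm/drop_caches  # force the next read to hit the disk
  md5sum /mnt/foo                    # without the fix, this digest differs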
 fs/btrfs/extent_io.c | 65
 1 file changed, 57 insertions(+), 8 deletions(-)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index fa19f2f68c1b..11aa8f743b90 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2805,7 +2805,8 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
                              bio_end_io_t end_io_func,
                              int mirror_num,
                              unsigned long prev_bio_flags,
-                             unsigned long bio_flags)
+                             unsigned long bio_flags,
+                             bool force_bio_submit)
 {
        int ret = 0;
        struct bio *bio;
@@ -2823,6 +2824,7 @@ static int submit_extent_page(int rw, struct extent_io_tree *tree,
                contig = bio_end_sector(bio) == sector;

                if (prev_bio_flags != bio_flags || !contig ||
+                   force_bio_submit ||
                    merge_bio(rw, tree, page, offset, page_size, bio, bio_flags) ||
                    bio_add_page(bio, page, page_size, offset) < page_size) {
                        ret = submit_one_bio(rw, bio, mirror_num,
@@ -2922,7 +2924,8 @@ static int __do_readpage(struct extent_io_tree *tree,
                         get_extent_t *get_extent,
                         struct extent_map **em_cached,
                         struct bio **bio, int mirror_num,
-                        unsigned long *bio_flags, int rw)
+                        unsigned long *bio_flags, int rw,
+                        u64 *prev_em_start)
 {
        struct inode *inode = page->mapping->host;
        u64 start = page_offset(page);
@@ -2970,6 +2973,7 @@ static int __do_readpage(struct extent_io_tree *tree,
        }
        while (cur <= end) {
                unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
+               bool force_bio_submit = false;

                if (cur >= last_byte) {
                        char *userpage;
@@ -3020,6 +3024,49 @@ static int __do_readpage(struct extent_io_tree *tree,
                block_start = em->block_start;
                if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
                        block_start = EXTENT_MAP_HOLE;
+
+               /*
+                * If we have a file range that points to a compressed extent
+                * and it's followed by a consecutive file range that points
+                * to the same compressed extent (possibly with a different
+                * offset and/or length, so it either points to the whole extent
+                * or only part of it), we must make sure we do not submit a
+                * single bio to populate the pages for the 2 ranges because
+                * this makes the compressed extent read zero out the pages
+                * belonging to the 2nd range. Imagine the following scenario:
+                *
+                *  File layout
+                *  [0 - 8K]                     [8K - 24K]
+                *    |                               |
+                *    |                               |
+                * points to extent X,         points to extent X,
+                * offset 4K, length of 8K     offset 0, length 16K
+                *
+                * [extent X, compressed length = 4K uncompressed length = 16K]
+                *
+                * If the bio to read the compressed extent covers both ranges,
+                * it will decompress extent X into the pages belonging to the
+                * first range and then it will stop, zeroing out the remaining
+                * pages that belong to the other range that points to extent X.
+                * So here we make sure we submit 2 bios, one for the first
+                * range and another one for the second range. Both will target
+                * the same physical extent from disk, but we can't currently
+                * make the compressed bio endio callback populate the pages
+                * for both ranges because each compressed bio is tightly
+                * coupled with a single extent map, and each range can have
+                * an extent map with a different offset value relative to the
+                * uncompressed data of our extent and different lengths. This
+                * is a corner case so we prioritize correctness over
+                * non-optimal behavior (submitting 2 bios for the same extent).
+                */
+               if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) &&
+                   prev_em_start && *prev_em_start != (u64)-1 &&
+                   *prev_em_start != em->orig_start)
+                       force_bio_submit = true;
+
+               if (prev_em_start)
+                       *prev_em_start = em->orig_start;
+
                free_extent_map(em);
                em = NULL;

@@ -3069,7 +3116,8 @@ static int __do_readpage(struct extent_io_tree *tree,
                                         bdev, bio, pnr,
                                         end_bio_extent_readpage, mirror_num,
                                         *bio_flags,
-                                        this_bio_flag);
+                                        this_bio_flag,
+                                        force_bio_submit);
                        if (!ret) {
                                nr++;
                                *bio_flags = this_bio_flag;
@@ -3101,6 +3149,7 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree,
        struct inode *inode;
        struct btrfs_ordered_extent *ordered;
        int index;
+       u64 prev_em_start = (u64)-1;

        inode = pages[0]->mapping->host;
        while (1) {
@@ -3116,7 +3165,7 @@ static inline void __do_contiguous_readpages(struct extent_io_tree *tree,

        for (index = 0; index < nr_pages; index++) {
                __do_readpage(tree, pages[index], get_extent, em_cached, bio,
-                             mirror_num, bio_flags, rw);
+                             mirror_num, bio_flags, rw, &prev_em_start);
                page_cache_release(pages[index]);
        }
 }
@@ -3184,7 +3233,7 @@ static int __extent_read_full_page(struct extent_io_tree *tree,
        }

        ret = __do_readpage(tree, page, get_extent, NULL, bio, mirror_num,
-                           bio_flags, rw);
+                           bio_flags, rw, NULL);
        return ret;
 }

@@ -3210,7 +3259,7 @@ int extent_read_full_page_nolock(struct extent_io_tree *tree, struct page *page,
        int ret;

        ret = __do_readpage(tree, page, get_extent, NULL, &bio, mirror_num,
-                           &bio_flags, READ);
+                           &bio_flags, READ, NULL);
        if (bio)
                ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
        return ret;
@@ -3463,7 +3512,7 @@ static noinline_for_stack int __extent_writepage_io(struct inode *inode,
                                                 sector, iosize, pg_offset,
                                                 bdev, &epd->bio, max_nr,
                                                 end_bio_extent_writepage,
-                                                0, 0, 0);
+                                                0, 0, 0, false);
                        if (ret)
                                SetPageError(page);
                }
@@ -3765,7 +3814,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
                ret = submit_extent_page(rw, tree, wbc, p, offset >> 9,
                                         PAGE_CACHE_SIZE, 0, bdev, &epd->bio,
                                         -1, end_bio_extent_buffer_writepage,
-                                        0, epd->bio_flags, bio_flags, false);
+                                        0, epd->bio_flags, bio_flags, false);
                epd->bio_flags = bio_flags;
                if (ret) {
                        set_btree_ioerr(p);
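[Editor's note] To make the new control flow easier to follow, below is a small userspace sketch, not kernel code and not part of the commit, that simulates the prev_em_start / force_bio_submit decision added to __do_readpage() above. The struct range type, its field names and the numeric values are illustrative placeholders; only the fact that the two ranges carry extent maps with different orig_start values matters, which is exactly what the patch keys on.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Illustrative stand-in for the few extent map fields the check looks at. */
struct range {
	uint64_t file_off;      /* file offset the range starts at */
	uint64_t em_orig_start; /* em->orig_start of its extent map */
	bool compressed;        /* EXTENT_FLAG_COMPRESSED set on the map */
};

int main(void)
{
	/*
	 * Two consecutive ranges sharing compressed extent X, as in the
	 * commit message. They reference the extent at different offsets,
	 * so their extent maps end up with different orig_start values;
	 * the numbers below are placeholders, only the inequality matters.
	 */
	struct range ranges[] = {
		{ .file_off = 0,        .em_orig_start = 100, .compressed = true },
		{ .file_off = 8 * 1024, .em_orig_start = 200, .compressed = true },
	};
	uint64_t prev_em_start = (uint64_t)-1; /* same sentinel as the patch */

	for (size_t i = 0; i < sizeof(ranges) / sizeof(ranges[0]); i++) {
		bool force_bio_submit = false;

		/* The test the patch adds before calling submit_extent_page(). */
		if (ranges[i].compressed &&
		    prev_em_start != (uint64_t)-1 &&
		    prev_em_start != ranges[i].em_orig_start)
			force_bio_submit = true;

		prev_em_start = ranges[i].em_orig_start;

		printf("range at %llu: force_bio_submit=%s\n",
		       (unsigned long long)ranges[i].file_off,
		       force_bio_submit ? "true" : "false");
	}
	return 0;
}

Run against this layout it prints force_bio_submit=false for the first range and true for the second, i.e. the shared compressed extent is read through two separate bios, which is the behavior the patch enforces.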