aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
author Chris Mason <chris.mason@oracle.com> 2008-07-24 09:41:53 -0400
committer Chris Mason <chris.mason@oracle.com> 2008-09-25 11:04:05 -0400
commit 89642229a582a5c2b6d2ed8ec16986387d9a9047 (patch)
tree 6ece7a9b7d92159096229e27b4862c3685f198f6
parent 9ba4611a3a7902c6bad70c5c205de5161fcfc17b (diff)
Btrfs: Search data ordered extents first for checksums on read
Checksum items are not inserted into the tree until all of the IO from a given extent is complete. This means one dirty page from an extent may be written, freed, and then read again before the entire extent is on disk and the checksum item is inserted.

The checksums themselves are stored in the ordered extent so they can be inserted in bulk when IO is complete. On read, if a checksum item isn't found, the ordered extents were being searched for a checksum record.

This all worked most of the time, but the checksum insertion code tries to reduce the number of tree operations by pre-inserting checksum items based on i_size and a few other factors. This means the read code might find a checksum item that hasn't yet really been filled in.

This commit changes things to check the ordered extents first and only dive into the btree if nothing was found. This removes the need for extra locking and is more reliable.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
-rw-r--r-- fs/btrfs/disk-io.c 9
-rw-r--r-- fs/btrfs/extent_io.c 8
-rw-r--r-- fs/btrfs/inode.c 33
-rw-r--r-- fs/btrfs/ordered-data.c 1
4 files changed, 31 insertions, 20 deletions
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 86e84a8579e3..7ce3f83c5dd6 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1011,9 +1011,16 @@ void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page)
1011 spin_lock(&em_tree->lock); 1011 spin_lock(&em_tree->lock);
1012 em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE); 1012 em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE);
1013 spin_unlock(&em_tree->lock); 1013 spin_unlock(&em_tree->lock);
1014 if (!em) 1014 if (!em) {
1015 __unplug_io_fn(bdi, page);
1015 return; 1016 return;
1017 }
1016 1018
1019 if (em->block_start >= EXTENT_MAP_LAST_BYTE) {
1020 free_extent_map(em);
1021 __unplug_io_fn(bdi, page);
1022 return;
1023 }
1017 offset = offset - em->start; 1024 offset = offset - em->start;
1018 btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree, 1025 btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree,
1019 em->block_start + offset, page); 1026 em->block_start + offset, page);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 9965993748d0..e3547a992d5c 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1949,18 +1949,18 @@ printk("2bad mapping end %Lu cur %Lu\n", end, cur);
1949 cur + iosize - 1); 1949 cur + iosize - 1);
1950 } 1950 }
1951 if (!ret) { 1951 if (!ret) {
1952 unsigned long nr = (last_byte >> PAGE_CACHE_SHIFT) + 1; 1952 unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1;
1953 nr -= page->index; 1953 pnr -= page->index;
1954 ret = submit_extent_page(READ, tree, page, 1954 ret = submit_extent_page(READ, tree, page,
1955 sector, iosize, page_offset, 1955 sector, iosize, page_offset,
1956 bdev, bio, nr, 1956 bdev, bio, pnr,
1957 end_bio_extent_readpage, mirror_num); 1957 end_bio_extent_readpage, mirror_num);
1958 nr++;
1958 } 1959 }
1959 if (ret) 1960 if (ret)
1960 SetPageError(page); 1961 SetPageError(page);
1961 cur = cur + iosize; 1962 cur = cur + iosize;
1962 page_offset += iosize; 1963 page_offset += iosize;
1963 nr++;
1964 } 1964 }
1965 if (!nr) { 1965 if (!nr) {
1966 if (!PageError(page)) 1966 if (!PageError(page))
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 28e667052ec3..0e90315ea803 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -611,22 +611,25 @@ int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
611 btrfs_test_flag(inode, NODATASUM)) 611 btrfs_test_flag(inode, NODATASUM))
612 return 0; 612 return 0;
613 613
614 /*
615 * It is possible there is an ordered extent that has
616 * not yet finished for this range in the file. If so,
617 * that extent will have a csum cached, and it will insert
618 * the sum after all the blocks in the extent are fully
619 * on disk. So, look for an ordered extent and use the
620 * sum if found. We have to do this before looking in the
621 * btree because csum items are pre-inserted based on
622 * the file size. btrfs_lookup_csum might find an item
623 * that still hasn't been fully filled.
624 */
625 ret = btrfs_find_ordered_sum(inode, start, &csum);
626 if (ret == 0)
627 goto found;
628
629 ret = 0;
614 path = btrfs_alloc_path(); 630 path = btrfs_alloc_path();
615 mutex_lock(&BTRFS_I(inode)->csum_mutex);
616 item = btrfs_lookup_csum(NULL, root, path, inode->i_ino, start, 0); 631 item = btrfs_lookup_csum(NULL, root, path, inode->i_ino, start, 0);
617 if (IS_ERR(item)) { 632 if (IS_ERR(item)) {
618 /*
619 * It is possible there is an ordered extent that has
620 * not yet finished for this range in the file. If so,
621 * that extent will have a csum cached, and it will insert
622 * the sum after all the blocks in the extent are fully
623 * on disk. So, look for an ordered extent and use the
624 * sum if found.
625 */
626 ret = btrfs_find_ordered_sum(inode, start, &csum);
627 if (ret == 0)
628 goto found;
629
630 ret = PTR_ERR(item); 633 ret = PTR_ERR(item);
631 /* a csum that isn't present is a preallocated region. */ 634 /* a csum that isn't present is a preallocated region. */
632 if (ret == -ENOENT || ret == -EFBIG) 635 if (ret == -ENOENT || ret == -EFBIG)
@@ -641,7 +644,6 @@ int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
641found: 644found:
642 set_state_private(io_tree, start, csum); 645 set_state_private(io_tree, start, csum);
643out: 646out:
644 mutex_unlock(&BTRFS_I(inode)->csum_mutex);
645 if (path) 647 if (path)
646 btrfs_free_path(path); 648 btrfs_free_path(path);
647 return ret; 649 return ret;
@@ -1375,7 +1377,7 @@ again:
1375 } 1377 }
1376 if (!PageUptodate(page)) { 1378 if (!PageUptodate(page)) {
1377 ret = -EIO; 1379 ret = -EIO;
1378 goto out; 1380 goto out_unlock;
1379 } 1381 }
1380 } 1382 }
1381 wait_on_page_writeback(page); 1383 wait_on_page_writeback(page);
@@ -1406,6 +1408,7 @@ again:
1406 set_page_dirty(page); 1408 set_page_dirty(page);
1407 unlock_extent(io_tree, page_start, page_end, GFP_NOFS); 1409 unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
1408 1410
1411out_unlock:
1409 unlock_page(page); 1412 unlock_page(page);
1410 page_cache_release(page); 1413 page_cache_release(page);
1411out: 1414out:
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index b695f5b29c45..e42fd233e04c 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -557,6 +557,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u32 *sum)
557 } 557 }
558out: 558out:
559 mutex_unlock(&tree->mutex); 559 mutex_unlock(&tree->mutex);
560 btrfs_put_ordered_extent(ordered);
560 return ret; 561 return ret;
561} 562}
562 563