aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/extent_io.c
diff options
context:
space:
mode:
authorChris Mason <chris.mason@oracle.com>2011-11-06 03:07:10 -0500
committerChris Mason <chris.mason@oracle.com>2011-11-06 03:07:10 -0500
commit806468f8bf76a3cb2b626dd282946a6c9c0a50f0 (patch)
tree2de54229a5623756417a9bad7f426a2e8b06cad7 /fs/btrfs/extent_io.c
parent531f4b1ae5e0fc8c9b3f03838218e5ea178f80d3 (diff)
parent5da6fcbc4eb50c0f55d520750332f5a6ab13508c (diff)
Merge git://git.jan-o-sch.net/btrfs-unstable into integration
Conflicts: fs/btrfs/Makefile fs/btrfs/extent_io.c fs/btrfs/extent_io.h fs/btrfs/scrub.c Signed-off-by: Chris Mason <chris.mason@oracle.com>
Diffstat (limited to 'fs/btrfs/extent_io.c')
-rw-r--r--fs/btrfs/extent_io.c393
1 files changed, 384 insertions, 9 deletions
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index c12705682c65..1f87c4d0e7a0 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -17,6 +17,7 @@
17#include "compat.h" 17#include "compat.h"
18#include "ctree.h" 18#include "ctree.h"
19#include "btrfs_inode.h" 19#include "btrfs_inode.h"
20#include "volumes.h"
20 21
21static struct kmem_cache *extent_state_cache; 22static struct kmem_cache *extent_state_cache;
22static struct kmem_cache *extent_buffer_cache; 23static struct kmem_cache *extent_buffer_cache;
@@ -1787,6 +1788,368 @@ static int check_page_writeback(struct extent_io_tree *tree,
1787 return 0; 1788 return 0;
1788} 1789}
1789 1790
1791/*
1792 * When IO fails, either with EIO or csum verification fails, we
1793 * try other mirrors that might have a good copy of the data. This
1794 * io_failure_record is used to record state as we go through all the
1795 * mirrors. If another mirror has good data, the page is set up to date
1796 * and things continue. If a good mirror can't be found, the original
1797 * bio end_io callback is called to indicate things have failed.
1798 */
1799struct io_failure_record {
1800 struct page *page;
1801 u64 start;
1802 u64 len;
1803 u64 logical;
1804 unsigned long bio_flags;
1805 int this_mirror;
1806 int failed_mirror;
1807 int in_validation;
1808};
1809
1810static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
1811 int did_repair)
1812{
1813 int ret;
1814 int err = 0;
1815 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1816
1817 set_state_private(failure_tree, rec->start, 0);
1818 ret = clear_extent_bits(failure_tree, rec->start,
1819 rec->start + rec->len - 1,
1820 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1821 if (ret)
1822 err = ret;
1823
1824 if (did_repair) {
1825 ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
1826 rec->start + rec->len - 1,
1827 EXTENT_DAMAGED, GFP_NOFS);
1828 if (ret && !err)
1829 err = ret;
1830 }
1831
1832 kfree(rec);
1833 return err;
1834}
1835
1836static void repair_io_failure_callback(struct bio *bio, int err)
1837{
1838 complete(bio->bi_private);
1839}
1840
1841/*
1842 * this bypasses the standard btrfs submit functions deliberately, as
1843 * the standard behavior is to write all copies in a raid setup. here we only
1844 * want to write the one bad copy. so we do the mapping for ourselves and issue
1845 * submit_bio directly.
1846 * to avoid any synchonization issues, wait for the data after writing, which
1847 * actually prevents the read that triggered the error from finishing.
1848 * currently, there can be no more than two copies of every data bit. thus,
1849 * exactly one rewrite is required.
1850 */
1851int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1852 u64 length, u64 logical, struct page *page,
1853 int mirror_num)
1854{
1855 struct bio *bio;
1856 struct btrfs_device *dev;
1857 DECLARE_COMPLETION_ONSTACK(compl);
1858 u64 map_length = 0;
1859 u64 sector;
1860 struct btrfs_bio *bbio = NULL;
1861 int ret;
1862
1863 BUG_ON(!mirror_num);
1864
1865 bio = bio_alloc(GFP_NOFS, 1);
1866 if (!bio)
1867 return -EIO;
1868 bio->bi_private = &compl;
1869 bio->bi_end_io = repair_io_failure_callback;
1870 bio->bi_size = 0;
1871 map_length = length;
1872
1873 ret = btrfs_map_block(map_tree, WRITE, logical,
1874 &map_length, &bbio, mirror_num);
1875 if (ret) {
1876 bio_put(bio);
1877 return -EIO;
1878 }
1879 BUG_ON(mirror_num != bbio->mirror_num);
1880 sector = bbio->stripes[mirror_num-1].physical >> 9;
1881 bio->bi_sector = sector;
1882 dev = bbio->stripes[mirror_num-1].dev;
1883 kfree(bbio);
1884 if (!dev || !dev->bdev || !dev->writeable) {
1885 bio_put(bio);
1886 return -EIO;
1887 }
1888 bio->bi_bdev = dev->bdev;
1889 bio_add_page(bio, page, length, start-page_offset(page));
1890 submit_bio(WRITE_SYNC, bio);
1891 wait_for_completion(&compl);
1892
1893 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
1894 /* try to remap that extent elsewhere? */
1895 bio_put(bio);
1896 return -EIO;
1897 }
1898
1899 printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s "
1900 "sector %llu)\n", page->mapping->host->i_ino, start,
1901 dev->name, sector);
1902
1903 bio_put(bio);
1904 return 0;
1905}
1906
1907/*
1908 * each time an IO finishes, we do a fast check in the IO failure tree
1909 * to see if we need to process or clean up an io_failure_record
1910 */
1911static int clean_io_failure(u64 start, struct page *page)
1912{
1913 u64 private;
1914 u64 private_failure;
1915 struct io_failure_record *failrec;
1916 struct btrfs_mapping_tree *map_tree;
1917 struct extent_state *state;
1918 int num_copies;
1919 int did_repair = 0;
1920 int ret;
1921 struct inode *inode = page->mapping->host;
1922
1923 private = 0;
1924 ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1925 (u64)-1, 1, EXTENT_DIRTY, 0);
1926 if (!ret)
1927 return 0;
1928
1929 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
1930 &private_failure);
1931 if (ret)
1932 return 0;
1933
1934 failrec = (struct io_failure_record *)(unsigned long) private_failure;
1935 BUG_ON(!failrec->this_mirror);
1936
1937 if (failrec->in_validation) {
1938 /* there was no real error, just free the record */
1939 pr_debug("clean_io_failure: freeing dummy error at %llu\n",
1940 failrec->start);
1941 did_repair = 1;
1942 goto out;
1943 }
1944
1945 spin_lock(&BTRFS_I(inode)->io_tree.lock);
1946 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
1947 failrec->start,
1948 EXTENT_LOCKED);
1949 spin_unlock(&BTRFS_I(inode)->io_tree.lock);
1950
1951 if (state && state->start == failrec->start) {
1952 map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
1953 num_copies = btrfs_num_copies(map_tree, failrec->logical,
1954 failrec->len);
1955 if (num_copies > 1) {
1956 ret = repair_io_failure(map_tree, start, failrec->len,
1957 failrec->logical, page,
1958 failrec->failed_mirror);
1959 did_repair = !ret;
1960 }
1961 }
1962
1963out:
1964 if (!ret)
1965 ret = free_io_failure(inode, failrec, did_repair);
1966
1967 return ret;
1968}
1969
1970/*
1971 * this is a generic handler for readpage errors (default
1972 * readpage_io_failed_hook). if other copies exist, read those and write back
1973 * good data to the failed position. does not investigate in remapping the
1974 * failed extent elsewhere, hoping the device will be smart enough to do this as
1975 * needed
1976 */
1977
1978static int bio_readpage_error(struct bio *failed_bio, struct page *page,
1979 u64 start, u64 end, int failed_mirror,
1980 struct extent_state *state)
1981{
1982 struct io_failure_record *failrec = NULL;
1983 u64 private;
1984 struct extent_map *em;
1985 struct inode *inode = page->mapping->host;
1986 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1987 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
1988 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1989 struct bio *bio;
1990 int num_copies;
1991 int ret;
1992 int read_mode;
1993 u64 logical;
1994
1995 BUG_ON(failed_bio->bi_rw & REQ_WRITE);
1996
1997 ret = get_state_private(failure_tree, start, &private);
1998 if (ret) {
1999 failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2000 if (!failrec)
2001 return -ENOMEM;
2002 failrec->start = start;
2003 failrec->len = end - start + 1;
2004 failrec->this_mirror = 0;
2005 failrec->bio_flags = 0;
2006 failrec->in_validation = 0;
2007
2008 read_lock(&em_tree->lock);
2009 em = lookup_extent_mapping(em_tree, start, failrec->len);
2010 if (!em) {
2011 read_unlock(&em_tree->lock);
2012 kfree(failrec);
2013 return -EIO;
2014 }
2015
2016 if (em->start > start || em->start + em->len < start) {
2017 free_extent_map(em);
2018 em = NULL;
2019 }
2020 read_unlock(&em_tree->lock);
2021
2022 if (!em || IS_ERR(em)) {
2023 kfree(failrec);
2024 return -EIO;
2025 }
2026 logical = start - em->start;
2027 logical = em->block_start + logical;
2028 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2029 logical = em->block_start;
2030 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
2031 extent_set_compress_type(&failrec->bio_flags,
2032 em->compress_type);
2033 }
2034 pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
2035 "len=%llu\n", logical, start, failrec->len);
2036 failrec->logical = logical;
2037 free_extent_map(em);
2038
2039 /* set the bits in the private failure tree */
2040 ret = set_extent_bits(failure_tree, start, end,
2041 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
2042 if (ret >= 0)
2043 ret = set_state_private(failure_tree, start,
2044 (u64)(unsigned long)failrec);
2045 /* set the bits in the inode's tree */
2046 if (ret >= 0)
2047 ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
2048 GFP_NOFS);
2049 if (ret < 0) {
2050 kfree(failrec);
2051 return ret;
2052 }
2053 } else {
2054 failrec = (struct io_failure_record *)(unsigned long)private;
2055 pr_debug("bio_readpage_error: (found) logical=%llu, "
2056 "start=%llu, len=%llu, validation=%d\n",
2057 failrec->logical, failrec->start, failrec->len,
2058 failrec->in_validation);
2059 /*
2060 * when data can be on disk more than twice, add to failrec here
2061 * (e.g. with a list for failed_mirror) to make
2062 * clean_io_failure() clean all those errors at once.
2063 */
2064 }
2065 num_copies = btrfs_num_copies(
2066 &BTRFS_I(inode)->root->fs_info->mapping_tree,
2067 failrec->logical, failrec->len);
2068 if (num_copies == 1) {
2069 /*
2070 * we only have a single copy of the data, so don't bother with
2071 * all the retry and error correction code that follows. no
2072 * matter what the error is, it is very likely to persist.
2073 */
2074 pr_debug("bio_readpage_error: cannot repair, num_copies == 1. "
2075 "state=%p, num_copies=%d, next_mirror %d, "
2076 "failed_mirror %d\n", state, num_copies,
2077 failrec->this_mirror, failed_mirror);
2078 free_io_failure(inode, failrec, 0);
2079 return -EIO;
2080 }
2081
2082 if (!state) {
2083 spin_lock(&tree->lock);
2084 state = find_first_extent_bit_state(tree, failrec->start,
2085 EXTENT_LOCKED);
2086 if (state && state->start != failrec->start)
2087 state = NULL;
2088 spin_unlock(&tree->lock);
2089 }
2090
2091 /*
2092 * there are two premises:
2093 * a) deliver good data to the caller
2094 * b) correct the bad sectors on disk
2095 */
2096 if (failed_bio->bi_vcnt > 1) {
2097 /*
2098 * to fulfill b), we need to know the exact failing sectors, as
2099 * we don't want to rewrite any more than the failed ones. thus,
2100 * we need separate read requests for the failed bio
2101 *
2102 * if the following BUG_ON triggers, our validation request got
2103 * merged. we need separate requests for our algorithm to work.
2104 */
2105 BUG_ON(failrec->in_validation);
2106 failrec->in_validation = 1;
2107 failrec->this_mirror = failed_mirror;
2108 read_mode = READ_SYNC | REQ_FAILFAST_DEV;
2109 } else {
2110 /*
2111 * we're ready to fulfill a) and b) alongside. get a good copy
2112 * of the failed sector and if we succeed, we have setup
2113 * everything for repair_io_failure to do the rest for us.
2114 */
2115 if (failrec->in_validation) {
2116 BUG_ON(failrec->this_mirror != failed_mirror);
2117 failrec->in_validation = 0;
2118 failrec->this_mirror = 0;
2119 }
2120 failrec->failed_mirror = failed_mirror;
2121 failrec->this_mirror++;
2122 if (failrec->this_mirror == failed_mirror)
2123 failrec->this_mirror++;
2124 read_mode = READ_SYNC;
2125 }
2126
2127 if (!state || failrec->this_mirror > num_copies) {
2128 pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, "
2129 "next_mirror %d, failed_mirror %d\n", state,
2130 num_copies, failrec->this_mirror, failed_mirror);
2131 free_io_failure(inode, failrec, 0);
2132 return -EIO;
2133 }
2134
2135 bio = bio_alloc(GFP_NOFS, 1);
2136 bio->bi_private = state;
2137 bio->bi_end_io = failed_bio->bi_end_io;
2138 bio->bi_sector = failrec->logical >> 9;
2139 bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
2140 bio->bi_size = 0;
2141
2142 bio_add_page(bio, page, failrec->len, start - page_offset(page));
2143
2144 pr_debug("bio_readpage_error: submitting new read[%#x] to "
2145 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
2146 failrec->this_mirror, num_copies, failrec->in_validation);
2147
2148 tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror,
2149 failrec->bio_flags, 0);
2150 return 0;
2151}
2152
1790/* lots and lots of room for performance fixes in the end_bio funcs */ 2153/* lots and lots of room for performance fixes in the end_bio funcs */
1791 2154
1792/* 2155/*
@@ -1885,6 +2248,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1885 struct extent_state *cached = NULL; 2248 struct extent_state *cached = NULL;
1886 struct extent_state *state; 2249 struct extent_state *state;
1887 2250
2251 pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, "
2252 "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err,
2253 (long int)bio->bi_bdev);
1888 tree = &BTRFS_I(page->mapping->host)->io_tree; 2254 tree = &BTRFS_I(page->mapping->host)->io_tree;
1889 2255
1890 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 2256 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
@@ -1915,11 +2281,19 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1915 state); 2281 state);
1916 if (ret) 2282 if (ret)
1917 uptodate = 0; 2283 uptodate = 0;
2284 else
2285 clean_io_failure(start, page);
1918 } 2286 }
1919 if (!uptodate && tree->ops && 2287 if (!uptodate) {
1920 tree->ops->readpage_io_failed_hook) { 2288 u64 failed_mirror;
1921 ret = tree->ops->readpage_io_failed_hook(bio, page, 2289 failed_mirror = (u64)bio->bi_bdev;
1922 start, end, state); 2290 if (tree->ops && tree->ops->readpage_io_failed_hook)
2291 ret = tree->ops->readpage_io_failed_hook(
2292 bio, page, start, end,
2293 failed_mirror, state);
2294 else
2295 ret = bio_readpage_error(bio, page, start, end,
2296 failed_mirror, NULL);
1923 if (ret == 0) { 2297 if (ret == 0) {
1924 uptodate = 2298 uptodate =
1925 test_bit(BIO_UPTODATE, &bio->bi_flags); 2299 test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -1999,6 +2373,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1999 mirror_num, bio_flags, start); 2373 mirror_num, bio_flags, start);
2000 else 2374 else
2001 submit_bio(rw, bio); 2375 submit_bio(rw, bio);
2376
2002 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 2377 if (bio_flagged(bio, BIO_EOPNOTSUPP))
2003 ret = -EOPNOTSUPP; 2378 ret = -EOPNOTSUPP;
2004 bio_put(bio); 2379 bio_put(bio);
@@ -2264,16 +2639,16 @@ out:
2264} 2639}
2265 2640
2266int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 2641int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2267 get_extent_t *get_extent) 2642 get_extent_t *get_extent, int mirror_num)
2268{ 2643{
2269 struct bio *bio = NULL; 2644 struct bio *bio = NULL;
2270 unsigned long bio_flags = 0; 2645 unsigned long bio_flags = 0;
2271 int ret; 2646 int ret;
2272 2647
2273 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, 2648 ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
2274 &bio_flags); 2649 &bio_flags);
2275 if (bio) 2650 if (bio)
2276 ret = submit_one_bio(READ, bio, 0, bio_flags); 2651 ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
2277 return ret; 2652 return ret;
2278} 2653}
2279 2654
@@ -3127,7 +3502,7 @@ out:
3127 return ret; 3502 return ret;
3128} 3503}
3129 3504
3130static inline struct page *extent_buffer_page(struct extent_buffer *eb, 3505inline struct page *extent_buffer_page(struct extent_buffer *eb,
3131 unsigned long i) 3506 unsigned long i)
3132{ 3507{
3133 struct page *p; 3508 struct page *p;
@@ -3152,7 +3527,7 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb,
3152 return p; 3527 return p;
3153} 3528}
3154 3529
3155static inline unsigned long num_extent_pages(u64 start, u64 len) 3530inline unsigned long num_extent_pages(u64 start, u64 len)
3156{ 3531{
3157 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - 3532 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
3158 (start >> PAGE_CACHE_SHIFT); 3533 (start >> PAGE_CACHE_SHIFT);