Diffstat (limited to 'fs/btrfs/extent_io.c')
-rw-r--r--  fs/btrfs/extent_io.c | 614
1 file changed, 595 insertions(+), 19 deletions(-)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d418164a35f..1f87c4d0e7a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -17,6 +17,7 @@
 #include "compat.h"
 #include "ctree.h"
 #include "btrfs_inode.h"
+#include "volumes.h"
 
 static struct kmem_cache *extent_state_cache;
 static struct kmem_cache *extent_buffer_cache;
@@ -894,6 +895,194 @@ search_again:
 	goto again;
 }
 
+/**
+ * convert_extent_bit - convert all bits in a given range from one bit to another
+ * @tree:	the io tree to search
+ * @start:	the start offset in bytes
+ * @end:	the end offset in bytes (inclusive)
+ * @bits:	the bits to set in this range
+ * @clear_bits:	the bits to clear in this range
+ * @mask:	the allocation mask
+ *
+ * This will go through and set bits for the given range. If any states exist
+ * already in this range they are set with the given bit and cleared of the
+ * clear_bits. This is only meant to be used by things that are mergeable, i.e.
+ * converting from say DELALLOC to DIRTY. This is not meant to be used with
+ * boundary bits like LOCK.
+ */
+int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+		       int bits, int clear_bits, gfp_t mask)
+{
+	struct extent_state *state;
+	struct extent_state *prealloc = NULL;
+	struct rb_node *node;
+	int err = 0;
+	u64 last_start;
+	u64 last_end;
+
+again:
+	if (!prealloc && (mask & __GFP_WAIT)) {
+		prealloc = alloc_extent_state(mask);
+		if (!prealloc)
+			return -ENOMEM;
+	}
+
+	spin_lock(&tree->lock);
+	/*
+	 * this search will find all the extents that end after
+	 * our range starts.
+	 */
+	node = tree_search(tree, start);
+	if (!node) {
+		prealloc = alloc_extent_state_atomic(prealloc);
+		if (!prealloc)
+			return -ENOMEM;
+		err = insert_state(tree, prealloc, start, end, &bits);
+		prealloc = NULL;
+		BUG_ON(err == -EEXIST);
+		goto out;
+	}
+	state = rb_entry(node, struct extent_state, rb_node);
+hit_next:
+	last_start = state->start;
+	last_end = state->end;
+
+	/*
+	 * | ---- desired range ---- |
+	 * | state |
+	 *
+	 * Just lock what we found and keep going
+	 */
+	if (state->start == start && state->end <= end) {
+		struct rb_node *next_node;
+
+		set_state_bits(tree, state, &bits);
+		clear_state_bit(tree, state, &clear_bits, 0);
+
+		merge_state(tree, state);
+		if (last_end == (u64)-1)
+			goto out;
+
+		start = last_end + 1;
+		next_node = rb_next(&state->rb_node);
+		if (next_node && start < end && prealloc && !need_resched()) {
+			state = rb_entry(next_node, struct extent_state,
+					 rb_node);
+			if (state->start == start)
+				goto hit_next;
+		}
+		goto search_again;
+	}
+
+	/*
+	 * | ---- desired range ---- |
+	 *     | state |
+	 *   or
+	 * | ------------- state -------------- |
+	 *
+	 * We need to split the extent we found, and may flip bits on
+	 * second half.
+	 *
+	 * If the extent we found extends past our
+	 * range, we just split and search again. It'll get split
+	 * again the next time though.
+	 *
+	 * If the extent we found is inside our range, we set the
+	 * desired bit on it.
+	 */
+	if (state->start < start) {
+		prealloc = alloc_extent_state_atomic(prealloc);
+		if (!prealloc)
+			return -ENOMEM;
+		err = split_state(tree, state, prealloc, start);
+		BUG_ON(err == -EEXIST);
+		prealloc = NULL;
+		if (err)
+			goto out;
+		if (state->end <= end) {
+			set_state_bits(tree, state, &bits);
+			clear_state_bit(tree, state, &clear_bits, 0);
+			merge_state(tree, state);
+			if (last_end == (u64)-1)
+				goto out;
+			start = last_end + 1;
+		}
+		goto search_again;
+	}
+	/*
+	 * | ---- desired range ---- |
+	 *     | state | or | state |
+	 *
+	 * There's a hole, we need to insert something in it and
+	 * ignore the extent we found.
+	 */
+	if (state->start > start) {
+		u64 this_end;
+		if (end < last_start)
+			this_end = end;
+		else
+			this_end = last_start - 1;
+
+		prealloc = alloc_extent_state_atomic(prealloc);
+		if (!prealloc)
+			return -ENOMEM;
+
+		/*
+		 * Avoid freeing 'prealloc' if it can be merged with
+		 * the later extent.
+		 */
+		err = insert_state(tree, prealloc, start, this_end,
+				   &bits);
+		BUG_ON(err == -EEXIST);
+		if (err) {
+			free_extent_state(prealloc);
+			prealloc = NULL;
+			goto out;
+		}
+		prealloc = NULL;
+		start = this_end + 1;
+		goto search_again;
+	}
+	/*
+	 * | ---- desired range ---- |
+	 *                       | state |
+	 * We need to split the extent, and set the bit
+	 * on the first half
+	 */
+	if (state->start <= end && state->end > end) {
+		prealloc = alloc_extent_state_atomic(prealloc);
+		if (!prealloc)
+			return -ENOMEM;
+
+		err = split_state(tree, state, prealloc, end + 1);
+		BUG_ON(err == -EEXIST);
+
+		set_state_bits(tree, prealloc, &bits);
+		clear_state_bit(tree, prealloc, &clear_bits, 0);
+
+		merge_state(tree, prealloc);
+		prealloc = NULL;
+		goto out;
+	}
+
+	goto search_again;
+
+out:
+	spin_unlock(&tree->lock);
+	if (prealloc)
+		free_extent_state(prealloc);
+
+	return err;
+
+search_again:
+	if (start > end)
+		goto out;
+	spin_unlock(&tree->lock);
+	if (mask & __GFP_WAIT)
+		cond_resched();
+	goto again;
+}
+
 /* wrappers around set/clear extent bit */
 int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 		     gfp_t mask)
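
convert_extent_bit() lets a caller flip a range from one mergeable bit to another in a single pass under the tree lock, rather than pairing clear_extent_bit() with set_extent_bit() and leaving a window where neither bit is set. A minimal usage sketch, assuming the DELALLOC-to-DIRTY conversion named in the comment above (this call site is hypothetical, not part of the patch):

	/* hypothetical caller: move [start, end] from delalloc to dirty
	 * in one walk of the inode's io_tree */
	int err = convert_extent_bit(&BTRFS_I(inode)->io_tree, start, end,
				     EXTENT_DIRTY, EXTENT_DELALLOC, GFP_NOFS);
	if (err)	/* -ENOMEM if no extent_state could be allocated */
		return err;
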
@@ -919,7 +1108,7 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
 			struct extent_state **cached_state, gfp_t mask)
 {
 	return set_extent_bit(tree, start, end,
-			      EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE,
+			      EXTENT_DELALLOC | EXTENT_UPTODATE,
 			      0, NULL, cached_state, mask);
 }
 
@@ -1599,6 +1788,368 @@ static int check_page_writeback(struct extent_io_tree *tree,
 	return 0;
 }
 
+/*
+ * When IO fails, either with EIO or csum verification fails, we
+ * try other mirrors that might have a good copy of the data. This
+ * io_failure_record is used to record state as we go through all the
+ * mirrors. If another mirror has good data, the page is set up to date
+ * and things continue. If a good mirror can't be found, the original
+ * bio end_io callback is called to indicate things have failed.
+ */
+struct io_failure_record {
+	struct page *page;
+	u64 start;
+	u64 len;
+	u64 logical;
+	unsigned long bio_flags;
+	int this_mirror;
+	int failed_mirror;
+	int in_validation;
+};
+
+static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
+			   int did_repair)
+{
+	int ret;
+	int err = 0;
+	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+
+	set_state_private(failure_tree, rec->start, 0);
+	ret = clear_extent_bits(failure_tree, rec->start,
+				rec->start + rec->len - 1,
+				EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
+	if (ret)
+		err = ret;
+
+	if (did_repair) {
+		ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
+					rec->start + rec->len - 1,
+					EXTENT_DAMAGED, GFP_NOFS);
+		if (ret && !err)
+			err = ret;
+	}
+
+	kfree(rec);
+	return err;
+}
+
+static void repair_io_failure_callback(struct bio *bio, int err)
+{
+	complete(bio->bi_private);
+}
+
+/*
+ * this bypasses the standard btrfs submit functions deliberately, as
+ * the standard behavior is to write all copies in a raid setup. here we only
+ * want to write the one bad copy, so we do the mapping for ourselves and issue
+ * submit_bio directly.
+ * to avoid any synchronization issues, wait for the data after writing, which
+ * actually prevents the read that triggered the error from finishing.
+ * currently, there can be no more than two copies of every data bit. thus,
+ * exactly one rewrite is required.
+ */
+int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
+		      u64 length, u64 logical, struct page *page,
+		      int mirror_num)
+{
+	struct bio *bio;
+	struct btrfs_device *dev;
+	DECLARE_COMPLETION_ONSTACK(compl);
+	u64 map_length = 0;
+	u64 sector;
+	struct btrfs_bio *bbio = NULL;
+	int ret;
+
+	BUG_ON(!mirror_num);
+
+	bio = bio_alloc(GFP_NOFS, 1);
+	if (!bio)
+		return -EIO;
+	bio->bi_private = &compl;
+	bio->bi_end_io = repair_io_failure_callback;
+	bio->bi_size = 0;
+	map_length = length;
+
+	ret = btrfs_map_block(map_tree, WRITE, logical,
+			      &map_length, &bbio, mirror_num);
+	if (ret) {
+		bio_put(bio);
+		return -EIO;
+	}
+	BUG_ON(mirror_num != bbio->mirror_num);
+	sector = bbio->stripes[mirror_num-1].physical >> 9;
+	bio->bi_sector = sector;
+	dev = bbio->stripes[mirror_num-1].dev;
+	kfree(bbio);
+	if (!dev || !dev->bdev || !dev->writeable) {
+		bio_put(bio);
+		return -EIO;
+	}
+	bio->bi_bdev = dev->bdev;
+	bio_add_page(bio, page, length, start - page_offset(page));
+	submit_bio(WRITE_SYNC, bio);
+	wait_for_completion(&compl);
+
+	if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+		/* try to remap that extent elsewhere? */
+		bio_put(bio);
+		return -EIO;
+	}
+
+	printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s "
+	       "sector %llu)\n", page->mapping->host->i_ino, start,
+	       dev->name, sector);
+
+	bio_put(bio);
+	return 0;
+}
+
+/*
+ * each time an IO finishes, we do a fast check in the IO failure tree
+ * to see if we need to process or clean up an io_failure_record
+ */
+static int clean_io_failure(u64 start, struct page *page)
+{
+	u64 private;
+	u64 private_failure;
+	struct io_failure_record *failrec;
+	struct btrfs_mapping_tree *map_tree;
+	struct extent_state *state;
+	int num_copies;
+	int did_repair = 0;
+	int ret;
+	struct inode *inode = page->mapping->host;
+
+	private = 0;
+	ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
+			       (u64)-1, 1, EXTENT_DIRTY, 0);
+	if (!ret)
+		return 0;
+
+	ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
+				&private_failure);
+	if (ret)
+		return 0;
+
+	failrec = (struct io_failure_record *)(unsigned long) private_failure;
+	BUG_ON(!failrec->this_mirror);
+
+	if (failrec->in_validation) {
+		/* there was no real error, just free the record */
+		pr_debug("clean_io_failure: freeing dummy error at %llu\n",
+			 failrec->start);
+		did_repair = 1;
+		goto out;
+	}
+
+	spin_lock(&BTRFS_I(inode)->io_tree.lock);
+	state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
+					    failrec->start,
+					    EXTENT_LOCKED);
+	spin_unlock(&BTRFS_I(inode)->io_tree.lock);
+
+	if (state && state->start == failrec->start) {
+		map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
+		num_copies = btrfs_num_copies(map_tree, failrec->logical,
+					      failrec->len);
+		if (num_copies > 1) {
+			ret = repair_io_failure(map_tree, start, failrec->len,
+						failrec->logical, page,
+						failrec->failed_mirror);
+			did_repair = !ret;
+		}
+	}
+
+out:
+	if (!ret)
+		ret = free_io_failure(inode, failrec, did_repair);
+
+	return ret;
+}
+
+/*
+ * this is a generic handler for readpage errors (default
+ * readpage_io_failed_hook). if other copies exist, read those and write back
+ * good data to the failed position. does not investigate remapping the
+ * failed extent elsewhere, hoping the device will be smart enough to do this
+ * as needed
+ */
+
+static int bio_readpage_error(struct bio *failed_bio, struct page *page,
+			      u64 start, u64 end, int failed_mirror,
+			      struct extent_state *state)
+{
+	struct io_failure_record *failrec = NULL;
+	u64 private;
+	struct extent_map *em;
+	struct inode *inode = page->mapping->host;
+	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct bio *bio;
+	int num_copies;
+	int ret;
+	int read_mode;
+	u64 logical;
+
+	BUG_ON(failed_bio->bi_rw & REQ_WRITE);
+
+	ret = get_state_private(failure_tree, start, &private);
+	if (ret) {
+		failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
+		if (!failrec)
+			return -ENOMEM;
+		failrec->start = start;
+		failrec->len = end - start + 1;
+		failrec->this_mirror = 0;
+		failrec->bio_flags = 0;
+		failrec->in_validation = 0;
+
+		read_lock(&em_tree->lock);
+		em = lookup_extent_mapping(em_tree, start, failrec->len);
+		if (!em) {
+			read_unlock(&em_tree->lock);
+			kfree(failrec);
+			return -EIO;
+		}
+
+		if (em->start > start || em->start + em->len < start) {
+			free_extent_map(em);
+			em = NULL;
+		}
+		read_unlock(&em_tree->lock);
+
+		if (!em || IS_ERR(em)) {
+			kfree(failrec);
+			return -EIO;
+		}
+		logical = start - em->start;
+		logical = em->block_start + logical;
+		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+			logical = em->block_start;
+			failrec->bio_flags = EXTENT_BIO_COMPRESSED;
+			extent_set_compress_type(&failrec->bio_flags,
+						 em->compress_type);
+		}
+		pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
+			 "len=%llu\n", logical, start, failrec->len);
+		failrec->logical = logical;
+		free_extent_map(em);
+
+		/* set the bits in the private failure tree */
+		ret = set_extent_bits(failure_tree, start, end,
+				      EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
+		if (ret >= 0)
+			ret = set_state_private(failure_tree, start,
+						(u64)(unsigned long)failrec);
+		/* set the bits in the inode's tree */
+		if (ret >= 0)
+			ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
+					      GFP_NOFS);
+		if (ret < 0) {
+			kfree(failrec);
+			return ret;
+		}
+	} else {
+		failrec = (struct io_failure_record *)(unsigned long)private;
+		pr_debug("bio_readpage_error: (found) logical=%llu, "
+			 "start=%llu, len=%llu, validation=%d\n",
+			 failrec->logical, failrec->start, failrec->len,
+			 failrec->in_validation);
+		/*
+		 * when data can be on disk more than twice, add to failrec here
+		 * (e.g. with a list for failed_mirror) to make
+		 * clean_io_failure() clean all those errors at once.
+		 */
+	}
+	num_copies = btrfs_num_copies(
+			      &BTRFS_I(inode)->root->fs_info->mapping_tree,
+			      failrec->logical, failrec->len);
+	if (num_copies == 1) {
+		/*
+		 * we only have a single copy of the data, so don't bother with
+		 * all the retry and error correction code that follows. no
+		 * matter what the error is, it is very likely to persist.
+		 */
+		pr_debug("bio_readpage_error: cannot repair, num_copies == 1. "
+			 "state=%p, num_copies=%d, next_mirror %d, "
+			 "failed_mirror %d\n", state, num_copies,
+			 failrec->this_mirror, failed_mirror);
+		free_io_failure(inode, failrec, 0);
+		return -EIO;
+	}
+
+	if (!state) {
+		spin_lock(&tree->lock);
+		state = find_first_extent_bit_state(tree, failrec->start,
+						    EXTENT_LOCKED);
+		if (state && state->start != failrec->start)
+			state = NULL;
+		spin_unlock(&tree->lock);
+	}
+
+	/*
+	 * there are two premises:
+	 * a) deliver good data to the caller
+	 * b) correct the bad sectors on disk
+	 */
+	if (failed_bio->bi_vcnt > 1) {
+		/*
+		 * to fulfill b), we need to know the exact failing sectors, as
+		 * we don't want to rewrite any more than the failed ones. thus,
+		 * we need separate read requests for the failed bio
+		 *
+		 * if the following BUG_ON triggers, our validation request got
+		 * merged. we need separate requests for our algorithm to work.
+		 */
+		BUG_ON(failrec->in_validation);
+		failrec->in_validation = 1;
+		failrec->this_mirror = failed_mirror;
+		read_mode = READ_SYNC | REQ_FAILFAST_DEV;
+	} else {
+		/*
+		 * we're ready to fulfill a) and b) alongside. get a good copy
+		 * of the failed sector and if we succeed, we have setup
+		 * everything for repair_io_failure to do the rest for us.
+		 */
+		if (failrec->in_validation) {
+			BUG_ON(failrec->this_mirror != failed_mirror);
+			failrec->in_validation = 0;
+			failrec->this_mirror = 0;
+		}
+		failrec->failed_mirror = failed_mirror;
+		failrec->this_mirror++;
+		if (failrec->this_mirror == failed_mirror)
+			failrec->this_mirror++;
+		read_mode = READ_SYNC;
+	}
+
+	if (!state || failrec->this_mirror > num_copies) {
+		pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, "
+			 "next_mirror %d, failed_mirror %d\n", state,
+			 num_copies, failrec->this_mirror, failed_mirror);
+		free_io_failure(inode, failrec, 0);
+		return -EIO;
+	}
+
+	bio = bio_alloc(GFP_NOFS, 1);
+	bio->bi_private = state;
+	bio->bi_end_io = failed_bio->bi_end_io;
+	bio->bi_sector = failrec->logical >> 9;
+	bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+	bio->bi_size = 0;
+
+	bio_add_page(bio, page, failrec->len, start - page_offset(page));
+
+	pr_debug("bio_readpage_error: submitting new read[%#x] to "
+		 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
+		 failrec->this_mirror, num_copies, failrec->in_validation);
+
+	tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror,
+				   failrec->bio_flags, 0);
+	return 0;
+}
+
 /* lots and lots of room for performance fixes in the end_bio funcs */
 
 /*
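
The mirror bookkeeping above is spread across bio_readpage_error() and clean_io_failure(), but the core retry rule is compact. A simplified standalone model (a sketch only — mirrors are numbered from 1, num_copies comes from btrfs_num_copies(), and the real code additionally tracks the in_validation state):

	/* return the next mirror to read from, skipping the copy that
	 * failed; -1 means every copy has been tried */
	static int next_mirror(int this_mirror, int failed_mirror,
			       int num_copies)
	{
		this_mirror++;
		if (this_mirror == failed_mirror)
			this_mirror++;	/* skip the known-bad copy */
		return this_mirror > num_copies ? -1 : this_mirror;
	}
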
@@ -1697,6 +2248,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 		struct extent_state *cached = NULL;
 		struct extent_state *state;
 
+		pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, "
+			 "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err,
+			 (long int)bio->bi_bdev);
 		tree = &BTRFS_I(page->mapping->host)->io_tree;
 
 		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
@@ -1727,11 +2281,19 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 							      state);
 			if (ret)
 				uptodate = 0;
+			else
+				clean_io_failure(start, page);
 		}
-		if (!uptodate && tree->ops &&
-		    tree->ops->readpage_io_failed_hook) {
-			ret = tree->ops->readpage_io_failed_hook(bio, page,
-							 start, end, NULL);
+		if (!uptodate) {
+			u64 failed_mirror;
+			failed_mirror = (u64)bio->bi_bdev;
+			if (tree->ops && tree->ops->readpage_io_failed_hook)
+				ret = tree->ops->readpage_io_failed_hook(
+						bio, page, start, end,
+						failed_mirror, state);
+			else
+				ret = bio_readpage_error(bio, page, start, end,
+							 failed_mirror, NULL);
 			if (ret == 0) {
 				uptodate =
 					test_bit(BIO_UPTODATE, &bio->bi_flags);
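
Note where failed_mirror comes from: by completion time the bio's bi_bdev pointer is no longer needed, so the btrfs submission path reuses it to carry the mirror number, and the end_io handler casts it back — the pr_debug in the previous hunk prints the same value. A sketch of the two halves (the store happens in the btrfs submission code outside this diff, so that half is an assumption for illustration):

	/* submit side (elsewhere in btrfs, assumed): tag the completed
	 * bio with the mirror that was actually read */
	bio->bi_bdev = (struct block_device *)(unsigned long)mirror_num;

	/* completion side (this hunk): recover the mirror number */
	u64 failed_mirror = (u64)bio->bi_bdev;
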
@@ -1811,6 +2373,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
 					   mirror_num, bio_flags, start);
 	else
 		submit_bio(rw, bio);
+
 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
 		ret = -EOPNOTSUPP;
 	bio_put(bio);
@@ -2076,16 +2639,16 @@ out:
 }
 
 int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
-			  get_extent_t *get_extent)
+			  get_extent_t *get_extent, int mirror_num)
 {
 	struct bio *bio = NULL;
 	unsigned long bio_flags = 0;
 	int ret;
 
-	ret = __extent_read_full_page(tree, page, get_extent, &bio, 0,
+	ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
 				      &bio_flags);
 	if (bio)
-		ret = submit_one_bio(READ, bio, 0, bio_flags);
+		ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
 	return ret;
 }
 
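With mirror_num threaded through, ordinary readpage callers keep passing 0, meaning "any mirror", while the repair path can direct a read at a specific copy. A hypothetical call (btrfs_get_extent is the usual get_extent callback; this exact call site is illustrative, not part of the patch):

	/* normal read: let the chunk layer pick whichever mirror it likes */
	ret = extent_read_full_page(tree, page, btrfs_get_extent, 0);
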
@@ -2136,6 +2699,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	int compressed;
 	int write_flags;
 	unsigned long nr_written = 0;
+	bool fill_delalloc = true;
 
 	if (wbc->sync_mode == WB_SYNC_ALL)
 		write_flags = WRITE_SYNC;
@@ -2145,6 +2709,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 	trace___extent_writepage(page, inode, wbc);
 
 	WARN_ON(!PageLocked(page));
+
+	ClearPageError(page);
+
 	pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
 	if (page->index > end_index ||
 	    (page->index == end_index && !pg_offset)) {
@@ -2166,10 +2733,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
 
 	set_page_extent_mapped(page);
 
+	if (!tree->ops || !tree->ops->fill_delalloc)
+		fill_delalloc = false;
+
 	delalloc_start = start;
 	delalloc_end = 0;
 	page_started = 0;
-	if (!epd->extent_locked) {
+	if (!epd->extent_locked && fill_delalloc) {
 		u64 delalloc_to_write = 0;
 		/*
 		 * make sure the wbc mapping index is at least updated
@@ -2421,10 +2991,16 @@ retry:
 			 * swizzled back from swapper_space to tmpfs file
 			 * mapping
 			 */
-			if (tree->ops && tree->ops->write_cache_pages_lock_hook)
-				tree->ops->write_cache_pages_lock_hook(page);
-			else
-				lock_page(page);
+			if (tree->ops &&
+			    tree->ops->write_cache_pages_lock_hook) {
+				tree->ops->write_cache_pages_lock_hook(page,
+							       data, flush_fn);
+			} else {
+				if (!trylock_page(page)) {
+					flush_fn(data);
+					lock_page(page);
+				}
+			}
 
 			if (unlikely(page->mapping != mapping)) {
 				unlock_page(page);
@@ -2926,7 +3502,7 @@ out:
 	return ret;
 }
 
-static inline struct page *extent_buffer_page(struct extent_buffer *eb,
+inline struct page *extent_buffer_page(struct extent_buffer *eb,
 					      unsigned long i)
 {
 	struct page *p;
@@ -2951,7 +3527,7 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb,
 	return p;
 }
 
-static inline unsigned long num_extent_pages(u64 start, u64 len)
+inline unsigned long num_extent_pages(u64 start, u64 len)
 {
 	return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
 		(start >> PAGE_CACHE_SHIFT);
@@ -3204,6 +3780,7 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
 						PAGECACHE_TAG_DIRTY);
 		}
 		spin_unlock_irq(&page->mapping->tree_lock);
+		ClearPageError(page);
 		unlock_page(page);
 	}
 	return 0;
@@ -3349,8 +3926,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
 }
 
 int read_extent_buffer_pages(struct extent_io_tree *tree,
-			     struct extent_buffer *eb,
-			     u64 start, int wait,
+			     struct extent_buffer *eb, u64 start, int wait,
 			     get_extent_t *get_extent, int mirror_num)
 {
 	unsigned long i;
@@ -3386,7 +3962,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	num_pages = num_extent_pages(eb->start, eb->len);
 	for (i = start_i; i < num_pages; i++) {
 		page = extent_buffer_page(eb, i);
-		if (!wait) {
+		if (wait == WAIT_NONE) {
 			if (!trylock_page(page))
 				goto unlock_exit;
 		} else {
@@ -3430,7 +4006,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
 	if (bio)
 		submit_one_bio(READ, bio, mirror_num, bio_flags);
 
-	if (ret || !wait)
+	if (ret || wait != WAIT_COMPLETE)
 		return ret;
 
 	for (i = start_i; i < num_pages; i++) {
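
The wait argument is now compared against named constants instead of being treated as a boolean, so callers can distinguish "don't block" from "block until the read completes". The definitions live in extent_io.h, outside this diff; a plausible form consistent with the comparisons above:

	#define WAIT_NONE	0
	#define WAIT_COMPLETE	1
	#define WAIT_PAGE_LOCK	2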