aboutsummaryrefslogtreecommitdiffstats
path: root/fs/btrfs/extent_io.c
diff options
context:
space:
mode:
Diffstat (limited to 'fs/btrfs/extent_io.c')
-rw-r--r--fs/btrfs/extent_io.c614
1 files changed, 595 insertions, 19 deletions
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index d418164a35f..1f87c4d0e7a 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -17,6 +17,7 @@
17#include "compat.h" 17#include "compat.h"
18#include "ctree.h" 18#include "ctree.h"
19#include "btrfs_inode.h" 19#include "btrfs_inode.h"
20#include "volumes.h"
20 21
21static struct kmem_cache *extent_state_cache; 22static struct kmem_cache *extent_state_cache;
22static struct kmem_cache *extent_buffer_cache; 23static struct kmem_cache *extent_buffer_cache;
@@ -894,6 +895,194 @@ search_again:
894 goto again; 895 goto again;
895} 896}
896 897
898/**
899 * convert_extent - convert all bits in a given range from one bit to another
900 * @tree: the io tree to search
901 * @start: the start offset in bytes
902 * @end: the end offset in bytes (inclusive)
903 * @bits: the bits to set in this range
904 * @clear_bits: the bits to clear in this range
905 * @mask: the allocation mask
906 *
907 * This will go through and set bits for the given range. If any states exist
908 * already in this range they are set with the given bit and cleared of the
909 * clear_bits. This is only meant to be used by things that are mergeable, ie
910 * converting from say DELALLOC to DIRTY. This is not meant to be used with
911 * boundary bits like LOCK.
912 */
913int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
914 int bits, int clear_bits, gfp_t mask)
915{
916 struct extent_state *state;
917 struct extent_state *prealloc = NULL;
918 struct rb_node *node;
919 int err = 0;
920 u64 last_start;
921 u64 last_end;
922
923again:
924 if (!prealloc && (mask & __GFP_WAIT)) {
925 prealloc = alloc_extent_state(mask);
926 if (!prealloc)
927 return -ENOMEM;
928 }
929
930 spin_lock(&tree->lock);
931 /*
932 * this search will find all the extents that end after
933 * our range starts.
934 */
935 node = tree_search(tree, start);
936 if (!node) {
937 prealloc = alloc_extent_state_atomic(prealloc);
938 if (!prealloc)
939 return -ENOMEM;
940 err = insert_state(tree, prealloc, start, end, &bits);
941 prealloc = NULL;
942 BUG_ON(err == -EEXIST);
943 goto out;
944 }
945 state = rb_entry(node, struct extent_state, rb_node);
946hit_next:
947 last_start = state->start;
948 last_end = state->end;
949
950 /*
951 * | ---- desired range ---- |
952 * | state |
953 *
954 * Just lock what we found and keep going
955 */
956 if (state->start == start && state->end <= end) {
957 struct rb_node *next_node;
958
959 set_state_bits(tree, state, &bits);
960 clear_state_bit(tree, state, &clear_bits, 0);
961
962 merge_state(tree, state);
963 if (last_end == (u64)-1)
964 goto out;
965
966 start = last_end + 1;
967 next_node = rb_next(&state->rb_node);
968 if (next_node && start < end && prealloc && !need_resched()) {
969 state = rb_entry(next_node, struct extent_state,
970 rb_node);
971 if (state->start == start)
972 goto hit_next;
973 }
974 goto search_again;
975 }
976
977 /*
978 * | ---- desired range ---- |
979 * | state |
980 * or
981 * | ------------- state -------------- |
982 *
983 * We need to split the extent we found, and may flip bits on
984 * second half.
985 *
986 * If the extent we found extends past our
987 * range, we just split and search again. It'll get split
988 * again the next time though.
989 *
990 * If the extent we found is inside our range, we set the
991 * desired bit on it.
992 */
993 if (state->start < start) {
994 prealloc = alloc_extent_state_atomic(prealloc);
995 if (!prealloc)
996 return -ENOMEM;
997 err = split_state(tree, state, prealloc, start);
998 BUG_ON(err == -EEXIST);
999 prealloc = NULL;
1000 if (err)
1001 goto out;
1002 if (state->end <= end) {
1003 set_state_bits(tree, state, &bits);
1004 clear_state_bit(tree, state, &clear_bits, 0);
1005 merge_state(tree, state);
1006 if (last_end == (u64)-1)
1007 goto out;
1008 start = last_end + 1;
1009 }
1010 goto search_again;
1011 }
1012 /*
1013 * | ---- desired range ---- |
1014 * | state | or | state |
1015 *
1016 * There's a hole, we need to insert something in it and
1017 * ignore the extent we found.
1018 */
1019 if (state->start > start) {
1020 u64 this_end;
1021 if (end < last_start)
1022 this_end = end;
1023 else
1024 this_end = last_start - 1;
1025
1026 prealloc = alloc_extent_state_atomic(prealloc);
1027 if (!prealloc)
1028 return -ENOMEM;
1029
1030 /*
1031 * Avoid to free 'prealloc' if it can be merged with
1032 * the later extent.
1033 */
1034 err = insert_state(tree, prealloc, start, this_end,
1035 &bits);
1036 BUG_ON(err == -EEXIST);
1037 if (err) {
1038 free_extent_state(prealloc);
1039 prealloc = NULL;
1040 goto out;
1041 }
1042 prealloc = NULL;
1043 start = this_end + 1;
1044 goto search_again;
1045 }
1046 /*
1047 * | ---- desired range ---- |
1048 * | state |
1049 * We need to split the extent, and set the bit
1050 * on the first half
1051 */
1052 if (state->start <= end && state->end > end) {
1053 prealloc = alloc_extent_state_atomic(prealloc);
1054 if (!prealloc)
1055 return -ENOMEM;
1056
1057 err = split_state(tree, state, prealloc, end + 1);
1058 BUG_ON(err == -EEXIST);
1059
1060 set_state_bits(tree, prealloc, &bits);
1061 clear_state_bit(tree, prealloc, &clear_bits, 0);
1062
1063 merge_state(tree, prealloc);
1064 prealloc = NULL;
1065 goto out;
1066 }
1067
1068 goto search_again;
1069
1070out:
1071 spin_unlock(&tree->lock);
1072 if (prealloc)
1073 free_extent_state(prealloc);
1074
1075 return err;
1076
1077search_again:
1078 if (start > end)
1079 goto out;
1080 spin_unlock(&tree->lock);
1081 if (mask & __GFP_WAIT)
1082 cond_resched();
1083 goto again;
1084}
1085
897/* wrappers around set/clear extent bit */ 1086/* wrappers around set/clear extent bit */
898int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, 1087int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
899 gfp_t mask) 1088 gfp_t mask)
@@ -919,7 +1108,7 @@ int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
919 struct extent_state **cached_state, gfp_t mask) 1108 struct extent_state **cached_state, gfp_t mask)
920{ 1109{
921 return set_extent_bit(tree, start, end, 1110 return set_extent_bit(tree, start, end,
922 EXTENT_DELALLOC | EXTENT_DIRTY | EXTENT_UPTODATE, 1111 EXTENT_DELALLOC | EXTENT_UPTODATE,
923 0, NULL, cached_state, mask); 1112 0, NULL, cached_state, mask);
924} 1113}
925 1114
@@ -1599,6 +1788,368 @@ static int check_page_writeback(struct extent_io_tree *tree,
1599 return 0; 1788 return 0;
1600} 1789}
1601 1790
1791/*
1792 * When IO fails, either with EIO or csum verification fails, we
1793 * try other mirrors that might have a good copy of the data. This
1794 * io_failure_record is used to record state as we go through all the
1795 * mirrors. If another mirror has good data, the page is set up to date
1796 * and things continue. If a good mirror can't be found, the original
1797 * bio end_io callback is called to indicate things have failed.
1798 */
1799struct io_failure_record {
1800 struct page *page;
1801 u64 start;
1802 u64 len;
1803 u64 logical;
1804 unsigned long bio_flags;
1805 int this_mirror;
1806 int failed_mirror;
1807 int in_validation;
1808};
1809
1810static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
1811 int did_repair)
1812{
1813 int ret;
1814 int err = 0;
1815 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1816
1817 set_state_private(failure_tree, rec->start, 0);
1818 ret = clear_extent_bits(failure_tree, rec->start,
1819 rec->start + rec->len - 1,
1820 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
1821 if (ret)
1822 err = ret;
1823
1824 if (did_repair) {
1825 ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
1826 rec->start + rec->len - 1,
1827 EXTENT_DAMAGED, GFP_NOFS);
1828 if (ret && !err)
1829 err = ret;
1830 }
1831
1832 kfree(rec);
1833 return err;
1834}
1835
1836static void repair_io_failure_callback(struct bio *bio, int err)
1837{
1838 complete(bio->bi_private);
1839}
1840
1841/*
1842 * this bypasses the standard btrfs submit functions deliberately, as
1843 * the standard behavior is to write all copies in a raid setup. here we only
1844 * want to write the one bad copy. so we do the mapping for ourselves and issue
1845 * submit_bio directly.
1846 * to avoid any synchonization issues, wait for the data after writing, which
1847 * actually prevents the read that triggered the error from finishing.
1848 * currently, there can be no more than two copies of every data bit. thus,
1849 * exactly one rewrite is required.
1850 */
1851int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
1852 u64 length, u64 logical, struct page *page,
1853 int mirror_num)
1854{
1855 struct bio *bio;
1856 struct btrfs_device *dev;
1857 DECLARE_COMPLETION_ONSTACK(compl);
1858 u64 map_length = 0;
1859 u64 sector;
1860 struct btrfs_bio *bbio = NULL;
1861 int ret;
1862
1863 BUG_ON(!mirror_num);
1864
1865 bio = bio_alloc(GFP_NOFS, 1);
1866 if (!bio)
1867 return -EIO;
1868 bio->bi_private = &compl;
1869 bio->bi_end_io = repair_io_failure_callback;
1870 bio->bi_size = 0;
1871 map_length = length;
1872
1873 ret = btrfs_map_block(map_tree, WRITE, logical,
1874 &map_length, &bbio, mirror_num);
1875 if (ret) {
1876 bio_put(bio);
1877 return -EIO;
1878 }
1879 BUG_ON(mirror_num != bbio->mirror_num);
1880 sector = bbio->stripes[mirror_num-1].physical >> 9;
1881 bio->bi_sector = sector;
1882 dev = bbio->stripes[mirror_num-1].dev;
1883 kfree(bbio);
1884 if (!dev || !dev->bdev || !dev->writeable) {
1885 bio_put(bio);
1886 return -EIO;
1887 }
1888 bio->bi_bdev = dev->bdev;
1889 bio_add_page(bio, page, length, start-page_offset(page));
1890 submit_bio(WRITE_SYNC, bio);
1891 wait_for_completion(&compl);
1892
1893 if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
1894 /* try to remap that extent elsewhere? */
1895 bio_put(bio);
1896 return -EIO;
1897 }
1898
1899 printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s "
1900 "sector %llu)\n", page->mapping->host->i_ino, start,
1901 dev->name, sector);
1902
1903 bio_put(bio);
1904 return 0;
1905}
1906
1907/*
1908 * each time an IO finishes, we do a fast check in the IO failure tree
1909 * to see if we need to process or clean up an io_failure_record
1910 */
1911static int clean_io_failure(u64 start, struct page *page)
1912{
1913 u64 private;
1914 u64 private_failure;
1915 struct io_failure_record *failrec;
1916 struct btrfs_mapping_tree *map_tree;
1917 struct extent_state *state;
1918 int num_copies;
1919 int did_repair = 0;
1920 int ret;
1921 struct inode *inode = page->mapping->host;
1922
1923 private = 0;
1924 ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
1925 (u64)-1, 1, EXTENT_DIRTY, 0);
1926 if (!ret)
1927 return 0;
1928
1929 ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
1930 &private_failure);
1931 if (ret)
1932 return 0;
1933
1934 failrec = (struct io_failure_record *)(unsigned long) private_failure;
1935 BUG_ON(!failrec->this_mirror);
1936
1937 if (failrec->in_validation) {
1938 /* there was no real error, just free the record */
1939 pr_debug("clean_io_failure: freeing dummy error at %llu\n",
1940 failrec->start);
1941 did_repair = 1;
1942 goto out;
1943 }
1944
1945 spin_lock(&BTRFS_I(inode)->io_tree.lock);
1946 state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
1947 failrec->start,
1948 EXTENT_LOCKED);
1949 spin_unlock(&BTRFS_I(inode)->io_tree.lock);
1950
1951 if (state && state->start == failrec->start) {
1952 map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
1953 num_copies = btrfs_num_copies(map_tree, failrec->logical,
1954 failrec->len);
1955 if (num_copies > 1) {
1956 ret = repair_io_failure(map_tree, start, failrec->len,
1957 failrec->logical, page,
1958 failrec->failed_mirror);
1959 did_repair = !ret;
1960 }
1961 }
1962
1963out:
1964 if (!ret)
1965 ret = free_io_failure(inode, failrec, did_repair);
1966
1967 return ret;
1968}
1969
1970/*
1971 * this is a generic handler for readpage errors (default
1972 * readpage_io_failed_hook). if other copies exist, read those and write back
1973 * good data to the failed position. does not investigate in remapping the
1974 * failed extent elsewhere, hoping the device will be smart enough to do this as
1975 * needed
1976 */
1977
1978static int bio_readpage_error(struct bio *failed_bio, struct page *page,
1979 u64 start, u64 end, int failed_mirror,
1980 struct extent_state *state)
1981{
1982 struct io_failure_record *failrec = NULL;
1983 u64 private;
1984 struct extent_map *em;
1985 struct inode *inode = page->mapping->host;
1986 struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
1987 struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
1988 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1989 struct bio *bio;
1990 int num_copies;
1991 int ret;
1992 int read_mode;
1993 u64 logical;
1994
1995 BUG_ON(failed_bio->bi_rw & REQ_WRITE);
1996
1997 ret = get_state_private(failure_tree, start, &private);
1998 if (ret) {
1999 failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
2000 if (!failrec)
2001 return -ENOMEM;
2002 failrec->start = start;
2003 failrec->len = end - start + 1;
2004 failrec->this_mirror = 0;
2005 failrec->bio_flags = 0;
2006 failrec->in_validation = 0;
2007
2008 read_lock(&em_tree->lock);
2009 em = lookup_extent_mapping(em_tree, start, failrec->len);
2010 if (!em) {
2011 read_unlock(&em_tree->lock);
2012 kfree(failrec);
2013 return -EIO;
2014 }
2015
2016 if (em->start > start || em->start + em->len < start) {
2017 free_extent_map(em);
2018 em = NULL;
2019 }
2020 read_unlock(&em_tree->lock);
2021
2022 if (!em || IS_ERR(em)) {
2023 kfree(failrec);
2024 return -EIO;
2025 }
2026 logical = start - em->start;
2027 logical = em->block_start + logical;
2028 if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
2029 logical = em->block_start;
2030 failrec->bio_flags = EXTENT_BIO_COMPRESSED;
2031 extent_set_compress_type(&failrec->bio_flags,
2032 em->compress_type);
2033 }
2034 pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
2035 "len=%llu\n", logical, start, failrec->len);
2036 failrec->logical = logical;
2037 free_extent_map(em);
2038
2039 /* set the bits in the private failure tree */
2040 ret = set_extent_bits(failure_tree, start, end,
2041 EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
2042 if (ret >= 0)
2043 ret = set_state_private(failure_tree, start,
2044 (u64)(unsigned long)failrec);
2045 /* set the bits in the inode's tree */
2046 if (ret >= 0)
2047 ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
2048 GFP_NOFS);
2049 if (ret < 0) {
2050 kfree(failrec);
2051 return ret;
2052 }
2053 } else {
2054 failrec = (struct io_failure_record *)(unsigned long)private;
2055 pr_debug("bio_readpage_error: (found) logical=%llu, "
2056 "start=%llu, len=%llu, validation=%d\n",
2057 failrec->logical, failrec->start, failrec->len,
2058 failrec->in_validation);
2059 /*
2060 * when data can be on disk more than twice, add to failrec here
2061 * (e.g. with a list for failed_mirror) to make
2062 * clean_io_failure() clean all those errors at once.
2063 */
2064 }
2065 num_copies = btrfs_num_copies(
2066 &BTRFS_I(inode)->root->fs_info->mapping_tree,
2067 failrec->logical, failrec->len);
2068 if (num_copies == 1) {
2069 /*
2070 * we only have a single copy of the data, so don't bother with
2071 * all the retry and error correction code that follows. no
2072 * matter what the error is, it is very likely to persist.
2073 */
2074 pr_debug("bio_readpage_error: cannot repair, num_copies == 1. "
2075 "state=%p, num_copies=%d, next_mirror %d, "
2076 "failed_mirror %d\n", state, num_copies,
2077 failrec->this_mirror, failed_mirror);
2078 free_io_failure(inode, failrec, 0);
2079 return -EIO;
2080 }
2081
2082 if (!state) {
2083 spin_lock(&tree->lock);
2084 state = find_first_extent_bit_state(tree, failrec->start,
2085 EXTENT_LOCKED);
2086 if (state && state->start != failrec->start)
2087 state = NULL;
2088 spin_unlock(&tree->lock);
2089 }
2090
2091 /*
2092 * there are two premises:
2093 * a) deliver good data to the caller
2094 * b) correct the bad sectors on disk
2095 */
2096 if (failed_bio->bi_vcnt > 1) {
2097 /*
2098 * to fulfill b), we need to know the exact failing sectors, as
2099 * we don't want to rewrite any more than the failed ones. thus,
2100 * we need separate read requests for the failed bio
2101 *
2102 * if the following BUG_ON triggers, our validation request got
2103 * merged. we need separate requests for our algorithm to work.
2104 */
2105 BUG_ON(failrec->in_validation);
2106 failrec->in_validation = 1;
2107 failrec->this_mirror = failed_mirror;
2108 read_mode = READ_SYNC | REQ_FAILFAST_DEV;
2109 } else {
2110 /*
2111 * we're ready to fulfill a) and b) alongside. get a good copy
2112 * of the failed sector and if we succeed, we have setup
2113 * everything for repair_io_failure to do the rest for us.
2114 */
2115 if (failrec->in_validation) {
2116 BUG_ON(failrec->this_mirror != failed_mirror);
2117 failrec->in_validation = 0;
2118 failrec->this_mirror = 0;
2119 }
2120 failrec->failed_mirror = failed_mirror;
2121 failrec->this_mirror++;
2122 if (failrec->this_mirror == failed_mirror)
2123 failrec->this_mirror++;
2124 read_mode = READ_SYNC;
2125 }
2126
2127 if (!state || failrec->this_mirror > num_copies) {
2128 pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, "
2129 "next_mirror %d, failed_mirror %d\n", state,
2130 num_copies, failrec->this_mirror, failed_mirror);
2131 free_io_failure(inode, failrec, 0);
2132 return -EIO;
2133 }
2134
2135 bio = bio_alloc(GFP_NOFS, 1);
2136 bio->bi_private = state;
2137 bio->bi_end_io = failed_bio->bi_end_io;
2138 bio->bi_sector = failrec->logical >> 9;
2139 bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
2140 bio->bi_size = 0;
2141
2142 bio_add_page(bio, page, failrec->len, start - page_offset(page));
2143
2144 pr_debug("bio_readpage_error: submitting new read[%#x] to "
2145 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
2146 failrec->this_mirror, num_copies, failrec->in_validation);
2147
2148 tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror,
2149 failrec->bio_flags, 0);
2150 return 0;
2151}
2152
1602/* lots and lots of room for performance fixes in the end_bio funcs */ 2153/* lots and lots of room for performance fixes in the end_bio funcs */
1603 2154
1604/* 2155/*
@@ -1697,6 +2248,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1697 struct extent_state *cached = NULL; 2248 struct extent_state *cached = NULL;
1698 struct extent_state *state; 2249 struct extent_state *state;
1699 2250
2251 pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, "
2252 "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err,
2253 (long int)bio->bi_bdev);
1700 tree = &BTRFS_I(page->mapping->host)->io_tree; 2254 tree = &BTRFS_I(page->mapping->host)->io_tree;
1701 2255
1702 start = ((u64)page->index << PAGE_CACHE_SHIFT) + 2256 start = ((u64)page->index << PAGE_CACHE_SHIFT) +
@@ -1727,11 +2281,19 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
1727 state); 2281 state);
1728 if (ret) 2282 if (ret)
1729 uptodate = 0; 2283 uptodate = 0;
2284 else
2285 clean_io_failure(start, page);
1730 } 2286 }
1731 if (!uptodate && tree->ops && 2287 if (!uptodate) {
1732 tree->ops->readpage_io_failed_hook) { 2288 u64 failed_mirror;
1733 ret = tree->ops->readpage_io_failed_hook(bio, page, 2289 failed_mirror = (u64)bio->bi_bdev;
1734 start, end, NULL); 2290 if (tree->ops && tree->ops->readpage_io_failed_hook)
2291 ret = tree->ops->readpage_io_failed_hook(
2292 bio, page, start, end,
2293 failed_mirror, state);
2294 else
2295 ret = bio_readpage_error(bio, page, start, end,
2296 failed_mirror, NULL);
1735 if (ret == 0) { 2297 if (ret == 0) {
1736 uptodate = 2298 uptodate =
1737 test_bit(BIO_UPTODATE, &bio->bi_flags); 2299 test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -1811,6 +2373,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
1811 mirror_num, bio_flags, start); 2373 mirror_num, bio_flags, start);
1812 else 2374 else
1813 submit_bio(rw, bio); 2375 submit_bio(rw, bio);
2376
1814 if (bio_flagged(bio, BIO_EOPNOTSUPP)) 2377 if (bio_flagged(bio, BIO_EOPNOTSUPP))
1815 ret = -EOPNOTSUPP; 2378 ret = -EOPNOTSUPP;
1816 bio_put(bio); 2379 bio_put(bio);
@@ -2076,16 +2639,16 @@ out:
2076} 2639}
2077 2640
2078int extent_read_full_page(struct extent_io_tree *tree, struct page *page, 2641int extent_read_full_page(struct extent_io_tree *tree, struct page *page,
2079 get_extent_t *get_extent) 2642 get_extent_t *get_extent, int mirror_num)
2080{ 2643{
2081 struct bio *bio = NULL; 2644 struct bio *bio = NULL;
2082 unsigned long bio_flags = 0; 2645 unsigned long bio_flags = 0;
2083 int ret; 2646 int ret;
2084 2647
2085 ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, 2648 ret = __extent_read_full_page(tree, page, get_extent, &bio, mirror_num,
2086 &bio_flags); 2649 &bio_flags);
2087 if (bio) 2650 if (bio)
2088 ret = submit_one_bio(READ, bio, 0, bio_flags); 2651 ret = submit_one_bio(READ, bio, mirror_num, bio_flags);
2089 return ret; 2652 return ret;
2090} 2653}
2091 2654
@@ -2136,6 +2699,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2136 int compressed; 2699 int compressed;
2137 int write_flags; 2700 int write_flags;
2138 unsigned long nr_written = 0; 2701 unsigned long nr_written = 0;
2702 bool fill_delalloc = true;
2139 2703
2140 if (wbc->sync_mode == WB_SYNC_ALL) 2704 if (wbc->sync_mode == WB_SYNC_ALL)
2141 write_flags = WRITE_SYNC; 2705 write_flags = WRITE_SYNC;
@@ -2145,6 +2709,9 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2145 trace___extent_writepage(page, inode, wbc); 2709 trace___extent_writepage(page, inode, wbc);
2146 2710
2147 WARN_ON(!PageLocked(page)); 2711 WARN_ON(!PageLocked(page));
2712
2713 ClearPageError(page);
2714
2148 pg_offset = i_size & (PAGE_CACHE_SIZE - 1); 2715 pg_offset = i_size & (PAGE_CACHE_SIZE - 1);
2149 if (page->index > end_index || 2716 if (page->index > end_index ||
2150 (page->index == end_index && !pg_offset)) { 2717 (page->index == end_index && !pg_offset)) {
@@ -2166,10 +2733,13 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
2166 2733
2167 set_page_extent_mapped(page); 2734 set_page_extent_mapped(page);
2168 2735
2736 if (!tree->ops || !tree->ops->fill_delalloc)
2737 fill_delalloc = false;
2738
2169 delalloc_start = start; 2739 delalloc_start = start;
2170 delalloc_end = 0; 2740 delalloc_end = 0;
2171 page_started = 0; 2741 page_started = 0;
2172 if (!epd->extent_locked) { 2742 if (!epd->extent_locked && fill_delalloc) {
2173 u64 delalloc_to_write = 0; 2743 u64 delalloc_to_write = 0;
2174 /* 2744 /*
2175 * make sure the wbc mapping index is at least updated 2745 * make sure the wbc mapping index is at least updated
@@ -2421,10 +2991,16 @@ retry:
2421 * swizzled back from swapper_space to tmpfs file 2991 * swizzled back from swapper_space to tmpfs file
2422 * mapping 2992 * mapping
2423 */ 2993 */
2424 if (tree->ops && tree->ops->write_cache_pages_lock_hook) 2994 if (tree->ops &&
2425 tree->ops->write_cache_pages_lock_hook(page); 2995 tree->ops->write_cache_pages_lock_hook) {
2426 else 2996 tree->ops->write_cache_pages_lock_hook(page,
2427 lock_page(page); 2997 data, flush_fn);
2998 } else {
2999 if (!trylock_page(page)) {
3000 flush_fn(data);
3001 lock_page(page);
3002 }
3003 }
2428 3004
2429 if (unlikely(page->mapping != mapping)) { 3005 if (unlikely(page->mapping != mapping)) {
2430 unlock_page(page); 3006 unlock_page(page);
@@ -2926,7 +3502,7 @@ out:
2926 return ret; 3502 return ret;
2927} 3503}
2928 3504
2929static inline struct page *extent_buffer_page(struct extent_buffer *eb, 3505inline struct page *extent_buffer_page(struct extent_buffer *eb,
2930 unsigned long i) 3506 unsigned long i)
2931{ 3507{
2932 struct page *p; 3508 struct page *p;
@@ -2951,7 +3527,7 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb,
2951 return p; 3527 return p;
2952} 3528}
2953 3529
2954static inline unsigned long num_extent_pages(u64 start, u64 len) 3530inline unsigned long num_extent_pages(u64 start, u64 len)
2955{ 3531{
2956 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - 3532 return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
2957 (start >> PAGE_CACHE_SHIFT); 3533 (start >> PAGE_CACHE_SHIFT);
@@ -3204,6 +3780,7 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
3204 PAGECACHE_TAG_DIRTY); 3780 PAGECACHE_TAG_DIRTY);
3205 } 3781 }
3206 spin_unlock_irq(&page->mapping->tree_lock); 3782 spin_unlock_irq(&page->mapping->tree_lock);
3783 ClearPageError(page);
3207 unlock_page(page); 3784 unlock_page(page);
3208 } 3785 }
3209 return 0; 3786 return 0;
@@ -3349,8 +3926,7 @@ int extent_buffer_uptodate(struct extent_io_tree *tree,
3349} 3926}
3350 3927
3351int read_extent_buffer_pages(struct extent_io_tree *tree, 3928int read_extent_buffer_pages(struct extent_io_tree *tree,
3352 struct extent_buffer *eb, 3929 struct extent_buffer *eb, u64 start, int wait,
3353 u64 start, int wait,
3354 get_extent_t *get_extent, int mirror_num) 3930 get_extent_t *get_extent, int mirror_num)
3355{ 3931{
3356 unsigned long i; 3932 unsigned long i;
@@ -3386,7 +3962,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3386 num_pages = num_extent_pages(eb->start, eb->len); 3962 num_pages = num_extent_pages(eb->start, eb->len);
3387 for (i = start_i; i < num_pages; i++) { 3963 for (i = start_i; i < num_pages; i++) {
3388 page = extent_buffer_page(eb, i); 3964 page = extent_buffer_page(eb, i);
3389 if (!wait) { 3965 if (wait == WAIT_NONE) {
3390 if (!trylock_page(page)) 3966 if (!trylock_page(page))
3391 goto unlock_exit; 3967 goto unlock_exit;
3392 } else { 3968 } else {
@@ -3430,7 +4006,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree,
3430 if (bio) 4006 if (bio)
3431 submit_one_bio(READ, bio, mirror_num, bio_flags); 4007 submit_one_bio(READ, bio, mirror_num, bio_flags);
3432 4008
3433 if (ret || !wait) 4009 if (ret || wait != WAIT_COMPLETE)
3434 return ret; 4010 return ret;
3435 4011
3436 for (i = start_i; i < num_pages; i++) { 4012 for (i = start_i; i < num_pages; i++) {