author    Jan Schmidt <list.btrfs@jan-o-sch.net>  2011-07-22 09:41:52 -0400
committer Jan Schmidt <list.btrfs@jan-o-sch.net>  2011-09-29 07:38:42 -0400
commit    4a54c8c165b66300830a67349fc7595e3fc442f7 (patch)
tree      fea3a5e86c2ee9304711e18b2557912166d6d52f /fs
parent    2774b2ca3d49124bf1ae89e8d575b3dab4221266 (diff)
btrfs: Moved repair code from inode.c to extent_io.c
The raid-retry code in inode.c can be generalized so that it works for
metadata as well. Thus, this patch moves it to extent_io.c and turns the
raid-retry code into raid-repair code.

Repair works as follows: whenever a read error occurs and we have another
mirror to try, note the failed mirror and retry the next one. If we find a
good copy, check whether we noted a failure earlier; if so, do not allow the
read to complete until after the bad sector has been rewritten with the good
data we just fetched. As we have the extent locked while reading, no one can
change the data in between.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
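For illustration only, here is a minimal user-space sketch of that
retry-and-repair loop. Everything in it (the two-mirror array, the toy
checksum, all names) is invented for the sketch; the kernel code in the diff
below is the actual implementation. The key property carried over from the
patch is that the bad copy is rewritten before the read is reported complete.

#include <stdio.h>
#include <string.h>

#define NUM_MIRRORS 2
#define SECTOR_SIZE 8

/* Toy mirrors: mirror 1 holds a corrupt copy, mirror 2 a good one. */
static char mirrors[NUM_MIRRORS][SECTOR_SIZE] = { "BADBAD!", "GOODSEC" };

/* Stand-in for a checksummed read: only "GOODSEC" verifies. */
static int read_mirror(int mirror_num, char *buf)
{
	memcpy(buf, mirrors[mirror_num - 1], SECTOR_SIZE);
	return strcmp(buf, "GOODSEC") == 0 ? 0 : -1;
}

/* Analogous to repair_io_failure(): rewrite only the one bad copy. */
static void repair_mirror(int mirror_num, const char *good)
{
	memcpy(mirrors[mirror_num - 1], good, SECTOR_SIZE);
}

int main(void)
{
	char buf[SECTOR_SIZE];
	int failed_mirror = 0;
	int mirror;

	for (mirror = 1; mirror <= NUM_MIRRORS; mirror++) {
		if (read_mirror(mirror, buf) == 0) {
			/* Good copy found: if an earlier mirror failed,
			 * repair it before the read is allowed to complete. */
			if (failed_mirror)
				repair_mirror(failed_mirror, buf);
			printf("read ok from mirror %d: %s\n", mirror, buf);
			return 0;
		}
		/* Note the failed mirror and retry the next one. */
		failed_mirror = mirror;
	}
	fprintf(stderr, "all mirrors failed\n");
	return 1;
}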
Diffstat (limited to 'fs')
-rw-r--r--  fs/btrfs/extent_io.c  387
-rw-r--r--  fs/btrfs/extent_io.h   10
-rw-r--r--  fs/btrfs/inode.c      155
3 files changed, 393 insertions(+), 159 deletions(-)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index afebb95e3490..624ef10d36cc 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -17,6 +17,7 @@
 #include "compat.h"
 #include "ctree.h"
 #include "btrfs_inode.h"
+#include "volumes.h"
 
 static struct kmem_cache *extent_state_cache;
 static struct kmem_cache *extent_buffer_cache;
@@ -1599,6 +1600,368 @@ static int check_page_writeback(struct extent_io_tree *tree,
 	return 0;
 }
 
+/*
+ * When IO fails, either with EIO or csum verification fails, we
+ * try other mirrors that might have a good copy of the data. This
+ * io_failure_record is used to record state as we go through all the
+ * mirrors. If another mirror has good data, the page is set up to date
+ * and things continue. If a good mirror can't be found, the original
+ * bio end_io callback is called to indicate things have failed.
+ */
+struct io_failure_record {
+	struct page *page;
+	u64 start;
+	u64 len;
+	u64 logical;
+	unsigned long bio_flags;
+	int this_mirror;
+	int failed_mirror;
+	int in_validation;
+};
+
+static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
+			   int did_repair)
+{
+	int ret;
+	int err = 0;
+	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+
+	set_state_private(failure_tree, rec->start, 0);
+	ret = clear_extent_bits(failure_tree, rec->start,
+				rec->start + rec->len - 1,
+				EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
+	if (ret)
+		err = ret;
+
+	if (did_repair) {
+		ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
+					rec->start + rec->len - 1,
+					EXTENT_DAMAGED, GFP_NOFS);
+		if (ret && !err)
+			err = ret;
+	}
+
+	kfree(rec);
+	return err;
+}
+
+static void repair_io_failure_callback(struct bio *bio, int err)
+{
+	complete(bio->bi_private);
+}
+
+/*
+ * this bypasses the standard btrfs submit functions deliberately, as
+ * the standard behavior is to write all copies in a raid setup. here we only
+ * want to write the one bad copy. so we do the mapping for ourselves and issue
+ * submit_bio directly.
+ * to avoid any synchronization issues, wait for the data after writing, which
+ * actually prevents the read that triggered the error from finishing.
+ * currently, there can be no more than two copies of every data bit. thus,
+ * exactly one rewrite is required.
+ */
+int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
+		      u64 length, u64 logical, struct page *page,
+		      int mirror_num)
+{
+	struct bio *bio;
+	struct btrfs_device *dev;
+	DECLARE_COMPLETION_ONSTACK(compl);
+	u64 map_length = 0;
+	u64 sector;
+	struct btrfs_bio *bbio = NULL;
+	int ret;
+
+	BUG_ON(!mirror_num);
+
+	bio = bio_alloc(GFP_NOFS, 1);
+	if (!bio)
+		return -EIO;
+	bio->bi_private = &compl;
+	bio->bi_end_io = repair_io_failure_callback;
+	bio->bi_size = 0;
+	map_length = length;
+
+	ret = btrfs_map_block(map_tree, WRITE, logical,
+			      &map_length, &bbio, mirror_num);
+	if (ret) {
+		bio_put(bio);
+		return -EIO;
+	}
+	BUG_ON(mirror_num != bbio->mirror_num);
+	sector = bbio->stripes[mirror_num - 1].physical >> 9;
+	bio->bi_sector = sector;
+	dev = bbio->stripes[mirror_num - 1].dev;
+	kfree(bbio);
+	if (!dev || !dev->bdev || !dev->writeable) {
+		bio_put(bio);
+		return -EIO;
+	}
+	bio->bi_bdev = dev->bdev;
+	bio_add_page(bio, page, length, start - page_offset(page));
+	submit_bio(WRITE_SYNC, bio);
+	wait_for_completion(&compl);
+
+	if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+		/* try to remap that extent elsewhere? */
+		bio_put(bio);
+		return -EIO;
+	}
+
+	printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s "
+	       "sector %llu)\n", page->mapping->host->i_ino, start,
+	       dev->name, sector);
+
+	bio_put(bio);
+	return 0;
+}
+
+/*
+ * each time an IO finishes, we do a fast check in the IO failure tree
+ * to see if we need to process or clean up an io_failure_record
+ */
+static int clean_io_failure(u64 start, struct page *page)
+{
+	u64 private;
+	u64 private_failure;
+	struct io_failure_record *failrec;
+	struct btrfs_mapping_tree *map_tree;
+	struct extent_state *state;
+	int num_copies;
+	int did_repair = 0;
+	int ret;
+	struct inode *inode = page->mapping->host;
+
+	private = 0;
+	ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
+			       (u64)-1, 1, EXTENT_DIRTY, 0);
+	if (!ret)
+		return 0;
+
+	ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
+				&private_failure);
+	if (ret)
+		return 0;
+
+	failrec = (struct io_failure_record *)(unsigned long)private_failure;
+	BUG_ON(!failrec->this_mirror);
+
+	if (failrec->in_validation) {
+		/* there was no real error, just free the record */
+		pr_debug("clean_io_failure: freeing dummy error at %llu\n",
+			 failrec->start);
+		did_repair = 1;
+		goto out;
+	}
+
+	spin_lock(&BTRFS_I(inode)->io_tree.lock);
+	state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
+					    failrec->start,
+					    EXTENT_LOCKED);
+	spin_unlock(&BTRFS_I(inode)->io_tree.lock);
+
+	if (state && state->start == failrec->start) {
+		map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
+		num_copies = btrfs_num_copies(map_tree, failrec->logical,
+					      failrec->len);
+		if (num_copies > 1) {
+			ret = repair_io_failure(map_tree, start, failrec->len,
+						failrec->logical, page,
+						failrec->failed_mirror);
+			did_repair = !ret;
+		}
+	}
+
+out:
+	if (!ret)
+		ret = free_io_failure(inode, failrec, did_repair);
+
+	return ret;
+}
+
+/*
+ * this is a generic handler for readpage errors (default
+ * readpage_io_failed_hook). if other copies exist, read those and write back
+ * good data to the failed position. does not attempt to remap the failed
+ * extent elsewhere, hoping the device will be smart enough to do this as
+ * needed
+ */
+
+static int bio_readpage_error(struct bio *failed_bio, struct page *page,
+			      u64 start, u64 end, int failed_mirror,
+			      struct extent_state *state)
+{
+	struct io_failure_record *failrec = NULL;
+	u64 private;
+	struct extent_map *em;
+	struct inode *inode = page->mapping->host;
+	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct bio *bio;
+	int num_copies;
+	int ret;
+	int read_mode;
+	u64 logical;
+
+	BUG_ON(failed_bio->bi_rw & REQ_WRITE);
+
+	ret = get_state_private(failure_tree, start, &private);
+	if (ret) {
+		failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
+		if (!failrec)
+			return -ENOMEM;
+		failrec->start = start;
+		failrec->len = end - start + 1;
+		failrec->this_mirror = 0;
+		failrec->bio_flags = 0;
+		failrec->in_validation = 0;
+
+		read_lock(&em_tree->lock);
+		em = lookup_extent_mapping(em_tree, start, failrec->len);
+		if (!em) {
+			read_unlock(&em_tree->lock);
+			kfree(failrec);
+			return -EIO;
+		}
+
+		if (em->start > start || em->start + em->len < start) {
+			free_extent_map(em);
+			em = NULL;
+		}
+		read_unlock(&em_tree->lock);
+
+		if (!em || IS_ERR(em)) {
+			kfree(failrec);
+			return -EIO;
+		}
+		logical = start - em->start;
+		logical = em->block_start + logical;
+		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+			logical = em->block_start;
+			failrec->bio_flags = EXTENT_BIO_COMPRESSED;
+			extent_set_compress_type(&failrec->bio_flags,
+						 em->compress_type);
+		}
+		pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
+			 "len=%llu\n", logical, start, failrec->len);
+		failrec->logical = logical;
+		free_extent_map(em);
+
+		/* set the bits in the private failure tree */
+		ret = set_extent_bits(failure_tree, start, end,
+				      EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
+		if (ret >= 0)
+			ret = set_state_private(failure_tree, start,
+						(u64)(unsigned long)failrec);
+		/* set the bits in the inode's tree */
+		if (ret >= 0)
+			ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
+					      GFP_NOFS);
+		if (ret < 0) {
+			kfree(failrec);
+			return ret;
+		}
+	} else {
+		failrec = (struct io_failure_record *)(unsigned long)private;
+		pr_debug("bio_readpage_error: (found) logical=%llu, "
+			 "start=%llu, len=%llu, validation=%d\n",
+			 failrec->logical, failrec->start, failrec->len,
+			 failrec->in_validation);
+		/*
+		 * when data can be on disk more than twice, add to failrec here
+		 * (e.g. with a list for failed_mirror) to make
+		 * clean_io_failure() clean all those errors at once.
+		 */
+	}
+	num_copies = btrfs_num_copies(
+			&BTRFS_I(inode)->root->fs_info->mapping_tree,
+			failrec->logical, failrec->len);
+	if (num_copies == 1) {
+		/*
+		 * we only have a single copy of the data, so don't bother with
+		 * all the retry and error correction code that follows. no
+		 * matter what the error is, it is very likely to persist.
+		 */
+		pr_debug("bio_readpage_error: cannot repair, num_copies == 1. "
+			 "state=%p, num_copies=%d, next_mirror %d, "
+			 "failed_mirror %d\n", state, num_copies,
+			 failrec->this_mirror, failed_mirror);
+		free_io_failure(inode, failrec, 0);
+		return -EIO;
+	}
+
+	if (!state) {
+		spin_lock(&tree->lock);
+		state = find_first_extent_bit_state(tree, failrec->start,
+						    EXTENT_LOCKED);
+		if (state && state->start != failrec->start)
+			state = NULL;
+		spin_unlock(&tree->lock);
+	}
+
+	/*
+	 * there are two premises:
+	 * a) deliver good data to the caller
+	 * b) correct the bad sectors on disk
+	 */
+	if (failed_bio->bi_vcnt > 1) {
+		/*
+		 * to fulfill b), we need to know the exact failing sectors, as
+		 * we don't want to rewrite any more than the failed ones. thus,
+		 * we need separate read requests for the failed bio
+		 *
+		 * if the following BUG_ON triggers, our validation request got
+		 * merged. we need separate requests for our algorithm to work.
+		 */
+		BUG_ON(failrec->in_validation);
+		failrec->in_validation = 1;
+		failrec->this_mirror = failed_mirror;
+		read_mode = READ_SYNC | REQ_FAILFAST_DEV;
+	} else {
+		/*
+		 * we're ready to fulfill a) and b) alongside. get a good copy
+		 * of the failed sector and if we succeed, we have setup
+		 * everything for repair_io_failure to do the rest for us.
+		 */
+		if (failrec->in_validation) {
+			BUG_ON(failrec->this_mirror != failed_mirror);
+			failrec->in_validation = 0;
+			failrec->this_mirror = 0;
+		}
+		failrec->failed_mirror = failed_mirror;
+		failrec->this_mirror++;
+		if (failrec->this_mirror == failed_mirror)
+			failrec->this_mirror++;
+		read_mode = READ_SYNC;
+	}
+
+	if (!state || failrec->this_mirror > num_copies) {
+		pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, "
+			 "next_mirror %d, failed_mirror %d\n", state,
+			 num_copies, failrec->this_mirror, failed_mirror);
+		free_io_failure(inode, failrec, 0);
+		return -EIO;
+	}
+
+	bio = bio_alloc(GFP_NOFS, 1);
+	bio->bi_private = state;
+	bio->bi_end_io = failed_bio->bi_end_io;
+	bio->bi_sector = failrec->logical >> 9;
+	bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+	bio->bi_size = 0;
+
+	bio_add_page(bio, page, failrec->len, start - page_offset(page));
+
+	pr_debug("bio_readpage_error: submitting new read[%#x] to "
+		 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
+		 failrec->this_mirror, num_copies, failrec->in_validation);
+
+	tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror,
+				   failrec->bio_flags, 0);
+	return 0;
+}
+
 /* lots and lots of room for performance fixes in the end_bio funcs */
 
 /*
@@ -1697,6 +2060,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 		struct extent_state *cached = NULL;
 		struct extent_state *state;
 
+		pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, "
+			 "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err,
+			 (long int)bio->bi_bdev);
 		tree = &BTRFS_I(page->mapping->host)->io_tree;
 
 		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
@@ -1727,11 +2093,19 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 							    state);
 			if (ret)
 				uptodate = 0;
+			else
+				clean_io_failure(start, page);
 		}
-		if (!uptodate && tree->ops &&
-		    tree->ops->readpage_io_failed_hook) {
-			ret = tree->ops->readpage_io_failed_hook(bio, page,
-							 start, end, NULL);
+		if (!uptodate) {
+			u64 failed_mirror;
+			failed_mirror = (u64)bio->bi_bdev;
+			if (tree->ops && tree->ops->readpage_io_failed_hook)
+				ret = tree->ops->readpage_io_failed_hook(
+						bio, page, start, end,
+						failed_mirror, NULL);
+			else
+				ret = bio_readpage_error(bio, page, start, end,
+							 failed_mirror, NULL);
 			if (ret == 0) {
 				uptodate =
 					test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -1811,6 +2185,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
 					   mirror_num, bio_flags, start);
 	else
 		submit_bio(rw, bio);
+
 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
 		ret = -EOPNOTSUPP;
 	bio_put(bio);
@@ -2926,7 +3301,7 @@ out:
 	return ret;
 }
 
-static inline struct page *extent_buffer_page(struct extent_buffer *eb,
+inline struct page *extent_buffer_page(struct extent_buffer *eb,
 					      unsigned long i)
 {
 	struct page *p;
@@ -2951,7 +3326,7 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb,
 	return p;
 }
 
-static inline unsigned long num_extent_pages(u64 start, u64 len)
+inline unsigned long num_extent_pages(u64 start, u64 len)
 {
 	return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
 		(start >> PAGE_CACHE_SHIFT);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 435d454b9926..a8e20b672922 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -68,7 +68,7 @@ struct extent_io_ops {
 				       unsigned long bio_flags);
 	int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
 	int (*readpage_io_failed_hook)(struct bio *bio, struct page *page,
-				       u64 start, u64 end,
+				       u64 start, u64 end, u64 failed_mirror,
 				       struct extent_state *state);
 	int (*writepage_io_failed_hook)(struct bio *bio, struct page *page,
 				       u64 start, u64 end,
@@ -252,6 +252,8 @@ void free_extent_buffer(struct extent_buffer *eb);
 int read_extent_buffer_pages(struct extent_io_tree *tree,
 			     struct extent_buffer *eb, u64 start, int wait,
 			     get_extent_t *get_extent, int mirror_num);
+unsigned long num_extent_pages(u64 start, u64 len);
+struct page *extent_buffer_page(struct extent_buffer *eb, unsigned long i);
 
 static inline void extent_buffer_get(struct extent_buffer *eb)
 {
@@ -301,4 +303,10 @@ int extent_clear_unlock_delalloc(struct inode *inode,
 struct bio *
 btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs,
 		gfp_t gfp_flags);
+
+struct btrfs_mapping_tree;
+
+int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
+		      u64 length, u64 logical, struct page *page,
+		      int mirror_num);
 #endif
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 936a6fabaa9f..9327f45434e8 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -45,10 +45,10 @@
 #include "btrfs_inode.h"
 #include "ioctl.h"
 #include "print-tree.h"
-#include "volumes.h"
 #include "ordered-data.h"
 #include "xattr.h"
 #include "tree-log.h"
+#include "volumes.h"
 #include "compression.h"
 #include "locking.h"
 #include "free-space-cache.h"
@@ -1819,153 +1819,9 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
 }
 
 /*
- * When IO fails, either with EIO or csum verification fails, we
- * try other mirrors that might have a good copy of the data. This
- * io_failure_record is used to record state as we go through all the
- * mirrors. If another mirror has good data, the page is set up to date
- * and things continue. If a good mirror can't be found, the original
- * bio end_io callback is called to indicate things have failed.
- */
-struct io_failure_record {
-	struct page *page;
-	u64 start;
-	u64 len;
-	u64 logical;
-	unsigned long bio_flags;
-	int last_mirror;
-};
-
-static int btrfs_io_failed_hook(struct bio *failed_bio,
-				struct page *page, u64 start, u64 end,
-				struct extent_state *state)
-{
-	struct io_failure_record *failrec = NULL;
-	u64 private;
-	struct extent_map *em;
-	struct inode *inode = page->mapping->host;
-	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
-	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
-	struct bio *bio;
-	int num_copies;
-	int ret;
-	int rw;
-	u64 logical;
-
-	ret = get_state_private(failure_tree, start, &private);
-	if (ret) {
-		failrec = kmalloc(sizeof(*failrec), GFP_NOFS);
-		if (!failrec)
-			return -ENOMEM;
-		failrec->start = start;
-		failrec->len = end - start + 1;
-		failrec->last_mirror = 0;
-		failrec->bio_flags = 0;
-
-		read_lock(&em_tree->lock);
-		em = lookup_extent_mapping(em_tree, start, failrec->len);
-		if (em->start > start || em->start + em->len < start) {
-			free_extent_map(em);
-			em = NULL;
-		}
-		read_unlock(&em_tree->lock);
-
-		if (IS_ERR_OR_NULL(em)) {
-			kfree(failrec);
-			return -EIO;
-		}
-		logical = start - em->start;
-		logical = em->block_start + logical;
-		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
-			logical = em->block_start;
-			failrec->bio_flags = EXTENT_BIO_COMPRESSED;
-			extent_set_compress_type(&failrec->bio_flags,
-						 em->compress_type);
-		}
-		failrec->logical = logical;
-		free_extent_map(em);
-		set_extent_bits(failure_tree, start, end, EXTENT_LOCKED |
-				EXTENT_DIRTY, GFP_NOFS);
-		set_state_private(failure_tree, start,
-				  (u64)(unsigned long)failrec);
-	} else {
-		failrec = (struct io_failure_record *)(unsigned long)private;
-	}
-	num_copies = btrfs_num_copies(
-			&BTRFS_I(inode)->root->fs_info->mapping_tree,
-			failrec->logical, failrec->len);
-	failrec->last_mirror++;
-	if (!state) {
-		spin_lock(&BTRFS_I(inode)->io_tree.lock);
-		state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
-						    failrec->start,
-						    EXTENT_LOCKED);
-		if (state && state->start != failrec->start)
-			state = NULL;
-		spin_unlock(&BTRFS_I(inode)->io_tree.lock);
-	}
-	if (!state || failrec->last_mirror > num_copies) {
-		set_state_private(failure_tree, failrec->start, 0);
-		clear_extent_bits(failure_tree, failrec->start,
-				  failrec->start + failrec->len - 1,
-				  EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
-		kfree(failrec);
-		return -EIO;
-	}
-	bio = bio_alloc(GFP_NOFS, 1);
-	bio->bi_private = state;
-	bio->bi_end_io = failed_bio->bi_end_io;
-	bio->bi_sector = failrec->logical >> 9;
-	bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
-	bio->bi_size = 0;
-
-	bio_add_page(bio, page, failrec->len, start - page_offset(page));
-	if (failed_bio->bi_rw & REQ_WRITE)
-		rw = WRITE;
-	else
-		rw = READ;
-
-	ret = BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio,
-						failrec->last_mirror,
-						failrec->bio_flags, 0);
-	return ret;
-}
-
-/*
- * each time an IO finishes, we do a fast check in the IO failure tree
- * to see if we need to process or clean up an io_failure_record
- */
-static int btrfs_clean_io_failures(struct inode *inode, u64 start)
-{
-	u64 private;
-	u64 private_failure;
-	struct io_failure_record *failure;
-	int ret;
-
-	private = 0;
-	if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
-			     (u64)-1, 1, EXTENT_DIRTY, 0)) {
-		ret = get_state_private(&BTRFS_I(inode)->io_failure_tree,
-					start, &private_failure);
-		if (ret == 0) {
-			failure = (struct io_failure_record *)(unsigned long)
-				private_failure;
-			set_state_private(&BTRFS_I(inode)->io_failure_tree,
-					  failure->start, 0);
-			clear_extent_bits(&BTRFS_I(inode)->io_failure_tree,
-					  failure->start,
-					  failure->start + failure->len - 1,
-					  EXTENT_DIRTY | EXTENT_LOCKED,
-					  GFP_NOFS);
-			kfree(failure);
-		}
-	}
-	return 0;
-}
-
-/*
  * when reads are done, we need to check csums to verify the data is correct
- * if there's a match, we allow the bio to finish. If not, we go through
- * the io_failure_record routines to find good copies
+ * if there's a match, we allow the bio to finish. If not, the code in
+ * extent_io.c will try to find good copies for us.
  */
 static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 				      struct extent_state *state)
@@ -2011,10 +1867,6 @@ static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end,
 
 	kunmap_atomic(kaddr, KM_USER0);
 good:
-	/* if the io failure tree for this inode is non-empty,
-	 * check to see if we've recovered from a failed IO
-	 */
-	btrfs_clean_io_failures(inode, start);
 	return 0;
 
 zeroit:
@@ -7420,7 +7272,6 @@ static struct extent_io_ops btrfs_extent_io_ops = {
 	.readpage_end_io_hook = btrfs_readpage_end_io_hook,
 	.writepage_end_io_hook = btrfs_writepage_end_io_hook,
 	.writepage_start_hook = btrfs_writepage_start_hook,
-	.readpage_io_failed_hook = btrfs_io_failed_hook,
 	.set_bit_hook = btrfs_set_bit_hook,
 	.clear_bit_hook = btrfs_clear_bit_hook,
 	.merge_extent_hook = btrfs_merge_extent_hook,
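
A closing note on the mirror arithmetic in bio_readpage_error() above:
mirrors are numbered from 1, this_mirror skips failed_mirror, and the retry
stops once this_mirror exceeds num_copies. A minimal sketch of just that
selection loop (num_copies and the mirror numbers here are illustrative,
not taken from the patch):

#include <stdio.h>

/* Sketch of the next-mirror selection in bio_readpage_error():
 * mirrors are numbered from 1; skip the mirror that failed and
 * stop once every copy has been tried. */
static int next_mirror(int this_mirror, int failed_mirror, int num_copies)
{
	this_mirror++;
	if (this_mirror == failed_mirror)
		this_mirror++;
	if (this_mirror > num_copies)
		return -1; /* all copies exhausted: caller returns -EIO */
	return this_mirror;
}

int main(void)
{
	int num_copies = 2, failed_mirror = 1, mirror = 0;

	while ((mirror = next_mirror(mirror, failed_mirror, num_copies)) > 0)
		printf("retrying read from mirror %d\n", mirror);
	return 0;
}

With two copies this yields exactly one retry and at most one rewrite, which
is the case the comment above repair_io_failure() calls out.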