author     Jan Schmidt <list.btrfs@jan-o-sch.net>    2011-07-22 09:41:52 -0400
committer  Jan Schmidt <list.btrfs@jan-o-sch.net>    2011-09-29 07:38:42 -0400
commit     4a54c8c165b66300830a67349fc7595e3fc442f7 (patch)
tree       fea3a5e86c2ee9304711e18b2557912166d6d52f /fs/btrfs/extent_io.c
parent     2774b2ca3d49124bf1ae89e8d575b3dab4221266 (diff)
btrfs: Moved repair code from inode.c to extent_io.c
The raid-retry code in inode.c can be generalized so that it works for
metadata as well. Thus, this patch moves it to extent_io.c and makes the
raid-retry code a raid-repair code.

Repair works that way: Whenever a read error occurs and we have more
mirrors to try, note the failed mirror, and retry another. If we find a
good one, check if we did note a failure earlier and if so, do not allow
the read to complete until after the bad sector was written with the good
data we just fetched. As we have the extent locked while reading, no one
can change the data in between.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
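To make the retry-and-repair flow concrete, below is a minimal user-space
sketch of the idea described above. It is not the kernel code: the
two-mirror array, read_mirror() and write_mirror() are hypothetical
stand-ins for btrfs_map_block() and the bio plumbing this patch adds, and
the "checksum" is a hard-coded comparison. The kernel additionally skips
the known-bad mirror when picking the next one to try; this sketch simply
walks the mirrors in order.

/*
 * Sketch of the read-retry/repair idea (user space, assumptions noted above).
 * Read a sector from each mirror in turn; if an earlier mirror failed and a
 * later one is good, rewrite the bad copy before completing the read.
 */
#include <stdio.h>
#include <string.h>

#define NUM_MIRRORS 2
#define SECTOR_SIZE 8

static char mirrors[NUM_MIRRORS][SECTOR_SIZE] = {
	"BADBAD!",		/* mirror 1 holds a corrupted copy */
	"GOODATA",		/* mirror 2 holds the good copy */
};

/* pretend checksum verification: the good contents are known in advance */
static int csum_ok(const char *buf)
{
	return memcmp(buf, "GOODATA", SECTOR_SIZE) == 0;
}

/* stand-in for submitting a read bio to one mirror */
static int read_mirror(int mirror, char *buf)
{
	memcpy(buf, mirrors[mirror - 1], SECTOR_SIZE);
	return csum_ok(buf) ? 0 : -1;
}

/* stand-in for the direct rewrite done by repair_io_failure() */
static void write_mirror(int mirror, const char *buf)
{
	memcpy(mirrors[mirror - 1], buf, SECTOR_SIZE);
}

int main(void)
{
	char buf[SECTOR_SIZE];
	int failed_mirror = 0;
	int mirror;

	for (mirror = 1; mirror <= NUM_MIRRORS; mirror++) {
		if (read_mirror(mirror, buf) == 0) {
			/*
			 * Good copy found. If an earlier mirror failed,
			 * rewrite it before the read is allowed to complete
			 * (in the kernel the extent stays locked meanwhile).
			 */
			if (failed_mirror) {
				write_mirror(failed_mirror, buf);
				printf("read ok from mirror %d, repaired mirror %d\n",
				       mirror, failed_mirror);
			} else {
				printf("read ok from mirror %d, no repair needed\n",
				       mirror);
			}
			return 0;
		}
		failed_mirror = mirror;	/* note the failure, try the next one */
	}
	fprintf(stderr, "all mirrors bad, the read fails with an I/O error\n");
	return 1;
}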
Diffstat (limited to 'fs/btrfs/extent_io.c')
-rw-r--r--   fs/btrfs/extent_io.c   387
1 file changed, 381 insertions(+), 6 deletions(-)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index afebb95e3490..624ef10d36cc 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -17,6 +17,7 @@
 #include "compat.h"
 #include "ctree.h"
 #include "btrfs_inode.h"
+#include "volumes.h"
 
 static struct kmem_cache *extent_state_cache;
 static struct kmem_cache *extent_buffer_cache;
@@ -1599,6 +1600,368 @@ static int check_page_writeback(struct extent_io_tree *tree,
 	return 0;
 }
 
+/*
+ * When IO fails, either with EIO or csum verification fails, we
+ * try other mirrors that might have a good copy of the data. This
+ * io_failure_record is used to record state as we go through all the
+ * mirrors. If another mirror has good data, the page is set up to date
+ * and things continue. If a good mirror can't be found, the original
+ * bio end_io callback is called to indicate things have failed.
+ */
+struct io_failure_record {
+	struct page *page;
+	u64 start;
+	u64 len;
+	u64 logical;
+	unsigned long bio_flags;
+	int this_mirror;
+	int failed_mirror;
+	int in_validation;
+};
+
+static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
+				int did_repair)
+{
+	int ret;
+	int err = 0;
+	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+
+	set_state_private(failure_tree, rec->start, 0);
+	ret = clear_extent_bits(failure_tree, rec->start,
+				rec->start + rec->len - 1,
+				EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
+	if (ret)
+		err = ret;
+
+	if (did_repair) {
+		ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
+					rec->start + rec->len - 1,
+					EXTENT_DAMAGED, GFP_NOFS);
+		if (ret && !err)
+			err = ret;
+	}
+
+	kfree(rec);
+	return err;
+}
+
+static void repair_io_failure_callback(struct bio *bio, int err)
+{
+	complete(bio->bi_private);
+}
+
+/*
+ * this bypasses the standard btrfs submit functions deliberately, as
+ * the standard behavior is to write all copies in a raid setup. here we only
+ * want to write the one bad copy. so we do the mapping for ourselves and issue
+ * submit_bio directly.
+ * to avoid any synchonization issues, wait for the data after writing, which
+ * actually prevents the read that triggered the error from finishing.
+ * currently, there can be no more than two copies of every data bit. thus,
+ * exactly one rewrite is required.
+ */
+int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
+			u64 length, u64 logical, struct page *page,
+			int mirror_num)
+{
+	struct bio *bio;
+	struct btrfs_device *dev;
+	DECLARE_COMPLETION_ONSTACK(compl);
+	u64 map_length = 0;
+	u64 sector;
+	struct btrfs_bio *bbio = NULL;
+	int ret;
+
+	BUG_ON(!mirror_num);
+
+	bio = bio_alloc(GFP_NOFS, 1);
+	if (!bio)
+		return -EIO;
+	bio->bi_private = &compl;
+	bio->bi_end_io = repair_io_failure_callback;
+	bio->bi_size = 0;
+	map_length = length;
+
+	ret = btrfs_map_block(map_tree, WRITE, logical,
+			      &map_length, &bbio, mirror_num);
+	if (ret) {
+		bio_put(bio);
+		return -EIO;
+	}
+	BUG_ON(mirror_num != bbio->mirror_num);
+	sector = bbio->stripes[mirror_num-1].physical >> 9;
+	bio->bi_sector = sector;
+	dev = bbio->stripes[mirror_num-1].dev;
+	kfree(bbio);
+	if (!dev || !dev->bdev || !dev->writeable) {
+		bio_put(bio);
+		return -EIO;
+	}
+	bio->bi_bdev = dev->bdev;
+	bio_add_page(bio, page, length, start-page_offset(page));
+	submit_bio(WRITE_SYNC, bio);
+	wait_for_completion(&compl);
+
+	if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+		/* try to remap that extent elsewhere? */
+		bio_put(bio);
+		return -EIO;
+	}
+
+	printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s "
+			"sector %llu)\n", page->mapping->host->i_ino, start,
+			dev->name, sector);
+
+	bio_put(bio);
+	return 0;
+}
+
+/*
+ * each time an IO finishes, we do a fast check in the IO failure tree
+ * to see if we need to process or clean up an io_failure_record
+ */
+static int clean_io_failure(u64 start, struct page *page)
+{
+	u64 private;
+	u64 private_failure;
+	struct io_failure_record *failrec;
+	struct btrfs_mapping_tree *map_tree;
+	struct extent_state *state;
+	int num_copies;
+	int did_repair = 0;
+	int ret;
+	struct inode *inode = page->mapping->host;
+
+	private = 0;
+	ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
+				(u64)-1, 1, EXTENT_DIRTY, 0);
+	if (!ret)
+		return 0;
+
+	ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
+				&private_failure);
+	if (ret)
+		return 0;
+
+	failrec = (struct io_failure_record *)(unsigned long) private_failure;
+	BUG_ON(!failrec->this_mirror);
+
+	if (failrec->in_validation) {
+		/* there was no real error, just free the record */
+		pr_debug("clean_io_failure: freeing dummy error at %llu\n",
+			 failrec->start);
+		did_repair = 1;
+		goto out;
+	}
+
+	spin_lock(&BTRFS_I(inode)->io_tree.lock);
+	state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
+					    failrec->start,
+					    EXTENT_LOCKED);
+	spin_unlock(&BTRFS_I(inode)->io_tree.lock);
+
+	if (state && state->start == failrec->start) {
+		map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
+		num_copies = btrfs_num_copies(map_tree, failrec->logical,
+						failrec->len);
+		if (num_copies > 1) {
+			ret = repair_io_failure(map_tree, start, failrec->len,
+						failrec->logical, page,
+						failrec->failed_mirror);
+			did_repair = !ret;
+		}
+	}
+
+out:
+	if (!ret)
+		ret = free_io_failure(inode, failrec, did_repair);
+
+	return ret;
+}
+
+/*
+ * this is a generic handler for readpage errors (default
+ * readpage_io_failed_hook). if other copies exist, read those and write back
+ * good data to the failed position. does not investigate in remapping the
+ * failed extent elsewhere, hoping the device will be smart enough to do this
+ * as needed
+ */
+
+static int bio_readpage_error(struct bio *failed_bio, struct page *page,
+				u64 start, u64 end, int failed_mirror,
+				struct extent_state *state)
+{
+	struct io_failure_record *failrec = NULL;
+	u64 private;
+	struct extent_map *em;
+	struct inode *inode = page->mapping->host;
+	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct bio *bio;
+	int num_copies;
+	int ret;
+	int read_mode;
+	u64 logical;
+
+	BUG_ON(failed_bio->bi_rw & REQ_WRITE);
+
+	ret = get_state_private(failure_tree, start, &private);
+	if (ret) {
+		failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
+		if (!failrec)
+			return -ENOMEM;
+		failrec->start = start;
+		failrec->len = end - start + 1;
+		failrec->this_mirror = 0;
+		failrec->bio_flags = 0;
+		failrec->in_validation = 0;
+
+		read_lock(&em_tree->lock);
+		em = lookup_extent_mapping(em_tree, start, failrec->len);
+		if (!em) {
+			read_unlock(&em_tree->lock);
+			kfree(failrec);
+			return -EIO;
+		}
+
+		if (em->start > start || em->start + em->len < start) {
+			free_extent_map(em);
+			em = NULL;
+		}
+		read_unlock(&em_tree->lock);
+
+		if (!em || IS_ERR(em)) {
+			kfree(failrec);
+			return -EIO;
+		}
+		logical = start - em->start;
+		logical = em->block_start + logical;
+		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+			logical = em->block_start;
+			failrec->bio_flags = EXTENT_BIO_COMPRESSED;
+			extent_set_compress_type(&failrec->bio_flags,
+						 em->compress_type);
+		}
+		pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
+			 "len=%llu\n", logical, start, failrec->len);
+		failrec->logical = logical;
+		free_extent_map(em);
+
+		/* set the bits in the private failure tree */
+		ret = set_extent_bits(failure_tree, start, end,
+					EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
+		if (ret >= 0)
+			ret = set_state_private(failure_tree, start,
+						(u64)(unsigned long)failrec);
+		/* set the bits in the inode's tree */
+		if (ret >= 0)
+			ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
+						GFP_NOFS);
+		if (ret < 0) {
+			kfree(failrec);
+			return ret;
+		}
+	} else {
+		failrec = (struct io_failure_record *)(unsigned long)private;
+		pr_debug("bio_readpage_error: (found) logical=%llu, "
+			 "start=%llu, len=%llu, validation=%d\n",
+			 failrec->logical, failrec->start, failrec->len,
+			 failrec->in_validation);
+		/*
+		 * when data can be on disk more than twice, add to failrec here
+		 * (e.g. with a list for failed_mirror) to make
+		 * clean_io_failure() clean all those errors at once.
+		 */
+	}
+	num_copies = btrfs_num_copies(
+			      &BTRFS_I(inode)->root->fs_info->mapping_tree,
+			      failrec->logical, failrec->len);
+	if (num_copies == 1) {
+		/*
+		 * we only have a single copy of the data, so don't bother with
+		 * all the retry and error correction code that follows. no
+		 * matter what the error is, it is very likely to persist.
+		 */
+		pr_debug("bio_readpage_error: cannot repair, num_copies == 1. "
+			 "state=%p, num_copies=%d, next_mirror %d, "
+			 "failed_mirror %d\n", state, num_copies,
+			 failrec->this_mirror, failed_mirror);
+		free_io_failure(inode, failrec, 0);
+		return -EIO;
+	}
+
+	if (!state) {
+		spin_lock(&tree->lock);
+		state = find_first_extent_bit_state(tree, failrec->start,
+						    EXTENT_LOCKED);
+		if (state && state->start != failrec->start)
+			state = NULL;
+		spin_unlock(&tree->lock);
+	}
+
+	/*
+	 * there are two premises:
+	 *	a) deliver good data to the caller
+	 *	b) correct the bad sectors on disk
+	 */
+	if (failed_bio->bi_vcnt > 1) {
+		/*
+		 * to fulfill b), we need to know the exact failing sectors, as
+		 * we don't want to rewrite any more than the failed ones. thus,
+		 * we need separate read requests for the failed bio
+		 *
+		 * if the following BUG_ON triggers, our validation request got
+		 * merged. we need separate requests for our algorithm to work.
+		 */
+		BUG_ON(failrec->in_validation);
+		failrec->in_validation = 1;
+		failrec->this_mirror = failed_mirror;
+		read_mode = READ_SYNC | REQ_FAILFAST_DEV;
+	} else {
+		/*
+		 * we're ready to fulfill a) and b) alongside. get a good copy
+		 * of the failed sector and if we succeed, we have setup
+		 * everything for repair_io_failure to do the rest for us.
+		 */
+		if (failrec->in_validation) {
+			BUG_ON(failrec->this_mirror != failed_mirror);
+			failrec->in_validation = 0;
+			failrec->this_mirror = 0;
+		}
+		failrec->failed_mirror = failed_mirror;
+		failrec->this_mirror++;
+		if (failrec->this_mirror == failed_mirror)
+			failrec->this_mirror++;
+		read_mode = READ_SYNC;
+	}
+
+	if (!state || failrec->this_mirror > num_copies) {
+		pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, "
+			 "next_mirror %d, failed_mirror %d\n", state,
+			 num_copies, failrec->this_mirror, failed_mirror);
+		free_io_failure(inode, failrec, 0);
+		return -EIO;
+	}
+
+	bio = bio_alloc(GFP_NOFS, 1);
+	bio->bi_private = state;
+	bio->bi_end_io = failed_bio->bi_end_io;
+	bio->bi_sector = failrec->logical >> 9;
+	bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+	bio->bi_size = 0;
+
+	bio_add_page(bio, page, failrec->len, start - page_offset(page));
+
+	pr_debug("bio_readpage_error: submitting new read[%#x] to "
+		 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
+		 failrec->this_mirror, num_copies, failrec->in_validation);
+
+	tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror,
+					failrec->bio_flags, 0);
+	return 0;
+}
+
 /* lots and lots of room for performance fixes in the end_bio funcs */
 
 /*
@@ -1697,6 +2060,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 		struct extent_state *cached = NULL;
 		struct extent_state *state;
 
+		pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, "
+			 "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err,
+			 (long int)bio->bi_bdev);
 		tree = &BTRFS_I(page->mapping->host)->io_tree;
 
 		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
@@ -1727,11 +2093,19 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 							      state);
 			if (ret)
 				uptodate = 0;
+			else
+				clean_io_failure(start, page);
 		}
-		if (!uptodate && tree->ops &&
-		    tree->ops->readpage_io_failed_hook) {
-			ret = tree->ops->readpage_io_failed_hook(bio, page,
-							 start, end, NULL);
+		if (!uptodate) {
+			u64 failed_mirror;
+			failed_mirror = (u64)bio->bi_bdev;
+			if (tree->ops && tree->ops->readpage_io_failed_hook)
+				ret = tree->ops->readpage_io_failed_hook(
+						bio, page, start, end,
+						failed_mirror, NULL);
+			else
+				ret = bio_readpage_error(bio, page, start, end,
+						failed_mirror, NULL);
 			if (ret == 0) {
 				uptodate =
 					test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -1811,6 +2185,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
 						   mirror_num, bio_flags, start);
 	else
 		submit_bio(rw, bio);
+
 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
 		ret = -EOPNOTSUPP;
 	bio_put(bio);
@@ -2926,7 +3301,7 @@ out:
 	return ret;
 }
 
-static inline struct page *extent_buffer_page(struct extent_buffer *eb,
+inline struct page *extent_buffer_page(struct extent_buffer *eb,
 					      unsigned long i)
 {
 	struct page *p;
@@ -2951,7 +3326,7 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb,
 	return p;
 }
 
-static inline unsigned long num_extent_pages(u64 start, u64 len)
+inline unsigned long num_extent_pages(u64 start, u64 len)
 {
 	return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
 		(start >> PAGE_CACHE_SHIFT);