author     Jan Schmidt <list.btrfs@jan-o-sch.net>    2011-07-22 09:41:52 -0400
committer  Jan Schmidt <list.btrfs@jan-o-sch.net>    2011-09-29 07:38:42 -0400
commit     4a54c8c165b66300830a67349fc7595e3fc442f7 (patch)
tree       fea3a5e86c2ee9304711e18b2557912166d6d52f /fs/btrfs/extent_io.c
parent     2774b2ca3d49124bf1ae89e8d575b3dab4221266 (diff)
btrfs: Moved repair code from inode.c to extent_io.c
The raid-retry code in inode.c can be generalized so that it works for
metadata as well. Thus, this patch moves it to extent_io.c and makes the
raid-retry code a raid-repair code.

Repair works that way: Whenever a read error occurs and we have more
mirrors to try, note the failed mirror, and retry another. If we find a
good one, check if we did note a failure earlier and if so, do not allow
the read to complete until after the bad sector was written with the good
data we just fetched. As we have the extent locked while reading, no one
can change the data in between.

Signed-off-by: Jan Schmidt <list.btrfs@jan-o-sch.net>
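To make the retry-and-repair flow concrete, below is a minimal user-space
sketch of the idea described above. It is not the kernel code: the
two-mirror array, read_mirror() and write_mirror() are hypothetical
stand-ins for btrfs_map_block() and the bio plumbing this patch adds, and
the "checksum" is a hard-coded comparison. The kernel additionally skips
the known-bad mirror when picking the next one to try; this sketch simply
walks the mirrors in order.

/*
 * Sketch of the read-retry/repair idea (user space, assumptions noted above).
 * Read a sector from each mirror in turn; if an earlier mirror failed and a
 * later one is good, rewrite the bad copy before completing the read.
 */
#include <stdio.h>
#include <string.h>

#define NUM_MIRRORS 2
#define SECTOR_SIZE 8

static char mirrors[NUM_MIRRORS][SECTOR_SIZE] = {
	"BADBAD!",		/* mirror 1 holds a corrupted copy */
	"GOODATA",		/* mirror 2 holds the good copy */
};

/* pretend checksum verification: the good contents are known in advance */
static int csum_ok(const char *buf)
{
	return memcmp(buf, "GOODATA", SECTOR_SIZE) == 0;
}

/* stand-in for submitting a read bio to one mirror */
static int read_mirror(int mirror, char *buf)
{
	memcpy(buf, mirrors[mirror - 1], SECTOR_SIZE);
	return csum_ok(buf) ? 0 : -1;
}

/* stand-in for the direct rewrite done by repair_io_failure() */
static void write_mirror(int mirror, const char *buf)
{
	memcpy(mirrors[mirror - 1], buf, SECTOR_SIZE);
}

int main(void)
{
	char buf[SECTOR_SIZE];
	int failed_mirror = 0;
	int mirror;

	for (mirror = 1; mirror <= NUM_MIRRORS; mirror++) {
		if (read_mirror(mirror, buf) == 0) {
			/*
			 * Good copy found. If an earlier mirror failed,
			 * rewrite it before the read is allowed to complete
			 * (in the kernel the extent stays locked meanwhile).
			 */
			if (failed_mirror) {
				write_mirror(failed_mirror, buf);
				printf("read ok from mirror %d, repaired mirror %d\n",
				       mirror, failed_mirror);
			} else {
				printf("read ok from mirror %d, no repair needed\n",
				       mirror);
			}
			return 0;
		}
		failed_mirror = mirror;	/* note the failure, try the next one */
	}
	fprintf(stderr, "all mirrors bad, the read fails with an I/O error\n");
	return 1;
}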
Diffstat (limited to 'fs/btrfs/extent_io.c')
-rw-r--r--   fs/btrfs/extent_io.c   387
1 file changed, 381 insertions(+), 6 deletions(-)
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index afebb95e3490..624ef10d36cc 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -17,6 +17,7 @@
 #include "compat.h"
 #include "ctree.h"
 #include "btrfs_inode.h"
+#include "volumes.h"
 
 static struct kmem_cache *extent_state_cache;
 static struct kmem_cache *extent_buffer_cache;
@@ -1599,6 +1600,368 @@ static int check_page_writeback(struct extent_io_tree *tree,
 	return 0;
 }
 
+/*
+ * When IO fails, either with EIO or csum verification fails, we
+ * try other mirrors that might have a good copy of the data. This
+ * io_failure_record is used to record state as we go through all the
+ * mirrors. If another mirror has good data, the page is set up to date
+ * and things continue. If a good mirror can't be found, the original
+ * bio end_io callback is called to indicate things have failed.
+ */
+struct io_failure_record {
+	struct page *page;
+	u64 start;
+	u64 len;
+	u64 logical;
+	unsigned long bio_flags;
+	int this_mirror;
+	int failed_mirror;
+	int in_validation;
+};
+
+static int free_io_failure(struct inode *inode, struct io_failure_record *rec,
+				int did_repair)
+{
+	int ret;
+	int err = 0;
+	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+
+	set_state_private(failure_tree, rec->start, 0);
+	ret = clear_extent_bits(failure_tree, rec->start,
+				rec->start + rec->len - 1,
+				EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
+	if (ret)
+		err = ret;
+
+	if (did_repair) {
+		ret = clear_extent_bits(&BTRFS_I(inode)->io_tree, rec->start,
+					rec->start + rec->len - 1,
+					EXTENT_DAMAGED, GFP_NOFS);
+		if (ret && !err)
+			err = ret;
+	}
+
+	kfree(rec);
+	return err;
+}
+
+static void repair_io_failure_callback(struct bio *bio, int err)
+{
+	complete(bio->bi_private);
+}
+
+/*
+ * this bypasses the standard btrfs submit functions deliberately, as
+ * the standard behavior is to write all copies in a raid setup. here we only
+ * want to write the one bad copy. so we do the mapping for ourselves and issue
+ * submit_bio directly.
+ * to avoid any synchonization issues, wait for the data after writing, which
+ * actually prevents the read that triggered the error from finishing.
+ * currently, there can be no more than two copies of every data bit. thus,
+ * exactly one rewrite is required.
+ */
+int repair_io_failure(struct btrfs_mapping_tree *map_tree, u64 start,
+			u64 length, u64 logical, struct page *page,
+			int mirror_num)
+{
+	struct bio *bio;
+	struct btrfs_device *dev;
+	DECLARE_COMPLETION_ONSTACK(compl);
+	u64 map_length = 0;
+	u64 sector;
+	struct btrfs_bio *bbio = NULL;
+	int ret;
+
+	BUG_ON(!mirror_num);
+
+	bio = bio_alloc(GFP_NOFS, 1);
+	if (!bio)
+		return -EIO;
+	bio->bi_private = &compl;
+	bio->bi_end_io = repair_io_failure_callback;
+	bio->bi_size = 0;
+	map_length = length;
+
+	ret = btrfs_map_block(map_tree, WRITE, logical,
+			      &map_length, &bbio, mirror_num);
+	if (ret) {
+		bio_put(bio);
+		return -EIO;
+	}
+	BUG_ON(mirror_num != bbio->mirror_num);
+	sector = bbio->stripes[mirror_num-1].physical >> 9;
+	bio->bi_sector = sector;
+	dev = bbio->stripes[mirror_num-1].dev;
+	kfree(bbio);
+	if (!dev || !dev->bdev || !dev->writeable) {
+		bio_put(bio);
+		return -EIO;
+	}
+	bio->bi_bdev = dev->bdev;
+	bio_add_page(bio, page, length, start-page_offset(page));
+	submit_bio(WRITE_SYNC, bio);
+	wait_for_completion(&compl);
+
+	if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+		/* try to remap that extent elsewhere? */
+		bio_put(bio);
+		return -EIO;
+	}
+
+	printk(KERN_INFO "btrfs read error corrected: ino %lu off %llu (dev %s "
+			"sector %llu)\n", page->mapping->host->i_ino, start,
+			dev->name, sector);
+
+	bio_put(bio);
+	return 0;
+}
+
+/*
+ * each time an IO finishes, we do a fast check in the IO failure tree
+ * to see if we need to process or clean up an io_failure_record
+ */
+static int clean_io_failure(u64 start, struct page *page)
+{
+	u64 private;
+	u64 private_failure;
+	struct io_failure_record *failrec;
+	struct btrfs_mapping_tree *map_tree;
+	struct extent_state *state;
+	int num_copies;
+	int did_repair = 0;
+	int ret;
+	struct inode *inode = page->mapping->host;
+
+	private = 0;
+	ret = count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private,
+				(u64)-1, 1, EXTENT_DIRTY, 0);
+	if (!ret)
+		return 0;
+
+	ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, start,
+				&private_failure);
+	if (ret)
+		return 0;
+
+	failrec = (struct io_failure_record *)(unsigned long) private_failure;
+	BUG_ON(!failrec->this_mirror);
+
+	if (failrec->in_validation) {
+		/* there was no real error, just free the record */
+		pr_debug("clean_io_failure: freeing dummy error at %llu\n",
+			 failrec->start);
+		did_repair = 1;
+		goto out;
+	}
+
+	spin_lock(&BTRFS_I(inode)->io_tree.lock);
+	state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree,
+					    failrec->start,
+					    EXTENT_LOCKED);
+	spin_unlock(&BTRFS_I(inode)->io_tree.lock);
+
+	if (state && state->start == failrec->start) {
+		map_tree = &BTRFS_I(inode)->root->fs_info->mapping_tree;
+		num_copies = btrfs_num_copies(map_tree, failrec->logical,
+						failrec->len);
+		if (num_copies > 1) {
+			ret = repair_io_failure(map_tree, start, failrec->len,
+						failrec->logical, page,
+						failrec->failed_mirror);
+			did_repair = !ret;
+		}
+	}
+
+out:
+	if (!ret)
+		ret = free_io_failure(inode, failrec, did_repair);
+
+	return ret;
+}
+
+/*
+ * this is a generic handler for readpage errors (default
+ * readpage_io_failed_hook). if other copies exist, read those and write back
+ * good data to the failed position. does not investigate in remapping the
+ * failed extent elsewhere, hoping the device will be smart enough to do this
+ * as needed
+ */
+
+static int bio_readpage_error(struct bio *failed_bio, struct page *page,
+				u64 start, u64 end, int failed_mirror,
+				struct extent_state *state)
+{
+	struct io_failure_record *failrec = NULL;
+	u64 private;
+	struct extent_map *em;
+	struct inode *inode = page->mapping->host;
+	struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
+	struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct bio *bio;
+	int num_copies;
+	int ret;
+	int read_mode;
+	u64 logical;
+
+	BUG_ON(failed_bio->bi_rw & REQ_WRITE);
+
+	ret = get_state_private(failure_tree, start, &private);
+	if (ret) {
+		failrec = kzalloc(sizeof(*failrec), GFP_NOFS);
+		if (!failrec)
+			return -ENOMEM;
+		failrec->start = start;
+		failrec->len = end - start + 1;
+		failrec->this_mirror = 0;
+		failrec->bio_flags = 0;
+		failrec->in_validation = 0;
+
+		read_lock(&em_tree->lock);
+		em = lookup_extent_mapping(em_tree, start, failrec->len);
+		if (!em) {
+			read_unlock(&em_tree->lock);
+			kfree(failrec);
+			return -EIO;
+		}
+
+		if (em->start > start || em->start + em->len < start) {
+			free_extent_map(em);
+			em = NULL;
+		}
+		read_unlock(&em_tree->lock);
+
+		if (!em || IS_ERR(em)) {
+			kfree(failrec);
+			return -EIO;
+		}
+		logical = start - em->start;
+		logical = em->block_start + logical;
+		if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) {
+			logical = em->block_start;
+			failrec->bio_flags = EXTENT_BIO_COMPRESSED;
+			extent_set_compress_type(&failrec->bio_flags,
+						 em->compress_type);
+		}
+		pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, "
+			 "len=%llu\n", logical, start, failrec->len);
+		failrec->logical = logical;
+		free_extent_map(em);
+
+		/* set the bits in the private failure tree */
+		ret = set_extent_bits(failure_tree, start, end,
+					EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS);
+		if (ret >= 0)
+			ret = set_state_private(failure_tree, start,
+						(u64)(unsigned long)failrec);
+		/* set the bits in the inode's tree */
+		if (ret >= 0)
+			ret = set_extent_bits(tree, start, end, EXTENT_DAMAGED,
+						GFP_NOFS);
+		if (ret < 0) {
+			kfree(failrec);
+			return ret;
+		}
+	} else {
+		failrec = (struct io_failure_record *)(unsigned long)private;
+		pr_debug("bio_readpage_error: (found) logical=%llu, "
+			 "start=%llu, len=%llu, validation=%d\n",
+			 failrec->logical, failrec->start, failrec->len,
+			 failrec->in_validation);
+		/*
+		 * when data can be on disk more than twice, add to failrec here
+		 * (e.g. with a list for failed_mirror) to make
+		 * clean_io_failure() clean all those errors at once.
+		 */
+	}
+	num_copies = btrfs_num_copies(
+			      &BTRFS_I(inode)->root->fs_info->mapping_tree,
+			      failrec->logical, failrec->len);
+	if (num_copies == 1) {
+		/*
+		 * we only have a single copy of the data, so don't bother with
+		 * all the retry and error correction code that follows. no
+		 * matter what the error is, it is very likely to persist.
+		 */
+		pr_debug("bio_readpage_error: cannot repair, num_copies == 1. "
+			 "state=%p, num_copies=%d, next_mirror %d, "
+			 "failed_mirror %d\n", state, num_copies,
+			 failrec->this_mirror, failed_mirror);
+		free_io_failure(inode, failrec, 0);
+		return -EIO;
+	}
+
+	if (!state) {
+		spin_lock(&tree->lock);
+		state = find_first_extent_bit_state(tree, failrec->start,
+						    EXTENT_LOCKED);
+		if (state && state->start != failrec->start)
+			state = NULL;
+		spin_unlock(&tree->lock);
+	}
+
+	/*
+	 * there are two premises:
+	 *	a) deliver good data to the caller
+	 *	b) correct the bad sectors on disk
+	 */
+	if (failed_bio->bi_vcnt > 1) {
+		/*
+		 * to fulfill b), we need to know the exact failing sectors, as
+		 * we don't want to rewrite any more than the failed ones. thus,
+		 * we need separate read requests for the failed bio
+		 *
+		 * if the following BUG_ON triggers, our validation request got
+		 * merged. we need separate requests for our algorithm to work.
+		 */
+		BUG_ON(failrec->in_validation);
+		failrec->in_validation = 1;
+		failrec->this_mirror = failed_mirror;
+		read_mode = READ_SYNC | REQ_FAILFAST_DEV;
+	} else {
+		/*
+		 * we're ready to fulfill a) and b) alongside. get a good copy
+		 * of the failed sector and if we succeed, we have setup
+		 * everything for repair_io_failure to do the rest for us.
+		 */
+		if (failrec->in_validation) {
+			BUG_ON(failrec->this_mirror != failed_mirror);
+			failrec->in_validation = 0;
+			failrec->this_mirror = 0;
+		}
+		failrec->failed_mirror = failed_mirror;
+		failrec->this_mirror++;
+		if (failrec->this_mirror == failed_mirror)
+			failrec->this_mirror++;
+		read_mode = READ_SYNC;
+	}
+
+	if (!state || failrec->this_mirror > num_copies) {
+		pr_debug("bio_readpage_error: (fail) state=%p, num_copies=%d, "
+			 "next_mirror %d, failed_mirror %d\n", state,
+			 num_copies, failrec->this_mirror, failed_mirror);
+		free_io_failure(inode, failrec, 0);
+		return -EIO;
+	}
+
+	bio = bio_alloc(GFP_NOFS, 1);
+	bio->bi_private = state;
+	bio->bi_end_io = failed_bio->bi_end_io;
+	bio->bi_sector = failrec->logical >> 9;
+	bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev;
+	bio->bi_size = 0;
+
+	bio_add_page(bio, page, failrec->len, start - page_offset(page));
+
+	pr_debug("bio_readpage_error: submitting new read[%#x] to "
+		 "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode,
+		 failrec->this_mirror, num_copies, failrec->in_validation);
+
+	tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror,
+					failrec->bio_flags, 0);
+	return 0;
+}
+
 /* lots and lots of room for performance fixes in the end_bio funcs */
 
 /*
@@ -1697,6 +2060,9 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 		struct extent_state *cached = NULL;
 		struct extent_state *state;
 
+		pr_debug("end_bio_extent_readpage: bi_vcnt=%d, idx=%d, err=%d, "
+			 "mirror=%ld\n", bio->bi_vcnt, bio->bi_idx, err,
+			 (long int)bio->bi_bdev);
 		tree = &BTRFS_I(page->mapping->host)->io_tree;
 
 		start = ((u64)page->index << PAGE_CACHE_SHIFT) +
@@ -1727,11 +2093,19 @@ static void end_bio_extent_readpage(struct bio *bio, int err)
 							      state);
 			if (ret)
 				uptodate = 0;
+			else
+				clean_io_failure(start, page);
 		}
-		if (!uptodate && tree->ops &&
-		    tree->ops->readpage_io_failed_hook) {
-			ret = tree->ops->readpage_io_failed_hook(bio, page,
-							 start, end, NULL);
+		if (!uptodate) {
+			u64 failed_mirror;
+			failed_mirror = (u64)bio->bi_bdev;
+			if (tree->ops && tree->ops->readpage_io_failed_hook)
+				ret = tree->ops->readpage_io_failed_hook(
+						bio, page, start, end,
+						failed_mirror, NULL);
+			else
+				ret = bio_readpage_error(bio, page, start, end,
+						failed_mirror, NULL);
 			if (ret == 0) {
 				uptodate =
 					test_bit(BIO_UPTODATE, &bio->bi_flags);
@@ -1811,6 +2185,7 @@ static int submit_one_bio(int rw, struct bio *bio, int mirror_num,
 						   mirror_num, bio_flags, start);
 	else
 		submit_bio(rw, bio);
+
 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
 		ret = -EOPNOTSUPP;
 	bio_put(bio);
@@ -2926,7 +3301,7 @@ out:
 	return ret;
 }
 
-static inline struct page *extent_buffer_page(struct extent_buffer *eb,
+inline struct page *extent_buffer_page(struct extent_buffer *eb,
 					      unsigned long i)
 {
 	struct page *p;
@@ -2951,7 +3326,7 @@ static inline struct page *extent_buffer_page(struct extent_buffer *eb,
 	return p;
 }
 
-static inline unsigned long num_extent_pages(u64 start, u64 len)
+inline unsigned long num_extent_pages(u64 start, u64 len)
 {
 	return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) -
 		(start >> PAGE_CACHE_SHIFT);