diff options
| author | NeilBrown <neilb@suse.de> | 2011-07-27 21:39:25 -0400 |
|---|---|---|
| committer | NeilBrown <neilb@suse.de> | 2011-07-27 21:39:25 -0400 |
| commit | 58c54fcca3bac5bf9290cfed31c76e4c4bfbabaf (patch) | |
| tree | 25f663873429468c3b582bc7544f983759b7592e | |
| parent | 5e5702898e93eee7d69b6efde109609a89a61001 (diff) | |
md/raid10: handle further errors during fix_read_error better.

If we find more read/write errors we should record a bad block before
failing the device.

Signed-off-by: NeilBrown <neilb@suse.de>
| -rw-r--r-- | drivers/md/raid10.c | 59 |
1 files changed, 44 insertions, 15 deletions
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index fc9ebbab3f62..8b29cd4f01c8 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
| @@ -1749,6 +1749,26 @@ static void check_decay_read_errors(mddev_t *mddev, mdk_rdev_t *rdev) | |||
| 1749 | atomic_set(&rdev->read_errors, read_errors >> hours_since_last); | 1749 | atomic_set(&rdev->read_errors, read_errors >> hours_since_last); |
| 1750 | } | 1750 | } |
| 1751 | 1751 | ||
| 1752 | static int r10_sync_page_io(mdk_rdev_t *rdev, sector_t sector, | ||
| 1753 | int sectors, struct page *page, int rw) | ||
| 1754 | { | ||
| 1755 | sector_t first_bad; | ||
| 1756 | int bad_sectors; | ||
| 1757 | |||
| 1758 | if (is_badblock(rdev, sector, sectors, &first_bad, &bad_sectors) | ||
| 1759 | && (rw == READ || test_bit(WriteErrorSeen, &rdev->flags))) | ||
| 1760 | return -1; | ||
| 1761 | if (sync_page_io(rdev, sector, sectors << 9, page, rw, false)) | ||
| 1762 | /* success */ | ||
| 1763 | return 1; | ||
| 1764 | if (rw == WRITE) | ||
| 1765 | set_bit(WriteErrorSeen, &rdev->flags); | ||
| 1766 | /* need to record an error - either for the block or the device */ | ||
| 1767 | if (!rdev_set_badblocks(rdev, sector, sectors, 0)) | ||
| 1768 | md_error(rdev->mddev, rdev); | ||
| 1769 | return 0; | ||
| 1770 | } | ||
| 1771 | |||
| 1752 | /* | 1772 | /* |
| 1753 | * This is a kernel thread which: | 1773 | * This is a kernel thread which: |
| 1754 | * | 1774 | * |
| @@ -1832,9 +1852,19 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
| 1832 | rcu_read_unlock(); | 1852 | rcu_read_unlock(); |
| 1833 | 1853 | ||
| 1834 | if (!success) { | 1854 | if (!success) { |
| 1835 | /* Cannot read from anywhere -- bye bye array */ | 1855 | /* Cannot read from anywhere, just mark the block |
| 1856 | * as bad on the first device to discourage future | ||
| 1857 | * reads. | ||
| 1858 | */ | ||
| 1836 | int dn = r10_bio->devs[r10_bio->read_slot].devnum; | 1859 | int dn = r10_bio->devs[r10_bio->read_slot].devnum; |
| 1837 | md_error(mddev, conf->mirrors[dn].rdev); | 1860 | rdev = conf->mirrors[dn].rdev; |
| 1861 | |||
| 1862 | if (!rdev_set_badblocks( | ||
| 1863 | rdev, | ||
| 1864 | r10_bio->devs[r10_bio->read_slot].addr | ||
| 1865 | + sect, | ||
| 1866 | s, 0)) | ||
| 1867 | md_error(mddev, rdev); | ||
| 1838 | break; | 1868 | break; |
| 1839 | } | 1869 | } |
| 1840 | 1870 | ||
| @@ -1855,10 +1885,10 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
| 1855 | 1885 | ||
| 1856 | atomic_inc(&rdev->nr_pending); | 1886 | atomic_inc(&rdev->nr_pending); |
| 1857 | rcu_read_unlock(); | 1887 | rcu_read_unlock(); |
| 1858 | if (sync_page_io(rdev, | 1888 | if (r10_sync_page_io(rdev, |
| 1859 | r10_bio->devs[sl].addr + | 1889 | r10_bio->devs[sl].addr + |
| 1860 | sect, | 1890 | sect, |
| 1861 | s<<9, conf->tmppage, WRITE, false) | 1891 | s<<9, conf->tmppage, WRITE) |
| 1862 | == 0) { | 1892 | == 0) { |
| 1863 | /* Well, this device is dead */ | 1893 | /* Well, this device is dead */ |
| 1864 | printk(KERN_NOTICE | 1894 | printk(KERN_NOTICE |
| @@ -1873,7 +1903,6 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
| 1873 | "drive\n", | 1903 | "drive\n", |
| 1874 | mdname(mddev), | 1904 | mdname(mddev), |
| 1875 | bdevname(rdev->bdev, b)); | 1905 | bdevname(rdev->bdev, b)); |
| 1876 | md_error(mddev, rdev); | ||
| 1877 | } | 1906 | } |
| 1878 | rdev_dec_pending(rdev, mddev); | 1907 | rdev_dec_pending(rdev, mddev); |
| 1879 | rcu_read_lock(); | 1908 | rcu_read_lock(); |
| @@ -1893,11 +1922,12 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
| 1893 | 1922 | ||
| 1894 | atomic_inc(&rdev->nr_pending); | 1923 | atomic_inc(&rdev->nr_pending); |
| 1895 | rcu_read_unlock(); | 1924 | rcu_read_unlock(); |
| 1896 | if (sync_page_io(rdev, | 1925 | switch (r10_sync_page_io(rdev, |
| 1897 | r10_bio->devs[sl].addr + | 1926 | r10_bio->devs[sl].addr + |
| 1898 | sect, | 1927 | sect, |
| 1899 | s<<9, conf->tmppage, | 1928 | s<<9, conf->tmppage, |
| 1900 | READ, false) == 0) { | 1929 | READ)) { |
| 1930 | case 0: | ||
| 1901 | /* Well, this device is dead */ | 1931 | /* Well, this device is dead */ |
| 1902 | printk(KERN_NOTICE | 1932 | printk(KERN_NOTICE |
| 1903 | "md/raid10:%s: unable to read back " | 1933 | "md/raid10:%s: unable to read back " |
| @@ -1911,9 +1941,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
| 1911 | "drive\n", | 1941 | "drive\n", |
| 1912 | mdname(mddev), | 1942 | mdname(mddev), |
| 1913 | bdevname(rdev->bdev, b)); | 1943 | bdevname(rdev->bdev, b)); |
| 1914 | 1944 | break; | |
| 1915 | md_error(mddev, rdev); | 1945 | case 1: |
| 1916 | } else { | ||
| 1917 | printk(KERN_INFO | 1946 | printk(KERN_INFO |
| 1918 | "md/raid10:%s: read error corrected" | 1947 | "md/raid10:%s: read error corrected" |
| 1919 | " (%d sectors at %llu on %s)\n", | 1948 | " (%d sectors at %llu on %s)\n", |
