| author | Linus Torvalds <torvalds@linux-foundation.org> | 2011-07-28 08:50:27 -0400 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2011-07-28 08:50:27 -0400 |
| commit | 6140333d3656f62ac7e6a5af87e7fe92cfb8d655 (patch) | |
| tree | d96f7ad2196b4383f5ca4396c956e24c82b2952c /drivers/md/md.c | |
| parent | 6f56c218666b5c7eff354364357307d18c10058b (diff) | |
| parent | 58c54fcca3bac5bf9290cfed31c76e4c4bfbabaf (diff) | |
Merge branch 'for-linus' of git://neil.brown.name/md
* 'for-linus' of git://neil.brown.name/md: (75 commits)
md/raid10: handle further errors during fix_read_error better.
md/raid10: Handle read errors during recovery better.
md/raid10: simplify read error handling during recovery.
md/raid10: record bad blocks due to write errors during resync/recovery.
md/raid10: attempt to fix read errors during resync/check
md/raid10: Handle write errors by updating badblock log.
md/raid10: clear bad-block record when write succeeds.
md/raid10: avoid writing to known bad blocks on known bad drives.
md/raid10 record bad blocks as needed during recovery.
md/raid10: avoid reading known bad blocks during resync/recovery.
md/raid10 - avoid reading from known bad blocks - part 3
md/raid10: avoid reading from known bad blocks - part 2
md/raid10: avoid reading from known bad blocks - part 1
md/raid10: Split handle_read_error out from raid10d.
md/raid10: simplify/reindent some loops.
md/raid5: Clear bad blocks on successful write.
md/raid5. Don't write to known bad block on doubtful devices.
md/raid5: write errors should be recorded as bad blocks if possible.
md/raid5: use bad-block log to improve handling of uncorrectable read errors.
md/raid5: avoid reading from known bad blocks.
...
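
The core of this merge is NeilBrown's bad-block log: instead of kicking a whole device out of the array on the first bad sector, md records per-device ranges of bad sectors and fails only I/O that touches them. Each in-memory table entry is a single u64, as the comments added further down in md.c describe. As a quick illustration, here is a standalone userspace sketch of that encoding — not the kernel code itself; only the BB_* names are taken from the kernel, and the macro bodies here merely mirror the documented layout:

```c
/* Standalone sketch of the bad-block entry layout described in the
 * comments this merge adds to md.c: one u64 per range, with the
 * acknowledged flag in bit 63, a 54-bit start sector above that, and
 * the low 9 bits storing (length - 1) so lengths 1-512 fit.
 */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define BB_MAX_LEN 512
#define BB_MAKE(a, l, ack) \
	(((uint64_t)(a) << 9) | ((l) - 1) | ((uint64_t)(!!(ack)) << 63))
#define BB_OFFSET(x) (((x) >> 9) & ((1ULL << 54) - 1))
#define BB_LEN(x) (((x) & 0x1ff) + 1)
#define BB_ACK(x) (!!((x) >> 63))

int main(void)
{
	uint64_t e = BB_MAKE(123456789ULL, 512, 1);

	assert(BB_OFFSET(e) == 123456789ULL);
	assert(BB_LEN(e) == 512);
	assert(BB_ACK(e) == 1);
	printf("start=%llu len=%u ack=%d\n",
	       (unsigned long long)BB_OFFSET(e),
	       (unsigned)BB_LEN(e), BB_ACK(e));
	return 0;
}
```

The superblock stores a second, denser packing of the same information; see where super_1_load() parses it below.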
Diffstat (limited to 'drivers/md/md.c')
-rw-r--r-- | drivers/md/md.c | 871 |
1 files changed, 785 insertions, 86 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c
index dfc9425db70b..8e221a20f5d9 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -215,6 +215,55 @@ struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, | |||
215 | } | 215 | } |
216 | EXPORT_SYMBOL_GPL(bio_clone_mddev); | 216 | EXPORT_SYMBOL_GPL(bio_clone_mddev); |
217 | 217 | ||
218 | void md_trim_bio(struct bio *bio, int offset, int size) | ||
219 | { | ||
220 | /* 'bio' is a cloned bio which we need to trim to match | ||
221 | * the given offset and size. | ||
222 | * This requires adjusting bi_sector, bi_size, and bi_io_vec | ||
223 | */ | ||
224 | int i; | ||
225 | struct bio_vec *bvec; | ||
226 | int sofar = 0; | ||
227 | |||
228 | size <<= 9; | ||
229 | if (offset == 0 && size == bio->bi_size) | ||
230 | return; | ||
231 | |||
232 | bio->bi_sector += offset; | ||
233 | bio->bi_size = size; | ||
234 | offset <<= 9; | ||
235 | clear_bit(BIO_SEG_VALID, &bio->bi_flags); | ||
236 | |||
237 | while (bio->bi_idx < bio->bi_vcnt && | ||
238 | bio->bi_io_vec[bio->bi_idx].bv_len <= offset) { | ||
239 | /* remove this whole bio_vec */ | ||
240 | offset -= bio->bi_io_vec[bio->bi_idx].bv_len; | ||
241 | bio->bi_idx++; | ||
242 | } | ||
243 | if (bio->bi_idx < bio->bi_vcnt) { | ||
244 | bio->bi_io_vec[bio->bi_idx].bv_offset += offset; | ||
245 | bio->bi_io_vec[bio->bi_idx].bv_len -= offset; | ||
246 | } | ||
247 | /* avoid any complications with bi_idx being non-zero*/ | ||
248 | if (bio->bi_idx) { | ||
249 | memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx, | ||
250 | (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec)); | ||
251 | bio->bi_vcnt -= bio->bi_idx; | ||
252 | bio->bi_idx = 0; | ||
253 | } | ||
254 | /* Make sure vcnt and last bv are not too big */ | ||
255 | bio_for_each_segment(bvec, bio, i) { | ||
256 | if (sofar + bvec->bv_len > size) | ||
257 | bvec->bv_len = size - sofar; | ||
258 | if (bvec->bv_len == 0) { | ||
259 | bio->bi_vcnt = i; | ||
260 | break; | ||
261 | } | ||
262 | sofar += bvec->bv_len; | ||
263 | } | ||
264 | } | ||
265 | EXPORT_SYMBOL_GPL(md_trim_bio); | ||
266 | |||
218 | /* | 267 | /* |
219 | * We have a system wide 'event count' that is incremented | 268 | * We have a system wide 'event count' that is incremented |
220 | * on any 'interesting' event, and readers of /proc/mdstat | 269 | * on any 'interesting' event, and readers of /proc/mdstat |
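
The first hunk adds md_trim_bio(), which the raid1/raid10 commits in this series use to narrow a cloned bio to a sub-range, for example to resubmit only the part of a request that missed a known-bad region. A kernel-context sketch of a caller follows (clone_partial_bio is a hypothetical name, and this will not compile outside the kernel tree):

```c
/* Hypothetical caller, in the style the raid1/raid10 changes in this
 * series use: clone the master bio, then trim the clone so it covers
 * only 'sectors' sectors starting 'skip' sectors into the original.
 * Both md_trim_bio() arguments are in 512-byte sectors.
 */
static struct bio *clone_partial_bio(struct bio *master, mddev_t *mddev,
				     int skip, int sectors)
{
	struct bio *b = bio_clone_mddev(master, GFP_NOIO, mddev);

	if (!b)
		return NULL;
	md_trim_bio(b, skip, sectors);
	return b;
}
```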
@@ -757,6 +806,10 @@ static void free_disk_sb(mdk_rdev_t * rdev) | |||
757 | rdev->sb_start = 0; | 806 | rdev->sb_start = 0; |
758 | rdev->sectors = 0; | 807 | rdev->sectors = 0; |
759 | } | 808 | } |
809 | if (rdev->bb_page) { | ||
810 | put_page(rdev->bb_page); | ||
811 | rdev->bb_page = NULL; | ||
812 | } | ||
760 | } | 813 | } |
761 | 814 | ||
762 | 815 | ||
@@ -1025,7 +1078,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version | |||
1025 | ret = -EINVAL; | 1078 | ret = -EINVAL; |
1026 | 1079 | ||
1027 | bdevname(rdev->bdev, b); | 1080 | bdevname(rdev->bdev, b); |
1028 | sb = (mdp_super_t*)page_address(rdev->sb_page); | 1081 | sb = page_address(rdev->sb_page); |
1029 | 1082 | ||
1030 | if (sb->md_magic != MD_SB_MAGIC) { | 1083 | if (sb->md_magic != MD_SB_MAGIC) { |
1031 | printk(KERN_ERR "md: invalid raid superblock magic on %s\n", | 1084 | printk(KERN_ERR "md: invalid raid superblock magic on %s\n", |
@@ -1054,6 +1107,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version | |||
1054 | rdev->preferred_minor = sb->md_minor; | 1107 | rdev->preferred_minor = sb->md_minor; |
1055 | rdev->data_offset = 0; | 1108 | rdev->data_offset = 0; |
1056 | rdev->sb_size = MD_SB_BYTES; | 1109 | rdev->sb_size = MD_SB_BYTES; |
1110 | rdev->badblocks.shift = -1; | ||
1057 | 1111 | ||
1058 | if (sb->level == LEVEL_MULTIPATH) | 1112 | if (sb->level == LEVEL_MULTIPATH) |
1059 | rdev->desc_nr = -1; | 1113 | rdev->desc_nr = -1; |
@@ -1064,7 +1118,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version | |||
1064 | ret = 1; | 1118 | ret = 1; |
1065 | } else { | 1119 | } else { |
1066 | __u64 ev1, ev2; | 1120 | __u64 ev1, ev2; |
1067 | mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); | 1121 | mdp_super_t *refsb = page_address(refdev->sb_page); |
1068 | if (!uuid_equal(refsb, sb)) { | 1122 | if (!uuid_equal(refsb, sb)) { |
1069 | printk(KERN_WARNING "md: %s has different UUID to %s\n", | 1123 | printk(KERN_WARNING "md: %s has different UUID to %s\n", |
1070 | b, bdevname(refdev->bdev,b2)); | 1124 | b, bdevname(refdev->bdev,b2)); |
@@ -1099,7 +1153,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version | |||
1099 | static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) | 1153 | static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) |
1100 | { | 1154 | { |
1101 | mdp_disk_t *desc; | 1155 | mdp_disk_t *desc; |
1102 | mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); | 1156 | mdp_super_t *sb = page_address(rdev->sb_page); |
1103 | __u64 ev1 = md_event(sb); | 1157 | __u64 ev1 = md_event(sb); |
1104 | 1158 | ||
1105 | rdev->raid_disk = -1; | 1159 | rdev->raid_disk = -1; |
@@ -1230,7 +1284,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1230 | 1284 | ||
1231 | rdev->sb_size = MD_SB_BYTES; | 1285 | rdev->sb_size = MD_SB_BYTES; |
1232 | 1286 | ||
1233 | sb = (mdp_super_t*)page_address(rdev->sb_page); | 1287 | sb = page_address(rdev->sb_page); |
1234 | 1288 | ||
1235 | memset(sb, 0, sizeof(*sb)); | 1289 | memset(sb, 0, sizeof(*sb)); |
1236 | 1290 | ||
@@ -1395,6 +1449,8 @@ static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb) | |||
1395 | return cpu_to_le32(csum); | 1449 | return cpu_to_le32(csum); |
1396 | } | 1450 | } |
1397 | 1451 | ||
1452 | static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, | ||
1453 | int acknowledged); | ||
1398 | static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | 1454 | static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) |
1399 | { | 1455 | { |
1400 | struct mdp_superblock_1 *sb; | 1456 | struct mdp_superblock_1 *sb; |
@@ -1435,7 +1491,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | |||
1435 | if (ret) return ret; | 1491 | if (ret) return ret; |
1436 | 1492 | ||
1437 | 1493 | ||
1438 | sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); | 1494 | sb = page_address(rdev->sb_page); |
1439 | 1495 | ||
1440 | if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || | 1496 | if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || |
1441 | sb->major_version != cpu_to_le32(1) || | 1497 | sb->major_version != cpu_to_le32(1) || |
@@ -1473,12 +1529,52 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | |||
1473 | else | 1529 | else |
1474 | rdev->desc_nr = le32_to_cpu(sb->dev_number); | 1530 | rdev->desc_nr = le32_to_cpu(sb->dev_number); |
1475 | 1531 | ||
1532 | if (!rdev->bb_page) { | ||
1533 | rdev->bb_page = alloc_page(GFP_KERNEL); | ||
1534 | if (!rdev->bb_page) | ||
1535 | return -ENOMEM; | ||
1536 | } | ||
1537 | if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) && | ||
1538 | rdev->badblocks.count == 0) { | ||
1539 | /* need to load the bad block list. | ||
1540 | * Currently we limit it to one page. | ||
1541 | */ | ||
1542 | s32 offset; | ||
1543 | sector_t bb_sector; | ||
1544 | u64 *bbp; | ||
1545 | int i; | ||
1546 | int sectors = le16_to_cpu(sb->bblog_size); | ||
1547 | if (sectors > (PAGE_SIZE / 512)) | ||
1548 | return -EINVAL; | ||
1549 | offset = le32_to_cpu(sb->bblog_offset); | ||
1550 | if (offset == 0) | ||
1551 | return -EINVAL; | ||
1552 | bb_sector = (long long)offset; | ||
1553 | if (!sync_page_io(rdev, bb_sector, sectors << 9, | ||
1554 | rdev->bb_page, READ, true)) | ||
1555 | return -EIO; | ||
1556 | bbp = (u64 *)page_address(rdev->bb_page); | ||
1557 | rdev->badblocks.shift = sb->bblog_shift; | ||
1558 | for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) { | ||
1559 | u64 bb = le64_to_cpu(*bbp); | ||
1560 | int count = bb & (0x3ff); | ||
1561 | u64 sector = bb >> 10; | ||
1562 | sector <<= sb->bblog_shift; | ||
1563 | count <<= sb->bblog_shift; | ||
1564 | if (bb + 1 == 0) | ||
1565 | break; | ||
1566 | if (md_set_badblocks(&rdev->badblocks, | ||
1567 | sector, count, 1) == 0) | ||
1568 | return -EINVAL; | ||
1569 | } | ||
1570 | } else if (sb->bblog_offset == 0) | ||
1571 | rdev->badblocks.shift = -1; | ||
1572 | |||
1476 | if (!refdev) { | 1573 | if (!refdev) { |
1477 | ret = 1; | 1574 | ret = 1; |
1478 | } else { | 1575 | } else { |
1479 | __u64 ev1, ev2; | 1576 | __u64 ev1, ev2; |
1480 | struct mdp_superblock_1 *refsb = | 1577 | struct mdp_superblock_1 *refsb = page_address(refdev->sb_page); |
1481 | (struct mdp_superblock_1*)page_address(refdev->sb_page); | ||
1482 | 1578 | ||
1483 | if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || | 1579 | if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || |
1484 | sb->level != refsb->level || | 1580 | sb->level != refsb->level || |
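
The on-disk log entry layout parsed above is distinct from the in-memory one: each little-endian u64 packs the start sector in the high bits and a 10-bit count in the low bits, both scaled up by bblog_shift, with an all-ones word terminating the list, and the whole log limited to one page. A minimal userspace decoder sketch (le64_to_cpu conversion omitted for brevity):

```c
/* Decode one on-disk bad-block log entry as super_1_load() does:
 * the low 10 bits hold the count, the remaining high bits hold the
 * start sector, and both are scaled by the superblock's bblog_shift.
 * A value of all ones (bb + 1 == 0) marks the end of the list.
 */
#include <stdint.h>
#include <stdio.h>

static int decode_bblog_entry(uint64_t bb, unsigned shift,
			      uint64_t *sector, uint64_t *count)
{
	if (bb + 1 == 0)
		return 0;		/* terminator */
	*count = (bb & 0x3ff) << shift;
	*sector = (bb >> 10) << shift;
	return 1;
}

int main(void)
{
	/* start sector 2000, 5 sectors, shift 0 */
	uint64_t entry = (2000ULL << 10) | 5;
	uint64_t s, c;

	if (decode_bblog_entry(entry, 0, &s, &c))
		printf("bad range: start=%llu count=%llu\n",
		       (unsigned long long)s, (unsigned long long)c);
	return 0;
}
```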
@@ -1513,7 +1609,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) | |||
1513 | 1609 | ||
1514 | static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) | 1610 | static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) |
1515 | { | 1611 | { |
1516 | struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); | 1612 | struct mdp_superblock_1 *sb = page_address(rdev->sb_page); |
1517 | __u64 ev1 = le64_to_cpu(sb->events); | 1613 | __u64 ev1 = le64_to_cpu(sb->events); |
1518 | 1614 | ||
1519 | rdev->raid_disk = -1; | 1615 | rdev->raid_disk = -1; |
@@ -1619,13 +1715,12 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1619 | int max_dev, i; | 1715 | int max_dev, i; |
1620 | /* make rdev->sb match mddev and rdev data. */ | 1716 | /* make rdev->sb match mddev and rdev data. */ |
1621 | 1717 | ||
1622 | sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); | 1718 | sb = page_address(rdev->sb_page); |
1623 | 1719 | ||
1624 | sb->feature_map = 0; | 1720 | sb->feature_map = 0; |
1625 | sb->pad0 = 0; | 1721 | sb->pad0 = 0; |
1626 | sb->recovery_offset = cpu_to_le64(0); | 1722 | sb->recovery_offset = cpu_to_le64(0); |
1627 | memset(sb->pad1, 0, sizeof(sb->pad1)); | 1723 | memset(sb->pad1, 0, sizeof(sb->pad1)); |
1628 | memset(sb->pad2, 0, sizeof(sb->pad2)); | ||
1629 | memset(sb->pad3, 0, sizeof(sb->pad3)); | 1724 | memset(sb->pad3, 0, sizeof(sb->pad3)); |
1630 | 1725 | ||
1631 | sb->utime = cpu_to_le64((__u64)mddev->utime); | 1726 | sb->utime = cpu_to_le64((__u64)mddev->utime); |
@@ -1665,6 +1760,40 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1665 | sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); | 1760 | sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); |
1666 | } | 1761 | } |
1667 | 1762 | ||
1763 | if (rdev->badblocks.count == 0) | ||
1764 | /* Nothing to do for bad blocks*/ ; | ||
1765 | else if (sb->bblog_offset == 0) | ||
1766 | /* Cannot record bad blocks on this device */ | ||
1767 | md_error(mddev, rdev); | ||
1768 | else { | ||
1769 | struct badblocks *bb = &rdev->badblocks; | ||
1770 | u64 *bbp = (u64 *)page_address(rdev->bb_page); | ||
1771 | u64 *p = bb->page; | ||
1772 | sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS); | ||
1773 | if (bb->changed) { | ||
1774 | unsigned seq; | ||
1775 | |||
1776 | retry: | ||
1777 | seq = read_seqbegin(&bb->lock); | ||
1778 | |||
1779 | memset(bbp, 0xff, PAGE_SIZE); | ||
1780 | |||
1781 | for (i = 0 ; i < bb->count ; i++) { | ||
1782 | u64 internal_bb = *p++; | ||
1783 | u64 store_bb = ((BB_OFFSET(internal_bb) << 10) | ||
1784 | | BB_LEN(internal_bb)); | ||
1785 | *bbp++ = cpu_to_le64(store_bb); | ||
1786 | } | ||
1787 | if (read_seqretry(&bb->lock, seq)) | ||
1788 | goto retry; | ||
1789 | |||
1790 | bb->sector = (rdev->sb_start + | ||
1791 | (int)le32_to_cpu(sb->bblog_offset)); | ||
1792 | bb->size = le16_to_cpu(sb->bblog_size); | ||
1793 | bb->changed = 0; | ||
1794 | } | ||
1795 | } | ||
1796 | |||
1668 | max_dev = 0; | 1797 | max_dev = 0; |
1669 | list_for_each_entry(rdev2, &mddev->disks, same_set) | 1798 | list_for_each_entry(rdev2, &mddev->disks, same_set) |
1670 | if (rdev2->desc_nr+1 > max_dev) | 1799 | if (rdev2->desc_nr+1 > max_dev) |
@@ -1724,7 +1853,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors) | |||
1724 | num_sectors = max_sectors; | 1853 | num_sectors = max_sectors; |
1725 | rdev->sb_start = sb_start; | 1854 | rdev->sb_start = sb_start; |
1726 | } | 1855 | } |
1727 | sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page); | 1856 | sb = page_address(rdev->sb_page); |
1728 | sb->data_size = cpu_to_le64(num_sectors); | 1857 | sb->data_size = cpu_to_le64(num_sectors); |
1729 | sb->super_offset = rdev->sb_start; | 1858 | sb->super_offset = rdev->sb_start; |
1730 | sb->sb_csum = calc_sb_1_csum(sb); | 1859 | sb->sb_csum = calc_sb_1_csum(sb); |
@@ -1922,7 +2051,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) | |||
1922 | bd_link_disk_holder(rdev->bdev, mddev->gendisk); | 2051 | bd_link_disk_holder(rdev->bdev, mddev->gendisk); |
1923 | 2052 | ||
1924 | /* May as well allow recovery to be retried once */ | 2053 | /* May as well allow recovery to be retried once */ |
1925 | mddev->recovery_disabled = 0; | 2054 | mddev->recovery_disabled++; |
1926 | 2055 | ||
1927 | return 0; | 2056 | return 0; |
1928 | 2057 | ||
@@ -1953,6 +2082,9 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev) | |||
1953 | sysfs_remove_link(&rdev->kobj, "block"); | 2082 | sysfs_remove_link(&rdev->kobj, "block"); |
1954 | sysfs_put(rdev->sysfs_state); | 2083 | sysfs_put(rdev->sysfs_state); |
1955 | rdev->sysfs_state = NULL; | 2084 | rdev->sysfs_state = NULL; |
2085 | kfree(rdev->badblocks.page); | ||
2086 | rdev->badblocks.count = 0; | ||
2087 | rdev->badblocks.page = NULL; | ||
1956 | /* We need to delay this, otherwise we can deadlock when | 2088 | /* We need to delay this, otherwise we can deadlock when |
1957 | * writing to 'remove' to "dev/state". We also need | 2089 | * writing to 'remove' to "dev/state". We also need |
1958 | * to delay it due to rcu usage. | 2090 | * to delay it due to rcu usage. |
@@ -2127,10 +2259,10 @@ static void print_rdev(mdk_rdev_t *rdev, int major_version) | |||
2127 | printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version); | 2259 | printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version); |
2128 | switch (major_version) { | 2260 | switch (major_version) { |
2129 | case 0: | 2261 | case 0: |
2130 | print_sb_90((mdp_super_t*)page_address(rdev->sb_page)); | 2262 | print_sb_90(page_address(rdev->sb_page)); |
2131 | break; | 2263 | break; |
2132 | case 1: | 2264 | case 1: |
2133 | print_sb_1((struct mdp_superblock_1 *)page_address(rdev->sb_page)); | 2265 | print_sb_1(page_address(rdev->sb_page)); |
2134 | break; | 2266 | break; |
2135 | } | 2267 | } |
2136 | } else | 2268 | } else |
@@ -2194,6 +2326,7 @@ static void md_update_sb(mddev_t * mddev, int force_change) | |||
2194 | mdk_rdev_t *rdev; | 2326 | mdk_rdev_t *rdev; |
2195 | int sync_req; | 2327 | int sync_req; |
2196 | int nospares = 0; | 2328 | int nospares = 0; |
2329 | int any_badblocks_changed = 0; | ||
2197 | 2330 | ||
2198 | repeat: | 2331 | repeat: |
2199 | /* First make sure individual recovery_offsets are correct */ | 2332 | /* First make sure individual recovery_offsets are correct */ |
@@ -2208,8 +2341,18 @@ repeat: | |||
2208 | if (!mddev->persistent) { | 2341 | if (!mddev->persistent) { |
2209 | clear_bit(MD_CHANGE_CLEAN, &mddev->flags); | 2342 | clear_bit(MD_CHANGE_CLEAN, &mddev->flags); |
2210 | clear_bit(MD_CHANGE_DEVS, &mddev->flags); | 2343 | clear_bit(MD_CHANGE_DEVS, &mddev->flags); |
2211 | if (!mddev->external) | 2344 | if (!mddev->external) { |
2212 | clear_bit(MD_CHANGE_PENDING, &mddev->flags); | 2345 | clear_bit(MD_CHANGE_PENDING, &mddev->flags); |
2346 | list_for_each_entry(rdev, &mddev->disks, same_set) { | ||
2347 | if (rdev->badblocks.changed) { | ||
2348 | md_ack_all_badblocks(&rdev->badblocks); | ||
2349 | md_error(mddev, rdev); | ||
2350 | } | ||
2351 | clear_bit(Blocked, &rdev->flags); | ||
2352 | clear_bit(BlockedBadBlocks, &rdev->flags); | ||
2353 | wake_up(&rdev->blocked_wait); | ||
2354 | } | ||
2355 | } | ||
2213 | wake_up(&mddev->sb_wait); | 2356 | wake_up(&mddev->sb_wait); |
2214 | return; | 2357 | return; |
2215 | } | 2358 | } |
@@ -2265,6 +2408,14 @@ repeat: | |||
2265 | MD_BUG(); | 2408 | MD_BUG(); |
2266 | mddev->events --; | 2409 | mddev->events --; |
2267 | } | 2410 | } |
2411 | |||
2412 | list_for_each_entry(rdev, &mddev->disks, same_set) { | ||
2413 | if (rdev->badblocks.changed) | ||
2414 | any_badblocks_changed++; | ||
2415 | if (test_bit(Faulty, &rdev->flags)) | ||
2416 | set_bit(FaultRecorded, &rdev->flags); | ||
2417 | } | ||
2418 | |||
2268 | sync_sbs(mddev, nospares); | 2419 | sync_sbs(mddev, nospares); |
2269 | spin_unlock_irq(&mddev->write_lock); | 2420 | spin_unlock_irq(&mddev->write_lock); |
2270 | 2421 | ||
@@ -2290,6 +2441,13 @@ repeat: | |||
2290 | bdevname(rdev->bdev,b), | 2441 | bdevname(rdev->bdev,b), |
2291 | (unsigned long long)rdev->sb_start); | 2442 | (unsigned long long)rdev->sb_start); |
2292 | rdev->sb_events = mddev->events; | 2443 | rdev->sb_events = mddev->events; |
2444 | if (rdev->badblocks.size) { | ||
2445 | md_super_write(mddev, rdev, | ||
2446 | rdev->badblocks.sector, | ||
2447 | rdev->badblocks.size << 9, | ||
2448 | rdev->bb_page); | ||
2449 | rdev->badblocks.size = 0; | ||
2450 | } | ||
2293 | 2451 | ||
2294 | } else | 2452 | } else |
2295 | dprintk(")\n"); | 2453 | dprintk(")\n"); |
@@ -2313,6 +2471,15 @@ repeat: | |||
2313 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) | 2471 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) |
2314 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); | 2472 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); |
2315 | 2473 | ||
2474 | list_for_each_entry(rdev, &mddev->disks, same_set) { | ||
2475 | if (test_and_clear_bit(FaultRecorded, &rdev->flags)) | ||
2476 | clear_bit(Blocked, &rdev->flags); | ||
2477 | |||
2478 | if (any_badblocks_changed) | ||
2479 | md_ack_all_badblocks(&rdev->badblocks); | ||
2480 | clear_bit(BlockedBadBlocks, &rdev->flags); | ||
2481 | wake_up(&rdev->blocked_wait); | ||
2482 | } | ||
2316 | } | 2483 | } |
2317 | 2484 | ||
2318 | /* words written to sysfs files may, or may not, be \n terminated. | 2485 | /* words written to sysfs files may, or may not, be \n terminated. |
@@ -2347,7 +2514,8 @@ state_show(mdk_rdev_t *rdev, char *page) | |||
2347 | char *sep = ""; | 2514 | char *sep = ""; |
2348 | size_t len = 0; | 2515 | size_t len = 0; |
2349 | 2516 | ||
2350 | if (test_bit(Faulty, &rdev->flags)) { | 2517 | if (test_bit(Faulty, &rdev->flags) || |
2518 | rdev->badblocks.unacked_exist) { | ||
2351 | len+= sprintf(page+len, "%sfaulty",sep); | 2519 | len+= sprintf(page+len, "%sfaulty",sep); |
2352 | sep = ","; | 2520 | sep = ","; |
2353 | } | 2521 | } |
@@ -2359,7 +2527,8 @@ state_show(mdk_rdev_t *rdev, char *page) | |||
2359 | len += sprintf(page+len, "%swrite_mostly",sep); | 2527 | len += sprintf(page+len, "%swrite_mostly",sep); |
2360 | sep = ","; | 2528 | sep = ","; |
2361 | } | 2529 | } |
2362 | if (test_bit(Blocked, &rdev->flags)) { | 2530 | if (test_bit(Blocked, &rdev->flags) || |
2531 | rdev->badblocks.unacked_exist) { | ||
2363 | len += sprintf(page+len, "%sblocked", sep); | 2532 | len += sprintf(page+len, "%sblocked", sep); |
2364 | sep = ","; | 2533 | sep = ","; |
2365 | } | 2534 | } |
@@ -2368,6 +2537,10 @@ state_show(mdk_rdev_t *rdev, char *page) | |||
2368 | len += sprintf(page+len, "%sspare", sep); | 2537 | len += sprintf(page+len, "%sspare", sep); |
2369 | sep = ","; | 2538 | sep = ","; |
2370 | } | 2539 | } |
2540 | if (test_bit(WriteErrorSeen, &rdev->flags)) { | ||
2541 | len += sprintf(page+len, "%swrite_error", sep); | ||
2542 | sep = ","; | ||
2543 | } | ||
2371 | return len+sprintf(page+len, "\n"); | 2544 | return len+sprintf(page+len, "\n"); |
2372 | } | 2545 | } |
2373 | 2546 | ||
@@ -2375,13 +2548,15 @@ static ssize_t | |||
2375 | state_store(mdk_rdev_t *rdev, const char *buf, size_t len) | 2548 | state_store(mdk_rdev_t *rdev, const char *buf, size_t len) |
2376 | { | 2549 | { |
2377 | /* can write | 2550 | /* can write |
2378 | * faulty - simulates and error | 2551 | * faulty - simulates an error |
2379 | * remove - disconnects the device | 2552 | * remove - disconnects the device |
2380 | * writemostly - sets write_mostly | 2553 | * writemostly - sets write_mostly |
2381 | * -writemostly - clears write_mostly | 2554 | * -writemostly - clears write_mostly |
2382 | * blocked - sets the Blocked flag | 2555 | * blocked - sets the Blocked flags |
2383 | * -blocked - clears the Blocked flag | 2556 | * -blocked - clears the Blocked and possibly simulates an error |
2384 | * insync - sets Insync providing device isn't active | 2557 | * insync - sets Insync providing device isn't active |
2558 | * write_error - sets WriteErrorSeen | ||
2559 | * -write_error - clears WriteErrorSeen | ||
2385 | */ | 2560 | */ |
2386 | int err = -EINVAL; | 2561 | int err = -EINVAL; |
2387 | if (cmd_match(buf, "faulty") && rdev->mddev->pers) { | 2562 | if (cmd_match(buf, "faulty") && rdev->mddev->pers) { |
@@ -2408,7 +2583,15 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
2408 | set_bit(Blocked, &rdev->flags); | 2583 | set_bit(Blocked, &rdev->flags); |
2409 | err = 0; | 2584 | err = 0; |
2410 | } else if (cmd_match(buf, "-blocked")) { | 2585 | } else if (cmd_match(buf, "-blocked")) { |
2586 | if (!test_bit(Faulty, &rdev->flags) && | ||
2587 | test_bit(BlockedBadBlocks, &rdev->flags)) { | ||
2588 | /* metadata handler doesn't understand badblocks, | ||
2589 | * so we need to fail the device | ||
2590 | */ | ||
2591 | md_error(rdev->mddev, rdev); | ||
2592 | } | ||
2411 | clear_bit(Blocked, &rdev->flags); | 2593 | clear_bit(Blocked, &rdev->flags); |
2594 | clear_bit(BlockedBadBlocks, &rdev->flags); | ||
2412 | wake_up(&rdev->blocked_wait); | 2595 | wake_up(&rdev->blocked_wait); |
2413 | set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); | 2596 | set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); |
2414 | md_wakeup_thread(rdev->mddev->thread); | 2597 | md_wakeup_thread(rdev->mddev->thread); |
@@ -2417,6 +2600,12 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
2417 | } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { | 2600 | } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { |
2418 | set_bit(In_sync, &rdev->flags); | 2601 | set_bit(In_sync, &rdev->flags); |
2419 | err = 0; | 2602 | err = 0; |
2603 | } else if (cmd_match(buf, "write_error")) { | ||
2604 | set_bit(WriteErrorSeen, &rdev->flags); | ||
2605 | err = 0; | ||
2606 | } else if (cmd_match(buf, "-write_error")) { | ||
2607 | clear_bit(WriteErrorSeen, &rdev->flags); | ||
2608 | err = 0; | ||
2420 | } | 2609 | } |
2421 | if (!err) | 2610 | if (!err) |
2422 | sysfs_notify_dirent_safe(rdev->sysfs_state); | 2611 | sysfs_notify_dirent_safe(rdev->sysfs_state); |
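
With the new tokens in place, write-error handling can be exercised from user space for testing: writing `write_error` to a device's state file sets WriteErrorSeen and `-write_error` clears it, e.g. `echo write_error > /sys/block/md0/md/dev-sda1/state` (md0 and sda1 are placeholder names; rdev state files live under the standard /sys/block/mdX/md/dev-NAME/ layout).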
@@ -2459,7 +2648,6 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
2459 | { | 2648 | { |
2460 | char *e; | 2649 | char *e; |
2461 | int err; | 2650 | int err; |
2462 | char nm[20]; | ||
2463 | int slot = simple_strtoul(buf, &e, 10); | 2651 | int slot = simple_strtoul(buf, &e, 10); |
2464 | if (strncmp(buf, "none", 4)==0) | 2652 | if (strncmp(buf, "none", 4)==0) |
2465 | slot = -1; | 2653 | slot = -1; |
@@ -2482,8 +2670,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
2482 | hot_remove_disk(rdev->mddev, rdev->raid_disk); | 2670 | hot_remove_disk(rdev->mddev, rdev->raid_disk); |
2483 | if (err) | 2671 | if (err) |
2484 | return err; | 2672 | return err; |
2485 | sprintf(nm, "rd%d", rdev->raid_disk); | 2673 | sysfs_unlink_rdev(rdev->mddev, rdev); |
2486 | sysfs_remove_link(&rdev->mddev->kobj, nm); | ||
2487 | rdev->raid_disk = -1; | 2674 | rdev->raid_disk = -1; |
2488 | set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); | 2675 | set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); |
2489 | md_wakeup_thread(rdev->mddev->thread); | 2676 | md_wakeup_thread(rdev->mddev->thread); |
@@ -2522,8 +2709,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len) | |||
2522 | return err; | 2709 | return err; |
2523 | } else | 2710 | } else |
2524 | sysfs_notify_dirent_safe(rdev->sysfs_state); | 2711 | sysfs_notify_dirent_safe(rdev->sysfs_state); |
2525 | sprintf(nm, "rd%d", rdev->raid_disk); | 2712 | if (sysfs_link_rdev(rdev->mddev, rdev)) |
2526 | if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm)) | ||
2527 | /* failure here is OK */; | 2713 | /* failure here is OK */; |
2528 | /* don't wakeup anyone, leave that to userspace. */ | 2714 | /* don't wakeup anyone, leave that to userspace. */ |
2529 | } else { | 2715 | } else { |
@@ -2712,6 +2898,39 @@ static ssize_t recovery_start_store(mdk_rdev_t *rdev, const char *buf, size_t le | |||
2712 | static struct rdev_sysfs_entry rdev_recovery_start = | 2898 | static struct rdev_sysfs_entry rdev_recovery_start = |
2713 | __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); | 2899 | __ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); |
2714 | 2900 | ||
2901 | |||
2902 | static ssize_t | ||
2903 | badblocks_show(struct badblocks *bb, char *page, int unack); | ||
2904 | static ssize_t | ||
2905 | badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack); | ||
2906 | |||
2907 | static ssize_t bb_show(mdk_rdev_t *rdev, char *page) | ||
2908 | { | ||
2909 | return badblocks_show(&rdev->badblocks, page, 0); | ||
2910 | } | ||
2911 | static ssize_t bb_store(mdk_rdev_t *rdev, const char *page, size_t len) | ||
2912 | { | ||
2913 | int rv = badblocks_store(&rdev->badblocks, page, len, 0); | ||
2914 | /* Maybe that ack was all we needed */ | ||
2915 | if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags)) | ||
2916 | wake_up(&rdev->blocked_wait); | ||
2917 | return rv; | ||
2918 | } | ||
2919 | static struct rdev_sysfs_entry rdev_bad_blocks = | ||
2920 | __ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store); | ||
2921 | |||
2922 | |||
2923 | static ssize_t ubb_show(mdk_rdev_t *rdev, char *page) | ||
2924 | { | ||
2925 | return badblocks_show(&rdev->badblocks, page, 1); | ||
2926 | } | ||
2927 | static ssize_t ubb_store(mdk_rdev_t *rdev, const char *page, size_t len) | ||
2928 | { | ||
2929 | return badblocks_store(&rdev->badblocks, page, len, 1); | ||
2930 | } | ||
2931 | static struct rdev_sysfs_entry rdev_unack_bad_blocks = | ||
2932 | __ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store); | ||
2933 | |||
2715 | static struct attribute *rdev_default_attrs[] = { | 2934 | static struct attribute *rdev_default_attrs[] = { |
2716 | &rdev_state.attr, | 2935 | &rdev_state.attr, |
2717 | &rdev_errors.attr, | 2936 | &rdev_errors.attr, |
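
The two resulting files appear alongside the other per-device attributes. Assuming the same standard sysfs layout (placeholder names again), `cat /sys/block/md0/md/dev-sda1/bad_blocks` lists recorded ranges as `sector length` pairs, writing such a pair back adds an acknowledged range, and `unacknowledged_bad_blocks` does the same without acknowledging — which, as the comment block near the end of this patch notes, is largely for testing.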
@@ -2719,6 +2938,8 @@ static struct attribute *rdev_default_attrs[] = { | |||
2719 | &rdev_offset.attr, | 2938 | &rdev_offset.attr, |
2720 | &rdev_size.attr, | 2939 | &rdev_size.attr, |
2721 | &rdev_recovery_start.attr, | 2940 | &rdev_recovery_start.attr, |
2941 | &rdev_bad_blocks.attr, | ||
2942 | &rdev_unack_bad_blocks.attr, | ||
2722 | NULL, | 2943 | NULL, |
2723 | }; | 2944 | }; |
2724 | static ssize_t | 2945 | static ssize_t |
@@ -2782,7 +3003,7 @@ static struct kobj_type rdev_ktype = { | |||
2782 | .default_attrs = rdev_default_attrs, | 3003 | .default_attrs = rdev_default_attrs, |
2783 | }; | 3004 | }; |
2784 | 3005 | ||
2785 | void md_rdev_init(mdk_rdev_t *rdev) | 3006 | int md_rdev_init(mdk_rdev_t *rdev) |
2786 | { | 3007 | { |
2787 | rdev->desc_nr = -1; | 3008 | rdev->desc_nr = -1; |
2788 | rdev->saved_raid_disk = -1; | 3009 | rdev->saved_raid_disk = -1; |
@@ -2792,12 +3013,27 @@ void md_rdev_init(mdk_rdev_t *rdev) | |||
2792 | rdev->sb_events = 0; | 3013 | rdev->sb_events = 0; |
2793 | rdev->last_read_error.tv_sec = 0; | 3014 | rdev->last_read_error.tv_sec = 0; |
2794 | rdev->last_read_error.tv_nsec = 0; | 3015 | rdev->last_read_error.tv_nsec = 0; |
3016 | rdev->sb_loaded = 0; | ||
3017 | rdev->bb_page = NULL; | ||
2795 | atomic_set(&rdev->nr_pending, 0); | 3018 | atomic_set(&rdev->nr_pending, 0); |
2796 | atomic_set(&rdev->read_errors, 0); | 3019 | atomic_set(&rdev->read_errors, 0); |
2797 | atomic_set(&rdev->corrected_errors, 0); | 3020 | atomic_set(&rdev->corrected_errors, 0); |
2798 | 3021 | ||
2799 | INIT_LIST_HEAD(&rdev->same_set); | 3022 | INIT_LIST_HEAD(&rdev->same_set); |
2800 | init_waitqueue_head(&rdev->blocked_wait); | 3023 | init_waitqueue_head(&rdev->blocked_wait); |
3024 | |||
3025 | /* Add space to store bad block list. | ||
3026 | * This reserves the space even on arrays where it cannot | ||
3027 | * be used - I wonder if that matters | ||
3028 | */ | ||
3029 | rdev->badblocks.count = 0; | ||
3030 | rdev->badblocks.shift = 0; | ||
3031 | rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL); | ||
3032 | seqlock_init(&rdev->badblocks.lock); | ||
3033 | if (rdev->badblocks.page == NULL) | ||
3034 | return -ENOMEM; | ||
3035 | |||
3036 | return 0; | ||
2801 | } | 3037 | } |
2802 | EXPORT_SYMBOL_GPL(md_rdev_init); | 3038 | EXPORT_SYMBOL_GPL(md_rdev_init); |
2803 | /* | 3039 | /* |
@@ -2823,8 +3059,11 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi | |||
2823 | return ERR_PTR(-ENOMEM); | 3059 | return ERR_PTR(-ENOMEM); |
2824 | } | 3060 | } |
2825 | 3061 | ||
2826 | md_rdev_init(rdev); | 3062 | err = md_rdev_init(rdev); |
2827 | if ((err = alloc_disk_sb(rdev))) | 3063 | if (err) |
3064 | goto abort_free; | ||
3065 | err = alloc_disk_sb(rdev); | ||
3066 | if (err) | ||
2828 | goto abort_free; | 3067 | goto abort_free; |
2829 | 3068 | ||
2830 | err = lock_rdev(rdev, newdev, super_format == -2); | 3069 | err = lock_rdev(rdev, newdev, super_format == -2); |
@@ -2860,15 +3099,17 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi | |||
2860 | goto abort_free; | 3099 | goto abort_free; |
2861 | } | 3100 | } |
2862 | } | 3101 | } |
3102 | if (super_format == -1) | ||
3103 | /* hot-add for 0.90, or non-persistent: so no badblocks */ | ||
3104 | rdev->badblocks.shift = -1; | ||
2863 | 3105 | ||
2864 | return rdev; | 3106 | return rdev; |
2865 | 3107 | ||
2866 | abort_free: | 3108 | abort_free: |
2867 | if (rdev->sb_page) { | 3109 | if (rdev->bdev) |
2868 | if (rdev->bdev) | 3110 | unlock_rdev(rdev); |
2869 | unlock_rdev(rdev); | 3111 | free_disk_sb(rdev); |
2870 | free_disk_sb(rdev); | 3112 | kfree(rdev->badblocks.page); |
2871 | } | ||
2872 | kfree(rdev); | 3113 | kfree(rdev); |
2873 | return ERR_PTR(err); | 3114 | return ERR_PTR(err); |
2874 | } | 3115 | } |
@@ -3149,15 +3390,13 @@ level_store(mddev_t *mddev, const char *buf, size_t len) | |||
3149 | } | 3390 | } |
3150 | 3391 | ||
3151 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 3392 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
3152 | char nm[20]; | ||
3153 | if (rdev->raid_disk < 0) | 3393 | if (rdev->raid_disk < 0) |
3154 | continue; | 3394 | continue; |
3155 | if (rdev->new_raid_disk >= mddev->raid_disks) | 3395 | if (rdev->new_raid_disk >= mddev->raid_disks) |
3156 | rdev->new_raid_disk = -1; | 3396 | rdev->new_raid_disk = -1; |
3157 | if (rdev->new_raid_disk == rdev->raid_disk) | 3397 | if (rdev->new_raid_disk == rdev->raid_disk) |
3158 | continue; | 3398 | continue; |
3159 | sprintf(nm, "rd%d", rdev->raid_disk); | 3399 | sysfs_unlink_rdev(mddev, rdev); |
3160 | sysfs_remove_link(&mddev->kobj, nm); | ||
3161 | } | 3400 | } |
3162 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 3401 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
3163 | if (rdev->raid_disk < 0) | 3402 | if (rdev->raid_disk < 0) |
@@ -3168,11 +3407,10 @@ level_store(mddev_t *mddev, const char *buf, size_t len) | |||
3168 | if (rdev->raid_disk < 0) | 3407 | if (rdev->raid_disk < 0) |
3169 | clear_bit(In_sync, &rdev->flags); | 3408 | clear_bit(In_sync, &rdev->flags); |
3170 | else { | 3409 | else { |
3171 | char nm[20]; | 3410 | if (sysfs_link_rdev(mddev, rdev)) |
3172 | sprintf(nm, "rd%d", rdev->raid_disk); | 3411 | printk(KERN_WARNING "md: cannot register rd%d" |
3173 | if(sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) | 3412 | " for %s after level change\n", |
3174 | printk("md: cannot register %s for %s after level change\n", | 3413 | rdev->raid_disk, mdname(mddev)); |
3175 | nm, mdname(mddev)); | ||
3176 | } | 3414 | } |
3177 | } | 3415 | } |
3178 | 3416 | ||
@@ -4504,7 +4742,8 @@ int md_run(mddev_t *mddev) | |||
4504 | } | 4742 | } |
4505 | 4743 | ||
4506 | if (mddev->bio_set == NULL) | 4744 | if (mddev->bio_set == NULL) |
4507 | mddev->bio_set = bioset_create(BIO_POOL_SIZE, sizeof(mddev)); | 4745 | mddev->bio_set = bioset_create(BIO_POOL_SIZE, |
4746 | sizeof(mddev_t *)); | ||
4508 | 4747 | ||
4509 | spin_lock(&pers_lock); | 4748 | spin_lock(&pers_lock); |
4510 | pers = find_pers(mddev->level, mddev->clevel); | 4749 | pers = find_pers(mddev->level, mddev->clevel); |
@@ -4621,12 +4860,9 @@ int md_run(mddev_t *mddev) | |||
4621 | smp_wmb(); | 4860 | smp_wmb(); |
4622 | mddev->ready = 1; | 4861 | mddev->ready = 1; |
4623 | list_for_each_entry(rdev, &mddev->disks, same_set) | 4862 | list_for_each_entry(rdev, &mddev->disks, same_set) |
4624 | if (rdev->raid_disk >= 0) { | 4863 | if (rdev->raid_disk >= 0) |
4625 | char nm[20]; | 4864 | if (sysfs_link_rdev(mddev, rdev)) |
4626 | sprintf(nm, "rd%d", rdev->raid_disk); | ||
4627 | if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) | ||
4628 | /* failure here is OK */; | 4865 | /* failure here is OK */; |
4629 | } | ||
4630 | 4866 | ||
4631 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 4867 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
4632 | 4868 | ||
@@ -4854,11 +5090,8 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open) | |||
4854 | sysfs_notify_dirent_safe(mddev->sysfs_state); | 5090 | sysfs_notify_dirent_safe(mddev->sysfs_state); |
4855 | 5091 | ||
4856 | list_for_each_entry(rdev, &mddev->disks, same_set) | 5092 | list_for_each_entry(rdev, &mddev->disks, same_set) |
4857 | if (rdev->raid_disk >= 0) { | 5093 | if (rdev->raid_disk >= 0) |
4858 | char nm[20]; | 5094 | sysfs_unlink_rdev(mddev, rdev); |
4859 | sprintf(nm, "rd%d", rdev->raid_disk); | ||
4860 | sysfs_remove_link(&mddev->kobj, nm); | ||
4861 | } | ||
4862 | 5095 | ||
4863 | set_capacity(disk, 0); | 5096 | set_capacity(disk, 0); |
4864 | mutex_unlock(&mddev->open_mutex); | 5097 | mutex_unlock(&mddev->open_mutex); |
@@ -6198,18 +6431,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev) | |||
6198 | if (!rdev || test_bit(Faulty, &rdev->flags)) | 6431 | if (!rdev || test_bit(Faulty, &rdev->flags)) |
6199 | return; | 6432 | return; |
6200 | 6433 | ||
6201 | if (mddev->external) | 6434 | if (!mddev->pers || !mddev->pers->error_handler) |
6202 | set_bit(Blocked, &rdev->flags); | ||
6203 | /* | ||
6204 | dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n", | ||
6205 | mdname(mddev), | ||
6206 | MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev), | ||
6207 | __builtin_return_address(0),__builtin_return_address(1), | ||
6208 | __builtin_return_address(2),__builtin_return_address(3)); | ||
6209 | */ | ||
6210 | if (!mddev->pers) | ||
6211 | return; | ||
6212 | if (!mddev->pers->error_handler) | ||
6213 | return; | 6435 | return; |
6214 | mddev->pers->error_handler(mddev,rdev); | 6436 | mddev->pers->error_handler(mddev,rdev); |
6215 | if (mddev->degraded) | 6437 | if (mddev->degraded) |
@@ -6933,11 +7155,14 @@ void md_do_sync(mddev_t *mddev) | |||
6933 | atomic_add(sectors, &mddev->recovery_active); | 7155 | atomic_add(sectors, &mddev->recovery_active); |
6934 | } | 7156 | } |
6935 | 7157 | ||
7158 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) | ||
7159 | break; | ||
7160 | |||
6936 | j += sectors; | 7161 | j += sectors; |
6937 | if (j>1) mddev->curr_resync = j; | 7162 | if (j>1) mddev->curr_resync = j; |
6938 | mddev->curr_mark_cnt = io_sectors; | 7163 | mddev->curr_mark_cnt = io_sectors; |
6939 | if (last_check == 0) | 7164 | if (last_check == 0) |
6940 | /* this is the earliers that rebuilt will be | 7165 | /* this is the earliest that rebuild will be |
6941 | * visible in /proc/mdstat | 7166 | * visible in /proc/mdstat |
6942 | */ | 7167 | */ |
6943 | md_new_event(mddev); | 7168 | md_new_event(mddev); |
@@ -6946,10 +7171,6 @@ void md_do_sync(mddev_t *mddev) | |||
6946 | continue; | 7171 | continue; |
6947 | 7172 | ||
6948 | last_check = io_sectors; | 7173 | last_check = io_sectors; |
6949 | |||
6950 | if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) | ||
6951 | break; | ||
6952 | |||
6953 | repeat: | 7174 | repeat: |
6954 | if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { | 7175 | if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { |
6955 | /* step marks */ | 7176 | /* step marks */ |
@@ -7067,29 +7288,23 @@ static int remove_and_add_spares(mddev_t *mddev) | |||
7067 | atomic_read(&rdev->nr_pending)==0) { | 7288 | atomic_read(&rdev->nr_pending)==0) { |
7068 | if (mddev->pers->hot_remove_disk( | 7289 | if (mddev->pers->hot_remove_disk( |
7069 | mddev, rdev->raid_disk)==0) { | 7290 | mddev, rdev->raid_disk)==0) { |
7070 | char nm[20]; | 7291 | sysfs_unlink_rdev(mddev, rdev); |
7071 | sprintf(nm,"rd%d", rdev->raid_disk); | ||
7072 | sysfs_remove_link(&mddev->kobj, nm); | ||
7073 | rdev->raid_disk = -1; | 7292 | rdev->raid_disk = -1; |
7074 | } | 7293 | } |
7075 | } | 7294 | } |
7076 | 7295 | ||
7077 | if (mddev->degraded && !mddev->recovery_disabled) { | 7296 | if (mddev->degraded) { |
7078 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 7297 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
7079 | if (rdev->raid_disk >= 0 && | 7298 | if (rdev->raid_disk >= 0 && |
7080 | !test_bit(In_sync, &rdev->flags) && | 7299 | !test_bit(In_sync, &rdev->flags) && |
7081 | !test_bit(Faulty, &rdev->flags) && | 7300 | !test_bit(Faulty, &rdev->flags)) |
7082 | !test_bit(Blocked, &rdev->flags)) | ||
7083 | spares++; | 7301 | spares++; |
7084 | if (rdev->raid_disk < 0 | 7302 | if (rdev->raid_disk < 0 |
7085 | && !test_bit(Faulty, &rdev->flags)) { | 7303 | && !test_bit(Faulty, &rdev->flags)) { |
7086 | rdev->recovery_offset = 0; | 7304 | rdev->recovery_offset = 0; |
7087 | if (mddev->pers-> | 7305 | if (mddev->pers-> |
7088 | hot_add_disk(mddev, rdev) == 0) { | 7306 | hot_add_disk(mddev, rdev) == 0) { |
7089 | char nm[20]; | 7307 | if (sysfs_link_rdev(mddev, rdev)) |
7090 | sprintf(nm, "rd%d", rdev->raid_disk); | ||
7091 | if (sysfs_create_link(&mddev->kobj, | ||
7092 | &rdev->kobj, nm)) | ||
7093 | /* failure here is OK */; | 7308 | /* failure here is OK */; |
7094 | spares++; | 7309 | spares++; |
7095 | md_new_event(mddev); | 7310 | md_new_event(mddev); |
@@ -7138,6 +7353,8 @@ static void reap_sync_thread(mddev_t *mddev) | |||
7138 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 7353 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
7139 | sysfs_notify_dirent_safe(mddev->sysfs_action); | 7354 | sysfs_notify_dirent_safe(mddev->sysfs_action); |
7140 | md_new_event(mddev); | 7355 | md_new_event(mddev); |
7356 | if (mddev->event_work.func) | ||
7357 | queue_work(md_misc_wq, &mddev->event_work); | ||
7141 | } | 7358 | } |
7142 | 7359 | ||
7143 | /* | 7360 | /* |
@@ -7170,9 +7387,6 @@ void md_check_recovery(mddev_t *mddev) | |||
7170 | if (mddev->bitmap) | 7387 | if (mddev->bitmap) |
7171 | bitmap_daemon_work(mddev); | 7388 | bitmap_daemon_work(mddev); |
7172 | 7389 | ||
7173 | if (mddev->ro) | ||
7174 | return; | ||
7175 | |||
7176 | if (signal_pending(current)) { | 7390 | if (signal_pending(current)) { |
7177 | if (mddev->pers->sync_request && !mddev->external) { | 7391 | if (mddev->pers->sync_request && !mddev->external) { |
7178 | printk(KERN_INFO "md: %s in immediate safe mode\n", | 7392 | printk(KERN_INFO "md: %s in immediate safe mode\n", |
@@ -7209,9 +7423,7 @@ void md_check_recovery(mddev_t *mddev) | |||
7209 | atomic_read(&rdev->nr_pending)==0) { | 7423 | atomic_read(&rdev->nr_pending)==0) { |
7210 | if (mddev->pers->hot_remove_disk( | 7424 | if (mddev->pers->hot_remove_disk( |
7211 | mddev, rdev->raid_disk)==0) { | 7425 | mddev, rdev->raid_disk)==0) { |
7212 | char nm[20]; | 7426 | sysfs_unlink_rdev(mddev, rdev); |
7213 | sprintf(nm,"rd%d", rdev->raid_disk); | ||
7214 | sysfs_remove_link(&mddev->kobj, nm); | ||
7215 | rdev->raid_disk = -1; | 7427 | rdev->raid_disk = -1; |
7216 | } | 7428 | } |
7217 | } | 7429 | } |
@@ -7331,12 +7543,499 @@ void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev) | |||
7331 | { | 7543 | { |
7332 | sysfs_notify_dirent_safe(rdev->sysfs_state); | 7544 | sysfs_notify_dirent_safe(rdev->sysfs_state); |
7333 | wait_event_timeout(rdev->blocked_wait, | 7545 | wait_event_timeout(rdev->blocked_wait, |
7334 | !test_bit(Blocked, &rdev->flags), | 7546 | !test_bit(Blocked, &rdev->flags) && |
7547 | !test_bit(BlockedBadBlocks, &rdev->flags), | ||
7335 | msecs_to_jiffies(5000)); | 7548 | msecs_to_jiffies(5000)); |
7336 | rdev_dec_pending(rdev, mddev); | 7549 | rdev_dec_pending(rdev, mddev); |
7337 | } | 7550 | } |
7338 | EXPORT_SYMBOL(md_wait_for_blocked_rdev); | 7551 | EXPORT_SYMBOL(md_wait_for_blocked_rdev); |
7339 | 7552 | ||
7553 | |||
7554 | /* Bad block management. | ||
7555 | * We can record which blocks on each device are 'bad' and so just | ||
7556 | * fail those blocks, or that stripe, rather than the whole device. | ||
7557 | * Entries in the bad-block table are 64bits wide. This comprises: | ||
7558 | * Length of bad-range, in sectors: 0-511 for lengths 1-512 | ||
7559 | * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes) | ||
7560 | * A 'shift' can be set so that larger blocks are tracked and | ||
7561 | * consequently larger devices can be covered. | ||
7562 | * 'Acknowledged' flag - 1 bit. - the most significant bit. | ||
7563 | * | ||
7564 | * Locking of the bad-block table uses a seqlock so md_is_badblock | ||
7565 | * might need to retry if it is very unlucky. | ||
7566 | * We will sometimes want to check for bad blocks in a bi_end_io function, | ||
7567 | * so we use the write_seqlock_irq variant. | ||
7568 | * | ||
7569 | * When looking for a bad block we specify a range and want to | ||
7570 | * know if any block in the range is bad. So we binary-search | ||
7571 | * to the last range that starts at-or-before the given endpoint, | ||
7572 | * (or "before the sector after the target range") | ||
7573 | * then see if it ends after the given start. | ||
7574 | * We return | ||
7575 | * 0 if there are no known bad blocks in the range | ||
7576 | * 1 if there are known bad blocks which are all acknowledged | ||
7577 | * -1 if there are bad blocks which have not yet been acknowledged in metadata. | ||
7578 | * plus the start/length of the first bad section we overlap. | ||
7579 | */ | ||
7580 | int md_is_badblock(struct badblocks *bb, sector_t s, int sectors, | ||
7581 | sector_t *first_bad, int *bad_sectors) | ||
7582 | { | ||
7583 | int hi; | ||
7584 | int lo = 0; | ||
7585 | u64 *p = bb->page; | ||
7586 | int rv = 0; | ||
7587 | sector_t target = s + sectors; | ||
7588 | unsigned seq; | ||
7589 | |||
7590 | if (bb->shift > 0) { | ||
7591 | /* round the start down, and the end up */ | ||
7592 | s >>= bb->shift; | ||
7593 | target += (1<<bb->shift) - 1; | ||
7594 | target >>= bb->shift; | ||
7595 | sectors = target - s; | ||
7596 | } | ||
7597 | /* 'target' is now the first block after the bad range */ | ||
7598 | |||
7599 | retry: | ||
7600 | seq = read_seqbegin(&bb->lock); | ||
7601 | |||
7602 | hi = bb->count; | ||
7603 | |||
7604 | /* Binary search between lo and hi for 'target' | ||
7605 | * i.e. for the last range that starts before 'target' | ||
7606 | */ | ||
7607 | /* INVARIANT: ranges before 'lo' and at-or-after 'hi' | ||
7608 | * are known not to be the last range before target. | ||
7609 | * VARIANT: hi-lo is the number of possible | ||
7610 | * ranges, and decreases until it reaches 1 | ||
7611 | */ | ||
7612 | while (hi - lo > 1) { | ||
7613 | int mid = (lo + hi) / 2; | ||
7614 | sector_t a = BB_OFFSET(p[mid]); | ||
7615 | if (a < target) | ||
7616 | /* This could still be the one, earlier ranges | ||
7617 | * could not. */ | ||
7618 | lo = mid; | ||
7619 | else | ||
7620 | /* This and later ranges are definitely out. */ | ||
7621 | hi = mid; | ||
7622 | } | ||
7623 | /* 'lo' might be the last that started before target, but 'hi' isn't */ | ||
7624 | if (hi > lo) { | ||
7625 | /* need to check all ranges that end after 's' to see if | ||
7626 | * any are unacknowledged. | ||
7627 | */ | ||
7628 | while (lo >= 0 && | ||
7629 | BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { | ||
7630 | if (BB_OFFSET(p[lo]) < target) { | ||
7631 | /* starts before the end, and finishes after | ||
7632 | * the start, so they must overlap | ||
7633 | */ | ||
7634 | if (rv != -1 && BB_ACK(p[lo])) | ||
7635 | rv = 1; | ||
7636 | else | ||
7637 | rv = -1; | ||
7638 | *first_bad = BB_OFFSET(p[lo]); | ||
7639 | *bad_sectors = BB_LEN(p[lo]); | ||
7640 | } | ||
7641 | lo--; | ||
7642 | } | ||
7643 | } | ||
7644 | |||
7645 | if (read_seqretry(&bb->lock, seq)) | ||
7646 | goto retry; | ||
7647 | |||
7648 | return rv; | ||
7649 | } | ||
7650 | EXPORT_SYMBOL_GPL(md_is_badblock); | ||
7651 | |||
7652 | /* | ||
7653 | * Add a range of bad blocks to the table. | ||
7654 | * This might extend the table, or might contract it | ||
7655 | * if two adjacent ranges can be merged. | ||
7656 | * We binary-search to find the 'insertion' point, then | ||
7657 | * decide how best to handle it. | ||
7658 | */ | ||
7659 | static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors, | ||
7660 | int acknowledged) | ||
7661 | { | ||
7662 | u64 *p; | ||
7663 | int lo, hi; | ||
7664 | int rv = 1; | ||
7665 | |||
7666 | if (bb->shift < 0) | ||
7667 | /* badblocks are disabled */ | ||
7668 | return 0; | ||
7669 | |||
7670 | if (bb->shift) { | ||
7671 | /* round the start down, and the end up */ | ||
7672 | sector_t next = s + sectors; | ||
7673 | s >>= bb->shift; | ||
7674 | next += (1<<bb->shift) - 1; | ||
7675 | next >>= bb->shift; | ||
7676 | sectors = next - s; | ||
7677 | } | ||
7678 | |||
7679 | write_seqlock_irq(&bb->lock); | ||
7680 | |||
7681 | p = bb->page; | ||
7682 | lo = 0; | ||
7683 | hi = bb->count; | ||
7684 | /* Find the last range that starts at-or-before 's' */ | ||
7685 | while (hi - lo > 1) { | ||
7686 | int mid = (lo + hi) / 2; | ||
7687 | sector_t a = BB_OFFSET(p[mid]); | ||
7688 | if (a <= s) | ||
7689 | lo = mid; | ||
7690 | else | ||
7691 | hi = mid; | ||
7692 | } | ||
7693 | if (hi > lo && BB_OFFSET(p[lo]) > s) | ||
7694 | hi = lo; | ||
7695 | |||
7696 | if (hi > lo) { | ||
7697 | /* we found a range that might merge with the start | ||
7698 | * of our new range | ||
7699 | */ | ||
7700 | sector_t a = BB_OFFSET(p[lo]); | ||
7701 | sector_t e = a + BB_LEN(p[lo]); | ||
7702 | int ack = BB_ACK(p[lo]); | ||
7703 | if (e >= s) { | ||
7704 | /* Yes, we can merge with a previous range */ | ||
7705 | if (s == a && s + sectors >= e) | ||
7706 | /* new range covers old */ | ||
7707 | ack = acknowledged; | ||
7708 | else | ||
7709 | ack = ack && acknowledged; | ||
7710 | |||
7711 | if (e < s + sectors) | ||
7712 | e = s + sectors; | ||
7713 | if (e - a <= BB_MAX_LEN) { | ||
7714 | p[lo] = BB_MAKE(a, e-a, ack); | ||
7715 | s = e; | ||
7716 | } else { | ||
7717 | /* does not all fit in one range, | ||
7718 | * make p[lo] maximal | ||
7719 | */ | ||
7720 | if (BB_LEN(p[lo]) != BB_MAX_LEN) | ||
7721 | p[lo] = BB_MAKE(a, BB_MAX_LEN, ack); | ||
7722 | s = a + BB_MAX_LEN; | ||
7723 | } | ||
7724 | sectors = e - s; | ||
7725 | } | ||
7726 | } | ||
7727 | if (sectors && hi < bb->count) { | ||
7728 | /* 'hi' points to the first range that starts after 's'. | ||
7729 | * Maybe we can merge with the start of that range */ | ||
7730 | sector_t a = BB_OFFSET(p[hi]); | ||
7731 | sector_t e = a + BB_LEN(p[hi]); | ||
7732 | int ack = BB_ACK(p[hi]); | ||
7733 | if (a <= s + sectors) { | ||
7734 | /* merging is possible */ | ||
7735 | if (e <= s + sectors) { | ||
7736 | /* full overlap */ | ||
7737 | e = s + sectors; | ||
7738 | ack = acknowledged; | ||
7739 | } else | ||
7740 | ack = ack && acknowledged; | ||
7741 | |||
7742 | a = s; | ||
7743 | if (e - a <= BB_MAX_LEN) { | ||
7744 | p[hi] = BB_MAKE(a, e-a, ack); | ||
7745 | s = e; | ||
7746 | } else { | ||
7747 | p[hi] = BB_MAKE(a, BB_MAX_LEN, ack); | ||
7748 | s = a + BB_MAX_LEN; | ||
7749 | } | ||
7750 | sectors = e - s; | ||
7751 | lo = hi; | ||
7752 | hi++; | ||
7753 | } | ||
7754 | } | ||
7755 | if (sectors == 0 && hi < bb->count) { | ||
7756 | /* we might be able to combine lo and hi */ | ||
7757 | /* Note: 's' is at the end of 'lo' */ | ||
7758 | sector_t a = BB_OFFSET(p[hi]); | ||
7759 | int lolen = BB_LEN(p[lo]); | ||
7760 | int hilen = BB_LEN(p[hi]); | ||
7761 | int newlen = lolen + hilen - (s - a); | ||
7762 | if (s >= a && newlen < BB_MAX_LEN) { | ||
7763 | /* yes, we can combine them */ | ||
7764 | int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]); | ||
7765 | p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack); | ||
7766 | memmove(p + hi, p + hi + 1, | ||
7767 | (bb->count - hi - 1) * 8); | ||
7768 | bb->count--; | ||
7769 | } | ||
7770 | } | ||
7771 | while (sectors) { | ||
7772 | /* didn't merge (it all). | ||
7773 | * Need to add a range just before 'hi' */ | ||
7774 | if (bb->count >= MD_MAX_BADBLOCKS) { | ||
7775 | /* No room for more */ | ||
7776 | rv = 0; | ||
7777 | break; | ||
7778 | } else { | ||
7779 | int this_sectors = sectors; | ||
7780 | memmove(p + hi + 1, p + hi, | ||
7781 | (bb->count - hi) * 8); | ||
7782 | bb->count++; | ||
7783 | |||
7784 | if (this_sectors > BB_MAX_LEN) | ||
7785 | this_sectors = BB_MAX_LEN; | ||
7786 | p[hi] = BB_MAKE(s, this_sectors, acknowledged); | ||
7787 | sectors -= this_sectors; | ||
7788 | s += this_sectors; | ||
7789 | } | ||
7790 | } | ||
7791 | |||
7792 | bb->changed = 1; | ||
7793 | if (!acknowledged) | ||
7794 | bb->unacked_exist = 1; | ||
7795 | write_sequnlock_irq(&bb->lock); | ||
7796 | |||
7797 | return rv; | ||
7798 | } | ||
7799 | |||
7800 | int rdev_set_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors, | ||
7801 | int acknowledged) | ||
7802 | { | ||
7803 | int rv = md_set_badblocks(&rdev->badblocks, | ||
7804 | s + rdev->data_offset, sectors, acknowledged); | ||
7805 | if (rv) { | ||
7806 | /* Make sure they get written out promptly */ | ||
7807 | set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags); | ||
7808 | md_wakeup_thread(rdev->mddev->thread); | ||
7809 | } | ||
7810 | return rv; | ||
7811 | } | ||
7812 | EXPORT_SYMBOL_GPL(rdev_set_badblocks); | ||
7813 | |||
7814 | /* | ||
7815 | * Remove a range of bad blocks from the table. | ||
7816 | * This may involve extending the table if we split a region, | ||
7817 | * but it must not fail. So if the table becomes full, we just | ||
7818 | * drop the remove request. | ||
7819 | */ | ||
7820 | static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors) | ||
7821 | { | ||
7822 | u64 *p; | ||
7823 | int lo, hi; | ||
7824 | sector_t target = s + sectors; | ||
7825 | int rv = 0; | ||
7826 | |||
7827 | if (bb->shift > 0) { | ||
7828 | /* When clearing we round the start up and the end down. | ||
7829 | * This should not matter as the shift should align with | ||
7830 | * the block size and no rounding should ever be needed. | ||
7831 | * However it is better to think a block is bad when it | ||
7832 | * isn't than to think a block is not bad when it is. | ||
7833 | */ | ||
7834 | s += (1<<bb->shift) - 1; | ||
7835 | s >>= bb->shift; | ||
7836 | target >>= bb->shift; | ||
7837 | sectors = target - s; | ||
7838 | } | ||
7839 | |||
7840 | write_seqlock_irq(&bb->lock); | ||
7841 | |||
7842 | p = bb->page; | ||
7843 | lo = 0; | ||
7844 | hi = bb->count; | ||
7845 | /* Find the last range that starts before 'target' */ | ||
7846 | while (hi - lo > 1) { | ||
7847 | int mid = (lo + hi) / 2; | ||
7848 | sector_t a = BB_OFFSET(p[mid]); | ||
7849 | if (a < target) | ||
7850 | lo = mid; | ||
7851 | else | ||
7852 | hi = mid; | ||
7853 | } | ||
7854 | if (hi > lo) { | ||
7855 | /* p[lo] is the last range that could overlap the | ||
7856 | * current range. Earlier ranges could also overlap, | ||
7857 | * but only this one can overlap the end of the range. | ||
7858 | */ | ||
7859 | if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) { | ||
7860 | /* Partial overlap, leave the tail of this range */ | ||
7861 | int ack = BB_ACK(p[lo]); | ||
7862 | sector_t a = BB_OFFSET(p[lo]); | ||
7863 | sector_t end = a + BB_LEN(p[lo]); | ||
7864 | |||
7865 | if (a < s) { | ||
7866 | /* we need to split this range */ | ||
7867 | if (bb->count >= MD_MAX_BADBLOCKS) { | ||
7868 | rv = 0; | ||
7869 | goto out; | ||
7870 | } | ||
7871 | memmove(p+lo+1, p+lo, (bb->count - lo) * 8); | ||
7872 | bb->count++; | ||
7873 | p[lo] = BB_MAKE(a, s-a, ack); | ||
7874 | lo++; | ||
7875 | } | ||
7876 | p[lo] = BB_MAKE(target, end - target, ack); | ||
7877 | /* there is no longer an overlap */ | ||
7878 | hi = lo; | ||
7879 | lo--; | ||
7880 | } | ||
7881 | while (lo >= 0 && | ||
7882 | BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { | ||
7883 | /* This range does overlap */ | ||
7884 | if (BB_OFFSET(p[lo]) < s) { | ||
7885 | /* Keep the early parts of this range. */ | ||
7886 | int ack = BB_ACK(p[lo]); | ||
7887 | sector_t start = BB_OFFSET(p[lo]); | ||
7888 | p[lo] = BB_MAKE(start, s - start, ack); | ||
7889 | /* now low doesn't overlap, so.. */ | ||
7890 | break; | ||
7891 | } | ||
7892 | lo--; | ||
7893 | } | ||
7894 | /* 'lo' is strictly before, 'hi' is strictly after, | ||
7895 | * anything between needs to be discarded | ||
7896 | */ | ||
7897 | if (hi - lo > 1) { | ||
7898 | memmove(p+lo+1, p+hi, (bb->count - hi) * 8); | ||
7899 | bb->count -= (hi - lo - 1); | ||
7900 | } | ||
7901 | } | ||
7902 | |||
7903 | bb->changed = 1; | ||
7904 | out: | ||
7905 | write_sequnlock_irq(&bb->lock); | ||
7906 | return rv; | ||
7907 | } | ||
7908 | |||
7909 | int rdev_clear_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors) | ||
7910 | { | ||
7911 | return md_clear_badblocks(&rdev->badblocks, | ||
7912 | s + rdev->data_offset, | ||
7913 | sectors); | ||
7914 | } | ||
7915 | EXPORT_SYMBOL_GPL(rdev_clear_badblocks); | ||
7916 | |||
7917 | /* | ||
7918 | * Acknowledge all bad blocks in a list. | ||
7919 | * This only succeeds if ->changed is clear. It is used by | ||
7920 | * in-kernel metadata updates | ||
7921 | */ | ||
7922 | void md_ack_all_badblocks(struct badblocks *bb) | ||
7923 | { | ||
7924 | if (bb->page == NULL || bb->changed) | ||
7925 | /* no point even trying */ | ||
7926 | return; | ||
7927 | write_seqlock_irq(&bb->lock); | ||
7928 | |||
7929 | if (bb->changed == 0) { | ||
7930 | u64 *p = bb->page; | ||
7931 | int i; | ||
7932 | for (i = 0; i < bb->count ; i++) { | ||
7933 | if (!BB_ACK(p[i])) { | ||
7934 | sector_t start = BB_OFFSET(p[i]); | ||
7935 | int len = BB_LEN(p[i]); | ||
7936 | p[i] = BB_MAKE(start, len, 1); | ||
7937 | } | ||
7938 | } | ||
7939 | bb->unacked_exist = 0; | ||
7940 | } | ||
7941 | write_sequnlock_irq(&bb->lock); | ||
7942 | } | ||
7943 | EXPORT_SYMBOL_GPL(md_ack_all_badblocks); | ||
7944 | |||
7945 | /* sysfs access to bad-blocks list. | ||
7946 | * We present two files. | ||
7947 | * 'bad-blocks' lists sector numbers and lengths of ranges that | ||
7948 | * are recorded as bad. The list is truncated to fit within | ||
7949 | * the one-page limit of sysfs. | ||
7950 | * Writing "sector length" to this file adds an acknowledged | ||
7951 | * bad block list. | ||
7952 | * 'unacknowledged-bad-blocks' lists bad blocks that have not yet | ||
7953 | * been acknowledged. Writing to this file adds bad blocks | ||
7954 | * without acknowledging them. This is largely for testing. | ||
7955 | */ | ||
7956 | |||
7957 | static ssize_t | ||
7958 | badblocks_show(struct badblocks *bb, char *page, int unack) | ||
7959 | { | ||
7960 | size_t len; | ||
7961 | int i; | ||
7962 | u64 *p = bb->page; | ||
7963 | unsigned seq; | ||
7964 | |||
7965 | if (bb->shift < 0) | ||
7966 | return 0; | ||
7967 | |||
7968 | retry: | ||
7969 | seq = read_seqbegin(&bb->lock); | ||
7970 | |||
7971 | len = 0; | ||
7972 | i = 0; | ||
7973 | |||
7974 | while (len < PAGE_SIZE && i < bb->count) { | ||
7975 | sector_t s = BB_OFFSET(p[i]); | ||
7976 | unsigned int length = BB_LEN(p[i]); | ||
7977 | int ack = BB_ACK(p[i]); | ||
7978 | i++; | ||
7979 | |||
7980 | if (unack && ack) | ||
7981 | continue; | ||
7982 | |||
7983 | len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n", | ||
7984 | (unsigned long long)s << bb->shift, | ||
7985 | length << bb->shift); | ||
7986 | } | ||
7987 | if (unack && len == 0) | ||
7988 | bb->unacked_exist = 0; | ||
7989 | |||
7990 | if (read_seqretry(&bb->lock, seq)) | ||
7991 | goto retry; | ||
7992 | |||
7993 | return len; | ||
7994 | } | ||
7995 | |||
7996 | #define DO_DEBUG 1 | ||
7997 | |||
7998 | static ssize_t | ||
7999 | badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack) | ||
8000 | { | ||
8001 | unsigned long long sector; | ||
8002 | int length; | ||
8003 | char newline; | ||
8004 | #ifdef DO_DEBUG | ||
8005 | /* Allow clearing via sysfs *only* for testing/debugging. | ||
8006 | * Normally only a successful write may clear a badblock | ||
8007 | */ | ||
8008 | int clear = 0; | ||
8009 | if (page[0] == '-') { | ||
8010 | clear = 1; | ||
8011 | page++; | ||
8012 | } | ||
8013 | #endif /* DO_DEBUG */ | ||
8014 | |||
8015 | switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) { | ||
8016 | case 3: | ||
8017 | if (newline != '\n') | ||
8018 | return -EINVAL; | ||
8019 | case 2: | ||
8020 | if (length <= 0) | ||
8021 | return -EINVAL; | ||
8022 | break; | ||
8023 | default: | ||
8024 | return -EINVAL; | ||
8025 | } | ||
8026 | |||
8027 | #ifdef DO_DEBUG | ||
8028 | if (clear) { | ||
8029 | md_clear_badblocks(bb, sector, length); | ||
8030 | return len; | ||
8031 | } | ||
8032 | #endif /* DO_DEBUG */ | ||
8033 | if (md_set_badblocks(bb, sector, length, !unack)) | ||
8034 | return len; | ||
8035 | else | ||
8036 | return -ENOSPC; | ||
8037 | } | ||
8038 | |||
7340 | static int md_notify_reboot(struct notifier_block *this, | 8039 | static int md_notify_reboot(struct notifier_block *this, |
7341 | unsigned long code, void *x) | 8040 | unsigned long code, void *x) |
7342 | { | 8041 | { |
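
To make md_is_badblock()'s return convention concrete, here is a compact userspace re-creation of its lookup over a sorted table of packed entries. It keeps the 0 / 1 / -1 semantics but drops the seqlock retry, the shift rounding, and the first_bad/bad_sectors out-parameters — an illustration, not the kernel function:

```c
/* Userspace re-creation of md_is_badblock()'s core: binary-search a
 * sorted array of packed entries for the last range starting before
 * 'target', then walk backwards over every range that ends after 's'.
 * Returns 0 (no bad blocks in the range), 1 (all overlapping ranges
 * acknowledged), or -1 (some overlapping range unacknowledged).
 */
#include <stdint.h>
#include <stdio.h>

#define BB_MAKE(a, l, ack) \
	(((uint64_t)(a) << 9) | ((l) - 1) | ((uint64_t)(!!(ack)) << 63))
#define BB_OFFSET(x) (((x) >> 9) & ((1ULL << 54) - 1))
#define BB_LEN(x) (((x) & 0x1ff) + 1)
#define BB_ACK(x) (!!((x) >> 63))

static int is_badblock(const uint64_t *p, int count, uint64_t s, int sectors)
{
	uint64_t target = s + sectors;	/* first sector after the range */
	int lo = 0, hi = count, rv = 0;

	while (hi - lo > 1) {
		int mid = (lo + hi) / 2;

		if (BB_OFFSET(p[mid]) < target)
			lo = mid;	/* could still be the last one */
		else
			hi = mid;	/* this and later ranges are out */
	}
	if (hi > lo)
		for (; lo >= 0 && BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s; lo--)
			if (BB_OFFSET(p[lo]) < target)
				rv = (rv != -1 && BB_ACK(p[lo])) ? 1 : -1;
	return rv;
}

int main(void)
{
	uint64_t tbl[] = {
		BB_MAKE(100, 8, 1),	/* acknowledged */
		BB_MAKE(200, 4, 0),	/* not yet acknowledged */
	};

	printf("%d %d %d\n",
	       is_badblock(tbl, 2, 0, 50),	/* 0: clean */
	       is_badblock(tbl, 2, 104, 2),	/* 1: acked overlap */
	       is_badblock(tbl, 2, 198, 8));	/* -1: unacked overlap */
	return 0;
}
```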