path: root/drivers/md/md.c
author    Linus Torvalds <torvalds@linux-foundation.org>    2011-07-28 08:50:27 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>    2011-07-28 08:50:27 -0400
commit    6140333d3656f62ac7e6a5af87e7fe92cfb8d655 (patch)
tree      d96f7ad2196b4383f5ca4396c956e24c82b2952c /drivers/md/md.c
parent    6f56c218666b5c7eff354364357307d18c10058b (diff)
parent    58c54fcca3bac5bf9290cfed31c76e4c4bfbabaf (diff)
Merge branch 'for-linus' of git://neil.brown.name/md
* 'for-linus' of git://neil.brown.name/md: (75 commits)
  md/raid10: handle further errors during fix_read_error better.
  md/raid10: Handle read errors during recovery better.
  md/raid10: simplify read error handling during recovery.
  md/raid10: record bad blocks due to write errors during resync/recovery.
  md/raid10: attempt to fix read errors during resync/check
  md/raid10: Handle write errors by updating badblock log.
  md/raid10: clear bad-block record when write succeeds.
  md/raid10: avoid writing to known bad blocks on known bad drives.
  md/raid10 record bad blocks as needed during recovery.
  md/raid10: avoid reading known bad blocks during resync/recovery.
  md/raid10 - avoid reading from known bad blocks - part 3
  md/raid10: avoid reading from known bad blocks - part 2
  md/raid10: avoid reading from known bad blocks - part 1
  md/raid10: Split handle_read_error out from raid10d.
  md/raid10: simplify/reindent some loops.
  md/raid5: Clear bad blocks on successful write.
  md/raid5. Don't write to known bad block on doubtful devices.
  md/raid5: write errors should be recorded as bad blocks if possible.
  md/raid5: use bad-block log to improve handling of uncorrectable read errors.
  md/raid5: avoid reading from known bad blocks.
  ...
Diffstat (limited to 'drivers/md/md.c')
-rw-r--r-- drivers/md/md.c 871
1 file changed, 785 insertions, 86 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c
index dfc9425db70b..8e221a20f5d9 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -215,6 +215,55 @@ struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
215} 215}
216EXPORT_SYMBOL_GPL(bio_clone_mddev); 216EXPORT_SYMBOL_GPL(bio_clone_mddev);
217 217
218void md_trim_bio(struct bio *bio, int offset, int size)
219{
220 /* 'bio' is a cloned bio which we need to trim to match
221 * the given offset and size.
222 * This requires adjusting bi_sector, bi_size, and bi_io_vec
223 */
224 int i;
225 struct bio_vec *bvec;
226 int sofar = 0;
227
228 size <<= 9;
229 if (offset == 0 && size == bio->bi_size)
230 return;
231
232 bio->bi_sector += offset;
233 bio->bi_size = size;
234 offset <<= 9;
235 clear_bit(BIO_SEG_VALID, &bio->bi_flags);
236
237 while (bio->bi_idx < bio->bi_vcnt &&
238 bio->bi_io_vec[bio->bi_idx].bv_len <= offset) {
239 /* remove this whole bio_vec */
240 offset -= bio->bi_io_vec[bio->bi_idx].bv_len;
241 bio->bi_idx++;
242 }
243 if (bio->bi_idx < bio->bi_vcnt) {
244 bio->bi_io_vec[bio->bi_idx].bv_offset += offset;
245 bio->bi_io_vec[bio->bi_idx].bv_len -= offset;
246 }
247 /* avoid any complications with bi_idx being non-zero*/
248 if (bio->bi_idx) {
249 memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
250 (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec));
251 bio->bi_vcnt -= bio->bi_idx;
252 bio->bi_idx = 0;
253 }
254 /* Make sure vcnt and last bv are not too big */
255 bio_for_each_segment(bvec, bio, i) {
256 if (sofar + bvec->bv_len > size)
257 bvec->bv_len = size - sofar;
258 if (bvec->bv_len == 0) {
259 bio->bi_vcnt = i;
260 break;
261 }
262 sofar += bvec->bv_len;
263 }
264}
265EXPORT_SYMBOL_GPL(md_trim_bio);
266
218/* 267/*
219 * We have a system wide 'event count' that is incremented 268 * We have a system wide 'event count' that is incremented
220 * on any 'interesting' event, and readers of /proc/mdstat 269 * on any 'interesting' event, and readers of /proc/mdstat
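A minimal user-space model of the trimming logic in md_trim_bio above may help when reading the raid1/raid10 changes that call it. It reduces a bio to a start sector, a byte size and an array of segment lengths; the bv_offset adjustment and the BIO_SEG_VALID handling are deliberately omitted, and every name here is illustrative rather than part of this commit.

#include <stdio.h>

struct seg { unsigned int len; };          /* stands in for bio_vec.bv_len (bytes) */

static void trim(unsigned long long *sector, unsigned int *size,
                 struct seg *vec, int *vcnt, int offset_sectors, int size_sectors)
{
    unsigned int size_bytes = size_sectors << 9;   /* sectors -> bytes, as in size <<= 9 */
    unsigned int offset = offset_sectors << 9;
    int i, idx = 0, sofar = 0;

    *sector += offset_sectors;
    *size = size_bytes;

    /* drop whole leading segments covered by the offset */
    while (idx < *vcnt && vec[idx].len <= offset)
        offset -= vec[idx++].len;
    if (idx < *vcnt)
        vec[idx].len -= offset;            /* shrink the first remaining segment */

    /* compact so the segment array starts at index 0 again (the memmove above) */
    for (i = 0; idx + i < *vcnt; i++)
        vec[i] = vec[idx + i];
    *vcnt -= idx;

    /* clamp trailing segments so the total does not exceed the new size */
    for (i = 0; i < *vcnt; i++) {
        if (sofar + vec[i].len > size_bytes)
            vec[i].len = size_bytes - sofar;
        if (vec[i].len == 0) {
            *vcnt = i;
            break;
        }
        sofar += vec[i].len;
    }
}

int main(void)
{
    struct seg v[3] = { {4096}, {4096}, {4096} };   /* three 8-sector segments */
    unsigned long long sector = 1000;
    unsigned int size = 3 * 4096;
    int vcnt = 3;

    trim(&sector, &size, v, &vcnt, 8, 8);           /* keep sectors 1008..1015 */
    printf("sector=%llu size=%u vcnt=%d first_len=%u\n", sector, size, vcnt, v[0].len);
    return 0;
}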
@@ -757,6 +806,10 @@ static void free_disk_sb(mdk_rdev_t * rdev)
757 rdev->sb_start = 0; 806 rdev->sb_start = 0;
758 rdev->sectors = 0; 807 rdev->sectors = 0;
759 } 808 }
809 if (rdev->bb_page) {
810 put_page(rdev->bb_page);
811 rdev->bb_page = NULL;
812 }
760} 813}
761 814
762 815
@@ -1025,7 +1078,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
1025 ret = -EINVAL; 1078 ret = -EINVAL;
1026 1079
1027 bdevname(rdev->bdev, b); 1080 bdevname(rdev->bdev, b);
1028 sb = (mdp_super_t*)page_address(rdev->sb_page); 1081 sb = page_address(rdev->sb_page);
1029 1082
1030 if (sb->md_magic != MD_SB_MAGIC) { 1083 if (sb->md_magic != MD_SB_MAGIC) {
1031 printk(KERN_ERR "md: invalid raid superblock magic on %s\n", 1084 printk(KERN_ERR "md: invalid raid superblock magic on %s\n",
@@ -1054,6 +1107,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
1054 rdev->preferred_minor = sb->md_minor; 1107 rdev->preferred_minor = sb->md_minor;
1055 rdev->data_offset = 0; 1108 rdev->data_offset = 0;
1056 rdev->sb_size = MD_SB_BYTES; 1109 rdev->sb_size = MD_SB_BYTES;
1110 rdev->badblocks.shift = -1;
1057 1111
1058 if (sb->level == LEVEL_MULTIPATH) 1112 if (sb->level == LEVEL_MULTIPATH)
1059 rdev->desc_nr = -1; 1113 rdev->desc_nr = -1;
@@ -1064,7 +1118,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
1064 ret = 1; 1118 ret = 1;
1065 } else { 1119 } else {
1066 __u64 ev1, ev2; 1120 __u64 ev1, ev2;
1067 mdp_super_t *refsb = (mdp_super_t*)page_address(refdev->sb_page); 1121 mdp_super_t *refsb = page_address(refdev->sb_page);
1068 if (!uuid_equal(refsb, sb)) { 1122 if (!uuid_equal(refsb, sb)) {
1069 printk(KERN_WARNING "md: %s has different UUID to %s\n", 1123 printk(KERN_WARNING "md: %s has different UUID to %s\n",
1070 b, bdevname(refdev->bdev,b2)); 1124 b, bdevname(refdev->bdev,b2));
@@ -1099,7 +1153,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
1099static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev) 1153static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1100{ 1154{
1101 mdp_disk_t *desc; 1155 mdp_disk_t *desc;
1102 mdp_super_t *sb = (mdp_super_t *)page_address(rdev->sb_page); 1156 mdp_super_t *sb = page_address(rdev->sb_page);
1103 __u64 ev1 = md_event(sb); 1157 __u64 ev1 = md_event(sb);
1104 1158
1105 rdev->raid_disk = -1; 1159 rdev->raid_disk = -1;
@@ -1230,7 +1284,7 @@ static void super_90_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1230 1284
1231 rdev->sb_size = MD_SB_BYTES; 1285 rdev->sb_size = MD_SB_BYTES;
1232 1286
1233 sb = (mdp_super_t*)page_address(rdev->sb_page); 1287 sb = page_address(rdev->sb_page);
1234 1288
1235 memset(sb, 0, sizeof(*sb)); 1289 memset(sb, 0, sizeof(*sb));
1236 1290
@@ -1395,6 +1449,8 @@ static __le32 calc_sb_1_csum(struct mdp_superblock_1 * sb)
1395 return cpu_to_le32(csum); 1449 return cpu_to_le32(csum);
1396} 1450}
1397 1451
1452static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
1453 int acknowledged);
1398static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) 1454static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1399{ 1455{
1400 struct mdp_superblock_1 *sb; 1456 struct mdp_superblock_1 *sb;
@@ -1435,7 +1491,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1435 if (ret) return ret; 1491 if (ret) return ret;
1436 1492
1437 1493
1438 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1494 sb = page_address(rdev->sb_page);
1439 1495
1440 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) || 1496 if (sb->magic != cpu_to_le32(MD_SB_MAGIC) ||
1441 sb->major_version != cpu_to_le32(1) || 1497 sb->major_version != cpu_to_le32(1) ||
@@ -1473,12 +1529,52 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1473 else 1529 else
1474 rdev->desc_nr = le32_to_cpu(sb->dev_number); 1530 rdev->desc_nr = le32_to_cpu(sb->dev_number);
1475 1531
1532 if (!rdev->bb_page) {
1533 rdev->bb_page = alloc_page(GFP_KERNEL);
1534 if (!rdev->bb_page)
1535 return -ENOMEM;
1536 }
1537 if ((le32_to_cpu(sb->feature_map) & MD_FEATURE_BAD_BLOCKS) &&
1538 rdev->badblocks.count == 0) {
1539 /* need to load the bad block list.
1540 * Currently we limit it to one page.
1541 */
1542 s32 offset;
1543 sector_t bb_sector;
1544 u64 *bbp;
1545 int i;
1546 int sectors = le16_to_cpu(sb->bblog_size);
1547 if (sectors > (PAGE_SIZE / 512))
1548 return -EINVAL;
1549 offset = le32_to_cpu(sb->bblog_offset);
1550 if (offset == 0)
1551 return -EINVAL;
1552 bb_sector = (long long)offset;
1553 if (!sync_page_io(rdev, bb_sector, sectors << 9,
1554 rdev->bb_page, READ, true))
1555 return -EIO;
1556 bbp = (u64 *)page_address(rdev->bb_page);
1557 rdev->badblocks.shift = sb->bblog_shift;
1558 for (i = 0 ; i < (sectors << (9-3)) ; i++, bbp++) {
1559 u64 bb = le64_to_cpu(*bbp);
1560 int count = bb & (0x3ff);
1561 u64 sector = bb >> 10;
1562 sector <<= sb->bblog_shift;
1563 count <<= sb->bblog_shift;
1564 if (bb + 1 == 0)
1565 break;
1566 if (md_set_badblocks(&rdev->badblocks,
1567 sector, count, 1) == 0)
1568 return -EINVAL;
1569 }
1570 } else if (sb->bblog_offset == 0)
1571 rdev->badblocks.shift = -1;
1572
1476 if (!refdev) { 1573 if (!refdev) {
1477 ret = 1; 1574 ret = 1;
1478 } else { 1575 } else {
1479 __u64 ev1, ev2; 1576 __u64 ev1, ev2;
1480 struct mdp_superblock_1 *refsb = 1577 struct mdp_superblock_1 *refsb = page_address(refdev->sb_page);
1481 (struct mdp_superblock_1*)page_address(refdev->sb_page);
1482 1578
1483 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 || 1579 if (memcmp(sb->set_uuid, refsb->set_uuid, 16) != 0 ||
1484 sb->level != refsb->level || 1580 sb->level != refsb->level ||
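The load loop above implies the on-disk layout of each bad-block log entry: a little-endian 64-bit word holding the range length in the low 10 bits and the start sector in the upper bits, both scaled by bblog_shift, with an all-ones word terminating the list. A self-contained sketch of that packing follows, under those assumptions; the helper names are invented for illustration and the endianness conversion (le64_to_cpu in the kernel) is left out.

#include <stdio.h>
#include <stdint.h>

/* pack a bad range (raw sector/count) into one on-disk log word */
static uint64_t bblog_pack(uint64_t sector, unsigned int count, unsigned int shift)
{
    return ((sector >> shift) << 10) | ((count >> shift) & 0x3ff);
}

/* reverse of the loop in super_1_load: split the word and undo the shift */
static void bblog_unpack(uint64_t bb, unsigned int shift,
                         uint64_t *sector, unsigned int *count)
{
    *count  = (unsigned int)(bb & 0x3ff) << shift;
    *sector = (bb >> 10) << shift;
}

int main(void)
{
    uint64_t sector;
    unsigned int count;
    uint64_t bb = bblog_pack(123456, 8, 0);   /* 8 bad sectors starting at 123456 */

    if (bb + 1 == 0)                          /* an all-ones word would mean "end of list" */
        return 1;
    bblog_unpack(bb, 0, &sector, &count);
    printf("sector=%llu count=%u\n", (unsigned long long)sector, count);
    return 0;
}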
@@ -1513,7 +1609,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
1513 1609
1514static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) 1610static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
1515{ 1611{
1516 struct mdp_superblock_1 *sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1612 struct mdp_superblock_1 *sb = page_address(rdev->sb_page);
1517 __u64 ev1 = le64_to_cpu(sb->events); 1613 __u64 ev1 = le64_to_cpu(sb->events);
1518 1614
1519 rdev->raid_disk = -1; 1615 rdev->raid_disk = -1;
@@ -1619,13 +1715,12 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1619 int max_dev, i; 1715 int max_dev, i;
1620 /* make rdev->sb match mddev and rdev data. */ 1716 /* make rdev->sb match mddev and rdev data. */
1621 1717
1622 sb = (struct mdp_superblock_1*)page_address(rdev->sb_page); 1718 sb = page_address(rdev->sb_page);
1623 1719
1624 sb->feature_map = 0; 1720 sb->feature_map = 0;
1625 sb->pad0 = 0; 1721 sb->pad0 = 0;
1626 sb->recovery_offset = cpu_to_le64(0); 1722 sb->recovery_offset = cpu_to_le64(0);
1627 memset(sb->pad1, 0, sizeof(sb->pad1)); 1723 memset(sb->pad1, 0, sizeof(sb->pad1));
1628 memset(sb->pad2, 0, sizeof(sb->pad2));
1629 memset(sb->pad3, 0, sizeof(sb->pad3)); 1724 memset(sb->pad3, 0, sizeof(sb->pad3));
1630 1725
1631 sb->utime = cpu_to_le64((__u64)mddev->utime); 1726 sb->utime = cpu_to_le64((__u64)mddev->utime);
@@ -1665,6 +1760,40 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev)
1665 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors); 1760 sb->new_chunk = cpu_to_le32(mddev->new_chunk_sectors);
1666 } 1761 }
1667 1762
1763 if (rdev->badblocks.count == 0)
1764 /* Nothing to do for bad blocks*/ ;
1765 else if (sb->bblog_offset == 0)
1766 /* Cannot record bad blocks on this device */
1767 md_error(mddev, rdev);
1768 else {
1769 struct badblocks *bb = &rdev->badblocks;
1770 u64 *bbp = (u64 *)page_address(rdev->bb_page);
1771 u64 *p = bb->page;
1772 sb->feature_map |= cpu_to_le32(MD_FEATURE_BAD_BLOCKS);
1773 if (bb->changed) {
1774 unsigned seq;
1775
1776retry:
1777 seq = read_seqbegin(&bb->lock);
1778
1779 memset(bbp, 0xff, PAGE_SIZE);
1780
1781 for (i = 0 ; i < bb->count ; i++) {
1782 u64 internal_bb = *p++;
1783 u64 store_bb = ((BB_OFFSET(internal_bb) << 10)
1784 | BB_LEN(internal_bb));
1785 *bbp++ = cpu_to_le64(store_bb);
1786 }
1787 if (read_seqretry(&bb->lock, seq))
1788 goto retry;
1789
1790 bb->sector = (rdev->sb_start +
1791 (int)le32_to_cpu(sb->bblog_offset));
1792 bb->size = le16_to_cpu(sb->bblog_size);
1793 bb->changed = 0;
1794 }
1795 }
1796
1668 max_dev = 0; 1797 max_dev = 0;
1669 list_for_each_entry(rdev2, &mddev->disks, same_set) 1798 list_for_each_entry(rdev2, &mddev->disks, same_set)
1670 if (rdev2->desc_nr+1 > max_dev) 1799 if (rdev2->desc_nr+1 > max_dev)
@@ -1724,7 +1853,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
1724 num_sectors = max_sectors; 1853 num_sectors = max_sectors;
1725 rdev->sb_start = sb_start; 1854 rdev->sb_start = sb_start;
1726 } 1855 }
1727 sb = (struct mdp_superblock_1 *) page_address(rdev->sb_page); 1856 sb = page_address(rdev->sb_page);
1728 sb->data_size = cpu_to_le64(num_sectors); 1857 sb->data_size = cpu_to_le64(num_sectors);
1729 sb->super_offset = rdev->sb_start; 1858 sb->super_offset = rdev->sb_start;
1730 sb->sb_csum = calc_sb_1_csum(sb); 1859 sb->sb_csum = calc_sb_1_csum(sb);
@@ -1922,7 +2051,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
1922 bd_link_disk_holder(rdev->bdev, mddev->gendisk); 2051 bd_link_disk_holder(rdev->bdev, mddev->gendisk);
1923 2052
1924 /* May as well allow recovery to be retried once */ 2053 /* May as well allow recovery to be retried once */
1925 mddev->recovery_disabled = 0; 2054 mddev->recovery_disabled++;
1926 2055
1927 return 0; 2056 return 0;
1928 2057
@@ -1953,6 +2082,9 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
1953 sysfs_remove_link(&rdev->kobj, "block"); 2082 sysfs_remove_link(&rdev->kobj, "block");
1954 sysfs_put(rdev->sysfs_state); 2083 sysfs_put(rdev->sysfs_state);
1955 rdev->sysfs_state = NULL; 2084 rdev->sysfs_state = NULL;
2085 kfree(rdev->badblocks.page);
2086 rdev->badblocks.count = 0;
2087 rdev->badblocks.page = NULL;
1956 /* We need to delay this, otherwise we can deadlock when 2088 /* We need to delay this, otherwise we can deadlock when
1957 * writing to 'remove' to "dev/state". We also need 2089 * writing to 'remove' to "dev/state". We also need
1958 * to delay it due to rcu usage. 2090 * to delay it due to rcu usage.
@@ -2127,10 +2259,10 @@ static void print_rdev(mdk_rdev_t *rdev, int major_version)
2127 printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version); 2259 printk(KERN_INFO "md: rdev superblock (MJ:%d):\n", major_version);
2128 switch (major_version) { 2260 switch (major_version) {
2129 case 0: 2261 case 0:
2130 print_sb_90((mdp_super_t*)page_address(rdev->sb_page)); 2262 print_sb_90(page_address(rdev->sb_page));
2131 break; 2263 break;
2132 case 1: 2264 case 1:
2133 print_sb_1((struct mdp_superblock_1 *)page_address(rdev->sb_page)); 2265 print_sb_1(page_address(rdev->sb_page));
2134 break; 2266 break;
2135 } 2267 }
2136 } else 2268 } else
@@ -2194,6 +2326,7 @@ static void md_update_sb(mddev_t * mddev, int force_change)
2194 mdk_rdev_t *rdev; 2326 mdk_rdev_t *rdev;
2195 int sync_req; 2327 int sync_req;
2196 int nospares = 0; 2328 int nospares = 0;
2329 int any_badblocks_changed = 0;
2197 2330
2198repeat: 2331repeat:
2199 /* First make sure individual recovery_offsets are correct */ 2332 /* First make sure individual recovery_offsets are correct */
@@ -2208,8 +2341,18 @@ repeat:
2208 if (!mddev->persistent) { 2341 if (!mddev->persistent) {
2209 clear_bit(MD_CHANGE_CLEAN, &mddev->flags); 2342 clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
2210 clear_bit(MD_CHANGE_DEVS, &mddev->flags); 2343 clear_bit(MD_CHANGE_DEVS, &mddev->flags);
2211 if (!mddev->external) 2344 if (!mddev->external) {
2212 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 2345 clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2346 list_for_each_entry(rdev, &mddev->disks, same_set) {
2347 if (rdev->badblocks.changed) {
2348 md_ack_all_badblocks(&rdev->badblocks);
2349 md_error(mddev, rdev);
2350 }
2351 clear_bit(Blocked, &rdev->flags);
2352 clear_bit(BlockedBadBlocks, &rdev->flags);
2353 wake_up(&rdev->blocked_wait);
2354 }
2355 }
2213 wake_up(&mddev->sb_wait); 2356 wake_up(&mddev->sb_wait);
2214 return; 2357 return;
2215 } 2358 }
@@ -2265,6 +2408,14 @@ repeat:
2265 MD_BUG(); 2408 MD_BUG();
2266 mddev->events --; 2409 mddev->events --;
2267 } 2410 }
2411
2412 list_for_each_entry(rdev, &mddev->disks, same_set) {
2413 if (rdev->badblocks.changed)
2414 any_badblocks_changed++;
2415 if (test_bit(Faulty, &rdev->flags))
2416 set_bit(FaultRecorded, &rdev->flags);
2417 }
2418
2268 sync_sbs(mddev, nospares); 2419 sync_sbs(mddev, nospares);
2269 spin_unlock_irq(&mddev->write_lock); 2420 spin_unlock_irq(&mddev->write_lock);
2270 2421
@@ -2290,6 +2441,13 @@ repeat:
2290 bdevname(rdev->bdev,b), 2441 bdevname(rdev->bdev,b),
2291 (unsigned long long)rdev->sb_start); 2442 (unsigned long long)rdev->sb_start);
2292 rdev->sb_events = mddev->events; 2443 rdev->sb_events = mddev->events;
2444 if (rdev->badblocks.size) {
2445 md_super_write(mddev, rdev,
2446 rdev->badblocks.sector,
2447 rdev->badblocks.size << 9,
2448 rdev->bb_page);
2449 rdev->badblocks.size = 0;
2450 }
2293 2451
2294 } else 2452 } else
2295 dprintk(")\n"); 2453 dprintk(")\n");
@@ -2313,6 +2471,15 @@ repeat:
2313 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 2471 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2314 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 2472 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
2315 2473
2474 list_for_each_entry(rdev, &mddev->disks, same_set) {
2475 if (test_and_clear_bit(FaultRecorded, &rdev->flags))
2476 clear_bit(Blocked, &rdev->flags);
2477
2478 if (any_badblocks_changed)
2479 md_ack_all_badblocks(&rdev->badblocks);
2480 clear_bit(BlockedBadBlocks, &rdev->flags);
2481 wake_up(&rdev->blocked_wait);
2482 }
2316} 2483}
2317 2484
2318/* words written to sysfs files may, or may not, be \n terminated. 2485/* words written to sysfs files may, or may not, be \n terminated.
@@ -2347,7 +2514,8 @@ state_show(mdk_rdev_t *rdev, char *page)
2347 char *sep = ""; 2514 char *sep = "";
2348 size_t len = 0; 2515 size_t len = 0;
2349 2516
2350 if (test_bit(Faulty, &rdev->flags)) { 2517 if (test_bit(Faulty, &rdev->flags) ||
2518 rdev->badblocks.unacked_exist) {
2351 len+= sprintf(page+len, "%sfaulty",sep); 2519 len+= sprintf(page+len, "%sfaulty",sep);
2352 sep = ","; 2520 sep = ",";
2353 } 2521 }
@@ -2359,7 +2527,8 @@ state_show(mdk_rdev_t *rdev, char *page)
2359 len += sprintf(page+len, "%swrite_mostly",sep); 2527 len += sprintf(page+len, "%swrite_mostly",sep);
2360 sep = ","; 2528 sep = ",";
2361 } 2529 }
2362 if (test_bit(Blocked, &rdev->flags)) { 2530 if (test_bit(Blocked, &rdev->flags) ||
2531 rdev->badblocks.unacked_exist) {
2363 len += sprintf(page+len, "%sblocked", sep); 2532 len += sprintf(page+len, "%sblocked", sep);
2364 sep = ","; 2533 sep = ",";
2365 } 2534 }
@@ -2368,6 +2537,10 @@ state_show(mdk_rdev_t *rdev, char *page)
2368 len += sprintf(page+len, "%sspare", sep); 2537 len += sprintf(page+len, "%sspare", sep);
2369 sep = ","; 2538 sep = ",";
2370 } 2539 }
2540 if (test_bit(WriteErrorSeen, &rdev->flags)) {
2541 len += sprintf(page+len, "%swrite_error", sep);
2542 sep = ",";
2543 }
2371 return len+sprintf(page+len, "\n"); 2544 return len+sprintf(page+len, "\n");
2372} 2545}
2373 2546
@@ -2375,13 +2548,15 @@ static ssize_t
2375state_store(mdk_rdev_t *rdev, const char *buf, size_t len) 2548state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2376{ 2549{
2377 /* can write 2550 /* can write
2378 * faulty - simulates and error 2551 * faulty - simulates an error
2379 * remove - disconnects the device 2552 * remove - disconnects the device
2380 * writemostly - sets write_mostly 2553 * writemostly - sets write_mostly
2381 * -writemostly - clears write_mostly 2554 * -writemostly - clears write_mostly
2382 * blocked - sets the Blocked flag 2555 * blocked - sets the Blocked flags
2383 * -blocked - clears the Blocked flag 2556 * -blocked - clears the Blocked and possibly simulates an error
2384 * insync - sets Insync providing device isn't active 2557 * insync - sets Insync providing device isn't active
2558 * write_error - sets WriteErrorSeen
2559 * -write_error - clears WriteErrorSeen
2385 */ 2560 */
2386 int err = -EINVAL; 2561 int err = -EINVAL;
2387 if (cmd_match(buf, "faulty") && rdev->mddev->pers) { 2562 if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
@@ -2408,7 +2583,15 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2408 set_bit(Blocked, &rdev->flags); 2583 set_bit(Blocked, &rdev->flags);
2409 err = 0; 2584 err = 0;
2410 } else if (cmd_match(buf, "-blocked")) { 2585 } else if (cmd_match(buf, "-blocked")) {
2586 if (!test_bit(Faulty, &rdev->flags) &&
2587 test_bit(BlockedBadBlocks, &rdev->flags)) {
2588 /* metadata handler doesn't understand badblocks,
2589 * so we need to fail the device
2590 */
2591 md_error(rdev->mddev, rdev);
2592 }
2411 clear_bit(Blocked, &rdev->flags); 2593 clear_bit(Blocked, &rdev->flags);
2594 clear_bit(BlockedBadBlocks, &rdev->flags);
2412 wake_up(&rdev->blocked_wait); 2595 wake_up(&rdev->blocked_wait);
2413 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2596 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2414 md_wakeup_thread(rdev->mddev->thread); 2597 md_wakeup_thread(rdev->mddev->thread);
@@ -2417,6 +2600,12 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2417 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) { 2600 } else if (cmd_match(buf, "insync") && rdev->raid_disk == -1) {
2418 set_bit(In_sync, &rdev->flags); 2601 set_bit(In_sync, &rdev->flags);
2419 err = 0; 2602 err = 0;
2603 } else if (cmd_match(buf, "write_error")) {
2604 set_bit(WriteErrorSeen, &rdev->flags);
2605 err = 0;
2606 } else if (cmd_match(buf, "-write_error")) {
2607 clear_bit(WriteErrorSeen, &rdev->flags);
2608 err = 0;
2420 } 2609 }
2421 if (!err) 2610 if (!err)
2422 sysfs_notify_dirent_safe(rdev->sysfs_state); 2611 sysfs_notify_dirent_safe(rdev->sysfs_state);
@@ -2459,7 +2648,6 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2459{ 2648{
2460 char *e; 2649 char *e;
2461 int err; 2650 int err;
2462 char nm[20];
2463 int slot = simple_strtoul(buf, &e, 10); 2651 int slot = simple_strtoul(buf, &e, 10);
2464 if (strncmp(buf, "none", 4)==0) 2652 if (strncmp(buf, "none", 4)==0)
2465 slot = -1; 2653 slot = -1;
@@ -2482,8 +2670,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2482 hot_remove_disk(rdev->mddev, rdev->raid_disk); 2670 hot_remove_disk(rdev->mddev, rdev->raid_disk);
2483 if (err) 2671 if (err)
2484 return err; 2672 return err;
2485 sprintf(nm, "rd%d", rdev->raid_disk); 2673 sysfs_unlink_rdev(rdev->mddev, rdev);
2486 sysfs_remove_link(&rdev->mddev->kobj, nm);
2487 rdev->raid_disk = -1; 2674 rdev->raid_disk = -1;
2488 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); 2675 set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
2489 md_wakeup_thread(rdev->mddev->thread); 2676 md_wakeup_thread(rdev->mddev->thread);
@@ -2522,8 +2709,7 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
2522 return err; 2709 return err;
2523 } else 2710 } else
2524 sysfs_notify_dirent_safe(rdev->sysfs_state); 2711 sysfs_notify_dirent_safe(rdev->sysfs_state);
2525 sprintf(nm, "rd%d", rdev->raid_disk); 2712 if (sysfs_link_rdev(rdev->mddev, rdev))
2526 if (sysfs_create_link(&rdev->mddev->kobj, &rdev->kobj, nm))
2527 /* failure here is OK */; 2713 /* failure here is OK */;
2528 /* don't wakeup anyone, leave that to userspace. */ 2714 /* don't wakeup anyone, leave that to userspace. */
2529 } else { 2715 } else {
@@ -2712,6 +2898,39 @@ static ssize_t recovery_start_store(mdk_rdev_t *rdev, const char *buf, size_t le
2712static struct rdev_sysfs_entry rdev_recovery_start = 2898static struct rdev_sysfs_entry rdev_recovery_start =
2713__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store); 2899__ATTR(recovery_start, S_IRUGO|S_IWUSR, recovery_start_show, recovery_start_store);
2714 2900
2901
2902static ssize_t
2903badblocks_show(struct badblocks *bb, char *page, int unack);
2904static ssize_t
2905badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack);
2906
2907static ssize_t bb_show(mdk_rdev_t *rdev, char *page)
2908{
2909 return badblocks_show(&rdev->badblocks, page, 0);
2910}
2911static ssize_t bb_store(mdk_rdev_t *rdev, const char *page, size_t len)
2912{
2913 int rv = badblocks_store(&rdev->badblocks, page, len, 0);
2914 /* Maybe that ack was all we needed */
2915 if (test_and_clear_bit(BlockedBadBlocks, &rdev->flags))
2916 wake_up(&rdev->blocked_wait);
2917 return rv;
2918}
2919static struct rdev_sysfs_entry rdev_bad_blocks =
2920__ATTR(bad_blocks, S_IRUGO|S_IWUSR, bb_show, bb_store);
2921
2922
2923static ssize_t ubb_show(mdk_rdev_t *rdev, char *page)
2924{
2925 return badblocks_show(&rdev->badblocks, page, 1);
2926}
2927static ssize_t ubb_store(mdk_rdev_t *rdev, const char *page, size_t len)
2928{
2929 return badblocks_store(&rdev->badblocks, page, len, 1);
2930}
2931static struct rdev_sysfs_entry rdev_unack_bad_blocks =
2932__ATTR(unacknowledged_bad_blocks, S_IRUGO|S_IWUSR, ubb_show, ubb_store);
2933
2715static struct attribute *rdev_default_attrs[] = { 2934static struct attribute *rdev_default_attrs[] = {
2716 &rdev_state.attr, 2935 &rdev_state.attr,
2717 &rdev_errors.attr, 2936 &rdev_errors.attr,
@@ -2719,6 +2938,8 @@ static struct attribute *rdev_default_attrs[] = {
2719 &rdev_offset.attr, 2938 &rdev_offset.attr,
2720 &rdev_size.attr, 2939 &rdev_size.attr,
2721 &rdev_recovery_start.attr, 2940 &rdev_recovery_start.attr,
2941 &rdev_bad_blocks.attr,
2942 &rdev_unack_bad_blocks.attr,
2722 NULL, 2943 NULL,
2723}; 2944};
2724static ssize_t 2945static ssize_t
@@ -2782,7 +3003,7 @@ static struct kobj_type rdev_ktype = {
2782 .default_attrs = rdev_default_attrs, 3003 .default_attrs = rdev_default_attrs,
2783}; 3004};
2784 3005
2785void md_rdev_init(mdk_rdev_t *rdev) 3006int md_rdev_init(mdk_rdev_t *rdev)
2786{ 3007{
2787 rdev->desc_nr = -1; 3008 rdev->desc_nr = -1;
2788 rdev->saved_raid_disk = -1; 3009 rdev->saved_raid_disk = -1;
@@ -2792,12 +3013,27 @@ void md_rdev_init(mdk_rdev_t *rdev)
2792 rdev->sb_events = 0; 3013 rdev->sb_events = 0;
2793 rdev->last_read_error.tv_sec = 0; 3014 rdev->last_read_error.tv_sec = 0;
2794 rdev->last_read_error.tv_nsec = 0; 3015 rdev->last_read_error.tv_nsec = 0;
3016 rdev->sb_loaded = 0;
3017 rdev->bb_page = NULL;
2795 atomic_set(&rdev->nr_pending, 0); 3018 atomic_set(&rdev->nr_pending, 0);
2796 atomic_set(&rdev->read_errors, 0); 3019 atomic_set(&rdev->read_errors, 0);
2797 atomic_set(&rdev->corrected_errors, 0); 3020 atomic_set(&rdev->corrected_errors, 0);
2798 3021
2799 INIT_LIST_HEAD(&rdev->same_set); 3022 INIT_LIST_HEAD(&rdev->same_set);
2800 init_waitqueue_head(&rdev->blocked_wait); 3023 init_waitqueue_head(&rdev->blocked_wait);
3024
3025 /* Add space to store bad block list.
3026 * This reserves the space even on arrays where it cannot
3027 * be used - I wonder if that matters
3028 */
3029 rdev->badblocks.count = 0;
3030 rdev->badblocks.shift = 0;
3031 rdev->badblocks.page = kmalloc(PAGE_SIZE, GFP_KERNEL);
3032 seqlock_init(&rdev->badblocks.lock);
3033 if (rdev->badblocks.page == NULL)
3034 return -ENOMEM;
3035
3036 return 0;
2801} 3037}
2802EXPORT_SYMBOL_GPL(md_rdev_init); 3038EXPORT_SYMBOL_GPL(md_rdev_init);
2803/* 3039/*
@@ -2823,8 +3059,11 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
2823 return ERR_PTR(-ENOMEM); 3059 return ERR_PTR(-ENOMEM);
2824 } 3060 }
2825 3061
2826 md_rdev_init(rdev); 3062 err = md_rdev_init(rdev);
2827 if ((err = alloc_disk_sb(rdev))) 3063 if (err)
3064 goto abort_free;
3065 err = alloc_disk_sb(rdev);
3066 if (err)
2828 goto abort_free; 3067 goto abort_free;
2829 3068
2830 err = lock_rdev(rdev, newdev, super_format == -2); 3069 err = lock_rdev(rdev, newdev, super_format == -2);
@@ -2860,15 +3099,17 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
2860 goto abort_free; 3099 goto abort_free;
2861 } 3100 }
2862 } 3101 }
3102 if (super_format == -1)
3103 /* hot-add for 0.90, or non-persistent: so no badblocks */
3104 rdev->badblocks.shift = -1;
2863 3105
2864 return rdev; 3106 return rdev;
2865 3107
2866abort_free: 3108abort_free:
2867 if (rdev->sb_page) { 3109 if (rdev->bdev)
2868 if (rdev->bdev) 3110 unlock_rdev(rdev);
2869 unlock_rdev(rdev); 3111 free_disk_sb(rdev);
2870 free_disk_sb(rdev); 3112 kfree(rdev->badblocks.page);
2871 }
2872 kfree(rdev); 3113 kfree(rdev);
2873 return ERR_PTR(err); 3114 return ERR_PTR(err);
2874} 3115}
@@ -3149,15 +3390,13 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
3149 } 3390 }
3150 3391
3151 list_for_each_entry(rdev, &mddev->disks, same_set) { 3392 list_for_each_entry(rdev, &mddev->disks, same_set) {
3152 char nm[20];
3153 if (rdev->raid_disk < 0) 3393 if (rdev->raid_disk < 0)
3154 continue; 3394 continue;
3155 if (rdev->new_raid_disk >= mddev->raid_disks) 3395 if (rdev->new_raid_disk >= mddev->raid_disks)
3156 rdev->new_raid_disk = -1; 3396 rdev->new_raid_disk = -1;
3157 if (rdev->new_raid_disk == rdev->raid_disk) 3397 if (rdev->new_raid_disk == rdev->raid_disk)
3158 continue; 3398 continue;
3159 sprintf(nm, "rd%d", rdev->raid_disk); 3399 sysfs_unlink_rdev(mddev, rdev);
3160 sysfs_remove_link(&mddev->kobj, nm);
3161 } 3400 }
3162 list_for_each_entry(rdev, &mddev->disks, same_set) { 3401 list_for_each_entry(rdev, &mddev->disks, same_set) {
3163 if (rdev->raid_disk < 0) 3402 if (rdev->raid_disk < 0)
@@ -3168,11 +3407,10 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
3168 if (rdev->raid_disk < 0) 3407 if (rdev->raid_disk < 0)
3169 clear_bit(In_sync, &rdev->flags); 3408 clear_bit(In_sync, &rdev->flags);
3170 else { 3409 else {
3171 char nm[20]; 3410 if (sysfs_link_rdev(mddev, rdev))
3172 sprintf(nm, "rd%d", rdev->raid_disk); 3411 printk(KERN_WARNING "md: cannot register rd%d"
3173 if(sysfs_create_link(&mddev->kobj, &rdev->kobj, nm)) 3412 " for %s after level change\n",
3174 printk("md: cannot register %s for %s after level change\n", 3413 rdev->raid_disk, mdname(mddev));
3175 nm, mdname(mddev));
3176 } 3414 }
3177 } 3415 }
3178 3416
@@ -4504,7 +4742,8 @@ int md_run(mddev_t *mddev)
4504 } 4742 }
4505 4743
4506 if (mddev->bio_set == NULL) 4744 if (mddev->bio_set == NULL)
4507 mddev->bio_set = bioset_create(BIO_POOL_SIZE, sizeof(mddev)); 4745 mddev->bio_set = bioset_create(BIO_POOL_SIZE,
4746 sizeof(mddev_t *));
4508 4747
4509 spin_lock(&pers_lock); 4748 spin_lock(&pers_lock);
4510 pers = find_pers(mddev->level, mddev->clevel); 4749 pers = find_pers(mddev->level, mddev->clevel);
@@ -4621,12 +4860,9 @@ int md_run(mddev_t *mddev)
4621 smp_wmb(); 4860 smp_wmb();
4622 mddev->ready = 1; 4861 mddev->ready = 1;
4623 list_for_each_entry(rdev, &mddev->disks, same_set) 4862 list_for_each_entry(rdev, &mddev->disks, same_set)
4624 if (rdev->raid_disk >= 0) { 4863 if (rdev->raid_disk >= 0)
4625 char nm[20]; 4864 if (sysfs_link_rdev(mddev, rdev))
4626 sprintf(nm, "rd%d", rdev->raid_disk);
4627 if (sysfs_create_link(&mddev->kobj, &rdev->kobj, nm))
4628 /* failure here is OK */; 4865 /* failure here is OK */;
4629 }
4630 4866
4631 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 4867 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
4632 4868
@@ -4854,11 +5090,8 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
4854 sysfs_notify_dirent_safe(mddev->sysfs_state); 5090 sysfs_notify_dirent_safe(mddev->sysfs_state);
4855 5091
4856 list_for_each_entry(rdev, &mddev->disks, same_set) 5092 list_for_each_entry(rdev, &mddev->disks, same_set)
4857 if (rdev->raid_disk >= 0) { 5093 if (rdev->raid_disk >= 0)
4858 char nm[20]; 5094 sysfs_unlink_rdev(mddev, rdev);
4859 sprintf(nm, "rd%d", rdev->raid_disk);
4860 sysfs_remove_link(&mddev->kobj, nm);
4861 }
4862 5095
4863 set_capacity(disk, 0); 5096 set_capacity(disk, 0);
4864 mutex_unlock(&mddev->open_mutex); 5097 mutex_unlock(&mddev->open_mutex);
@@ -6198,18 +6431,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
6198 if (!rdev || test_bit(Faulty, &rdev->flags)) 6431 if (!rdev || test_bit(Faulty, &rdev->flags))
6199 return; 6432 return;
6200 6433
6201 if (mddev->external) 6434 if (!mddev->pers || !mddev->pers->error_handler)
6202 set_bit(Blocked, &rdev->flags);
6203/*
6204 dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
6205 mdname(mddev),
6206 MAJOR(rdev->bdev->bd_dev), MINOR(rdev->bdev->bd_dev),
6207 __builtin_return_address(0),__builtin_return_address(1),
6208 __builtin_return_address(2),__builtin_return_address(3));
6209*/
6210 if (!mddev->pers)
6211 return;
6212 if (!mddev->pers->error_handler)
6213 return; 6435 return;
6214 mddev->pers->error_handler(mddev,rdev); 6436 mddev->pers->error_handler(mddev,rdev);
6215 if (mddev->degraded) 6437 if (mddev->degraded)
@@ -6933,11 +7155,14 @@ void md_do_sync(mddev_t *mddev)
6933 atomic_add(sectors, &mddev->recovery_active); 7155 atomic_add(sectors, &mddev->recovery_active);
6934 } 7156 }
6935 7157
7158 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7159 break;
7160
6936 j += sectors; 7161 j += sectors;
6937 if (j>1) mddev->curr_resync = j; 7162 if (j>1) mddev->curr_resync = j;
6938 mddev->curr_mark_cnt = io_sectors; 7163 mddev->curr_mark_cnt = io_sectors;
6939 if (last_check == 0) 7164 if (last_check == 0)
6940 /* this is the earliers that rebuilt will be 7165 /* this is the earliest that rebuild will be
6941 * visible in /proc/mdstat 7166 * visible in /proc/mdstat
6942 */ 7167 */
6943 md_new_event(mddev); 7168 md_new_event(mddev);
@@ -6946,10 +7171,6 @@ void md_do_sync(mddev_t *mddev)
6946 continue; 7171 continue;
6947 7172
6948 last_check = io_sectors; 7173 last_check = io_sectors;
6949
6950 if (test_bit(MD_RECOVERY_INTR, &mddev->recovery))
6951 break;
6952
6953 repeat: 7174 repeat:
6954 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) { 7175 if (time_after_eq(jiffies, mark[last_mark] + SYNC_MARK_STEP )) {
6955 /* step marks */ 7176 /* step marks */
@@ -7067,29 +7288,23 @@ static int remove_and_add_spares(mddev_t *mddev)
7067 atomic_read(&rdev->nr_pending)==0) { 7288 atomic_read(&rdev->nr_pending)==0) {
7068 if (mddev->pers->hot_remove_disk( 7289 if (mddev->pers->hot_remove_disk(
7069 mddev, rdev->raid_disk)==0) { 7290 mddev, rdev->raid_disk)==0) {
7070 char nm[20]; 7291 sysfs_unlink_rdev(mddev, rdev);
7071 sprintf(nm,"rd%d", rdev->raid_disk);
7072 sysfs_remove_link(&mddev->kobj, nm);
7073 rdev->raid_disk = -1; 7292 rdev->raid_disk = -1;
7074 } 7293 }
7075 } 7294 }
7076 7295
7077 if (mddev->degraded && !mddev->recovery_disabled) { 7296 if (mddev->degraded) {
7078 list_for_each_entry(rdev, &mddev->disks, same_set) { 7297 list_for_each_entry(rdev, &mddev->disks, same_set) {
7079 if (rdev->raid_disk >= 0 && 7298 if (rdev->raid_disk >= 0 &&
7080 !test_bit(In_sync, &rdev->flags) && 7299 !test_bit(In_sync, &rdev->flags) &&
7081 !test_bit(Faulty, &rdev->flags) && 7300 !test_bit(Faulty, &rdev->flags))
7082 !test_bit(Blocked, &rdev->flags))
7083 spares++; 7301 spares++;
7084 if (rdev->raid_disk < 0 7302 if (rdev->raid_disk < 0
7085 && !test_bit(Faulty, &rdev->flags)) { 7303 && !test_bit(Faulty, &rdev->flags)) {
7086 rdev->recovery_offset = 0; 7304 rdev->recovery_offset = 0;
7087 if (mddev->pers-> 7305 if (mddev->pers->
7088 hot_add_disk(mddev, rdev) == 0) { 7306 hot_add_disk(mddev, rdev) == 0) {
7089 char nm[20]; 7307 if (sysfs_link_rdev(mddev, rdev))
7090 sprintf(nm, "rd%d", rdev->raid_disk);
7091 if (sysfs_create_link(&mddev->kobj,
7092 &rdev->kobj, nm))
7093 /* failure here is OK */; 7308 /* failure here is OK */;
7094 spares++; 7309 spares++;
7095 md_new_event(mddev); 7310 md_new_event(mddev);
@@ -7138,6 +7353,8 @@ static void reap_sync_thread(mddev_t *mddev)
7138 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); 7353 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
7139 sysfs_notify_dirent_safe(mddev->sysfs_action); 7354 sysfs_notify_dirent_safe(mddev->sysfs_action);
7140 md_new_event(mddev); 7355 md_new_event(mddev);
7356 if (mddev->event_work.func)
7357 queue_work(md_misc_wq, &mddev->event_work);
7141} 7358}
7142 7359
7143/* 7360/*
@@ -7170,9 +7387,6 @@ void md_check_recovery(mddev_t *mddev)
7170 if (mddev->bitmap) 7387 if (mddev->bitmap)
7171 bitmap_daemon_work(mddev); 7388 bitmap_daemon_work(mddev);
7172 7389
7173 if (mddev->ro)
7174 return;
7175
7176 if (signal_pending(current)) { 7390 if (signal_pending(current)) {
7177 if (mddev->pers->sync_request && !mddev->external) { 7391 if (mddev->pers->sync_request && !mddev->external) {
7178 printk(KERN_INFO "md: %s in immediate safe mode\n", 7392 printk(KERN_INFO "md: %s in immediate safe mode\n",
@@ -7209,9 +7423,7 @@ void md_check_recovery(mddev_t *mddev)
7209 atomic_read(&rdev->nr_pending)==0) { 7423 atomic_read(&rdev->nr_pending)==0) {
7210 if (mddev->pers->hot_remove_disk( 7424 if (mddev->pers->hot_remove_disk(
7211 mddev, rdev->raid_disk)==0) { 7425 mddev, rdev->raid_disk)==0) {
7212 char nm[20]; 7426 sysfs_unlink_rdev(mddev, rdev);
7213 sprintf(nm,"rd%d", rdev->raid_disk);
7214 sysfs_remove_link(&mddev->kobj, nm);
7215 rdev->raid_disk = -1; 7427 rdev->raid_disk = -1;
7216 } 7428 }
7217 } 7429 }
@@ -7331,12 +7543,499 @@ void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
7331{ 7543{
7332 sysfs_notify_dirent_safe(rdev->sysfs_state); 7544 sysfs_notify_dirent_safe(rdev->sysfs_state);
7333 wait_event_timeout(rdev->blocked_wait, 7545 wait_event_timeout(rdev->blocked_wait,
7334 !test_bit(Blocked, &rdev->flags), 7546 !test_bit(Blocked, &rdev->flags) &&
7547 !test_bit(BlockedBadBlocks, &rdev->flags),
7335 msecs_to_jiffies(5000)); 7548 msecs_to_jiffies(5000));
7336 rdev_dec_pending(rdev, mddev); 7549 rdev_dec_pending(rdev, mddev);
7337} 7550}
7338EXPORT_SYMBOL(md_wait_for_blocked_rdev); 7551EXPORT_SYMBOL(md_wait_for_blocked_rdev);
7339 7552
7553
7554/* Bad block management.
7555 * We can record which blocks on each device are 'bad' and so just
7556 * fail those blocks, or that stripe, rather than the whole device.
7557 * Entries in the bad-block table are 64bits wide. This comprises:
7558 * Length of bad-range, in sectors: 0-511 for lengths 1-512
7559 * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
7560 * A 'shift' can be set so that larger blocks are tracked and
7561 * consequently larger devices can be covered.
7562 * 'Acknowledged' flag - 1 bit. - the most significant bit.
7563 *
7564 * Locking of the bad-block table uses a seqlock so md_is_badblock
7565 * might need to retry if it is very unlucky.
7566 * We will sometimes want to check for bad blocks in a bi_end_io function,
7567 * so we use the write_seqlock_irq variant.
7568 *
7569 * When looking for a bad block we specify a range and want to
7570 * know if any block in the range is bad. So we binary-search
7571 * to the last range that starts at-or-before the given endpoint,
7572 * (or "before the sector after the target range")
7573 * then see if it ends after the given start.
7574 * We return
7575 * 0 if there are no known bad blocks in the range
7576 * 1 if there are known bad block which are all acknowledged
7577 * -1 if there are bad blocks which have not yet been acknowledged in metadata.
7578 * plus the start/length of the first bad section we overlap.
7579 */
7580int md_is_badblock(struct badblocks *bb, sector_t s, int sectors,
7581 sector_t *first_bad, int *bad_sectors)
7582{
7583 int hi;
7584 int lo = 0;
7585 u64 *p = bb->page;
7586 int rv = 0;
7587 sector_t target = s + sectors;
7588 unsigned seq;
7589
7590 if (bb->shift > 0) {
7591 /* round the start down, and the end up */
7592 s >>= bb->shift;
7593 target += (1<<bb->shift) - 1;
7594 target >>= bb->shift;
7595 sectors = target - s;
7596 }
7597 /* 'target' is now the first block after the bad range */
7598
7599retry:
7600 seq = read_seqbegin(&bb->lock);
7601
7602 hi = bb->count;
7603
7604 /* Binary search between lo and hi for 'target'
7605 * i.e. for the last range that starts before 'target'
7606 */
7607 /* INVARIANT: ranges before 'lo' and at-or-after 'hi'
7608 * are known not to be the last range before target.
7609 * VARIANT: hi-lo is the number of possible
7610 * ranges, and decreases until it reaches 1
7611 */
7612 while (hi - lo > 1) {
7613 int mid = (lo + hi) / 2;
7614 sector_t a = BB_OFFSET(p[mid]);
7615 if (a < target)
7616 /* This could still be the one, earlier ranges
7617 * could not. */
7618 lo = mid;
7619 else
7620 /* This and later ranges are definitely out. */
7621 hi = mid;
7622 }
7623 /* 'lo' might be the last that started before target, but 'hi' isn't */
7624 if (hi > lo) {
7625 /* need to check all range that end after 's' to see if
7626 * any are unacknowledged.
7627 */
7628 while (lo >= 0 &&
7629 BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
7630 if (BB_OFFSET(p[lo]) < target) {
7631 /* starts before the end, and finishes after
7632 * the start, so they must overlap
7633 */
7634 if (rv != -1 && BB_ACK(p[lo]))
7635 rv = 1;
7636 else
7637 rv = -1;
7638 *first_bad = BB_OFFSET(p[lo]);
7639 *bad_sectors = BB_LEN(p[lo]);
7640 }
7641 lo--;
7642 }
7643 }
7644
7645 if (read_seqretry(&bb->lock, seq))
7646 goto retry;
7647
7648 return rv;
7649}
7650EXPORT_SYMBOL_GPL(md_is_badblock);
7651
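The table format described in the comment above, and the binary search used by md_is_badblock and md_set_badblocks, can be sketched in isolation. The BB_* helpers below are reconstructions of what the comment describes (9 bits of length-minus-one, 54 bits of start sector, acknowledged flag in the top bit) rather than the real md.h macros, and last_before() is a simplified stand-in for the search loop, so treat the whole block as an assumption-laden illustration.

#include <stdio.h>
#include <stdint.h>

#define BB_MAKE(a, l, ack) (((uint64_t)(a) << 9) | ((uint64_t)(l) - 1) | ((uint64_t)!!(ack) << 63))
#define BB_OFFSET(x)       (((x) >> 9) & ((1ULL << 54) - 1))
#define BB_LEN(x)          ((int)((x) & 0x1ff) + 1)
#define BB_ACK(x)          ((int)((x) >> 63))

/* index of the last range that starts before 'target', or -1 if there is none */
static int last_before(const uint64_t *p, int count, uint64_t target)
{
    int lo = 0, hi = count;

    if (count == 0 || BB_OFFSET(p[0]) >= target)
        return -1;
    while (hi - lo > 1) {
        int mid = (lo + hi) / 2;
        if (BB_OFFSET(p[mid]) < target)
            lo = mid;        /* could still be the last one before target */
        else
            hi = mid;        /* this and later ranges start too late */
    }
    return lo;
}

int main(void)
{
    uint64_t p[3] = { BB_MAKE(100, 8, 1), BB_MAKE(200, 16, 0), BB_MAKE(300, 4, 1) };
    int idx = last_before(p, 3, 210);        /* look for ranges touching 200..209 */

    printf("idx=%d start=%llu len=%d ack=%d\n", idx,
           (unsigned long long)BB_OFFSET(p[idx]), BB_LEN(p[idx]), BB_ACK(p[idx]));
    return 0;
}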
7652/*
7653 * Add a range of bad blocks to the table.
7654 * This might extend the table, or might contract it
7655 * if two adjacent ranges can be merged.
7656 * We binary-search to find the 'insertion' point, then
7657 * decide how best to handle it.
7658 */
7659static int md_set_badblocks(struct badblocks *bb, sector_t s, int sectors,
7660 int acknowledged)
7661{
7662 u64 *p;
7663 int lo, hi;
7664 int rv = 1;
7665
7666 if (bb->shift < 0)
7667 /* badblocks are disabled */
7668 return 0;
7669
7670 if (bb->shift) {
7671 /* round the start down, and the end up */
7672 sector_t next = s + sectors;
7673 s >>= bb->shift;
7674 next += (1<<bb->shift) - 1;
7675 next >>= bb->shift;
7676 sectors = next - s;
7677 }
7678
7679 write_seqlock_irq(&bb->lock);
7680
7681 p = bb->page;
7682 lo = 0;
7683 hi = bb->count;
7684 /* Find the last range that starts at-or-before 's' */
7685 while (hi - lo > 1) {
7686 int mid = (lo + hi) / 2;
7687 sector_t a = BB_OFFSET(p[mid]);
7688 if (a <= s)
7689 lo = mid;
7690 else
7691 hi = mid;
7692 }
7693 if (hi > lo && BB_OFFSET(p[lo]) > s)
7694 hi = lo;
7695
7696 if (hi > lo) {
7697 /* we found a range that might merge with the start
7698 * of our new range
7699 */
7700 sector_t a = BB_OFFSET(p[lo]);
7701 sector_t e = a + BB_LEN(p[lo]);
7702 int ack = BB_ACK(p[lo]);
7703 if (e >= s) {
7704 /* Yes, we can merge with a previous range */
7705 if (s == a && s + sectors >= e)
7706 /* new range covers old */
7707 ack = acknowledged;
7708 else
7709 ack = ack && acknowledged;
7710
7711 if (e < s + sectors)
7712 e = s + sectors;
7713 if (e - a <= BB_MAX_LEN) {
7714 p[lo] = BB_MAKE(a, e-a, ack);
7715 s = e;
7716 } else {
7717 /* does not all fit in one range,
7718 * make p[lo] maximal
7719 */
7720 if (BB_LEN(p[lo]) != BB_MAX_LEN)
7721 p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
7722 s = a + BB_MAX_LEN;
7723 }
7724 sectors = e - s;
7725 }
7726 }
7727 if (sectors && hi < bb->count) {
7728 /* 'hi' points to the first range that starts after 's'.
7729 * Maybe we can merge with the start of that range */
7730 sector_t a = BB_OFFSET(p[hi]);
7731 sector_t e = a + BB_LEN(p[hi]);
7732 int ack = BB_ACK(p[hi]);
7733 if (a <= s + sectors) {
7734 /* merging is possible */
7735 if (e <= s + sectors) {
7736 /* full overlap */
7737 e = s + sectors;
7738 ack = acknowledged;
7739 } else
7740 ack = ack && acknowledged;
7741
7742 a = s;
7743 if (e - a <= BB_MAX_LEN) {
7744 p[hi] = BB_MAKE(a, e-a, ack);
7745 s = e;
7746 } else {
7747 p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
7748 s = a + BB_MAX_LEN;
7749 }
7750 sectors = e - s;
7751 lo = hi;
7752 hi++;
7753 }
7754 }
7755 if (sectors == 0 && hi < bb->count) {
7756 /* we might be able to combine lo and hi */
7757 /* Note: 's' is at the end of 'lo' */
7758 sector_t a = BB_OFFSET(p[hi]);
7759 int lolen = BB_LEN(p[lo]);
7760 int hilen = BB_LEN(p[hi]);
7761 int newlen = lolen + hilen - (s - a);
7762 if (s >= a && newlen < BB_MAX_LEN) {
7763 /* yes, we can combine them */
7764 int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
7765 p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
7766 memmove(p + hi, p + hi + 1,
7767 (bb->count - hi - 1) * 8);
7768 bb->count--;
7769 }
7770 }
7771 while (sectors) {
7772 /* didn't merge (it all).
7773 * Need to add a range just before 'hi' */
7774 if (bb->count >= MD_MAX_BADBLOCKS) {
7775 /* No room for more */
7776 rv = 0;
7777 break;
7778 } else {
7779 int this_sectors = sectors;
7780 memmove(p + hi + 1, p + hi,
7781 (bb->count - hi) * 8);
7782 bb->count++;
7783
7784 if (this_sectors > BB_MAX_LEN)
7785 this_sectors = BB_MAX_LEN;
7786 p[hi] = BB_MAKE(s, this_sectors, acknowledged);
7787 sectors -= this_sectors;
7788 s += this_sectors;
7789 }
7790 }
7791
7792 bb->changed = 1;
7793 if (!acknowledged)
7794 bb->unacked_exist = 1;
7795 write_sequnlock_irq(&bb->lock);
7796
7797 return rv;
7798}
7799
7800int rdev_set_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors,
7801 int acknowledged)
7802{
7803 int rv = md_set_badblocks(&rdev->badblocks,
7804 s + rdev->data_offset, sectors, acknowledged);
7805 if (rv) {
7806 /* Make sure they get written out promptly */
7807 set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
7808 md_wakeup_thread(rdev->mddev->thread);
7809 }
7810 return rv;
7811}
7812EXPORT_SYMBOL_GPL(rdev_set_badblocks);
7813
7814/*
7815 * Remove a range of bad blocks from the table.
7816 * This may involve extending the table if we spilt a region,
7817 * but it must not fail. So if the table becomes full, we just
7818 * drop the remove request.
7819 */
7820static int md_clear_badblocks(struct badblocks *bb, sector_t s, int sectors)
7821{
7822 u64 *p;
7823 int lo, hi;
7824 sector_t target = s + sectors;
7825 int rv = 0;
7826
7827 if (bb->shift > 0) {
7828 /* When clearing we round the start up and the end down.
7829 * This should not matter as the shift should align with
7830 * the block size and no rounding should ever be needed.
7831 * However it is better the think a block is bad when it
7832 * isn't than to think a block is not bad when it is.
7833 */
7834 s += (1<<bb->shift) - 1;
7835 s >>= bb->shift;
7836 target >>= bb->shift;
7837 sectors = target - s;
7838 }
7839
7840 write_seqlock_irq(&bb->lock);
7841
7842 p = bb->page;
7843 lo = 0;
7844 hi = bb->count;
7845 /* Find the last range that starts before 'target' */
7846 while (hi - lo > 1) {
7847 int mid = (lo + hi) / 2;
7848 sector_t a = BB_OFFSET(p[mid]);
7849 if (a < target)
7850 lo = mid;
7851 else
7852 hi = mid;
7853 }
7854 if (hi > lo) {
7855 /* p[lo] is the last range that could overlap the
7856 * current range. Earlier ranges could also overlap,
7857 * but only this one can overlap the end of the range.
7858 */
7859 if (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) {
7860 /* Partial overlap, leave the tail of this range */
7861 int ack = BB_ACK(p[lo]);
7862 sector_t a = BB_OFFSET(p[lo]);
7863 sector_t end = a + BB_LEN(p[lo]);
7864
7865 if (a < s) {
7866 /* we need to split this range */
7867 if (bb->count >= MD_MAX_BADBLOCKS) {
7868 rv = 0;
7869 goto out;
7870 }
7871 memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
7872 bb->count++;
7873 p[lo] = BB_MAKE(a, s-a, ack);
7874 lo++;
7875 }
7876 p[lo] = BB_MAKE(target, end - target, ack);
7877 /* there is no longer an overlap */
7878 hi = lo;
7879 lo--;
7880 }
7881 while (lo >= 0 &&
7882 BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
7883 /* This range does overlap */
7884 if (BB_OFFSET(p[lo]) < s) {
7885 /* Keep the early parts of this range. */
7886 int ack = BB_ACK(p[lo]);
7887 sector_t start = BB_OFFSET(p[lo]);
7888 p[lo] = BB_MAKE(start, s - start, ack);
7889 /* now low doesn't overlap, so.. */
7890 break;
7891 }
7892 lo--;
7893 }
7894 /* 'lo' is strictly before, 'hi' is strictly after,
7895 * anything between needs to be discarded
7896 */
7897 if (hi - lo > 1) {
7898 memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
7899 bb->count -= (hi - lo - 1);
7900 }
7901 }
7902
7903 bb->changed = 1;
7904out:
7905 write_sequnlock_irq(&bb->lock);
7906 return rv;
7907}
7908
7909int rdev_clear_badblocks(mdk_rdev_t *rdev, sector_t s, int sectors)
7910{
7911 return md_clear_badblocks(&rdev->badblocks,
7912 s + rdev->data_offset,
7913 sectors);
7914}
7915EXPORT_SYMBOL_GPL(rdev_clear_badblocks);
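The split case in md_clear_badblocks is easiest to see with concrete numbers: clearing the middle of one recorded range leaves a head entry and a tail entry, which is why the table can grow while clearing. Below is a tiny model of just that arithmetic; the struct and names are illustrative, not taken from md.c.

#include <stdio.h>

struct range { unsigned long long start; int len; };

/* clear [s, s+sectors) out of one existing range, producing 0, 1 or 2 remaining parts */
static int clear_middle(struct range in, unsigned long long s, int sectors,
                        struct range out[2])
{
    unsigned long long target = s + sectors;
    int n = 0;

    if (in.start < s)                         /* keep the head, as the kernel code does */
        out[n++] = (struct range){ in.start, (int)(s - in.start) };
    if (in.start + in.len > target)           /* keep the tail */
        out[n++] = (struct range){ target, (int)(in.start + in.len - target) };
    return n;
}

int main(void)
{
    struct range out[2];
    int i, n = clear_middle((struct range){ 100, 100 }, 120, 30, out);

    for (i = 0; i < n; i++)                   /* expect 100+20 and 150+50 */
        printf("range %d: start=%llu len=%d\n", i, out[i].start, out[i].len);
    return 0;
}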
7916
7917/*
7918 * Acknowledge all bad blocks in a list.
7919 * This only succeeds if ->changed is clear. It is used by
7920 * in-kernel metadata updates
7921 */
7922void md_ack_all_badblocks(struct badblocks *bb)
7923{
7924 if (bb->page == NULL || bb->changed)
7925 /* no point even trying */
7926 return;
7927 write_seqlock_irq(&bb->lock);
7928
7929 if (bb->changed == 0) {
7930 u64 *p = bb->page;
7931 int i;
7932 for (i = 0; i < bb->count ; i++) {
7933 if (!BB_ACK(p[i])) {
7934 sector_t start = BB_OFFSET(p[i]);
7935 int len = BB_LEN(p[i]);
7936 p[i] = BB_MAKE(start, len, 1);
7937 }
7938 }
7939 bb->unacked_exist = 0;
7940 }
7941 write_sequnlock_irq(&bb->lock);
7942}
7943EXPORT_SYMBOL_GPL(md_ack_all_badblocks);
7944
7945/* sysfs access to bad-blocks list.
7946 * We present two files.
7947 * 'bad-blocks' lists sector numbers and lengths of ranges that
7948 * are recorded as bad. The list is truncated to fit within
7949 * the one-page limit of sysfs.
7950 * Writing "sector length" to this file adds an acknowledged
7951 * bad block list.
7952 * 'unacknowledged-bad-blocks' lists bad blocks that have not yet
7953 * been acknowledged. Writing to this file adds bad blocks
7954 * without acknowledging them. This is largely for testing.
7955 */
7956
7957static ssize_t
7958badblocks_show(struct badblocks *bb, char *page, int unack)
7959{
7960 size_t len;
7961 int i;
7962 u64 *p = bb->page;
7963 unsigned seq;
7964
7965 if (bb->shift < 0)
7966 return 0;
7967
7968retry:
7969 seq = read_seqbegin(&bb->lock);
7970
7971 len = 0;
7972 i = 0;
7973
7974 while (len < PAGE_SIZE && i < bb->count) {
7975 sector_t s = BB_OFFSET(p[i]);
7976 unsigned int length = BB_LEN(p[i]);
7977 int ack = BB_ACK(p[i]);
7978 i++;
7979
7980 if (unack && ack)
7981 continue;
7982
7983 len += snprintf(page+len, PAGE_SIZE-len, "%llu %u\n",
7984 (unsigned long long)s << bb->shift,
7985 length << bb->shift);
7986 }
7987 if (unack && len == 0)
7988 bb->unacked_exist = 0;
7989
7990 if (read_seqretry(&bb->lock, seq))
7991 goto retry;
7992
7993 return len;
7994}
7995
7996#define DO_DEBUG 1
7997
7998static ssize_t
7999badblocks_store(struct badblocks *bb, const char *page, size_t len, int unack)
8000{
8001 unsigned long long sector;
8002 int length;
8003 char newline;
8004#ifdef DO_DEBUG
8005 /* Allow clearing via sysfs *only* for testing/debugging.
8006 * Normally only a successful write may clear a badblock
8007 */
8008 int clear = 0;
8009 if (page[0] == '-') {
8010 clear = 1;
8011 page++;
8012 }
8013#endif /* DO_DEBUG */
8014
8015 switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) {
8016 case 3:
8017 if (newline != '\n')
8018 return -EINVAL;
8019 case 2:
8020 if (length <= 0)
8021 return -EINVAL;
8022 break;
8023 default:
8024 return -EINVAL;
8025 }
8026
8027#ifdef DO_DEBUG
8028 if (clear) {
8029 md_clear_badblocks(bb, sector, length);
8030 return len;
8031 }
8032#endif /* DO_DEBUG */
8033 if (md_set_badblocks(bb, sector, length, !unack))
8034 return len;
8035 else
8036 return -ENOSPC;
8037}
8038
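The sscanf() pattern above accepts "sector length", optionally newline-terminated, and rejects trailing junk by capturing one extra character. A stand-alone demonstration of the same validation follows; parse() is a hypothetical wrapper, not a function from md.c.

#include <stdio.h>

static int parse(const char *page)
{
    unsigned long long sector;
    int length;
    char newline;

    switch (sscanf(page, "%llu %d%c", &sector, &length, &newline)) {
    case 3:
        if (newline != '\n')
            return -1;
        /* fall through */
    case 2:
        return length <= 0 ? -1 : 0;
    default:
        return -1;
    }
}

int main(void)
{
    /* expect: 0 (valid), -1 (trailing junk), -1 (missing length) */
    printf("%d %d %d\n", parse("100 8\n"), parse("100 8 junk"), parse("100"));
    return 0;
}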
7340static int md_notify_reboot(struct notifier_block *this, 8039static int md_notify_reboot(struct notifier_block *this,
7341 unsigned long code, void *x) 8040 unsigned long code, void *x)
7342{ 8041{