diff options
author | Andrea Bastoni <bastoni@cs.unc.edu> | 2010-05-30 19:16:45 -0400 |
---|---|---|
committer | Andrea Bastoni <bastoni@cs.unc.edu> | 2010-05-30 19:16:45 -0400 |
commit | ada47b5fe13d89735805b566185f4885f5a3f750 (patch) | |
tree | 644b88f8a71896307d71438e9b3af49126ffb22b /drivers/md/raid10.c | |
parent | 43e98717ad40a4ae64545b5ba047c7b86aa44f4f (diff) | |
parent | 3280f21d43ee541f97f8cda5792150d2dbec20d5 (diff) |
Merge branch 'wip-2.6.34' into old-private-master [archived-private-master]
Diffstat (limited to 'drivers/md/raid10.c')
-rw-r--r-- | drivers/md/raid10.c | 145 |
1 files changed, 123 insertions, 22 deletions
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index c2cb7b87b440..e2766d8251a1 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c | |||
@@ -18,6 +18,7 @@ | |||
18 | * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. | 18 | * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. |
19 | */ | 19 | */ |
20 | 20 | ||
21 | #include <linux/slab.h> | ||
21 | #include <linux/delay.h> | 22 | #include <linux/delay.h> |
22 | #include <linux/blkdev.h> | 23 | #include <linux/blkdev.h> |
23 | #include <linux/seq_file.h> | 24 | #include <linux/seq_file.h> |
@@ -804,7 +805,7 @@ static int make_request(struct request_queue *q, struct bio * bio) | |||
804 | mdk_rdev_t *blocked_rdev; | 805 | mdk_rdev_t *blocked_rdev; |
805 | 806 | ||
806 | if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { | 807 | if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { |
807 | bio_endio(bio, -EOPNOTSUPP); | 808 | md_barrier_request(mddev, bio); |
808 | return 0; | 809 | return 0; |
809 | } | 810 | } |
810 | 811 | ||
@@ -1155,13 +1156,17 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1155 | 1156 | ||
1156 | disk_stack_limits(mddev->gendisk, rdev->bdev, | 1157 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
1157 | rdev->data_offset << 9); | 1158 | rdev->data_offset << 9); |
1158 | /* as we don't honour merge_bvec_fn, we must never risk | 1159 | /* as we don't honour merge_bvec_fn, we must |
1159 | * violating it, so limit ->max_sector to one PAGE, as | 1160 | * never risk violating it, so limit |
1160 | * a one page request is never in violation. | 1161 | * ->max_segments to one lying within a single |
1162 | * page, as a one page request is never in | ||
1163 | * violation. | ||
1161 | */ | 1164 | */ |
1162 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | 1165 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { |
1163 | queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) | 1166 | blk_queue_max_segments(mddev->queue, 1); |
1164 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | 1167 | blk_queue_segment_boundary(mddev->queue, |
1168 | PAGE_CACHE_SIZE - 1); | ||
1169 | } | ||
1165 | 1170 | ||
1166 | p->head_position = 0; | 1171 | p->head_position = 0; |
1167 | rdev->raid_disk = mirror; | 1172 | rdev->raid_disk = mirror; |
@@ -1432,6 +1437,43 @@ static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio) | |||
1432 | 1437 | ||
1433 | 1438 | ||
1434 | /* | 1439 | /* |
1440 | * Used by fix_read_error() to decay the per rdev read_errors. | ||
1441 | * We halve the read error count for every hour that has elapsed | ||
1442 | * since the last recorded read error. | ||
1443 | * | ||
1444 | */ | ||
1445 | static void check_decay_read_errors(mddev_t *mddev, mdk_rdev_t *rdev) | ||
1446 | { | ||
1447 | struct timespec cur_time_mon; | ||
1448 | unsigned long hours_since_last; | ||
1449 | unsigned int read_errors = atomic_read(&rdev->read_errors); | ||
1450 | |||
1451 | ktime_get_ts(&cur_time_mon); | ||
1452 | |||
1453 | if (rdev->last_read_error.tv_sec == 0 && | ||
1454 | rdev->last_read_error.tv_nsec == 0) { | ||
1455 | /* first time we've seen a read error */ | ||
1456 | rdev->last_read_error = cur_time_mon; | ||
1457 | return; | ||
1458 | } | ||
1459 | |||
1460 | hours_since_last = (cur_time_mon.tv_sec - | ||
1461 | rdev->last_read_error.tv_sec) / 3600; | ||
1462 | |||
1463 | rdev->last_read_error = cur_time_mon; | ||
1464 | |||
1465 | /* | ||
1466 | * if hours_since_last is >= the number of bits in read_errors | ||
1467 | * just set read errors to 0. We do this to avoid | ||
1468 | * overflowing the shift of read_errors by hours_since_last. | ||
1469 | */ | ||
1470 | if (hours_since_last >= 8 * sizeof(read_errors)) | ||
1471 | atomic_set(&rdev->read_errors, 0); | ||
1472 | else | ||
1473 | atomic_set(&rdev->read_errors, read_errors >> hours_since_last); | ||
1474 | } | ||
1475 | |||
1476 | /* | ||
1435 | * This is a kernel thread which: | 1477 | * This is a kernel thread which: |
1436 | * | 1478 | * |
1437 | * 1. Retries failed read operations on working mirrors. | 1479 | * 1. Retries failed read operations on working mirrors. |
@@ -1444,6 +1486,43 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
1444 | int sect = 0; /* Offset from r10_bio->sector */ | 1486 | int sect = 0; /* Offset from r10_bio->sector */ |
1445 | int sectors = r10_bio->sectors; | 1487 | int sectors = r10_bio->sectors; |
1446 | mdk_rdev_t*rdev; | 1488 | mdk_rdev_t*rdev; |
1489 | int max_read_errors = atomic_read(&mddev->max_corr_read_errors); | ||
1490 | |||
1491 | rcu_read_lock(); | ||
1492 | { | ||
1493 | int d = r10_bio->devs[r10_bio->read_slot].devnum; | ||
1494 | char b[BDEVNAME_SIZE]; | ||
1495 | int cur_read_error_count = 0; | ||
1496 | |||
1497 | rdev = rcu_dereference(conf->mirrors[d].rdev); | ||
1498 | bdevname(rdev->bdev, b); | ||
1499 | |||
1500 | if (test_bit(Faulty, &rdev->flags)) { | ||
1501 | rcu_read_unlock(); | ||
1502 | /* drive has already been failed, just ignore any | ||
1503 | more fix_read_error() attempts */ | ||
1504 | return; | ||
1505 | } | ||
1506 | |||
1507 | check_decay_read_errors(mddev, rdev); | ||
1508 | atomic_inc(&rdev->read_errors); | ||
1509 | cur_read_error_count = atomic_read(&rdev->read_errors); | ||
1510 | if (cur_read_error_count > max_read_errors) { | ||
1511 | rcu_read_unlock(); | ||
1512 | printk(KERN_NOTICE | ||
1513 | "raid10: %s: Raid device exceeded " | ||
1514 | "read_error threshold " | ||
1515 | "[cur %d:max %d]\n", | ||
1516 | b, cur_read_error_count, max_read_errors); | ||
1517 | printk(KERN_NOTICE | ||
1518 | "raid10: %s: Failing raid " | ||
1519 | "device\n", b); | ||
1520 | md_error(mddev, conf->mirrors[d].rdev); | ||
1521 | return; | ||
1522 | } | ||
1523 | } | ||
1524 | rcu_read_unlock(); | ||
1525 | |||
1447 | while(sectors) { | 1526 | while(sectors) { |
1448 | int s = sectors; | 1527 | int s = sectors; |
1449 | int sl = r10_bio->read_slot; | 1528 | int sl = r10_bio->read_slot; |
@@ -1488,6 +1567,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
1488 | /* write it back and re-read */ | 1567 | /* write it back and re-read */ |
1489 | rcu_read_lock(); | 1568 | rcu_read_lock(); |
1490 | while (sl != r10_bio->read_slot) { | 1569 | while (sl != r10_bio->read_slot) { |
1570 | char b[BDEVNAME_SIZE]; | ||
1491 | int d; | 1571 | int d; |
1492 | if (sl==0) | 1572 | if (sl==0) |
1493 | sl = conf->copies; | 1573 | sl = conf->copies; |
@@ -1503,9 +1583,21 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
1503 | r10_bio->devs[sl].addr + | 1583 | r10_bio->devs[sl].addr + |
1504 | sect + rdev->data_offset, | 1584 | sect + rdev->data_offset, |
1505 | s<<9, conf->tmppage, WRITE) | 1585 | s<<9, conf->tmppage, WRITE) |
1506 | == 0) | 1586 | == 0) { |
1507 | /* Well, this device is dead */ | 1587 | /* Well, this device is dead */ |
1588 | printk(KERN_NOTICE | ||
1589 | "raid10:%s: read correction " | ||
1590 | "write failed" | ||
1591 | " (%d sectors at %llu on %s)\n", | ||
1592 | mdname(mddev), s, | ||
1593 | (unsigned long long)(sect+ | ||
1594 | rdev->data_offset), | ||
1595 | bdevname(rdev->bdev, b)); | ||
1596 | printk(KERN_NOTICE "raid10:%s: failing " | ||
1597 | "drive\n", | ||
1598 | bdevname(rdev->bdev, b)); | ||
1508 | md_error(mddev, rdev); | 1599 | md_error(mddev, rdev); |
1600 | } | ||
1509 | rdev_dec_pending(rdev, mddev); | 1601 | rdev_dec_pending(rdev, mddev); |
1510 | rcu_read_lock(); | 1602 | rcu_read_lock(); |
1511 | } | 1603 | } |
@@ -1526,10 +1618,22 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
1526 | if (sync_page_io(rdev->bdev, | 1618 | if (sync_page_io(rdev->bdev, |
1527 | r10_bio->devs[sl].addr + | 1619 | r10_bio->devs[sl].addr + |
1528 | sect + rdev->data_offset, | 1620 | sect + rdev->data_offset, |
1529 | s<<9, conf->tmppage, READ) == 0) | 1621 | s<<9, conf->tmppage, |
1622 | READ) == 0) { | ||
1530 | /* Well, this device is dead */ | 1623 | /* Well, this device is dead */ |
1624 | printk(KERN_NOTICE | ||
1625 | "raid10:%s: unable to read back " | ||
1626 | "corrected sectors" | ||
1627 | " (%d sectors at %llu on %s)\n", | ||
1628 | mdname(mddev), s, | ||
1629 | (unsigned long long)(sect+ | ||
1630 | rdev->data_offset), | ||
1631 | bdevname(rdev->bdev, b)); | ||
1632 | printk(KERN_NOTICE "raid10:%s: failing drive\n", | ||
1633 | bdevname(rdev->bdev, b)); | ||
1634 | |||
1531 | md_error(mddev, rdev); | 1635 | md_error(mddev, rdev); |
1532 | else | 1636 | } else { |
1533 | printk(KERN_INFO | 1637 | printk(KERN_INFO |
1534 | "raid10:%s: read error corrected" | 1638 | "raid10:%s: read error corrected" |
1535 | " (%d sectors at %llu on %s)\n", | 1639 | " (%d sectors at %llu on %s)\n", |
@@ -1537,6 +1641,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio) | |||
1537 | (unsigned long long)(sect+ | 1641 | (unsigned long long)(sect+ |
1538 | rdev->data_offset), | 1642 | rdev->data_offset), |
1539 | bdevname(rdev->bdev, b)); | 1643 | bdevname(rdev->bdev, b)); |
1644 | } | ||
1540 | 1645 | ||
1541 | rdev_dec_pending(rdev, mddev); | 1646 | rdev_dec_pending(rdev, mddev); |
1542 | rcu_read_lock(); | 1647 | rcu_read_lock(); |
@@ -2155,12 +2260,14 @@ static int run(mddev_t *mddev) | |||
2155 | disk_stack_limits(mddev->gendisk, rdev->bdev, | 2260 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
2156 | rdev->data_offset << 9); | 2261 | rdev->data_offset << 9); |
2157 | /* as we don't honour merge_bvec_fn, we must never risk | 2262 | /* as we don't honour merge_bvec_fn, we must never risk |
2158 | * violating it, so limit ->max_sector to one PAGE, as | 2263 | * violating it, so limit max_segments to 1 lying |
2159 | * a one page request is never in violation. | 2264 | * within a single page. |
2160 | */ | 2265 | */ |
2161 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn && | 2266 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { |
2162 | queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) | 2267 | blk_queue_max_segments(mddev->queue, 1); |
2163 | blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); | 2268 | blk_queue_segment_boundary(mddev->queue, |
2269 | PAGE_CACHE_SIZE - 1); | ||
2270 | } | ||
2164 | 2271 | ||
2165 | disk->head_position = 0; | 2272 | disk->head_position = 0; |
2166 | } | 2273 | } |
@@ -2275,13 +2382,6 @@ static void raid10_quiesce(mddev_t *mddev, int state) | |||
2275 | lower_barrier(conf); | 2382 | lower_barrier(conf); |
2276 | break; | 2383 | break; |
2277 | } | 2384 | } |
2278 | if (mddev->thread) { | ||
2279 | if (mddev->bitmap) | ||
2280 | mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ; | ||
2281 | else | ||
2282 | mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT; | ||
2283 | md_wakeup_thread(mddev->thread); | ||
2284 | } | ||
2285 | } | 2385 | } |
2286 | 2386 | ||
2287 | static struct mdk_personality raid10_personality = | 2387 | static struct mdk_personality raid10_personality = |
@@ -2315,6 +2415,7 @@ static void raid_exit(void) | |||
2315 | module_init(raid_init); | 2415 | module_init(raid_init); |
2316 | module_exit(raid_exit); | 2416 | module_exit(raid_exit); |
2317 | MODULE_LICENSE("GPL"); | 2417 | MODULE_LICENSE("GPL"); |
2418 | MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD"); | ||
2318 | MODULE_ALIAS("md-personality-9"); /* RAID10 */ | 2419 | MODULE_ALIAS("md-personality-9"); /* RAID10 */ |
2319 | MODULE_ALIAS("md-raid10"); | 2420 | MODULE_ALIAS("md-raid10"); |
2320 | MODULE_ALIAS("md-level-10"); | 2421 | MODULE_ALIAS("md-level-10"); |