about summary refs log tree commit diff stats
path: root/drivers/md/raid10.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/raid10.c')
-rw-r--r-- drivers/md/raid10.c 145
1 files changed, 123 insertions, 22 deletions
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index c2cb7b87b440..e2766d8251a1 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -18,6 +18,7 @@
18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 18 * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
19 */ 19 */
20 20
21#include <linux/slab.h>
21#include <linux/delay.h> 22#include <linux/delay.h>
22#include <linux/blkdev.h> 23#include <linux/blkdev.h>
23#include <linux/seq_file.h> 24#include <linux/seq_file.h>
@@ -804,7 +805,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
804 mdk_rdev_t *blocked_rdev; 805 mdk_rdev_t *blocked_rdev;
805 806
806 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) { 807 if (unlikely(bio_rw_flagged(bio, BIO_RW_BARRIER))) {
807 bio_endio(bio, -EOPNOTSUPP); 808 md_barrier_request(mddev, bio);
808 return 0; 809 return 0;
809 } 810 }
810 811
@@ -1155,13 +1156,17 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
1155 1156
1156 disk_stack_limits(mddev->gendisk, rdev->bdev, 1157 disk_stack_limits(mddev->gendisk, rdev->bdev,
1157 rdev->data_offset << 9); 1158 rdev->data_offset << 9);
1158 /* as we don't honour merge_bvec_fn, we must never risk 1159 /* as we don't honour merge_bvec_fn, we must
1159 * violating it, so limit ->max_sector to one PAGE, as 1160 * never risk violating it, so limit
1160 * a one page request is never in violation. 1161 * ->max_segments to one lying within a single
1162 * page, as a one page request is never in
1163 * violation.
1161 */ 1164 */
1162 if (rdev->bdev->bd_disk->queue->merge_bvec_fn && 1165 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
1163 queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) 1166 blk_queue_max_segments(mddev->queue, 1);
1164 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); 1167 blk_queue_segment_boundary(mddev->queue,
1168 PAGE_CACHE_SIZE - 1);
1169 }
1165 1170
1166 p->head_position = 0; 1171 p->head_position = 0;
1167 rdev->raid_disk = mirror; 1172 rdev->raid_disk = mirror;
@@ -1432,6 +1437,43 @@ static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
1432 1437
1433 1438
1434/* 1439/*
1440 * Used by fix_read_error() to decay the per rdev read_errors.
1441 * We halve the read error count for every hour that has elapsed
1442 * since the last recorded read error.
1443 *
1444 */
1445static void check_decay_read_errors(mddev_t *mddev, mdk_rdev_t *rdev)
1446{
1447 struct timespec cur_time_mon;
1448 unsigned long hours_since_last;
1449 unsigned int read_errors = atomic_read(&rdev->read_errors);
1450
1451 ktime_get_ts(&cur_time_mon);
1452
1453 if (rdev->last_read_error.tv_sec == 0 &&
1454 rdev->last_read_error.tv_nsec == 0) {
1455 /* first time we've seen a read error */
1456 rdev->last_read_error = cur_time_mon;
1457 return;
1458 }
1459
1460 hours_since_last = (cur_time_mon.tv_sec -
1461 rdev->last_read_error.tv_sec) / 3600;
1462
1463 rdev->last_read_error = cur_time_mon;
1464
1465 /*
1466 * if hours_since_last is > the number of bits in read_errors
1467 * just set read errors to 0. We do this to avoid
1468 * overflowing the shift of read_errors by hours_since_last.
1469 */
1470 if (hours_since_last >= 8 * sizeof(read_errors))
1471 atomic_set(&rdev->read_errors, 0);
1472 else
1473 atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
1474}
1475
1476/*
1435 * This is a kernel thread which: 1477 * This is a kernel thread which:
1436 * 1478 *
1437 * 1. Retries failed read operations on working mirrors. 1479 * 1. Retries failed read operations on working mirrors.
@@ -1444,6 +1486,43 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1444 int sect = 0; /* Offset from r10_bio->sector */ 1486 int sect = 0; /* Offset from r10_bio->sector */
1445 int sectors = r10_bio->sectors; 1487 int sectors = r10_bio->sectors;
1446 mdk_rdev_t*rdev; 1488 mdk_rdev_t*rdev;
1489 int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
1490
1491 rcu_read_lock();
1492 {
1493 int d = r10_bio->devs[r10_bio->read_slot].devnum;
1494 char b[BDEVNAME_SIZE];
1495 int cur_read_error_count = 0;
1496
1497 rdev = rcu_dereference(conf->mirrors[d].rdev);
1498 bdevname(rdev->bdev, b);
1499
1500 if (test_bit(Faulty, &rdev->flags)) {
1501 rcu_read_unlock();
1502 /* drive has already been failed, just ignore any
1503 more fix_read_error() attempts */
1504 return;
1505 }
1506
1507 check_decay_read_errors(mddev, rdev);
1508 atomic_inc(&rdev->read_errors);
1509 cur_read_error_count = atomic_read(&rdev->read_errors);
1510 if (cur_read_error_count > max_read_errors) {
1511 rcu_read_unlock();
1512 printk(KERN_NOTICE
1513 "raid10: %s: Raid device exceeded "
1514 "read_error threshold "
1515 "[cur %d:max %d]\n",
1516 b, cur_read_error_count, max_read_errors);
1517 printk(KERN_NOTICE
1518 "raid10: %s: Failing raid "
1519 "device\n", b);
1520 md_error(mddev, conf->mirrors[d].rdev);
1521 return;
1522 }
1523 }
1524 rcu_read_unlock();
1525
1447 while(sectors) { 1526 while(sectors) {
1448 int s = sectors; 1527 int s = sectors;
1449 int sl = r10_bio->read_slot; 1528 int sl = r10_bio->read_slot;
@@ -1488,6 +1567,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1488 /* write it back and re-read */ 1567 /* write it back and re-read */
1489 rcu_read_lock(); 1568 rcu_read_lock();
1490 while (sl != r10_bio->read_slot) { 1569 while (sl != r10_bio->read_slot) {
1570 char b[BDEVNAME_SIZE];
1491 int d; 1571 int d;
1492 if (sl==0) 1572 if (sl==0)
1493 sl = conf->copies; 1573 sl = conf->copies;
@@ -1503,9 +1583,21 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1503 r10_bio->devs[sl].addr + 1583 r10_bio->devs[sl].addr +
1504 sect + rdev->data_offset, 1584 sect + rdev->data_offset,
1505 s<<9, conf->tmppage, WRITE) 1585 s<<9, conf->tmppage, WRITE)
1506 == 0) 1586 == 0) {
1507 /* Well, this device is dead */ 1587 /* Well, this device is dead */
1588 printk(KERN_NOTICE
1589 "raid10:%s: read correction "
1590 "write failed"
1591 " (%d sectors at %llu on %s)\n",
1592 mdname(mddev), s,
1593 (unsigned long long)(sect+
1594 rdev->data_offset),
1595 bdevname(rdev->bdev, b));
1596 printk(KERN_NOTICE "raid10:%s: failing "
1597 "drive\n",
1598 bdevname(rdev->bdev, b));
1508 md_error(mddev, rdev); 1599 md_error(mddev, rdev);
1600 }
1509 rdev_dec_pending(rdev, mddev); 1601 rdev_dec_pending(rdev, mddev);
1510 rcu_read_lock(); 1602 rcu_read_lock();
1511 } 1603 }
@@ -1526,10 +1618,22 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1526 if (sync_page_io(rdev->bdev, 1618 if (sync_page_io(rdev->bdev,
1527 r10_bio->devs[sl].addr + 1619 r10_bio->devs[sl].addr +
1528 sect + rdev->data_offset, 1620 sect + rdev->data_offset,
1529 s<<9, conf->tmppage, READ) == 0) 1621 s<<9, conf->tmppage,
1622 READ) == 0) {
1530 /* Well, this device is dead */ 1623 /* Well, this device is dead */
1624 printk(KERN_NOTICE
1625 "raid10:%s: unable to read back "
1626 "corrected sectors"
1627 " (%d sectors at %llu on %s)\n",
1628 mdname(mddev), s,
1629 (unsigned long long)(sect+
1630 rdev->data_offset),
1631 bdevname(rdev->bdev, b));
1632 printk(KERN_NOTICE "raid10:%s: failing drive\n",
1633 bdevname(rdev->bdev, b));
1634
1531 md_error(mddev, rdev); 1635 md_error(mddev, rdev);
1532 else 1636 } else {
1533 printk(KERN_INFO 1637 printk(KERN_INFO
1534 "raid10:%s: read error corrected" 1638 "raid10:%s: read error corrected"
1535 " (%d sectors at %llu on %s)\n", 1639 " (%d sectors at %llu on %s)\n",
@@ -1537,6 +1641,7 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1537 (unsigned long long)(sect+ 1641 (unsigned long long)(sect+
1538 rdev->data_offset), 1642 rdev->data_offset),
1539 bdevname(rdev->bdev, b)); 1643 bdevname(rdev->bdev, b));
1644 }
1540 1645
1541 rdev_dec_pending(rdev, mddev); 1646 rdev_dec_pending(rdev, mddev);
1542 rcu_read_lock(); 1647 rcu_read_lock();
@@ -2155,12 +2260,14 @@ static int run(mddev_t *mddev)
2155 disk_stack_limits(mddev->gendisk, rdev->bdev, 2260 disk_stack_limits(mddev->gendisk, rdev->bdev,
2156 rdev->data_offset << 9); 2261 rdev->data_offset << 9);
2157 /* as we don't honour merge_bvec_fn, we must never risk 2262 /* as we don't honour merge_bvec_fn, we must never risk
2158 * violating it, so limit ->max_sector to one PAGE, as 2263 * violating it, so limit max_segments to 1 lying
2159 * a one page request is never in violation. 2264 * within a single page.
2160 */ 2265 */
2161 if (rdev->bdev->bd_disk->queue->merge_bvec_fn && 2266 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
2162 queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) 2267 blk_queue_max_segments(mddev->queue, 1);
2163 blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); 2268 blk_queue_segment_boundary(mddev->queue,
2269 PAGE_CACHE_SIZE - 1);
2270 }
2164 2271
2165 disk->head_position = 0; 2272 disk->head_position = 0;
2166 } 2273 }
@@ -2275,13 +2382,6 @@ static void raid10_quiesce(mddev_t *mddev, int state)
2275 lower_barrier(conf); 2382 lower_barrier(conf);
2276 break; 2383 break;
2277 } 2384 }
2278 if (mddev->thread) {
2279 if (mddev->bitmap)
2280 mddev->thread->timeout = mddev->bitmap->daemon_sleep * HZ;
2281 else
2282 mddev->thread->timeout = MAX_SCHEDULE_TIMEOUT;
2283 md_wakeup_thread(mddev->thread);
2284 }
2285} 2385}
2286 2386
2287static struct mdk_personality raid10_personality = 2387static struct mdk_personality raid10_personality =
@@ -2315,6 +2415,7 @@ static void raid_exit(void)
2315module_init(raid_init); 2415module_init(raid_init);
2316module_exit(raid_exit); 2416module_exit(raid_exit);
2317MODULE_LICENSE("GPL"); 2417MODULE_LICENSE("GPL");
2418MODULE_DESCRIPTION("RAID10 (striped mirror) personality for MD");
2318MODULE_ALIAS("md-personality-9"); /* RAID10 */ 2419MODULE_ALIAS("md-personality-9"); /* RAID10 */
2319MODULE_ALIAS("md-raid10"); 2420MODULE_ALIAS("md-raid10");
2320MODULE_ALIAS("md-level-10"); 2421MODULE_ALIAS("md-level-10");