aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/md
diff options
context:
space:
mode:
authorRobert Becker <Rob.Becker@riverbed.com>2009-12-13 20:49:58 -0500
committerNeilBrown <neilb@suse.de>2009-12-13 20:51:41 -0500
commit1e50915fe0bbf7a46db0fa7e1e604d3fc95f057d (patch)
tree7a722ad6f56c61a6173493f1cd44d809c8b1bd8d /drivers/md
parent67b8dc4b06b0e97df55fd76e209f34f9a52e820e (diff)
raid: improve MD/raid10 handling of correctable read errors.
We've noticed severe lasting performance degradation of our raid arrays when we have drives that yield large numbers of media errors. The raid10 module will queue each failed read for retry, and will also attempt to call fix_read_error() to perform the read recovery. Read recovery is performed while the array is frozen, so repeated recovery attempts can degrade the performance of the array for extended periods of time. With this patch I propose adding a per md device max number of corrected read attempts. Each rdev will maintain a count of read correction attempts in the rdev->read_errors field (not used currently for raid10). When we enter fix_read_error() we'll check to see when the last read error occurred, and divide the read error count by 2 for every hour since the last read error. If at that point our read error count exceeds the read error threshold, we'll fail the raid device. In addition, in this patch I add sysfs nodes (get/set) for the per md max_read_errors attribute and the rdev->read_errors attribute, and add some printk's to indicate when fix_read_error fails to repair an rdev. For testing I used debugfs->fail_make_request to inject IO errors to the rdev while doing IO to the raid array. Signed-off-by: Robert Becker <Rob.Becker@riverbed.com> Signed-off-by: NeilBrown <neilb@suse.de>
Diffstat (limited to 'drivers/md')
-rw-r--r--drivers/md/md.c34
-rw-r--r--drivers/md/md.h4
-rw-r--r--drivers/md/raid10.c74
3 files changed, 112 insertions, 0 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 859edbf8c9b0..f1b905a20133 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -68,6 +68,12 @@ static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
68#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); } 68#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
69 69
70/* 70/*
71 * Default number of read corrections we'll attempt on an rdev
72 * before ejecting it from the array. We divide the read error
73 * count by 2 for every hour elapsed between read errors.
74 */
75#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
76/*
71 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' 77 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
72 * is 1000 KB/sec, so the extra system load does not show up that much. 78 * is 1000 KB/sec, so the extra system load does not show up that much.
73 * Increase it if you want to have more _guaranteed_ speed. Note that 79 * Increase it if you want to have more _guaranteed_ speed. Note that
@@ -2653,6 +2659,8 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
2653 rdev->flags = 0; 2659 rdev->flags = 0;
2654 rdev->data_offset = 0; 2660 rdev->data_offset = 0;
2655 rdev->sb_events = 0; 2661 rdev->sb_events = 0;
2662 rdev->last_read_error.tv_sec = 0;
2663 rdev->last_read_error.tv_nsec = 0;
2656 atomic_set(&rdev->nr_pending, 0); 2664 atomic_set(&rdev->nr_pending, 0);
2657 atomic_set(&rdev->read_errors, 0); 2665 atomic_set(&rdev->read_errors, 0);
2658 atomic_set(&rdev->corrected_errors, 0); 2666 atomic_set(&rdev->corrected_errors, 0);
@@ -3290,6 +3298,29 @@ static struct md_sysfs_entry md_array_state =
3290__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); 3298__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
3291 3299
3292static ssize_t 3300static ssize_t
3301max_corrected_read_errors_show(mddev_t *mddev, char *page) {
3302 return sprintf(page, "%d\n",
3303 atomic_read(&mddev->max_corr_read_errors));
3304}
3305
3306static ssize_t
3307max_corrected_read_errors_store(mddev_t *mddev, const char *buf, size_t len)
3308{
3309 char *e;
3310 unsigned long n = simple_strtoul(buf, &e, 10);
3311
3312 if (*buf && (*e == 0 || *e == '\n')) {
3313 atomic_set(&mddev->max_corr_read_errors, n);
3314 return len;
3315 }
3316 return -EINVAL;
3317}
3318
/*
 * sysfs attribute "max_read_errors": readable by everyone (S_IRUGO),
 * writable by the owner (S_IWUSR), served by the
 * max_corrected_read_errors_show/_store handlers.
 */
3319static struct md_sysfs_entry max_corr_read_errors =
3320__ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
3321 max_corrected_read_errors_store);
3322
3323static ssize_t
3293null_show(mddev_t *mddev, char *page) 3324null_show(mddev_t *mddev, char *page)
3294{ 3325{
3295 return -EINVAL; 3326 return -EINVAL;
@@ -3914,6 +3945,7 @@ static struct attribute *md_default_attrs[] = {
3914 &md_array_state.attr, 3945 &md_array_state.attr,
3915 &md_reshape_position.attr, 3946 &md_reshape_position.attr,
3916 &md_array_size.attr, 3947 &md_array_size.attr,
3948 &max_corr_read_errors.attr,
3917 NULL, 3949 NULL,
3918}; 3950};
3919 3951
@@ -4333,6 +4365,8 @@ static int do_md_run(mddev_t * mddev)
4333 mddev->ro = 0; 4365 mddev->ro = 0;
4334 4366
4335 atomic_set(&mddev->writes_pending,0); 4367 atomic_set(&mddev->writes_pending,0);
4368 atomic_set(&mddev->max_corr_read_errors,
4369 MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
4336 mddev->safemode = 0; 4370 mddev->safemode = 0;
4337 mddev->safemode_timer.function = md_safemode_timeout; 4371 mddev->safemode_timer.function = md_safemode_timeout;
4338 mddev->safemode_timer.data = (unsigned long) mddev; 4372 mddev->safemode_timer.data = (unsigned long) mddev;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index d9138885b87f..8e4c75c00d46 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -97,6 +97,9 @@ struct mdk_rdev_s
97 atomic_t read_errors; /* number of consecutive read errors that 97 atomic_t read_errors; /* number of consecutive read errors that
98 * we have tried to ignore. 98 * we have tried to ignore.
99 */ 99 */
100 struct timespec last_read_error; /* monotonic time of our
101 * last read error
102 */
100 atomic_t corrected_errors; /* number of corrected read errors, 103 atomic_t corrected_errors; /* number of corrected read errors,
101 * for reporting to userspace and storing 104 * for reporting to userspace and storing
102 * in superblock. 105 * in superblock.
@@ -299,6 +302,7 @@ struct mddev_s
299 int external; 302 int external;
300 } bitmap_info; 303 } bitmap_info;
301 304
305 atomic_t max_corr_read_errors; /* max read retries */
302 struct list_head all_mddevs; 306 struct list_head all_mddevs;
303 307
304 /* Generic barrier handling. 308 /* Generic barrier handling.
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 670449f7411f..5c71a462c120 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1432,6 +1432,43 @@ static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
1432 1432
1433 1433
1434/* 1434/*
1435 * Used by fix_read_error() to decay the per rdev read_errors.
1436 * We halve the read error count for every hour that has elapsed
1437 * since the last recorded read error.
1438 *
1439 */
1440static void check_decay_read_errors(mddev_t *mddev, mdk_rdev_t *rdev)
1441{
1442 struct timespec cur_time_mon;
1443 unsigned long hours_since_last;
1444 unsigned int read_errors = atomic_read(&rdev->read_errors);
1445
1446 ktime_get_ts(&cur_time_mon);
1447
1448 if (rdev->last_read_error.tv_sec == 0 &&
1449 rdev->last_read_error.tv_nsec == 0) {
1450 /* first time we've seen a read error */
1451 rdev->last_read_error = cur_time_mon;
1452 return;
1453 }
1454
1455 hours_since_last = (cur_time_mon.tv_sec -
1456 rdev->last_read_error.tv_sec) / 3600;
1457
1458 rdev->last_read_error = cur_time_mon;
1459
1460 /*
1461 * if hours_since_last is > the number of bits in read_errors
1462 * just set read errors to 0. We do this to avoid
1463 * overflowing the shift of read_errors by hours_since_last.
1464 */
1465 if (hours_since_last >= 8 * sizeof(read_errors))
1466 atomic_set(&rdev->read_errors, 0);
1467 else
1468 atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
1469}
1470
1471/*
1435 * This is a kernel thread which: 1472 * This is a kernel thread which:
1436 * 1473 *
1437 * 1. Retries failed read operations on working mirrors. 1474 * 1. Retries failed read operations on working mirrors.
@@ -1444,6 +1481,43 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1444 int sect = 0; /* Offset from r10_bio->sector */ 1481 int sect = 0; /* Offset from r10_bio->sector */
1445 int sectors = r10_bio->sectors; 1482 int sectors = r10_bio->sectors;
1446 mdk_rdev_t*rdev; 1483 mdk_rdev_t*rdev;
1484 int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
1485
1486 rcu_read_lock();
1487 {
1488 int d = r10_bio->devs[r10_bio->read_slot].devnum;
1489 char b[BDEVNAME_SIZE];
1490 int cur_read_error_count = 0;
1491
1492 rdev = rcu_dereference(conf->mirrors[d].rdev);
1493 bdevname(rdev->bdev, b);
1494
1495 if (test_bit(Faulty, &rdev->flags)) {
1496 rcu_read_unlock();
1497 /* drive has already been failed, just ignore any
1498 more fix_read_error() attempts */
1499 return;
1500 }
1501
1502 check_decay_read_errors(mddev, rdev);
1503 atomic_inc(&rdev->read_errors);
1504 cur_read_error_count = atomic_read(&rdev->read_errors);
1505 if (cur_read_error_count > max_read_errors) {
1506 rcu_read_unlock();
1507 printk(KERN_NOTICE
1508 "raid10: %s: Raid device exceeded "
1509 "read_error threshold "
1510 "[cur %d:max %d]\n",
1511 b, cur_read_error_count, max_read_errors);
1512 printk(KERN_NOTICE
1513 "raid10: %s: Failing raid "
1514 "device\n", b);
1515 md_error(mddev, conf->mirrors[d].rdev);
1516 return;
1517 }
1518 }
1519 rcu_read_unlock();
1520
1447 while(sectors) { 1521 while(sectors) {
1448 int s = sectors; 1522 int s = sectors;
1449 int sl = r10_bio->read_slot; 1523 int sl = r10_bio->read_slot;