raid: improve MD/raid10 handling of correctable read errors.

We've noticed severe lasting performance degradation of our raid arrays when we have drives that yield large amounts of media errors. The raid10 module will queue each failed read for retry, and also will attempt to call fix_read_error() to perform the read recovery. Read recovery is performed while the array is frozen, so repeated recovery attempts can degrade the performance of the array for extended periods of time. With this patch I propose adding a per md device max number of corrected read attempts. Each rdev will maintain a count of read correction attempts in the rdev->read_errors field (not used currently for raid10). When we enter fix_read_error() we'll check to see when the last read error occurred, and divide the read error count by 2 for every hour since the last read error. If at that point our read error count exceeds the read error threshold, we'll fail the raid device. In addition, in this patch I add sysfs nodes (get/set) for the per md max_read_errors attribute, the rdev->read_errors attribute, and add some printk's to indicate when fix_read_error fails to repair an rdev. For testing I used debugfs->fail_make_request to inject IO errors to the rdev while doing IO to the raid array. Signed-off-by: Robert Becker <Rob.Becker@riverbed.com> Signed-off-by: NeilBrown <neilb@suse.de>
author: Robert Becker <Rob.Becker@riverbed.com> 2009-12-13 20:49:58 -0500
committer: NeilBrown <neilb@suse.de> 2009-12-13 20:51:41 -0500
commit: 1e50915fe0bbf7a46db0fa7e1e604d3fc95f057d (patch)
tree: 7a722ad6f56c61a6173493f1cd44d809c8b1bd8d /drivers/md
parent: 67b8dc4b06b0e97df55fd76e209f34f9a52e820e (diff)
3 files changed, 112 insertions, 0 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 859edbf8c9b0..f1b905a20133 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -68,6 +68,12 @@ static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
 #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
 /*
+ * Default number of read corrections we'll attempt on an rdev
+ * before ejecting it from the array. We divide the read error
+ * count by 2 for every hour elapsed between read errors.
+ */
+#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
+/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 1000 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
@@ -2653,6 +2659,8 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
        rdev->flags = 0;
        rdev->data_offset = 0;
        rdev->sb_events = 0;
+        rdev->last_read_error.tv_sec  = 0;
+        rdev->last_read_error.tv_nsec = 0;
        atomic_set(&rdev->nr_pending, 0);
        atomic_set(&rdev->read_errors, 0);
        atomic_set(&rdev->corrected_errors, 0);
@@ -3290,6 +3298,29 @@ static struct md_sysfs_entry md_array_state =
 __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
 static ssize_t
+max_corrected_read_errors_show(mddev_t *mddev, char *page) {
+        return sprintf(page, "%d\n",
+                       atomic_read(&mddev->max_corr_read_errors));
+}
+static ssize_t
+max_corrected_read_errors_store(mddev_t *mddev, const char *buf, size_t len)
+{
+        char *e;
+        unsigned long n = simple_strtoul(buf, &e, 10);
+        if (*buf && (*e == 0 || *e == '\n')) {
+                atomic_set(&mddev->max_corr_read_errors, n);
+                return len;
+        }
+        return -EINVAL;
+}
+static struct md_sysfs_entry max_corr_read_errors =
+__ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
+        max_corrected_read_errors_store);
+static ssize_t
 null_show(mddev_t *mddev, char *page)
 {
        return -EINVAL;
@@ -3914,6 +3945,7 @@ static struct attribute *md_default_attrs[] = {
        &md_array_state.attr,
        &md_reshape_position.attr,
        &md_array_size.attr,
+        &max_corr_read_errors.attr,
        NULL,
 };
@@ -4333,6 +4365,8 @@ static int do_md_run(mddev_t * mddev)
                mddev->ro = 0;
        atomic_set(&mddev->writes_pending,0);
+        atomic_set(&mddev->max_corr_read_errors,
+                   MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
        mddev->safemode = 0;
        mddev->safemode_timer.function = md_safemode_timeout;
        mddev->safemode_timer.data = (unsigned long) mddev;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index d9138885b87f..8e4c75c00d46 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -97,6 +97,9 @@ struct mdk_rdev_s
        atomic_t        read_errors;    /* number of consecutive read errors that
                                         * we have tried to ignore.
                                         */
+        struct timespec last_read_error;        /* monotonic time since our
+                                                 * last read error
+                                                 */
        atomic_t        corrected_errors; /* number of corrected read errors,
                                           * for reporting to userspace and storing
                                           * in superblock.
@@ -299,6 +302,7 @@ struct mddev_s
                int                     external;
        } bitmap_info;
+        atomic_t                        max_corr_read_errors; /* max read retries */
        struct list_head                all_mddevs;
        /* Generic barrier handling.
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 670449f7411f..5c71a462c120 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1432,6 +1432,43 @@ static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
 /*
+ * Used by fix_read_error() to decay the per rdev read_errors.
+ * We halve the read error count for every hour that has elapsed
+ * since the last recorded read error.
+ *
+ */
+static void check_decay_read_errors(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+        struct timespec cur_time_mon;
+        unsigned long hours_since_last;
+        unsigned int read_errors = atomic_read(&rdev->read_errors);
+        ktime_get_ts(&cur_time_mon);
+        if (rdev->last_read_error.tv_sec == 0 &&
+            rdev->last_read_error.tv_nsec == 0) {
+                /* first time we've seen a read error */
+                rdev->last_read_error = cur_time_mon;
+                return;
+        }
+        hours_since_last = (cur_time_mon.tv_sec -
+                            rdev->last_read_error.tv_sec) / 3600;
+        rdev->last_read_error = cur_time_mon;
+        /*
+         * if hours_since_last is > the number of bits in read_errors
+         * just set read errors to 0. We do this to avoid
+         * overflowing the shift of read_errors by hours_since_last.
+         */
+        if (hours_since_last >= 8 * sizeof(read_errors))
+                atomic_set(&rdev->read_errors, 0);
+        else
+                atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
+}
+/*
 * This is a kernel thread which:
 *
 *      1.      Retries failed read operations on working mirrors.
@@ -1444,6 +1481,43 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
        int sect = 0; /* Offset from r10_bio->sector */
        int sectors = r10_bio->sectors;
        mdk_rdev_t*rdev;
+        int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
+        rcu_read_lock();
+        {
+                int d = r10_bio->devs[r10_bio->read_slot].devnum;
+                char b[BDEVNAME_SIZE];
+                int cur_read_error_count = 0;
+                rdev = rcu_dereference(conf->mirrors[d].rdev);
+                bdevname(rdev->bdev, b);
+                if (test_bit(Faulty, &rdev->flags)) {
+                        rcu_read_unlock();
+                        /* drive has already been failed, just ignore any
+                           more fix_read_error() attempts */
+                        return;
+                }
+                check_decay_read_errors(mddev, rdev);
+                atomic_inc(&rdev->read_errors);
+                cur_read_error_count = atomic_read(&rdev->read_errors);
+                if (cur_read_error_count > max_read_errors) {
+                        rcu_read_unlock();
+                        printk(KERN_NOTICE
+                               "raid10: %s: Raid device exceeded "
+                               "read_error threshold "
+                               "[cur %d:max %d]\n",
+                               b, cur_read_error_count, max_read_errors);
+                        printk(KERN_NOTICE
+                               "raid10: %s: Failing raid "
+                               "device\n", b);
+                        md_error(mddev, conf->mirrors[d].rdev);
+                        return;
+                }
+        }
+        rcu_read_unlock();
        while(sectors) {
                int s = sectors;
                int sl = r10_bio->read_slot;
author	Robert Becker <Rob.Becker@riverbed.com>	2009-12-13 20:49:58 -0500
committer	NeilBrown <neilb@suse.de>	2009-12-13 20:51:41 -0500
commit	1e50915fe0bbf7a46db0fa7e1e604d3fc95f057d (patch)
tree	7a722ad6f56c61a6173493f1cd44d809c8b1bd8d /drivers/md
parent	67b8dc4b06b0e97df55fd76e209f34f9a52e820e (diff)

diff --git a/drivers/md/md.c b/drivers/md/md.c index 859edbf8c9b0..f1b905a20133 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c
@@ -68,6 +68,12 @@ static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
68	#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }	68	#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
69		69
70	/*	70	/*
		71	* Default number of read corrections we'll attempt on an rdev
		72	* before ejecting it from the array. We divide the read error
		73	* count by 2 for every hour elapsed between read errors.
		74	*/
		75	#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
		76	/*
71	* Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'	77	* Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
72	* is 1000 KB/sec, so the extra system load does not show up that much.	78	* is 1000 KB/sec, so the extra system load does not show up that much.
73	* Increase it if you want to have more _guaranteed_ speed. Note that	79	* Increase it if you want to have more _guaranteed_ speed. Note that
@@ -2653,6 +2659,8 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
2653	rdev->flags = 0;	2659	rdev->flags = 0;
2654	rdev->data_offset = 0;	2660	rdev->data_offset = 0;
2655	rdev->sb_events = 0;	2661	rdev->sb_events = 0;
		2662	rdev->last_read_error.tv_sec = 0;
		2663	rdev->last_read_error.tv_nsec = 0;
2656	atomic_set(&rdev->nr_pending, 0);	2664	atomic_set(&rdev->nr_pending, 0);
2657	atomic_set(&rdev->read_errors, 0);	2665	atomic_set(&rdev->read_errors, 0);
2658	atomic_set(&rdev->corrected_errors, 0);	2666	atomic_set(&rdev->corrected_errors, 0);
@@ -3290,6 +3298,29 @@ static struct md_sysfs_entry md_array_state =
3290	__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);	3298	__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
3291		3299
3292	static ssize_t	3300	static ssize_t
		3301	max_corrected_read_errors_show(mddev_t *mddev, char *page) {
		3302	return sprintf(page, "%d\n",
		3303	atomic_read(&mddev->max_corr_read_errors));
		3304	}
		3305
		3306	static ssize_t
		3307	max_corrected_read_errors_store(mddev_t *mddev, const char *buf, size_t len)
		3308	{
		3309	char *e;
		3310	unsigned long n = simple_strtoul(buf, &e, 10);
		3311
		3312	if (*buf && (*e == 0 || *e == '\n')) {
		3313	atomic_set(&mddev->max_corr_read_errors, n);
		3314	return len;
		3315	}
		3316	return -EINVAL;
		3317	}
		3318
		3319	static struct md_sysfs_entry max_corr_read_errors =
		3320	__ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
		3321	max_corrected_read_errors_store);
		3322
		3323	static ssize_t
3293	null_show(mddev_t *mddev, char *page)	3324	null_show(mddev_t *mddev, char *page)
3294	{	3325	{
3295	return -EINVAL;	3326	return -EINVAL;
@@ -3914,6 +3945,7 @@ static struct attribute *md_default_attrs[] = {
3914	&md_array_state.attr,	3945	&md_array_state.attr,
3915	&md_reshape_position.attr,	3946	&md_reshape_position.attr,
3916	&md_array_size.attr,	3947	&md_array_size.attr,
		3948	&max_corr_read_errors.attr,
3917	NULL,	3949	NULL,
3918	};	3950	};
3919		3951
@@ -4333,6 +4365,8 @@ static int do_md_run(mddev_t * mddev)
4333	mddev->ro = 0;	4365	mddev->ro = 0;
4334		4366
4335	atomic_set(&mddev->writes_pending,0);	4367	atomic_set(&mddev->writes_pending,0);
		4368	atomic_set(&mddev->max_corr_read_errors,
		4369	MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
4336	mddev->safemode = 0;	4370	mddev->safemode = 0;
4337	mddev->safemode_timer.function = md_safemode_timeout;	4371	mddev->safemode_timer.function = md_safemode_timeout;
4338	mddev->safemode_timer.data = (unsigned long) mddev;	4372	mddev->safemode_timer.data = (unsigned long) mddev;


diff --git a/drivers/md/md.h b/drivers/md/md.h index d9138885b87f..8e4c75c00d46 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h
@@ -97,6 +97,9 @@ struct mdk_rdev_s
97	atomic_t read_errors; /* number of consecutive read errors that	97	atomic_t read_errors; /* number of consecutive read errors that
98	* we have tried to ignore.	98	* we have tried to ignore.
99	*/	99	*/
		100	struct timespec last_read_error; /* monotonic time since our
		101	* last read error
		102	*/
100	atomic_t corrected_errors; /* number of corrected read errors,	103	atomic_t corrected_errors; /* number of corrected read errors,
101	* for reporting to userspace and storing	104	* for reporting to userspace and storing
102	* in superblock.	105	* in superblock.
@@ -299,6 +302,7 @@ struct mddev_s
299	int external;	302	int external;
300	} bitmap_info;	303	} bitmap_info;
301		304
		305	atomic_t max_corr_read_errors; /* max read retries */
302	struct list_head all_mddevs;	306	struct list_head all_mddevs;
303		307
304	/* Generic barrier handling.	308	/* Generic barrier handling.


diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 670449f7411f..5c71a462c120 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c
@@ -1432,6 +1432,43 @@ static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
1432		1432
1433		1433
1434	/*	1434	/*
		1435	* Used by fix_read_error() to decay the per rdev read_errors.
		1436	* We halve the read error count for every hour that has elapsed
		1437	* since the last recorded read error.
		1438	*
		1439	*/
		1440	static void check_decay_read_errors(mddev_t *mddev, mdk_rdev_t *rdev)
		1441	{
		1442	struct timespec cur_time_mon;
		1443	unsigned long hours_since_last;
		1444	unsigned int read_errors = atomic_read(&rdev->read_errors);
		1445
		1446	ktime_get_ts(&cur_time_mon);
		1447
		1448	if (rdev->last_read_error.tv_sec == 0 &&
		1449	rdev->last_read_error.tv_nsec == 0) {
		1450	/* first time we've seen a read error */
		1451	rdev->last_read_error = cur_time_mon;
		1452	return;
		1453	}
		1454
		1455	hours_since_last = (cur_time_mon.tv_sec -
		1456	rdev->last_read_error.tv_sec) / 3600;
		1457
		1458	rdev->last_read_error = cur_time_mon;
		1459
		1460	/*
		1461	* if hours_since_last is > the number of bits in read_errors
		1462	* just set read errors to 0. We do this to avoid
		1463	* overflowing the shift of read_errors by hours_since_last.
		1464	*/
		1465	if (hours_since_last >= 8 * sizeof(read_errors))
		1466	atomic_set(&rdev->read_errors, 0);
		1467	else
		1468	atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
		1469	}
		1470
		1471	/*
1435	* This is a kernel thread which:	1472	* This is a kernel thread which:
1436	*	1473	*
1437	* 1. Retries failed read operations on working mirrors.	1474	* 1. Retries failed read operations on working mirrors.
@@ -1444,6 +1481,43 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1444	int sect = 0; /* Offset from r10_bio->sector */	1481	int sect = 0; /* Offset from r10_bio->sector */
1445	int sectors = r10_bio->sectors;	1482	int sectors = r10_bio->sectors;
1446	mdk_rdev_t*rdev;	1483	mdk_rdev_t*rdev;
		1484	int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
		1485
		1486	rcu_read_lock();
		1487	{
		1488	int d = r10_bio->devs[r10_bio->read_slot].devnum;
		1489	char b[BDEVNAME_SIZE];
		1490	int cur_read_error_count = 0;
		1491
		1492	rdev = rcu_dereference(conf->mirrors[d].rdev);
		1493	bdevname(rdev->bdev, b);
		1494
		1495	if (test_bit(Faulty, &rdev->flags)) {
		1496	rcu_read_unlock();
		1497	/* drive has already been failed, just ignore any
		1498	more fix_read_error() attempts */
		1499	return;
		1500	}
		1501
		1502	check_decay_read_errors(mddev, rdev);
		1503	atomic_inc(&rdev->read_errors);
		1504	cur_read_error_count = atomic_read(&rdev->read_errors);
		1505	if (cur_read_error_count > max_read_errors) {
		1506	rcu_read_unlock();
		1507	printk(KERN_NOTICE
		1508	"raid10: %s: Raid device exceeded "
		1509	"read_error threshold "
		1510	"[cur %d:max %d]\n",
		1511	b, cur_read_error_count, max_read_errors);
		1512	printk(KERN_NOTICE
		1513	"raid10: %s: Failing raid "
		1514	"device\n", b);
		1515	md_error(mddev, conf->mirrors[d].rdev);
		1516	return;
		1517	}
		1518	}
		1519	rcu_read_unlock();
		1520
1447	while(sectors) {	1521	while(sectors) {
1448	int s = sectors;	1522	int s = sectors;
1449	int sl = r10_bio->read_slot;	1523	int sl = r10_bio->read_slot;