3 files changed, 112 insertions, 0 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 859edbf8c9b0..f1b905a20133 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -68,6 +68,12 @@ static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
 #define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
 /*
+ * Default number of read corrections we'll attempt on an rdev
+ * before ejecting it from the array. We divide the read error
+ * count by 2 for every hour elapsed between read errors.
+ */
+#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
+/*
 * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
 * is 1000 KB/sec, so the extra system load does not show up that much.
 * Increase it if you want to have more _guaranteed_ speed. Note that
@@ -2653,6 +2659,8 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
        rdev->flags = 0;
        rdev->data_offset = 0;
        rdev->sb_events = 0;
+        rdev->last_read_error.tv_sec  = 0;
+        rdev->last_read_error.tv_nsec = 0;
        atomic_set(&rdev->nr_pending, 0);
        atomic_set(&rdev->read_errors, 0);
        atomic_set(&rdev->corrected_errors, 0);
@@ -3290,6 +3298,29 @@ static struct md_sysfs_entry md_array_state =
 __ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store);
 static ssize_t
+max_corrected_read_errors_show(mddev_t *mddev, char *page)
+{
+        /* Emit the array's current read-error threshold as one decimal line. */
+        int limit = atomic_read(&mddev->max_corr_read_errors);
+        return sprintf(page, "%d\n", limit);
+}
+/*
+ * Parse a decimal read-error threshold from buf (an optional trailing
+ * newline is accepted) and store it in mddev->max_corr_read_errors.
+ *
+ * Returns len on success, or -EINVAL when buf is empty, contains trailing
+ * garbage, or holds a value larger than INT_MAX.
+ */
+static ssize_t
+max_corrected_read_errors_store(mddev_t *mddev, const char *buf, size_t len)
+{
+        char *e;
+        unsigned long n = simple_strtoul(buf, &e, 10);
+        if (!*buf || (*e != 0 && *e != '\n'))
+                return -EINVAL;
+        /*
+         * The threshold is read back and compared as an int; a larger
+         * value would wrap negative and make the very first read error
+         * exceed the limit, so refuse it instead of truncating silently.
+         */
+        if (n > INT_MAX)
+                return -EINVAL;
+        atomic_set(&mddev->max_corr_read_errors, n);
+        return len;
+}
+static struct md_sysfs_entry max_corr_read_errors = /* NB: attribute is named "max_read_errors", not "max_corr_read_errors" */
+__ATTR(max_read_errors, S_IRUGO|S_IWUSR, max_corrected_read_errors_show,
+        max_corrected_read_errors_store);
+static ssize_t
 null_show(mddev_t *mddev, char *page)
 {
        return -EINVAL;
@@ -3914,6 +3945,7 @@ static struct attribute *md_default_attrs[] = {
        &md_array_state.attr,
        &md_reshape_position.attr,
        &md_array_size.attr,
+        &max_corr_read_errors.attr,
        NULL,
 };
@@ -4333,6 +4365,8 @@ static int do_md_run(mddev_t * mddev)
                mddev->ro = 0;
        atomic_set(&mddev->writes_pending,0);
+        atomic_set(&mddev->max_corr_read_errors,
+                   MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
        mddev->safemode = 0;
        mddev->safemode_timer.function = md_safemode_timeout;
        mddev->safemode_timer.data = (unsigned long) mddev;
diff --git a/drivers/md/md.h b/drivers/md/md.h
index d9138885b87f..8e4c75c00d46 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -97,6 +97,9 @@ struct mdk_rdev_s
        atomic_t        read_errors;    /* number of consecutive read errors that
                                         * we have tried to ignore.
                                         */
+        struct timespec last_read_error;        /* monotonic time of our
+                                                 * last read error
+                                                 */
        atomic_t        corrected_errors; /* number of corrected read errors,
                                           * for reporting to userspace and storing
                                           * in superblock.
@@ -299,6 +302,7 @@ struct mddev_s
                int                     external;
        } bitmap_info;
+        atomic_t                        max_corr_read_errors; /* max read errors tolerated on an rdev before it is failed */
        struct list_head                all_mddevs;
        /* Generic barrier handling.
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 670449f7411f..5c71a462c120 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1432,6 +1432,43 @@ static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
 /*
+ * Used by fix_read_error() to decay the per rdev read_errors.
+ * We halve the read error count for every hour that has elapsed
+ * since the last recorded read error.
+ *
+ */
+static void check_decay_read_errors(mddev_t *mddev, mdk_rdev_t *rdev)
+{
+        struct timespec now;
+        unsigned long elapsed_hours;
+        unsigned int err_count = atomic_read(&rdev->read_errors);
+        ktime_get_ts(&now);
+        /* An all-zero timestamp means no read error has been recorded yet. */
+        if (!(rdev->last_read_error.tv_sec || rdev->last_read_error.tv_nsec)) {
+                rdev->last_read_error = now;
+                return;
+        }
+        elapsed_hours = (now.tv_sec - rdev->last_read_error.tv_sec) / 3600;
+        rdev->last_read_error = now;
+        /*
+         * Halve the count once per elapsed hour.  When the hour count
+         * is >= the number of bits in err_count the shift would be out
+         * of range, so the count simply drops to 0.
+         */
+        err_count = (elapsed_hours < 8 * sizeof(err_count)) ?
+                        err_count >> elapsed_hours : 0;
+        atomic_set(&rdev->read_errors, err_count);
+}
+/*
 * This is a kernel thread which:
 *
 *      1.      Retries failed read operations on working mirrors.
@@ -1444,6 +1481,43 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
        int sect = 0; /* Offset from r10_bio->sector */
        int sectors = r10_bio->sectors;
        mdk_rdev_t*rdev;
+        int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
+        rcu_read_lock();
+        {
+                int d = r10_bio->devs[r10_bio->read_slot].devnum;
+                char b[BDEVNAME_SIZE];
+                int cur_read_error_count = 0;
+                rdev = rcu_dereference(conf->mirrors[d].rdev);
+                bdevname(rdev->bdev, b);
+                if (test_bit(Faulty, &rdev->flags)) {
+                        rcu_read_unlock();
+                        /* drive has already been failed, just ignore any
+                           more fix_read_error() attempts */
+                        return;
+                }
+                check_decay_read_errors(mddev, rdev);
+                atomic_inc(&rdev->read_errors);
+                cur_read_error_count = atomic_read(&rdev->read_errors);
+                if (cur_read_error_count > max_read_errors) {
+                        rcu_read_unlock();
+                        printk(KERN_NOTICE
+                               "raid10: %s: Raid device exceeded "
+                               "read_error threshold "
+                               "[cur %d:max %d]\n",
+                               b, cur_read_error_count, max_read_errors);
+                        printk(KERN_NOTICE
+                               "raid10: %s: Failing raid "
+                               "device\n", b);
+                        md_error(mddev, conf->mirrors[d].rdev);
+                        return;
+                }
+        }
+        rcu_read_unlock();
        while(sectors) {
                int s = sectors;
                int sl = r10_bio->read_slot;

diff --git a/drivers/md/md.c b/drivers/md/md.c index 859edbf8c9b0..f1b905a20133 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c
@@ -68,6 +68,12 @@ static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
68	#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }	68	#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
69		69
70	/*	70	/*
		71	* Default number of read corrections we'll attempt on an rdev
		72	* before ejecting it from the array. We divide the read error
		73	* count by 2 for every hour elapsed between read errors.
		74	*/
		75	#define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20
		76	/*
71	* Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'	77	* Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit'
72	* is 1000 KB/sec, so the extra system load does not show up that much.	78	* is 1000 KB/sec, so the extra system load does not show up that much.
73	* Increase it if you want to have more _guaranteed_ speed. Note that	79	* Increase it if you want to have more _guaranteed_ speed. Note that
@@ -2653,6 +2659,8 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
2653	rdev->flags = 0;	2659	rdev->flags = 0;
2654	rdev->data_offset = 0;	2660	rdev->data_offset = 0;
2655	rdev->sb_events = 0;	2661	rdev->sb_events = 0;
		2662	rdev->last_read_error.tv_sec = 0;
		2663	rdev->last_read_error.tv_nsec = 0;
2656	atomic_set(&rdev->nr_pending, 0);	2664	atomic_set(&rdev->nr_pending, 0);
2657	atomic_set(&rdev->read_errors, 0);	2665	atomic_set(&rdev->read_errors, 0);
2658	atomic_set(&rdev->corrected_errors, 0);	2666	atomic_set(&rdev->corrected_errors, 0);
@@ -3290,6 +3298,29 @@ static struct md_sysfs_entry md_array_state =
3290	__ATTR(array_state, S_IRUGO\|S_IWUSR, array_state_show, array_state_store);	3298	__ATTR(array_state, S_IRUGO\|S_IWUSR, array_state_show, array_state_store);
3291		3299
3292	static ssize_t	3300	static ssize_t
		3301	max_corrected_read_errors_show(mddev_t *mddev, char *page) {
		3302	return sprintf(page, "%d\n",
		3303	atomic_read(&mddev->max_corr_read_errors));
		3304	}
		3305
		3306	static ssize_t
		3307	max_corrected_read_errors_store(mddev_t *mddev, const char *buf, size_t len)
		3308	{
		3309	char *e;
		3310	unsigned long n = simple_strtoul(buf, &e, 10);
		3311
		3312	if (*buf && (*e == 0 \|\| *e == '\n')) {
		3313	atomic_set(&mddev->max_corr_read_errors, n);
		3314	return len;
		3315	}
		3316	return -EINVAL;
		3317	}
		3318
		3319	static struct md_sysfs_entry max_corr_read_errors =
		3320	__ATTR(max_read_errors, S_IRUGO\|S_IWUSR, max_corrected_read_errors_show,
		3321	max_corrected_read_errors_store);
		3322
		3323	static ssize_t
3293	null_show(mddev_t *mddev, char *page)	3324	null_show(mddev_t *mddev, char *page)
3294	{	3325	{
3295	return -EINVAL;	3326	return -EINVAL;
@@ -3914,6 +3945,7 @@ static struct attribute *md_default_attrs[] = {
3914	&md_array_state.attr,	3945	&md_array_state.attr,
3915	&md_reshape_position.attr,	3946	&md_reshape_position.attr,
3916	&md_array_size.attr,	3947	&md_array_size.attr,
		3948	&max_corr_read_errors.attr,
3917	NULL,	3949	NULL,
3918	};	3950	};
3919		3951
@@ -4333,6 +4365,8 @@ static int do_md_run(mddev_t * mddev)
4333	mddev->ro = 0;	4365	mddev->ro = 0;
4334		4366
4335	atomic_set(&mddev->writes_pending,0);	4367	atomic_set(&mddev->writes_pending,0);
		4368	atomic_set(&mddev->max_corr_read_errors,
		4369	MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
4336	mddev->safemode = 0;	4370	mddev->safemode = 0;
4337	mddev->safemode_timer.function = md_safemode_timeout;	4371	mddev->safemode_timer.function = md_safemode_timeout;
4338	mddev->safemode_timer.data = (unsigned long) mddev;	4372	mddev->safemode_timer.data = (unsigned long) mddev;


diff --git a/drivers/md/md.h b/drivers/md/md.h index d9138885b87f..8e4c75c00d46 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h
@@ -97,6 +97,9 @@ struct mdk_rdev_s
97	atomic_t read_errors; /* number of consecutive read errors that	97	atomic_t read_errors; /* number of consecutive read errors that
98	* we have tried to ignore.	98	* we have tried to ignore.
99	*/	99	*/
		100	struct timespec last_read_error; /* monotonic time since our
		101	* last read error
		102	*/
100	atomic_t corrected_errors; /* number of corrected read errors,	103	atomic_t corrected_errors; /* number of corrected read errors,
101	* for reporting to userspace and storing	104	* for reporting to userspace and storing
102	* in superblock.	105	* in superblock.
@@ -299,6 +302,7 @@ struct mddev_s
299	int external;	302	int external;
300	} bitmap_info;	303	} bitmap_info;
301		304
		305	atomic_t max_corr_read_errors; /* max read retries */
302	struct list_head all_mddevs;	306	struct list_head all_mddevs;
303		307
304	/* Generic barrier handling.	308	/* Generic barrier handling.


diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 670449f7411f..5c71a462c120 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c
@@ -1432,6 +1432,43 @@ static void recovery_request_write(mddev_t *mddev, r10bio_t *r10_bio)
1432		1432
1433		1433
1434	/*	1434	/*
		1435	* Used by fix_read_error() to decay the per rdev read_errors.
		1436	* We halve the read error count for every hour that has elapsed
		1437	* since the last recorded read error.
		1438	*
		1439	*/
		1440	static void check_decay_read_errors(mddev_t *mddev, mdk_rdev_t *rdev)
		1441	{
		1442	struct timespec cur_time_mon;
		1443	unsigned long hours_since_last;
		1444	unsigned int read_errors = atomic_read(&rdev->read_errors);
		1445
		1446	ktime_get_ts(&cur_time_mon);
		1447
		1448	if (rdev->last_read_error.tv_sec == 0 &&
		1449	rdev->last_read_error.tv_nsec == 0) {
		1450	/* first time we've seen a read error */
		1451	rdev->last_read_error = cur_time_mon;
		1452	return;
		1453	}
		1454
		1455	hours_since_last = (cur_time_mon.tv_sec -
		1456	rdev->last_read_error.tv_sec) / 3600;
		1457
		1458	rdev->last_read_error = cur_time_mon;
		1459
		1460	/*
		1461	* if hours_since_last is > the number of bits in read_errors
		1462	* just set read errors to 0. We do this to avoid
		1463	* overflowing the shift of read_errors by hours_since_last.
		1464	*/
		1465	if (hours_since_last >= 8 * sizeof(read_errors))
		1466	atomic_set(&rdev->read_errors, 0);
		1467	else
		1468	atomic_set(&rdev->read_errors, read_errors >> hours_since_last);
		1469	}
		1470
		1471	/*
1435	* This is a kernel thread which:	1472	* This is a kernel thread which:
1436	*	1473	*
1437	* 1. Retries failed read operations on working mirrors.	1474	* 1. Retries failed read operations on working mirrors.
@@ -1444,6 +1481,43 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
1444	int sect = 0; /* Offset from r10_bio->sector */	1481	int sect = 0; /* Offset from r10_bio->sector */
1445	int sectors = r10_bio->sectors;	1482	int sectors = r10_bio->sectors;
1446	mdk_rdev_t*rdev;	1483	mdk_rdev_t*rdev;
		1484	int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
		1485
		1486	rcu_read_lock();
		1487	{
		1488	int d = r10_bio->devs[r10_bio->read_slot].devnum;
		1489	char b[BDEVNAME_SIZE];
		1490	int cur_read_error_count = 0;
		1491
		1492	rdev = rcu_dereference(conf->mirrors[d].rdev);
		1493	bdevname(rdev->bdev, b);
		1494
		1495	if (test_bit(Faulty, &rdev->flags)) {
		1496	rcu_read_unlock();
		1497	/* drive has already been failed, just ignore any
		1498	more fix_read_error() attempts */
		1499	return;
		1500	}
		1501
		1502	check_decay_read_errors(mddev, rdev);
		1503	atomic_inc(&rdev->read_errors);
		1504	cur_read_error_count = atomic_read(&rdev->read_errors);
		1505	if (cur_read_error_count > max_read_errors) {
		1506	rcu_read_unlock();
		1507	printk(KERN_NOTICE
		1508	"raid10: %s: Raid device exceeded "
		1509	"read_error threshold "
		1510	"[cur %d:max %d]\n",
		1511	b, cur_read_error_count, max_read_errors);
		1512	printk(KERN_NOTICE
		1513	"raid10: %s: Failing raid "
		1514	"device\n", b);
		1515	md_error(mddev, conf->mirrors[d].rdev);
		1516	return;
		1517	}
		1518	}
		1519	rcu_read_unlock();
		1520
1447	while(sectors) {	1521	while(sectors) {
1448	int s = sectors;	1522	int s = sectors;
1449	int sl = r10_bio->read_slot;	1523	int sl = r10_bio->read_slot;