[PATCH] md: auto-correct correctable read errors in raid10

Largely just a cross-port from raid1.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
author: NeilBrown <neilb@suse.de> 2006-01-06 03:20:28 -0500
committer: Linus Torvalds <torvalds@g5.osdl.org> 2006-01-06 11:34:05 -0500
commit: 4443ae10ca15d07922ceda622f03db8865fa3d13 (patch)
tree: f1f0a6a82142effbdde93913d53596aeeacc9dc4 /drivers
parent: 220946c9018de74b952446e3a4dff1bfd4cbf310 (diff)
1 files changed, 112 insertions, 15 deletions
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 1fa70c34b7d2..64bb4ddc6798 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -209,6 +209,7 @@ static void reschedule_retry(r10bio_t *r10_bio)
        spin_lock_irqsave(&conf->device_lock, flags);
        list_add(&r10_bio->retry_list, &conf->retry_list);
+        conf->nr_queued ++;
        spin_unlock_irqrestore(&conf->device_lock, flags);
        md_wakeup_thread(mddev->thread);
@@ -254,9 +255,9 @@ static int raid10_end_read_request(struct bio *bio, unsigned int bytes_done, int
        /*
         * this branch is our 'one mirror IO has finished' event handler:
         */
-        if (!uptodate)
+        update_head_pos(slot, r10_bio);
-                md_error(r10_bio->mddev, conf->mirrors[dev].rdev);
-        else
+        if (uptodate) {
                /*
                 * Set R10BIO_Uptodate in our master bio, so that
                 * we will return a good error code to the higher
@@ -267,15 +268,8 @@ static int raid10_end_read_request(struct bio *bio, unsigned int bytes_done, int
                 * wait for the 'master' bio.
                 */
                set_bit(R10BIO_Uptodate, &r10_bio->state);
-        update_head_pos(slot, r10_bio);
-        /*
-         * we have only one bio on the read side
-         */
-        if (uptodate)
                raid_end_bio_io(r10_bio);
-        else {
+        } else {
                /*
                 * oops, read error:
                 */
@@ -714,6 +708,33 @@ static void allow_barrier(conf_t *conf)
        wake_up(&conf->wait_barrier);
 }
+static void freeze_array(conf_t *conf)
+{
+        /* stop syncio and normal IO and wait for everything to
+         * go quiet.
+         * We increment barrier and nr_waiting, and then
+         * wait until barrier+nr_pending match nr_queued+2
+         */
+        spin_lock_irq(&conf->resync_lock);
+        conf->barrier++;
+        conf->nr_waiting++;
+        wait_event_lock_irq(conf->wait_barrier,
+                            conf->barrier+conf->nr_pending == conf->nr_queued+2,
+                            conf->resync_lock,
+                            raid10_unplug(conf->mddev->queue));
+        spin_unlock_irq(&conf->resync_lock);
+}
+static void unfreeze_array(conf_t *conf)
+{
+        /* reverse the effect of the freeze */
+        spin_lock_irq(&conf->resync_lock);
+        conf->barrier--;
+        conf->nr_waiting--;
+        wake_up(&conf->wait_barrier);
+        spin_unlock_irq(&conf->resync_lock);
+}
 static int make_request(request_queue_t *q, struct bio * bio)
 {
        mddev_t *mddev = q->queuedata;
@@ -1338,6 +1359,7 @@ static void raid10d(mddev_t *mddev)
                        break;
                r10_bio = list_entry(head->prev, r10bio_t, retry_list);
                list_del(head->prev);
+                conf->nr_queued--;
                spin_unlock_irqrestore(&conf->device_lock, flags);
                mddev = r10_bio->mddev;
@@ -1350,6 +1372,78 @@ static void raid10d(mddev_t *mddev)
                        unplug = 1;
                } else {
                        int mirror;
+                        /* we got a read error. Maybe the drive is bad.  Maybe just
+                         * the block and we can fix it.
+                         * We freeze all other IO, and try reading the block from
+                         * other devices.  When we find one, we re-write
+                         * and check if that fixes the read error.
+                         * This is all done synchronously while the array is
+                         * frozen.
+                         */
+                        int sect = 0; /* Offset from r10_bio->sector */
+                        int sectors = r10_bio->sectors;
+                        freeze_array(conf);
+                        if (mddev->ro == 0) while(sectors) {
+                                int s = sectors;
+                                int sl = r10_bio->read_slot;
+                                int success = 0;
+                                if (s > (PAGE_SIZE>>9))
+                                        s = PAGE_SIZE >> 9;
+                                do {
+                                        int d = r10_bio->devs[sl].devnum;
+                                        rdev = conf->mirrors[d].rdev;
+                                        if (rdev &&
+                                            test_bit(In_sync, &rdev->flags) &&
+                                            sync_page_io(rdev->bdev,
+                                                         r10_bio->devs[sl].addr +
+                                                         sect + rdev->data_offset,
+                                                         s<<9,
+                                                         conf->tmppage, READ))
+                                                success = 1;
+                                        else {
+                                                sl++;
+                                                if (sl == conf->copies)
+                                                        sl = 0;
+                                        }
+                                } while (!success && sl != r10_bio->read_slot);
+                                if (success) {
+                                        /* write it back and re-read */
+                                        while (sl != r10_bio->read_slot) {
+                                                int d;
+                                                if (sl==0)
+                                                        sl = conf->copies;
+                                                sl--;
+                                                d = r10_bio->devs[sl].devnum;
+                                                rdev = conf->mirrors[d].rdev;
+                                                if (rdev &&
+                                                    test_bit(In_sync, &rdev->flags)) {
+                                                        if (sync_page_io(rdev->bdev,
+                                                                         r10_bio->devs[sl].addr +
+                                                                         sect + rdev->data_offset,
+                                                                         s<<9, conf->tmppage, WRITE) == 0 ||
+                                                            sync_page_io(rdev->bdev,
+                                                                         r10_bio->devs[sl].addr +
+                                                                         sect + rdev->data_offset,
+                                                                         s<<9, conf->tmppage, READ) == 0) {
+                                                                /* Well, this device is dead */
+                                                                md_error(mddev, rdev);
+                                                        }
+                                                }
+                                        }
+                                } else {
+                                        /* Cannot read from anywhere -- bye bye array */
+                                        md_error(mddev, conf->mirrors[r10_bio->devs[r10_bio->read_slot].devnum].rdev);
+                                        break;
+                                }
+                                sectors -= s;
+                                sect += s;
+                        }
+                        unfreeze_array(conf);
                        bio = r10_bio->devs[r10_bio->read_slot].bio;
                        r10_bio->devs[r10_bio->read_slot].bio = NULL;
                        bio_put(bio);
@@ -1793,22 +1887,24 @@ static int run(mddev_t *mddev)
         * bookkeeping area. [whatever we allocate in run(),
         * should be freed in stop()]
         */
-        conf = kmalloc(sizeof(conf_t), GFP_KERNEL);
+        conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
        mddev->private = conf;
        if (!conf) {
                printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
                        mdname(mddev));
                goto out;
        }
-        memset(conf, 0, sizeof(*conf));
+        conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
-        conf->mirrors = kmalloc(sizeof(struct mirror_info)*mddev->raid_disks,
                                 GFP_KERNEL);
        if (!conf->mirrors) {
                printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
                       mdname(mddev));
                goto out_free_conf;
        }
-        memset(conf->mirrors, 0, sizeof(struct mirror_info)*mddev->raid_disks);
+        conf->tmppage = alloc_page(GFP_KERNEL);
+        if (!conf->tmppage)
+                goto out_free_conf;
        conf->near_copies = nc;
        conf->far_copies = fc;
@@ -1918,6 +2014,7 @@ static int run(mddev_t *mddev)
 out_free_conf:
        if (conf->r10bio_pool)
                mempool_destroy(conf->r10bio_pool);
+        put_page(conf->tmppage);
        kfree(conf->mirrors);
        kfree(conf);
        mddev->private = NULL;
author	NeilBrown <neilb@suse.de>	2006-01-06 03:20:28 -0500
committer	Linus Torvalds <torvalds@g5.osdl.org>	2006-01-06 11:34:05 -0500
commit	4443ae10ca15d07922ceda622f03db8865fa3d13 (patch)
tree	f1f0a6a82142effbdde93913d53596aeeacc9dc4 /drivers
parent	220946c9018de74b952446e3a4dff1bfd4cbf310 (diff)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 1fa70c34b7d2..64bb4ddc6798 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -209,6 +209,7 @@ static void reschedule_retry(r10bio_t *r10_bio)
209		209
210	spin_lock_irqsave(&conf->device_lock, flags);	210	spin_lock_irqsave(&conf->device_lock, flags);
211	list_add(&r10_bio->retry_list, &conf->retry_list);	211	list_add(&r10_bio->retry_list, &conf->retry_list);
		212	conf->nr_queued ++;
212	spin_unlock_irqrestore(&conf->device_lock, flags);	213	spin_unlock_irqrestore(&conf->device_lock, flags);
213		214
214	md_wakeup_thread(mddev->thread);	215	md_wakeup_thread(mddev->thread);
@@ -254,9 +255,9 @@ static int raid10_end_read_request(struct bio *bio, unsigned int bytes_done, int
254	/*	255	/*
255	* this branch is our 'one mirror IO has finished' event handler:	256	* this branch is our 'one mirror IO has finished' event handler:
256	*/	257	*/
257	if (!uptodate)	258	update_head_pos(slot, r10_bio);
258	md_error(r10_bio->mddev, conf->mirrors[dev].rdev);	259
259	else	260	if (uptodate) {
260	/*	261	/*
261	* Set R10BIO_Uptodate in our master bio, so that	262	* Set R10BIO_Uptodate in our master bio, so that
262	* we will return a good error code to the higher	263	* we will return a good error code to the higher
@@ -267,15 +268,8 @@ static int raid10_end_read_request(struct bio *bio, unsigned int bytes_done, int
267	* wait for the 'master' bio.	268	* wait for the 'master' bio.
268	*/	269	*/
269	set_bit(R10BIO_Uptodate, &r10_bio->state);	270	set_bit(R10BIO_Uptodate, &r10_bio->state);
270
271	update_head_pos(slot, r10_bio);
272
273	/*
274	* we have only one bio on the read side
275	*/
276	if (uptodate)
277	raid_end_bio_io(r10_bio);	271	raid_end_bio_io(r10_bio);
278	else {	272	} else {
279	/*	273	/*
280	* oops, read error:	274	* oops, read error:
281	*/	275	*/
@@ -714,6 +708,33 @@ static void allow_barrier(conf_t *conf)
714	wake_up(&conf->wait_barrier);	708	wake_up(&conf->wait_barrier);
715	}	709	}
716		710
		711	static void freeze_array(conf_t *conf)
		712	{
		713	/* stop syncio and normal IO and wait for everything to
		714	* go quiet.
		715	* We increment barrier and nr_waiting, and then
		716	* wait until barrier+nr_pending match nr_queued+2
		717	*/
		718	spin_lock_irq(&conf->resync_lock);
		719	conf->barrier++;
		720	conf->nr_waiting++;
		721	wait_event_lock_irq(conf->wait_barrier,
		722	conf->barrier+conf->nr_pending == conf->nr_queued+2,
		723	conf->resync_lock,
		724	raid10_unplug(conf->mddev->queue));
		725	spin_unlock_irq(&conf->resync_lock);
		726	}
		727
		728	static void unfreeze_array(conf_t *conf)
		729	{
		730	/* reverse the effect of the freeze */
		731	spin_lock_irq(&conf->resync_lock);
		732	conf->barrier--;
		733	conf->nr_waiting--;
		734	wake_up(&conf->wait_barrier);
		735	spin_unlock_irq(&conf->resync_lock);
		736	}
		737
717	static int make_request(request_queue_t *q, struct bio * bio)	738	static int make_request(request_queue_t *q, struct bio * bio)
718	{	739	{
719	mddev_t *mddev = q->queuedata;	740	mddev_t *mddev = q->queuedata;
@@ -1338,6 +1359,7 @@ static void raid10d(mddev_t *mddev)
1338	break;	1359	break;
1339	r10_bio = list_entry(head->prev, r10bio_t, retry_list);	1360	r10_bio = list_entry(head->prev, r10bio_t, retry_list);
1340	list_del(head->prev);	1361	list_del(head->prev);
		1362	conf->nr_queued--;
1341	spin_unlock_irqrestore(&conf->device_lock, flags);	1363	spin_unlock_irqrestore(&conf->device_lock, flags);
1342		1364
1343	mddev = r10_bio->mddev;	1365	mddev = r10_bio->mddev;
@@ -1350,6 +1372,78 @@ static void raid10d(mddev_t *mddev)
1350	unplug = 1;	1372	unplug = 1;
1351	} else {	1373	} else {
1352	int mirror;	1374	int mirror;
		1375	/* we got a read error. Maybe the drive is bad. Maybe just
		1376	* the block and we can fix it.
		1377	* We freeze all other IO, and try reading the block from
		1378	* other devices. When we find one, we re-write
		1379	* and check if that fixes the read error.
		1380	* This is all done synchronously while the array is
		1381	* frozen.
		1382	*/
		1383	int sect = 0; /* Offset from r10_bio->sector */
		1384	int sectors = r10_bio->sectors;
		1385	freeze_array(conf);
		1386	if (mddev->ro == 0) while(sectors) {
		1387	int s = sectors;
		1388	int sl = r10_bio->read_slot;
		1389	int success = 0;
		1390
		1391	if (s > (PAGE_SIZE>>9))
		1392	s = PAGE_SIZE >> 9;
		1393
		1394	do {
		1395	int d = r10_bio->devs[sl].devnum;
		1396	rdev = conf->mirrors[d].rdev;
		1397	if (rdev &&
		1398	test_bit(In_sync, &rdev->flags) &&
		1399	sync_page_io(rdev->bdev,
		1400	r10_bio->devs[sl].addr +
		1401	sect + rdev->data_offset,
		1402	s<<9,
		1403	conf->tmppage, READ))
		1404	success = 1;
		1405	else {
		1406	sl++;
		1407	if (sl == conf->copies)
		1408	sl = 0;
		1409	}
		1410	} while (!success && sl != r10_bio->read_slot);
		1411
		1412	if (success) {
		1413	/* write it back and re-read */
		1414	while (sl != r10_bio->read_slot) {
		1415	int d;
		1416	if (sl==0)
		1417	sl = conf->copies;
		1418	sl--;
		1419	d = r10_bio->devs[sl].devnum;
		1420	rdev = conf->mirrors[d].rdev;
		1421	if (rdev &&
		1422	test_bit(In_sync, &rdev->flags)) {
		1423	if (sync_page_io(rdev->bdev,
		1424	r10_bio->devs[sl].addr +
		1425	sect + rdev->data_offset,
		1426	s<<9, conf->tmppage, WRITE) == 0 ||
		1427	sync_page_io(rdev->bdev,
		1428	r10_bio->devs[sl].addr +
		1429	sect + rdev->data_offset,
		1430	s<<9, conf->tmppage, READ) == 0) {
		1431	/* Well, this device is dead */
		1432	md_error(mddev, rdev);
		1433	}
		1434	}
		1435	}
		1436	} else {
		1437	/* Cannot read from anywhere -- bye bye array */
		1438	md_error(mddev, conf->mirrors[r10_bio->devs[r10_bio->read_slot].devnum].rdev);
		1439	break;
		1440	}
		1441	sectors -= s;
		1442	sect += s;
		1443	}
		1444
		1445	unfreeze_array(conf);
		1446
1353	bio = r10_bio->devs[r10_bio->read_slot].bio;	1447	bio = r10_bio->devs[r10_bio->read_slot].bio;
1354	r10_bio->devs[r10_bio->read_slot].bio = NULL;	1448	r10_bio->devs[r10_bio->read_slot].bio = NULL;
1355	bio_put(bio);	1449	bio_put(bio);
@@ -1793,22 +1887,24 @@ static int run(mddev_t *mddev)
1793	* bookkeeping area. [whatever we allocate in run(),	1887	* bookkeeping area. [whatever we allocate in run(),
1794	* should be freed in stop()]	1888	* should be freed in stop()]
1795	*/	1889	*/
1796	conf = kmalloc(sizeof(conf_t), GFP_KERNEL);	1890	conf = kzalloc(sizeof(conf_t), GFP_KERNEL);
1797	mddev->private = conf;	1891	mddev->private = conf;
1798	if (!conf) {	1892	if (!conf) {
1799	printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",	1893	printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
1800	mdname(mddev));	1894	mdname(mddev));
1801	goto out;	1895	goto out;
1802	}	1896	}
1803	memset(conf, 0, sizeof(*conf));	1897	conf->mirrors = kzalloc(sizeof(struct mirror_info)*mddev->raid_disks,
1804	conf->mirrors = kmalloc(sizeof(struct mirror_info)*mddev->raid_disks,
1805	GFP_KERNEL);	1898	GFP_KERNEL);
1806	if (!conf->mirrors) {	1899	if (!conf->mirrors) {
1807	printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",	1900	printk(KERN_ERR "raid10: couldn't allocate memory for %s\n",
1808	mdname(mddev));	1901	mdname(mddev));
1809	goto out_free_conf;	1902	goto out_free_conf;
1810	}	1903	}
1811	memset(conf->mirrors, 0, sizeof(struct mirror_info)*mddev->raid_disks);	1904
		1905	conf->tmppage = alloc_page(GFP_KERNEL);
		1906	if (!conf->tmppage)
		1907	goto out_free_conf;
1812		1908
1813	conf->near_copies = nc;	1909	conf->near_copies = nc;
1814	conf->far_copies = fc;	1910	conf->far_copies = fc;
@@ -1918,6 +2014,7 @@ static int run(mddev_t *mddev)
1918	out_free_conf:	2014	out_free_conf:
1919	if (conf->r10bio_pool)	2015	if (conf->r10bio_pool)
1920	mempool_destroy(conf->r10bio_pool);	2016	mempool_destroy(conf->r10bio_pool);
		2017	put_page(conf->tmppage);
1921	kfree(conf->mirrors);	2018	kfree(conf->mirrors);
1922	kfree(conf);	2019	kfree(conf);
1923	mddev->private = NULL;	2020	mddev->private = NULL;