[PATCH] md: attempt to auto-correct read errors in raid1

On a read error we suspend the array, then synchronously read the block from the other devices in the array until we find one where we can read it. Then we try writing the good data back everywhere and make sure it works. If any write or subsequent read fails, only then do we fail the device out of the array.

To be able to suspend the array, we also need to keep track of how many requests are queued for handling by raid1d.

Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
author: NeilBrown <neilb@suse.de> 2006-01-06 03:20:19 -0500
committer: Linus Torvalds <torvalds@g5.osdl.org> 2006-01-06 11:34:03 -0500
commit: ddaf22abaa831763e75775e6d4c7693504237997 (patch)
tree: 4f6ba4cb056f8c7cea82c7d548769b879d0fb405
parent: d69762e98456b71167865db9e33e732a28dd36ab (diff)
3 files changed, 109 insertions, 10 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 64e7da3701a5..1364a1c97e6f 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -461,6 +461,7 @@ int sync_page_io(struct block_device *bdev, sector_t sector, int size,
        bio_put(bio);
        return ret;
 }
+EXPORT_SYMBOL(sync_page_io);
 static int read_disk_sb(mdk_rdev_t * rdev, int size)
 {
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index c618015f07f6..b3856db8d6c2 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -191,6 +191,7 @@ static void reschedule_retry(r1bio_t *r1_bio)
        spin_lock_irqsave(&conf->device_lock, flags);
        list_add(&r1_bio->retry_list, &conf->retry_list);
+        conf->nr_queued ++;
        spin_unlock_irqrestore(&conf->device_lock, flags);
        wake_up(&conf->wait_barrier);
@@ -245,9 +246,9 @@ static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int
        /*
         * this branch is our 'one mirror IO has finished' event handler:
         */
-        if (!uptodate)
+        update_head_pos(mirror, r1_bio);
-                md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
-        else
+        if (uptodate || conf->working_disks <= 1) {
                /*
                 * Set R1BIO_Uptodate in our master bio, so that
                 * we will return a good error code for to the higher
@@ -259,14 +260,8 @@ static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int
                 */
                set_bit(R1BIO_Uptodate, &r1_bio->state);
-        update_head_pos(mirror, r1_bio);
-        /*
-         * we have only one bio on the read side
-         */
-        if (uptodate)
                raid_end_bio_io(r1_bio);
-        else {
+        } else {
                /*
                 * oops, read error:
                 */
@@ -653,6 +648,32 @@ static void allow_barrier(conf_t *conf)
        wake_up(&conf->wait_barrier);
 }
+static void freeze_array(conf_t *conf)
+{
+        /* stop syncio and normal IO and wait for everything to
+         * go quiet.
+         * We increment barrier and nr_waiting, and then
+         * wait until barrier+nr_pending match nr_queued+2
+         */
+        spin_lock_irq(&conf->resync_lock);
+        conf->barrier++;
+        conf->nr_waiting++;
+        wait_event_lock_irq(conf->wait_barrier,
+                            conf->barrier+conf->nr_pending == conf->nr_queued+2,
+                            conf->resync_lock,
+                            raid1_unplug(conf->mddev->queue));
+        spin_unlock_irq(&conf->resync_lock);
+}
+static void unfreeze_array(conf_t *conf)
+{
+        /* reverse the effect of the freeze */
+        spin_lock_irq(&conf->resync_lock);
+        conf->barrier--;
+        conf->nr_waiting--;
+        wake_up(&conf->wait_barrier);
+        spin_unlock_irq(&conf->resync_lock);
+}
 /* duplicate the data pages for behind I/O */
 static struct page **alloc_behind_pages(struct bio *bio)
@@ -1196,6 +1217,7 @@ static void raid1d(mddev_t *mddev)
                        break;
                r1_bio = list_entry(head->prev, r1bio_t, retry_list);
                list_del(head->prev);
+                conf->nr_queued--;
                spin_unlock_irqrestore(&conf->device_lock, flags);
                mddev = r1_bio->mddev;
@@ -1235,6 +1257,74 @@ static void raid1d(mddev_t *mddev)
                                }
                } else {
                        int disk;
+                        /* we got a read error. Maybe the drive is bad.  Maybe just
+                         * the block and we can fix it.
+                         * We freeze all other IO, and try reading the block from
+                         * other devices.  When we find one, we re-write
+                         * and check that it fixes the read error.
+                         * This is all done synchronously while the array is
+                         * frozen
+                         */
+                        sector_t sect = r1_bio->sector;
+                        int sectors = r1_bio->sectors;
+                        freeze_array(conf);
+                        while(sectors) {
+                                int s = sectors;
+                                int d = r1_bio->read_disk;
+                                int success = 0;
+                                if (s > (PAGE_SIZE>>9))
+                                        s = PAGE_SIZE >> 9;
+                                do {
+                                        rdev = conf->mirrors[d].rdev;
+                                        if (rdev &&
+                                            test_bit(In_sync, &rdev->flags) &&
+                                            sync_page_io(rdev->bdev,
+                                                         sect + rdev->data_offset,
+                                                         s<<9,
+                                                         conf->tmppage, READ))
+                                                success = 1;
+                                        else {
+                                                d++;
+                                                if (d == conf->raid_disks)
+                                                        d = 0;
+                                        }
+                                } while (!success && d != r1_bio->read_disk);
+                                if (success) {
+                                        /* write it back and re-read */
+                                        while (d != r1_bio->read_disk) {
+                                                if (d==0)
+                                                        d = conf->raid_disks;
+                                                d--;
+                                                rdev = conf->mirrors[d].rdev;
+                                                if (rdev &&
+                                                    test_bit(In_sync, &rdev->flags)) {
+                                                        if (sync_page_io(rdev->bdev,
+                                                                         sect + rdev->data_offset,
+                                                                         s<<9, conf->tmppage, WRITE) == 0 ||
+                                                            sync_page_io(rdev->bdev,
+                                                                         sect + rdev->data_offset,
+                                                                         s<<9, conf->tmppage, READ) == 0) {
+                                                                /* Well, this device is dead */
+                                                                md_error(mddev, rdev);
+                                                        }
+                                                }
+                                        }
+                                } else {
+                                        /* Cannot read from anywhere -- bye bye array */
+                                        md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
+                                        break;
+                                }
+                                sectors -= s;
+                                sect += s;
+                        }
+                        unfreeze_array(conf);
                        bio = r1_bio->bios[r1_bio->read_disk];
                        if ((disk=read_balance(conf, r1_bio)) == -1) {
                                printk(KERN_ALERT "raid1: %s: unrecoverable I/O"
@@ -1529,6 +1619,10 @@ static int run(mddev_t *mddev)
        memset(conf->mirrors, 0, sizeof(struct mirror_info)*mddev->raid_disks);
+        conf->tmppage = alloc_page(GFP_KERNEL);
+        if (!conf->tmppage)
+                goto out_no_mem;
        conf->poolinfo = kmalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
        if (!conf->poolinfo)
                goto out_no_mem;
@@ -1635,6 +1729,7 @@ out_free_conf:
                if (conf->r1bio_pool)
                        mempool_destroy(conf->r1bio_pool);
                kfree(conf->mirrors);
+                __free_page(conf->tmppage);
                kfree(conf->poolinfo);
                kfree(conf);
                mddev->private = NULL;
diff --git a/include/linux/raid/raid1.h b/include/linux/raid/raid1.h
index c55674252533..cbe4238d3f9f 100644
--- a/include/linux/raid/raid1.h
+++ b/include/linux/raid/raid1.h
@@ -46,6 +46,7 @@ struct r1_private_data_s {
        spinlock_t              resync_lock;
        int                     nr_pending;
        int                     nr_waiting;
+        int                     nr_queued;
        int                     barrier;
        sector_t                next_resync;
        int                     fullsync;  /* set to 1 if a full sync is needed,
@@ -57,6 +58,8 @@ struct r1_private_data_s {
        struct pool_info        *poolinfo;
+        struct page             *tmppage;
        mempool_t *r1bio_pool;
        mempool_t *r1buf_pool;
 };
author	NeilBrown <neilb@suse.de>	2006-01-06 03:20:19 -0500
committer	Linus Torvalds <torvalds@g5.osdl.org>	2006-01-06 11:34:03 -0500
commit	ddaf22abaa831763e75775e6d4c7693504237997 (patch)
tree	4f6ba4cb056f8c7cea82c7d548769b879d0fb405
parent	d69762e98456b71167865db9e33e732a28dd36ab (diff)

diff --git a/drivers/md/md.c b/drivers/md/md.c index 64e7da3701a5..1364a1c97e6f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c
@@ -461,6 +461,7 @@ int sync_page_io(struct block_device *bdev, sector_t sector, int size,
461	bio_put(bio);	461	bio_put(bio);
462	return ret;	462	return ret;
463	}	463	}
		464	EXPORT_SYMBOL(sync_page_io);
464		465
465	static int read_disk_sb(mdk_rdev_t * rdev, int size)	466	static int read_disk_sb(mdk_rdev_t * rdev, int size)
466	{	467	{


diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index c618015f07f6..b3856db8d6c2 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c
@@ -191,6 +191,7 @@ static void reschedule_retry(r1bio_t *r1_bio)
191		191
192	spin_lock_irqsave(&conf->device_lock, flags);	192	spin_lock_irqsave(&conf->device_lock, flags);
193	list_add(&r1_bio->retry_list, &conf->retry_list);	193	list_add(&r1_bio->retry_list, &conf->retry_list);
		194	conf->nr_queued ++;
194	spin_unlock_irqrestore(&conf->device_lock, flags);	195	spin_unlock_irqrestore(&conf->device_lock, flags);
195		196
196	wake_up(&conf->wait_barrier);	197	wake_up(&conf->wait_barrier);
@@ -245,9 +246,9 @@ static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int
245	/*	246	/*
246	* this branch is our 'one mirror IO has finished' event handler:	247	* this branch is our 'one mirror IO has finished' event handler:
247	*/	248	*/
248	if (!uptodate)	249	update_head_pos(mirror, r1_bio);
249	md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);	250
250	else	251	if (uptodate || conf->working_disks <= 1) {
251	/*	252	/*
252	* Set R1BIO_Uptodate in our master bio, so that	253	* Set R1BIO_Uptodate in our master bio, so that
253	* we will return a good error code for to the higher	254	* we will return a good error code for to the higher
@@ -259,14 +260,8 @@ static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int
259	*/	260	*/
260	set_bit(R1BIO_Uptodate, &r1_bio->state);	261	set_bit(R1BIO_Uptodate, &r1_bio->state);
261		262
262	update_head_pos(mirror, r1_bio);
263
264	/*
265	* we have only one bio on the read side
266	*/
267	if (uptodate)
268	raid_end_bio_io(r1_bio);	263	raid_end_bio_io(r1_bio);
269	else {	264	} else {
270	/*	265	/*
271	* oops, read error:	266	* oops, read error:
272	*/	267	*/
@@ -653,6 +648,32 @@ static void allow_barrier(conf_t *conf)
653	wake_up(&conf->wait_barrier);	648	wake_up(&conf->wait_barrier);
654	}	649	}
655		650
		651	static void freeze_array(conf_t *conf)
		652	{
		653	/* stop syncio and normal IO and wait for everything to
		654	* go quiet.
		655	* We increment barrier and nr_waiting, and then
		656	* wait until barrier+nr_pending match nr_queued+2
		657	*/
		658	spin_lock_irq(&conf->resync_lock);
		659	conf->barrier++;
		660	conf->nr_waiting++;
		661	wait_event_lock_irq(conf->wait_barrier,
		662	conf->barrier+conf->nr_pending == conf->nr_queued+2,
		663	conf->resync_lock,
		664	raid1_unplug(conf->mddev->queue));
		665	spin_unlock_irq(&conf->resync_lock);
		666	}
		667	static void unfreeze_array(conf_t *conf)
		668	{
		669	/* reverse the effect of the freeze */
		670	spin_lock_irq(&conf->resync_lock);
		671	conf->barrier--;
		672	conf->nr_waiting--;
		673	wake_up(&conf->wait_barrier);
		674	spin_unlock_irq(&conf->resync_lock);
		675	}
		676
656		677
657	/* duplicate the data pages for behind I/O */	678	/* duplicate the data pages for behind I/O */
658	static struct page **alloc_behind_pages(struct bio *bio)	679	static struct page **alloc_behind_pages(struct bio *bio)
@@ -1196,6 +1217,7 @@ static void raid1d(mddev_t *mddev)
1196	break;	1217	break;
1197	r1_bio = list_entry(head->prev, r1bio_t, retry_list);	1218	r1_bio = list_entry(head->prev, r1bio_t, retry_list);
1198	list_del(head->prev);	1219	list_del(head->prev);
		1220	conf->nr_queued--;
1199	spin_unlock_irqrestore(&conf->device_lock, flags);	1221	spin_unlock_irqrestore(&conf->device_lock, flags);
1200		1222
1201	mddev = r1_bio->mddev;	1223	mddev = r1_bio->mddev;
@@ -1235,6 +1257,74 @@ static void raid1d(mddev_t *mddev)
1235	}	1257	}
1236	} else {	1258	} else {
1237	int disk;	1259	int disk;
		1260
		1261	/* we got a read error. Maybe the drive is bad. Maybe just
		1262	* the block and we can fix it.
		1263	* We freeze all other IO, and try reading the block from
		1264	* other devices. When we find one, we re-write
		1265	* and check it that fixes the read error.
		1266	* This is all done synchronously while the array is
		1267	* frozen
		1268	*/
		1269	sector_t sect = r1_bio->sector;
		1270	int sectors = r1_bio->sectors;
		1271	freeze_array(conf);
		1272	while(sectors) {
		1273	int s = sectors;
		1274	int d = r1_bio->read_disk;
		1275	int success = 0;
		1276
		1277	if (s > (PAGE_SIZE>>9))
		1278	s = PAGE_SIZE >> 9;
		1279
		1280	do {
		1281	rdev = conf->mirrors[d].rdev;
		1282	if (rdev &&
		1283	test_bit(In_sync, &rdev->flags) &&
		1284	sync_page_io(rdev->bdev,
		1285	sect + rdev->data_offset,
		1286	s<<9,
		1287	conf->tmppage, READ))
		1288	success = 1;
		1289	else {
		1290	d++;
		1291	if (d == conf->raid_disks)
		1292	d = 0;
		1293	}
		1294	} while (!success && d != r1_bio->read_disk);
		1295
		1296	if (success) {
		1297	/* write it back and re-read */
		1298	while (d != r1_bio->read_disk) {
		1299	if (d==0)
		1300	d = conf->raid_disks;
		1301	d--;
		1302	rdev = conf->mirrors[d].rdev;
		1303	if (rdev &&
		1304	test_bit(In_sync, &rdev->flags)) {
		1305	if (sync_page_io(rdev->bdev,
		1306	sect + rdev->data_offset,
		1307	s<<9, conf->tmppage, WRITE) == 0 ||
		1308	sync_page_io(rdev->bdev,
		1309	sect + rdev->data_offset,
		1310	s<<9, conf->tmppage, READ) == 0) {
		1311	/* Well, this device is dead */
		1312	md_error(mddev, rdev);
		1313	}
		1314	}
		1315	}
		1316	} else {
		1317	/* Cannot read from anywhere -- bye bye array */
		1318	md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
		1319	break;
		1320	}
		1321	sectors -= s;
		1322	sect += s;
		1323	}
		1324
		1325
		1326	unfreeze_array(conf);
		1327
1238	bio = r1_bio->bios[r1_bio->read_disk];	1328	bio = r1_bio->bios[r1_bio->read_disk];
1239	if ((disk=read_balance(conf, r1_bio)) == -1) {	1329	if ((disk=read_balance(conf, r1_bio)) == -1) {
1240	printk(KERN_ALERT "raid1: %s: unrecoverable I/O"	1330	printk(KERN_ALERT "raid1: %s: unrecoverable I/O"
@@ -1529,6 +1619,10 @@ static int run(mddev_t *mddev)
1529		1619
1530	memset(conf->mirrors, 0, sizeof(struct mirror_info)*mddev->raid_disks);	1620	memset(conf->mirrors, 0, sizeof(struct mirror_info)*mddev->raid_disks);
1531		1621
		1622	conf->tmppage = alloc_page(GFP_KERNEL);
		1623	if (!conf->tmppage)
		1624	goto out_no_mem;
		1625
1532	conf->poolinfo = kmalloc(sizeof(*conf->poolinfo), GFP_KERNEL);	1626	conf->poolinfo = kmalloc(sizeof(*conf->poolinfo), GFP_KERNEL);
1533	if (!conf->poolinfo)	1627	if (!conf->poolinfo)
1534	goto out_no_mem;	1628	goto out_no_mem;
@@ -1635,6 +1729,7 @@ out_free_conf:
1635	if (conf->r1bio_pool)	1729	if (conf->r1bio_pool)
1636	mempool_destroy(conf->r1bio_pool);	1730	mempool_destroy(conf->r1bio_pool);
1637	kfree(conf->mirrors);	1731	kfree(conf->mirrors);
		1732	__free_page(conf->tmppage);
1638	kfree(conf->poolinfo);	1733	kfree(conf->poolinfo);
1639	kfree(conf);	1734	kfree(conf);
1640	mddev->private = NULL;	1735	mddev->private = NULL;


diff --git a/include/linux/raid/raid1.h b/include/linux/raid/raid1.h index c55674252533..cbe4238d3f9f 100644 --- a/include/linux/raid/raid1.h +++ b/include/linux/raid/raid1.h
@@ -46,6 +46,7 @@ struct r1_private_data_s {
46	spinlock_t resync_lock;	46	spinlock_t resync_lock;
47	int nr_pending;	47	int nr_pending;
48	int nr_waiting;	48	int nr_waiting;
		49	int nr_queued;
49	int barrier;	50	int barrier;
50	sector_t next_resync;	51	sector_t next_resync;
51	int fullsync; /* set to 1 if a full sync is needed,	52	int fullsync; /* set to 1 if a full sync is needed,
@@ -57,6 +58,8 @@ struct r1_private_data_s {
57		58
58	struct pool_info *poolinfo;	59	struct pool_info *poolinfo;
59		60
		61	struct page *tmppage;
		62
60	mempool_t *r1bio_pool;	63	mempool_t *r1bio_pool;
61	mempool_t *r1buf_pool;	64	mempool_t *r1buf_pool;
62	};	65	};