diff options
-rw-r--r-- | drivers/md/md.c | 1 | ||||
-rw-r--r-- | drivers/md/raid1.c | 115 | ||||
-rw-r--r-- | include/linux/raid/raid1.h | 3 |
3 files changed, 109 insertions, 10 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c index 64e7da3701a5..1364a1c97e6f 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
@@ -461,6 +461,7 @@ int sync_page_io(struct block_device *bdev, sector_t sector, int size, | |||
461 | bio_put(bio); | 461 | bio_put(bio); |
462 | return ret; | 462 | return ret; |
463 | } | 463 | } |
464 | EXPORT_SYMBOL(sync_page_io); | ||
464 | 465 | ||
465 | static int read_disk_sb(mdk_rdev_t * rdev, int size) | 466 | static int read_disk_sb(mdk_rdev_t * rdev, int size) |
466 | { | 467 | { |
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index c618015f07f6..b3856db8d6c2 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c | |||
@@ -191,6 +191,7 @@ static void reschedule_retry(r1bio_t *r1_bio) | |||
191 | 191 | ||
192 | spin_lock_irqsave(&conf->device_lock, flags); | 192 | spin_lock_irqsave(&conf->device_lock, flags); |
193 | list_add(&r1_bio->retry_list, &conf->retry_list); | 193 | list_add(&r1_bio->retry_list, &conf->retry_list); |
194 | conf->nr_queued ++; | ||
194 | spin_unlock_irqrestore(&conf->device_lock, flags); | 195 | spin_unlock_irqrestore(&conf->device_lock, flags); |
195 | 196 | ||
196 | wake_up(&conf->wait_barrier); | 197 | wake_up(&conf->wait_barrier); |
@@ -245,9 +246,9 @@ static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int | |||
245 | /* | 246 | /* |
246 | * this branch is our 'one mirror IO has finished' event handler: | 247 | * this branch is our 'one mirror IO has finished' event handler: |
247 | */ | 248 | */ |
248 | if (!uptodate) | 249 | update_head_pos(mirror, r1_bio); |
249 | md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); | 250 | |
250 | else | 251 | if (uptodate || conf->working_disks <= 1) { |
251 | /* | 252 | /* |
252 | * Set R1BIO_Uptodate in our master bio, so that | 253 | * Set R1BIO_Uptodate in our master bio, so that |
253 | * we will return a good error code for to the higher | 254 | * we will return a good error code for to the higher |
@@ -259,14 +260,8 @@ static int raid1_end_read_request(struct bio *bio, unsigned int bytes_done, int | |||
259 | */ | 260 | */ |
260 | set_bit(R1BIO_Uptodate, &r1_bio->state); | 261 | set_bit(R1BIO_Uptodate, &r1_bio->state); |
261 | 262 | ||
262 | update_head_pos(mirror, r1_bio); | ||
263 | |||
264 | /* | ||
265 | * we have only one bio on the read side | ||
266 | */ | ||
267 | if (uptodate) | ||
268 | raid_end_bio_io(r1_bio); | 263 | raid_end_bio_io(r1_bio); |
269 | else { | 264 | } else { |
270 | /* | 265 | /* |
271 | * oops, read error: | 266 | * oops, read error: |
272 | */ | 267 | */ |
@@ -653,6 +648,32 @@ static void allow_barrier(conf_t *conf) | |||
653 | wake_up(&conf->wait_barrier); | 648 | wake_up(&conf->wait_barrier); |
654 | } | 649 | } |
655 | 650 | ||
651 | static void freeze_array(conf_t *conf) | ||
652 | { | ||
653 | /* stop syncio and normal IO and wait for everything to | ||
654 | * go quiet. | ||
655 | * We increment barrier and nr_waiting, and then | ||
656 | * wait until barrier+nr_pending match nr_queued+2 | ||
657 | */ | ||
658 | spin_lock_irq(&conf->resync_lock); | ||
659 | conf->barrier++; | ||
660 | conf->nr_waiting++; | ||
661 | wait_event_lock_irq(conf->wait_barrier, | ||
662 | conf->barrier+conf->nr_pending == conf->nr_queued+2, | ||
663 | conf->resync_lock, | ||
664 | raid1_unplug(conf->mddev->queue)); | ||
665 | spin_unlock_irq(&conf->resync_lock); | ||
666 | } | ||
667 | static void unfreeze_array(conf_t *conf) | ||
668 | { | ||
669 | /* reverse the effect of the freeze */ | ||
670 | spin_lock_irq(&conf->resync_lock); | ||
671 | conf->barrier--; | ||
672 | conf->nr_waiting--; | ||
673 | wake_up(&conf->wait_barrier); | ||
674 | spin_unlock_irq(&conf->resync_lock); | ||
675 | } | ||
676 | |||
656 | 677 | ||
657 | /* duplicate the data pages for behind I/O */ | 678 | /* duplicate the data pages for behind I/O */ |
658 | static struct page **alloc_behind_pages(struct bio *bio) | 679 | static struct page **alloc_behind_pages(struct bio *bio) |
@@ -1196,6 +1217,7 @@ static void raid1d(mddev_t *mddev) | |||
1196 | break; | 1217 | break; |
1197 | r1_bio = list_entry(head->prev, r1bio_t, retry_list); | 1218 | r1_bio = list_entry(head->prev, r1bio_t, retry_list); |
1198 | list_del(head->prev); | 1219 | list_del(head->prev); |
1220 | conf->nr_queued--; | ||
1199 | spin_unlock_irqrestore(&conf->device_lock, flags); | 1221 | spin_unlock_irqrestore(&conf->device_lock, flags); |
1200 | 1222 | ||
1201 | mddev = r1_bio->mddev; | 1223 | mddev = r1_bio->mddev; |
@@ -1235,6 +1257,74 @@ static void raid1d(mddev_t *mddev) | |||
1235 | } | 1257 | } |
1236 | } else { | 1258 | } else { |
1237 | int disk; | 1259 | int disk; |
1260 | |||
1261 | /* we got a read error. Maybe the drive is bad. Maybe just | ||
1262 | * the block and we can fix it. | ||
1263 | * We freeze all other IO, and try reading the block from | ||
1264 | * other devices. When we find one, we re-write | ||
1265 | * it and check that this fixes the read error. | ||
1266 | * This is all done synchronously while the array is | ||
1267 | * frozen | ||
1268 | */ | ||
1269 | sector_t sect = r1_bio->sector; | ||
1270 | int sectors = r1_bio->sectors; | ||
1271 | freeze_array(conf); | ||
1272 | while(sectors) { | ||
1273 | int s = sectors; | ||
1274 | int d = r1_bio->read_disk; | ||
1275 | int success = 0; | ||
1276 | |||
1277 | if (s > (PAGE_SIZE>>9)) | ||
1278 | s = PAGE_SIZE >> 9; | ||
1279 | |||
1280 | do { | ||
1281 | rdev = conf->mirrors[d].rdev; | ||
1282 | if (rdev && | ||
1283 | test_bit(In_sync, &rdev->flags) && | ||
1284 | sync_page_io(rdev->bdev, | ||
1285 | sect + rdev->data_offset, | ||
1286 | s<<9, | ||
1287 | conf->tmppage, READ)) | ||
1288 | success = 1; | ||
1289 | else { | ||
1290 | d++; | ||
1291 | if (d == conf->raid_disks) | ||
1292 | d = 0; | ||
1293 | } | ||
1294 | } while (!success && d != r1_bio->read_disk); | ||
1295 | |||
1296 | if (success) { | ||
1297 | /* write it back and re-read */ | ||
1298 | while (d != r1_bio->read_disk) { | ||
1299 | if (d==0) | ||
1300 | d = conf->raid_disks; | ||
1301 | d--; | ||
1302 | rdev = conf->mirrors[d].rdev; | ||
1303 | if (rdev && | ||
1304 | test_bit(In_sync, &rdev->flags)) { | ||
1305 | if (sync_page_io(rdev->bdev, | ||
1306 | sect + rdev->data_offset, | ||
1307 | s<<9, conf->tmppage, WRITE) == 0 || | ||
1308 | sync_page_io(rdev->bdev, | ||
1309 | sect + rdev->data_offset, | ||
1310 | s<<9, conf->tmppage, READ) == 0) { | ||
1311 | /* Well, this device is dead */ | ||
1312 | md_error(mddev, rdev); | ||
1313 | } | ||
1314 | } | ||
1315 | } | ||
1316 | } else { | ||
1317 | /* Cannot read from anywhere -- bye bye array */ | ||
1318 | md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); | ||
1319 | break; | ||
1320 | } | ||
1321 | sectors -= s; | ||
1322 | sect += s; | ||
1323 | } | ||
1324 | |||
1325 | |||
1326 | unfreeze_array(conf); | ||
1327 | |||
1238 | bio = r1_bio->bios[r1_bio->read_disk]; | 1328 | bio = r1_bio->bios[r1_bio->read_disk]; |
1239 | if ((disk=read_balance(conf, r1_bio)) == -1) { | 1329 | if ((disk=read_balance(conf, r1_bio)) == -1) { |
1240 | printk(KERN_ALERT "raid1: %s: unrecoverable I/O" | 1330 | printk(KERN_ALERT "raid1: %s: unrecoverable I/O" |
@@ -1529,6 +1619,10 @@ static int run(mddev_t *mddev) | |||
1529 | 1619 | ||
1530 | memset(conf->mirrors, 0, sizeof(struct mirror_info)*mddev->raid_disks); | 1620 | memset(conf->mirrors, 0, sizeof(struct mirror_info)*mddev->raid_disks); |
1531 | 1621 | ||
1622 | conf->tmppage = alloc_page(GFP_KERNEL); | ||
1623 | if (!conf->tmppage) | ||
1624 | goto out_no_mem; | ||
1625 | |||
1532 | conf->poolinfo = kmalloc(sizeof(*conf->poolinfo), GFP_KERNEL); | 1626 | conf->poolinfo = kmalloc(sizeof(*conf->poolinfo), GFP_KERNEL); |
1533 | if (!conf->poolinfo) | 1627 | if (!conf->poolinfo) |
1534 | goto out_no_mem; | 1628 | goto out_no_mem; |
@@ -1635,6 +1729,7 @@ out_free_conf: | |||
1635 | if (conf->r1bio_pool) | 1729 | if (conf->r1bio_pool) |
1636 | mempool_destroy(conf->r1bio_pool); | 1730 | mempool_destroy(conf->r1bio_pool); |
1637 | kfree(conf->mirrors); | 1731 | kfree(conf->mirrors); |
1732 | __free_page(conf->tmppage); | ||
1638 | kfree(conf->poolinfo); | 1733 | kfree(conf->poolinfo); |
1639 | kfree(conf); | 1734 | kfree(conf); |
1640 | mddev->private = NULL; | 1735 | mddev->private = NULL; |
diff --git a/include/linux/raid/raid1.h b/include/linux/raid/raid1.h index c55674252533..cbe4238d3f9f 100644 --- a/include/linux/raid/raid1.h +++ b/include/linux/raid/raid1.h | |||
@@ -46,6 +46,7 @@ struct r1_private_data_s { | |||
46 | spinlock_t resync_lock; | 46 | spinlock_t resync_lock; |
47 | int nr_pending; | 47 | int nr_pending; |
48 | int nr_waiting; | 48 | int nr_waiting; |
49 | int nr_queued; | ||
49 | int barrier; | 50 | int barrier; |
50 | sector_t next_resync; | 51 | sector_t next_resync; |
51 | int fullsync; /* set to 1 if a full sync is needed, | 52 | int fullsync; /* set to 1 if a full sync is needed, |
@@ -57,6 +58,8 @@ struct r1_private_data_s { | |||
57 | 58 | ||
58 | struct pool_info *poolinfo; | 59 | struct pool_info *poolinfo; |
59 | 60 | ||
61 | struct page *tmppage; | ||
62 | |||
60 | mempool_t *r1bio_pool; | 63 | mempool_t *r1bio_pool; |
61 | mempool_t *r1buf_pool; | 64 | mempool_t *r1buf_pool; |
62 | }; | 65 | }; |