[PATCH] md: support BIO_RW_BARRIER for md/raid1

We can only accept BARRIER requests if all slaves handle barriers, and that can, of course, change with time.... So we keep track of whether the whole array seems safe for barriers, and also whether each individual rdev handles barriers. We initially assume barriers are OK. When writing the superblock we try a barrier, and if that fails, we flag things for no-barriers. This will usually clear the flags fairly quickly. If writing the superblock finds that BIO_RW_BARRIER is -ENOTSUPP, we need to resubmit, so introduce function "md_super_wait" which waits for requests to finish, and retries ENOTSUPP requests without the barrier flag. When writing the real raid1, write requests which were BIO_RW_BARRIER but which aren't supported need to be retried. So raid1d is enhanced to do this, and when any bio write completes (i.e. no retry needed) we remove it from the r1bio, so that devices needing retry are easy to find. We should hardly ever get -ENOTSUPP errors when writing data to the raid. It should only happen if: 1/ the device used to support BARRIER, but now doesn't. Few devices change like this, though raid1 can! or 2/ the array has no persistent superblock, so there was no opportunity to pre-test for barriers when writing the superblock. Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au> Signed-off-by: Neil Brown <neilb@suse.de> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
author: NeilBrown <neilb@suse.de> 2005-11-09 00:39:34 -0500
committer: Linus Torvalds <torvalds@g5.osdl.org> 2005-11-09 10:56:38 -0500
commit: a9701a30470856408d08657eb1bd7ae29a146190 (patch)
tree: eb6ea8c82fdc1b50bf56abadeee63a935034cf27 /drivers/md/raid1.c
parent: bd926c63b7a6843d3ce2728396c0891e54fce5c4 (diff)
1 files changed, 92 insertions, 42 deletions
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index fb6b866c28f5..1cbf51fbd43f 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -301,7 +301,7 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
 {
        int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
        r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
-        int mirror, behind;
+        int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
        conf_t *conf = mddev_to_conf(r1_bio->mddev);
        if (bio->bi_size)
@@ -311,47 +311,54 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
                if (r1_bio->bios[mirror] == bio)
                        break;
-        /*
+        if (error == -ENOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) {
-         * this branch is our 'one mirror IO has finished' event handler:
+                set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags);
-         */
+                set_bit(R1BIO_BarrierRetry, &r1_bio->state);
-        if (!uptodate) {
+                r1_bio->mddev->barriers_work = 0;
-                md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
+        } else {
-                /* an I/O failed, we can't clear the bitmap */
-                set_bit(R1BIO_Degraded, &r1_bio->state);
-        } else
                /*
-                 * Set R1BIO_Uptodate in our master bio, so that
+                 * this branch is our 'one mirror IO has finished' event handler:
-                 * we will return a good error code for to the higher
-                 * levels even if IO on some other mirrored buffer fails.
-                 *
-                 * The 'master' represents the composite IO operation to
-                 * user-side. So if something waits for IO, then it will
-                 * wait for the 'master' bio.
                 */
-                set_bit(R1BIO_Uptodate, &r1_bio->state);
+                r1_bio->bios[mirror] = NULL;
+                bio_put(bio);
-        update_head_pos(mirror, r1_bio);
+                if (!uptodate) {
+                        md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
-        behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
+                        /* an I/O failed, we can't clear the bitmap */
-        if (behind) {
+                        set_bit(R1BIO_Degraded, &r1_bio->state);
-                if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
+                } else
-                        atomic_dec(&r1_bio->behind_remaining);
+                        /*
+                         * Set R1BIO_Uptodate in our master bio, so that
-                /* In behind mode, we ACK the master bio once the I/O has safely
+                         * we will return a good error code for to the higher
-                 * reached all non-writemostly disks. Setting the Returned bit
+                         * levels even if IO on some other mirrored buffer fails.
-                 * ensures that this gets done only once -- we don't ever want to
+                         *
-                 * return -EIO here, instead we'll wait */
+                         * The 'master' represents the composite IO operation to
+                         * user-side. So if something waits for IO, then it will
-                if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
+                         * wait for the 'master' bio.
-                    test_bit(R1BIO_Uptodate, &r1_bio->state)) {
+                         */
-                        /* Maybe we can return now */
+                        set_bit(R1BIO_Uptodate, &r1_bio->state);
-                        if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
-                                struct bio *mbio = r1_bio->master_bio;
+                update_head_pos(mirror, r1_bio);
-                                PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
-                                       (unsigned long long) mbio->bi_sector,
+                if (behind) {
-                                       (unsigned long long) mbio->bi_sector +
+                        if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
-                                       (mbio->bi_size >> 9) - 1);
+                                atomic_dec(&r1_bio->behind_remaining);
-                                bio_endio(mbio, mbio->bi_size, 0);
+                        /* In behind mode, we ACK the master bio once the I/O has safely
+                         * reached all non-writemostly disks. Setting the Returned bit
+                         * ensures that this gets done only once -- we don't ever want to
+                         * return -EIO here, instead we'll wait */
+                        if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
+                            test_bit(R1BIO_Uptodate, &r1_bio->state)) {
+                                /* Maybe we can return now */
+                                if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
+                                        struct bio *mbio = r1_bio->master_bio;
+                                        PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
+                                               (unsigned long long) mbio->bi_sector,
+                                               (unsigned long long) mbio->bi_sector +
+                                               (mbio->bi_size >> 9) - 1);
+                                        bio_endio(mbio, mbio->bi_size, 0);
+                                }
                        }
                }
        }
@@ -361,8 +368,16 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
         * already.
         */
        if (atomic_dec_and_test(&r1_bio->remaining)) {
+                if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
+                        reschedule_retry(r1_bio);
+                        /* Don't dec_pending yet, we want to hold
+                         * the reference over the retry
+                         */
+                        return 0;
+                }
                if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
                        /* free extra copy of the data pages */
+/* FIXME bio has been freed!!! */
                        int i = bio->bi_vcnt;
                        while (i--)
                                __free_page(bio->bi_io_vec[i].bv_page);
@@ -648,8 +663,9 @@ static int make_request(request_queue_t *q, struct bio * bio)
        struct bio_list bl;
        struct page **behind_pages = NULL;
        const int rw = bio_data_dir(bio);
+        int do_barriers;
-        if (unlikely(bio_barrier(bio))) {
+        if (unlikely(!mddev->barriers_work && bio_barrier(bio))) {
                bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
                return 0;
        }
@@ -759,6 +775,10 @@ static int make_request(request_queue_t *q, struct bio * bio)
        atomic_set(&r1_bio->remaining, 0);
        atomic_set(&r1_bio->behind_remaining, 0);
+        do_barriers = bio->bi_rw & BIO_RW_BARRIER;
+        if (do_barriers)
+                set_bit(R1BIO_Barrier, &r1_bio->state);
        bio_list_init(&bl);
        for (i = 0; i < disks; i++) {
                struct bio *mbio;
@@ -771,7 +791,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
                mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
                mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
                mbio->bi_end_io = raid1_end_write_request;
-                mbio->bi_rw = WRITE;
+                mbio->bi_rw = WRITE | do_barriers;
                mbio->bi_private = r1_bio;
                if (behind_pages) {
@@ -1153,6 +1173,36 @@ static void raid1d(mddev_t *mddev)
                if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
                        sync_request_write(mddev, r1_bio);
                        unplug = 1;
+                } else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
+                        /* some requests in the r1bio were BIO_RW_BARRIER
+                         * requests which failed with -ENOTSUPP.  Hohumm..
+                         * Better resubmit without the barrier.
+                         * We know which devices to resubmit for, because
+                         * all others have had their bios[] entry cleared.
+                         */
+                        int i;
+                        clear_bit(R1BIO_BarrierRetry, &r1_bio->state);
+                        clear_bit(R1BIO_Barrier, &r1_bio->state);
+                        for (i=0; i < conf->raid_disks; i++)
+                                if (r1_bio->bios[i]) {
+                                        struct bio_vec *bvec;
+                                        int j;
+                                        bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
+                                        /* copy pages from the failed bio, as
+                                         * this might be a write-behind device */
+                                        __bio_for_each_segment(bvec, bio, j, 0)
+                                                bvec->bv_page = bio_iovec_idx(r1_bio->bios[i], j)->bv_page;
+                                        bio_put(r1_bio->bios[i]);
+                                        bio->bi_sector = r1_bio->sector +
+                                                conf->mirrors[i].rdev->data_offset;
+                                        bio->bi_bdev = conf->mirrors[i].rdev->bdev;
+                                        bio->bi_end_io = raid1_end_write_request;
+                                        bio->bi_rw = WRITE;
+                                        bio->bi_private = r1_bio;
+                                        r1_bio->bios[i] = bio;
+                                        generic_make_request(bio);
+                                }
                } else {
                        int disk;
                        bio = r1_bio->bios[r1_bio->read_disk];
author	NeilBrown <neilb@suse.de>	2005-11-09 00:39:34 -0500
committer	Linus Torvalds <torvalds@g5.osdl.org>	2005-11-09 10:56:38 -0500
commit	a9701a30470856408d08657eb1bd7ae29a146190 (patch)
tree	eb6ea8c82fdc1b50bf56abadeee63a935034cf27 /drivers/md/raid1.c
parent	bd926c63b7a6843d3ce2728396c0891e54fce5c4 (diff)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index fb6b866c28f5..1cbf51fbd43f 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c
@@ -301,7 +301,7 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
301	{	301	{
302	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);	302	int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
303	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);	303	r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private);
304	int mirror, behind;	304	int mirror, behind = test_bit(R1BIO_BehindIO, &r1_bio->state);
305	conf_t *conf = mddev_to_conf(r1_bio->mddev);	305	conf_t *conf = mddev_to_conf(r1_bio->mddev);
306		306
307	if (bio->bi_size)	307	if (bio->bi_size)
@@ -311,47 +311,54 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
311	if (r1_bio->bios[mirror] == bio)	311	if (r1_bio->bios[mirror] == bio)
312	break;	312	break;
313		313
314	/*	314	if (error == -ENOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) {
315	* this branch is our 'one mirror IO has finished' event handler:	315	set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags);
316	*/	316	set_bit(R1BIO_BarrierRetry, &r1_bio->state);
317	if (!uptodate) {	317	r1_bio->mddev->barriers_work = 0;
318	md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);	318	} else {
319	/* an I/O failed, we can't clear the bitmap */
320	set_bit(R1BIO_Degraded, &r1_bio->state);
321	} else
322	/*	319	/*
323	* Set R1BIO_Uptodate in our master bio, so that	320	* this branch is our 'one mirror IO has finished' event handler:
324	* we will return a good error code for to the higher
325	* levels even if IO on some other mirrored buffer fails.
326	*
327	* The 'master' represents the composite IO operation to
328	* user-side. So if something waits for IO, then it will
329	* wait for the 'master' bio.
330	*/	321	*/
331	set_bit(R1BIO_Uptodate, &r1_bio->state);	322	r1_bio->bios[mirror] = NULL;
332		323	bio_put(bio);
333	update_head_pos(mirror, r1_bio);	324	if (!uptodate) {
334		325	md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
335	behind = test_bit(R1BIO_BehindIO, &r1_bio->state);	326	/* an I/O failed, we can't clear the bitmap */
336	if (behind) {	327	set_bit(R1BIO_Degraded, &r1_bio->state);
337	if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))	328	} else
338	atomic_dec(&r1_bio->behind_remaining);	329	/*
339		330	* Set R1BIO_Uptodate in our master bio, so that
340	/* In behind mode, we ACK the master bio once the I/O has safely	331	* we will return a good error code for to the higher
341	* reached all non-writemostly disks. Setting the Returned bit	332	* levels even if IO on some other mirrored buffer fails.
342	* ensures that this gets done only once -- we don't ever want to	333	*
343	* return -EIO here, instead we'll wait */	334	* The 'master' represents the composite IO operation to
344		335	* user-side. So if something waits for IO, then it will
345	if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&	336	* wait for the 'master' bio.
346	test_bit(R1BIO_Uptodate, &r1_bio->state)) {	337	*/
347	/* Maybe we can return now */	338	set_bit(R1BIO_Uptodate, &r1_bio->state);
348	if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {	339
349	struct bio *mbio = r1_bio->master_bio;	340	update_head_pos(mirror, r1_bio);
350	PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",	341
351	(unsigned long long) mbio->bi_sector,	342	if (behind) {
352	(unsigned long long) mbio->bi_sector +	343	if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
353	(mbio->bi_size >> 9) - 1);	344	atomic_dec(&r1_bio->behind_remaining);
354	bio_endio(mbio, mbio->bi_size, 0);	345
		346	/* In behind mode, we ACK the master bio once the I/O has safely
		347	* reached all non-writemostly disks. Setting the Returned bit
		348	* ensures that this gets done only once -- we don't ever want to
		349	* return -EIO here, instead we'll wait */
		350
		351	if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
		352	test_bit(R1BIO_Uptodate, &r1_bio->state)) {
		353	/* Maybe we can return now */
		354	if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
		355	struct bio *mbio = r1_bio->master_bio;
		356	PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
		357	(unsigned long long) mbio->bi_sector,
		358	(unsigned long long) mbio->bi_sector +
		359	(mbio->bi_size >> 9) - 1);
		360	bio_endio(mbio, mbio->bi_size, 0);
		361	}
355	}	362	}
356	}	363	}
357	}	364	}
@@ -361,8 +368,16 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int
361	* already.	368	* already.
362	*/	369	*/
363	if (atomic_dec_and_test(&r1_bio->remaining)) {	370	if (atomic_dec_and_test(&r1_bio->remaining)) {
		371	if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
		372	reschedule_retry(r1_bio);
		373	/* Don't dec_pending yet, we want to hold
		374	* the reference over the retry
		375	*/
		376	return 0;
		377	}
364	if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {	378	if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
365	/* free extra copy of the data pages */	379	/* free extra copy of the data pages */
		380	/* FIXME bio has been freed!!! */
366	int i = bio->bi_vcnt;	381	int i = bio->bi_vcnt;
367	while (i--)	382	while (i--)
368	__free_page(bio->bi_io_vec[i].bv_page);	383	__free_page(bio->bi_io_vec[i].bv_page);
@@ -648,8 +663,9 @@ static int make_request(request_queue_t *q, struct bio * bio)
648	struct bio_list bl;	663	struct bio_list bl;
649	struct page **behind_pages = NULL;	664	struct page **behind_pages = NULL;
650	const int rw = bio_data_dir(bio);	665	const int rw = bio_data_dir(bio);
		666	int do_barriers;
651		667
652	if (unlikely(bio_barrier(bio))) {	668	if (unlikely(!mddev->barriers_work && bio_barrier(bio))) {
653	bio_endio(bio, bio->bi_size, -EOPNOTSUPP);	669	bio_endio(bio, bio->bi_size, -EOPNOTSUPP);
654	return 0;	670	return 0;
655	}	671	}
@@ -759,6 +775,10 @@ static int make_request(request_queue_t *q, struct bio * bio)
759	atomic_set(&r1_bio->remaining, 0);	775	atomic_set(&r1_bio->remaining, 0);
760	atomic_set(&r1_bio->behind_remaining, 0);	776	atomic_set(&r1_bio->behind_remaining, 0);
761		777
		778	do_barriers = bio->bi_rw & BIO_RW_BARRIER;
		779	if (do_barriers)
		780	set_bit(R1BIO_Barrier, &r1_bio->state);
		781
762	bio_list_init(&bl);	782	bio_list_init(&bl);
763	for (i = 0; i < disks; i++) {	783	for (i = 0; i < disks; i++) {
764	struct bio *mbio;	784	struct bio *mbio;
@@ -771,7 +791,7 @@ static int make_request(request_queue_t *q, struct bio * bio)
771	mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;	791	mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
772	mbio->bi_bdev = conf->mirrors[i].rdev->bdev;	792	mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
773	mbio->bi_end_io = raid1_end_write_request;	793	mbio->bi_end_io = raid1_end_write_request;
774	mbio->bi_rw = WRITE;	794	mbio->bi_rw = WRITE | do_barriers;
775	mbio->bi_private = r1_bio;	795	mbio->bi_private = r1_bio;
776		796
777	if (behind_pages) {	797	if (behind_pages) {
@@ -1153,6 +1173,36 @@ static void raid1d(mddev_t *mddev)
1153	if (test_bit(R1BIO_IsSync, &r1_bio->state)) {	1173	if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
1154	sync_request_write(mddev, r1_bio);	1174	sync_request_write(mddev, r1_bio);
1155	unplug = 1;	1175	unplug = 1;
		1176	} else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
		1177	/* some requests in the r1bio were BIO_RW_BARRIER
		1178	* requests which failed with -ENOTSUPP. Hohumm..
		1179	* Better resubmit without the barrier.
		1180	* We know which devices to resubmit for, because
		1181	* all others have had their bios[] entry cleared.
		1182	*/
		1183	int i;
		1184	clear_bit(R1BIO_BarrierRetry, &r1_bio->state);
		1185	clear_bit(R1BIO_Barrier, &r1_bio->state);
		1186	for (i=0; i < conf->raid_disks; i++)
		1187	if (r1_bio->bios[i]) {
		1188	struct bio_vec *bvec;
		1189	int j;
		1190
		1191	bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
		1192	/* copy pages from the failed bio, as
		1193	* this might be a write-behind device */
		1194	__bio_for_each_segment(bvec, bio, j, 0)
		1195	bvec->bv_page = bio_iovec_idx(r1_bio->bios[i], j)->bv_page;
		1196	bio_put(r1_bio->bios[i]);
		1197	bio->bi_sector = r1_bio->sector +
		1198	conf->mirrors[i].rdev->data_offset;
		1199	bio->bi_bdev = conf->mirrors[i].rdev->bdev;
		1200	bio->bi_end_io = raid1_end_write_request;
		1201	bio->bi_rw = WRITE;
		1202	bio->bi_private = r1_bio;
		1203	r1_bio->bios[i] = bio;
		1204	generic_make_request(bio);
		1205	}
1156	} else {	1206	} else {
1157	int disk;	1207	int disk;
1158	bio = r1_bio->bios[r1_bio->read_disk];	1208	bio = r1_bio->bios[r1_bio->read_disk];