Diffstat (limited to 'drivers/md/raid1.c')
-rw-r--r--   drivers/md/raid1.c   176
1 file changed, 59 insertions, 117 deletions
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 0b830bbe1d8b..378a25894c57 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -319,83 +319,74 @@ static void raid1_end_write_request(struct bio *bio, int error)
 		if (r1_bio->bios[mirror] == bio)
 			break;
 
-	if (error == -EOPNOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) {
-		set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags);
-		set_bit(R1BIO_BarrierRetry, &r1_bio->state);
-		r1_bio->mddev->barriers_work = 0;
-		/* Don't rdev_dec_pending in this branch - keep it for the retry */
-	} else {
-		/*
-		 * this branch is our 'one mirror IO has finished' event handler:
-		 */
-		r1_bio->bios[mirror] = NULL;
-		to_put = bio;
-		if (!uptodate) {
-			md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
-			/* an I/O failed, we can't clear the bitmap */
-			set_bit(R1BIO_Degraded, &r1_bio->state);
-		} else
-			/*
-			 * Set R1BIO_Uptodate in our master bio, so that
-			 * we will return a good error code for to the higher
-			 * levels even if IO on some other mirrored buffer fails.
-			 *
-			 * The 'master' represents the composite IO operation to
-			 * user-side. So if something waits for IO, then it will
-			 * wait for the 'master' bio.
-			 */
-			set_bit(R1BIO_Uptodate, &r1_bio->state);
-
-		update_head_pos(mirror, r1_bio);
-
-		if (behind) {
-			if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
-				atomic_dec(&r1_bio->behind_remaining);
-
-			/* In behind mode, we ACK the master bio once the I/O has safely
-			 * reached all non-writemostly disks. Setting the Returned bit
-			 * ensures that this gets done only once -- we don't ever want to
-			 * return -EIO here, instead we'll wait */
-
-			if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
-			    test_bit(R1BIO_Uptodate, &r1_bio->state)) {
-				/* Maybe we can return now */
-				if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
-					struct bio *mbio = r1_bio->master_bio;
-					PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
-					       (unsigned long long) mbio->bi_sector,
-					       (unsigned long long) mbio->bi_sector +
-					       (mbio->bi_size >> 9) - 1);
-					bio_endio(mbio, 0);
-				}
-			}
-		}
-		rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
-	}
+	/*
+	 * 'one mirror IO has finished' event handler:
+	 */
+	r1_bio->bios[mirror] = NULL;
+	to_put = bio;
+	if (!uptodate) {
+		md_error(r1_bio->mddev, conf->mirrors[mirror].rdev);
+		/* an I/O failed, we can't clear the bitmap */
+		set_bit(R1BIO_Degraded, &r1_bio->state);
+	} else
+		/*
+		 * Set R1BIO_Uptodate in our master bio, so that we
+		 * will return a good error code for to the higher
+		 * levels even if IO on some other mirrored buffer
+		 * fails.
+		 *
+		 * The 'master' represents the composite IO operation
+		 * to user-side. So if something waits for IO, then it
+		 * will wait for the 'master' bio.
+		 */
+		set_bit(R1BIO_Uptodate, &r1_bio->state);
+
+	update_head_pos(mirror, r1_bio);
+
+	if (behind) {
+		if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags))
+			atomic_dec(&r1_bio->behind_remaining);
+
+		/*
+		 * In behind mode, we ACK the master bio once the I/O
+		 * has safely reached all non-writemostly
+		 * disks. Setting the Returned bit ensures that this
+		 * gets done only once -- we don't ever want to return
+		 * -EIO here, instead we'll wait
+		 */
+		if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) &&
+		    test_bit(R1BIO_Uptodate, &r1_bio->state)) {
+			/* Maybe we can return now */
+			if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) {
+				struct bio *mbio = r1_bio->master_bio;
+				PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n",
+				       (unsigned long long) mbio->bi_sector,
+				       (unsigned long long) mbio->bi_sector +
+				       (mbio->bi_size >> 9) - 1);
+				bio_endio(mbio, 0);
+			}
+		}
+	}
+	rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
+
 	/*
-	 *
 	 * Let's see if all mirrored write operations have finished
 	 * already.
 	 */
 	if (atomic_dec_and_test(&r1_bio->remaining)) {
-		if (test_bit(R1BIO_BarrierRetry, &r1_bio->state))
-			reschedule_retry(r1_bio);
-		else {
-			/* it really is the end of this request */
-			if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
-				/* free extra copy of the data pages */
-				int i = bio->bi_vcnt;
-				while (i--)
-					safe_put_page(bio->bi_io_vec[i].bv_page);
-			}
-			/* clear the bitmap if all writes complete successfully */
-			bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
-					r1_bio->sectors,
-					!test_bit(R1BIO_Degraded, &r1_bio->state),
-					behind);
-			md_write_end(r1_bio->mddev);
-			raid_end_bio_io(r1_bio);
+		if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
+			/* free extra copy of the data pages */
+			int i = bio->bi_vcnt;
+			while (i--)
+				safe_put_page(bio->bi_io_vec[i].bv_page);
 		}
+		/* clear the bitmap if all writes complete successfully */
+		bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
+				r1_bio->sectors,
+				!test_bit(R1BIO_Degraded, &r1_bio->state),
+				behind);
+		md_write_end(r1_bio->mddev);
+		raid_end_bio_io(r1_bio);
 	}
 
 	if (to_put)
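
The handler above relies on a straightforward reference count: every mirror write holds one count on r1_bio->remaining, and only the final atomic_dec_and_test() ends the master bio, so completion fires exactly once no matter which mirror finishes last. Below is a minimal userspace sketch of that accounting, with C11 atomics standing in for the kernel's atomic_t and bitops; the struct and function names here are illustrative only, not kernel code.

/* Hypothetical userspace analogue of the r1_bio completion accounting;
 * build with: cc -std=c11 sketch.c */
#include <stdatomic.h>
#include <stdio.h>

struct r1bio_sketch {
	atomic_int remaining;	/* mirror writes still in flight */
	atomic_int degraded;	/* did any mirror write fail? */
};

/* Called once per mirror-write completion
 * (kernel counterpart: raid1_end_write_request). */
static void end_write(struct r1bio_sketch *r1, int uptodate)
{
	if (!uptodate)
		atomic_store(&r1->degraded, 1);	/* can't clear the bitmap */

	/* Mirrors atomic_dec_and_test(): only the final completion sees
	 * the counter drop from 1 to 0, so the master bio ends once. */
	if (atomic_fetch_sub(&r1->remaining, 1) == 1)
		printf("master bio done (%s)\n",
		       atomic_load(&r1->degraded) ? "degraded" : "clean");
}

int main(void)
{
	struct r1bio_sketch r1 = { .remaining = 3 };

	end_write(&r1, 1);
	end_write(&r1, 0);	/* one mirror failed */
	end_write(&r1, 1);	/* last drop ends the master bio */
	return 0;
}
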
@@ -788,16 +779,13 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	struct page **behind_pages = NULL;
 	const int rw = bio_data_dir(bio);
 	const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
-	unsigned long do_barriers;
+	const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
 	mdk_rdev_t *blocked_rdev;
 
 	/*
 	 * Register the new request and wait if the reconstruction
 	 * thread has put up a bar for new requests.
 	 * Continue immediately if no resync is active currently.
-	 * We test barriers_work *after* md_write_start as md_write_start
-	 * may cause the first superblock write, and that will check out
-	 * if barriers work.
 	 */
 
 	md_write_start(mddev, bio); /* wait on superblock update early */
@@ -821,13 +809,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		}
 		finish_wait(&conf->wait_barrier, &w);
 	}
-	if (unlikely(!mddev->barriers_work &&
-		     (bio->bi_rw & REQ_HARDBARRIER))) {
-		if (rw == WRITE)
-			md_write_end(mddev);
-		bio_endio(bio, -EOPNOTSUPP);
-		return 0;
-	}
 
 	wait_barrier(conf);
 
@@ -959,10 +940,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	atomic_set(&r1_bio->remaining, 0);
 	atomic_set(&r1_bio->behind_remaining, 0);
 
-	do_barriers = bio->bi_rw & REQ_HARDBARRIER;
-	if (do_barriers)
-		set_bit(R1BIO_Barrier, &r1_bio->state);
-
 	bio_list_init(&bl);
 	for (i = 0; i < disks; i++) {
 		struct bio *mbio;
@@ -975,7 +952,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
 		mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
 		mbio->bi_end_io = raid1_end_write_request;
-		mbio->bi_rw = WRITE | do_barriers | do_sync;
+		mbio->bi_rw = WRITE | do_flush_fua | do_sync;
 		mbio->bi_private = r1_bio;
 
 		if (behind_pages) {
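
With REQ_HARDBARRIER gone, the write path above reduces to masking the caller's flush/FUA bits once and replaying them on every mirror bio. A self-contained sketch of that passthrough follows; the flag values are mock stand-ins chosen for illustration, not the kernel's REQ_* definitions.

/* Standalone sketch of the flush/FUA flag passthrough.
 * Flag values are mock constants, not the block layer's. */
#include <stdio.h>

#define WRITE		(1u << 0)
#define REQ_SYNC	(1u << 1)	/* mock: synchronous request */
#define REQ_FLUSH	(1u << 2)	/* mock: preflush cache */
#define REQ_FUA		(1u << 3)	/* mock: force unit access */

static unsigned int mirror_rw(unsigned int master_rw)
{
	/* Replaces the old REQ_HARDBARRIER handling: carry the
	 * caller's flush/FUA and sync bits through unchanged. */
	const unsigned int do_sync = master_rw & REQ_SYNC;
	const unsigned int do_flush_fua = master_rw & (REQ_FLUSH | REQ_FUA);

	return WRITE | do_flush_fua | do_sync;
}

int main(void)
{
	printf("plain write -> %#x\n", mirror_rw(WRITE));
	printf("FUA write   -> %#x\n", mirror_rw(WRITE | REQ_FUA));
	printf("flush+sync  -> %#x\n", mirror_rw(WRITE | REQ_FLUSH | REQ_SYNC));
	return 0;
}
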
@@ -1634,41 +1611,6 @@ static void raid1d(mddev_t *mddev)
 			if (test_bit(R1BIO_IsSync, &r1_bio->state)) {
 				sync_request_write(mddev, r1_bio);
 				unplug = 1;
-			} else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) {
-				/* some requests in the r1bio were REQ_HARDBARRIER
-				 * requests which failed with -EOPNOTSUPP.  Hohumm..
-				 * Better resubmit without the barrier.
-				 * We know which devices to resubmit for, because
-				 * all others have had their bios[] entry cleared.
-				 * We already have a nr_pending reference on these rdevs.
-				 */
-				int i;
-				const unsigned long do_sync = (r1_bio->master_bio->bi_rw & REQ_SYNC);
-				clear_bit(R1BIO_BarrierRetry, &r1_bio->state);
-				clear_bit(R1BIO_Barrier, &r1_bio->state);
-				for (i=0; i < conf->raid_disks; i++)
-					if (r1_bio->bios[i])
-						atomic_inc(&r1_bio->remaining);
-				for (i=0; i < conf->raid_disks; i++)
-					if (r1_bio->bios[i]) {
-						struct bio_vec *bvec;
-						int j;
-
-						bio = bio_clone(r1_bio->master_bio, GFP_NOIO);
-						/* copy pages from the failed bio, as
-						 * this might be a write-behind device */
-						__bio_for_each_segment(bvec, bio, j, 0)
-							bvec->bv_page = bio_iovec_idx(r1_bio->bios[i], j)->bv_page;
-						bio_put(r1_bio->bios[i]);
-						bio->bi_sector = r1_bio->sector +
-							conf->mirrors[i].rdev->data_offset;
-						bio->bi_bdev = conf->mirrors[i].rdev->bdev;
-						bio->bi_end_io = raid1_end_write_request;
-						bio->bi_rw = WRITE | do_sync;
-						bio->bi_private = r1_bio;
-						r1_bio->bios[i] = bio;
-						generic_make_request(bio);
-					}
 			} else {
 				int disk;
 
