md/raid1: avoid writing to known-bad blocks on known-bad drives.

If we have seen any write error on a drive, then don't write to any known-bad blocks on that drive. If necessary, we divide the write request up into pieces just like we do for reads, so each piece is either all written or all not written to any given drive.

Signed-off-by: NeilBrown <neilb@suse.de>
Reviewed-by: Namhyung Kim <namhyung@gmail.com>
author: NeilBrown <neilb@suse.de> 2011-07-27 21:31:48 -0400
committer: NeilBrown <neilb@suse.de> 2011-07-27 21:31:48 -0400
commit: 1f68f0c4b677ccd6935ff61e4e6888787505f8dc (patch)
tree: aaff73efbc3fb7b6092eb2106e142e1684c4b554 /drivers
parent: 0b7d83865cb7a60b1768212c1e60b8fd7c280506 (diff)
1 files changed, 115 insertions, 38 deletions
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 4d40d9d54a20..3214606204d2 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -764,7 +764,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
        mirror_info_t *mirror;
        r1bio_t *r1_bio;
        struct bio *read_bio;
-        int i, targets = 0, disks;
+        int i, disks;
        struct bitmap *bitmap;
        unsigned long flags;
        const int rw = bio_data_dir(bio);
@@ -772,6 +772,9 @@ static int make_request(mddev_t *mddev, struct bio * bio)
        const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
        mdk_rdev_t *blocked_rdev;
        int plugged;
+        int first_clone;
+        int sectors_handled;
+        int max_sectors;
        /*
         * Register the new request and wait if the reconstruction
@@ -832,7 +835,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
                /*
                 * read balancing logic:
                 */
-                int max_sectors;
                int rdisk;
 read_again:
@@ -872,7 +874,6 @@ read_again:
                        /* could not read all from this device, so we will
                         * need another r1_bio.
                         */
-                        int sectors_handled;
                        sectors_handled = (r1_bio->sector + max_sectors
                                           - bio->bi_sector);
@@ -906,9 +907,15 @@ read_again:
        /*
         * WRITE:
         */
-        /* first select target devices under spinlock and
+        /* first select target devices under rcu_lock and
         * inc refcount on their rdev.  Record them by setting
         * bios[x] to bio
+         * If there are known/acknowledged bad blocks on any device on
+         * which we have seen a write error, we want to avoid writing those
+         * blocks.
+         * This potentially requires several writes to write around
+         * the bad blocks.  Each set of writes gets its own r1bio
+         * with a set of bios attached.
         */
        plugged = mddev_check_plugged(mddev);
@@ -916,6 +923,7 @@ read_again:
 retry_write:
        blocked_rdev = NULL;
        rcu_read_lock();
+        max_sectors = r1_bio->sectors;
        for (i = 0;  i < disks; i++) {
                mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
                if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
@@ -923,17 +931,56 @@ read_again:
                        blocked_rdev = rdev;
                        break;
                }
-                if (rdev && !test_bit(Faulty, &rdev->flags)) {
-                        atomic_inc(&rdev->nr_pending);
-                        if (test_bit(Faulty, &rdev->flags)) {
+                r1_bio->bios[i] = NULL;
+                if (!rdev || test_bit(Faulty, &rdev->flags)) {
+                        set_bit(R1BIO_Degraded, &r1_bio->state);
+                        continue;
+                }
+                atomic_inc(&rdev->nr_pending);
+                if (test_bit(WriteErrorSeen, &rdev->flags)) {
+                        sector_t first_bad;
+                        int bad_sectors;
+                        int is_bad;
+                        is_bad = is_badblock(rdev, r1_bio->sector,
+                                             max_sectors,
+                                             &first_bad, &bad_sectors);
+                        if (is_bad < 0) {
+                                /* mustn't write here until the bad block is
+                                 * acknowledged*/
+                                set_bit(BlockedBadBlocks, &rdev->flags);
+                                blocked_rdev = rdev;
+                                break;
+                        }
+                        if (is_bad && first_bad <= r1_bio->sector) {
+                                /* Cannot write here at all */
+                                bad_sectors -= (r1_bio->sector - first_bad);
+                                if (bad_sectors < max_sectors)
+                                        /* mustn't write more than bad_sectors
+                                         * to other devices yet
+                                         */
+                                        max_sectors = bad_sectors;
                                rdev_dec_pending(rdev, mddev);
-                                r1_bio->bios[i] = NULL;
-                        } else {
-                                r1_bio->bios[i] = bio;
-                                targets++;
+                                /* We don't set R1BIO_Degraded as that
+                                 * only applies if the disk is
+                                 * missing, so it might be re-added,
+                                 * and we want to know to recover this
+                                 * chunk.
+                                 * In this case the device is here,
+                                 * and the fact that this chunk is not
+                                 * in-sync is recorded in the bad
+                                 * block log
+                                 */
+                                continue;
                        }
-                } else
-                        r1_bio->bios[i] = NULL;
+                        if (is_bad) {
+                                int good_sectors = first_bad - r1_bio->sector;
+                                if (good_sectors < max_sectors)
+                                        max_sectors = good_sectors;
+                        }
+                }
+                r1_bio->bios[i] = bio;
        }
        rcu_read_unlock();
@@ -944,48 +991,56 @@ read_again:
                for (j = 0; j < i; j++)
                        if (r1_bio->bios[j])
                                rdev_dec_pending(conf->mirrors[j].rdev, mddev);
+                r1_bio->state = 0;
                allow_barrier(conf);
                md_wait_for_blocked_rdev(blocked_rdev, mddev);
                wait_barrier(conf);
                goto retry_write;
        }
-        if (targets < conf->raid_disks) {
-                /* array is degraded, we will not clear the bitmap
-                 * on I/O completion (see raid1_end_write_request) */
-                set_bit(R1BIO_Degraded, &r1_bio->state);
+        if (max_sectors < r1_bio->sectors) {
+                /* We are splitting this write into multiple parts, so
+                 * we need to prepare for allocating another r1_bio.
+                 */
+                r1_bio->sectors = max_sectors;
+                spin_lock_irq(&conf->device_lock);
+                if (bio->bi_phys_segments == 0)
+                        bio->bi_phys_segments = 2;
+                else
+                        bio->bi_phys_segments++;
+                spin_unlock_irq(&conf->device_lock);
        }
+        sectors_handled = r1_bio->sector + max_sectors - bio->bi_sector;
-        /* do behind I/O ?
-         * Not if there are too many, or cannot allocate memory,
-         * or a reader on WriteMostly is waiting for behind writes 
-         * to flush */
-        if (bitmap &&
-            (atomic_read(&bitmap->behind_writes)
-             < mddev->bitmap_info.max_write_behind) &&
-            !waitqueue_active(&bitmap->behind_wait))
-                alloc_behind_pages(bio, r1_bio);
        atomic_set(&r1_bio->remaining, 1);
        atomic_set(&r1_bio->behind_remaining, 0);
-        bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors,
-                                test_bit(R1BIO_BehindIO, &r1_bio->state));
+        first_clone = 1;
        for (i = 0; i < disks; i++) {
                struct bio *mbio;
                if (!r1_bio->bios[i])
                        continue;
                mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
-                r1_bio->bios[i] = mbio;
-                mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;
-                mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
-                mbio->bi_end_io = raid1_end_write_request;
-                mbio->bi_rw = WRITE | do_flush_fua | do_sync;
-                mbio->bi_private = r1_bio;
+                md_trim_bio(mbio, r1_bio->sector - bio->bi_sector, max_sectors);
+                if (first_clone) {
+                        /* do behind I/O ?
+                         * Not if there are too many, or cannot
+                         * allocate memory, or a reader on WriteMostly
+                         * is waiting for behind writes to flush */
+                        if (bitmap &&
+                            (atomic_read(&bitmap->behind_writes)
+                             < mddev->bitmap_info.max_write_behind) &&
+                            !waitqueue_active(&bitmap->behind_wait))
+                                alloc_behind_pages(mbio, r1_bio);
+                        bitmap_startwrite(bitmap, r1_bio->sector,
+                                          r1_bio->sectors,
+                                          test_bit(R1BIO_BehindIO,
+                                                   &r1_bio->state));
+                        first_clone = 0;
+                }
                if (r1_bio->behind_pages) {
                        struct bio_vec *bvec;
                        int j;
@@ -1003,6 +1058,15 @@ read_again:
                                atomic_inc(&r1_bio->behind_remaining);
                }
+                r1_bio->bios[i] = mbio;
+                mbio->bi_sector = (r1_bio->sector +
+                                   conf->mirrors[i].rdev->data_offset);
+                mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
+                mbio->bi_end_io = raid1_end_write_request;
+                mbio->bi_rw = WRITE | do_flush_fua | do_sync;
+                mbio->bi_private = r1_bio;
                atomic_inc(&r1_bio->remaining);
                spin_lock_irqsave(&conf->device_lock, flags);
                bio_list_add(&conf->pending_bio_list, mbio);
@@ -1013,6 +1077,19 @@ read_again:
        /* In case raid1d snuck in to freeze_array */
        wake_up(&conf->wait_barrier);
+        if (sectors_handled < (bio->bi_size >> 9)) {
+                /* We need another r1_bio.  It has already been counted
+                 * in bio->bi_phys_segments
+                 */
+                r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
+                r1_bio->master_bio = bio;
+                r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
+                r1_bio->state = 0;
+                r1_bio->mddev = mddev;
+                r1_bio->sector = bio->bi_sector + sectors_handled;
+                goto retry_write;
+        }
        if (do_sync || !bitmap || !plugged)
                md_wakeup_thread(mddev->thread);
author	NeilBrown <neilb@suse.de>	2011-07-27 21:31:48 -0400
committer	NeilBrown <neilb@suse.de>	2011-07-27 21:31:48 -0400
commit	1f68f0c4b677ccd6935ff61e4e6888787505f8dc (patch)
tree	aaff73efbc3fb7b6092eb2106e142e1684c4b554 /drivers
parent	0b7d83865cb7a60b1768212c1e60b8fd7c280506 (diff)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 4d40d9d54a20..3214606204d2 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -764,7 +764,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
764	mirror_info_t *mirror;	764	mirror_info_t *mirror;
765	r1bio_t *r1_bio;	765	r1bio_t *r1_bio;
766	struct bio *read_bio;	766	struct bio *read_bio;
767	int i, targets = 0, disks;	767	int i, disks;
768	struct bitmap *bitmap;	768	struct bitmap *bitmap;
769	unsigned long flags;	769	unsigned long flags;
770	const int rw = bio_data_dir(bio);	770	const int rw = bio_data_dir(bio);
@@ -772,6 +772,9 @@ static int make_request(mddev_t *mddev, struct bio * bio)
772	const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));	772	const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
773	mdk_rdev_t *blocked_rdev;	773	mdk_rdev_t *blocked_rdev;
774	int plugged;	774	int plugged;
		775	int first_clone;
		776	int sectors_handled;
		777	int max_sectors;
775		778
776	/*	779	/*
777	* Register the new request and wait if the reconstruction	780	* Register the new request and wait if the reconstruction
@@ -832,7 +835,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
832	/*	835	/*
833	* read balancing logic:	836	* read balancing logic:
834	*/	837	*/
835	int max_sectors;
836	int rdisk;	838	int rdisk;
837		839
838	read_again:	840	read_again:
@@ -872,7 +874,6 @@ read_again:
872	/* could not read all from this device, so we will	874	/* could not read all from this device, so we will
873	* need another r1_bio.	875	* need another r1_bio.
874	*/	876	*/
875	int sectors_handled;
876		877
877	sectors_handled = (r1_bio->sector + max_sectors	878	sectors_handled = (r1_bio->sector + max_sectors
878	- bio->bi_sector);	879	- bio->bi_sector);
@@ -906,9 +907,15 @@ read_again:
906	/*	907	/*
907	* WRITE:	908	* WRITE:
908	*/	909	*/
909	/* first select target devices under spinlock and	910	/* first select target devices under rcu_lock and
910	* inc refcount on their rdev. Record them by setting	911	* inc refcount on their rdev. Record them by setting
911	* bios[x] to bio	912	* bios[x] to bio
		913	* If there are known/acknowledged bad blocks on any device on
		914	* which we have seen a write error, we want to avoid writing those
		915	* blocks.
		916	* This potentially requires several writes to write around
		917	* the bad blocks. Each set of writes gets its own r1bio
		918	* with a set of bios attached.
912	*/	919	*/
913	plugged = mddev_check_plugged(mddev);	920	plugged = mddev_check_plugged(mddev);
914		921
@@ -916,6 +923,7 @@ read_again:
916	retry_write:	923	retry_write:
917	blocked_rdev = NULL;	924	blocked_rdev = NULL;
918	rcu_read_lock();	925	rcu_read_lock();
		926	max_sectors = r1_bio->sectors;
919	for (i = 0; i < disks; i++) {	927	for (i = 0; i < disks; i++) {
920	mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);	928	mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
921	if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {	929	if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
@@ -923,17 +931,56 @@ read_again:
923	blocked_rdev = rdev;	931	blocked_rdev = rdev;
924	break;	932	break;
925	}	933	}
926	if (rdev && !test_bit(Faulty, &rdev->flags)) {	934	r1_bio->bios[i] = NULL;
927	atomic_inc(&rdev->nr_pending);	935	if (!rdev || test_bit(Faulty, &rdev->flags)) {
928	if (test_bit(Faulty, &rdev->flags)) {	936	set_bit(R1BIO_Degraded, &r1_bio->state);
		937	continue;
		938	}
		939
		940	atomic_inc(&rdev->nr_pending);
		941	if (test_bit(WriteErrorSeen, &rdev->flags)) {
		942	sector_t first_bad;
		943	int bad_sectors;
		944	int is_bad;
		945
		946	is_bad = is_badblock(rdev, r1_bio->sector,
		947	max_sectors,
		948	&first_bad, &bad_sectors);
		949	if (is_bad < 0) {
		950	/* mustn't write here until the bad block is
		951	* acknowledged*/
		952	set_bit(BlockedBadBlocks, &rdev->flags);
		953	blocked_rdev = rdev;
		954	break;
		955	}
		956	if (is_bad && first_bad <= r1_bio->sector) {
		957	/* Cannot write here at all */
		958	bad_sectors -= (r1_bio->sector - first_bad);
		959	if (bad_sectors < max_sectors)
		960	/* mustn't write more than bad_sectors
		961	* to other devices yet
		962	*/
		963	max_sectors = bad_sectors;
929	rdev_dec_pending(rdev, mddev);	964	rdev_dec_pending(rdev, mddev);
930	r1_bio->bios[i] = NULL;	965	/* We don't set R1BIO_Degraded as that
931	} else {	966	* only applies if the disk is
932	r1_bio->bios[i] = bio;	967	* missing, so it might be re-added,
933	targets++;	968	* and we want to know to recover this
		969	* chunk.
		970	* In this case the device is here,
		971	* and the fact that this chunk is not
		972	* in-sync is recorded in the bad
		973	* block log
		974	*/
		975	continue;
934	}	976	}
935	} else	977	if (is_bad) {
936	r1_bio->bios[i] = NULL;	978	int good_sectors = first_bad - r1_bio->sector;
		979	if (good_sectors < max_sectors)
		980	max_sectors = good_sectors;
		981	}
		982	}
		983	r1_bio->bios[i] = bio;
937	}	984	}
938	rcu_read_unlock();	985	rcu_read_unlock();
939		986
@@ -944,48 +991,56 @@ read_again:
944	for (j = 0; j < i; j++)	991	for (j = 0; j < i; j++)
945	if (r1_bio->bios[j])	992	if (r1_bio->bios[j])
946	rdev_dec_pending(conf->mirrors[j].rdev, mddev);	993	rdev_dec_pending(conf->mirrors[j].rdev, mddev);
947		994	r1_bio->state = 0;
948	allow_barrier(conf);	995	allow_barrier(conf);
949	md_wait_for_blocked_rdev(blocked_rdev, mddev);	996	md_wait_for_blocked_rdev(blocked_rdev, mddev);
950	wait_barrier(conf);	997	wait_barrier(conf);
951	goto retry_write;	998	goto retry_write;
952	}	999	}
953		1000
954	if (targets < conf->raid_disks) {	1001	if (max_sectors < r1_bio->sectors) {
955	/* array is degraded, we will not clear the bitmap	1002	/* We are splitting this write into multiple parts, so
956	* on I/O completion (see raid1_end_write_request) */	1003	* we need to prepare for allocating another r1_bio.
957	set_bit(R1BIO_Degraded, &r1_bio->state);	1004	*/
		1005	r1_bio->sectors = max_sectors;
		1006	spin_lock_irq(&conf->device_lock);
		1007	if (bio->bi_phys_segments == 0)
		1008	bio->bi_phys_segments = 2;
		1009	else
		1010	bio->bi_phys_segments++;
		1011	spin_unlock_irq(&conf->device_lock);
958	}	1012	}
959		1013	sectors_handled = r1_bio->sector + max_sectors - bio->bi_sector;
960	/* do behind I/O ?
961	* Not if there are too many, or cannot allocate memory,
962	* or a reader on WriteMostly is waiting for behind writes
963	* to flush */
964	if (bitmap &&
965	(atomic_read(&bitmap->behind_writes)
966	< mddev->bitmap_info.max_write_behind) &&
967	!waitqueue_active(&bitmap->behind_wait))
968	alloc_behind_pages(bio, r1_bio);
969		1014
970	atomic_set(&r1_bio->remaining, 1);	1015	atomic_set(&r1_bio->remaining, 1);
971	atomic_set(&r1_bio->behind_remaining, 0);	1016	atomic_set(&r1_bio->behind_remaining, 0);
972		1017
973	bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors,	1018	first_clone = 1;
974	test_bit(R1BIO_BehindIO, &r1_bio->state));
975	for (i = 0; i < disks; i++) {	1019	for (i = 0; i < disks; i++) {
976	struct bio *mbio;	1020	struct bio *mbio;
977	if (!r1_bio->bios[i])	1021	if (!r1_bio->bios[i])
978	continue;	1022	continue;
979		1023
980	mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);	1024	mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
981	r1_bio->bios[i] = mbio;	1025	md_trim_bio(mbio, r1_bio->sector - bio->bi_sector, max_sectors);
982		1026
983	mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset;	1027	if (first_clone) {
984	mbio->bi_bdev = conf->mirrors[i].rdev->bdev;	1028	/* do behind I/O ?
985	mbio->bi_end_io = raid1_end_write_request;	1029	* Not if there are too many, or cannot
986	mbio->bi_rw = WRITE | do_flush_fua | do_sync;	1030	* allocate memory, or a reader on WriteMostly
987	mbio->bi_private = r1_bio;	1031	* is waiting for behind writes to flush */
988		1032	if (bitmap &&
		1033	(atomic_read(&bitmap->behind_writes)
		1034	< mddev->bitmap_info.max_write_behind) &&
		1035	!waitqueue_active(&bitmap->behind_wait))
		1036	alloc_behind_pages(mbio, r1_bio);
		1037
		1038	bitmap_startwrite(bitmap, r1_bio->sector,
		1039	r1_bio->sectors,
		1040	test_bit(R1BIO_BehindIO,
		1041	&r1_bio->state));
		1042	first_clone = 0;
		1043	}
989	if (r1_bio->behind_pages) {	1044	if (r1_bio->behind_pages) {
990	struct bio_vec *bvec;	1045	struct bio_vec *bvec;
991	int j;	1046	int j;
@@ -1003,6 +1058,15 @@ read_again:
1003	atomic_inc(&r1_bio->behind_remaining);	1058	atomic_inc(&r1_bio->behind_remaining);
1004	}	1059	}
1005		1060
		1061	r1_bio->bios[i] = mbio;
		1062
		1063	mbio->bi_sector = (r1_bio->sector +
		1064	conf->mirrors[i].rdev->data_offset);
		1065	mbio->bi_bdev = conf->mirrors[i].rdev->bdev;
		1066	mbio->bi_end_io = raid1_end_write_request;
		1067	mbio->bi_rw = WRITE | do_flush_fua | do_sync;
		1068	mbio->bi_private = r1_bio;
		1069
1006	atomic_inc(&r1_bio->remaining);	1070	atomic_inc(&r1_bio->remaining);
1007	spin_lock_irqsave(&conf->device_lock, flags);	1071	spin_lock_irqsave(&conf->device_lock, flags);
1008	bio_list_add(&conf->pending_bio_list, mbio);	1072	bio_list_add(&conf->pending_bio_list, mbio);
@@ -1013,6 +1077,19 @@ read_again:
1013	/* In case raid1d snuck in to freeze_array */	1077	/* In case raid1d snuck in to freeze_array */
1014	wake_up(&conf->wait_barrier);	1078	wake_up(&conf->wait_barrier);
1015		1079
		1080	if (sectors_handled < (bio->bi_size >> 9)) {
		1081	/* We need another r1_bio. It has already been counted
		1082	* in bio->bi_phys_segments
		1083	*/
		1084	r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
		1085	r1_bio->master_bio = bio;
		1086	r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
		1087	r1_bio->state = 0;
		1088	r1_bio->mddev = mddev;
		1089	r1_bio->sector = bio->bi_sector + sectors_handled;
		1090	goto retry_write;
		1091	}
		1092
1016	if (do_sync || !bitmap || !plugged)	1093	if (do_sync || !bitmap || !plugged)
1017	md_wakeup_thread(mddev->thread);	1094	md_wakeup_thread(mddev->thread);
1018		1095