author     NeilBrown <neilb@suse.de>    2011-07-27 21:39:24 -0400
committer  NeilBrown <neilb@suse.de>    2011-07-27 21:39:24 -0400
commit     d4432c23be957ff061f7b23fd60e8506cb472a55 (patch)
tree       1e9e8cbc75721e62300bb2251658de870bf79d8a /drivers/md
parent     e875ecea266a543e643b19e44cf472f1412708f9 (diff)
md/raid10: avoid writing to known bad blocks on known bad drives.
Writing to known bad blocks on drives that have seen a write error is
asking for trouble, so try to avoid these blocks.

Signed-off-by: NeilBrown <neilb@suse.de>
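To make the rule concrete, below is a minimal user-space sketch of the per-device check the patch adds to make_request(): if a device has seen a write error and a known bad block overlaps the request, either block (unacknowledged bad block), skip the device, or shrink the number of sectors written in this pass. struct badrange, enum verdict and check_device() are illustrative stand-ins only; the kernel code uses is_badblock() and the WriteErrorSeen rdev flag.

/* badblock_clamp.c - user-space sketch of the per-device check this patch
 * adds to make_request().  The kernel version uses is_badblock() and the
 * WriteErrorSeen rdev flag; struct badrange and check_device() here are
 * illustrative stand-ins, not kernel API.
 */
#include <stdio.h>

typedef unsigned long long sector_t;

struct badrange {               /* one known bad-block range on a device */
	sector_t start;
	sector_t len;
	int acked;              /* acknowledged, i.e. safe to write around */
};

enum verdict { WRITE_OK, SKIP_DEVICE, MUST_BLOCK };

/* Decide whether [dev_sector, dev_sector + *max_sectors) may be written
 * to a device with this bad range, shrinking *max_sectors if the write
 * has to stop short of (or skip over) the bad blocks. */
static enum verdict check_device(const struct badrange *bb,
				 sector_t dev_sector, sector_t *max_sectors)
{
	sector_t bad_end = bb->start + bb->len;

	if (bad_end <= dev_sector || bb->start >= dev_sector + *max_sectors)
		return WRITE_OK;        /* no overlap with this request */

	if (!bb->acked)
		return MUST_BLOCK;      /* wait until the bad block is acknowledged */

	if (bb->start <= dev_sector) {
		/* First sector is already bad: skip this device entirely,
		 * but cap the pass so the other copies stop where the bad
		 * range ends and a later pass can resume here. */
		sector_t bad_sectors = bad_end - dev_sector;
		if (bad_sectors < *max_sectors)
			*max_sectors = bad_sectors;
		return SKIP_DEVICE;
	}

	/* Bad range starts later: only the leading good sectors fit. */
	if (bb->start - dev_sector < *max_sectors)
		*max_sectors = bb->start - dev_sector;
	return WRITE_OK;
}

int main(void)
{
	struct badrange bb = { .start = 100, .len = 8, .acked = 1 };
	sector_t max_sectors = 64;

	enum verdict v = check_device(&bb, 90, &max_sectors);
	printf("verdict %d, max_sectors now %llu\n", v, max_sectors);
	/* prints: verdict 0, max_sectors now 10 (stop before the bad block) */
	return 0;
}

Shrinking max_sectors for every copy keeps all the devices in step, so the remainder of the request can be retried as a further r10_bio.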
Diffstat (limited to 'drivers/md')
-rw-r--r--   drivers/md/raid10.c   105
1 file changed, 93 insertions(+), 12 deletions(-)
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index de6089926273..13077a3fd7d2 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -807,6 +807,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	unsigned long flags;
 	mdk_rdev_t *blocked_rdev;
 	int plugged;
+	int sectors_handled;
+	int max_sectors;
 
 	if (unlikely(bio->bi_rw & REQ_FLUSH)) {
 		md_flush_request(mddev, bio);
@@ -895,7 +897,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		/*
 		 * read balancing logic:
 		 */
-		int max_sectors;
 		int disk;
 		int slot;
 
@@ -925,8 +926,6 @@ read_again:
 			/* Could not read all from this device, so we will
 			 * need another r10_bio.
 			 */
-			int sectors_handled;
-
 			sectors_handled = (r10_bio->sectors + max_sectors
 					   - bio->bi_sector);
 			r10_bio->sectors = max_sectors;
@@ -963,13 +962,22 @@ read_again:
 	/* first select target devices under rcu_lock and
 	 * inc refcount on their rdev. Record them by setting
 	 * bios[x] to bio
+	 * If there are known/acknowledged bad blocks on any device
+	 * on which we have seen a write error, we want to avoid
+	 * writing to those blocks. This potentially requires several
+	 * writes to write around the bad blocks. Each set of writes
+	 * gets its own r10_bio with a set of bios attached. The number
+	 * of r10_bios is recorded in bio->bi_phys_segments just as with
+	 * the read case.
 	 */
 	plugged = mddev_check_plugged(mddev);
 
 	raid10_find_phys(conf, r10_bio);
- retry_write:
+retry_write:
 	blocked_rdev = NULL;
 	rcu_read_lock();
+	max_sectors = r10_bio->sectors;
+
 	for (i = 0; i < conf->copies; i++) {
 		int d = r10_bio->devs[i].devnum;
 		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev);
@@ -978,13 +986,55 @@ read_again:
 			blocked_rdev = rdev;
 			break;
 		}
-		if (rdev && !test_bit(Faulty, &rdev->flags)) {
-			atomic_inc(&rdev->nr_pending);
-			r10_bio->devs[i].bio = bio;
-		} else {
-			r10_bio->devs[i].bio = NULL;
+		r10_bio->devs[i].bio = NULL;
+		if (!rdev || test_bit(Faulty, &rdev->flags)) {
 			set_bit(R10BIO_Degraded, &r10_bio->state);
+			continue;
+		}
+		if (test_bit(WriteErrorSeen, &rdev->flags)) {
+			sector_t first_bad;
+			sector_t dev_sector = r10_bio->devs[i].addr;
+			int bad_sectors;
+			int is_bad;
+
+			is_bad = is_badblock(rdev, dev_sector,
+					     max_sectors,
+					     &first_bad, &bad_sectors);
+			if (is_bad < 0) {
+				/* Mustn't write here until the bad block
+				 * is acknowledged
+				 */
+				atomic_inc(&rdev->nr_pending);
+				set_bit(BlockedBadBlocks, &rdev->flags);
+				blocked_rdev = rdev;
+				break;
+			}
+			if (is_bad && first_bad <= dev_sector) {
+				/* Cannot write here at all */
+				bad_sectors -= (dev_sector - first_bad);
+				if (bad_sectors < max_sectors)
+					/* Mustn't write more than bad_sectors
+					 * to other devices yet
+					 */
+					max_sectors = bad_sectors;
+				/* We don't set R10BIO_Degraded as that
+				 * only applies if the disk is missing,
+				 * so it might be re-added, and we want to
+				 * know to recover this chunk.
+				 * In this case the device is here, and the
+				 * fact that this chunk is not in-sync is
+				 * recorded in the bad block log.
+				 */
+				continue;
+			}
+			if (is_bad) {
+				int good_sectors = first_bad - dev_sector;
+				if (good_sectors < max_sectors)
+					max_sectors = good_sectors;
+			}
+		}
+		r10_bio->devs[i].bio = bio;
+		atomic_inc(&rdev->nr_pending);
 	}
 	rcu_read_unlock();
 
@@ -1004,8 +1054,22 @@ read_again:
 		goto retry_write;
 	}
 
+	if (max_sectors < r10_bio->sectors) {
+		/* We are splitting this into multiple parts, so
+		 * we need to prepare for allocating another r10_bio.
+		 */
+		r10_bio->sectors = max_sectors;
+		spin_lock_irq(&conf->device_lock);
+		if (bio->bi_phys_segments == 0)
+			bio->bi_phys_segments = 2;
+		else
+			bio->bi_phys_segments++;
+		spin_unlock_irq(&conf->device_lock);
+	}
+	sectors_handled = r10_bio->sector + max_sectors - bio->bi_sector;
+
 	atomic_set(&r10_bio->remaining, 1);
-	bitmap_startwrite(mddev->bitmap, bio->bi_sector, r10_bio->sectors, 0);
+	bitmap_startwrite(mddev->bitmap, r10_bio->sector, r10_bio->sectors, 0);
 
 	for (i = 0; i < conf->copies; i++) {
 		struct bio *mbio;
@@ -1014,10 +1078,12 @@ read_again:
 			continue;
 
 		mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+		md_trim_bio(mbio, r10_bio->sector - bio->bi_sector,
+			    max_sectors);
 		r10_bio->devs[i].bio = mbio;
 
-		mbio->bi_sector = r10_bio->devs[i].addr+
-			conf->mirrors[d].rdev->data_offset;
+		mbio->bi_sector = (r10_bio->devs[i].addr+
+				   conf->mirrors[d].rdev->data_offset);
 		mbio->bi_bdev = conf->mirrors[d].rdev->bdev;
 		mbio->bi_end_io = raid10_end_write_request;
 		mbio->bi_rw = WRITE | do_sync | do_fua;
@@ -1042,6 +1108,21 @@ read_again:
 	/* In case raid10d snuck in to freeze_array */
 	wake_up(&conf->wait_barrier);
 
+	if (sectors_handled < (bio->bi_size >> 9)) {
+		/* We need another r10_bio. It has already been counted
+		 * in bio->bi_phys_segments.
+		 */
+		r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);
+
+		r10_bio->master_bio = bio;
+		r10_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
+
+		r10_bio->mddev = mddev;
+		r10_bio->sector = bio->bi_sector + sectors_handled;
+		r10_bio->state = 0;
+		goto retry_write;
+	}
+
 	if (do_sync || !mddev->bitmap || !plugged)
 		md_wakeup_thread(mddev->thread);
 	return 0;
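
For orientation, here is a toy user-space model of the write-splitting loop added above: each pass writes at most max_sectors, bumps the bio->bi_phys_segments-style counter whenever another pass will follow, and resumes at bio->bi_sector + sectors_handled. next_chunk() and the sector counts are invented for illustration; only the bookkeeping pattern mirrors the patch.

/* split_loop.c - toy model of the write-splitting loop added above.
 * next_chunk() is an invented stand-in for the per-pass bad-block
 * clamping; only the sectors_handled / bi_phys_segments bookkeeping
 * mirrors the patch.
 */
#include <stdio.h>

static unsigned long next_chunk(unsigned long offset)
{
	/* pretend a bad block limits every pass after the first to 16 sectors */
	return offset == 0 ? 48 : 16;
}

int main(void)
{
	unsigned long total = 96;       /* sectors in the master bio */
	unsigned long handled = 0;      /* sectors_handled */
	int phys_segments = 0;          /* bio->bi_phys_segments analogue */

	while (handled < total) {
		unsigned long max_sectors = next_chunk(handled);

		if (max_sectors > total - handled)
			max_sectors = total - handled;
		if (handled + max_sectors < total) {
			/* splitting: count this r10_bio plus the one to come */
			if (phys_segments == 0)
				phys_segments = 2;
			else
				phys_segments++;
		}
		printf("r10_bio at sector %lu, %lu sectors\n", handled, max_sectors);
		handled += max_sectors;
	}
	printf("r10_bios counted in bi_phys_segments: %d\n", phys_segments);
	return 0;
}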