From 0232605d987d8230b254aa139805bbb56a7ca30c Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Tue, 3 Jul 2012 15:56:52 +1000
Subject: md: make 'name' arg to md_register_thread non-optional.

Having the 'name' arg optional and defaulting to the current personality
name is not necessary and leads to errors, as when changing the level of
an array we can end up using the name of the old level instead of the new
one.

So make it non-optional and always explicitly pass the name of the level
that the array will be.

Reported-by: majianpeng
Signed-off-by: NeilBrown
---
 drivers/md/raid1.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index a9c7981ddd24..39b2a8aa3b23 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2621,7 +2621,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
                 goto abort;
         }
         err = -ENOMEM;
-        conf->thread = md_register_thread(raid1d, mddev, NULL);
+        conf->thread = md_register_thread(raid1d, mddev, "raid1");
         if (!conf->thread) {
                 printk(KERN_ERR
                        "md/raid1:%s: couldn't allocate thread\n",
--
cgit v1.2.2

From 32644afd8975d19174bcb9ba34687c32dd810a09 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Tue, 3 Jul 2012 15:58:42 +1000
Subject: md/raid1: fix bug in read_balance introduced by hot-replace

When we added hot_replace we doubled the number of devices that could be
in a RAID1 array.  So we doubled how far read_balance would search.
Unfortunately we didn't double the point at which it looped back to the
beginning - so it effectively loops over all non-replacement disks twice.

This doesn't cause bad behaviour, but it is pointless and means we never
read from replacement devices.

Signed-off-by: NeilBrown
---
 drivers/md/raid1.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 39b2a8aa3b23..34b4665cb0b6 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -517,8 +517,8 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
                 int bad_sectors;
 
                 int disk = start_disk + i;
-                if (disk >= conf->raid_disks)
-                        disk -= conf->raid_disks;
+                if (disk >= conf->raid_disks * 2)
+                        disk -= conf->raid_disks * 2;
 
                 rdev = rcu_dereference(conf->mirrors[disk].rdev);
                 if (r1_bio->bios[disk] == IO_BLOCKED
--
cgit v1.2.2

From b357f04a67c2aeee828b240863cd3f21d6cb3179 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Tue, 3 Jul 2012 17:45:31 +1000
Subject: md: fix up plugging (again).

The value returned by "mddev_check_plug" is only valid until the next
'schedule', as that will unplug things.  This could happen at any call to
mempool_alloc.

So just calling mddev_check_plug at the start doesn't really make sense.
So call it just before, or just after, queuing things for the thread.
As the action that happens at unplug is to wake the thread, this makes
lots of sense.

If we cannot add a plug (which requires a small GFP_ATOMIC alloc) we
wake the thread immediately.

RAID5 is a bit different.  Requests are queued for the thread and the
thread is woken by release_stripe.  So we don't need to wake the thread
on failure.  However the thread doesn't perform certain actions when
there is any active plug, so it is important to install a plug before
waking the thread.  So for RAID5 we install the plug *before* queuing
the request and waking the thread.

Without this patch it is possible for raid1 or raid10 to queue a request
without then waking the thread, resulting in the array locking up.
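As an illustrative aside (not part of the commit message or the patch), the
hazard described above looks roughly like this; the snippet is a condensed
sketch of the old make_request() write path, not a verbatim quote of it:

    /*
     * Sketch only: the result of an early plug check can go stale,
     * because any sleeping allocation in between may schedule, and a
     * schedule unplugs.
     */
    plugged = mddev_check_plugged(mddev);           /* may be true now...  */
    mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);   /* ...but this may sleep */
    /* ... queue mbio on conf->pending_bio_list ... */
    if (do_sync || !bitmap || !plugged)             /* ...so stale here    */
            md_wakeup_thread(mddev->thread);

The change below therefore drops the early check and instead calls
mddev_check_plugged() at the moment each request is queued, waking the
thread whenever no plug could be attached.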
Also change raid10 to only flush_pending_write when there are no active
plugs, just like raid1.

This patch is suitable for 3.0 or later.  I plan to submit it to -stable,
but I'd like to let it spend a few weeks in mainline first to be sure it
is completely safe.

Signed-off-by: NeilBrown
---
 drivers/md/raid1.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 34b4665cb0b6..8c2754f835ef 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -883,7 +883,6 @@ static void make_request(struct mddev *mddev, struct bio * bio)
         const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
         const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
         struct md_rdev *blocked_rdev;
-        int plugged;
         int first_clone;
         int sectors_handled;
         int max_sectors;
@@ -1034,7 +1033,6 @@ read_again:
          * the bad blocks. Each set of writes gets it's own r1bio
          * with a set of bios attached.
          */
-        plugged = mddev_check_plugged(mddev);
 
         disks = conf->raid_disks * 2;
  retry_write:
@@ -1191,6 +1189,8 @@ read_again:
                 bio_list_add(&conf->pending_bio_list, mbio);
                 conf->pending_count++;
                 spin_unlock_irqrestore(&conf->device_lock, flags);
+                if (!mddev_check_plugged(mddev))
+                        md_wakeup_thread(mddev->thread);
         }
         /* Mustn't call r1_bio_write_done before this next test,
          * as it could result in the bio being freed.
@@ -1213,9 +1213,6 @@ read_again:
 
         /* In case raid1d snuck in to freeze_array */
         wake_up(&conf->wait_barrier);
-
-        if (do_sync || !bitmap || !plugged)
-                md_wakeup_thread(mddev->thread);
 }
 
 static void status(struct seq_file *seq, struct mddev *mddev)
--
cgit v1.2.2

From 2d4f4f3384d4ef4f7c571448e803a1ce721113d5 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Mon, 9 Jul 2012 11:34:13 +1000
Subject: md/raid1: fix use-after-free bug in RAID1 data-check code.

This bug has been present ever since data-check was introduced in 2.6.16.
However it would only fire if a data-check were done on a degraded array,
which is only possible if the array has 3 or more devices.  This is
certainly possible, but is quite uncommon.

Since hot-replace was added in 3.3 it can happen more often as the same
condition can arise if not all possible replacements are present.

The problem is that as soon as we submit the last read request, the
'r1_bio' structure could be freed at any time, so we really should stop
looking at it.  If the last device is being read from we will stop
looking at it.  However if the last device is not due to be read from,
we will still check the bio pointer in the r1_bio, but the r1_bio might
already be free.

So use the read_targets counter to make sure we stop looking for bios
to submit as soon as we have submitted them all.

This fix is suitable for any -stable kernel since 2.6.16.
Cc: stable@vger.kernel.org
Reported-by: Arnold Schulz
Signed-off-by: NeilBrown
---
 drivers/md/raid1.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 8c2754f835ef..240ff3125040 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2485,9 +2485,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
          */
         if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
                 atomic_set(&r1_bio->remaining, read_targets);
-                for (i = 0; i < conf->raid_disks * 2; i++) {
+                for (i = 0; i < conf->raid_disks * 2 && read_targets; i++) {
                         bio = r1_bio->bios[i];
                         if (bio->bi_end_io == end_sync_read) {
+                                read_targets--;
                                 md_sync_acct(bio->bi_bdev, nr_sectors);
                                 generic_make_request(bio);
                         }
--
cgit v1.2.2

From 58e94ae18478c08229626daece2fc108a4a23261 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Thu, 19 Jul 2012 15:59:18 +1000
Subject: md/raid1: close some possible races on write errors during resync

commit 4367af556133723d0f443e14ca8170d9447317cb
   md/raid1: clear bad-block record when write succeeds.

added a 'reschedule_retry' call possibility at the end of end_sync_write,
but didn't add matching code at the end of sync_request_write.  So if the
writes complete very quickly, or scheduling makes it seem that way, then
we can miss rescheduling the request and the resync could hang.

Also commit 73d5c38a9536142e062c35997b044e89166e063b
   md: avoid races when stopping resync.

fixed a race condition in this same code in end_sync_write but didn't
make the change in sync_request_write.

This patch updates sync_request_write to fix both of those.

Patch is suitable for 3.1 and later kernels.

Reported-by: Alexander Lyakas
Original-version-by: Alexander Lyakas
Cc: stable@vger.kernel.org
Signed-off-by: NeilBrown
---
 drivers/md/raid1.c | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 240ff3125040..cacd008d6864 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1818,8 +1818,14 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio)
 
         if (atomic_dec_and_test(&r1_bio->remaining)) {
                 /* if we're here, all write(s) have completed, so clean up */
-                md_done_sync(mddev, r1_bio->sectors, 1);
-                put_buf(r1_bio);
+                int s = r1_bio->sectors;
+                if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
+                    test_bit(R1BIO_WriteError, &r1_bio->state))
+                        reschedule_retry(r1_bio);
+                else {
+                        put_buf(r1_bio);
+                        md_done_sync(mddev, s, 1);
+                }
         }
 }
 
--
cgit v1.2.2

From 0eaf822cb3dfcf2a64b2d27f4f6219186adb2695 Mon Sep 17 00:00:00 2001
From: Jonathan Brassow
Date: Tue, 31 Jul 2012 10:03:52 +1000
Subject: MD RAID1: rename mirror_info structure

MD RAID1: Rename the structure 'mirror_info' to 'raid1_info'

The same structure name ('mirror_info') is used by raid10.  Each of
these structures is defined in its respective header file.  If dm-raid
is to support both RAID1 and RAID10, the header files will be included
and the structure names must not collide.  While only one of these
structure names needs to change, this patch adds consistency to the
naming of the structure.
Signed-off-by: Jonathan Brassow
Signed-off-by: NeilBrown
---
 drivers/md/raid1.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index cacd008d6864..57d6abd497ef 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -873,7 +873,7 @@ do_sync_io:
 static void make_request(struct mddev *mddev, struct bio * bio)
 {
         struct r1conf *conf = mddev->private;
-        struct mirror_info *mirror;
+        struct raid1_info *mirror;
         struct r1bio *r1_bio;
         struct bio *read_bio;
         int i, disks;
@@ -1364,7 +1364,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
         struct r1conf *conf = mddev->private;
         int err = -EEXIST;
         int mirror = 0;
-        struct mirror_info *p;
+        struct raid1_info *p;
         int first = 0;
         int last = conf->raid_disks - 1;
         struct request_queue *q = bdev_get_queue(rdev->bdev);
@@ -1433,7 +1433,7 @@ static int raid1_remove_disk(struct mddev *mddev, struct md_rdev *rdev)
         struct r1conf *conf = mddev->private;
         int err = 0;
         int number = rdev->raid_disk;
-        struct mirror_info *p = conf->mirrors+ number;
+        struct raid1_info *p = conf->mirrors + number;
 
         if (rdev != p->rdev)
                 p = conf->mirrors + conf->raid_disks + number;
@@ -2521,7 +2521,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 {
         struct r1conf *conf;
         int i;
-        struct mirror_info *disk;
+        struct raid1_info *disk;
         struct md_rdev *rdev;
         int err = -ENOMEM;
 
@@ -2529,7 +2529,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
         if (!conf)
                 goto abort;
 
-        conf->mirrors = kzalloc(sizeof(struct mirror_info)
+        conf->mirrors = kzalloc(sizeof(struct raid1_info)
                                 * mddev->raid_disks * 2,
                                  GFP_KERNEL);
         if (!conf->mirrors)
@@ -2798,7 +2798,7 @@ static int raid1_reshape(struct mddev *mddev)
          */
         mempool_t *newpool, *oldpool;
         struct pool_info *newpoolinfo;
-        struct mirror_info *newmirrors;
+        struct raid1_info *newmirrors;
         struct r1conf *conf = mddev->private;
         int cnt, raid_disks;
         unsigned long flags;
@@ -2841,7 +2841,7 @@ static int raid1_reshape(struct mddev *mddev)
                 kfree(newpoolinfo);
                 return -ENOMEM;
         }
-        newmirrors = kzalloc(sizeof(struct mirror_info) * raid_disks * 2,
+        newmirrors = kzalloc(sizeof(struct raid1_info) * raid_disks * 2,
                              GFP_KERNEL);
         if (!newmirrors) {
                 kfree(newpoolinfo);
--
cgit v1.2.2

From 473e87ce485ffcac041f7911b33f0b4cd4d6cf2b Mon Sep 17 00:00:00 2001
From: Jonathan Brassow
Date: Tue, 31 Jul 2012 10:03:52 +1000
Subject: MD: Move macros from raid1*.h to raid1*.c

MD RAID1/RAID10: Move some macros from .h file to .c file

There are three macros (IO_BLOCKED, IO_MADE_GOOD, BIO_SPECIAL) which are
defined in both raid1.h and raid10.h.  They are only used in their
respective .c files.  However, if we wish to make RAID10 accessible to
the device-mapper RAID target (dm-raid.c), then we need to move these
macros into the .c files where they are used so that they do not
conflict with each other.

The macros from the two files are identical and could be moved into
md.h, but I chose to leave the duplication and have them remain in the
personality files.
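As an illustrative aside (not part of this patch): the sentinels are tiny
integer values cast to pointers, so cleanup paths must recognise and skip
them rather than treat them as real bios.  The sketch below is modelled on
raid1's existing put_all_bios() helper, with comments added here; details
of the real function may differ slightly:

    /* Sketch only: release real bios, but skip the IO_BLOCKED and
     * IO_MADE_GOOD sentinels, which are not valid bio pointers. */
    static void put_all_bios(struct r1conf *conf, struct r1bio *r1_bio)
    {
            int i;

            for (i = 0; i < conf->raid_disks * 2; i++) {
                    struct bio **bio = r1_bio->bios + i;

                    if (!BIO_SPECIAL(*bio))
                            bio_put(*bio);
                    *bio = NULL;
            }
    }

This is exactly the kind of check that BIO_SPECIAL() exists to keep in one
place.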
Signed-off-by: Jonathan Brassow
Signed-off-by: NeilBrown
---
 drivers/md/raid1.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 57d6abd497ef..d3d3568b4fb1 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -46,6 +46,20 @@
  */
 #define NR_RAID1_BIOS 256
 
+/* when we get a read error on a read-only array, we redirect to another
+ * device without failing the first device, or trying to over-write to
+ * correct the read error. To keep track of bad blocks on a per-bio
+ * level, we store IO_BLOCKED in the appropriate 'bios' pointer
+ */
+#define IO_BLOCKED ((struct bio *)1)
+/* When we successfully write to a known bad-block, we need to remove the
+ * bad-block marking which must be done from process context. So we record
+ * the success by setting devs[n].bio to IO_MADE_GOOD
+ */
+#define IO_MADE_GOOD ((struct bio *)2)
+
+#define BIO_SPECIAL(bio) ((unsigned long)bio <= 2)
+
 /* When there are this many requests queue to be written by
  * the raid1 thread, we become 'congested' to provide back-pressure
  * for writeback.
--
cgit v1.2.2

From be4d3280b17bc51f23ec6ebb345728f302f80a0c Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Tue, 31 Jul 2012 10:03:53 +1000
Subject: md/raid1: make sequential read detection per disk based

Currently the sequential read detection is global (array-wide).  It's
natural to make it per disk based, which can improve the detection for
concurrent multiple sequential reads.  And the next patch will make SSD
read balancing not use a distance based algorithm, where this change
helps detect truly sequential reads for SSDs.

Signed-off-by: Shaohua Li
Signed-off-by: NeilBrown
---
 drivers/md/raid1.c | 35 +++++++----------------------------
 1 file changed, 7 insertions(+), 28 deletions(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index d3d3568b4fb1..fb96c0c2db40 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -497,9 +497,8 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
         const sector_t this_sector = r1_bio->sector;
         int sectors;
         int best_good_sectors;
-        int start_disk;
         int best_disk;
-        int i;
+        int disk;
         sector_t best_dist;
         struct md_rdev *rdev;
         int choose_first;
@@ -517,23 +516,16 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
         best_good_sectors = 0;
 
         if (conf->mddev->recovery_cp < MaxSector &&
-            (this_sector + sectors >= conf->next_resync)) {
+            (this_sector + sectors >= conf->next_resync))
                 choose_first = 1;
-                start_disk = 0;
-        } else {
+        else
                 choose_first = 0;
-                start_disk = conf->last_used;
-        }
 
-        for (i = 0 ; i < conf->raid_disks * 2 ; i++) {
+        for (disk = 0 ; disk < conf->raid_disks * 2 ; disk++) {
                 sector_t dist;
                 sector_t first_bad;
                 int bad_sectors;
 
-                int disk = start_disk + i;
-                if (disk >= conf->raid_disks * 2)
-                        disk -= conf->raid_disks * 2;
-
                 rdev = rcu_dereference(conf->mirrors[disk].rdev);
                 if (r1_bio->bios[disk] == IO_BLOCKED
                     || rdev == NULL
@@ -594,7 +586,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
                 dist = abs(this_sector - conf->mirrors[disk].head_position);
                 if (choose_first
                     /* Don't change to another disk for sequential reads */
-                    || conf->next_seq_sect == this_sector
+                    || conf->mirrors[disk].next_seq_sect == this_sector
                     || dist == 0
                     /* If device is idle, use it */
                     || atomic_read(&rdev->nr_pending) == 0) {
@@ -620,8 +612,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
                         goto retry;
                 }
                 sectors = best_good_sectors;
-                conf->next_seq_sect = this_sector + sectors;
-                conf->last_used = best_disk;
+                conf->mirrors[best_disk].next_seq_sect = this_sector + sectors;
         }
         rcu_read_unlock();
         *max_sectors = sectors;
@@ -2599,7 +2590,6 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 
         conf->recovery_disabled = mddev->recovery_disabled - 1;
         err = -EIO;
-        conf->last_used = -1;
         for (i = 0; i < conf->raid_disks * 2; i++) {
 
                 disk = conf->mirrors + i;
@@ -2625,19 +2615,9 @@ static struct r1conf *setup_conf(struct mddev *mddev)
                         if (disk->rdev &&
                             (disk->rdev->saved_raid_disk < 0))
                                 conf->fullsync = 1;
-                } else if (conf->last_used < 0)
-                        /*
-                         * The first working device is used as a
-                         * starting point to read balancing.
-                         */
-                        conf->last_used = i;
+                }
         }
 
-        if (conf->last_used < 0) {
-                printk(KERN_ERR "md/raid1:%s: no operational mirrors\n",
-                       mdname(mddev));
-                goto abort;
-        }
         err = -ENOMEM;
         conf->thread = md_register_thread(raid1d, mddev, "raid1");
         if (!conf->thread) {
@@ -2894,7 +2874,6 @@ static int raid1_reshape(struct mddev *mddev)
         conf->raid_disks = mddev->raid_disks = raid_disks;
         mddev->delta_disks = 0;
 
-        conf->last_used = 0; /* just make sure it is in-range */
         lower_barrier(conf);
         set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 
--
cgit v1.2.2

From 9dedf60313fa4dddfd5b9b226a0ef12a512bf9dc Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Tue, 31 Jul 2012 10:03:53 +1000
Subject: md/raid1: read balance chooses idlest disk for SSD

An SSD has no spindle, so the distance between requests means nothing,
and the original distance based algorithm can sometimes cause severe
performance issues for SSD raid.

Consider two thread groups: one accesses file A, the other accesses
file B.  The first group will access one disk and the second will
access the other disk, because requests from one group are close
together while requests from different groups are far apart.  In this
case, read balancing might keep one disk very busy while the other is
relatively idle.  For SSDs, we should try our best to distribute
requests to as many disks as possible.  There is no spindle move
penalty anyway.

With the patch below, I can sometimes see more than 50% throughput
improvement, depending on the workload.

The only exception is when small requests can be merged into a big
request, which typically drives higher throughput for SSDs too.  Such
small requests are sequential reads.  Unlike with hard disks,
sequential reads which can't be merged (for example direct IO, or reads
without readahead) can be ignored for SSDs.  Again, there is no spindle
move penalty.  Readahead dispatches small requests and such requests
can be merged.

The last patch helps detect sequential reads well, at least if the
number of concurrent reads isn't greater than the number of raid disks.
In that case, the distance based algorithm doesn't work well either.

V2: For a mixed hard disk and SSD raid, don't use the distance based
algorithm for random IO either.  This makes the algorithm generic for
raids containing SSDs.
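To make the selection rule above concrete, here is a minimal,
self-contained sketch of the policy (illustrative only: the function and
variable names are invented, and the real read_balance() also handles
sequential reads, bad blocks and the "disk is idle right now" shortcut):

    #include <limits.h>

    /* Sketch only: pick the least-loaded disk when any SSD is present,
     * otherwise fall back to the classic shortest-distance choice. */
    static int choose_read_disk(int ndisks, const unsigned int *pending,
                                const unsigned long *dist, int has_nonrot_disk)
    {
            unsigned int min_pending = UINT_MAX;
            unsigned long best_dist = ULONG_MAX;
            int best_pending_disk = -1, best_dist_disk = -1;
            int i;

            for (i = 0; i < ndisks; i++) {
                    if (pending[i] < min_pending) {
                            min_pending = pending[i];
                            best_pending_disk = i;
                    }
                    if (dist[i] < best_dist) {
                            best_dist = dist[i];
                            best_dist_disk = i;
                    }
            }

            /* Any non-rotational disk present: balance by queue depth,
             * not by head distance. */
            return has_nonrot_disk ? best_pending_disk : best_dist_disk;
    }

The actual change in the diff below keeps both candidates (best_pending_disk
and best_dist_disk) and only decides between them once the scan is complete.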
Signed-off-by: Shaohua Li
Signed-off-by: NeilBrown
---
 drivers/md/raid1.c | 34 +++++++++++++++++++++++++++++++---
 1 file changed, 31 insertions(+), 3 deletions(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index fb96c0c2db40..d9869f25aa75 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -497,9 +497,11 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
         const sector_t this_sector = r1_bio->sector;
         int sectors;
         int best_good_sectors;
-        int best_disk;
+        int best_disk, best_dist_disk, best_pending_disk;
+        int has_nonrot_disk;
         int disk;
         sector_t best_dist;
+        unsigned int min_pending;
         struct md_rdev *rdev;
         int choose_first;
 
@@ -512,8 +514,12 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
  retry:
         sectors = r1_bio->sectors;
         best_disk = -1;
+        best_dist_disk = -1;
         best_dist = MaxSector;
+        best_pending_disk = -1;
+        min_pending = UINT_MAX;
         best_good_sectors = 0;
+        has_nonrot_disk = 0;
 
         if (conf->mddev->recovery_cp < MaxSector &&
             (this_sector + sectors >= conf->next_resync))
@@ -525,6 +531,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
                 sector_t dist;
                 sector_t first_bad;
                 int bad_sectors;
+                unsigned int pending;
 
                 rdev = rcu_dereference(conf->mirrors[disk].rdev);
                 if (r1_bio->bios[disk] == IO_BLOCKED
@@ -583,22 +590,43 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
                 } else
                         best_good_sectors = sectors;
 
+                has_nonrot_disk |= blk_queue_nonrot(bdev_get_queue(rdev->bdev));
+                pending = atomic_read(&rdev->nr_pending);
                 dist = abs(this_sector - conf->mirrors[disk].head_position);
                 if (choose_first
                     /* Don't change to another disk for sequential reads */
                     || conf->mirrors[disk].next_seq_sect == this_sector
                     || dist == 0
                     /* If device is idle, use it */
-                    || atomic_read(&rdev->nr_pending) == 0) {
+                    || pending == 0) {
                         best_disk = disk;
                         break;
                 }
+
+                if (min_pending > pending) {
+                        min_pending = pending;
+                        best_pending_disk = disk;
+                }
+
                 if (dist < best_dist) {
                         best_dist = dist;
-                        best_disk = disk;
+                        best_dist_disk = disk;
                 }
         }
 
+        /*
+         * If all disks are rotational, choose the closest disk. If any disk is
+         * non-rotational, choose the disk with less pending request even the
+         * disk is rotational, which might/might not be optimal for raids with
+         * mixed ratation/non-rotational disks depending on workload.
+         */
+        if (best_disk == -1) {
+                if (has_nonrot_disk)
+                        best_disk = best_pending_disk;
+                else
+                        best_disk = best_dist_disk;
+        }
+
         if (best_disk >= 0) {
                 rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
                 if (!rdev)
--
cgit v1.2.2

From 12cee5a8a29e7263e39953f1d941f723c617ca5f Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Tue, 31 Jul 2012 10:03:53 +1000
Subject: md/raid1: prevent merging too large request

For SSDs, once the request size exceeds a specific value (the optimal IO
size), request size isn't important for bandwidth.  In that situation, if
making the request size bigger causes some disks to go idle, the total
throughput will actually drop.  A good example is doing a readahead in a
two-disk raid1 setup.

So when should we split big requests?  We absolutely don't want to split
big requests into very small requests.  Even on an SSD, big request
transfers are more efficient.  This patch only considers requests with a
size above the optimal IO size.

If all disks are busy, is it worth doing a split?  Say the optimal IO size
is 16k, there are two 32k requests and two disks.  We can let each disk
run one 32k request, or split the requests into four 16k requests so that
each disk runs two.
It's hard to say which case is better; it depends on the hardware.

So we only consider the case where there are idle disks.  For readahead,
splitting is always better in this case.  And in my test, the patch below
can improve throughput by more than 30%.  Hmm, not 100%, because the disks
aren't 100% busy.

Such a case can happen not just with readahead; for example, it can happen
with direct IO.  But I suppose direct IO usually has a bigger IO depth and
makes all disks busy, so I ignored it.

Note: if the raid uses any hard disk, we don't prevent merging.  That would
make performance worse.

Signed-off-by: Shaohua Li
Signed-off-by: NeilBrown
---
 drivers/md/raid1.c | 56 +++++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 49 insertions(+), 7 deletions(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index d9869f25aa75..7aa958ed2847 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -504,6 +504,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
         unsigned int min_pending;
         struct md_rdev *rdev;
         int choose_first;
+        int choose_next_idle;
 
         rcu_read_lock();
         /*
@@ -520,6 +521,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
         min_pending = UINT_MAX;
         best_good_sectors = 0;
         has_nonrot_disk = 0;
+        choose_next_idle = 0;
 
         if (conf->mddev->recovery_cp < MaxSector &&
             (this_sector + sectors >= conf->next_resync))
@@ -532,6 +534,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
                 sector_t first_bad;
                 int bad_sectors;
                 unsigned int pending;
+                bool nonrot;
 
                 rdev = rcu_dereference(conf->mirrors[disk].rdev);
                 if (r1_bio->bios[disk] == IO_BLOCKED
@@ -590,18 +593,52 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
                 } else
                         best_good_sectors = sectors;
 
-                has_nonrot_disk |= blk_queue_nonrot(bdev_get_queue(rdev->bdev));
+                nonrot = blk_queue_nonrot(bdev_get_queue(rdev->bdev));
+                has_nonrot_disk |= nonrot;
                 pending = atomic_read(&rdev->nr_pending);
                 dist = abs(this_sector - conf->mirrors[disk].head_position);
-                if (choose_first
-                    /* Don't change to another disk for sequential reads */
-                    || conf->mirrors[disk].next_seq_sect == this_sector
-                    || dist == 0
-                    /* If device is idle, use it */
-                    || pending == 0) {
+                if (choose_first) {
                         best_disk = disk;
                         break;
                 }
+                /* Don't change to another disk for sequential reads */
+                if (conf->mirrors[disk].next_seq_sect == this_sector
+                    || dist == 0) {
+                        int opt_iosize = bdev_io_opt(rdev->bdev) >> 9;
+                        struct raid1_info *mirror = &conf->mirrors[disk];
+
+                        best_disk = disk;
+                        /*
+                         * If buffered sequential IO size exceeds optimal
+                         * iosize, check if there is idle disk. If yes, choose
+                         * the idle disk. read_balance could already choose an
+                         * idle disk before noticing it's a sequential IO in
+                         * this disk. This doesn't matter because this disk
+                         * will idle, next time it will be utilized after the
+                         * first disk has IO size exceeds optimal iosize. In
+                         * this way, iosize of the first disk will be optimal
+                         * iosize at least. iosize of the second disk might be
+                         * small, but not a big deal since when the second disk
+                         * starts IO, the first disk is likely still busy.
+                         */
+                        if (nonrot && opt_iosize > 0 &&
+                            mirror->seq_start != MaxSector &&
+                            mirror->next_seq_sect > opt_iosize &&
+                            mirror->next_seq_sect - opt_iosize >=
+                            mirror->seq_start) {
+                                choose_next_idle = 1;
+                                continue;
+                        }
+                        break;
+                }
+                /* If device is idle, use it */
+                if (pending == 0) {
+                        best_disk = disk;
+                        break;
+                }
+
+                if (choose_next_idle)
+                        continue;
 
                 if (min_pending > pending) {
                         min_pending = pending;
                         best_pending_disk = disk;
@@ -640,6 +677,10 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
                         goto retry;
                 }
                 sectors = best_good_sectors;
+
+                if (conf->mirrors[best_disk].next_seq_sect != this_sector)
+                        conf->mirrors[best_disk].seq_start = this_sector;
+
                 conf->mirrors[best_disk].next_seq_sect = this_sector + sectors;
         }
         rcu_read_unlock();
         *max_sectors = sectors;
@@ -2605,6 +2646,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
                         mddev->merge_check_needed = 1;
 
                 disk->head_position = 0;
+                disk->seq_start = MaxSector;
         }
         conf->raid_disks = mddev->raid_disks;
         conf->mddev = mddev;
--
cgit v1.2.2

From b7219ccb33aa0df9949a60c68b5e9f712615e56f Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Tue, 31 Jul 2012 10:05:34 +1000
Subject: md/raid1: don't abort a resync on the first badblock.

If a resync of a RAID1 array with 2 devices finds a known bad block on one
device, it will neither read from, nor write to, that device for this
block offset.  So there will be one read_target (the other device) and
zero write targets.

This condition causes md/raid1 to abort the resync assuming that it has
finished - without known bad blocks this would be true.

When there are no write targets because of the presence of bad blocks, we
should only skip over the area covered by the bad block.  RAID10 already
gets this right, raid1 doesn't.  Or didn't.

As this can cause a 'sync' to abort early and appear to have succeeded, it
could lead to some data corruption, so it is suitable for -stable.

Cc: stable@vger.kernel.org
Reported-by: Alexander Lyakas
Signed-off-by: NeilBrown
---
 drivers/md/raid1.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 7aa958ed2847..d2361b162de5 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2502,7 +2502,10 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
                         /* There is nowhere to write, so all non-sync
                          * drives must be failed - so we are finished
                          */
-                        sector_t rv = max_sector - sector_nr;
+                        sector_t rv;
+                        if (min_bad > 0)
+                                max_sector = sector_nr + min_bad;
+                        rv = max_sector - sector_nr;
                         *skipped = 1;
                         put_buf(r1_bio);
                         return rv;
--
cgit v1.2.2

From d57368afe63b3b7b45ce6c2b8c5276417935be2f Mon Sep 17 00:00:00 2001
From: Alexander Lyakas
Date: Tue, 17 Jul 2012 13:17:55 +0300
Subject: md/RAID1: Add missing case for attempting to repair known bad blocks.

When doing resync or repair, attempt to correct bad blocks, according to
the WriteErrorSeen policy.

Signed-off-by: Alex Lyakas
Signed-off-by: NeilBrown
---
 drivers/md/raid1.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index d2361b162de5..197f62681db5 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2445,6 +2445,18 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipp
                         bio->bi_rw = READ;
                         bio->bi_end_io = end_sync_read;
                         read_targets++;
+                } else if (!test_bit(WriteErrorSeen, &rdev->flags) &&
+                        test_bit(MD_RECOVERY_SYNC, &mddev->recovery) &&
+                        !test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) {
+                        /*
+                         * The device is suitable for reading (InSync),
+                         * but has bad block(s) here. Let's try to correct them,
+                         * if we are doing resync or repair. Otherwise, leave
+                         * this device alone for this sync request.
+                         */
+                        bio->bi_rw = WRITE;
+                        bio->bi_end_io = end_sync_write;
+                        write_targets++;
                 }
         }
         if (bio->bi_end_io) {
--
cgit v1.2.2

From 0021b7bc045e4b0b85d8c53614342aaf84ca96a5 Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Tue, 31 Jul 2012 09:08:14 +0200
Subject: md: remove plug_cnt feature of plugging.

This seemed like a good idea at the time, but after further thought I
cannot see it making a difference other than very occasionally, and
testing to try to exercise the case it is most likely to help did not
show any performance difference by removing it.

So remove the counting of active plugs and allow 'pending writes' to be
activated at any time, not just when no plugs are active.

This is only relevant when there is a write-intent bitmap, and the
updating of the bitmap will likely introduce enough delay that the
single-threading of bitmap updates will be enough to collect large
numbers of updates together.

Removing this will make it easier to centralise the unplug code, and will
clear the way for other unplug enhancements which have a measurable
effect.

Signed-off-by: NeilBrown
Signed-off-by: Jens Axboe
---
 drivers/md/raid1.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index cacd008d6864..36a8fc059ac3 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -2173,8 +2173,7 @@ static void raid1d(struct mddev *mddev)
         blk_start_plug(&plug);
         for (;;) {
 
-                if (atomic_read(&mddev->plug_cnt) == 0)
-                        flush_pending_writes(conf);
+                flush_pending_writes(conf);
 
                 spin_lock_irqsave(&conf->device_lock, flags);
                 if (list_empty(head)) {
--
cgit v1.2.2

From f54a9d0e59c4bea3db733921ca9147612a6f292c Mon Sep 17 00:00:00 2001
From: NeilBrown
Date: Thu, 2 Aug 2012 08:33:20 +1000
Subject: md/raid1: submit IO from originating thread instead of md thread.

Queuing writes to the md thread means that all requests go through the one
processor, which may not be able to keep up with very high request rates.

So use the plugging infrastructure to submit all requests on unplug.  If a
'schedule' is needed, we fall back on the old approach of handing the
requests to the thread for it to handle.
Signed-off-by: NeilBrown
---
 drivers/md/raid1.c | 57 +++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 54 insertions(+), 3 deletions(-)

(limited to 'drivers/md/raid1.c')

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 36a8fc059ac3..9f01870d031c 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -870,6 +870,44 @@ do_sync_io:
         pr_debug("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
 }
 
+struct raid1_plug_cb {
+        struct blk_plug_cb      cb;
+        struct bio_list         pending;
+        int                     pending_cnt;
+};
+
+static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
+{
+        struct raid1_plug_cb *plug = container_of(cb, struct raid1_plug_cb,
+                                                  cb);
+        struct mddev *mddev = plug->cb.data;
+        struct r1conf *conf = mddev->private;
+        struct bio *bio;
+
+        if (from_schedule) {
+                spin_lock_irq(&conf->device_lock);
+                bio_list_merge(&conf->pending_bio_list, &plug->pending);
+                conf->pending_count += plug->pending_cnt;
+                spin_unlock_irq(&conf->device_lock);
+                md_wakeup_thread(mddev->thread);
+                kfree(plug);
+                return;
+        }
+
+        /* we aren't scheduling, so we can do the write-out directly. */
+        bio = bio_list_get(&plug->pending);
+        bitmap_unplug(mddev->bitmap);
+        wake_up(&conf->wait_barrier);
+
+        while (bio) { /* submit pending writes */
+                struct bio *next = bio->bi_next;
+                bio->bi_next = NULL;
+                generic_make_request(bio);
+                bio = next;
+        }
+        kfree(plug);
+}
+
 static void make_request(struct mddev *mddev, struct bio * bio)
 {
         struct r1conf *conf = mddev->private;
@@ -883,6 +921,8 @@ static void make_request(struct mddev *mddev, struct bio * bio)
         const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
         const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
         struct md_rdev *blocked_rdev;
+        struct blk_plug_cb *cb;
+        struct raid1_plug_cb *plug = NULL;
         int first_clone;
         int sectors_handled;
         int max_sectors;
@@ -1185,11 +1225,22 @@ read_again:
                 mbio->bi_private = r1_bio;
 
                 atomic_inc(&r1_bio->remaining);
+
+                cb = blk_check_plugged(raid1_unplug, mddev, sizeof(*plug));
+                if (cb)
+                        plug = container_of(cb, struct raid1_plug_cb, cb);
+                else
+                        plug = NULL;
                 spin_lock_irqsave(&conf->device_lock, flags);
-                bio_list_add(&conf->pending_bio_list, mbio);
-                conf->pending_count++;
+                if (plug) {
+                        bio_list_add(&plug->pending, mbio);
+                        plug->pending_cnt++;
+                } else {
+                        bio_list_add(&conf->pending_bio_list, mbio);
+                        conf->pending_count++;
+                }
                 spin_unlock_irqrestore(&conf->device_lock, flags);
-                if (!mddev_check_plugged(mddev))
+                if (!plug)
                         md_wakeup_thread(mddev->thread);
         }
         /* Mustn't call r1_bio_write_done before this next test,
--
cgit v1.2.2
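As a closing illustrative note (not part of any patch above): the new
raid1_unplug() callback is driven by the generic block-layer plugging API.
A rough sketch of the caller side, with an invented helper name, shows when
the two branches of raid1_unplug() run:

    /* Sketch only: a submitter batching writes to a raid1 array under
     * a plug; the helper name and surrounding details are invented. */
    static void submit_write_batch(struct bio **bios, int n)
    {
            struct blk_plug plug;
            int i;

            blk_start_plug(&plug);
            for (i = 0; i < n; i++)
                    /* enters raid1's make_request(), which attaches
                     * raid1_unplug() via blk_check_plugged() */
                    generic_make_request(bios[i]);
            blk_finish_plug(&plug);
            /* raid1_unplug(cb, false) ran here and issued the queued bios;
             * had the task scheduled while still plugged, the callback would
             * have run with from_schedule == true and handed the bios to the
             * raid1 thread instead. */
    }

Because the bios are issued from the submitting task whenever possible, the
single md thread is no longer a bottleneck at high write rates.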