author    Linus Torvalds <torvalds@linux-foundation.org>  2011-05-23 01:03:03 -0400
committer Linus Torvalds <torvalds@linux-foundation.org>  2011-05-23 01:03:03 -0400
commit    4b382d0643603819e8b48da58efc254cabc22574
tree      71ae8bc989af8a0137c065e4741a76dc4e4d4cb8 /drivers
parent    bdfbe804c2303cb4b178bb4b5c3e855892472033
parent    b098636cf04c89db4036fedc778da0acc666ad1a
Merge branch 'for-linus' of git://neil.brown.name/md
* 'for-linus' of git://neil.brown.name/md:
md: allow resync_start to be set while an array is active.
md/raid10: reformat some loops with less indenting.
md/raid10: remove unused variable.
md/raid10: make more use of 'slot' in raid10d.
md/raid10: some tidying up in fix_read_error
md/raid1: improve handling of pages allocated for write-behind.
md/raid1: try fix_sync_read_error before process_checks.
md/raid1: tidy up new functions: process_checks and fix_sync_read_error.
md/raid1: split out two sub-functions from sync_request_write
md: make error_handler functions more uniform and correct.
md/multipath: discard ->working_disks in favour of ->degraded
md/raid1: clean up read_balance.
md: simplify raid10 read_balance
md/bitmap: fix saving of events_cleared and other state.
md: reject a re-add request that cannot be honoured.
md: Fix race when creating a new md device.
Diffstat (limited to 'drivers')
 drivers/md/bitmap.c    |  10
 drivers/md/md.c        |  23
 drivers/md/multipath.c |  60
 drivers/md/multipath.h |   1
 drivers/md/raid1.c     | 506
 drivers/md/raid1.h     |   4
 drivers/md/raid10.c    | 424
 drivers/md/raid5.c     |  41
 8 files changed, 535 insertions(+), 534 deletions(-)
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index 5c9362792f1d..70bd738b8b99 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -493,11 +493,11 @@ void bitmap_update_sb(struct bitmap *bitmap)
 	spin_unlock_irqrestore(&bitmap->lock, flags);
 	sb = kmap_atomic(bitmap->sb_page, KM_USER0);
 	sb->events = cpu_to_le64(bitmap->mddev->events);
-	if (bitmap->mddev->events < bitmap->events_cleared) {
+	if (bitmap->mddev->events < bitmap->events_cleared)
 		/* rocking back to read-only */
 		bitmap->events_cleared = bitmap->mddev->events;
-		sb->events_cleared = cpu_to_le64(bitmap->events_cleared);
-	}
+	sb->events_cleared = cpu_to_le64(bitmap->events_cleared);
+	sb->state = cpu_to_le32(bitmap->flags);
 	/* Just in case these have been changed via sysfs: */
 	sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ);
 	sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind);
@@ -618,7 +618,7 @@ success:
 	if (le32_to_cpu(sb->version) == BITMAP_MAJOR_HOSTENDIAN)
 		bitmap->flags |= BITMAP_HOSTENDIAN;
 	bitmap->events_cleared = le64_to_cpu(sb->events_cleared);
-	if (sb->state & cpu_to_le32(BITMAP_STALE))
+	if (bitmap->flags & BITMAP_STALE)
 		bitmap->events_cleared = bitmap->mddev->events;
 	err = 0;
 out:
@@ -652,9 +652,11 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
 	switch (op) {
 	case MASK_SET:
 		sb->state |= cpu_to_le32(bits);
+		bitmap->flags |= bits;
 		break;
 	case MASK_UNSET:
 		sb->state &= cpu_to_le32(~bits);
+		bitmap->flags &= ~bits;
 		break;
 	default:
 		BUG();
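
A note on the bitmap.c hunks above: bitmap_update_sb() used to write sb->events_cleared only inside the rarely taken "rocking back to read-only" branch, so the cleared-events count (and sb->state) could go stale on disk; both are now written on every superblock update. That in turn relies on bitmap_mask_state() keeping the in-memory bitmap->flags in step with the little-endian on-disk sb->state, which is why the BITMAP_STALE test in the second hunk can read bitmap->flags directly. A minimal userspace sketch of the mirrored-flags pattern; the types and the BITMAP_STALE value are toys, not the kernel's:

#include <stdint.h>
#include <stdio.h>

#define BITMAP_STALE 2u	/* illustrative value, not the kernel's */

enum mask_op { MASK_SET, MASK_UNSET };

struct toy_bitmap {
	uint32_t sb_state;	/* stands in for the on-disk sb->state */
	uint32_t flags;		/* in-memory mirror of sb_state */
};

static void bitmap_mask_state(struct toy_bitmap *bitmap, uint32_t bits,
			      enum mask_op op)
{
	switch (op) {
	case MASK_SET:
		bitmap->sb_state |= bits;	/* kernel: sb->state |= cpu_to_le32(bits) */
		bitmap->flags |= bits;		/* the newly mirrored update */
		break;
	case MASK_UNSET:
		bitmap->sb_state &= ~bits;
		bitmap->flags &= ~bits;
		break;
	}
}

int main(void)
{
	struct toy_bitmap b = { 0, 0 };

	bitmap_mask_state(&b, BITMAP_STALE, MASK_SET);
	printf("stale=%d\n", !!(b.flags & BITMAP_STALE));	/* 1 */
	bitmap_mask_state(&b, BITMAP_STALE, MASK_UNSET);
	printf("stale=%d\n", !!(b.flags & BITMAP_STALE));	/* 0 */
	return 0;
}

Keeping the mirror authoritative in memory means later tests never need to map the superblock page just to inspect a flag.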
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 7d6f7f18a920..aa640a85bb21 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -3324,7 +3324,7 @@ resync_start_store(mddev_t *mddev, const char *buf, size_t len)
 	char *e;
 	unsigned long long n = simple_strtoull(buf, &e, 10);
 
-	if (mddev->pers)
+	if (mddev->pers && !test_bit(MD_RECOVERY_FROZEN, &mddev->recovery))
 		return -EBUSY;
 	if (cmd_match(buf, "none"))
 		n = MaxSector;
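
The resync_start_store() hunk implements "md: allow resync_start to be set while an array is active": the write is refused with -EBUSY only when the personality is running and MD_RECOVERY_FROZEN is clear, so userspace can freeze recovery first and then adjust resync_start on a live array. A hedged sketch of the gate, with plain booleans standing in for mddev state:

#include <stdbool.h>
#include <stdio.h>

/* true when a write to md/resync_start would be accepted */
static bool resync_start_write_allowed(bool pers_running, bool recovery_frozen)
{
	if (pers_running && !recovery_frozen)
		return false;	/* kernel returns -EBUSY here */
	return true;
}

int main(void)
{
	printf("%d\n", resync_start_write_allowed(true, false));	/* 0 */
	printf("%d\n", resync_start_write_allowed(true, true));	/* 1 */
	return 0;
}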
@@ -4347,13 +4347,19 @@ static int md_alloc(dev_t dev, char *name)
 	disk->fops = &md_fops;
 	disk->private_data = mddev;
 	disk->queue = mddev->queue;
+	blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA);
 	/* Allow extended partitions.  This makes the
 	 * 'mdp' device redundant, but we can't really
 	 * remove it now.
 	 */
 	disk->flags |= GENHD_FL_EXT_DEVT;
-	add_disk(disk);
 	mddev->gendisk = disk;
+	/* As soon as we call add_disk(), another thread could get
+	 * through to md_open, so make sure it doesn't get too far
+	 */
+	mutex_lock(&mddev->open_mutex);
+	add_disk(disk);
+
 	error = kobject_init_and_add(&mddev->kobj, &md_ktype,
 				     &disk_to_dev(disk)->kobj, "%s", "md");
 	if (error) {
@@ -4367,8 +4373,7 @@ static int md_alloc(dev_t dev, char *name)
 	if (mddev->kobj.sd &&
 	    sysfs_create_group(&mddev->kobj, &md_bitmap_group))
 		printk(KERN_DEBUG "pointless warning\n");
-
-	blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA);
+	mutex_unlock(&mddev->open_mutex);
 abort:
 	mutex_unlock(&disks_mutex);
 	if (!error && mddev->kobj.sd) {
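
The two md_alloc() hunks above are the "md: Fix race when creating a new md device" change: add_disk() makes the device node visible immediately, so a concurrent md_open() could previously run against a half-initialised mddev. Taking mddev->open_mutex (the same lock the open path takes) before add_disk() and dropping it only after the sysfs setup closes that window; the blk_queue_flush() call also moves before the disk is published. A userspace sketch of the lock-before-publish pattern, with pthreads standing in for the kernel mutex:

#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t open_mutex = PTHREAD_MUTEX_INITIALIZER;
static bool fully_initialised;

static void md_alloc_like(void)
{
	pthread_mutex_lock(&open_mutex);
	/* add_disk() equivalent: the device becomes visible here, but a
	 * concurrent opener blocks on the mutex until setup finishes */
	fully_initialised = true;	/* finish sysfs groups, etc. */
	pthread_mutex_unlock(&open_mutex);
}

static int md_open_like(void)
{
	int err;
	pthread_mutex_lock(&open_mutex);
	err = fully_initialised ? 0 : -1;
	pthread_mutex_unlock(&open_mutex);
	return err;
}

int main(void)
{
	md_alloc_like();
	return md_open_like();	/* 0: open saw a fully set-up device */
}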
@@ -5211,6 +5216,16 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
 		} else
 			super_types[mddev->major_version].
 				validate_super(mddev, rdev);
+		if ((info->state & (1<<MD_DISK_SYNC)) &&
+		    (!test_bit(In_sync, &rdev->flags) ||
+		     rdev->raid_disk != info->raid_disk)) {
+			/* This was a hot-add request, but events doesn't
+			 * match, so reject it.
+			 */
+			export_rdev(rdev);
+			return -EINVAL;
+		}
+
 		if (test_bit(In_sync, &rdev->flags))
 			rdev->saved_raid_disk = rdev->raid_disk;
 		else
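
The add_new_disk() hunk implements "md: reject a re-add request that cannot be honoured": if userspace asked for a hot re-add (MD_DISK_SYNC set) but validate_super() could not restore the device to its old in-sync slot, the request now fails with -EINVAL instead of silently falling back to a full recovery as a spare. A small sketch of the acceptance rule; the structs are stand-ins for mdu_disk_info_t and rdev state, not kernel types:

#include <stdbool.h>
#include <stdio.h>

struct readd_req { bool want_sync; int raid_disk; };
struct toy_rdev  { bool in_sync;  int raid_disk; };

/* 0 on success, -1 (kernel: -EINVAL) when the hot re-add request
 * cannot be honoured exactly as asked */
static int check_readd(const struct readd_req *req, const struct toy_rdev *rdev)
{
	if (req->want_sync &&
	    (!rdev->in_sync || rdev->raid_disk != req->raid_disk))
		return -1;
	return 0;
}

int main(void)
{
	struct readd_req req = { .want_sync = true, .raid_disk = 2 };
	struct toy_rdev ok = { .in_sync = true, .raid_disk = 2 };
	struct toy_rdev stale = { .in_sync = false, .raid_disk = 2 };

	printf("%d %d\n", check_readd(&req, &ok), check_readd(&req, &stale));
	return 0;
}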
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index c35890990985..3535c23af288 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -146,7 +146,7 @@ static void multipath_status (struct seq_file *seq, mddev_t *mddev)
 	int i;
 
 	seq_printf (seq, " [%d/%d] [", conf->raid_disks,
-		    conf->working_disks);
+		    conf->raid_disks - mddev->degraded);
 	for (i = 0; i < conf->raid_disks; i++)
 		seq_printf (seq, "%s",
 			    conf->multipaths[i].rdev &&
@@ -186,35 +186,36 @@ static int multipath_congested(void *data, int bits)
 static void multipath_error (mddev_t *mddev, mdk_rdev_t *rdev)
 {
 	multipath_conf_t *conf = mddev->private;
+	char b[BDEVNAME_SIZE];
 
-	if (conf->working_disks <= 1) {
+	if (conf->raid_disks - mddev->degraded <= 1) {
 		/*
 		 * Uh oh, we can do nothing if this is our last path, but
 		 * first check if this is a queued request for a device
 		 * which has just failed.
 		 */
 		printk(KERN_ALERT
 		       "multipath: only one IO path left and IO error.\n");
 		/* leave it active... it's all we have */
-	} else {
-		/*
-		 * Mark disk as unusable
-		 */
-		if (!test_bit(Faulty, &rdev->flags)) {
-			char b[BDEVNAME_SIZE];
-			clear_bit(In_sync, &rdev->flags);
-			set_bit(Faulty, &rdev->flags);
-			set_bit(MD_CHANGE_DEVS, &mddev->flags);
-			conf->working_disks--;
-			mddev->degraded++;
-			printk(KERN_ALERT "multipath: IO failure on %s,"
-			       " disabling IO path.\n"
-			       "multipath: Operation continuing"
-			       " on %d IO paths.\n",
-			       bdevname (rdev->bdev,b),
-			       conf->working_disks);
-		}
+		return;
 	}
+	/*
+	 * Mark disk as unusable
+	 */
+	if (test_and_clear_bit(In_sync, &rdev->flags)) {
+		unsigned long flags;
+		spin_lock_irqsave(&conf->device_lock, flags);
+		mddev->degraded++;
+		spin_unlock_irqrestore(&conf->device_lock, flags);
 	}
+	set_bit(Faulty, &rdev->flags);
+	set_bit(MD_CHANGE_DEVS, &mddev->flags);
+	printk(KERN_ALERT "multipath: IO failure on %s,"
+	       " disabling IO path.\n"
+	       "multipath: Operation continuing"
+	       " on %d IO paths.\n",
+	       bdevname(rdev->bdev, b),
+	       conf->raid_disks - mddev->degraded);
 }
 
 static void print_multipath_conf (multipath_conf_t *conf)
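
This multipath_error() rework is part of "md: make error_handler functions more uniform and correct" and "md/multipath: discard ->working_disks in favour of ->degraded": the per-array working_disks counter goes away, "working" is derived as raid_disks - degraded, and test_and_clear_bit(In_sync) makes the failure path idempotent, so a second failure report for the same device cannot double-increment degraded. A userspace sketch of that idempotent accounting, with stdatomic standing in for the kernel's atomic bitops:

#include <stdatomic.h>
#include <stdio.h>

#define IN_SYNC 1u

struct toy_path { _Atomic unsigned flags; };

struct toy_conf {
	int raid_disks;
	int degraded;
};

static void path_error(struct toy_conf *conf, struct toy_path *p)
{
	/* atomically clear In_sync; only the first caller sees it set */
	if (atomic_fetch_and(&p->flags, ~IN_SYNC) & IN_SYNC)
		conf->degraded++;
}

int main(void)
{
	struct toy_conf conf = { .raid_disks = 4, .degraded = 0 };
	struct toy_path p = { IN_SYNC };

	path_error(&conf, &p);
	path_error(&conf, &p);	/* duplicate failure: no effect */
	printf("working = %d\n", conf.raid_disks - conf.degraded);	/* 3 */
	return 0;
}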
@@ -227,7 +228,7 @@ static void print_multipath_conf (multipath_conf_t *conf)
 		printk("(conf==NULL)\n");
 		return;
 	}
-	printk(" --- wd:%d rd:%d\n", conf->working_disks,
+	printk(" --- wd:%d rd:%d\n", conf->raid_disks - conf->mddev->degraded,
 	       conf->raid_disks);
 
 	for (i = 0; i < conf->raid_disks; i++) {
@@ -274,10 +275,11 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev)
 					   PAGE_CACHE_SIZE - 1);
 		}
 
-		conf->working_disks++;
+		spin_lock_irq(&conf->device_lock);
 		mddev->degraded--;
 		rdev->raid_disk = path;
 		set_bit(In_sync, &rdev->flags);
+		spin_unlock_irq(&conf->device_lock);
 		rcu_assign_pointer(p->rdev, rdev);
 		err = 0;
 		md_integrity_add_rdev(rdev, mddev);
@@ -391,6 +393,7 @@ static int multipath_run (mddev_t *mddev)
 	int disk_idx;
 	struct multipath_info *disk;
 	mdk_rdev_t *rdev;
+	int working_disks;
 
 	if (md_check_no_bitmap(mddev))
 		return -EINVAL;
@@ -424,7 +427,7 @@ static int multipath_run (mddev_t *mddev)
 		goto out_free_conf;
 	}
 
-	conf->working_disks = 0;
+	working_disks = 0;
 	list_for_each_entry(rdev, &mddev->disks, same_set) {
 		disk_idx = rdev->raid_disk;
 		if (disk_idx < 0 ||
@@ -446,7 +449,7 @@ static int multipath_run (mddev_t *mddev)
 		}
 
 		if (!test_bit(Faulty, &rdev->flags))
-			conf->working_disks++;
+			working_disks++;
 	}
 
 	conf->raid_disks = mddev->raid_disks;
@@ -454,12 +457,12 @@ static int multipath_run (mddev_t *mddev)
 	spin_lock_init(&conf->device_lock);
 	INIT_LIST_HEAD(&conf->retry_list);
 
-	if (!conf->working_disks) {
+	if (!working_disks) {
 		printk(KERN_ERR "multipath: no operational IO paths for %s\n",
 		       mdname(mddev));
 		goto out_free_conf;
 	}
-	mddev->degraded = conf->raid_disks - conf->working_disks;
+	mddev->degraded = conf->raid_disks - working_disks;
 
 	conf->pool = mempool_create_kmalloc_pool(NR_RESERVED_BUFS,
 						 sizeof(struct multipath_bh));
@@ -481,7 +484,8 @@ static int multipath_run (mddev_t *mddev)
 
 	printk(KERN_INFO
 	       "multipath: array %s active with %d out of %d IO paths\n",
-	       mdname(mddev), conf->working_disks, mddev->raid_disks);
+	       mdname(mddev), conf->raid_disks - mddev->degraded,
+	       mddev->raid_disks);
 	/*
 	 * Ok, everything is just fine now
 	 */
diff --git a/drivers/md/multipath.h b/drivers/md/multipath.h
index d1c2a8d78395..3c5a45eb5f8a 100644
--- a/drivers/md/multipath.h
+++ b/drivers/md/multipath.h
@@ -9,7 +9,6 @@ struct multipath_private_data {
 	mddev_t			*mddev;
 	struct multipath_info	*multipaths;
 	int			raid_disks;
-	int			working_disks;
 	spinlock_t		device_lock;
 	struct list_head	retry_list;
 
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 2b7a7ff401dc..5d096096f958 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -297,23 +297,24 @@ static void raid1_end_read_request(struct bio *bio, int error)
 	rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev);
 }
 
-static void r1_bio_write_done(r1bio_t *r1_bio, int vcnt, struct bio_vec *bv,
-			      int behind)
+static void r1_bio_write_done(r1bio_t *r1_bio)
 {
 	if (atomic_dec_and_test(&r1_bio->remaining))
 	{
 		/* it really is the end of this request */
 		if (test_bit(R1BIO_BehindIO, &r1_bio->state)) {
 			/* free extra copy of the data pages */
-			int i = vcnt;
+			int i = r1_bio->behind_page_count;
 			while (i--)
-				safe_put_page(bv[i].bv_page);
+				safe_put_page(r1_bio->behind_pages[i]);
+			kfree(r1_bio->behind_pages);
+			r1_bio->behind_pages = NULL;
 		}
 		/* clear the bitmap if all writes complete successfully */
 		bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector,
 				r1_bio->sectors,
 				!test_bit(R1BIO_Degraded, &r1_bio->state),
-				behind);
+				test_bit(R1BIO_BehindIO, &r1_bio->state));
 		md_write_end(r1_bio->mddev);
 		raid_end_bio_io(r1_bio);
 	}
@@ -386,7 +387,7 @@ static void raid1_end_write_request(struct bio *bio, int error)
 	 * Let's see if all mirrored write operations have finished
 	 * already.
 	 */
-	r1_bio_write_done(r1_bio, bio->bi_vcnt, bio->bi_io_vec, behind);
+	r1_bio_write_done(r1_bio);
 
 	if (to_put)
 		bio_put(to_put);
@@ -411,10 +412,10 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 {
 	const sector_t this_sector = r1_bio->sector;
 	const int sectors = r1_bio->sectors;
-	int new_disk = -1;
 	int start_disk;
+	int best_disk;
 	int i;
-	sector_t new_distance, current_distance;
+	sector_t best_dist;
 	mdk_rdev_t *rdev;
 	int choose_first;
 
@@ -425,6 +426,8 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 	 * We take the first readable disk when above the resync window.
 	 */
 retry:
+	best_disk = -1;
+	best_dist = MaxSector;
 	if (conf->mddev->recovery_cp < MaxSector &&
 	    (this_sector + sectors >= conf->next_resync)) {
 		choose_first = 1;
@@ -434,8 +437,8 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 		start_disk = conf->last_used;
 	}
 
-	/* make sure the disk is operational */
 	for (i = 0 ; i < conf->raid_disks ; i++) {
+		sector_t dist;
 		int disk = start_disk + i;
 		if (disk >= conf->raid_disks)
 			disk -= conf->raid_disks;
@@ -443,60 +446,43 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 		rdev = rcu_dereference(conf->mirrors[disk].rdev);
 		if (r1_bio->bios[disk] == IO_BLOCKED
 		    || rdev == NULL
-		    || !test_bit(In_sync, &rdev->flags))
+		    || test_bit(Faulty, &rdev->flags))
 			continue;
-
-		new_disk = disk;
-		if (!test_bit(WriteMostly, &rdev->flags))
-			break;
-	}
-
-	if (new_disk < 0 || choose_first)
-		goto rb_out;
-
-	/*
-	 * Don't change to another disk for sequential reads:
-	 */
-	if (conf->next_seq_sect == this_sector)
-		goto rb_out;
-	if (this_sector == conf->mirrors[new_disk].head_position)
-		goto rb_out;
-
-	current_distance = abs(this_sector
-			       - conf->mirrors[new_disk].head_position);
-
-	/* look for a better disk - i.e. head is closer */
-	start_disk = new_disk;
-	for (i = 1; i < conf->raid_disks; i++) {
-		int disk = start_disk + 1;
-		if (disk >= conf->raid_disks)
-			disk -= conf->raid_disks;
-
-		rdev = rcu_dereference(conf->mirrors[disk].rdev);
-		if (r1_bio->bios[disk] == IO_BLOCKED
-		    || rdev == NULL
-		    || !test_bit(In_sync, &rdev->flags)
-		    || test_bit(WriteMostly, &rdev->flags))
+		if (!test_bit(In_sync, &rdev->flags) &&
+		    rdev->recovery_offset < this_sector + sectors)
 			continue;
-
-		if (!atomic_read(&rdev->nr_pending)) {
-			new_disk = disk;
+		if (test_bit(WriteMostly, &rdev->flags)) {
+			/* Don't balance among write-mostly, just
+			 * use the first as a last resort */
+			if (best_disk < 0)
+				best_disk = disk;
+			continue;
+		}
+		/* This is a reasonable device to use.  It might
+		 * even be best.
+		 */
+		dist = abs(this_sector - conf->mirrors[disk].head_position);
+		if (choose_first
+		    /* Don't change to another disk for sequential reads */
+		    || conf->next_seq_sect == this_sector
+		    || dist == 0
+		    /* If device is idle, use it */
+		    || atomic_read(&rdev->nr_pending) == 0) {
+			best_disk = disk;
 			break;
 		}
-		new_distance = abs(this_sector - conf->mirrors[disk].head_position);
-		if (new_distance < current_distance) {
-			current_distance = new_distance;
-			new_disk = disk;
+		if (dist < best_dist) {
+			best_dist = dist;
+			best_disk = disk;
 		}
 	}
 
-rb_out:
-	if (new_disk >= 0) {
-		rdev = rcu_dereference(conf->mirrors[new_disk].rdev);
+	if (best_disk >= 0) {
+		rdev = rcu_dereference(conf->mirrors[best_disk].rdev);
 		if (!rdev)
 			goto retry;
 		atomic_inc(&rdev->nr_pending);
-		if (!test_bit(In_sync, &rdev->flags)) {
+		if (test_bit(Faulty, &rdev->flags)) {
 			/* cannot risk returning a device that failed
 			 * before we inc'ed nr_pending
 			 */
@@ -504,11 +490,11 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 			goto retry;
 		}
 		conf->next_seq_sect = this_sector + sectors;
-		conf->last_used = new_disk;
+		conf->last_used = best_disk;
 	}
 	rcu_read_unlock();
 
-	return new_disk;
+	return best_disk;
 }
 
 static int raid1_congested(void *data, int bits)
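
The rewritten raid1 read_balance() above ("md/raid1: clean up read_balance.") replaces the old two-pass scan (find any operational disk, then look for a closer head) with a single loop that tracks the best candidate by head distance, takes a disk immediately when the read is sequential, lands on the current head position, or finds the disk idle, and keeps write-mostly disks only as a last resort; a negative return now consistently means "no disk". A userspace sketch of the selection policy, with a toy mirror array standing in for conf->mirrors:

#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_mirror {
	long head_position;
	int nr_pending;
	bool faulty;
	bool write_mostly;
};

static int read_balance(struct toy_mirror *m, int n, long sector, bool choose_first)
{
	long best_dist = -1;
	int best_disk = -1;

	for (int disk = 0; disk < n; disk++) {
		if (m[disk].faulty)
			continue;
		if (m[disk].write_mostly) {
			if (best_disk < 0)	/* last resort only */
				best_disk = disk;
			continue;
		}
		long dist = labs(sector - m[disk].head_position);
		if (choose_first || dist == 0 || m[disk].nr_pending == 0)
			return disk;	/* good enough, stop looking */
		if (best_dist < 0 || dist < best_dist) {
			best_dist = dist;
			best_disk = disk;
		}
	}
	return best_disk;
}

int main(void)
{
	struct toy_mirror m[3] = {
		{ .head_position = 4000, .nr_pending = 2 },
		{ .head_position = 100,  .nr_pending = 1 },
		{ .head_position = 5000, .nr_pending = 2 },
	};
	printf("chose disk %d\n", read_balance(m, 3, 120, false));	/* closest head: 1 */
	return 0;
}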
@@ -675,37 +661,36 @@ static void unfreeze_array(conf_t *conf)
 
 
 /* duplicate the data pages for behind I/O
- * We return a list of bio_vec rather than just page pointers
- * as it makes freeing easier
  */
-static struct bio_vec *alloc_behind_pages(struct bio *bio)
+static void alloc_behind_pages(struct bio *bio, r1bio_t *r1_bio)
 {
 	int i;
 	struct bio_vec *bvec;
-	struct bio_vec *pages = kzalloc(bio->bi_vcnt * sizeof(struct bio_vec),
-					GFP_NOIO);
+	struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page*),
+				      GFP_NOIO);
 	if (unlikely(!pages))
-		goto do_sync_io;
+		return;
 
 	bio_for_each_segment(bvec, bio, i) {
-		pages[i].bv_page = alloc_page(GFP_NOIO);
-		if (unlikely(!pages[i].bv_page))
+		pages[i] = alloc_page(GFP_NOIO);
+		if (unlikely(!pages[i]))
			goto do_sync_io;
-		memcpy(kmap(pages[i].bv_page) + bvec->bv_offset,
+		memcpy(kmap(pages[i]) + bvec->bv_offset,
 		       kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len);
-		kunmap(pages[i].bv_page);
+		kunmap(pages[i]);
 		kunmap(bvec->bv_page);
 	}
-
-	return pages;
+	r1_bio->behind_pages = pages;
+	r1_bio->behind_page_count = bio->bi_vcnt;
+	set_bit(R1BIO_BehindIO, &r1_bio->state);
+	return;
 
 do_sync_io:
-	if (pages)
-		for (i = 0; i < bio->bi_vcnt && pages[i].bv_page; i++)
-			put_page(pages[i].bv_page);
+	for (i = 0; i < bio->bi_vcnt; i++)
+		if (pages[i])
+			put_page(pages[i]);
 	kfree(pages);
 	PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size);
-	return NULL;
 }
 
 static int make_request(mddev_t *mddev, struct bio * bio)
@@ -717,7 +702,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	int i, targets = 0, disks;
 	struct bitmap *bitmap;
 	unsigned long flags;
-	struct bio_vec *behind_pages = NULL;
 	const int rw = bio_data_dir(bio);
 	const unsigned long do_sync = (bio->bi_rw & REQ_SYNC);
 	const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA));
@@ -870,9 +854,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 	if (bitmap &&
 	    (atomic_read(&bitmap->behind_writes)
 	     < mddev->bitmap_info.max_write_behind) &&
-	    !waitqueue_active(&bitmap->behind_wait) &&
-	    (behind_pages = alloc_behind_pages(bio)) != NULL)
-		set_bit(R1BIO_BehindIO, &r1_bio->state);
+	    !waitqueue_active(&bitmap->behind_wait))
+		alloc_behind_pages(bio, r1_bio);
 
 	atomic_set(&r1_bio->remaining, 1);
 	atomic_set(&r1_bio->behind_remaining, 0);
@@ -893,7 +876,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		mbio->bi_rw = WRITE | do_flush_fua | do_sync;
 		mbio->bi_private = r1_bio;
 
-		if (behind_pages) {
+		if (r1_bio->behind_pages) {
 			struct bio_vec *bvec;
 			int j;
 
@@ -905,7 +888,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 			 * them all
 			 */
 			__bio_for_each_segment(bvec, mbio, j, 0)
-				bvec->bv_page = behind_pages[j].bv_page;
+				bvec->bv_page = r1_bio->behind_pages[j];
 			if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags))
 				atomic_inc(&r1_bio->behind_remaining);
 		}
@@ -915,8 +898,7 @@ static int make_request(mddev_t *mddev, struct bio * bio)
 		bio_list_add(&conf->pending_bio_list, mbio);
 		spin_unlock_irqrestore(&conf->device_lock, flags);
 	}
-	r1_bio_write_done(r1_bio, bio->bi_vcnt, behind_pages, behind_pages != NULL);
-	kfree(behind_pages); /* the behind pages are attached to the bios now */
+	r1_bio_write_done(r1_bio);
 
 	/* In case raid1d snuck in to freeze_array */
 	wake_up(&conf->wait_barrier);
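
The behind-write hunks above ("md/raid1: improve handling of pages allocated for write-behind.") change ownership of the duplicated pages: alloc_behind_pages() now stores the page array and its count directly on the r1bio instead of returning a bio_vec list through make_request(), and r1_bio_write_done() frees the copies when the last mirror write completes. A userspace sketch of that attach-to-request, free-on-last-completion lifetime, with stdatomic standing in for the kernel's atomics:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct toy_r1bio {
	_Atomic int remaining;	/* outstanding mirror writes + 1 */
	char **behind_pages;	/* NULL unless behind I/O is active */
	int behind_page_count;
};

static void write_done(struct toy_r1bio *r1)
{
	if (atomic_fetch_sub(&r1->remaining, 1) == 1) {
		/* last completion frees the extra copies, exactly once */
		for (int i = 0; i < r1->behind_page_count; i++)
			free(r1->behind_pages[i]);
		free(r1->behind_pages);
		r1->behind_pages = NULL;
		printf("request complete\n");
	}
}

int main(void)
{
	struct toy_r1bio r1 = { .behind_page_count = 2 };
	atomic_store(&r1.remaining, 3);	/* 1 + two mirror writes, as in make_request */
	r1.behind_pages = calloc(2, sizeof(char *));
	for (int i = 0; i < 2; i++)
		r1.behind_pages[i] = malloc(4096);

	write_done(&r1);	/* mirror write 1 */
	write_done(&r1);	/* mirror write 2 */
	write_done(&r1);	/* make_request drops its own reference */
	return 0;
}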
@@ -1196,194 +1178,210 @@ static void end_sync_write(struct bio *bio, int error)
 	}
 }
 
-static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
-{
-	conf_t *conf = mddev->private;
-	int i;
-	int disks = conf->raid_disks;
-	struct bio *bio, *wbio;
-
-	bio = r1_bio->bios[r1_bio->read_disk];
-
-
-	if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
-		/* We have read all readable devices.  If we haven't
-		 * got the block, then there is no hope left.
-		 * If we have, then we want to do a comparison
-		 * and skip the write if everything is the same.
-		 * If any blocks failed to read, then we need to
-		 * attempt an over-write
-		 */
-		int primary;
-		if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
-			for (i=0; i<mddev->raid_disks; i++)
-				if (r1_bio->bios[i]->bi_end_io == end_sync_read)
-					md_error(mddev, conf->mirrors[i].rdev);
-
-			md_done_sync(mddev, r1_bio->sectors, 1);
-			put_buf(r1_bio);
-			return;
-		}
-		for (primary=0; primary<mddev->raid_disks; primary++)
-			if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
-			    test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
-				r1_bio->bios[primary]->bi_end_io = NULL;
-				rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
-				break;
-			}
-		r1_bio->read_disk = primary;
-		for (i=0; i<mddev->raid_disks; i++)
-			if (r1_bio->bios[i]->bi_end_io == end_sync_read) {
-				int j;
-				int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
-				struct bio *pbio = r1_bio->bios[primary];
-				struct bio *sbio = r1_bio->bios[i];
-
-				if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) {
-					for (j = vcnt; j-- ; ) {
-						struct page *p, *s;
-						p = pbio->bi_io_vec[j].bv_page;
-						s = sbio->bi_io_vec[j].bv_page;
-						if (memcmp(page_address(p),
-							   page_address(s),
-							   PAGE_SIZE))
-							break;
-					}
-				} else
-					j = 0;
-				if (j >= 0)
-					mddev->resync_mismatches += r1_bio->sectors;
-				if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
-					      && test_bit(BIO_UPTODATE, &sbio->bi_flags))) {
-					sbio->bi_end_io = NULL;
-					rdev_dec_pending(conf->mirrors[i].rdev, mddev);
-				} else {
-					/* fixup the bio for reuse */
-					int size;
-					sbio->bi_vcnt = vcnt;
-					sbio->bi_size = r1_bio->sectors << 9;
-					sbio->bi_idx = 0;
-					sbio->bi_phys_segments = 0;
-					sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
-					sbio->bi_flags |= 1 << BIO_UPTODATE;
-					sbio->bi_next = NULL;
-					sbio->bi_sector = r1_bio->sector +
-						conf->mirrors[i].rdev->data_offset;
-					sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
-					size = sbio->bi_size;
-					for (j = 0; j < vcnt ; j++) {
-						struct bio_vec *bi;
-						bi = &sbio->bi_io_vec[j];
-						bi->bv_offset = 0;
-						if (size > PAGE_SIZE)
-							bi->bv_len = PAGE_SIZE;
-						else
-							bi->bv_len = size;
-						size -= PAGE_SIZE;
-						memcpy(page_address(bi->bv_page),
-						       page_address(pbio->bi_io_vec[j].bv_page),
-						       PAGE_SIZE);
-					}
-
-				}
-			}
-	}
-	if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) {
-		/* ouch - failed to read all of that.
-		 * Try some synchronous reads of other devices to get
-		 * good data, much like with normal read errors.  Only
-		 * read into the pages we already have so we don't
-		 * need to re-issue the read request.
-		 * We don't need to freeze the array, because being in an
-		 * active sync request, there is no normal IO, and
-		 * no overlapping syncs.
-		 */
-		sector_t sect = r1_bio->sector;
-		int sectors = r1_bio->sectors;
-		int idx = 0;
-
-		while(sectors) {
-			int s = sectors;
-			int d = r1_bio->read_disk;
-			int success = 0;
-			mdk_rdev_t *rdev;
-
-			if (s > (PAGE_SIZE>>9))
-				s = PAGE_SIZE >> 9;
-			do {
-				if (r1_bio->bios[d]->bi_end_io == end_sync_read) {
-					/* No rcu protection needed here devices
-					 * can only be removed when no resync is
-					 * active, and resync is currently active
-					 */
-					rdev = conf->mirrors[d].rdev;
-					if (sync_page_io(rdev,
-							 sect,
-							 s<<9,
-							 bio->bi_io_vec[idx].bv_page,
-							 READ, false)) {
-						success = 1;
-						break;
-					}
-				}
-				d++;
-				if (d == conf->raid_disks)
-					d = 0;
-			} while (!success && d != r1_bio->read_disk);
-
-			if (success) {
-				int start = d;
-				/* write it back and re-read */
-				set_bit(R1BIO_Uptodate, &r1_bio->state);
-				while (d != r1_bio->read_disk) {
-					if (d == 0)
-						d = conf->raid_disks;
-					d--;
-					if (r1_bio->bios[d]->bi_end_io != end_sync_read)
-						continue;
-					rdev = conf->mirrors[d].rdev;
-					atomic_add(s, &rdev->corrected_errors);
-					if (sync_page_io(rdev,
-							 sect,
-							 s<<9,
-							 bio->bi_io_vec[idx].bv_page,
-							 WRITE, false) == 0)
-						md_error(mddev, rdev);
-				}
-				d = start;
-				while (d != r1_bio->read_disk) {
-					if (d == 0)
-						d = conf->raid_disks;
-					d--;
-					if (r1_bio->bios[d]->bi_end_io != end_sync_read)
-						continue;
-					rdev = conf->mirrors[d].rdev;
-					if (sync_page_io(rdev,
-							 sect,
-							 s<<9,
-							 bio->bi_io_vec[idx].bv_page,
-							 READ, false) == 0)
-						md_error(mddev, rdev);
-				}
-			} else {
-				char b[BDEVNAME_SIZE];
-				/* Cannot read from anywhere, array is toast */
-				md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
-				printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error"
-				       " for block %llu\n",
-				       mdname(mddev),
-				       bdevname(bio->bi_bdev, b),
-				       (unsigned long long)r1_bio->sector);
-				md_done_sync(mddev, r1_bio->sectors, 0);
-				put_buf(r1_bio);
-				return;
-			}
-			sectors -= s;
-			sect += s;
-			idx ++;
-		}
-	}
-
+static int fix_sync_read_error(r1bio_t *r1_bio)
+{
+	/* Try some synchronous reads of other devices to get
+	 * good data, much like with normal read errors.  Only
+	 * read into the pages we already have so we don't
+	 * need to re-issue the read request.
+	 * We don't need to freeze the array, because being in an
+	 * active sync request, there is no normal IO, and
+	 * no overlapping syncs.
+	 */
+	mddev_t *mddev = r1_bio->mddev;
+	conf_t *conf = mddev->private;
+	struct bio *bio = r1_bio->bios[r1_bio->read_disk];
+	sector_t sect = r1_bio->sector;
+	int sectors = r1_bio->sectors;
+	int idx = 0;
+
+	while(sectors) {
+		int s = sectors;
+		int d = r1_bio->read_disk;
+		int success = 0;
+		mdk_rdev_t *rdev;
+		int start;
+
+		if (s > (PAGE_SIZE>>9))
+			s = PAGE_SIZE >> 9;
+		do {
+			if (r1_bio->bios[d]->bi_end_io == end_sync_read) {
+				/* No rcu protection needed here devices
+				 * can only be removed when no resync is
+				 * active, and resync is currently active
+				 */
+				rdev = conf->mirrors[d].rdev;
+				if (sync_page_io(rdev,
+						 sect,
+						 s<<9,
+						 bio->bi_io_vec[idx].bv_page,
+						 READ, false)) {
+					success = 1;
+					break;
+				}
+			}
+			d++;
+			if (d == conf->raid_disks)
+				d = 0;
+		} while (!success && d != r1_bio->read_disk);
+
+		if (!success) {
+			char b[BDEVNAME_SIZE];
+			/* Cannot read from anywhere, array is toast */
+			md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev);
+			printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error"
+			       " for block %llu\n",
+			       mdname(mddev),
+			       bdevname(bio->bi_bdev, b),
+			       (unsigned long long)r1_bio->sector);
+			md_done_sync(mddev, r1_bio->sectors, 0);
+			put_buf(r1_bio);
+			return 0;
+		}
+
+		start = d;
+		/* write it back and re-read */
+		while (d != r1_bio->read_disk) {
+			if (d == 0)
+				d = conf->raid_disks;
+			d--;
+			if (r1_bio->bios[d]->bi_end_io != end_sync_read)
+				continue;
+			rdev = conf->mirrors[d].rdev;
+			if (sync_page_io(rdev,
+					 sect,
+					 s<<9,
+					 bio->bi_io_vec[idx].bv_page,
+					 WRITE, false) == 0) {
+				r1_bio->bios[d]->bi_end_io = NULL;
+				rdev_dec_pending(rdev, mddev);
+				md_error(mddev, rdev);
+			} else
+				atomic_add(s, &rdev->corrected_errors);
+		}
+		d = start;
+		while (d != r1_bio->read_disk) {
+			if (d == 0)
+				d = conf->raid_disks;
+			d--;
+			if (r1_bio->bios[d]->bi_end_io != end_sync_read)
+				continue;
+			rdev = conf->mirrors[d].rdev;
+			if (sync_page_io(rdev,
+					 sect,
+					 s<<9,
+					 bio->bi_io_vec[idx].bv_page,
+					 READ, false) == 0)
+				md_error(mddev, rdev);
+		}
+		sectors -= s;
+		sect += s;
+		idx ++;
+	}
+	set_bit(R1BIO_Uptodate, &r1_bio->state);
+	set_bit(BIO_UPTODATE, &bio->bi_flags);
+	return 1;
+}
+
+static int process_checks(r1bio_t *r1_bio)
+{
+	/* We have read all readable devices.  If we haven't
+	 * got the block, then there is no hope left.
+	 * If we have, then we want to do a comparison
+	 * and skip the write if everything is the same.
+	 * If any blocks failed to read, then we need to
+	 * attempt an over-write
+	 */
+	mddev_t *mddev = r1_bio->mddev;
+	conf_t *conf = mddev->private;
+	int primary;
+	int i;
+
+	for (primary = 0; primary < conf->raid_disks; primary++)
+		if (r1_bio->bios[primary]->bi_end_io == end_sync_read &&
+		    test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) {
+			r1_bio->bios[primary]->bi_end_io = NULL;
+			rdev_dec_pending(conf->mirrors[primary].rdev, mddev);
+			break;
+		}
+	r1_bio->read_disk = primary;
+	for (i = 0; i < conf->raid_disks; i++) {
+		int j;
+		int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9);
+		struct bio *pbio = r1_bio->bios[primary];
+		struct bio *sbio = r1_bio->bios[i];
+		int size;
+
+		if (r1_bio->bios[i]->bi_end_io != end_sync_read)
+			continue;
+
+		if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) {
+			for (j = vcnt; j-- ; ) {
+				struct page *p, *s;
+				p = pbio->bi_io_vec[j].bv_page;
+				s = sbio->bi_io_vec[j].bv_page;
+				if (memcmp(page_address(p),
+					   page_address(s),
+					   PAGE_SIZE))
+					break;
+			}
+		} else
+			j = 0;
+		if (j >= 0)
+			mddev->resync_mismatches += r1_bio->sectors;
+		if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)
+			      && test_bit(BIO_UPTODATE, &sbio->bi_flags))) {
+			/* No need to write to this device. */
+			sbio->bi_end_io = NULL;
+			rdev_dec_pending(conf->mirrors[i].rdev, mddev);
+			continue;
+		}
+		/* fixup the bio for reuse */
+		sbio->bi_vcnt = vcnt;
+		sbio->bi_size = r1_bio->sectors << 9;
+		sbio->bi_idx = 0;
+		sbio->bi_phys_segments = 0;
+		sbio->bi_flags &= ~(BIO_POOL_MASK - 1);
+		sbio->bi_flags |= 1 << BIO_UPTODATE;
+		sbio->bi_next = NULL;
+		sbio->bi_sector = r1_bio->sector +
+			conf->mirrors[i].rdev->data_offset;
+		sbio->bi_bdev = conf->mirrors[i].rdev->bdev;
+		size = sbio->bi_size;
+		for (j = 0; j < vcnt ; j++) {
+			struct bio_vec *bi;
+			bi = &sbio->bi_io_vec[j];
+			bi->bv_offset = 0;
+			if (size > PAGE_SIZE)
+				bi->bv_len = PAGE_SIZE;
+			else
+				bi->bv_len = size;
+			size -= PAGE_SIZE;
+			memcpy(page_address(bi->bv_page),
+			       page_address(pbio->bi_io_vec[j].bv_page),
+			       PAGE_SIZE);
+		}
+	}
+	return 0;
+}
+
+static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
+{
+	conf_t *conf = mddev->private;
+	int i;
+	int disks = conf->raid_disks;
+	struct bio *bio, *wbio;
+
+	bio = r1_bio->bios[r1_bio->read_disk];
+
+	if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
+		/* ouch - failed to read all of that. */
+		if (!fix_sync_read_error(r1_bio))
+			return;
+
+	if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
+		if (process_checks(r1_bio) < 0)
+			return;
 	/*
 	 * schedule writes
 	 */
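
The big raid1.c hunk above splits the old monolithic sync_request_write() into fix_sync_read_error() (recover good data when a resync read failed) and process_checks() (compare copies for check/repair), leaving sync_request_write() as a short driver. The recovery strategy is unchanged: walk the other mirrors round-robin from the failed reader until one read succeeds, then write the good data back to the intervening mirrors and read it back to verify. A userspace sketch of that round-robin walk, with a hypothetical read_block callback standing in for sync_page_io():

#include <stdbool.h>
#include <stdio.h>

typedef bool (*block_io)(int disk, long sector, char *buf);

/* Walk the mirrors round-robin from read_disk until a read succeeds;
 * returns the disk that supplied good data, or -1 if none could. */
static int find_good_copy(block_io read_block, int ndisks, int read_disk,
			  long sector, char *buf)
{
	int d = read_disk;
	do {
		if (read_block(d, sector, buf))
			return d;
		if (++d == ndisks)
			d = 0;
	} while (d != read_disk);
	return -1;	/* cannot read from anywhere: array is toast */
}

/* toy devices: disk 0 has a bad sector, the others are fine */
static bool toy_read(int disk, long sector, char *buf)
{
	(void)sector;
	if (disk == 0)
		return false;
	buf[0] = (char)('A' + disk);
	return true;
}

int main(void)
{
	char buf[1];
	printf("good copy on disk %d\n",
	       find_good_copy(toy_read, 3, 0, 1234, buf));	/* disk 1 */
	return 0;
}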
@@ -2063,7 +2061,7 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors)
 	set_capacity(mddev->gendisk, mddev->array_sectors);
 	revalidate_disk(mddev->gendisk);
 	if (sectors > mddev->dev_sectors &&
-	    mddev->recovery_cp == MaxSector) {
+	    mddev->recovery_cp > mddev->dev_sectors) {
 		mddev->recovery_cp = mddev->dev_sectors;
 		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	}
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index cbfdf1a6acd9..5fc4ca1af863 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -94,7 +94,9 @@ struct r1bio_s {
 	int			read_disk;
 
 	struct list_head	retry_list;
-	struct bitmap_update	*bitmap_update;
+	/* Next two are only valid when R1BIO_BehindIO is set */
+	struct page		**behind_pages;
+	int			behind_page_count;
 	/*
 	 * if the IO is in WRITE direction, then multiple bios are used.
 	 * We choose the number when they are allocated.
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 8e9462626ec5..6e846688962f 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -271,9 +271,10 @@ static void raid10_end_read_request(struct bio *bio, int error)
 		 */
 		set_bit(R10BIO_Uptodate, &r10_bio->state);
 		raid_end_bio_io(r10_bio);
+		rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
 	} else {
 		/*
-		 * oops, read error:
+		 * oops, read error - keep the refcount on the rdev
 		 */
 		char b[BDEVNAME_SIZE];
 		if (printk_ratelimit())
@@ -282,8 +283,6 @@ static void raid10_end_read_request(struct bio *bio, int error)
 			       bdevname(conf->mirrors[dev].rdev->bdev,b), (unsigned long long)r10_bio->sector);
 		reschedule_retry(r10_bio);
 	}
-
-	rdev_dec_pending(conf->mirrors[dev].rdev, conf->mddev);
 }
 
 static void raid10_end_write_request(struct bio *bio, int error)
@@ -488,13 +487,19 @@ static int raid10_mergeable_bvec(struct request_queue *q,
 static int read_balance(conf_t *conf, r10bio_t *r10_bio)
 {
 	const sector_t this_sector = r10_bio->sector;
-	int disk, slot, nslot;
+	int disk, slot;
 	const int sectors = r10_bio->sectors;
-	sector_t new_distance, current_distance;
+	sector_t new_distance, best_dist;
 	mdk_rdev_t *rdev;
+	int do_balance;
+	int best_slot;
 
 	raid10_find_phys(conf, r10_bio);
 	rcu_read_lock();
+retry:
+	best_slot = -1;
+	best_dist = MaxSector;
+	do_balance = 1;
 	/*
 	 * Check if we can balance. We can balance on the whole
 	 * device if no resync is going on (recovery is ok), or below
@@ -502,86 +507,58 @@ static int read_balance(conf_t *conf, r10bio_t *r10_bio)
 	 * above the resync window.
 	 */
 	if (conf->mddev->recovery_cp < MaxSector
-	    && (this_sector + sectors >= conf->next_resync)) {
-		/* make sure that disk is operational */
-		slot = 0;
-		disk = r10_bio->devs[slot].devnum;
-
-		while ((rdev = rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
-		       r10_bio->devs[slot].bio == IO_BLOCKED ||
-		       !test_bit(In_sync, &rdev->flags)) {
-			slot++;
-			if (slot == conf->copies) {
-				slot = 0;
-				disk = -1;
-				break;
-			}
-			disk = r10_bio->devs[slot].devnum;
-		}
-		goto rb_out;
-	}
-
+	    && (this_sector + sectors >= conf->next_resync))
+		do_balance = 0;
 
-	/* make sure the disk is operational */
-	slot = 0;
-	disk = r10_bio->devs[slot].devnum;
-	while ((rdev=rcu_dereference(conf->mirrors[disk].rdev)) == NULL ||
-	       r10_bio->devs[slot].bio == IO_BLOCKED ||
-	       !test_bit(In_sync, &rdev->flags)) {
-		slot ++;
-		if (slot == conf->copies) {
-			disk = -1;
-			goto rb_out;
-		}
+	for (slot = 0; slot < conf->copies ; slot++) {
+		if (r10_bio->devs[slot].bio == IO_BLOCKED)
+			continue;
 		disk = r10_bio->devs[slot].devnum;
-	}
-
-
-	current_distance = abs(r10_bio->devs[slot].addr -
-			       conf->mirrors[disk].head_position);
-
-	/* Find the disk whose head is closest,
-	 * or - for far > 1 - find the closest to partition beginning */
-
-	for (nslot = slot; nslot < conf->copies; nslot++) {
-		int ndisk = r10_bio->devs[nslot].devnum;
-
-
-		if ((rdev=rcu_dereference(conf->mirrors[ndisk].rdev)) == NULL ||
-		    r10_bio->devs[nslot].bio == IO_BLOCKED ||
-		    !test_bit(In_sync, &rdev->flags))
+		rdev = rcu_dereference(conf->mirrors[disk].rdev);
+		if (rdev == NULL)
 			continue;
+		if (!test_bit(In_sync, &rdev->flags))
+			continue;
+
+		if (!do_balance)
+			break;
 
 		/* This optimisation is debatable, and completely destroys
 		 * sequential read speed for 'far copies' arrays.  So only
 		 * keep it for 'near' arrays, and review those later.
 		 */
-		if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending)) {
-			disk = ndisk;
-			slot = nslot;
+		if (conf->near_copies > 1 && !atomic_read(&rdev->nr_pending))
 			break;
-		}
 
 		/* for far > 1 always use the lowest address */
 		if (conf->far_copies > 1)
-			new_distance = r10_bio->devs[nslot].addr;
+			new_distance = r10_bio->devs[slot].addr;
 		else
-			new_distance = abs(r10_bio->devs[nslot].addr -
-					   conf->mirrors[ndisk].head_position);
-		if (new_distance < current_distance) {
-			current_distance = new_distance;
-			disk = ndisk;
-			slot = nslot;
+			new_distance = abs(r10_bio->devs[slot].addr -
+					   conf->mirrors[disk].head_position);
+		if (new_distance < best_dist) {
+			best_dist = new_distance;
+			best_slot = slot;
 		}
 	}
+	if (slot == conf->copies)
+		slot = best_slot;
 
-rb_out:
-	r10_bio->read_slot = slot;
-/*	conf->next_seq_sect = this_sector + sectors;*/
-
-	if (disk >= 0 && (rdev=rcu_dereference(conf->mirrors[disk].rdev))!= NULL)
-		atomic_inc(&conf->mirrors[disk].rdev->nr_pending);
-	else
+	if (slot >= 0) {
+		disk = r10_bio->devs[slot].devnum;
+		rdev = rcu_dereference(conf->mirrors[disk].rdev);
+		if (!rdev)
+			goto retry;
+		atomic_inc(&rdev->nr_pending);
+		if (test_bit(Faulty, &rdev->flags)) {
+			/* Cannot risk returning a device that failed
+			 * before we inc'ed nr_pending
+			 */
+			rdev_dec_pending(rdev, conf->mddev);
+			goto retry;
+		}
+		r10_bio->read_slot = slot;
+	} else
 		disk = -1;
 	rcu_read_unlock();
 
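
Both raid10 read_balance() hunks above ("md: simplify raid10 read_balance") adopt the discipline raid1 already uses: pick a slot in a single loop, take a reference with atomic_inc(&rdev->nr_pending), and only then re-check Faulty, retrying the whole selection if the device failed in between; the end_read_request hunk earlier keeps its reference across the error path for the same reason. A userspace sketch of the take-reference-then-revalidate retry, with stdatomic standing in for kernel atomics (single-threaded, so the race itself cannot fire here):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct toy_rdev {
	_Atomic int nr_pending;
	_Atomic bool faulty;
};

static int choose_slot(struct toy_rdev *r, int n)
{
	for (int i = 0; i < n; i++)
		if (!atomic_load(&r[i].faulty))
			return i;
	return -1;
}

static int pick_with_ref(struct toy_rdev *r, int n)
{
	for (;;) {
		int slot = choose_slot(r, n);
		if (slot < 0)
			return -1;
		atomic_fetch_add(&r[slot].nr_pending, 1);
		if (!atomic_load(&r[slot].faulty))
			return slot;	/* the reference now pins the device */
		/* device failed between choose and ref: undo and retry */
		atomic_fetch_sub(&r[slot].nr_pending, 1);
	}
}

int main(void)
{
	struct toy_rdev r[2] = { { 0, false }, { 0, false } };
	atomic_store(&r[0].faulty, true);
	printf("picked slot %d\n", pick_with_ref(r, 2));	/* slot 1 */
	return 0;
}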
@@ -1460,40 +1437,33 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 	int max_read_errors = atomic_read(&mddev->max_corr_read_errors);
 	int d = r10_bio->devs[r10_bio->read_slot].devnum;
 
-	rcu_read_lock();
-	rdev = rcu_dereference(conf->mirrors[d].rdev);
-	if (rdev) { /* If rdev is not NULL */
-		char b[BDEVNAME_SIZE];
-		int cur_read_error_count = 0;
-
-		bdevname(rdev->bdev, b);
-
-		if (test_bit(Faulty, &rdev->flags)) {
-			rcu_read_unlock();
-			/* drive has already been failed, just ignore any
-			   more fix_read_error() attempts */
-			return;
-		}
-
-		check_decay_read_errors(mddev, rdev);
-		atomic_inc(&rdev->read_errors);
-		cur_read_error_count = atomic_read(&rdev->read_errors);
-		if (cur_read_error_count > max_read_errors) {
-			rcu_read_unlock();
-			printk(KERN_NOTICE
-			       "md/raid10:%s: %s: Raid device exceeded "
-			       "read_error threshold "
-			       "[cur %d:max %d]\n",
-			       mdname(mddev),
-			       b, cur_read_error_count, max_read_errors);
-			printk(KERN_NOTICE
-			       "md/raid10:%s: %s: Failing raid "
-			       "device\n", mdname(mddev), b);
-			md_error(mddev, conf->mirrors[d].rdev);
-			return;
-		}
-	}
-	rcu_read_unlock();
+	/* still own a reference to this rdev, so it cannot
+	 * have been cleared recently.
+	 */
+	rdev = conf->mirrors[d].rdev;
+
+	if (test_bit(Faulty, &rdev->flags))
+		/* drive has already been failed, just ignore any
+		   more fix_read_error() attempts */
+		return;
+
+	check_decay_read_errors(mddev, rdev);
+	atomic_inc(&rdev->read_errors);
+	if (atomic_read(&rdev->read_errors) > max_read_errors) {
+		char b[BDEVNAME_SIZE];
+		bdevname(rdev->bdev, b);
+
+		printk(KERN_NOTICE
+		       "md/raid10:%s: %s: Raid device exceeded "
+		       "read_error threshold [cur %d:max %d]\n",
+		       mdname(mddev), b,
+		       atomic_read(&rdev->read_errors), max_read_errors);
+		printk(KERN_NOTICE
+		       "md/raid10:%s: %s: Failing raid device\n",
+		       mdname(mddev), b);
+		md_error(mddev, conf->mirrors[d].rdev);
+		return;
+	}
 
 	while(sectors) {
 		int s = sectors;
@@ -1562,8 +1532,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 					       "write failed"
 					       " (%d sectors at %llu on %s)\n",
 					       mdname(mddev), s,
-					       (unsigned long long)(sect+
-					       rdev->data_offset),
+					       (unsigned long long)(
+						       sect + rdev->data_offset),
 					       bdevname(rdev->bdev, b));
 					printk(KERN_NOTICE "md/raid10:%s: %s: failing "
 					       "drive\n",
@@ -1599,8 +1569,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 					       "corrected sectors"
 					       " (%d sectors at %llu on %s)\n",
 					       mdname(mddev), s,
-					       (unsigned long long)(sect+
-					       rdev->data_offset),
+					       (unsigned long long)(
						       sect + rdev->data_offset),
 					       bdevname(rdev->bdev, b));
 					printk(KERN_NOTICE "md/raid10:%s: %s: failing drive\n",
 					       mdname(mddev),
@@ -1612,8 +1582,8 @@ static void fix_read_error(conf_t *conf, mddev_t *mddev, r10bio_t *r10_bio)
 				       "md/raid10:%s: read error corrected"
 				       " (%d sectors at %llu on %s)\n",
 				       mdname(mddev), s,
-				       (unsigned long long)(sect+
-					    rdev->data_offset),
+				       (unsigned long long)(
					       sect + rdev->data_offset),
 				       bdevname(rdev->bdev, b));
 			}
 
@@ -1663,7 +1633,8 @@ static void raid10d(mddev_t *mddev) | |||
1663 | else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) | 1633 | else if (test_bit(R10BIO_IsRecover, &r10_bio->state)) |
1664 | recovery_request_write(mddev, r10_bio); | 1634 | recovery_request_write(mddev, r10_bio); |
1665 | else { | 1635 | else { |
1666 | int mirror; | 1636 | int slot = r10_bio->read_slot; |
1637 | int mirror = r10_bio->devs[slot].devnum; | ||
1667 | /* we got a read error. Maybe the drive is bad. Maybe just | 1638 | /* we got a read error. Maybe the drive is bad. Maybe just |
1668 | * the block and we can fix it. | 1639 | * the block and we can fix it. |
1669 | * We freeze all other IO, and try reading the block from | 1640 | * We freeze all other IO, and try reading the block from |
@@ -1677,9 +1648,10 @@ static void raid10d(mddev_t *mddev) | |||
1677 | fix_read_error(conf, mddev, r10_bio); | 1648 | fix_read_error(conf, mddev, r10_bio); |
1678 | unfreeze_array(conf); | 1649 | unfreeze_array(conf); |
1679 | } | 1650 | } |
1651 | rdev_dec_pending(conf->mirrors[mirror].rdev, mddev); | ||
1680 | 1652 | ||
1681 | bio = r10_bio->devs[r10_bio->read_slot].bio; | 1653 | bio = r10_bio->devs[slot].bio; |
1682 | r10_bio->devs[r10_bio->read_slot].bio = | 1654 | r10_bio->devs[slot].bio = |
1683 | mddev->ro ? IO_BLOCKED : NULL; | 1655 | mddev->ro ? IO_BLOCKED : NULL; |
1684 | mirror = read_balance(conf, r10_bio); | 1656 | mirror = read_balance(conf, r10_bio); |
1685 | if (mirror == -1) { | 1657 | if (mirror == -1) { |
@@ -1693,6 +1665,7 @@ static void raid10d(mddev_t *mddev) | |||
1693 | } else { | 1665 | } else { |
1694 | const unsigned long do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC); | 1666 | const unsigned long do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC); |
1695 | bio_put(bio); | 1667 | bio_put(bio); |
1668 | slot = r10_bio->read_slot; | ||
1696 | rdev = conf->mirrors[mirror].rdev; | 1669 | rdev = conf->mirrors[mirror].rdev; |
1697 | if (printk_ratelimit()) | 1670 | if (printk_ratelimit()) |
1698 | printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to" | 1671 | printk(KERN_ERR "md/raid10:%s: %s: redirecting sector %llu to" |
@@ -1702,8 +1675,8 @@ static void raid10d(mddev_t *mddev) | |||
1702 | (unsigned long long)r10_bio->sector); | 1675 | (unsigned long long)r10_bio->sector); |
1703 | bio = bio_clone_mddev(r10_bio->master_bio, | 1676 | bio = bio_clone_mddev(r10_bio->master_bio, |
1704 | GFP_NOIO, mddev); | 1677 | GFP_NOIO, mddev); |
1705 | r10_bio->devs[r10_bio->read_slot].bio = bio; | 1678 | r10_bio->devs[slot].bio = bio; |
1706 | bio->bi_sector = r10_bio->devs[r10_bio->read_slot].addr | 1679 | bio->bi_sector = r10_bio->devs[slot].addr |
1707 | + rdev->data_offset; | 1680 | + rdev->data_offset; |
1708 | bio->bi_bdev = rdev->bdev; | 1681 | bio->bi_bdev = rdev->bdev; |
1709 | bio->bi_rw = READ | do_sync; | 1682 | bio->bi_rw = READ | do_sync; |
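The raid10d changes above cache the read slot and device number once ('slot', 'mirror'), drop the pending reference on the failing device with rdev_dec_pending(), and then redirect the read: the bad slot's bio is replaced with IO_BLOCKED (or NULL when writable), read_balance() picks another in-sync mirror, and a clone of the master bio is reissued there. A standalone model of that retry loop follows; all types and helpers here are invented for the sketch, and the kernel version clones the original bio rather than reissuing in place.

#include <stdio.h>
#include <stdbool.h>

#define COPIES 2
#define IO_BLOCKED true

struct mirror { const char *name; bool in_sync; bool fails_reads; };

struct r10_read {
	int read_slot;               /* slot the read was issued to */
	bool blocked[COPIES];        /* slots we must not retry */
};

static int read_balance(struct mirror *m, struct r10_read *r)
{
	for (int slot = 0; slot < COPIES; slot++)
		if (m[slot].in_sync && !r->blocked[slot])
			return slot;
	return -1;                   /* no usable mirror left */
}

static bool issue_read(struct mirror *m, int slot)
{
	return !m[slot].fails_reads;
}

int main(void)
{
	struct mirror mirrors[COPIES] = {
		{ "sda", true, true },   /* first copy throws read errors */
		{ "sdb", true, false },
	};
	struct r10_read r = { .read_slot = 0 };

	while (!issue_read(mirrors, r.read_slot)) {
		/* block the failing slot, then redirect, as raid10d does */
		r.blocked[r.read_slot] = IO_BLOCKED;
		int slot = read_balance(mirrors, &r);
		if (slot < 0) {
			printf("no working mirrors left, failing I/O\n");
			return 1;
		}
		printf("redirecting read from %s to %s\n",
		       mirrors[r.read_slot].name, mirrors[slot].name);
		r.read_slot = slot;
	}
	printf("read satisfied by %s\n", mirrors[r.read_slot].name);
	return 0;
}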
@@ -1763,13 +1736,13 @@ static int init_resync(conf_t *conf) | |||
1763 | * | 1736 | * |
1764 | */ | 1737 | */ |
1765 | 1738 | ||
1766 | static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, int go_faster) | 1739 | static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, |
1740 | int *skipped, int go_faster) | ||
1767 | { | 1741 | { |
1768 | conf_t *conf = mddev->private; | 1742 | conf_t *conf = mddev->private; |
1769 | r10bio_t *r10_bio; | 1743 | r10bio_t *r10_bio; |
1770 | struct bio *biolist = NULL, *bio; | 1744 | struct bio *biolist = NULL, *bio; |
1771 | sector_t max_sector, nr_sectors; | 1745 | sector_t max_sector, nr_sectors; |
1772 | int disk; | ||
1773 | int i; | 1746 | int i; |
1774 | int max_sync; | 1747 | int max_sync; |
1775 | sector_t sync_blocks; | 1748 | sector_t sync_blocks; |
@@ -1858,108 +1831,114 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1858 | int j, k; | 1831 | int j, k; |
1859 | r10_bio = NULL; | 1832 | r10_bio = NULL; |
1860 | 1833 | ||
1861 | for (i=0 ; i<conf->raid_disks; i++) | 1834 | for (i=0 ; i<conf->raid_disks; i++) { |
1862 | if (conf->mirrors[i].rdev && | 1835 | int still_degraded; |
1863 | !test_bit(In_sync, &conf->mirrors[i].rdev->flags)) { | 1836 | r10bio_t *rb2; |
1864 | int still_degraded = 0; | 1837 | sector_t sect; |
1865 | /* want to reconstruct this device */ | 1838 | int must_sync; |
1866 | r10bio_t *rb2 = r10_bio; | ||
1867 | sector_t sect = raid10_find_virt(conf, sector_nr, i); | ||
1868 | int must_sync; | ||
1869 | /* Unless we are doing a full sync, we only need | ||
1870 | * to recover the block if it is set in the bitmap | ||
1871 | */ | ||
1872 | must_sync = bitmap_start_sync(mddev->bitmap, sect, | ||
1873 | &sync_blocks, 1); | ||
1874 | if (sync_blocks < max_sync) | ||
1875 | max_sync = sync_blocks; | ||
1876 | if (!must_sync && | ||
1877 | !conf->fullsync) { | ||
1878 | /* yep, skip the sync_blocks here, but don't assume | ||
1879 | * that there will never be anything to do here | ||
1880 | */ | ||
1881 | chunks_skipped = -1; | ||
1882 | continue; | ||
1883 | } | ||
1884 | 1839 | ||
1885 | r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); | 1840 | if (conf->mirrors[i].rdev == NULL || |
1886 | raise_barrier(conf, rb2 != NULL); | 1841 | test_bit(In_sync, &conf->mirrors[i].rdev->flags)) |
1887 | atomic_set(&r10_bio->remaining, 0); | 1842 | continue; |
1888 | 1843 | ||
1889 | r10_bio->master_bio = (struct bio*)rb2; | 1844 | still_degraded = 0; |
1890 | if (rb2) | 1845 | /* want to reconstruct this device */ |
1891 | atomic_inc(&rb2->remaining); | 1846 | rb2 = r10_bio; |
1892 | r10_bio->mddev = mddev; | 1847 | sect = raid10_find_virt(conf, sector_nr, i); |
1893 | set_bit(R10BIO_IsRecover, &r10_bio->state); | 1848 | /* Unless we are doing a full sync, we only need |
1894 | r10_bio->sector = sect; | 1849 | * to recover the block if it is set in the bitmap |
1850 | */ | ||
1851 | must_sync = bitmap_start_sync(mddev->bitmap, sect, | ||
1852 | &sync_blocks, 1); | ||
1853 | if (sync_blocks < max_sync) | ||
1854 | max_sync = sync_blocks; | ||
1855 | if (!must_sync && | ||
1856 | !conf->fullsync) { | ||
1857 | /* yep, skip the sync_blocks here, but don't assume | ||
1858 | * that there will never be anything to do here | ||
1859 | */ | ||
1860 | chunks_skipped = -1; | ||
1861 | continue; | ||
1862 | } | ||
1895 | 1863 | ||
1896 | raid10_find_phys(conf, r10_bio); | 1864 | r10_bio = mempool_alloc(conf->r10buf_pool, GFP_NOIO); |
1865 | raise_barrier(conf, rb2 != NULL); | ||
1866 | atomic_set(&r10_bio->remaining, 0); | ||
1897 | 1867 | ||
1898 | /* Need to check if the array will still be | 1868 | r10_bio->master_bio = (struct bio*)rb2; |
1899 | * degraded | 1869 | if (rb2) |
1900 | */ | 1870 | atomic_inc(&rb2->remaining); |
1901 | for (j=0; j<conf->raid_disks; j++) | 1871 | r10_bio->mddev = mddev; |
1902 | if (conf->mirrors[j].rdev == NULL || | 1872 | set_bit(R10BIO_IsRecover, &r10_bio->state); |
1903 | test_bit(Faulty, &conf->mirrors[j].rdev->flags)) { | 1873 | r10_bio->sector = sect; |
1904 | still_degraded = 1; | ||
1905 | break; | ||
1906 | } | ||
1907 | |||
1908 | must_sync = bitmap_start_sync(mddev->bitmap, sect, | ||
1909 | &sync_blocks, still_degraded); | ||
1910 | |||
1911 | for (j=0; j<conf->copies;j++) { | ||
1912 | int d = r10_bio->devs[j].devnum; | ||
1913 | if (conf->mirrors[d].rdev && | ||
1914 | test_bit(In_sync, &conf->mirrors[d].rdev->flags)) { | ||
1915 | /* This is where we read from */ | ||
1916 | bio = r10_bio->devs[0].bio; | ||
1917 | bio->bi_next = biolist; | ||
1918 | biolist = bio; | ||
1919 | bio->bi_private = r10_bio; | ||
1920 | bio->bi_end_io = end_sync_read; | ||
1921 | bio->bi_rw = READ; | ||
1922 | bio->bi_sector = r10_bio->devs[j].addr + | ||
1923 | conf->mirrors[d].rdev->data_offset; | ||
1924 | bio->bi_bdev = conf->mirrors[d].rdev->bdev; | ||
1925 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); | ||
1926 | atomic_inc(&r10_bio->remaining); | ||
1927 | /* and we write to 'i' */ | ||
1928 | |||
1929 | for (k=0; k<conf->copies; k++) | ||
1930 | if (r10_bio->devs[k].devnum == i) | ||
1931 | break; | ||
1932 | BUG_ON(k == conf->copies); | ||
1933 | bio = r10_bio->devs[1].bio; | ||
1934 | bio->bi_next = biolist; | ||
1935 | biolist = bio; | ||
1936 | bio->bi_private = r10_bio; | ||
1937 | bio->bi_end_io = end_sync_write; | ||
1938 | bio->bi_rw = WRITE; | ||
1939 | bio->bi_sector = r10_bio->devs[k].addr + | ||
1940 | conf->mirrors[i].rdev->data_offset; | ||
1941 | bio->bi_bdev = conf->mirrors[i].rdev->bdev; | ||
1942 | |||
1943 | r10_bio->devs[0].devnum = d; | ||
1944 | r10_bio->devs[1].devnum = i; | ||
1945 | 1874 | ||
1946 | break; | 1875 | raid10_find_phys(conf, r10_bio); |
1947 | } | 1876 | |
1948 | } | 1877 | /* Need to check if the array will still be |
1949 | if (j == conf->copies) { | 1878 | * degraded |
1950 | /* Cannot recover, so abort the recovery */ | 1879 | */ |
1951 | put_buf(r10_bio); | 1880 | for (j=0; j<conf->raid_disks; j++) |
1952 | if (rb2) | 1881 | if (conf->mirrors[j].rdev == NULL || |
1953 | atomic_dec(&rb2->remaining); | 1882 | test_bit(Faulty, &conf->mirrors[j].rdev->flags)) { |
1954 | r10_bio = rb2; | 1883 | still_degraded = 1; |
1955 | if (!test_and_set_bit(MD_RECOVERY_INTR, | ||
1956 | &mddev->recovery)) | ||
1957 | printk(KERN_INFO "md/raid10:%s: insufficient " | ||
1958 | "working devices for recovery.\n", | ||
1959 | mdname(mddev)); | ||
1960 | break; | 1884 | break; |
1961 | } | 1885 | } |
1886 | |||
1887 | must_sync = bitmap_start_sync(mddev->bitmap, sect, | ||
1888 | &sync_blocks, still_degraded); | ||
1889 | |||
1890 | for (j=0; j<conf->copies;j++) { | ||
1891 | int d = r10_bio->devs[j].devnum; | ||
1892 | if (!conf->mirrors[d].rdev || | ||
1893 | !test_bit(In_sync, &conf->mirrors[d].rdev->flags)) | ||
1894 | continue; | ||
1895 | /* This is where we read from */ | ||
1896 | bio = r10_bio->devs[0].bio; | ||
1897 | bio->bi_next = biolist; | ||
1898 | biolist = bio; | ||
1899 | bio->bi_private = r10_bio; | ||
1900 | bio->bi_end_io = end_sync_read; | ||
1901 | bio->bi_rw = READ; | ||
1902 | bio->bi_sector = r10_bio->devs[j].addr + | ||
1903 | conf->mirrors[d].rdev->data_offset; | ||
1904 | bio->bi_bdev = conf->mirrors[d].rdev->bdev; | ||
1905 | atomic_inc(&conf->mirrors[d].rdev->nr_pending); | ||
1906 | atomic_inc(&r10_bio->remaining); | ||
1907 | /* and we write to 'i' */ | ||
1908 | |||
1909 | for (k=0; k<conf->copies; k++) | ||
1910 | if (r10_bio->devs[k].devnum == i) | ||
1911 | break; | ||
1912 | BUG_ON(k == conf->copies); | ||
1913 | bio = r10_bio->devs[1].bio; | ||
1914 | bio->bi_next = biolist; | ||
1915 | biolist = bio; | ||
1916 | bio->bi_private = r10_bio; | ||
1917 | bio->bi_end_io = end_sync_write; | ||
1918 | bio->bi_rw = WRITE; | ||
1919 | bio->bi_sector = r10_bio->devs[k].addr + | ||
1920 | conf->mirrors[i].rdev->data_offset; | ||
1921 | bio->bi_bdev = conf->mirrors[i].rdev->bdev; | ||
1922 | |||
1923 | r10_bio->devs[0].devnum = d; | ||
1924 | r10_bio->devs[1].devnum = i; | ||
1925 | |||
1926 | break; | ||
1927 | } | ||
1928 | if (j == conf->copies) { | ||
1929 | /* Cannot recover, so abort the recovery */ | ||
1930 | put_buf(r10_bio); | ||
1931 | if (rb2) | ||
1932 | atomic_dec(&rb2->remaining); | ||
1933 | r10_bio = rb2; | ||
1934 | if (!test_and_set_bit(MD_RECOVERY_INTR, | ||
1935 | &mddev->recovery)) | ||
1936 | printk(KERN_INFO "md/raid10:%s: insufficient " | ||
1937 | "working devices for recovery.\n", | ||
1938 | mdname(mddev)); | ||
1939 | break; | ||
1962 | } | 1940 | } |
1941 | } | ||
1963 | if (biolist == NULL) { | 1942 | if (biolist == NULL) { |
1964 | while (r10_bio) { | 1943 | while (r10_bio) { |
1965 | r10bio_t *rb2 = r10_bio; | 1944 | r10bio_t *rb2 = r10_bio; |
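The large hunk above is the "less indenting" reformat from the merge description: the test that used to wrap the entire recovery body (rdev present and not In_sync) is inverted into an early continue, pulling roughly a hundred lines out one indentation level without changing behaviour. The shape of the transformation, reduced to a toy:

#include <stdio.h>

struct disk { int present; int in_sync; };

static void recover(struct disk *d, int n)
{
	for (int i = 0; i < n; i++) {
		/* was: if (d[i].present && !d[i].in_sync) { ...body... } */
		if (!d[i].present || d[i].in_sync)
			continue;	/* nothing to reconstruct here */
		/* ...recovery body, now one indent level shallower... */
		printf("reconstructing disk %d\n", i);
	}
}

int main(void)
{
	struct disk disks[] = { {1, 1}, {1, 0}, {0, 0} };
	recover(disks, 3);
	return 0;
}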
@@ -1977,7 +1956,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1977 | 1956 | ||
1978 | if (!bitmap_start_sync(mddev->bitmap, sector_nr, | 1957 | if (!bitmap_start_sync(mddev->bitmap, sector_nr, |
1979 | &sync_blocks, mddev->degraded) && | 1958 | &sync_blocks, mddev->degraded) && |
1980 | !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { | 1959 | !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, |
1960 | &mddev->recovery)) { | ||
1981 | /* We can skip this block */ | 1961 | /* We can skip this block */ |
1982 | *skipped = 1; | 1962 | *skipped = 1; |
1983 | return sync_blocks + sectors_skipped; | 1963 | return sync_blocks + sectors_skipped; |
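The rewrapped condition above is the bitmap fast path for resync: if bitmap_start_sync() reports no dirty bits at sector_nr and neither a full sync nor a requested check is in progress, the whole block is skipped without touching the disks. A sketch of that skip over a toy bitmap; the chunk size and the helper's behaviour are invented for the example.

#include <stdio.h>
#include <stdbool.h>

#define CHUNK_SECTORS 64
#define NCHUNKS 8

static bool bitmap_start_sync(const bool *bitmap, unsigned long long sector,
			      unsigned long long *sync_blocks)
{
	*sync_blocks = CHUNK_SECTORS;		/* extent this answer covers */
	return bitmap[sector / CHUNK_SECTORS];	/* does it need syncing? */
}

int main(void)
{
	bool bitmap[NCHUNKS] = { false, true, false, false,
				 true,  false, false, false };
	bool fullsync = false;
	unsigned long long sector = 0, sync_blocks;

	while (sector < NCHUNKS * CHUNK_SECTORS) {
		if (!bitmap_start_sync(bitmap, sector, &sync_blocks) &&
		    !fullsync) {
			/* clean chunk: skip it without any disk I/O */
			sector += sync_blocks;
			continue;
		}
		printf("resyncing %llu sectors at %llu\n",
		       sync_blocks, sector);
		sector += sync_blocks;
	}
	return 0;
}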
@@ -2022,7 +2002,8 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
2022 | for (i=0; i<conf->copies; i++) { | 2002 | for (i=0; i<conf->copies; i++) { |
2023 | int d = r10_bio->devs[i].devnum; | 2003 | int d = r10_bio->devs[i].devnum; |
2024 | if (r10_bio->devs[i].bio->bi_end_io) | 2004 | if (r10_bio->devs[i].bio->bi_end_io) |
2025 | rdev_dec_pending(conf->mirrors[d].rdev, mddev); | 2005 | rdev_dec_pending(conf->mirrors[d].rdev, |
2006 | mddev); | ||
2026 | } | 2007 | } |
2027 | put_buf(r10_bio); | 2008 | put_buf(r10_bio); |
2028 | biolist = NULL; | 2009 | biolist = NULL; |
@@ -2047,26 +2028,27 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
2047 | do { | 2028 | do { |
2048 | struct page *page; | 2029 | struct page *page; |
2049 | int len = PAGE_SIZE; | 2030 | int len = PAGE_SIZE; |
2050 | disk = 0; | ||
2051 | if (sector_nr + (len>>9) > max_sector) | 2031 | if (sector_nr + (len>>9) > max_sector) |
2052 | len = (max_sector - sector_nr) << 9; | 2032 | len = (max_sector - sector_nr) << 9; |
2053 | if (len == 0) | 2033 | if (len == 0) |
2054 | break; | 2034 | break; |
2055 | for (bio= biolist ; bio ; bio=bio->bi_next) { | 2035 | for (bio= biolist ; bio ; bio=bio->bi_next) { |
2036 | struct bio *bio2; | ||
2056 | page = bio->bi_io_vec[bio->bi_vcnt].bv_page; | 2037 | page = bio->bi_io_vec[bio->bi_vcnt].bv_page; |
2057 | if (bio_add_page(bio, page, len, 0) == 0) { | 2038 | if (bio_add_page(bio, page, len, 0)) |
2058 | /* stop here */ | 2039 | continue; |
2059 | struct bio *bio2; | 2040 | |
2060 | bio->bi_io_vec[bio->bi_vcnt].bv_page = page; | 2041 | /* stop here */ |
2061 | for (bio2 = biolist; bio2 && bio2 != bio; bio2 = bio2->bi_next) { | 2042 | bio->bi_io_vec[bio->bi_vcnt].bv_page = page; |
2062 | /* remove last page from this bio */ | 2043 | for (bio2 = biolist; |
2063 | bio2->bi_vcnt--; | 2044 | bio2 && bio2 != bio; |
2064 | bio2->bi_size -= len; | 2045 | bio2 = bio2->bi_next) { |
2065 | bio2->bi_flags &= ~(1<< BIO_SEG_VALID); | 2046 | /* remove last page from this bio */ |
2066 | } | 2047 | bio2->bi_vcnt--; |
2067 | goto bio_full; | 2048 | bio2->bi_size -= len; |
2049 | bio2->bi_flags &= ~(1<< BIO_SEG_VALID); | ||
2068 | } | 2050 | } |
2069 | disk = i; | 2051 | goto bio_full; |
2070 | } | 2052 | } |
2071 | nr_sectors += len>>9; | 2053 | nr_sectors += len>>9; |
2072 | sector_nr += len>>9; | 2054 | sector_nr += len>>9; |
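The last raid10 hunk restructures the page-filling loop around bio_add_page(), which returns the number of bytes added and 0 on failure. On success the new code simply continues to the next bio in the chain; on failure it walks the earlier bios and removes the page they already accepted, so every bio in the chain ends up the same length before jumping to bio_full. A runnable userspace model of that rollback, with invented types and an invented capacity rule standing in for a full bio:

#include <stdio.h>
#include <stddef.h>

#define PAGE_SIZE 4096

struct fake_bio {
	struct fake_bio *next;
	int vcnt;		/* pages held */
	size_t size;		/* bytes held */
	int max_pages;		/* capacity; add_page fails beyond it */
};

/* returns bytes added, 0 on failure, mirroring bio_add_page() */
static size_t add_page(struct fake_bio *b, size_t len)
{
	if (b->vcnt >= b->max_pages)
		return 0;
	b->vcnt++;
	b->size += len;
	return len;
}

int main(void)
{
	struct fake_bio b2 = { NULL, 0, 0, 2 };   /* smaller capacity */
	struct fake_bio b1 = { &b2, 0, 0, 4 };
	struct fake_bio *biolist = &b1, *bio, *bio2;
	size_t nr_bytes = 0;

	for (;;) {
		size_t len = PAGE_SIZE;
		for (bio = biolist; bio; bio = bio->next) {
			if (add_page(bio, len))
				continue;	/* this bio took the page */
			/* full: strip the page from earlier bios again */
			for (bio2 = biolist; bio2 != bio; bio2 = bio2->next) {
				bio2->vcnt--;
				bio2->size -= len;
			}
			goto bio_full;
		}
		nr_bytes += len;
	}
bio_full:
	printf("chain holds %zu bytes per bio (%d pages each)\n",
	       nr_bytes, b1.vcnt);
	return 0;
}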
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 49bf5f891435..34dd54539f7b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -1700,27 +1700,25 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1700 | raid5_conf_t *conf = mddev->private; | 1700 | raid5_conf_t *conf = mddev->private; |
1701 | pr_debug("raid456: error called\n"); | 1701 | pr_debug("raid456: error called\n"); |
1702 | 1702 | ||
1703 | if (!test_bit(Faulty, &rdev->flags)) { | 1703 | if (test_and_clear_bit(In_sync, &rdev->flags)) { |
1704 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 1704 | unsigned long flags; |
1705 | if (test_and_clear_bit(In_sync, &rdev->flags)) { | 1705 | spin_lock_irqsave(&conf->device_lock, flags); |
1706 | unsigned long flags; | 1706 | mddev->degraded++; |
1707 | spin_lock_irqsave(&conf->device_lock, flags); | 1707 | spin_unlock_irqrestore(&conf->device_lock, flags); |
1708 | mddev->degraded++; | 1708 | /* |
1709 | spin_unlock_irqrestore(&conf->device_lock, flags); | 1709 | * if recovery was running, make sure it aborts. |
1710 | /* | 1710 | */ |
1711 | * if recovery was running, make sure it aborts. | 1711 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); |
1712 | */ | ||
1713 | set_bit(MD_RECOVERY_INTR, &mddev->recovery); | ||
1714 | } | ||
1715 | set_bit(Faulty, &rdev->flags); | ||
1716 | printk(KERN_ALERT | ||
1717 | "md/raid:%s: Disk failure on %s, disabling device.\n" | ||
1718 | "md/raid:%s: Operation continuing on %d devices.\n", | ||
1719 | mdname(mddev), | ||
1720 | bdevname(rdev->bdev, b), | ||
1721 | mdname(mddev), | ||
1722 | conf->raid_disks - mddev->degraded); | ||
1723 | } | 1712 | } |
1713 | set_bit(Faulty, &rdev->flags); | ||
1714 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | ||
1715 | printk(KERN_ALERT | ||
1716 | "md/raid:%s: Disk failure on %s, disabling device.\n" | ||
1717 | "md/raid:%s: Operation continuing on %d devices.\n", | ||
1718 | mdname(mddev), | ||
1719 | bdevname(rdev->bdev, b), | ||
1720 | mdname(mddev), | ||
1721 | conf->raid_disks - mddev->degraded); | ||
1724 | } | 1722 | } |
1725 | 1723 | ||
1726 | /* | 1724 | /* |
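The raid5 error() rewrite above is part of "make error_handler functions more uniform and correct": setting Faulty, setting MD_CHANGE_DEVS and printing the failure message now happen on every call, while the degraded accounting stays inside test_and_clear_bit(In_sync), which can only succeed the first time for a given device. A toy version of that control flow, with plain ints standing in for the kernel's atomic bitops and locking:

#include <stdio.h>

struct dev { int in_sync; int faulty; };
static int degraded;

static int test_and_clear(int *bit)
{
	int old = *bit;
	*bit = 0;
	return old;
}

static void error_handler(struct dev *d, const char *name)
{
	if (test_and_clear(&d->in_sync))
		degraded++;		/* bumped only on the first failure */
	d->faulty = 1;			/* now set on every call */
	printf("disk failure on %s, degraded=%d\n", name, degraded);
}

int main(void)
{
	struct dev d = { .in_sync = 1 };
	error_handler(&d, "sdc");	/* degraded becomes 1 */
	error_handler(&d, "sdc");	/* repeated call: degraded stays 1 */
	return 0;
}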
@@ -5391,7 +5389,8 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) | |||
5391 | return -EINVAL; | 5389 | return -EINVAL; |
5392 | set_capacity(mddev->gendisk, mddev->array_sectors); | 5390 | set_capacity(mddev->gendisk, mddev->array_sectors); |
5393 | revalidate_disk(mddev->gendisk); | 5391 | revalidate_disk(mddev->gendisk); |
5394 | if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { | 5392 | if (sectors > mddev->dev_sectors && |
5393 | mddev->recovery_cp > mddev->dev_sectors) { | ||
5395 | mddev->recovery_cp = mddev->dev_sectors; | 5394 | mddev->recovery_cp = mddev->dev_sectors; |
5396 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 5395 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
5397 | } | 5396 | } |
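The raid5_resize() fix replaces the "recovery_cp == MaxSector" test with "recovery_cp > mddev->dev_sectors" (dev_sectors still holds the old size at this point). The old test rewound the checkpoint only for a fully clean array; the new one also catches a checkpoint that merely lies beyond the old end, so growing the array never leaves the added region unsynced. A minimal illustration with arbitrary sector counts:

#include <stdio.h>

#define MaxSector (~0ULL)

int main(void)
{
	unsigned long long dev_sectors = 1000;	/* size before the grow */
	unsigned long long recovery_cp = 1500;	/* checkpoint past old end */

	/* old test: only a fully clean array (cp == MaxSector) rewinds */
	int old_rewinds = (recovery_cp == MaxSector);
	/* new test: any checkpoint beyond the old size rewinds */
	int new_rewinds = (recovery_cp > dev_sectors);

	printf("old test rewinds: %d, new test rewinds: %d\n",
	       old_rewinds, new_rewinds);
	if (new_rewinds)
		recovery_cp = dev_sectors;	/* resync the grown space */
	return 0;
}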