Diffstat (limited to 'drivers/md/raid1.c')
-rw-r--r--  drivers/md/raid1.c | 920
1 files changed, 395 insertions, 525 deletions
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 0b830bbe1d8b..f7431b6d8447 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -52,23 +52,16 @@ | |||
52 | #define NR_RAID1_BIOS 256 | 52 | #define NR_RAID1_BIOS 256 |
53 | 53 | ||
54 | 54 | ||
55 | static void unplug_slaves(mddev_t *mddev); | ||
56 | |||
57 | static void allow_barrier(conf_t *conf); | 55 | static void allow_barrier(conf_t *conf); |
58 | static void lower_barrier(conf_t *conf); | 56 | static void lower_barrier(conf_t *conf); |
59 | 57 | ||
60 | static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) | 58 | static void * r1bio_pool_alloc(gfp_t gfp_flags, void *data) |
61 | { | 59 | { |
62 | struct pool_info *pi = data; | 60 | struct pool_info *pi = data; |
63 | r1bio_t *r1_bio; | ||
64 | int size = offsetof(r1bio_t, bios[pi->raid_disks]); | 61 | int size = offsetof(r1bio_t, bios[pi->raid_disks]); |
65 | 62 | ||
66 | /* allocate a r1bio with room for raid_disks entries in the bios array */ | 63 | /* allocate a r1bio with room for raid_disks entries in the bios array */ |
67 | r1_bio = kzalloc(size, gfp_flags); | 64 | return kzalloc(size, gfp_flags); |
68 | if (!r1_bio && pi->mddev) | ||
69 | unplug_slaves(pi->mddev); | ||
70 | |||
71 | return r1_bio; | ||
72 | } | 65 | } |
73 | 66 | ||
74 | static void r1bio_pool_free(void *r1_bio, void *data) | 67 | static void r1bio_pool_free(void *r1_bio, void *data) |
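Note: with unplug_slaves() gone, the mempool callback above reduces to a single kzalloc() of a size computed with offsetof(), giving one allocation that holds the r1bio header plus one bio pointer per mirror. A minimal userspace sketch of that sizing idiom follows; the struct is a stand-in, not the kernel's r1bio_t, and the runtime index in offsetof() relies on the compiler's __builtin_offsetof (GCC/Clang), just as the kernel does.

#include <stdio.h>
#include <stdlib.h>
#include <stddef.h>

/* Stand-in for r1bio_t: a fixed header followed by one bio pointer
 * per mirror, sized in a single allocation. */
struct r1bio_like {
	unsigned long sector;
	int sectors;
	void *bios[];		/* one slot per raid disk */
};

static struct r1bio_like *r1bio_alloc(int raid_disks)
{
	/* models: offsetof(r1bio_t, bios[pi->raid_disks]) + kzalloc() */
	size_t size = offsetof(struct r1bio_like, bios[raid_disks]);
	return calloc(1, size);
}

int main(void)
{
	struct r1bio_like *r1 = r1bio_alloc(4);
	printf("allocated %zu bytes for 4 mirrors\n",
	       offsetof(struct r1bio_like, bios[4]));
	free(r1);
	return 0;
}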
@@ -91,16 +84,14 @@ static void * r1buf_pool_alloc(gfp_t gfp_flags, void *data) | |||
91 | int i, j; | 84 | int i, j; |
92 | 85 | ||
93 | r1_bio = r1bio_pool_alloc(gfp_flags, pi); | 86 | r1_bio = r1bio_pool_alloc(gfp_flags, pi); |
94 | if (!r1_bio) { | 87 | if (!r1_bio) |
95 | unplug_slaves(pi->mddev); | ||
96 | return NULL; | 88 | return NULL; |
97 | } | ||
98 | 89 | ||
99 | /* | 90 | /* |
100 | * Allocate bios : 1 for reading, n-1 for writing | 91 | * Allocate bios : 1 for reading, n-1 for writing |
101 | */ | 92 | */ |
102 | for (j = pi->raid_disks ; j-- ; ) { | 93 | for (j = pi->raid_disks ; j-- ; ) { |
103 | bio = bio_alloc(gfp_flags, RESYNC_PAGES); | 94 | bio = bio_kmalloc(gfp_flags, RESYNC_PAGES); |
104 | if (!bio) | 95 | if (!bio) |
105 | goto out_free_bio; | 96 | goto out_free_bio; |
106 | r1_bio->bios[j] = bio; | 97 | r1_bio->bios[j] = bio; |
@@ -306,6 +297,29 @@ static void raid1_end_read_request(struct bio *bio, int error) | |||
306 | rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); | 297 | rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); |
307 | } | 298 | } |
308 | 299 | ||
300 | static void r1_bio_write_done(r1bio_t *r1_bio) | ||
301 | { | ||
302 | if (atomic_dec_and_test(&r1_bio->remaining)) | ||
303 | { | ||
304 | /* it really is the end of this request */ | ||
305 | if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { | ||
306 | /* free extra copy of the data pages */ | ||
307 | int i = r1_bio->behind_page_count; | ||
308 | while (i--) | ||
309 | safe_put_page(r1_bio->behind_pages[i]); | ||
310 | kfree(r1_bio->behind_pages); | ||
311 | r1_bio->behind_pages = NULL; | ||
312 | } | ||
313 | /* clear the bitmap if all writes complete successfully */ | ||
314 | bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, | ||
315 | r1_bio->sectors, | ||
316 | !test_bit(R1BIO_Degraded, &r1_bio->state), | ||
317 | test_bit(R1BIO_BehindIO, &r1_bio->state)); | ||
318 | md_write_end(r1_bio->mddev); | ||
319 | raid_end_bio_io(r1_bio); | ||
320 | } | ||
321 | } | ||
322 | |||
309 | static void raid1_end_write_request(struct bio *bio, int error) | 323 | static void raid1_end_write_request(struct bio *bio, int error) |
310 | { | 324 | { |
311 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 325 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
@@ -319,84 +333,61 @@ static void raid1_end_write_request(struct bio *bio, int error) | |||
319 | if (r1_bio->bios[mirror] == bio) | 333 | if (r1_bio->bios[mirror] == bio) |
320 | break; | 334 | break; |
321 | 335 | ||
322 | if (error == -EOPNOTSUPP && test_bit(R1BIO_Barrier, &r1_bio->state)) { | 336 | /* |
323 | set_bit(BarriersNotsupp, &conf->mirrors[mirror].rdev->flags); | 337 | * 'one mirror IO has finished' event handler: |
324 | set_bit(R1BIO_BarrierRetry, &r1_bio->state); | 338 | */ |
325 | r1_bio->mddev->barriers_work = 0; | 339 | r1_bio->bios[mirror] = NULL; |
326 | /* Don't rdev_dec_pending in this branch - keep it for the retry */ | 340 | to_put = bio; |
327 | } else { | 341 | if (!uptodate) { |
342 | md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); | ||
343 | /* an I/O failed, we can't clear the bitmap */ | ||
344 | set_bit(R1BIO_Degraded, &r1_bio->state); | ||
345 | } else | ||
328 | /* | 346 | /* |
329 | * this branch is our 'one mirror IO has finished' event handler: | 347 | * Set R1BIO_Uptodate in our master bio, so that we |
348 | * will return a good error code for to the higher | ||
349 | * levels even if IO on some other mirrored buffer | ||
350 | * fails. | ||
351 | * | ||
352 | * The 'master' represents the composite IO operation | ||
353 | * to user-side. So if something waits for IO, then it | ||
354 | * will wait for the 'master' bio. | ||
330 | */ | 355 | */ |
331 | r1_bio->bios[mirror] = NULL; | 356 | set_bit(R1BIO_Uptodate, &r1_bio->state); |
332 | to_put = bio; | 357 | |
333 | if (!uptodate) { | 358 | update_head_pos(mirror, r1_bio); |
334 | md_error(r1_bio->mddev, conf->mirrors[mirror].rdev); | 359 | |
335 | /* an I/O failed, we can't clear the bitmap */ | 360 | if (behind) { |
336 | set_bit(R1BIO_Degraded, &r1_bio->state); | 361 | if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) |
337 | } else | 362 | atomic_dec(&r1_bio->behind_remaining); |
338 | /* | 363 | |
339 | * Set R1BIO_Uptodate in our master bio, so that | 364 | /* |
340 | * we will return a good error code for to the higher | 365 | * In behind mode, we ACK the master bio once the I/O |
341 | * levels even if IO on some other mirrored buffer fails. | 366 | * has safely reached all non-writemostly |
342 | * | 367 | * disks. Setting the Returned bit ensures that this |
343 | * The 'master' represents the composite IO operation to | 368 | * gets done only once -- we don't ever want to return |
344 | * user-side. So if something waits for IO, then it will | 369 | * -EIO here, instead we'll wait |
345 | * wait for the 'master' bio. | 370 | */ |
346 | */ | 371 | if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) && |
347 | set_bit(R1BIO_Uptodate, &r1_bio->state); | 372 | test_bit(R1BIO_Uptodate, &r1_bio->state)) { |
348 | 373 | /* Maybe we can return now */ | |
349 | update_head_pos(mirror, r1_bio); | 374 | if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { |
350 | 375 | struct bio *mbio = r1_bio->master_bio; | |
351 | if (behind) { | 376 | PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n", |
352 | if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) | 377 | (unsigned long long) mbio->bi_sector, |
353 | atomic_dec(&r1_bio->behind_remaining); | 378 | (unsigned long long) mbio->bi_sector + |
354 | 379 | (mbio->bi_size >> 9) - 1); | |
355 | /* In behind mode, we ACK the master bio once the I/O has safely | 380 | bio_endio(mbio, 0); |
356 | * reached all non-writemostly disks. Setting the Returned bit | ||
357 | * ensures that this gets done only once -- we don't ever want to | ||
358 | * return -EIO here, instead we'll wait */ | ||
359 | |||
360 | if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) && | ||
361 | test_bit(R1BIO_Uptodate, &r1_bio->state)) { | ||
362 | /* Maybe we can return now */ | ||
363 | if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { | ||
364 | struct bio *mbio = r1_bio->master_bio; | ||
365 | PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n", | ||
366 | (unsigned long long) mbio->bi_sector, | ||
367 | (unsigned long long) mbio->bi_sector + | ||
368 | (mbio->bi_size >> 9) - 1); | ||
369 | bio_endio(mbio, 0); | ||
370 | } | ||
371 | } | 381 | } |
372 | } | 382 | } |
373 | rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); | ||
374 | } | 383 | } |
384 | rdev_dec_pending(conf->mirrors[mirror].rdev, conf->mddev); | ||
385 | |||
375 | /* | 386 | /* |
376 | * | ||
377 | * Let's see if all mirrored write operations have finished | 387 | * Let's see if all mirrored write operations have finished |
378 | * already. | 388 | * already. |
379 | */ | 389 | */ |
380 | if (atomic_dec_and_test(&r1_bio->remaining)) { | 390 | r1_bio_write_done(r1_bio); |
381 | if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) | ||
382 | reschedule_retry(r1_bio); | ||
383 | else { | ||
384 | /* it really is the end of this request */ | ||
385 | if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { | ||
386 | /* free extra copy of the data pages */ | ||
387 | int i = bio->bi_vcnt; | ||
388 | while (i--) | ||
389 | safe_put_page(bio->bi_io_vec[i].bv_page); | ||
390 | } | ||
391 | /* clear the bitmap if all writes complete successfully */ | ||
392 | bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, | ||
393 | r1_bio->sectors, | ||
394 | !test_bit(R1BIO_Degraded, &r1_bio->state), | ||
395 | behind); | ||
396 | md_write_end(r1_bio->mddev); | ||
397 | raid_end_bio_io(r1_bio); | ||
398 | } | ||
399 | } | ||
400 | 391 | ||
401 | if (to_put) | 392 | if (to_put) |
402 | bio_put(to_put); | 393 | bio_put(to_put); |
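Note: the rewritten raid1_end_write_request() funnels final completion through the new r1_bio_write_done() helper and keeps the write-behind early acknowledgement: the master bio may be ended as soon as only write-mostly mirrors are still outstanding and the write is known good. The tiny model below is illustrative only; in the kernel these counters are read atomically and include the +1 bias held by the submitting thread.

#include <stdbool.h>
#include <stdio.h>

/* Model of the early-ACK test in raid1_end_write_request(). */
static bool can_ack_master(int behind_remaining, int remaining, bool uptodate)
{
	return behind_remaining >= remaining - 1 && uptodate;
}

int main(void)
{
	printf("%d\n", can_ack_master(1, 2, true));  /* 1: only write-mostly targets still pending */
	printf("%d\n", can_ack_master(1, 3, true));  /* 0: a regular mirror write is still outstanding */
	printf("%d\n", can_ack_master(1, 2, false)); /* 0: some mirror write already failed */
	return 0;
}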
@@ -420,11 +411,13 @@ static void raid1_end_write_request(struct bio *bio, int error) | |||
420 | static int read_balance(conf_t *conf, r1bio_t *r1_bio) | 411 | static int read_balance(conf_t *conf, r1bio_t *r1_bio) |
421 | { | 412 | { |
422 | const sector_t this_sector = r1_bio->sector; | 413 | const sector_t this_sector = r1_bio->sector; |
423 | int new_disk = conf->last_used, disk = new_disk; | ||
424 | int wonly_disk = -1; | ||
425 | const int sectors = r1_bio->sectors; | 414 | const int sectors = r1_bio->sectors; |
426 | sector_t new_distance, current_distance; | 415 | int start_disk; |
416 | int best_disk; | ||
417 | int i; | ||
418 | sector_t best_dist; | ||
427 | mdk_rdev_t *rdev; | 419 | mdk_rdev_t *rdev; |
420 | int choose_first; | ||
428 | 421 | ||
429 | rcu_read_lock(); | 422 | rcu_read_lock(); |
430 | /* | 423 | /* |
@@ -433,100 +426,63 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) | |||
433 | * We take the first readable disk when above the resync window. | 426 | * We take the first readable disk when above the resync window. |
434 | */ | 427 | */ |
435 | retry: | 428 | retry: |
429 | best_disk = -1; | ||
430 | best_dist = MaxSector; | ||
436 | if (conf->mddev->recovery_cp < MaxSector && | 431 | if (conf->mddev->recovery_cp < MaxSector && |
437 | (this_sector + sectors >= conf->next_resync)) { | 432 | (this_sector + sectors >= conf->next_resync)) { |
438 | /* Choose the first operational device, for consistancy */ | 433 | choose_first = 1; |
439 | new_disk = 0; | 434 | start_disk = 0; |
440 | 435 | } else { | |
441 | for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); | 436 | choose_first = 0; |
442 | r1_bio->bios[new_disk] == IO_BLOCKED || | 437 | start_disk = conf->last_used; |
443 | !rdev || !test_bit(In_sync, &rdev->flags) | ||
444 | || test_bit(WriteMostly, &rdev->flags); | ||
445 | rdev = rcu_dereference(conf->mirrors[++new_disk].rdev)) { | ||
446 | |||
447 | if (rdev && test_bit(In_sync, &rdev->flags) && | ||
448 | r1_bio->bios[new_disk] != IO_BLOCKED) | ||
449 | wonly_disk = new_disk; | ||
450 | |||
451 | if (new_disk == conf->raid_disks - 1) { | ||
452 | new_disk = wonly_disk; | ||
453 | break; | ||
454 | } | ||
455 | } | ||
456 | goto rb_out; | ||
457 | } | ||
458 | |||
459 | |||
460 | /* make sure the disk is operational */ | ||
461 | for (rdev = rcu_dereference(conf->mirrors[new_disk].rdev); | ||
462 | r1_bio->bios[new_disk] == IO_BLOCKED || | ||
463 | !rdev || !test_bit(In_sync, &rdev->flags) || | ||
464 | test_bit(WriteMostly, &rdev->flags); | ||
465 | rdev = rcu_dereference(conf->mirrors[new_disk].rdev)) { | ||
466 | |||
467 | if (rdev && test_bit(In_sync, &rdev->flags) && | ||
468 | r1_bio->bios[new_disk] != IO_BLOCKED) | ||
469 | wonly_disk = new_disk; | ||
470 | |||
471 | if (new_disk <= 0) | ||
472 | new_disk = conf->raid_disks; | ||
473 | new_disk--; | ||
474 | if (new_disk == disk) { | ||
475 | new_disk = wonly_disk; | ||
476 | break; | ||
477 | } | ||
478 | } | 438 | } |
479 | 439 | ||
480 | if (new_disk < 0) | 440 | for (i = 0 ; i < conf->raid_disks ; i++) { |
481 | goto rb_out; | 441 | sector_t dist; |
482 | 442 | int disk = start_disk + i; | |
483 | disk = new_disk; | 443 | if (disk >= conf->raid_disks) |
484 | /* now disk == new_disk == starting point for search */ | 444 | disk -= conf->raid_disks; |
485 | |||
486 | /* | ||
487 | * Don't change to another disk for sequential reads: | ||
488 | */ | ||
489 | if (conf->next_seq_sect == this_sector) | ||
490 | goto rb_out; | ||
491 | if (this_sector == conf->mirrors[new_disk].head_position) | ||
492 | goto rb_out; | ||
493 | |||
494 | current_distance = abs(this_sector - conf->mirrors[disk].head_position); | ||
495 | |||
496 | /* Find the disk whose head is closest */ | ||
497 | |||
498 | do { | ||
499 | if (disk <= 0) | ||
500 | disk = conf->raid_disks; | ||
501 | disk--; | ||
502 | 445 | ||
503 | rdev = rcu_dereference(conf->mirrors[disk].rdev); | 446 | rdev = rcu_dereference(conf->mirrors[disk].rdev); |
504 | 447 | if (r1_bio->bios[disk] == IO_BLOCKED | |
505 | if (!rdev || r1_bio->bios[disk] == IO_BLOCKED || | 448 | || rdev == NULL |
506 | !test_bit(In_sync, &rdev->flags) || | 449 | || test_bit(Faulty, &rdev->flags)) |
507 | test_bit(WriteMostly, &rdev->flags)) | ||
508 | continue; | 450 | continue; |
509 | 451 | if (!test_bit(In_sync, &rdev->flags) && | |
510 | if (!atomic_read(&rdev->nr_pending)) { | 452 | rdev->recovery_offset < this_sector + sectors) |
511 | new_disk = disk; | 453 | continue; |
454 | if (test_bit(WriteMostly, &rdev->flags)) { | ||
455 | /* Don't balance among write-mostly, just | ||
456 | * use the first as a last resort */ | ||
457 | if (best_disk < 0) | ||
458 | best_disk = disk; | ||
459 | continue; | ||
460 | } | ||
461 | /* This is a reasonable device to use. It might | ||
462 | * even be best. | ||
463 | */ | ||
464 | dist = abs(this_sector - conf->mirrors[disk].head_position); | ||
465 | if (choose_first | ||
466 | /* Don't change to another disk for sequential reads */ | ||
467 | || conf->next_seq_sect == this_sector | ||
468 | || dist == 0 | ||
469 | /* If device is idle, use it */ | ||
470 | || atomic_read(&rdev->nr_pending) == 0) { | ||
471 | best_disk = disk; | ||
512 | break; | 472 | break; |
513 | } | 473 | } |
514 | new_distance = abs(this_sector - conf->mirrors[disk].head_position); | 474 | if (dist < best_dist) { |
515 | if (new_distance < current_distance) { | 475 | best_dist = dist; |
516 | current_distance = new_distance; | 476 | best_disk = disk; |
517 | new_disk = disk; | ||
518 | } | 477 | } |
519 | } while (disk != conf->last_used); | 478 | } |
520 | |||
521 | rb_out: | ||
522 | |||
523 | 479 | ||
524 | if (new_disk >= 0) { | 480 | if (best_disk >= 0) { |
525 | rdev = rcu_dereference(conf->mirrors[new_disk].rdev); | 481 | rdev = rcu_dereference(conf->mirrors[best_disk].rdev); |
526 | if (!rdev) | 482 | if (!rdev) |
527 | goto retry; | 483 | goto retry; |
528 | atomic_inc(&rdev->nr_pending); | 484 | atomic_inc(&rdev->nr_pending); |
529 | if (!test_bit(In_sync, &rdev->flags)) { | 485 | if (test_bit(Faulty, &rdev->flags)) { |
530 | /* cannot risk returning a device that failed | 486 | /* cannot risk returning a device that failed |
531 | * before we inc'ed nr_pending | 487 | * before we inc'ed nr_pending |
532 | */ | 488 | */ |
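Note: the hunk above replaces read_balance()'s two hand-rolled walking loops with a single pass over all disks starting at conf->last_used (or disk 0 while resync is active): faulty or not-yet-recovered devices are skipped, write-mostly disks are kept only as a last resort, and otherwise an idle disk or the one whose head is closest to the request wins. The compact userspace model below captures that selection pass; it deliberately omits RCU, IO_BLOCKED and the recovery_offset check, and the structs are simplified stand-ins.

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

struct mirror {
	bool faulty;
	bool in_sync;
	bool write_mostly;
	int nr_pending;		/* in-flight requests on this disk */
	uint64_t head_position;	/* last sector serviced */
};

/* Simplified model of the new read_balance() loop. */
static int pick_read_disk(struct mirror *m, int ndisks, int start_disk,
			  uint64_t this_sector, bool choose_first)
{
	int best_disk = -1;
	uint64_t best_dist = UINT64_MAX;

	for (int i = 0; i < ndisks; i++) {
		int disk = (start_disk + i) % ndisks;

		if (m[disk].faulty || !m[disk].in_sync)
			continue;
		if (m[disk].write_mostly) {
			/* only use a write-mostly disk as a last resort */
			if (best_disk < 0)
				best_disk = disk;
			continue;
		}
		uint64_t dist = this_sector > m[disk].head_position ?
			this_sector - m[disk].head_position :
			m[disk].head_position - this_sector;
		/* head already there, disk idle, or forced first choice: stop */
		if (choose_first || dist == 0 || m[disk].nr_pending == 0) {
			best_disk = disk;
			break;
		}
		if (dist < best_dist) {
			best_dist = dist;
			best_disk = disk;
		}
	}
	return best_disk;
}

int main(void)
{
	struct mirror m[3] = {
		{ .in_sync = true, .nr_pending = 2, .head_position = 1000 },
		{ .in_sync = true, .nr_pending = 3, .head_position = 5000 },
		{ .in_sync = true, .write_mostly = true, .head_position = 4096 },
	};
	printf("chose disk %d\n", pick_read_disk(m, 3, 0, 4096, false));
	return 0;
}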
@@ -534,59 +490,26 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio) | |||
534 | goto retry; | 490 | goto retry; |
535 | } | 491 | } |
536 | conf->next_seq_sect = this_sector + sectors; | 492 | conf->next_seq_sect = this_sector + sectors; |
537 | conf->last_used = new_disk; | 493 | conf->last_used = best_disk; |
538 | } | 494 | } |
539 | rcu_read_unlock(); | 495 | rcu_read_unlock(); |
540 | 496 | ||
541 | return new_disk; | 497 | return best_disk; |
542 | } | 498 | } |
543 | 499 | ||
544 | static void unplug_slaves(mddev_t *mddev) | 500 | int md_raid1_congested(mddev_t *mddev, int bits) |
545 | { | 501 | { |
546 | conf_t *conf = mddev->private; | 502 | conf_t *conf = mddev->private; |
547 | int i; | ||
548 | |||
549 | rcu_read_lock(); | ||
550 | for (i=0; i<mddev->raid_disks; i++) { | ||
551 | mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); | ||
552 | if (rdev && !test_bit(Faulty, &rdev->flags) && atomic_read(&rdev->nr_pending)) { | ||
553 | struct request_queue *r_queue = bdev_get_queue(rdev->bdev); | ||
554 | |||
555 | atomic_inc(&rdev->nr_pending); | ||
556 | rcu_read_unlock(); | ||
557 | |||
558 | blk_unplug(r_queue); | ||
559 | |||
560 | rdev_dec_pending(rdev, mddev); | ||
561 | rcu_read_lock(); | ||
562 | } | ||
563 | } | ||
564 | rcu_read_unlock(); | ||
565 | } | ||
566 | |||
567 | static void raid1_unplug(struct request_queue *q) | ||
568 | { | ||
569 | mddev_t *mddev = q->queuedata; | ||
570 | |||
571 | unplug_slaves(mddev); | ||
572 | md_wakeup_thread(mddev->thread); | ||
573 | } | ||
574 | |||
575 | static int raid1_congested(void *data, int bits) | ||
576 | { | ||
577 | mddev_t *mddev = data; | ||
578 | conf_t *conf = mddev->private; | ||
579 | int i, ret = 0; | 503 | int i, ret = 0; |
580 | 504 | ||
581 | if (mddev_congested(mddev, bits)) | ||
582 | return 1; | ||
583 | |||
584 | rcu_read_lock(); | 505 | rcu_read_lock(); |
585 | for (i = 0; i < mddev->raid_disks; i++) { | 506 | for (i = 0; i < mddev->raid_disks; i++) { |
586 | mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); | 507 | mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev); |
587 | if (rdev && !test_bit(Faulty, &rdev->flags)) { | 508 | if (rdev && !test_bit(Faulty, &rdev->flags)) { |
588 | struct request_queue *q = bdev_get_queue(rdev->bdev); | 509 | struct request_queue *q = bdev_get_queue(rdev->bdev); |
589 | 510 | ||
511 | BUG_ON(!q); | ||
512 | |||
590 | /* Note the '|| 1' - when read_balance prefers | 513 | /* Note the '|| 1' - when read_balance prefers |
591 | * non-congested targets, it can be removed | 514 | * non-congested targets, it can be removed |
592 | */ | 515 | */ |
@@ -599,22 +522,26 @@ static int raid1_congested(void *data, int bits) | |||
599 | rcu_read_unlock(); | 522 | rcu_read_unlock(); |
600 | return ret; | 523 | return ret; |
601 | } | 524 | } |
525 | EXPORT_SYMBOL_GPL(md_raid1_congested); | ||
602 | 526 | ||
527 | static int raid1_congested(void *data, int bits) | ||
528 | { | ||
529 | mddev_t *mddev = data; | ||
603 | 530 | ||
604 | static int flush_pending_writes(conf_t *conf) | 531 | return mddev_congested(mddev, bits) || |
532 | md_raid1_congested(mddev, bits); | ||
533 | } | ||
534 | |||
535 | static void flush_pending_writes(conf_t *conf) | ||
605 | { | 536 | { |
606 | /* Any writes that have been queued but are awaiting | 537 | /* Any writes that have been queued but are awaiting |
607 | * bitmap updates get flushed here. | 538 | * bitmap updates get flushed here. |
608 | * We return 1 if any requests were actually submitted. | ||
609 | */ | 539 | */ |
610 | int rv = 0; | ||
611 | |||
612 | spin_lock_irq(&conf->device_lock); | 540 | spin_lock_irq(&conf->device_lock); |
613 | 541 | ||
614 | if (conf->pending_bio_list.head) { | 542 | if (conf->pending_bio_list.head) { |
615 | struct bio *bio; | 543 | struct bio *bio; |
616 | bio = bio_list_get(&conf->pending_bio_list); | 544 | bio = bio_list_get(&conf->pending_bio_list); |
617 | blk_remove_plug(conf->mddev->queue); | ||
618 | spin_unlock_irq(&conf->device_lock); | 545 | spin_unlock_irq(&conf->device_lock); |
619 | /* flush any pending bitmap writes to | 546 | /* flush any pending bitmap writes to |
620 | * disk before proceeding w/ I/O */ | 547 | * disk before proceeding w/ I/O */ |
@@ -626,10 +553,8 @@ static int flush_pending_writes(conf_t *conf) | |||
626 | generic_make_request(bio); | 553 | generic_make_request(bio); |
627 | bio = next; | 554 | bio = next; |
628 | } | 555 | } |
629 | rv = 1; | ||
630 | } else | 556 | } else |
631 | spin_unlock_irq(&conf->device_lock); | 557 | spin_unlock_irq(&conf->device_lock); |
632 | return rv; | ||
633 | } | 558 | } |
634 | 559 | ||
635 | /* Barriers.... | 560 | /* Barriers.... |
@@ -661,17 +586,15 @@ static void raise_barrier(conf_t *conf) | |||
661 | 586 | ||
662 | /* Wait until no block IO is waiting */ | 587 | /* Wait until no block IO is waiting */ |
663 | wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting, | 588 | wait_event_lock_irq(conf->wait_barrier, !conf->nr_waiting, |
664 | conf->resync_lock, | 589 | conf->resync_lock, ); |
665 | raid1_unplug(conf->mddev->queue)); | ||
666 | 590 | ||
667 | /* block any new IO from starting */ | 591 | /* block any new IO from starting */ |
668 | conf->barrier++; | 592 | conf->barrier++; |
669 | 593 | ||
670 | /* No wait for all pending IO to complete */ | 594 | /* Now wait for all pending IO to complete */ |
671 | wait_event_lock_irq(conf->wait_barrier, | 595 | wait_event_lock_irq(conf->wait_barrier, |
672 | !conf->nr_pending && conf->barrier < RESYNC_DEPTH, | 596 | !conf->nr_pending && conf->barrier < RESYNC_DEPTH, |
673 | conf->resync_lock, | 597 | conf->resync_lock, ); |
674 | raid1_unplug(conf->mddev->queue)); | ||
675 | 598 | ||
676 | spin_unlock_irq(&conf->resync_lock); | 599 | spin_unlock_irq(&conf->resync_lock); |
677 | } | 600 | } |
@@ -693,7 +616,7 @@ static void wait_barrier(conf_t *conf) | |||
693 | conf->nr_waiting++; | 616 | conf->nr_waiting++; |
694 | wait_event_lock_irq(conf->wait_barrier, !conf->barrier, | 617 | wait_event_lock_irq(conf->wait_barrier, !conf->barrier, |
695 | conf->resync_lock, | 618 | conf->resync_lock, |
696 | raid1_unplug(conf->mddev->queue)); | 619 | ); |
697 | conf->nr_waiting--; | 620 | conf->nr_waiting--; |
698 | } | 621 | } |
699 | conf->nr_pending++; | 622 | conf->nr_pending++; |
@@ -729,8 +652,7 @@ static void freeze_array(conf_t *conf) | |||
729 | wait_event_lock_irq(conf->wait_barrier, | 652 | wait_event_lock_irq(conf->wait_barrier, |
730 | conf->nr_pending == conf->nr_queued+1, | 653 | conf->nr_pending == conf->nr_queued+1, |
731 | conf->resync_lock, | 654 | conf->resync_lock, |
732 | ({ flush_pending_writes(conf); | 655 | flush_pending_writes(conf)); |
733 | raid1_unplug(conf->mddev->queue); })); | ||
734 | spin_unlock_irq(&conf->resync_lock); | 656 | spin_unlock_irq(&conf->resync_lock); |
735 | } | 657 | } |
736 | static void unfreeze_array(conf_t *conf) | 658 | static void unfreeze_array(conf_t *conf) |
@@ -744,15 +666,16 @@ static void unfreeze_array(conf_t *conf) | |||
744 | } | 666 | } |
745 | 667 | ||
746 | 668 | ||
747 | /* duplicate the data pages for behind I/O */ | 669 | /* duplicate the data pages for behind I/O |
748 | static struct page **alloc_behind_pages(struct bio *bio) | 670 | */ |
671 | static void alloc_behind_pages(struct bio *bio, r1bio_t *r1_bio) | ||
749 | { | 672 | { |
750 | int i; | 673 | int i; |
751 | struct bio_vec *bvec; | 674 | struct bio_vec *bvec; |
752 | struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page *), | 675 | struct page **pages = kzalloc(bio->bi_vcnt * sizeof(struct page*), |
753 | GFP_NOIO); | 676 | GFP_NOIO); |
754 | if (unlikely(!pages)) | 677 | if (unlikely(!pages)) |
755 | goto do_sync_io; | 678 | return; |
756 | 679 | ||
757 | bio_for_each_segment(bvec, bio, i) { | 680 | bio_for_each_segment(bvec, bio, i) { |
758 | pages[i] = alloc_page(GFP_NOIO); | 681 | pages[i] = alloc_page(GFP_NOIO); |
@@ -763,16 +686,17 @@ static struct page **alloc_behind_pages(struct bio *bio) | |||
763 | kunmap(pages[i]); | 686 | kunmap(pages[i]); |
764 | kunmap(bvec->bv_page); | 687 | kunmap(bvec->bv_page); |
765 | } | 688 | } |
766 | 689 | r1_bio->behind_pages = pages; | |
767 | return pages; | 690 | r1_bio->behind_page_count = bio->bi_vcnt; |
691 | set_bit(R1BIO_BehindIO, &r1_bio->state); | ||
692 | return; | ||
768 | 693 | ||
769 | do_sync_io: | 694 | do_sync_io: |
770 | if (pages) | 695 | for (i = 0; i < bio->bi_vcnt; i++) |
771 | for (i = 0; i < bio->bi_vcnt && pages[i]; i++) | 696 | if (pages[i]) |
772 | put_page(pages[i]); | 697 | put_page(pages[i]); |
773 | kfree(pages); | 698 | kfree(pages); |
774 | PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); | 699 | PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); |
775 | return NULL; | ||
776 | } | 700 | } |
777 | 701 | ||
778 | static int make_request(mddev_t *mddev, struct bio * bio) | 702 | static int make_request(mddev_t *mddev, struct bio * bio) |
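Note: alloc_behind_pages() now records the duplicated pages and their count in the r1bio itself (behind_pages / behind_page_count) instead of returning the array, which is what lets r1_bio_write_done() free them without consulting a bio's bi_vcnt. A minimal userspace model of that duplicate-then-free lifecycle, with malloc/memcpy standing in for alloc_page() and the kmap copies; types and names are illustrative only.

#include <stdlib.h>
#include <string.h>

struct write_req {
	int behind_page_count;
	void **behind_pages;	/* private copies of the caller's data */
};

/* Duplicate the caller's buffers so the original request can be
 * acknowledged before the slow (write-behind) target has finished. */
static void alloc_behind_copies(struct write_req *req, char **src,
				size_t page_size, int n)
{
	void **pages = calloc(n, sizeof(*pages));
	if (!pages)
		return;		/* fall back to fully synchronous writes */
	for (int i = 0; i < n; i++) {
		pages[i] = malloc(page_size);
		if (!pages[i])
			goto fail;
		memcpy(pages[i], src[i], page_size);
	}
	req->behind_pages = pages;
	req->behind_page_count = n;
	return;
fail:
	for (int i = 0; i < n; i++)
		free(pages[i]);
	free(pages);
}

/* Mirrors the cleanup now done in r1_bio_write_done(). */
static void free_behind_copies(struct write_req *req)
{
	for (int i = 0; i < req->behind_page_count; i++)
		free(req->behind_pages[i]);
	free(req->behind_pages);
	req->behind_pages = NULL;
	req->behind_page_count = 0;
}

int main(void)
{
	char page[8] = "rawdata";
	char *src[1] = { page };
	struct write_req req = { 0 };

	alloc_behind_copies(&req, src, sizeof(page), 1);
	/* ... master request could be acknowledged here ... */
	free_behind_copies(&req);
	return 0;
}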
@@ -784,20 +708,16 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
784 | int i, targets = 0, disks; | 708 | int i, targets = 0, disks; |
785 | struct bitmap *bitmap; | 709 | struct bitmap *bitmap; |
786 | unsigned long flags; | 710 | unsigned long flags; |
787 | struct bio_list bl; | ||
788 | struct page **behind_pages = NULL; | ||
789 | const int rw = bio_data_dir(bio); | 711 | const int rw = bio_data_dir(bio); |
790 | const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); | 712 | const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); |
791 | unsigned long do_barriers; | 713 | const unsigned long do_flush_fua = (bio->bi_rw & (REQ_FLUSH | REQ_FUA)); |
792 | mdk_rdev_t *blocked_rdev; | 714 | mdk_rdev_t *blocked_rdev; |
715 | int plugged; | ||
793 | 716 | ||
794 | /* | 717 | /* |
795 | * Register the new request and wait if the reconstruction | 718 | * Register the new request and wait if the reconstruction |
796 | * thread has put up a bar for new requests. | 719 | * thread has put up a bar for new requests. |
797 | * Continue immediately if no resync is active currently. | 720 | * Continue immediately if no resync is active currently. |
798 | * We test barriers_work *after* md_write_start as md_write_start | ||
799 | * may cause the first superblock write, and that will check out | ||
800 | * if barriers work. | ||
801 | */ | 721 | */ |
802 | 722 | ||
803 | md_write_start(mddev, bio); /* wait on superblock update early */ | 723 | md_write_start(mddev, bio); /* wait on superblock update early */ |
@@ -821,13 +741,6 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
821 | } | 741 | } |
822 | finish_wait(&conf->wait_barrier, &w); | 742 | finish_wait(&conf->wait_barrier, &w); |
823 | } | 743 | } |
824 | if (unlikely(!mddev->barriers_work && | ||
825 | (bio->bi_rw & REQ_HARDBARRIER))) { | ||
826 | if (rw == WRITE) | ||
827 | md_write_end(mddev); | ||
828 | bio_endio(bio, -EOPNOTSUPP); | ||
829 | return 0; | ||
830 | } | ||
831 | 744 | ||
832 | wait_barrier(conf); | 745 | wait_barrier(conf); |
833 | 746 | ||
@@ -870,7 +783,7 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
870 | } | 783 | } |
871 | r1_bio->read_disk = rdisk; | 784 | r1_bio->read_disk = rdisk; |
872 | 785 | ||
873 | read_bio = bio_clone(bio, GFP_NOIO); | 786 | read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); |
874 | 787 | ||
875 | r1_bio->bios[rdisk] = read_bio; | 788 | r1_bio->bios[rdisk] = read_bio; |
876 | 789 | ||
@@ -891,14 +804,9 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
891 | * inc refcount on their rdev. Record them by setting | 804 | * inc refcount on their rdev. Record them by setting |
892 | * bios[x] to bio | 805 | * bios[x] to bio |
893 | */ | 806 | */ |
807 | plugged = mddev_check_plugged(mddev); | ||
808 | |||
894 | disks = conf->raid_disks; | 809 | disks = conf->raid_disks; |
895 | #if 0 | ||
896 | { static int first=1; | ||
897 | if (first) printk("First Write sector %llu disks %d\n", | ||
898 | (unsigned long long)r1_bio->sector, disks); | ||
899 | first = 0; | ||
900 | } | ||
901 | #endif | ||
902 | retry_write: | 810 | retry_write: |
903 | blocked_rdev = NULL; | 811 | blocked_rdev = NULL; |
904 | rcu_read_lock(); | 812 | rcu_read_lock(); |
@@ -952,33 +860,29 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
952 | if (bitmap && | 860 | if (bitmap && |
953 | (atomic_read(&bitmap->behind_writes) | 861 | (atomic_read(&bitmap->behind_writes) |
954 | < mddev->bitmap_info.max_write_behind) && | 862 | < mddev->bitmap_info.max_write_behind) && |
955 | !waitqueue_active(&bitmap->behind_wait) && | 863 | !waitqueue_active(&bitmap->behind_wait)) |
956 | (behind_pages = alloc_behind_pages(bio)) != NULL) | 864 | alloc_behind_pages(bio, r1_bio); |
957 | set_bit(R1BIO_BehindIO, &r1_bio->state); | ||
958 | 865 | ||
959 | atomic_set(&r1_bio->remaining, 0); | 866 | atomic_set(&r1_bio->remaining, 1); |
960 | atomic_set(&r1_bio->behind_remaining, 0); | 867 | atomic_set(&r1_bio->behind_remaining, 0); |
961 | 868 | ||
962 | do_barriers = bio->bi_rw & REQ_HARDBARRIER; | 869 | bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors, |
963 | if (do_barriers) | 870 | test_bit(R1BIO_BehindIO, &r1_bio->state)); |
964 | set_bit(R1BIO_Barrier, &r1_bio->state); | ||
965 | |||
966 | bio_list_init(&bl); | ||
967 | for (i = 0; i < disks; i++) { | 871 | for (i = 0; i < disks; i++) { |
968 | struct bio *mbio; | 872 | struct bio *mbio; |
969 | if (!r1_bio->bios[i]) | 873 | if (!r1_bio->bios[i]) |
970 | continue; | 874 | continue; |
971 | 875 | ||
972 | mbio = bio_clone(bio, GFP_NOIO); | 876 | mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); |
973 | r1_bio->bios[i] = mbio; | 877 | r1_bio->bios[i] = mbio; |
974 | 878 | ||
975 | mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; | 879 | mbio->bi_sector = r1_bio->sector + conf->mirrors[i].rdev->data_offset; |
976 | mbio->bi_bdev = conf->mirrors[i].rdev->bdev; | 880 | mbio->bi_bdev = conf->mirrors[i].rdev->bdev; |
977 | mbio->bi_end_io = raid1_end_write_request; | 881 | mbio->bi_end_io = raid1_end_write_request; |
978 | mbio->bi_rw = WRITE | do_barriers | do_sync; | 882 | mbio->bi_rw = WRITE | do_flush_fua | do_sync; |
979 | mbio->bi_private = r1_bio; | 883 | mbio->bi_private = r1_bio; |
980 | 884 | ||
981 | if (behind_pages) { | 885 | if (r1_bio->behind_pages) { |
982 | struct bio_vec *bvec; | 886 | struct bio_vec *bvec; |
983 | int j; | 887 | int j; |
984 | 888 | ||
@@ -986,39 +890,27 @@ static int make_request(mddev_t *mddev, struct bio * bio) | |||
986 | * we clear any unused pointer in the io_vec, rather | 890 | * we clear any unused pointer in the io_vec, rather |
987 | * than leave them unchanged. This is important | 891 | * than leave them unchanged. This is important |
988 | * because when we come to free the pages, we won't | 892 | * because when we come to free the pages, we won't |
989 | * know the originial bi_idx, so we just free | 893 | * know the original bi_idx, so we just free |
990 | * them all | 894 | * them all |
991 | */ | 895 | */ |
992 | __bio_for_each_segment(bvec, mbio, j, 0) | 896 | __bio_for_each_segment(bvec, mbio, j, 0) |
993 | bvec->bv_page = behind_pages[j]; | 897 | bvec->bv_page = r1_bio->behind_pages[j]; |
994 | if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) | 898 | if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) |
995 | atomic_inc(&r1_bio->behind_remaining); | 899 | atomic_inc(&r1_bio->behind_remaining); |
996 | } | 900 | } |
997 | 901 | ||
998 | atomic_inc(&r1_bio->remaining); | 902 | atomic_inc(&r1_bio->remaining); |
999 | 903 | spin_lock_irqsave(&conf->device_lock, flags); | |
1000 | bio_list_add(&bl, mbio); | 904 | bio_list_add(&conf->pending_bio_list, mbio); |
905 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
1001 | } | 906 | } |
1002 | kfree(behind_pages); /* the behind pages are attached to the bios now */ | 907 | r1_bio_write_done(r1_bio); |
1003 | |||
1004 | bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors, | ||
1005 | test_bit(R1BIO_BehindIO, &r1_bio->state)); | ||
1006 | spin_lock_irqsave(&conf->device_lock, flags); | ||
1007 | bio_list_merge(&conf->pending_bio_list, &bl); | ||
1008 | bio_list_init(&bl); | ||
1009 | 908 | ||
1010 | blk_plug_device(mddev->queue); | 909 | /* In case raid1d snuck in to freeze_array */ |
1011 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
1012 | |||
1013 | /* In case raid1d snuck into freeze_array */ | ||
1014 | wake_up(&conf->wait_barrier); | 910 | wake_up(&conf->wait_barrier); |
1015 | 911 | ||
1016 | if (do_sync) | 912 | if (do_sync || !bitmap || !plugged) |
1017 | md_wakeup_thread(mddev->thread); | 913 | md_wakeup_thread(mddev->thread); |
1018 | #if 0 | ||
1019 | while ((bio = bio_list_pop(&bl)) != NULL) | ||
1020 | generic_make_request(bio); | ||
1021 | #endif | ||
1022 | 914 | ||
1023 | return 0; | 915 | return 0; |
1024 | } | 916 | } |
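Note: make_request() now initialises r1_bio->remaining to 1 instead of 0 and drops that extra reference itself via the final r1_bio_write_done() call, so a fast per-mirror end_io cannot complete the request while later mirrors are still being queued. A small C11 model of that bias pattern; the names are illustrative and the demo is single-threaded.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct req {
	atomic_int remaining;
	bool done;
};

/* Shared completion path: the last reference dropped finishes the
 * request (models r1_bio_write_done()). */
static void write_done(struct req *r)
{
	if (atomic_fetch_sub(&r->remaining, 1) == 1) {
		r->done = true;
		printf("request completed\n");
	}
}

int main(void)
{
	struct req r = { .remaining = 1, .done = false };	/* the +1 bias */
	int mirrors = 3;

	/* submitter: take one reference per mirror write */
	for (int i = 0; i < mirrors; i++)
		atomic_fetch_add(&r.remaining, 1);

	/* per-mirror completions may fire at any point from here on;
	 * even if all three finish before the submitter does, the
	 * initial bias keeps the request alive */
	for (int i = 0; i < mirrors; i++)
		write_done(&r);

	/* submitter drops its own reference last, as make_request() now does */
	write_done(&r);
	return 0;
}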
@@ -1076,8 +968,9 @@ static void error(mddev_t *mddev, mdk_rdev_t *rdev) | |||
1076 | } else | 968 | } else |
1077 | set_bit(Faulty, &rdev->flags); | 969 | set_bit(Faulty, &rdev->flags); |
1078 | set_bit(MD_CHANGE_DEVS, &mddev->flags); | 970 | set_bit(MD_CHANGE_DEVS, &mddev->flags); |
1079 | printk(KERN_ALERT "md/raid1:%s: Disk failure on %s, disabling device.\n" | 971 | printk(KERN_ALERT |
1080 | KERN_ALERT "md/raid1:%s: Operation continuing on %d devices.\n", | 972 | "md/raid1:%s: Disk failure on %s, disabling device.\n" |
973 | "md/raid1:%s: Operation continuing on %d devices.\n", | ||
1081 | mdname(mddev), bdevname(rdev->bdev, b), | 974 | mdname(mddev), bdevname(rdev->bdev, b), |
1082 | mdname(mddev), conf->raid_disks - mddev->degraded); | 975 | mdname(mddev), conf->raid_disks - mddev->degraded); |
1083 | } | 976 | } |
@@ -1206,10 +1099,11 @@ static int raid1_remove_disk(mddev_t *mddev, int number) | |||
1206 | err = -EBUSY; | 1099 | err = -EBUSY; |
1207 | goto abort; | 1100 | goto abort; |
1208 | } | 1101 | } |
1209 | /* Only remove non-faulty devices is recovery | 1102 | /* Only remove non-faulty devices if recovery |
1210 | * is not possible. | 1103 | * is not possible. |
1211 | */ | 1104 | */ |
1212 | if (!test_bit(Faulty, &rdev->flags) && | 1105 | if (!test_bit(Faulty, &rdev->flags) && |
1106 | !mddev->recovery_disabled && | ||
1213 | mddev->degraded < conf->raid_disks) { | 1107 | mddev->degraded < conf->raid_disks) { |
1214 | err = -EBUSY; | 1108 | err = -EBUSY; |
1215 | goto abort; | 1109 | goto abort; |
@@ -1222,7 +1116,7 @@ static int raid1_remove_disk(mddev_t *mddev, int number) | |||
1222 | p->rdev = rdev; | 1116 | p->rdev = rdev; |
1223 | goto abort; | 1117 | goto abort; |
1224 | } | 1118 | } |
1225 | md_integrity_register(mddev); | 1119 | err = md_integrity_register(mddev); |
1226 | } | 1120 | } |
1227 | abort: | 1121 | abort: |
1228 | 1122 | ||
@@ -1268,7 +1162,7 @@ static void end_sync_write(struct bio *bio, int error) | |||
1268 | break; | 1162 | break; |
1269 | } | 1163 | } |
1270 | if (!uptodate) { | 1164 | if (!uptodate) { |
1271 | int sync_blocks = 0; | 1165 | sector_t sync_blocks = 0; |
1272 | sector_t s = r1_bio->sector; | 1166 | sector_t s = r1_bio->sector; |
1273 | long sectors_to_go = r1_bio->sectors; | 1167 | long sectors_to_go = r1_bio->sectors; |
1274 | /* make sure these bits doesn't get cleared. */ | 1168 | /* make sure these bits doesn't get cleared. */ |
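Note: sync_blocks becomes sector_t here (and in sync_request() further down), presumably because the bitmap sync helpers take a sector_t count, so an int risks both a mismatched pointer argument and truncation of large block counts. A tiny userspace demonstration of the truncation hazard, with sector_t modelled as a 64-bit unsigned:

#include <stdio.h>
#include <stdint.h>

typedef uint64_t sector_t;	/* 64-bit, as on large-block-device kernels */

int main(void)
{
	/* ~3 TiB worth of 512-byte sectors */
	sector_t blocks = 6ULL * 1024 * 1024 * 1024;
	int truncated = (int)blocks;	/* the old 'int sync_blocks' */

	printf("sector_t count: %llu\n", (unsigned long long)blocks);
	printf("as int:         %d\n", truncated);
	return 0;
}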
@@ -1290,194 +1184,210 @@ static void end_sync_write(struct bio *bio, int error) | |||
1290 | } | 1184 | } |
1291 | } | 1185 | } |
1292 | 1186 | ||
1293 | static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) | 1187 | static int fix_sync_read_error(r1bio_t *r1_bio) |
1294 | { | 1188 | { |
1189 | /* Try some synchronous reads of other devices to get | ||
1190 | * good data, much like with normal read errors. Only | ||
1191 | * read into the pages we already have so we don't | ||
1192 | * need to re-issue the read request. | ||
1193 | * We don't need to freeze the array, because being in an | ||
1194 | * active sync request, there is no normal IO, and | ||
1195 | * no overlapping syncs. | ||
1196 | */ | ||
1197 | mddev_t *mddev = r1_bio->mddev; | ||
1295 | conf_t *conf = mddev->private; | 1198 | conf_t *conf = mddev->private; |
1296 | int i; | 1199 | struct bio *bio = r1_bio->bios[r1_bio->read_disk]; |
1297 | int disks = conf->raid_disks; | 1200 | sector_t sect = r1_bio->sector; |
1298 | struct bio *bio, *wbio; | 1201 | int sectors = r1_bio->sectors; |
1299 | 1202 | int idx = 0; | |
1300 | bio = r1_bio->bios[r1_bio->read_disk]; | ||
1301 | 1203 | ||
1204 | while(sectors) { | ||
1205 | int s = sectors; | ||
1206 | int d = r1_bio->read_disk; | ||
1207 | int success = 0; | ||
1208 | mdk_rdev_t *rdev; | ||
1209 | int start; | ||
1302 | 1210 | ||
1303 | if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { | 1211 | if (s > (PAGE_SIZE>>9)) |
1304 | /* We have read all readable devices. If we haven't | 1212 | s = PAGE_SIZE >> 9; |
1305 | * got the block, then there is no hope left. | 1213 | do { |
1306 | * If we have, then we want to do a comparison | 1214 | if (r1_bio->bios[d]->bi_end_io == end_sync_read) { |
1307 | * and skip the write if everything is the same. | 1215 | /* No rcu protection needed here devices |
1308 | * If any blocks failed to read, then we need to | 1216 | * can only be removed when no resync is |
1309 | * attempt an over-write | 1217 | * active, and resync is currently active |
1310 | */ | 1218 | */ |
1311 | int primary; | 1219 | rdev = conf->mirrors[d].rdev; |
1312 | if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) { | 1220 | if (sync_page_io(rdev, |
1313 | for (i=0; i<mddev->raid_disks; i++) | 1221 | sect, |
1314 | if (r1_bio->bios[i]->bi_end_io == end_sync_read) | 1222 | s<<9, |
1315 | md_error(mddev, conf->mirrors[i].rdev); | 1223 | bio->bi_io_vec[idx].bv_page, |
1224 | READ, false)) { | ||
1225 | success = 1; | ||
1226 | break; | ||
1227 | } | ||
1228 | } | ||
1229 | d++; | ||
1230 | if (d == conf->raid_disks) | ||
1231 | d = 0; | ||
1232 | } while (!success && d != r1_bio->read_disk); | ||
1316 | 1233 | ||
1317 | md_done_sync(mddev, r1_bio->sectors, 1); | 1234 | if (!success) { |
1235 | char b[BDEVNAME_SIZE]; | ||
1236 | /* Cannot read from anywhere, array is toast */ | ||
1237 | md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); | ||
1238 | printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error" | ||
1239 | " for block %llu\n", | ||
1240 | mdname(mddev), | ||
1241 | bdevname(bio->bi_bdev, b), | ||
1242 | (unsigned long long)r1_bio->sector); | ||
1243 | md_done_sync(mddev, r1_bio->sectors, 0); | ||
1318 | put_buf(r1_bio); | 1244 | put_buf(r1_bio); |
1319 | return; | 1245 | return 0; |
1320 | } | 1246 | } |
1321 | for (primary=0; primary<mddev->raid_disks; primary++) | ||
1322 | if (r1_bio->bios[primary]->bi_end_io == end_sync_read && | ||
1323 | test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) { | ||
1324 | r1_bio->bios[primary]->bi_end_io = NULL; | ||
1325 | rdev_dec_pending(conf->mirrors[primary].rdev, mddev); | ||
1326 | break; | ||
1327 | } | ||
1328 | r1_bio->read_disk = primary; | ||
1329 | for (i=0; i<mddev->raid_disks; i++) | ||
1330 | if (r1_bio->bios[i]->bi_end_io == end_sync_read) { | ||
1331 | int j; | ||
1332 | int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9); | ||
1333 | struct bio *pbio = r1_bio->bios[primary]; | ||
1334 | struct bio *sbio = r1_bio->bios[i]; | ||
1335 | |||
1336 | if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) { | ||
1337 | for (j = vcnt; j-- ; ) { | ||
1338 | struct page *p, *s; | ||
1339 | p = pbio->bi_io_vec[j].bv_page; | ||
1340 | s = sbio->bi_io_vec[j].bv_page; | ||
1341 | if (memcmp(page_address(p), | ||
1342 | page_address(s), | ||
1343 | PAGE_SIZE)) | ||
1344 | break; | ||
1345 | } | ||
1346 | } else | ||
1347 | j = 0; | ||
1348 | if (j >= 0) | ||
1349 | mddev->resync_mismatches += r1_bio->sectors; | ||
1350 | if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery) | ||
1351 | && test_bit(BIO_UPTODATE, &sbio->bi_flags))) { | ||
1352 | sbio->bi_end_io = NULL; | ||
1353 | rdev_dec_pending(conf->mirrors[i].rdev, mddev); | ||
1354 | } else { | ||
1355 | /* fixup the bio for reuse */ | ||
1356 | int size; | ||
1357 | sbio->bi_vcnt = vcnt; | ||
1358 | sbio->bi_size = r1_bio->sectors << 9; | ||
1359 | sbio->bi_idx = 0; | ||
1360 | sbio->bi_phys_segments = 0; | ||
1361 | sbio->bi_flags &= ~(BIO_POOL_MASK - 1); | ||
1362 | sbio->bi_flags |= 1 << BIO_UPTODATE; | ||
1363 | sbio->bi_next = NULL; | ||
1364 | sbio->bi_sector = r1_bio->sector + | ||
1365 | conf->mirrors[i].rdev->data_offset; | ||
1366 | sbio->bi_bdev = conf->mirrors[i].rdev->bdev; | ||
1367 | size = sbio->bi_size; | ||
1368 | for (j = 0; j < vcnt ; j++) { | ||
1369 | struct bio_vec *bi; | ||
1370 | bi = &sbio->bi_io_vec[j]; | ||
1371 | bi->bv_offset = 0; | ||
1372 | if (size > PAGE_SIZE) | ||
1373 | bi->bv_len = PAGE_SIZE; | ||
1374 | else | ||
1375 | bi->bv_len = size; | ||
1376 | size -= PAGE_SIZE; | ||
1377 | memcpy(page_address(bi->bv_page), | ||
1378 | page_address(pbio->bi_io_vec[j].bv_page), | ||
1379 | PAGE_SIZE); | ||
1380 | } | ||
1381 | 1247 | ||
1382 | } | 1248 | start = d; |
1383 | } | 1249 | /* write it back and re-read */ |
1250 | while (d != r1_bio->read_disk) { | ||
1251 | if (d == 0) | ||
1252 | d = conf->raid_disks; | ||
1253 | d--; | ||
1254 | if (r1_bio->bios[d]->bi_end_io != end_sync_read) | ||
1255 | continue; | ||
1256 | rdev = conf->mirrors[d].rdev; | ||
1257 | if (sync_page_io(rdev, | ||
1258 | sect, | ||
1259 | s<<9, | ||
1260 | bio->bi_io_vec[idx].bv_page, | ||
1261 | WRITE, false) == 0) { | ||
1262 | r1_bio->bios[d]->bi_end_io = NULL; | ||
1263 | rdev_dec_pending(rdev, mddev); | ||
1264 | md_error(mddev, rdev); | ||
1265 | } else | ||
1266 | atomic_add(s, &rdev->corrected_errors); | ||
1267 | } | ||
1268 | d = start; | ||
1269 | while (d != r1_bio->read_disk) { | ||
1270 | if (d == 0) | ||
1271 | d = conf->raid_disks; | ||
1272 | d--; | ||
1273 | if (r1_bio->bios[d]->bi_end_io != end_sync_read) | ||
1274 | continue; | ||
1275 | rdev = conf->mirrors[d].rdev; | ||
1276 | if (sync_page_io(rdev, | ||
1277 | sect, | ||
1278 | s<<9, | ||
1279 | bio->bi_io_vec[idx].bv_page, | ||
1280 | READ, false) == 0) | ||
1281 | md_error(mddev, rdev); | ||
1282 | } | ||
1283 | sectors -= s; | ||
1284 | sect += s; | ||
1285 | idx ++; | ||
1384 | } | 1286 | } |
1385 | if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) { | 1287 | set_bit(R1BIO_Uptodate, &r1_bio->state); |
1386 | /* ouch - failed to read all of that. | 1288 | set_bit(BIO_UPTODATE, &bio->bi_flags); |
1387 | * Try some synchronous reads of other devices to get | 1289 | return 1; |
1388 | * good data, much like with normal read errors. Only | 1290 | } |
1389 | * read into the pages we already have so we don't | 1291 | |
1390 | * need to re-issue the read request. | 1292 | static int process_checks(r1bio_t *r1_bio) |
1391 | * We don't need to freeze the array, because being in an | 1293 | { |
1392 | * active sync request, there is no normal IO, and | 1294 | /* We have read all readable devices. If we haven't |
1393 | * no overlapping syncs. | 1295 | * got the block, then there is no hope left. |
1394 | */ | 1296 | * If we have, then we want to do a comparison |
1395 | sector_t sect = r1_bio->sector; | 1297 | * and skip the write if everything is the same. |
1396 | int sectors = r1_bio->sectors; | 1298 | * If any blocks failed to read, then we need to |
1397 | int idx = 0; | 1299 | * attempt an over-write |
1398 | 1300 | */ | |
1399 | while(sectors) { | 1301 | mddev_t *mddev = r1_bio->mddev; |
1400 | int s = sectors; | 1302 | conf_t *conf = mddev->private; |
1401 | int d = r1_bio->read_disk; | 1303 | int primary; |
1402 | int success = 0; | 1304 | int i; |
1403 | mdk_rdev_t *rdev; | 1305 | |
1404 | 1306 | for (primary = 0; primary < conf->raid_disks; primary++) | |
1405 | if (s > (PAGE_SIZE>>9)) | 1307 | if (r1_bio->bios[primary]->bi_end_io == end_sync_read && |
1406 | s = PAGE_SIZE >> 9; | 1308 | test_bit(BIO_UPTODATE, &r1_bio->bios[primary]->bi_flags)) { |
1407 | do { | 1309 | r1_bio->bios[primary]->bi_end_io = NULL; |
1408 | if (r1_bio->bios[d]->bi_end_io == end_sync_read) { | 1310 | rdev_dec_pending(conf->mirrors[primary].rdev, mddev); |
1409 | /* No rcu protection needed here devices | 1311 | break; |
1410 | * can only be removed when no resync is | 1312 | } |
1411 | * active, and resync is currently active | 1313 | r1_bio->read_disk = primary; |
1412 | */ | 1314 | for (i = 0; i < conf->raid_disks; i++) { |
1413 | rdev = conf->mirrors[d].rdev; | 1315 | int j; |
1414 | if (sync_page_io(rdev->bdev, | 1316 | int vcnt = r1_bio->sectors >> (PAGE_SHIFT- 9); |
1415 | sect + rdev->data_offset, | 1317 | struct bio *pbio = r1_bio->bios[primary]; |
1416 | s<<9, | 1318 | struct bio *sbio = r1_bio->bios[i]; |
1417 | bio->bi_io_vec[idx].bv_page, | 1319 | int size; |
1418 | READ)) { | 1320 | |
1419 | success = 1; | 1321 | if (r1_bio->bios[i]->bi_end_io != end_sync_read) |
1420 | break; | 1322 | continue; |
1421 | } | 1323 | |
1422 | } | 1324 | if (test_bit(BIO_UPTODATE, &sbio->bi_flags)) { |
1423 | d++; | 1325 | for (j = vcnt; j-- ; ) { |
1424 | if (d == conf->raid_disks) | 1326 | struct page *p, *s; |
1425 | d = 0; | 1327 | p = pbio->bi_io_vec[j].bv_page; |
1426 | } while (!success && d != r1_bio->read_disk); | 1328 | s = sbio->bi_io_vec[j].bv_page; |
1427 | 1329 | if (memcmp(page_address(p), | |
1428 | if (success) { | 1330 | page_address(s), |
1429 | int start = d; | 1331 | PAGE_SIZE)) |
1430 | /* write it back and re-read */ | 1332 | break; |
1431 | set_bit(R1BIO_Uptodate, &r1_bio->state); | ||
1432 | while (d != r1_bio->read_disk) { | ||
1433 | if (d == 0) | ||
1434 | d = conf->raid_disks; | ||
1435 | d--; | ||
1436 | if (r1_bio->bios[d]->bi_end_io != end_sync_read) | ||
1437 | continue; | ||
1438 | rdev = conf->mirrors[d].rdev; | ||
1439 | atomic_add(s, &rdev->corrected_errors); | ||
1440 | if (sync_page_io(rdev->bdev, | ||
1441 | sect + rdev->data_offset, | ||
1442 | s<<9, | ||
1443 | bio->bi_io_vec[idx].bv_page, | ||
1444 | WRITE) == 0) | ||
1445 | md_error(mddev, rdev); | ||
1446 | } | ||
1447 | d = start; | ||
1448 | while (d != r1_bio->read_disk) { | ||
1449 | if (d == 0) | ||
1450 | d = conf->raid_disks; | ||
1451 | d--; | ||
1452 | if (r1_bio->bios[d]->bi_end_io != end_sync_read) | ||
1453 | continue; | ||
1454 | rdev = conf->mirrors[d].rdev; | ||
1455 | if (sync_page_io(rdev->bdev, | ||
1456 | sect + rdev->data_offset, | ||
1457 | s<<9, | ||
1458 | bio->bi_io_vec[idx].bv_page, | ||
1459 | READ) == 0) | ||
1460 | md_error(mddev, rdev); | ||
1461 | } | ||
1462 | } else { | ||
1463 | char b[BDEVNAME_SIZE]; | ||
1464 | /* Cannot read from anywhere, array is toast */ | ||
1465 | md_error(mddev, conf->mirrors[r1_bio->read_disk].rdev); | ||
1466 | printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O read error" | ||
1467 | " for block %llu\n", | ||
1468 | mdname(mddev), | ||
1469 | bdevname(bio->bi_bdev, b), | ||
1470 | (unsigned long long)r1_bio->sector); | ||
1471 | md_done_sync(mddev, r1_bio->sectors, 0); | ||
1472 | put_buf(r1_bio); | ||
1473 | return; | ||
1474 | } | 1333 | } |
1475 | sectors -= s; | 1334 | } else |
1476 | sect += s; | 1335 | j = 0; |
1477 | idx ++; | 1336 | if (j >= 0) |
1337 | mddev->resync_mismatches += r1_bio->sectors; | ||
1338 | if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery) | ||
1339 | && test_bit(BIO_UPTODATE, &sbio->bi_flags))) { | ||
1340 | /* No need to write to this device. */ | ||
1341 | sbio->bi_end_io = NULL; | ||
1342 | rdev_dec_pending(conf->mirrors[i].rdev, mddev); | ||
1343 | continue; | ||
1344 | } | ||
1345 | /* fixup the bio for reuse */ | ||
1346 | sbio->bi_vcnt = vcnt; | ||
1347 | sbio->bi_size = r1_bio->sectors << 9; | ||
1348 | sbio->bi_idx = 0; | ||
1349 | sbio->bi_phys_segments = 0; | ||
1350 | sbio->bi_flags &= ~(BIO_POOL_MASK - 1); | ||
1351 | sbio->bi_flags |= 1 << BIO_UPTODATE; | ||
1352 | sbio->bi_next = NULL; | ||
1353 | sbio->bi_sector = r1_bio->sector + | ||
1354 | conf->mirrors[i].rdev->data_offset; | ||
1355 | sbio->bi_bdev = conf->mirrors[i].rdev->bdev; | ||
1356 | size = sbio->bi_size; | ||
1357 | for (j = 0; j < vcnt ; j++) { | ||
1358 | struct bio_vec *bi; | ||
1359 | bi = &sbio->bi_io_vec[j]; | ||
1360 | bi->bv_offset = 0; | ||
1361 | if (size > PAGE_SIZE) | ||
1362 | bi->bv_len = PAGE_SIZE; | ||
1363 | else | ||
1364 | bi->bv_len = size; | ||
1365 | size -= PAGE_SIZE; | ||
1366 | memcpy(page_address(bi->bv_page), | ||
1367 | page_address(pbio->bi_io_vec[j].bv_page), | ||
1368 | PAGE_SIZE); | ||
1478 | } | 1369 | } |
1479 | } | 1370 | } |
1371 | return 0; | ||
1372 | } | ||
1373 | |||
1374 | static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio) | ||
1375 | { | ||
1376 | conf_t *conf = mddev->private; | ||
1377 | int i; | ||
1378 | int disks = conf->raid_disks; | ||
1379 | struct bio *bio, *wbio; | ||
1380 | |||
1381 | bio = r1_bio->bios[r1_bio->read_disk]; | ||
1480 | 1382 | ||
1383 | if (!test_bit(R1BIO_Uptodate, &r1_bio->state)) | ||
1384 | /* ouch - failed to read all of that. */ | ||
1385 | if (!fix_sync_read_error(r1_bio)) | ||
1386 | return; | ||
1387 | |||
1388 | if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) | ||
1389 | if (process_checks(r1_bio) < 0) | ||
1390 | return; | ||
1481 | /* | 1391 | /* |
1482 | * schedule writes | 1392 | * schedule writes |
1483 | */ | 1393 | */ |
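Note: the old monolithic sync_request_write() is split in the hunk above: fix_sync_read_error() retries the failed read against the other mirrors, then writes the recovered data back and re-reads it, while process_checks() keeps the block-compare path for check/repair runs. The userspace model below only sketches the retry part; the helpers stand in for sync_page_io() (whose signature this patch also changes to take the rdev and a device-relative sector), and the kernel's backwards walk and md_error() handling on failed write-back or re-read are collapsed into a simple push-to-the-other-mirrors pass.

#include <stdio.h>
#include <stdbool.h>

#define NDISKS 3

/* Stand-ins for synchronous single-page I/O against mirror 'd'
 * (sync_page_io() in the kernel); here disk 0 simply fails reads. */
static bool mirror_read(int d)  { return d != 0; }
static bool mirror_write(int d) { (void)d; return true; }

/* Simplified model of fix_sync_read_error(): starting at the disk
 * that failed, try the other mirrors until one read succeeds, then
 * push the good data back to the others and re-read it there. */
static bool fix_read_error(int read_disk)
{
	int d = read_disk;
	bool success = false;

	do {
		if (mirror_read(d)) {
			success = true;
			break;
		}
		d = (d + 1) % NDISKS;
	} while (d != read_disk);

	if (!success)
		return false;	/* nowhere to read from: array is toast */

	for (int other = 0; other < NDISKS; other++) {
		if (other == d)
			continue;	/* this mirror already has good data */
		if (mirror_write(other))
			mirror_read(other);	/* write back, then verify */
	}
	return true;
}

int main(void)
{
	printf("recovered: %s\n", fix_read_error(0) ? "yes" : "no");
	return 0;
}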
@@ -1536,10 +1446,8 @@ static void fix_read_error(conf_t *conf, int read_disk, | |||
1536 | rdev = conf->mirrors[d].rdev; | 1446 | rdev = conf->mirrors[d].rdev; |
1537 | if (rdev && | 1447 | if (rdev && |
1538 | test_bit(In_sync, &rdev->flags) && | 1448 | test_bit(In_sync, &rdev->flags) && |
1539 | sync_page_io(rdev->bdev, | 1449 | sync_page_io(rdev, sect, s<<9, |
1540 | sect + rdev->data_offset, | 1450 | conf->tmppage, READ, false)) |
1541 | s<<9, | ||
1542 | conf->tmppage, READ)) | ||
1543 | success = 1; | 1451 | success = 1; |
1544 | else { | 1452 | else { |
1545 | d++; | 1453 | d++; |
@@ -1562,9 +1470,8 @@ static void fix_read_error(conf_t *conf, int read_disk, | |||
1562 | rdev = conf->mirrors[d].rdev; | 1470 | rdev = conf->mirrors[d].rdev; |
1563 | if (rdev && | 1471 | if (rdev && |
1564 | test_bit(In_sync, &rdev->flags)) { | 1472 | test_bit(In_sync, &rdev->flags)) { |
1565 | if (sync_page_io(rdev->bdev, | 1473 | if (sync_page_io(rdev, sect, s<<9, |
1566 | sect + rdev->data_offset, | 1474 | conf->tmppage, WRITE, false) |
1567 | s<<9, conf->tmppage, WRITE) | ||
1568 | == 0) | 1475 | == 0) |
1569 | /* Well, this device is dead */ | 1476 | /* Well, this device is dead */ |
1570 | md_error(mddev, rdev); | 1477 | md_error(mddev, rdev); |
@@ -1579,9 +1486,8 @@ static void fix_read_error(conf_t *conf, int read_disk, | |||
1579 | rdev = conf->mirrors[d].rdev; | 1486 | rdev = conf->mirrors[d].rdev; |
1580 | if (rdev && | 1487 | if (rdev && |
1581 | test_bit(In_sync, &rdev->flags)) { | 1488 | test_bit(In_sync, &rdev->flags)) { |
1582 | if (sync_page_io(rdev->bdev, | 1489 | if (sync_page_io(rdev, sect, s<<9, |
1583 | sect + rdev->data_offset, | 1490 | conf->tmppage, READ, false) |
1584 | s<<9, conf->tmppage, READ) | ||
1585 | == 0) | 1491 | == 0) |
1586 | /* Well, this device is dead */ | 1492 | /* Well, this device is dead */ |
1587 | md_error(mddev, rdev); | 1493 | md_error(mddev, rdev); |
@@ -1609,15 +1515,17 @@ static void raid1d(mddev_t *mddev) | |||
1609 | unsigned long flags; | 1515 | unsigned long flags; |
1610 | conf_t *conf = mddev->private; | 1516 | conf_t *conf = mddev->private; |
1611 | struct list_head *head = &conf->retry_list; | 1517 | struct list_head *head = &conf->retry_list; |
1612 | int unplug=0; | ||
1613 | mdk_rdev_t *rdev; | 1518 | mdk_rdev_t *rdev; |
1519 | struct blk_plug plug; | ||
1614 | 1520 | ||
1615 | md_check_recovery(mddev); | 1521 | md_check_recovery(mddev); |
1616 | 1522 | ||
1523 | blk_start_plug(&plug); | ||
1617 | for (;;) { | 1524 | for (;;) { |
1618 | char b[BDEVNAME_SIZE]; | 1525 | char b[BDEVNAME_SIZE]; |
1619 | 1526 | ||
1620 | unplug += flush_pending_writes(conf); | 1527 | if (atomic_read(&mddev->plug_cnt) == 0) |
1528 | flush_pending_writes(conf); | ||
1621 | 1529 | ||
1622 | spin_lock_irqsave(&conf->device_lock, flags); | 1530 | spin_lock_irqsave(&conf->device_lock, flags); |
1623 | if (list_empty(head)) { | 1531 | if (list_empty(head)) { |
@@ -1631,45 +1539,9 @@ static void raid1d(mddev_t *mddev) | |||
1631 | 1539 | ||
1632 | mddev = r1_bio->mddev; | 1540 | mddev = r1_bio->mddev; |
1633 | conf = mddev->private; | 1541 | conf = mddev->private; |
1634 | if (test_bit(R1BIO_IsSync, &r1_bio->state)) { | 1542 | if (test_bit(R1BIO_IsSync, &r1_bio->state)) |
1635 | sync_request_write(mddev, r1_bio); | 1543 | sync_request_write(mddev, r1_bio); |
1636 | unplug = 1; | 1544 | else { |
1637 | } else if (test_bit(R1BIO_BarrierRetry, &r1_bio->state)) { | ||
1638 | /* some requests in the r1bio were REQ_HARDBARRIER | ||
1639 | * requests which failed with -EOPNOTSUPP. Hohumm.. | ||
1640 | * Better resubmit without the barrier. | ||
1641 | * We know which devices to resubmit for, because | ||
1642 | * all others have had their bios[] entry cleared. | ||
1643 | * We already have a nr_pending reference on these rdevs. | ||
1644 | */ | ||
1645 | int i; | ||
1646 | const unsigned long do_sync = (r1_bio->master_bio->bi_rw & REQ_SYNC); | ||
1647 | clear_bit(R1BIO_BarrierRetry, &r1_bio->state); | ||
1648 | clear_bit(R1BIO_Barrier, &r1_bio->state); | ||
1649 | for (i=0; i < conf->raid_disks; i++) | ||
1650 | if (r1_bio->bios[i]) | ||
1651 | atomic_inc(&r1_bio->remaining); | ||
1652 | for (i=0; i < conf->raid_disks; i++) | ||
1653 | if (r1_bio->bios[i]) { | ||
1654 | struct bio_vec *bvec; | ||
1655 | int j; | ||
1656 | |||
1657 | bio = bio_clone(r1_bio->master_bio, GFP_NOIO); | ||
1658 | /* copy pages from the failed bio, as | ||
1659 | * this might be a write-behind device */ | ||
1660 | __bio_for_each_segment(bvec, bio, j, 0) | ||
1661 | bvec->bv_page = bio_iovec_idx(r1_bio->bios[i], j)->bv_page; | ||
1662 | bio_put(r1_bio->bios[i]); | ||
1663 | bio->bi_sector = r1_bio->sector + | ||
1664 | conf->mirrors[i].rdev->data_offset; | ||
1665 | bio->bi_bdev = conf->mirrors[i].rdev->bdev; | ||
1666 | bio->bi_end_io = raid1_end_write_request; | ||
1667 | bio->bi_rw = WRITE | do_sync; | ||
1668 | bio->bi_private = r1_bio; | ||
1669 | r1_bio->bios[i] = bio; | ||
1670 | generic_make_request(bio); | ||
1671 | } | ||
1672 | } else { | ||
1673 | int disk; | 1545 | int disk; |
1674 | 1546 | ||
1675 | /* we got a read error. Maybe the drive is bad. Maybe just | 1547 | /* we got a read error. Maybe the drive is bad. Maybe just |
@@ -1704,7 +1576,8 @@ static void raid1d(mddev_t *mddev) | |||
1704 | mddev->ro ? IO_BLOCKED : NULL; | 1576 | mddev->ro ? IO_BLOCKED : NULL; |
1705 | r1_bio->read_disk = disk; | 1577 | r1_bio->read_disk = disk; |
1706 | bio_put(bio); | 1578 | bio_put(bio); |
1707 | bio = bio_clone(r1_bio->master_bio, GFP_NOIO); | 1579 | bio = bio_clone_mddev(r1_bio->master_bio, |
1580 | GFP_NOIO, mddev); | ||
1708 | r1_bio->bios[r1_bio->read_disk] = bio; | 1581 | r1_bio->bios[r1_bio->read_disk] = bio; |
1709 | rdev = conf->mirrors[disk].rdev; | 1582 | rdev = conf->mirrors[disk].rdev; |
1710 | if (printk_ratelimit()) | 1583 | if (printk_ratelimit()) |
@@ -1718,14 +1591,12 @@ static void raid1d(mddev_t *mddev) | |||
1718 | bio->bi_end_io = raid1_end_read_request; | 1591 | bio->bi_end_io = raid1_end_read_request; |
1719 | bio->bi_rw = READ | do_sync; | 1592 | bio->bi_rw = READ | do_sync; |
1720 | bio->bi_private = r1_bio; | 1593 | bio->bi_private = r1_bio; |
1721 | unplug = 1; | ||
1722 | generic_make_request(bio); | 1594 | generic_make_request(bio); |
1723 | } | 1595 | } |
1724 | } | 1596 | } |
1725 | cond_resched(); | 1597 | cond_resched(); |
1726 | } | 1598 | } |
1727 | if (unplug) | 1599 | blk_finish_plug(&plug); |
1728 | unplug_slaves(mddev); | ||
1729 | } | 1600 | } |
1730 | 1601 | ||
1731 | 1602 | ||
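Note: raid1d() now brackets its work with blk_start_plug()/blk_finish_plug(), and the per-queue unplug_fn/unplug_slaves() machinery is gone: I/O queued while the plug is held is batched and goes out when the plug is finished (or the task sleeps), rather than waiting for an explicit unplug callback. The toy below only models the batching idea in userspace; it is not the block-layer API.

#include <stdio.h>

#define MAX_BATCH 16

/* Toy stand-in for struct blk_plug: collect work while "plugged",
 * submit it all when the plug is finished. */
struct plug {
	int pending[MAX_BATCH];
	int count;
};

static void start_plug(struct plug *p)
{
	p->count = 0;
}

static void queue_io(struct plug *p, int req)
{
	if (p->count < MAX_BATCH)
		p->pending[p->count++] = req;
}

static void finish_plug(struct plug *p)
{
	for (int i = 0; i < p->count; i++)
		printf("submitting request %d\n", p->pending[i]);
	p->count = 0;
}

int main(void)
{
	struct plug p;

	start_plug(&p);		/* blk_start_plug(&plug) in raid1d() */
	queue_io(&p, 1);
	queue_io(&p, 2);
	queue_io(&p, 3);
	finish_plug(&p);	/* blk_finish_plug(&plug): the batch goes out */
	return 0;
}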
@@ -1763,7 +1634,7 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1763 | int i; | 1634 | int i; |
1764 | int wonly = -1; | 1635 | int wonly = -1; |
1765 | int write_targets = 0, read_targets = 0; | 1636 | int write_targets = 0, read_targets = 0; |
1766 | int sync_blocks; | 1637 | sector_t sync_blocks; |
1767 | int still_degraded = 0; | 1638 | int still_degraded = 0; |
1768 | 1639 | ||
1769 | if (!conf->r1buf_pool) | 1640 | if (!conf->r1buf_pool) |
@@ -1813,11 +1684,11 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i | |||
1813 | msleep_interruptible(1000); | 1684 | msleep_interruptible(1000); |
1814 | 1685 | ||
1815 | bitmap_cond_end_sync(mddev->bitmap, sector_nr); | 1686 | bitmap_cond_end_sync(mddev->bitmap, sector_nr); |
1687 | r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); | ||
1816 | raise_barrier(conf); | 1688 | raise_barrier(conf); |
1817 | 1689 | ||
1818 | conf->next_resync = sector_nr; | 1690 | conf->next_resync = sector_nr; |
1819 | 1691 | ||
1820 | r1_bio = mempool_alloc(conf->r1buf_pool, GFP_NOIO); | ||
1821 | rcu_read_lock(); | 1692 | rcu_read_lock(); |
1822 | /* | 1693 | /* |
1823 | * If we get a correctably read error during resync or recovery, | 1694 | * If we get a correctably read error during resync or recovery, |
@@ -2029,7 +1900,6 @@ static conf_t *setup_conf(mddev_t *mddev) | |||
2029 | init_waitqueue_head(&conf->wait_barrier); | 1900 | init_waitqueue_head(&conf->wait_barrier); |
2030 | 1901 | ||
2031 | bio_list_init(&conf->pending_bio_list); | 1902 | bio_list_init(&conf->pending_bio_list); |
2032 | bio_list_init(&conf->flushing_bio_list); | ||
2033 | 1903 | ||
2034 | conf->last_used = -1; | 1904 | conf->last_used = -1; |
2035 | for (i = 0; i < conf->raid_disks; i++) { | 1905 | for (i = 0; i < conf->raid_disks; i++) { |
@@ -2107,8 +1977,9 @@ static int run(mddev_t *mddev) | |||
2107 | if (IS_ERR(conf)) | 1977 | if (IS_ERR(conf)) |
2108 | return PTR_ERR(conf); | 1978 | return PTR_ERR(conf); |
2109 | 1979 | ||
2110 | mddev->queue->queue_lock = &conf->device_lock; | ||
2111 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 1980 | list_for_each_entry(rdev, &mddev->disks, same_set) { |
1981 | if (!mddev->gendisk) | ||
1982 | continue; | ||
2112 | disk_stack_limits(mddev->gendisk, rdev->bdev, | 1983 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
2113 | rdev->data_offset << 9); | 1984 | rdev->data_offset << 9); |
2114 | /* as we don't honour merge_bvec_fn, we must never risk | 1985 | /* as we don't honour merge_bvec_fn, we must never risk |
@@ -2150,11 +2021,11 @@ static int run(mddev_t *mddev) | |||
2150 | 2021 | ||
2151 | md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); | 2022 | md_set_array_sectors(mddev, raid1_size(mddev, 0, 0)); |
2152 | 2023 | ||
2153 | mddev->queue->unplug_fn = raid1_unplug; | 2024 | if (mddev->queue) { |
2154 | mddev->queue->backing_dev_info.congested_fn = raid1_congested; | 2025 | mddev->queue->backing_dev_info.congested_fn = raid1_congested; |
2155 | mddev->queue->backing_dev_info.congested_data = mddev; | 2026 | mddev->queue->backing_dev_info.congested_data = mddev; |
2156 | md_integrity_register(mddev); | 2027 | } |
2157 | return 0; | 2028 | return md_integrity_register(mddev); |
2158 | } | 2029 | } |
2159 | 2030 | ||
2160 | static int stop(mddev_t *mddev) | 2031 | static int stop(mddev_t *mddev) |
@@ -2176,7 +2047,6 @@ static int stop(mddev_t *mddev) | |||
2176 | 2047 | ||
2177 | md_unregister_thread(mddev->thread); | 2048 | md_unregister_thread(mddev->thread); |
2178 | mddev->thread = NULL; | 2049 | mddev->thread = NULL; |
2179 | blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ | ||
2180 | if (conf->r1bio_pool) | 2050 | if (conf->r1bio_pool) |
2181 | mempool_destroy(conf->r1bio_pool); | 2051 | mempool_destroy(conf->r1bio_pool); |
2182 | kfree(conf->mirrors); | 2052 | kfree(conf->mirrors); |
@@ -2201,7 +2071,7 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors) | |||
2201 | set_capacity(mddev->gendisk, mddev->array_sectors); | 2071 | set_capacity(mddev->gendisk, mddev->array_sectors); |
2202 | revalidate_disk(mddev->gendisk); | 2072 | revalidate_disk(mddev->gendisk); |
2203 | if (sectors > mddev->dev_sectors && | 2073 | if (sectors > mddev->dev_sectors && |
2204 | mddev->recovery_cp == MaxSector) { | 2074 | mddev->recovery_cp > mddev->dev_sectors) { |
2205 | mddev->recovery_cp = mddev->dev_sectors; | 2075 | mddev->recovery_cp = mddev->dev_sectors; |
2206 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | 2076 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); |
2207 | } | 2077 | } |