-rw-r--r--  drivers/md/md.c     |  49
-rw-r--r--  drivers/md/md.h     |   1
-rw-r--r--  drivers/md/raid1.c  | 208
-rw-r--r--  drivers/md/raid1.h  |   4
4 files changed, 233 insertions, 29 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 7ae3c5a18001..48217e8aa0eb 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -215,6 +215,55 @@ struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
 }
 EXPORT_SYMBOL_GPL(bio_clone_mddev);
 
+void md_trim_bio(struct bio *bio, int offset, int size)
+{
+        /* 'bio' is a cloned bio which we need to trim to match
+         * the given offset and size.
+         * This requires adjusting bi_sector, bi_size, and bi_io_vec
+         */
+        int i;
+        struct bio_vec *bvec;
+        int sofar = 0;
+
+        size <<= 9;
+        if (offset == 0 && size == bio->bi_size)
+                return;
+
+        bio->bi_sector += offset;
+        bio->bi_size = size;
+        offset <<= 9;
+        clear_bit(BIO_SEG_VALID, &bio->bi_flags);
+
+        while (bio->bi_idx < bio->bi_vcnt &&
+               bio->bi_io_vec[bio->bi_idx].bv_len <= offset) {
+                /* remove this whole bio_vec */
+                offset -= bio->bi_io_vec[bio->bi_idx].bv_len;
+                bio->bi_idx++;
+        }
+        if (bio->bi_idx < bio->bi_vcnt) {
+                bio->bi_io_vec[bio->bi_idx].bv_offset += offset;
+                bio->bi_io_vec[bio->bi_idx].bv_len -= offset;
+        }
+        /* avoid any complications with bi_idx being non-zero*/
+        if (bio->bi_idx) {
+                memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
+                        (bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec));
+                bio->bi_vcnt -= bio->bi_idx;
+                bio->bi_idx = 0;
+        }
+        /* Make sure vcnt and last bv are not too big */
+        bio_for_each_segment(bvec, bio, i) {
+                if (sofar + bvec->bv_len > size)
+                        bvec->bv_len = size - sofar;
+                if (bvec->bv_len == 0) {
+                        bio->bi_vcnt = i;
+                        break;
+                }
+                sofar += bvec->bv_len;
+        }
+}
+EXPORT_SYMBOL_GPL(md_trim_bio);
+
 /*
  * We have a system wide 'event count' that is incremented
  * on any 'interesting' event, and readers of /proc/mdstat
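The hunk above adds md_trim_bio(), which narrows an already-cloned bio to a
sub-range of the original request: whole leading bio_vecs covered by the sector
offset are dropped, the first remaining vec is advanced into, and the tail is
clamped so the total length matches the requested size. A rough userspace
sketch of the same arithmetic, with a plain 'segment' array standing in for
struct bio_vec (illustrative only, not kernel code):

#include <stdio.h>
#include <string.h>

struct segment { int offset; int len; };        /* bytes, like bv_offset/bv_len */

static void trim(struct segment *seg, int *nsegs, int offset, int size)
{
        int i, idx = 0, sofar = 0;

        offset <<= 9;                           /* sectors -> bytes */
        size <<= 9;

        /* drop whole leading segments covered by the offset */
        while (idx < *nsegs && seg[idx].len <= offset) {
                offset -= seg[idx].len;
                idx++;
        }
        /* advance into the first remaining segment */
        if (idx < *nsegs) {
                seg[idx].offset += offset;
                seg[idx].len -= offset;
        }
        if (idx) {
                memmove(seg, seg + idx, (*nsegs - idx) * sizeof(*seg));
                *nsegs -= idx;
        }
        /* clamp the tail so the total is exactly 'size' bytes */
        for (i = 0; i < *nsegs; i++) {
                if (sofar + seg[i].len > size)
                        seg[i].len = size - sofar;
                if (seg[i].len == 0) {
                        *nsegs = i;
                        break;
                }
                sofar += seg[i].len;
        }
}

int main(void)
{
        /* three 4KiB segments = 24 sectors; keep sectors 10..17 */
        struct segment seg[] = { {0, 4096}, {0, 4096}, {0, 4096} };
        int n = 3, i;

        trim(seg, &n, 10, 8);
        for (i = 0; i < n; i++)
                printf("seg %d: offset %d len %d\n", i, seg[i].offset, seg[i].len);
        return 0;
}

Running it keeps exactly sectors 10..17: 3072 bytes starting 1024 bytes into
the second segment, plus the first 1024 bytes of the third.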
diff --git a/drivers/md/md.h b/drivers/md/md.h
index aea9e9ff8a33..7c3192c0a29a 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -575,4 +575,5 @@ extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
 extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
                                    mddev_t *mddev);
 extern int mddev_check_plugged(mddev_t *mddev);
+extern void md_trim_bio(struct bio *bio, int offset, int size);
 #endif /* _MD_MD_H */
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 8db311d7cddc..cc3939dc9e3d 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -41,11 +41,7 @@
 #include "bitmap.h"
 
 #define DEBUG 0
-#if DEBUG
-#define PRINTK(x...) printk(x)
-#else
-#define PRINTK(x...)
-#endif
+#define PRINTK(x...) do { if (DEBUG) printk(x); } while (0)
 
 /*
  * Number of guaranteed r1bios in case of extreme VM load:
@@ -177,12 +173,6 @@ static void free_r1bio(r1bio_t *r1_bio)
 {
         conf_t *conf = r1_bio->mddev->private;
 
-        /*
-         * Wake up any possible resync thread that waits for the device
-         * to go idle.
-         */
-        allow_barrier(conf);
-
         put_all_bios(conf, r1_bio);
         mempool_free(r1_bio, conf->r1bio_pool);
 }
@@ -223,6 +213,33 @@ static void reschedule_retry(r1bio_t *r1_bio)
  * operation and are ready to return a success/failure code to the buffer
  * cache layer.
  */
+static void call_bio_endio(r1bio_t *r1_bio)
+{
+        struct bio *bio = r1_bio->master_bio;
+        int done;
+        conf_t *conf = r1_bio->mddev->private;
+
+        if (bio->bi_phys_segments) {
+                unsigned long flags;
+                spin_lock_irqsave(&conf->device_lock, flags);
+                bio->bi_phys_segments--;
+                done = (bio->bi_phys_segments == 0);
+                spin_unlock_irqrestore(&conf->device_lock, flags);
+        } else
+                done = 1;
+
+        if (!test_bit(R1BIO_Uptodate, &r1_bio->state))
+                clear_bit(BIO_UPTODATE, &bio->bi_flags);
+        if (done) {
+                bio_endio(bio, 0);
+                /*
+                 * Wake up any possible resync thread that waits for the device
+                 * to go idle.
+                 */
+                allow_barrier(conf);
+        }
+}
+
 static void raid_end_bio_io(r1bio_t *r1_bio)
 {
         struct bio *bio = r1_bio->master_bio;
@@ -235,8 +252,7 @@ static void raid_end_bio_io(r1bio_t *r1_bio)
                          (unsigned long long) bio->bi_sector +
                          (bio->bi_size >> 9) - 1);
 
-                bio_endio(bio,
-                        test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO);
+                call_bio_endio(r1_bio);
         }
         free_r1bio(r1_bio);
 }
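The two hunks above route completion of the master bio through
call_bio_endio(), which treats bi_phys_segments as a count of outstanding
sub-reads: zero means the request was never split, otherwise only the decrement
that reaches zero ends the master bio and drops the barrier. A minimal
userspace sketch of that counting pattern (master_request and sub_request_done
are illustrative names, not kernel API):

#include <pthread.h>
#include <stdio.h>

struct master_request {
        int outstanding;                /* plays the role of bi_phys_segments */
        pthread_mutex_t lock;           /* plays the role of conf->device_lock */
};

static void sub_request_done(struct master_request *m)
{
        int done;

        if (m->outstanding) {
                pthread_mutex_lock(&m->lock);
                m->outstanding--;
                done = (m->outstanding == 0);
                pthread_mutex_unlock(&m->lock);
        } else {
                done = 1;               /* request was never split */
        }
        if (done)
                printf("master request complete\n");
}

int main(void)
{
        struct master_request m = { .outstanding = 2,
                                    .lock = PTHREAD_MUTEX_INITIALIZER };

        sub_request_done(&m);           /* first half: not done yet */
        sub_request_done(&m);           /* second half: completes the master */
        return 0;
}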
@@ -295,6 +311,7 @@ static void raid1_end_read_request(struct bio *bio, int error)
                                bdevname(conf->mirrors[mirror].rdev->bdev,
                                        b),
                                (unsigned long long)r1_bio->sector);
+                set_bit(R1BIO_ReadError, &r1_bio->state);
                 reschedule_retry(r1_bio);
         }
 
@@ -381,7 +398,7 @@ static void raid1_end_write_request(struct bio *bio, int error)
                                  (unsigned long long) mbio->bi_sector,
                                  (unsigned long long) mbio->bi_sector +
                                  (mbio->bi_size >> 9) - 1);
-                        bio_endio(mbio, 0);
+                        call_bio_endio(r1_bio);
                 }
         }
 }
@@ -412,10 +429,11 @@ static void raid1_end_write_request(struct bio *bio, int error)
  *
  * The rdev for the device selected will have nr_pending incremented.
  */
-static int read_balance(conf_t *conf, r1bio_t *r1_bio)
+static int read_balance(conf_t *conf, r1bio_t *r1_bio, int *max_sectors)
 {
         const sector_t this_sector = r1_bio->sector;
-        const int sectors = r1_bio->sectors;
+        int sectors;
+        int best_good_sectors;
         int start_disk;
         int best_disk;
         int i;
@@ -430,8 +448,11 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
          * We take the first readable disk when above the resync window.
          */
  retry:
+        sectors = r1_bio->sectors;
         best_disk = -1;
         best_dist = MaxSector;
+        best_good_sectors = 0;
+
         if (conf->mddev->recovery_cp < MaxSector &&
             (this_sector + sectors >= conf->next_resync)) {
                 choose_first = 1;
@@ -443,6 +464,9 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
 
         for (i = 0 ; i < conf->raid_disks ; i++) {
                 sector_t dist;
+                sector_t first_bad;
+                int bad_sectors;
+
                 int disk = start_disk + i;
                 if (disk >= conf->raid_disks)
                         disk -= conf->raid_disks;
@@ -465,6 +489,35 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
                 /* This is a reasonable device to use. It might
                  * even be best.
                  */
+                if (is_badblock(rdev, this_sector, sectors,
+                                &first_bad, &bad_sectors)) {
+                        if (best_dist < MaxSector)
+                                /* already have a better device */
+                                continue;
+                        if (first_bad <= this_sector) {
+                                /* cannot read here. If this is the 'primary'
+                                 * device, then we must not read beyond
+                                 * bad_sectors from another device..
+                                 */
+                                bad_sectors -= (this_sector - first_bad);
+                                if (choose_first && sectors > bad_sectors)
+                                        sectors = bad_sectors;
+                                if (best_good_sectors > sectors)
+                                        best_good_sectors = sectors;
+
+                        } else {
+                                sector_t good_sectors = first_bad - this_sector;
+                                if (good_sectors > best_good_sectors) {
+                                        best_good_sectors = good_sectors;
+                                        best_disk = disk;
+                                }
+                                if (choose_first)
+                                        break;
+                        }
+                        continue;
+                } else
+                        best_good_sectors = sectors;
+
                 dist = abs(this_sector - conf->mirrors[disk].head_position);
                 if (choose_first
                     /* Don't change to another disk for sequential reads */
@@ -493,10 +546,12 @@ static int read_balance(conf_t *conf, r1bio_t *r1_bio)
                         rdev_dec_pending(rdev, conf->mddev);
                         goto retry;
                 }
+                sectors = best_good_sectors;
                 conf->next_seq_sect = this_sector + sectors;
                 conf->last_used = best_disk;
         }
         rcu_read_unlock();
+        *max_sectors = sectors;
 
         return best_disk;
 }
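With this change read_balance() not only chooses a disk but also reports,
through *max_sectors, how much of the request that disk can serve before a
known bad block. A rough sketch of the clipping arithmetic for the common case
where the bad range starts after the read begins (the in-kernel code
additionally handles the case where the read starts inside a bad range on the
'primary' device during resync):

typedef unsigned long long sector_t;

/* how many sectors starting at this_sector can be read cleanly,
 * given a bad range [first_bad, first_bad + bad_sectors) */
static sector_t good_prefix(sector_t this_sector, sector_t sectors,
                            sector_t first_bad, sector_t bad_sectors)
{
        (void)bad_sectors;              /* the length only matters once inside the bad range */
        if (first_bad <= this_sector)
                return 0;               /* read starts on a bad block */
        if (first_bad >= this_sector + sectors)
                return sectors;         /* bad range is beyond this read */
        return first_bad - this_sector; /* read up to the bad block */
}

For example, a 64-sector read at sector 1000 against bad blocks starting at
1040 yields 40 readable sectors; the remaining 24 must come from another
mirror, or from a second, smaller r1_bio as in make_request() below.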
@@ -763,11 +818,25 @@ static int make_request(mddev_t *mddev, struct bio * bio)
         r1_bio->mddev = mddev;
         r1_bio->sector = bio->bi_sector;
 
+        /* We might need to issue multiple reads to different
+         * devices if there are bad blocks around, so we keep
+         * track of the number of reads in bio->bi_phys_segments.
+         * If this is 0, there is only one r1_bio and no locking
+         * will be needed when requests complete.  If it is
+         * non-zero, then it is the number of not-completed requests.
+         */
+        bio->bi_phys_segments = 0;
+        clear_bit(BIO_SEG_VALID, &bio->bi_flags);
+
         if (rw == READ) {
                 /*
                  * read balancing logic:
                  */
-                int rdisk = read_balance(conf, r1_bio);
+                int max_sectors;
+                int rdisk;
+
+read_again:
+                rdisk = read_balance(conf, r1_bio, &max_sectors);
 
                 if (rdisk < 0) {
                         /* couldn't find anywhere to read from */
@@ -788,6 +857,8 @@ static int make_request(mddev_t *mddev, struct bio * bio)
                 r1_bio->read_disk = rdisk;
 
                 read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
+                md_trim_bio(read_bio, r1_bio->sector - bio->bi_sector,
+                            max_sectors);
 
                 r1_bio->bios[rdisk] = read_bio;
 
@@ -797,7 +868,38 @@ static int make_request(mddev_t *mddev, struct bio * bio)
                 read_bio->bi_rw = READ | do_sync;
                 read_bio->bi_private = r1_bio;
 
-                generic_make_request(read_bio);
+                if (max_sectors < r1_bio->sectors) {
+                        /* could not read all from this device, so we will
+                         * need another r1_bio.
+                         */
+                        int sectors_handled;
+
+                        sectors_handled = (r1_bio->sector + max_sectors
+                                           - bio->bi_sector);
+                        r1_bio->sectors = max_sectors;
+                        spin_lock_irq(&conf->device_lock);
+                        if (bio->bi_phys_segments == 0)
+                                bio->bi_phys_segments = 2;
+                        else
+                                bio->bi_phys_segments++;
+                        spin_unlock_irq(&conf->device_lock);
+                        /* Cannot call generic_make_request directly
+                         * as that will be queued in __make_request
+                         * and subsequent mempool_alloc might block waiting
+                         * for it.  So hand bio over to raid1d.
+                         */
+                        reschedule_retry(r1_bio);
+
+                        r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);
+
+                        r1_bio->master_bio = bio;
+                        r1_bio->sectors = (bio->bi_size >> 9) - sectors_handled;
+                        r1_bio->state = 0;
+                        r1_bio->mddev = mddev;
+                        r1_bio->sector = bio->bi_sector + sectors_handled;
+                        goto read_again;
+                } else
+                        generic_make_request(read_bio);
                 return 0;
         }
 
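The read_again loop above carves one master bio into as many r1_bios as
needed: each pass submits max_sectors to the chosen mirror, bumps
bi_phys_segments (to 2 on the first split), and restarts with the remainder.
The sector bookkeeping reduces to the following arithmetic (standalone sketch;
the fixed limit of 40 sectors stands in for whatever read_balance() returns):

#include <stdio.h>

int main(void)
{
        unsigned long long bi_sector = 1000;    /* master bio start */
        unsigned long long total = 128;         /* master bio length in sectors */
        unsigned long long sector = bi_sector;  /* current r1_bio->sector */
        unsigned long long remaining = total;   /* current r1_bio->sectors */

        while (remaining) {
                unsigned long long max_sectors = remaining > 40 ? 40 : remaining;
                unsigned long long sectors_handled = sector + max_sectors - bi_sector;

                printf("read %llu sectors at %llu (%llu of %llu handled)\n",
                       max_sectors, sector, sectors_handled, total);

                /* the next r1_bio covers what is left of the master bio */
                sector = bi_sector + sectors_handled;
                remaining = total - sectors_handled;
        }
        return 0;
}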
@@ -849,8 +951,6 @@ static int make_request(mddev_t *mddev, struct bio * bio)
                 goto retry_write;
         }
 
-        BUG_ON(targets == 0); /* we never fail the last device */
-
         if (targets < conf->raid_disks) {
                 /* array is degraded, we will not clear the bitmap
                  * on I/O completion (see raid1_end_write_request) */
@@ -1425,7 +1525,7 @@ static void sync_request_write(mddev_t *mddev, r1bio_t *r1_bio)
  *
  *      1.      Retries failed read operations on working mirrors.
  *      2.      Updates the raid superblock when problems encounter.
- *      3.      Performs writes following reads for array syncronising.
+ *      3.      Performs writes following reads for array synchronising.
  */
 
 static void fix_read_error(conf_t *conf, int read_disk,
@@ -1448,9 +1548,14 @@ static void fix_read_error(conf_t *conf, int read_disk,
                  * which is the thread that might remove
                  * a device. If raid1d ever becomes multi-threaded....
                  */
+                sector_t first_bad;
+                int bad_sectors;
+
                 rdev = conf->mirrors[d].rdev;
                 if (rdev &&
                     test_bit(In_sync, &rdev->flags) &&
+                    is_badblock(rdev, sect, s,
+                                &first_bad, &bad_sectors) == 0 &&
                     sync_page_io(rdev, sect, s<<9,
                                  conf->tmppage, READ, false))
                         success = 1;
@@ -1546,9 +1651,11 @@ static void raid1d(mddev_t *mddev)
                 conf = mddev->private;
                 if (test_bit(R1BIO_IsSync, &r1_bio->state))
                         sync_request_write(mddev, r1_bio);
-                else {
+                else if (test_bit(R1BIO_ReadError, &r1_bio->state)) {
                         int disk;
+                        int max_sectors;
 
+                        clear_bit(R1BIO_ReadError, &r1_bio->state);
                         /* we got a read error. Maybe the drive is bad.  Maybe just
                          * the block and we can fix it.
                          * We freeze all other IO, and try reading the block from
@@ -1568,21 +1675,28 @@ static void raid1d(mddev_t *mddev)
                                      conf->mirrors[r1_bio->read_disk].rdev);
 
                         bio = r1_bio->bios[r1_bio->read_disk];
-                        if ((disk=read_balance(conf, r1_bio)) == -1) {
+                        bdevname(bio->bi_bdev, b);
+read_more:
+                        disk = read_balance(conf, r1_bio, &max_sectors);
+                        if (disk == -1) {
                                 printk(KERN_ALERT "md/raid1:%s: %s: unrecoverable I/O"
                                        " read error for block %llu\n",
-                                       mdname(mddev),
-                                       bdevname(bio->bi_bdev,b),
+                                       mdname(mddev), b,
                                        (unsigned long long)r1_bio->sector);
                                 raid_end_bio_io(r1_bio);
                         } else {
                                 const unsigned long do_sync = r1_bio->master_bio->bi_rw & REQ_SYNC;
-                                r1_bio->bios[r1_bio->read_disk] =
-                                        mddev->ro ? IO_BLOCKED : NULL;
+                                if (bio) {
+                                        r1_bio->bios[r1_bio->read_disk] =
+                                                mddev->ro ? IO_BLOCKED : NULL;
+                                        bio_put(bio);
+                                }
                                 r1_bio->read_disk = disk;
-                                bio_put(bio);
                                 bio = bio_clone_mddev(r1_bio->master_bio,
                                                       GFP_NOIO, mddev);
+                                md_trim_bio(bio,
+                                            r1_bio->sector - bio->bi_sector,
+                                            max_sectors);
                                 r1_bio->bios[r1_bio->read_disk] = bio;
                                 rdev = conf->mirrors[disk].rdev;
                                 printk_ratelimited(
@@ -1597,8 +1711,44 @@ static void raid1d(mddev_t *mddev)
                                 bio->bi_end_io = raid1_end_read_request;
                                 bio->bi_rw = READ | do_sync;
                                 bio->bi_private = r1_bio;
-                                generic_make_request(bio);
+                                if (max_sectors < r1_bio->sectors) {
+                                        /* Drat - have to split this up more */
+                                        struct bio *mbio = r1_bio->master_bio;
+                                        int sectors_handled =
+                                                r1_bio->sector + max_sectors
+                                                - mbio->bi_sector;
+                                        r1_bio->sectors = max_sectors;
+                                        spin_lock_irq(&conf->device_lock);
+                                        if (mbio->bi_phys_segments == 0)
+                                                mbio->bi_phys_segments = 2;
+                                        else
+                                                mbio->bi_phys_segments++;
+                                        spin_unlock_irq(&conf->device_lock);
+                                        generic_make_request(bio);
+                                        bio = NULL;
+
+                                        r1_bio = mempool_alloc(conf->r1bio_pool,
+                                                               GFP_NOIO);
+
+                                        r1_bio->master_bio = mbio;
+                                        r1_bio->sectors = (mbio->bi_size >> 9)
+                                                          - sectors_handled;
+                                        r1_bio->state = 0;
+                                        set_bit(R1BIO_ReadError,
+                                                &r1_bio->state);
+                                        r1_bio->mddev = mddev;
+                                        r1_bio->sector = mbio->bi_sector
+                                                + sectors_handled;
+
+                                        goto read_more;
+                                } else
+                                        generic_make_request(bio);
                         }
+                } else {
+                        /* just a partial read to be scheduled from separate
+                         * context
+                         */
+                        generic_make_request(r1_bio->bios[r1_bio->read_disk]);
                 }
                 cond_resched();
         }
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 3cd18cfda2ad..aa6af37ca01b 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -123,6 +123,10 @@ struct r1bio_s {
 #define R1BIO_IsSync    1
 #define R1BIO_Degraded  2
 #define R1BIO_BehindIO  3
+/* Set ReadError on bios that experience a readerror so that
+ * raid1d knows what to do with them.
+ */
+#define R1BIO_ReadError 4
 /* For write-behind requests, we call bi_end_io when
  * the last non-write-behind device completes, providing
  * any write was successful. Otherwise we call when