diff options
author | NeilBrown <neilb@cse.unsw.edu.au> | 2005-09-09 19:23:47 -0400 |
---|---|---|
committer | Linus Torvalds <torvalds@g5.osdl.org> | 2005-09-09 19:39:10 -0400 |
commit | 4b6d287f627b5fb6a49f78f9e81649ff98c62bb7 (patch) | |
tree | 7b6cbc6a997e25a7fb6185da7129e539c4ffda8b /drivers | |
parent | 8ddf9efe6708f3674f0ddfeb6425fd27bea109a2 (diff) |
[PATCH] md: add write-behind support for md/raid1
If a device is flagged 'WriteMostly' and the array has a bitmap, and the
bitmap superblock indicates that write_behind is allowed, then write_behind is
enabled for WriteMostly devices.
Write requests will be acknowledged as complete to the caller (via b_end_io)
when all non-WriteMostly devices have completed the write, but will not be
cleared from the bitmap until all devices complete.
This requires memory allocation to make a local copy of the data being
written. If there is insufficient memory, then we fall back on normal write
semantics.
Signed-Off-By: Paul Clements <paul.clements@steeleye.com>
Signed-off-by: Neil Brown <neilb@cse.unsw.edu.au>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
Diffstat (limited to 'drivers')
-rw-r--r-- | drivers/md/bitmap.c | 26 | ||||
-rw-r--r-- | drivers/md/raid1.c | 124 |
2 files changed, 139 insertions, 11 deletions
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 2925219f088..2c84de2b4ad 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c | |||
@@ -437,6 +437,7 @@ void bitmap_print_sb(struct bitmap *bitmap) | |||
437 | printk(KERN_DEBUG " daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep)); | 437 | printk(KERN_DEBUG " daemon sleep: %ds\n", le32_to_cpu(sb->daemon_sleep)); |
438 | printk(KERN_DEBUG " sync size: %llu KB\n", | 438 | printk(KERN_DEBUG " sync size: %llu KB\n", |
439 | (unsigned long long)le64_to_cpu(sb->sync_size)/2); | 439 | (unsigned long long)le64_to_cpu(sb->sync_size)/2); |
440 | printk(KERN_DEBUG "max write behind: %d\n", le32_to_cpu(sb->write_behind)); | ||
440 | kunmap(bitmap->sb_page); | 441 | kunmap(bitmap->sb_page); |
441 | } | 442 | } |
442 | 443 | ||
@@ -445,7 +446,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) | |||
445 | { | 446 | { |
446 | char *reason = NULL; | 447 | char *reason = NULL; |
447 | bitmap_super_t *sb; | 448 | bitmap_super_t *sb; |
448 | unsigned long chunksize, daemon_sleep; | 449 | unsigned long chunksize, daemon_sleep, write_behind; |
449 | unsigned long bytes_read; | 450 | unsigned long bytes_read; |
450 | unsigned long long events; | 451 | unsigned long long events; |
451 | int err = -EINVAL; | 452 | int err = -EINVAL; |
@@ -474,6 +475,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) | |||
474 | 475 | ||
475 | chunksize = le32_to_cpu(sb->chunksize); | 476 | chunksize = le32_to_cpu(sb->chunksize); |
476 | daemon_sleep = le32_to_cpu(sb->daemon_sleep); | 477 | daemon_sleep = le32_to_cpu(sb->daemon_sleep); |
478 | write_behind = le32_to_cpu(sb->write_behind); | ||
477 | 479 | ||
478 | /* verify that the bitmap-specific fields are valid */ | 480 | /* verify that the bitmap-specific fields are valid */ |
479 | if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) | 481 | if (sb->magic != cpu_to_le32(BITMAP_MAGIC)) |
@@ -485,7 +487,9 @@ static int bitmap_read_sb(struct bitmap *bitmap) | |||
485 | else if ((1 << ffz(~chunksize)) != chunksize) | 487 | else if ((1 << ffz(~chunksize)) != chunksize) |
486 | reason = "bitmap chunksize not a power of 2"; | 488 | reason = "bitmap chunksize not a power of 2"; |
487 | else if (daemon_sleep < 1 || daemon_sleep > 15) | 489 | else if (daemon_sleep < 1 || daemon_sleep > 15) |
488 | reason = "daemon sleep period out of range"; | 490 | reason = "daemon sleep period out of range (1-15s)"; |
491 | else if (write_behind > COUNTER_MAX) | ||
492 | reason = "write-behind limit out of range (0 - 16383)"; | ||
489 | if (reason) { | 493 | if (reason) { |
490 | printk(KERN_INFO "%s: invalid bitmap file superblock: %s\n", | 494 | printk(KERN_INFO "%s: invalid bitmap file superblock: %s\n", |
491 | bmname(bitmap), reason); | 495 | bmname(bitmap), reason); |
@@ -518,6 +522,7 @@ success: | |||
518 | /* assign fields using values from superblock */ | 522 | /* assign fields using values from superblock */ |
519 | bitmap->chunksize = chunksize; | 523 | bitmap->chunksize = chunksize; |
520 | bitmap->daemon_sleep = daemon_sleep; | 524 | bitmap->daemon_sleep = daemon_sleep; |
525 | bitmap->max_write_behind = write_behind; | ||
521 | bitmap->flags |= sb->state; | 526 | bitmap->flags |= sb->state; |
522 | bitmap->events_cleared = le64_to_cpu(sb->events_cleared); | 527 | bitmap->events_cleared = le64_to_cpu(sb->events_cleared); |
523 | if (sb->state & BITMAP_STALE) | 528 | if (sb->state & BITMAP_STALE) |
@@ -1282,9 +1287,16 @@ static bitmap_counter_t *bitmap_get_counter(struct bitmap *bitmap, | |||
1282 | } | 1287 | } |
1283 | } | 1288 | } |
1284 | 1289 | ||
1285 | int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors) | 1290 | int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, int behind) |
1286 | { | 1291 | { |
1287 | if (!bitmap) return 0; | 1292 | if (!bitmap) return 0; |
1293 | |||
1294 | if (behind) { | ||
1295 | atomic_inc(&bitmap->behind_writes); | ||
1296 | PRINTK(KERN_DEBUG "inc write-behind count %d/%d\n", | ||
1297 | atomic_read(&bitmap->behind_writes), bitmap->max_write_behind); | ||
1298 | } | ||
1299 | |||
1288 | while (sectors) { | 1300 | while (sectors) { |
1289 | int blocks; | 1301 | int blocks; |
1290 | bitmap_counter_t *bmc; | 1302 | bitmap_counter_t *bmc; |
@@ -1319,9 +1331,15 @@ int bitmap_startwrite(struct bitmap *bitmap, sector_t offset, unsigned long sect | |||
1319 | } | 1331 | } |
1320 | 1332 | ||
1321 | void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, | 1333 | void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long sectors, |
1322 | int success) | 1334 | int success, int behind) |
1323 | { | 1335 | { |
1324 | if (!bitmap) return; | 1336 | if (!bitmap) return; |
1337 | if (behind) { | ||
1338 | atomic_dec(&bitmap->behind_writes); | ||
1339 | PRINTK(KERN_DEBUG "dec write-behind count %d/%d\n", | ||
1340 | atomic_read(&bitmap->behind_writes), bitmap->max_write_behind); | ||
1341 | } | ||
1342 | |||
1325 | while (sectors) { | 1343 | while (sectors) { |
1326 | int blocks; | 1344 | int blocks; |
1327 | unsigned long flags; | 1345 | unsigned long flags; |
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 28839a8193f..ba7f5f25616 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c | |||
@@ -222,8 +222,17 @@ static void raid_end_bio_io(r1bio_t *r1_bio) | |||
222 | { | 222 | { |
223 | struct bio *bio = r1_bio->master_bio; | 223 | struct bio *bio = r1_bio->master_bio; |
224 | 224 | ||
225 | bio_endio(bio, bio->bi_size, | 225 | /* if nobody has done the final endio yet, do it now */ |
226 | test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO); | 226 | if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { |
227 | PRINTK(KERN_DEBUG "raid1: sync end %s on sectors %llu-%llu\n", | ||
228 | (bio_data_dir(bio) == WRITE) ? "write" : "read", | ||
229 | (unsigned long long) bio->bi_sector, | ||
230 | (unsigned long long) bio->bi_sector + | ||
231 | (bio->bi_size >> 9) - 1); | ||
232 | |||
233 | bio_endio(bio, bio->bi_size, | ||
234 | test_bit(R1BIO_Uptodate, &r1_bio->state) ? 0 : -EIO); | ||
235 | } | ||
227 | free_r1bio(r1_bio); | 236 | free_r1bio(r1_bio); |
228 | } | 237 | } |
229 | 238 | ||
@@ -292,7 +301,7 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int | |||
292 | { | 301 | { |
293 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); | 302 | int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); |
294 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); | 303 | r1bio_t * r1_bio = (r1bio_t *)(bio->bi_private); |
295 | int mirror; | 304 | int mirror, behind; |
296 | conf_t *conf = mddev_to_conf(r1_bio->mddev); | 305 | conf_t *conf = mddev_to_conf(r1_bio->mddev); |
297 | 306 | ||
298 | if (bio->bi_size) | 307 | if (bio->bi_size) |
@@ -323,16 +332,46 @@ static int raid1_end_write_request(struct bio *bio, unsigned int bytes_done, int | |||
323 | 332 | ||
324 | update_head_pos(mirror, r1_bio); | 333 | update_head_pos(mirror, r1_bio); |
325 | 334 | ||
335 | behind = test_bit(R1BIO_BehindIO, &r1_bio->state); | ||
336 | if (behind) { | ||
337 | if (test_bit(WriteMostly, &conf->mirrors[mirror].rdev->flags)) | ||
338 | atomic_dec(&r1_bio->behind_remaining); | ||
339 | |||
340 | /* In behind mode, we ACK the master bio once the I/O has safely | ||
341 | * reached all non-writemostly disks. Setting the Returned bit | ||
342 | * ensures that this gets done only once -- we don't ever want to | ||
343 | * return -EIO here, instead we'll wait */ | ||
344 | |||
345 | if (atomic_read(&r1_bio->behind_remaining) >= (atomic_read(&r1_bio->remaining)-1) && | ||
346 | test_bit(R1BIO_Uptodate, &r1_bio->state)) { | ||
347 | /* Maybe we can return now */ | ||
348 | if (!test_and_set_bit(R1BIO_Returned, &r1_bio->state)) { | ||
349 | struct bio *mbio = r1_bio->master_bio; | ||
350 | PRINTK(KERN_DEBUG "raid1: behind end write sectors %llu-%llu\n", | ||
351 | (unsigned long long) mbio->bi_sector, | ||
352 | (unsigned long long) mbio->bi_sector + | ||
353 | (mbio->bi_size >> 9) - 1); | ||
354 | bio_endio(mbio, mbio->bi_size, 0); | ||
355 | } | ||
356 | } | ||
357 | } | ||
326 | /* | 358 | /* |
327 | * | 359 | * |
328 | * Let's see if all mirrored write operations have finished | 360 | * Let's see if all mirrored write operations have finished |
329 | * already. | 361 | * already. |
330 | */ | 362 | */ |
331 | if (atomic_dec_and_test(&r1_bio->remaining)) { | 363 | if (atomic_dec_and_test(&r1_bio->remaining)) { |
364 | if (test_bit(R1BIO_BehindIO, &r1_bio->state)) { | ||
365 | /* free extra copy of the data pages */ | ||
366 | int i = bio->bi_vcnt; | ||
367 | while (i--) | ||
368 | __free_page(bio->bi_io_vec[i].bv_page); | ||
369 | } | ||
332 | /* clear the bitmap if all writes complete successfully */ | 370 | /* clear the bitmap if all writes complete successfully */ |
333 | bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, | 371 | bitmap_endwrite(r1_bio->mddev->bitmap, r1_bio->sector, |
334 | r1_bio->sectors, | 372 | r1_bio->sectors, |
335 | !test_bit(R1BIO_Degraded, &r1_bio->state)); | 373 | !test_bit(R1BIO_Degraded, &r1_bio->state), |
374 | behind); | ||
336 | md_write_end(r1_bio->mddev); | 375 | md_write_end(r1_bio->mddev); |
337 | raid_end_bio_io(r1_bio); | 376 | raid_end_bio_io(r1_bio); |
338 | } | 377 | } |
@@ -562,6 +601,39 @@ static void device_barrier(conf_t *conf, sector_t sect) | |||
562 | spin_unlock_irq(&conf->resync_lock); | 601 | spin_unlock_irq(&conf->resync_lock); |
563 | } | 602 | } |
564 | 603 | ||
604 | /* duplicate the data pages for behind I/O */ | ||
605 | static struct page **alloc_behind_pages(struct bio *bio) | ||
606 | { | ||
607 | int i; | ||
608 | struct bio_vec *bvec; | ||
609 | struct page **pages = kmalloc(bio->bi_vcnt * sizeof(struct page *), | ||
610 | GFP_NOIO); | ||
611 | if (unlikely(!pages)) | ||
612 | goto do_sync_io; | ||
613 | |||
614 | memset(pages, 0, bio->bi_vcnt * sizeof(struct page *)); | ||
615 | |||
616 | bio_for_each_segment(bvec, bio, i) { | ||
617 | pages[i] = alloc_page(GFP_NOIO); | ||
618 | if (unlikely(!pages[i])) | ||
619 | goto do_sync_io; | ||
620 | memcpy(kmap(pages[i]) + bvec->bv_offset, | ||
621 | kmap(bvec->bv_page) + bvec->bv_offset, bvec->bv_len); | ||
622 | kunmap(pages[i]); | ||
623 | kunmap(bvec->bv_page); | ||
624 | } | ||
625 | |||
626 | return pages; | ||
627 | |||
628 | do_sync_io: | ||
629 | if (pages) | ||
630 | for (i = 0; i < bio->bi_vcnt && pages[i]; i++) | ||
631 | __free_page(pages[i]); | ||
632 | kfree(pages); | ||
633 | PRINTK("%dB behind alloc failed, doing sync I/O\n", bio->bi_size); | ||
634 | return NULL; | ||
635 | } | ||
636 | |||
565 | static int make_request(request_queue_t *q, struct bio * bio) | 637 | static int make_request(request_queue_t *q, struct bio * bio) |
566 | { | 638 | { |
567 | mddev_t *mddev = q->queuedata; | 639 | mddev_t *mddev = q->queuedata; |
@@ -574,6 +646,7 @@ static int make_request(request_queue_t *q, struct bio * bio) | |||
574 | struct bitmap *bitmap = mddev->bitmap; | 646 | struct bitmap *bitmap = mddev->bitmap; |
575 | unsigned long flags; | 647 | unsigned long flags; |
576 | struct bio_list bl; | 648 | struct bio_list bl; |
649 | struct page **behind_pages = NULL; | ||
577 | 650 | ||
578 | if (unlikely(bio_barrier(bio))) { | 651 | if (unlikely(bio_barrier(bio))) { |
579 | bio_endio(bio, bio->bi_size, -EOPNOTSUPP); | 652 | bio_endio(bio, bio->bi_size, -EOPNOTSUPP); |
@@ -613,8 +686,6 @@ static int make_request(request_queue_t *q, struct bio * bio) | |||
613 | r1_bio->mddev = mddev; | 686 | r1_bio->mddev = mddev; |
614 | r1_bio->sector = bio->bi_sector; | 687 | r1_bio->sector = bio->bi_sector; |
615 | 688 | ||
616 | r1_bio->state = 0; | ||
617 | |||
618 | if (bio_data_dir(bio) == READ) { | 689 | if (bio_data_dir(bio) == READ) { |
619 | /* | 690 | /* |
620 | * read balancing logic: | 691 | * read balancing logic: |
@@ -675,13 +746,22 @@ static int make_request(request_queue_t *q, struct bio * bio) | |||
675 | } | 746 | } |
676 | rcu_read_unlock(); | 747 | rcu_read_unlock(); |
677 | 748 | ||
749 | BUG_ON(targets == 0); /* we never fail the last device */ | ||
750 | |||
678 | if (targets < conf->raid_disks) { | 751 | if (targets < conf->raid_disks) { |
679 | /* array is degraded, we will not clear the bitmap | 752 | /* array is degraded, we will not clear the bitmap |
680 | * on I/O completion (see raid1_end_write_request) */ | 753 | * on I/O completion (see raid1_end_write_request) */ |
681 | set_bit(R1BIO_Degraded, &r1_bio->state); | 754 | set_bit(R1BIO_Degraded, &r1_bio->state); |
682 | } | 755 | } |
683 | 756 | ||
757 | /* do behind I/O ? */ | ||
758 | if (bitmap && | ||
759 | atomic_read(&bitmap->behind_writes) < bitmap->max_write_behind && | ||
760 | (behind_pages = alloc_behind_pages(bio)) != NULL) | ||
761 | set_bit(R1BIO_BehindIO, &r1_bio->state); | ||
762 | |||
684 | atomic_set(&r1_bio->remaining, 0); | 763 | atomic_set(&r1_bio->remaining, 0); |
764 | atomic_set(&r1_bio->behind_remaining, 0); | ||
685 | 765 | ||
686 | bio_list_init(&bl); | 766 | bio_list_init(&bl); |
687 | for (i = 0; i < disks; i++) { | 767 | for (i = 0; i < disks; i++) { |
@@ -698,12 +778,31 @@ static int make_request(request_queue_t *q, struct bio * bio) | |||
698 | mbio->bi_rw = WRITE; | 778 | mbio->bi_rw = WRITE; |
699 | mbio->bi_private = r1_bio; | 779 | mbio->bi_private = r1_bio; |
700 | 780 | ||
781 | if (behind_pages) { | ||
782 | struct bio_vec *bvec; | ||
783 | int j; | ||
784 | |||
785 | /* Yes, I really want the '__' version so that | ||
786 | * we clear any unused pointer in the io_vec, rather | ||
787 | * than leave them unchanged. This is important | ||
788 | * because when we come to free the pages, we won't | ||
789 | * know the originial bi_idx, so we just free | ||
790 | * them all | ||
791 | */ | ||
792 | __bio_for_each_segment(bvec, mbio, j, 0) | ||
793 | bvec->bv_page = behind_pages[j]; | ||
794 | if (test_bit(WriteMostly, &conf->mirrors[i].rdev->flags)) | ||
795 | atomic_inc(&r1_bio->behind_remaining); | ||
796 | } | ||
797 | |||
701 | atomic_inc(&r1_bio->remaining); | 798 | atomic_inc(&r1_bio->remaining); |
702 | 799 | ||
703 | bio_list_add(&bl, mbio); | 800 | bio_list_add(&bl, mbio); |
704 | } | 801 | } |
802 | kfree(behind_pages); /* the behind pages are attached to the bios now */ | ||
705 | 803 | ||
706 | bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors); | 804 | bitmap_startwrite(bitmap, bio->bi_sector, r1_bio->sectors, |
805 | test_bit(R1BIO_BehindIO, &r1_bio->state)); | ||
707 | spin_lock_irqsave(&conf->device_lock, flags); | 806 | spin_lock_irqsave(&conf->device_lock, flags); |
708 | bio_list_merge(&conf->pending_bio_list, &bl); | 807 | bio_list_merge(&conf->pending_bio_list, &bl); |
709 | bio_list_init(&bl); | 808 | bio_list_init(&bl); |
@@ -1471,6 +1570,17 @@ out: | |||
1471 | static int stop(mddev_t *mddev) | 1570 | static int stop(mddev_t *mddev) |
1472 | { | 1571 | { |
1473 | conf_t *conf = mddev_to_conf(mddev); | 1572 | conf_t *conf = mddev_to_conf(mddev); |
1573 | struct bitmap *bitmap = mddev->bitmap; | ||
1574 | int behind_wait = 0; | ||
1575 | |||
1576 | /* wait for behind writes to complete */ | ||
1577 | while (bitmap && atomic_read(&bitmap->behind_writes) > 0) { | ||
1578 | behind_wait++; | ||
1579 | printk(KERN_INFO "raid1: behind writes in progress on device %s, waiting to stop (%d)\n", mdname(mddev), behind_wait); | ||
1580 | set_current_state(TASK_UNINTERRUPTIBLE); | ||
1581 | schedule_timeout(HZ); /* wait a second */ | ||
1582 | /* need to kick something here to make sure I/O goes? */ | ||
1583 | } | ||
1474 | 1584 | ||
1475 | md_unregister_thread(mddev->thread); | 1585 | md_unregister_thread(mddev->thread); |
1476 | mddev->thread = NULL; | 1586 | mddev->thread = NULL; |