author     Takashi Iwai <tiwai@suse.de>  2012-04-07 06:28:00 -0400
committer  Takashi Iwai <tiwai@suse.de>  2012-04-07 06:28:00 -0400
commit     c38f62b08d800104fa9b0e9d6e9141459986c06d (patch)
tree       1d04d768c8aa0c1a544d1f068317c7beb0101be2 /drivers/md
parent     250f32747e62cb415b85083e247184188f24e566 (diff)
parent     8abe05c6eb358967f16bce8a02c88d57c82cfbd6 (diff)
Merge tag 'asoc-3.4' of git://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound into for-linus
ASoC: fixes for 3.4
A bunch of driver-specific fixes and one generic fix for the new support
for platform DAPM contexts - we were picking the wrong default for the
idle_bias_off setting, which meant we weren't actually achieving any
useful runtime PM on platform devices.
Diffstat (limited to 'drivers/md')
41 files changed, 2424 insertions, 807 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index faa4741df6d3..10f122a3a856 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -277,8 +277,8 @@ config DM_MIRROR | |||
277 | needed for live data migration tools such as 'pvmove'. | 277 | needed for live data migration tools such as 'pvmove'. |
278 | 278 | ||
279 | config DM_RAID | 279 | config DM_RAID |
280 | tristate "RAID 1/4/5/6 target (EXPERIMENTAL)" | 280 | tristate "RAID 1/4/5/6 target" |
281 | depends on BLK_DEV_DM && EXPERIMENTAL | 281 | depends on BLK_DEV_DM |
282 | select MD_RAID1 | 282 | select MD_RAID1 |
283 | select MD_RAID456 | 283 | select MD_RAID456 |
284 | select BLK_DEV_MD | 284 | select BLK_DEV_MD |
@@ -359,8 +359,8 @@ config DM_DELAY | |||
359 | If unsure, say N. | 359 | If unsure, say N. |
360 | 360 | ||
361 | config DM_UEVENT | 361 | config DM_UEVENT |
362 | bool "DM uevents (EXPERIMENTAL)" | 362 | bool "DM uevents" |
363 | depends on BLK_DEV_DM && EXPERIMENTAL | 363 | depends on BLK_DEV_DM |
364 | ---help--- | 364 | ---help--- |
365 | Generate udev events for DM events. | 365 | Generate udev events for DM events. |
366 | 366 | ||
@@ -370,4 +370,24 @@ config DM_FLAKEY | |||
370 | ---help--- | 370 | ---help--- |
371 | A target that intermittently fails I/O for debugging purposes. | 371 | A target that intermittently fails I/O for debugging purposes. |
372 | 372 | ||
373 | config DM_VERITY | ||
374 | tristate "Verity target support (EXPERIMENTAL)" | ||
375 | depends on BLK_DEV_DM && EXPERIMENTAL | ||
376 | select CRYPTO | ||
377 | select CRYPTO_HASH | ||
378 | select DM_BUFIO | ||
379 | ---help--- | ||
380 | This device-mapper target creates a read-only device that | ||
381 | transparently validates the data on one underlying device against | ||
382 | a pre-generated tree of cryptographic checksums stored on a second | ||
383 | device. | ||
384 | |||
385 | You'll need to activate the digests you're going to use in the | ||
386 | cryptoapi configuration. | ||
387 | |||
388 | To compile this code as a module, choose M here: the module will | ||
389 | be called dm-verity. | ||
390 | |||
391 | If unsure, say N. | ||
392 | |||
373 | endif # MD | 393 | endif # MD |
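The new DM_VERITY help text above describes the core idea: every block read from the data device is hashed and compared against a pre-generated tree of digests whose root is trusted. The userspace C sketch below illustrates just that leaf-level check with a toy, non-cryptographic hash standing in for the real digests; it is not taken from the kernel sources, and a real verity tree also hashes the leaf digests again, level by level, up to the trusted root.

```c
/*
 * Illustrative userspace sketch of the check dm-verity's help text
 * describes: each data block is hashed and compared against a
 * pre-generated digest.  NOT the kernel implementation; the FNV-1a
 * "hash" below is a stand-in for a cryptographic digest such as SHA-256.
 */
#include <stdint.h>
#include <stdio.h>

#define BLOCK_SIZE 4096

static uint64_t toy_hash(const void *data, size_t len)
{
	const uint8_t *p = data;
	uint64_t h = 0xcbf29ce484222325ULL;	/* FNV-1a, illustration only */

	while (len--) {
		h ^= *p++;
		h *= 0x100000001b3ULL;
	}
	return h;
}

/* Verify one data block against its precomputed leaf digest. */
static int verify_block(const uint8_t block[BLOCK_SIZE],
			uint64_t expected_leaf_digest)
{
	return toy_hash(block, BLOCK_SIZE) == expected_leaf_digest ? 0 : -1;
}

int main(void)
{
	uint8_t block[BLOCK_SIZE] = { 0 };
	uint64_t leaf = toy_hash(block, BLOCK_SIZE);	/* "pre-generated" digest */

	printf("clean block:   %s\n", verify_block(block, leaf) ? "BAD" : "ok");
	block[123] ^= 1;				/* simulate on-disk corruption */
	printf("corrupt block: %s\n", verify_block(block, leaf) ? "BAD" : "ok");
	return 0;
}
```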
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 046860c7a166..8b2e0dffe82e 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -42,6 +42,7 @@ obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o | |||
42 | obj-$(CONFIG_DM_ZERO) += dm-zero.o | 42 | obj-$(CONFIG_DM_ZERO) += dm-zero.o |
43 | obj-$(CONFIG_DM_RAID) += dm-raid.o | 43 | obj-$(CONFIG_DM_RAID) += dm-raid.o |
44 | obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o | 44 | obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o |
45 | obj-$(CONFIG_DM_VERITY) += dm-verity.o | ||
45 | 46 | ||
46 | ifeq ($(CONFIG_DM_UEVENT),y) | 47 | ifeq ($(CONFIG_DM_UEVENT),y) |
47 | dm-mod-objs += dm-uevent.o | 48 | dm-mod-objs += dm-uevent.o |
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index cdf36b1e9aa6..3d0dfa7a89a2 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -26,6 +26,7 @@ | |||
26 | #include <linux/file.h> | 26 | #include <linux/file.h> |
27 | #include <linux/mount.h> | 27 | #include <linux/mount.h> |
28 | #include <linux/buffer_head.h> | 28 | #include <linux/buffer_head.h> |
29 | #include <linux/seq_file.h> | ||
29 | #include "md.h" | 30 | #include "md.h" |
30 | #include "bitmap.h" | 31 | #include "bitmap.h" |
31 | 32 | ||
@@ -35,31 +36,6 @@ static inline char *bmname(struct bitmap *bitmap) | |||
35 | } | 36 | } |
36 | 37 | ||
37 | /* | 38 | /* |
38 | * just a placeholder - calls kmalloc for bitmap pages | ||
39 | */ | ||
40 | static unsigned char *bitmap_alloc_page(struct bitmap *bitmap) | ||
41 | { | ||
42 | unsigned char *page; | ||
43 | |||
44 | page = kzalloc(PAGE_SIZE, GFP_NOIO); | ||
45 | if (!page) | ||
46 | printk("%s: bitmap_alloc_page FAILED\n", bmname(bitmap)); | ||
47 | else | ||
48 | pr_debug("%s: bitmap_alloc_page: allocated page at %p\n", | ||
49 | bmname(bitmap), page); | ||
50 | return page; | ||
51 | } | ||
52 | |||
53 | /* | ||
54 | * for now just a placeholder -- just calls kfree for bitmap pages | ||
55 | */ | ||
56 | static void bitmap_free_page(struct bitmap *bitmap, unsigned char *page) | ||
57 | { | ||
58 | pr_debug("%s: bitmap_free_page: free page %p\n", bmname(bitmap), page); | ||
59 | kfree(page); | ||
60 | } | ||
61 | |||
62 | /* | ||
63 | * check a page and, if necessary, allocate it (or hijack it if the alloc fails) | 39 | * check a page and, if necessary, allocate it (or hijack it if the alloc fails) |
64 | * | 40 | * |
65 | * 1) check to see if this page is allocated, if it's not then try to alloc | 41 | * 1) check to see if this page is allocated, if it's not then try to alloc |
@@ -96,7 +72,7 @@ __acquires(bitmap->lock) | |||
96 | /* this page has not been allocated yet */ | 72 | /* this page has not been allocated yet */ |
97 | 73 | ||
98 | spin_unlock_irq(&bitmap->lock); | 74 | spin_unlock_irq(&bitmap->lock); |
99 | mappage = bitmap_alloc_page(bitmap); | 75 | mappage = kzalloc(PAGE_SIZE, GFP_NOIO); |
100 | spin_lock_irq(&bitmap->lock); | 76 | spin_lock_irq(&bitmap->lock); |
101 | 77 | ||
102 | if (mappage == NULL) { | 78 | if (mappage == NULL) { |
@@ -109,7 +85,7 @@ __acquires(bitmap->lock) | |||
109 | } else if (bitmap->bp[page].map || | 85 | } else if (bitmap->bp[page].map || |
110 | bitmap->bp[page].hijacked) { | 86 | bitmap->bp[page].hijacked) { |
111 | /* somebody beat us to getting the page */ | 87 | /* somebody beat us to getting the page */ |
112 | bitmap_free_page(bitmap, mappage); | 88 | kfree(mappage); |
113 | return 0; | 89 | return 0; |
114 | } else { | 90 | } else { |
115 | 91 | ||
@@ -141,7 +117,7 @@ static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page) | |||
141 | ptr = bitmap->bp[page].map; | 117 | ptr = bitmap->bp[page].map; |
142 | bitmap->bp[page].map = NULL; | 118 | bitmap->bp[page].map = NULL; |
143 | bitmap->missing_pages++; | 119 | bitmap->missing_pages++; |
144 | bitmap_free_page(bitmap, ptr); | 120 | kfree(ptr); |
145 | } | 121 | } |
146 | } | 122 | } |
147 | 123 | ||
@@ -171,7 +147,7 @@ static struct page *read_sb_page(struct mddev *mddev, loff_t offset, | |||
171 | did_alloc = 1; | 147 | did_alloc = 1; |
172 | } | 148 | } |
173 | 149 | ||
174 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 150 | rdev_for_each(rdev, mddev) { |
175 | if (! test_bit(In_sync, &rdev->flags) | 151 | if (! test_bit(In_sync, &rdev->flags) |
176 | || test_bit(Faulty, &rdev->flags)) | 152 | || test_bit(Faulty, &rdev->flags)) |
177 | continue; | 153 | continue; |
@@ -445,19 +421,14 @@ out: | |||
445 | void bitmap_update_sb(struct bitmap *bitmap) | 421 | void bitmap_update_sb(struct bitmap *bitmap) |
446 | { | 422 | { |
447 | bitmap_super_t *sb; | 423 | bitmap_super_t *sb; |
448 | unsigned long flags; | ||
449 | 424 | ||
450 | if (!bitmap || !bitmap->mddev) /* no bitmap for this array */ | 425 | if (!bitmap || !bitmap->mddev) /* no bitmap for this array */ |
451 | return; | 426 | return; |
452 | if (bitmap->mddev->bitmap_info.external) | 427 | if (bitmap->mddev->bitmap_info.external) |
453 | return; | 428 | return; |
454 | spin_lock_irqsave(&bitmap->lock, flags); | 429 | if (!bitmap->sb_page) /* no superblock */ |
455 | if (!bitmap->sb_page) { /* no superblock */ | ||
456 | spin_unlock_irqrestore(&bitmap->lock, flags); | ||
457 | return; | 430 | return; |
458 | } | 431 | sb = kmap_atomic(bitmap->sb_page); |
459 | spin_unlock_irqrestore(&bitmap->lock, flags); | ||
460 | sb = kmap_atomic(bitmap->sb_page, KM_USER0); | ||
461 | sb->events = cpu_to_le64(bitmap->mddev->events); | 432 | sb->events = cpu_to_le64(bitmap->mddev->events); |
462 | if (bitmap->mddev->events < bitmap->events_cleared) | 433 | if (bitmap->mddev->events < bitmap->events_cleared) |
463 | /* rocking back to read-only */ | 434 | /* rocking back to read-only */ |
@@ -467,7 +438,7 @@ void bitmap_update_sb(struct bitmap *bitmap) | |||
467 | /* Just in case these have been changed via sysfs: */ | 438 | /* Just in case these have been changed via sysfs: */ |
468 | sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ); | 439 | sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ); |
469 | sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind); | 440 | sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind); |
470 | kunmap_atomic(sb, KM_USER0); | 441 | kunmap_atomic(sb); |
471 | write_page(bitmap, bitmap->sb_page, 1); | 442 | write_page(bitmap, bitmap->sb_page, 1); |
472 | } | 443 | } |
473 | 444 | ||
@@ -478,7 +449,7 @@ void bitmap_print_sb(struct bitmap *bitmap) | |||
478 | 449 | ||
479 | if (!bitmap || !bitmap->sb_page) | 450 | if (!bitmap || !bitmap->sb_page) |
480 | return; | 451 | return; |
481 | sb = kmap_atomic(bitmap->sb_page, KM_USER0); | 452 | sb = kmap_atomic(bitmap->sb_page); |
482 | printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap)); | 453 | printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap)); |
483 | printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic)); | 454 | printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic)); |
484 | printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version)); | 455 | printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version)); |
@@ -497,7 +468,7 @@ void bitmap_print_sb(struct bitmap *bitmap) | |||
497 | printk(KERN_DEBUG " sync size: %llu KB\n", | 468 | printk(KERN_DEBUG " sync size: %llu KB\n", |
498 | (unsigned long long)le64_to_cpu(sb->sync_size)/2); | 469 | (unsigned long long)le64_to_cpu(sb->sync_size)/2); |
499 | printk(KERN_DEBUG "max write behind: %d\n", le32_to_cpu(sb->write_behind)); | 470 | printk(KERN_DEBUG "max write behind: %d\n", le32_to_cpu(sb->write_behind)); |
500 | kunmap_atomic(sb, KM_USER0); | 471 | kunmap_atomic(sb); |
501 | } | 472 | } |
502 | 473 | ||
503 | /* | 474 | /* |
@@ -525,7 +496,7 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap) | |||
525 | } | 496 | } |
526 | bitmap->sb_page->index = 0; | 497 | bitmap->sb_page->index = 0; |
527 | 498 | ||
528 | sb = kmap_atomic(bitmap->sb_page, KM_USER0); | 499 | sb = kmap_atomic(bitmap->sb_page); |
529 | 500 | ||
530 | sb->magic = cpu_to_le32(BITMAP_MAGIC); | 501 | sb->magic = cpu_to_le32(BITMAP_MAGIC); |
531 | sb->version = cpu_to_le32(BITMAP_MAJOR_HI); | 502 | sb->version = cpu_to_le32(BITMAP_MAJOR_HI); |
@@ -533,7 +504,7 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap) | |||
533 | chunksize = bitmap->mddev->bitmap_info.chunksize; | 504 | chunksize = bitmap->mddev->bitmap_info.chunksize; |
534 | BUG_ON(!chunksize); | 505 | BUG_ON(!chunksize); |
535 | if (!is_power_of_2(chunksize)) { | 506 | if (!is_power_of_2(chunksize)) { |
536 | kunmap_atomic(sb, KM_USER0); | 507 | kunmap_atomic(sb); |
537 | printk(KERN_ERR "bitmap chunksize not a power of 2\n"); | 508 | printk(KERN_ERR "bitmap chunksize not a power of 2\n"); |
538 | return -EINVAL; | 509 | return -EINVAL; |
539 | } | 510 | } |
@@ -571,7 +542,7 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap) | |||
571 | bitmap->flags |= BITMAP_HOSTENDIAN; | 542 | bitmap->flags |= BITMAP_HOSTENDIAN; |
572 | sb->version = cpu_to_le32(BITMAP_MAJOR_HOSTENDIAN); | 543 | sb->version = cpu_to_le32(BITMAP_MAJOR_HOSTENDIAN); |
573 | 544 | ||
574 | kunmap_atomic(sb, KM_USER0); | 545 | kunmap_atomic(sb); |
575 | 546 | ||
576 | return 0; | 547 | return 0; |
577 | } | 548 | } |
@@ -603,7 +574,7 @@ static int bitmap_read_sb(struct bitmap *bitmap) | |||
603 | return err; | 574 | return err; |
604 | } | 575 | } |
605 | 576 | ||
606 | sb = kmap_atomic(bitmap->sb_page, KM_USER0); | 577 | sb = kmap_atomic(bitmap->sb_page); |
607 | 578 | ||
608 | chunksize = le32_to_cpu(sb->chunksize); | 579 | chunksize = le32_to_cpu(sb->chunksize); |
609 | daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; | 580 | daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; |
@@ -632,26 +603,28 @@ static int bitmap_read_sb(struct bitmap *bitmap) | |||
632 | /* keep the array size field of the bitmap superblock up to date */ | 603 | /* keep the array size field of the bitmap superblock up to date */ |
633 | sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors); | 604 | sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors); |
634 | 605 | ||
635 | if (!bitmap->mddev->persistent) | 606 | if (bitmap->mddev->persistent) { |
636 | goto success; | 607 | /* |
637 | 608 | * We have a persistent array superblock, so compare the | |
638 | /* | 609 | * bitmap's UUID and event counter to the mddev's |
639 | * if we have a persistent array superblock, compare the | 610 | */ |
640 | * bitmap's UUID and event counter to the mddev's | 611 | if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) { |
641 | */ | 612 | printk(KERN_INFO |
642 | if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) { | 613 | "%s: bitmap superblock UUID mismatch\n", |
643 | printk(KERN_INFO "%s: bitmap superblock UUID mismatch\n", | 614 | bmname(bitmap)); |
644 | bmname(bitmap)); | 615 | goto out; |
645 | goto out; | 616 | } |
646 | } | 617 | events = le64_to_cpu(sb->events); |
647 | events = le64_to_cpu(sb->events); | 618 | if (events < bitmap->mddev->events) { |
648 | if (events < bitmap->mddev->events) { | 619 | printk(KERN_INFO |
649 | printk(KERN_INFO "%s: bitmap file is out of date (%llu < %llu) " | 620 | "%s: bitmap file is out of date (%llu < %llu) " |
650 | "-- forcing full recovery\n", bmname(bitmap), events, | 621 | "-- forcing full recovery\n", |
651 | (unsigned long long) bitmap->mddev->events); | 622 | bmname(bitmap), events, |
652 | sb->state |= cpu_to_le32(BITMAP_STALE); | 623 | (unsigned long long) bitmap->mddev->events); |
624 | sb->state |= cpu_to_le32(BITMAP_STALE); | ||
625 | } | ||
653 | } | 626 | } |
654 | success: | 627 | |
655 | /* assign fields using values from superblock */ | 628 | /* assign fields using values from superblock */ |
656 | bitmap->mddev->bitmap_info.chunksize = chunksize; | 629 | bitmap->mddev->bitmap_info.chunksize = chunksize; |
657 | bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; | 630 | bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; |
@@ -664,7 +637,7 @@ success: | |||
664 | bitmap->events_cleared = bitmap->mddev->events; | 637 | bitmap->events_cleared = bitmap->mddev->events; |
665 | err = 0; | 638 | err = 0; |
666 | out: | 639 | out: |
667 | kunmap_atomic(sb, KM_USER0); | 640 | kunmap_atomic(sb); |
668 | if (err) | 641 | if (err) |
669 | bitmap_print_sb(bitmap); | 642 | bitmap_print_sb(bitmap); |
670 | return err; | 643 | return err; |
@@ -680,16 +653,11 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits, | |||
680 | enum bitmap_mask_op op) | 653 | enum bitmap_mask_op op) |
681 | { | 654 | { |
682 | bitmap_super_t *sb; | 655 | bitmap_super_t *sb; |
683 | unsigned long flags; | ||
684 | int old; | 656 | int old; |
685 | 657 | ||
686 | spin_lock_irqsave(&bitmap->lock, flags); | 658 | if (!bitmap->sb_page) /* can't set the state */ |
687 | if (!bitmap->sb_page) { /* can't set the state */ | ||
688 | spin_unlock_irqrestore(&bitmap->lock, flags); | ||
689 | return 0; | 659 | return 0; |
690 | } | 660 | sb = kmap_atomic(bitmap->sb_page); |
691 | spin_unlock_irqrestore(&bitmap->lock, flags); | ||
692 | sb = kmap_atomic(bitmap->sb_page, KM_USER0); | ||
693 | old = le32_to_cpu(sb->state) & bits; | 661 | old = le32_to_cpu(sb->state) & bits; |
694 | switch (op) { | 662 | switch (op) { |
695 | case MASK_SET: | 663 | case MASK_SET: |
@@ -703,7 +671,7 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits, | |||
703 | default: | 671 | default: |
704 | BUG(); | 672 | BUG(); |
705 | } | 673 | } |
706 | kunmap_atomic(sb, KM_USER0); | 674 | kunmap_atomic(sb); |
707 | return old; | 675 | return old; |
708 | } | 676 | } |
709 | 677 | ||
@@ -870,7 +838,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) | |||
870 | unsigned long bit; | 838 | unsigned long bit; |
871 | struct page *page; | 839 | struct page *page; |
872 | void *kaddr; | 840 | void *kaddr; |
873 | unsigned long chunk = block >> CHUNK_BLOCK_SHIFT(bitmap); | 841 | unsigned long chunk = block >> bitmap->chunkshift; |
874 | 842 | ||
875 | if (!bitmap->filemap) | 843 | if (!bitmap->filemap) |
876 | return; | 844 | return; |
@@ -881,12 +849,12 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block) | |||
881 | bit = file_page_offset(bitmap, chunk); | 849 | bit = file_page_offset(bitmap, chunk); |
882 | 850 | ||
883 | /* set the bit */ | 851 | /* set the bit */ |
884 | kaddr = kmap_atomic(page, KM_USER0); | 852 | kaddr = kmap_atomic(page); |
885 | if (bitmap->flags & BITMAP_HOSTENDIAN) | 853 | if (bitmap->flags & BITMAP_HOSTENDIAN) |
886 | set_bit(bit, kaddr); | 854 | set_bit(bit, kaddr); |
887 | else | 855 | else |
888 | __set_bit_le(bit, kaddr); | 856 | __set_bit_le(bit, kaddr); |
889 | kunmap_atomic(kaddr, KM_USER0); | 857 | kunmap_atomic(kaddr); |
890 | pr_debug("set file bit %lu page %lu\n", bit, page->index); | 858 | pr_debug("set file bit %lu page %lu\n", bit, page->index); |
891 | /* record page number so it gets flushed to disk when unplug occurs */ | 859 | /* record page number so it gets flushed to disk when unplug occurs */ |
892 | set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); | 860 | set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); |
@@ -1050,10 +1018,10 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) | |||
1050 | * if bitmap is out of date, dirty the | 1018 | * if bitmap is out of date, dirty the |
1051 | * whole page and write it out | 1019 | * whole page and write it out |
1052 | */ | 1020 | */ |
1053 | paddr = kmap_atomic(page, KM_USER0); | 1021 | paddr = kmap_atomic(page); |
1054 | memset(paddr + offset, 0xff, | 1022 | memset(paddr + offset, 0xff, |
1055 | PAGE_SIZE - offset); | 1023 | PAGE_SIZE - offset); |
1056 | kunmap_atomic(paddr, KM_USER0); | 1024 | kunmap_atomic(paddr); |
1057 | write_page(bitmap, page, 1); | 1025 | write_page(bitmap, page, 1); |
1058 | 1026 | ||
1059 | ret = -EIO; | 1027 | ret = -EIO; |
@@ -1061,18 +1029,18 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start) | |||
1061 | goto err; | 1029 | goto err; |
1062 | } | 1030 | } |
1063 | } | 1031 | } |
1064 | paddr = kmap_atomic(page, KM_USER0); | 1032 | paddr = kmap_atomic(page); |
1065 | if (bitmap->flags & BITMAP_HOSTENDIAN) | 1033 | if (bitmap->flags & BITMAP_HOSTENDIAN) |
1066 | b = test_bit(bit, paddr); | 1034 | b = test_bit(bit, paddr); |
1067 | else | 1035 | else |
1068 | b = test_bit_le(bit, paddr); | 1036 | b = test_bit_le(bit, paddr); |
1069 | kunmap_atomic(paddr, KM_USER0); | 1037 | kunmap_atomic(paddr); |
1070 | if (b) { | 1038 | if (b) { |
1071 | /* if the disk bit is set, set the memory bit */ | 1039 | /* if the disk bit is set, set the memory bit */ |
1072 | int needed = ((sector_t)(i+1) << (CHUNK_BLOCK_SHIFT(bitmap)) | 1040 | int needed = ((sector_t)(i+1) << bitmap->chunkshift |
1073 | >= start); | 1041 | >= start); |
1074 | bitmap_set_memory_bits(bitmap, | 1042 | bitmap_set_memory_bits(bitmap, |
1075 | (sector_t)i << CHUNK_BLOCK_SHIFT(bitmap), | 1043 | (sector_t)i << bitmap->chunkshift, |
1076 | needed); | 1044 | needed); |
1077 | bit_cnt++; | 1045 | bit_cnt++; |
1078 | } | 1046 | } |
@@ -1116,7 +1084,7 @@ void bitmap_write_all(struct bitmap *bitmap) | |||
1116 | 1084 | ||
1117 | static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc) | 1085 | static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc) |
1118 | { | 1086 | { |
1119 | sector_t chunk = offset >> CHUNK_BLOCK_SHIFT(bitmap); | 1087 | sector_t chunk = offset >> bitmap->chunkshift; |
1120 | unsigned long page = chunk >> PAGE_COUNTER_SHIFT; | 1088 | unsigned long page = chunk >> PAGE_COUNTER_SHIFT; |
1121 | bitmap->bp[page].count += inc; | 1089 | bitmap->bp[page].count += inc; |
1122 | bitmap_checkfree(bitmap, page); | 1090 | bitmap_checkfree(bitmap, page); |
@@ -1209,10 +1177,10 @@ void bitmap_daemon_work(struct mddev *mddev) | |||
1209 | mddev->bitmap_info.external == 0) { | 1177 | mddev->bitmap_info.external == 0) { |
1210 | bitmap_super_t *sb; | 1178 | bitmap_super_t *sb; |
1211 | bitmap->need_sync = 0; | 1179 | bitmap->need_sync = 0; |
1212 | sb = kmap_atomic(bitmap->sb_page, KM_USER0); | 1180 | sb = kmap_atomic(bitmap->sb_page); |
1213 | sb->events_cleared = | 1181 | sb->events_cleared = |
1214 | cpu_to_le64(bitmap->events_cleared); | 1182 | cpu_to_le64(bitmap->events_cleared); |
1215 | kunmap_atomic(sb, KM_USER0); | 1183 | kunmap_atomic(sb); |
1216 | write_page(bitmap, bitmap->sb_page, 1); | 1184 | write_page(bitmap, bitmap->sb_page, 1); |
1217 | } | 1185 | } |
1218 | spin_lock_irqsave(&bitmap->lock, flags); | 1186 | spin_lock_irqsave(&bitmap->lock, flags); |
@@ -1222,7 +1190,7 @@ void bitmap_daemon_work(struct mddev *mddev) | |||
1222 | bitmap->allclean = 0; | 1190 | bitmap->allclean = 0; |
1223 | } | 1191 | } |
1224 | bmc = bitmap_get_counter(bitmap, | 1192 | bmc = bitmap_get_counter(bitmap, |
1225 | (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap), | 1193 | (sector_t)j << bitmap->chunkshift, |
1226 | &blocks, 0); | 1194 | &blocks, 0); |
1227 | if (!bmc) | 1195 | if (!bmc) |
1228 | j |= PAGE_COUNTER_MASK; | 1196 | j |= PAGE_COUNTER_MASK; |
@@ -1231,11 +1199,11 @@ void bitmap_daemon_work(struct mddev *mddev) | |||
1231 | /* we can clear the bit */ | 1199 | /* we can clear the bit */ |
1232 | *bmc = 0; | 1200 | *bmc = 0; |
1233 | bitmap_count_page(bitmap, | 1201 | bitmap_count_page(bitmap, |
1234 | (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap), | 1202 | (sector_t)j << bitmap->chunkshift, |
1235 | -1); | 1203 | -1); |
1236 | 1204 | ||
1237 | /* clear the bit */ | 1205 | /* clear the bit */ |
1238 | paddr = kmap_atomic(page, KM_USER0); | 1206 | paddr = kmap_atomic(page); |
1239 | if (bitmap->flags & BITMAP_HOSTENDIAN) | 1207 | if (bitmap->flags & BITMAP_HOSTENDIAN) |
1240 | clear_bit(file_page_offset(bitmap, j), | 1208 | clear_bit(file_page_offset(bitmap, j), |
1241 | paddr); | 1209 | paddr); |
@@ -1244,7 +1212,7 @@ void bitmap_daemon_work(struct mddev *mddev) | |||
1244 | file_page_offset(bitmap, | 1212 | file_page_offset(bitmap, |
1245 | j), | 1213 | j), |
1246 | paddr); | 1214 | paddr); |
1247 | kunmap_atomic(paddr, KM_USER0); | 1215 | kunmap_atomic(paddr); |
1248 | } else if (*bmc <= 2) { | 1216 | } else if (*bmc <= 2) { |
1249 | *bmc = 1; /* maybe clear the bit next time */ | 1217 | *bmc = 1; /* maybe clear the bit next time */ |
1250 | set_page_attr(bitmap, page, BITMAP_PAGE_PENDING); | 1218 | set_page_attr(bitmap, page, BITMAP_PAGE_PENDING); |
@@ -1285,7 +1253,7 @@ __acquires(bitmap->lock) | |||
1285 | * The lock must have been taken with interrupts enabled. | 1253 | * The lock must have been taken with interrupts enabled. |
1286 | * If !create, we don't release the lock. | 1254 | * If !create, we don't release the lock. |
1287 | */ | 1255 | */ |
1288 | sector_t chunk = offset >> CHUNK_BLOCK_SHIFT(bitmap); | 1256 | sector_t chunk = offset >> bitmap->chunkshift; |
1289 | unsigned long page = chunk >> PAGE_COUNTER_SHIFT; | 1257 | unsigned long page = chunk >> PAGE_COUNTER_SHIFT; |
1290 | unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT; | 1258 | unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT; |
1291 | sector_t csize; | 1259 | sector_t csize; |
@@ -1295,10 +1263,10 @@ __acquires(bitmap->lock) | |||
1295 | 1263 | ||
1296 | if (bitmap->bp[page].hijacked || | 1264 | if (bitmap->bp[page].hijacked || |
1297 | bitmap->bp[page].map == NULL) | 1265 | bitmap->bp[page].map == NULL) |
1298 | csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap) + | 1266 | csize = ((sector_t)1) << (bitmap->chunkshift + |
1299 | PAGE_COUNTER_SHIFT - 1); | 1267 | PAGE_COUNTER_SHIFT - 1); |
1300 | else | 1268 | else |
1301 | csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap)); | 1269 | csize = ((sector_t)1) << bitmap->chunkshift; |
1302 | *blocks = csize - (offset & (csize - 1)); | 1270 | *blocks = csize - (offset & (csize - 1)); |
1303 | 1271 | ||
1304 | if (err < 0) | 1272 | if (err < 0) |
@@ -1424,7 +1392,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto | |||
1424 | set_page_attr(bitmap, | 1392 | set_page_attr(bitmap, |
1425 | filemap_get_page( | 1393 | filemap_get_page( |
1426 | bitmap, | 1394 | bitmap, |
1427 | offset >> CHUNK_BLOCK_SHIFT(bitmap)), | 1395 | offset >> bitmap->chunkshift), |
1428 | BITMAP_PAGE_PENDING); | 1396 | BITMAP_PAGE_PENDING); |
1429 | bitmap->allclean = 0; | 1397 | bitmap->allclean = 0; |
1430 | } | 1398 | } |
@@ -1512,7 +1480,7 @@ void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, i | |||
1512 | else { | 1480 | else { |
1513 | if (*bmc <= 2) { | 1481 | if (*bmc <= 2) { |
1514 | set_page_attr(bitmap, | 1482 | set_page_attr(bitmap, |
1515 | filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)), | 1483 | filemap_get_page(bitmap, offset >> bitmap->chunkshift), |
1516 | BITMAP_PAGE_PENDING); | 1484 | BITMAP_PAGE_PENDING); |
1517 | bitmap->allclean = 0; | 1485 | bitmap->allclean = 0; |
1518 | } | 1486 | } |
@@ -1559,7 +1527,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector) | |||
1559 | 1527 | ||
1560 | bitmap->mddev->curr_resync_completed = sector; | 1528 | bitmap->mddev->curr_resync_completed = sector; |
1561 | set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); | 1529 | set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); |
1562 | sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1); | 1530 | sector &= ~((1ULL << bitmap->chunkshift) - 1); |
1563 | s = 0; | 1531 | s = 0; |
1564 | while (s < sector && s < bitmap->mddev->resync_max_sectors) { | 1532 | while (s < sector && s < bitmap->mddev->resync_max_sectors) { |
1565 | bitmap_end_sync(bitmap, s, &blocks, 0); | 1533 | bitmap_end_sync(bitmap, s, &blocks, 0); |
@@ -1589,7 +1557,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n | |||
1589 | struct page *page; | 1557 | struct page *page; |
1590 | *bmc = 2 | (needed ? NEEDED_MASK : 0); | 1558 | *bmc = 2 | (needed ? NEEDED_MASK : 0); |
1591 | bitmap_count_page(bitmap, offset, 1); | 1559 | bitmap_count_page(bitmap, offset, 1); |
1592 | page = filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)); | 1560 | page = filemap_get_page(bitmap, offset >> bitmap->chunkshift); |
1593 | set_page_attr(bitmap, page, BITMAP_PAGE_PENDING); | 1561 | set_page_attr(bitmap, page, BITMAP_PAGE_PENDING); |
1594 | bitmap->allclean = 0; | 1562 | bitmap->allclean = 0; |
1595 | } | 1563 | } |
@@ -1602,7 +1570,7 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e) | |||
1602 | unsigned long chunk; | 1570 | unsigned long chunk; |
1603 | 1571 | ||
1604 | for (chunk = s; chunk <= e; chunk++) { | 1572 | for (chunk = s; chunk <= e; chunk++) { |
1605 | sector_t sec = (sector_t)chunk << CHUNK_BLOCK_SHIFT(bitmap); | 1573 | sector_t sec = (sector_t)chunk << bitmap->chunkshift; |
1606 | bitmap_set_memory_bits(bitmap, sec, 1); | 1574 | bitmap_set_memory_bits(bitmap, sec, 1); |
1607 | spin_lock_irq(&bitmap->lock); | 1575 | spin_lock_irq(&bitmap->lock); |
1608 | bitmap_file_set_bit(bitmap, sec); | 1576 | bitmap_file_set_bit(bitmap, sec); |
@@ -1759,11 +1727,12 @@ int bitmap_create(struct mddev *mddev) | |||
1759 | goto error; | 1727 | goto error; |
1760 | 1728 | ||
1761 | bitmap->daemon_lastrun = jiffies; | 1729 | bitmap->daemon_lastrun = jiffies; |
1762 | bitmap->chunkshift = ffz(~mddev->bitmap_info.chunksize); | 1730 | bitmap->chunkshift = (ffz(~mddev->bitmap_info.chunksize) |
1731 | - BITMAP_BLOCK_SHIFT); | ||
1763 | 1732 | ||
1764 | /* now that chunksize and chunkshift are set, we can use these macros */ | 1733 | /* now that chunksize and chunkshift are set, we can use these macros */ |
1765 | chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) >> | 1734 | chunks = (blocks + bitmap->chunkshift - 1) >> |
1766 | CHUNK_BLOCK_SHIFT(bitmap); | 1735 | bitmap->chunkshift; |
1767 | pages = (chunks + PAGE_COUNTER_RATIO - 1) / PAGE_COUNTER_RATIO; | 1736 | pages = (chunks + PAGE_COUNTER_RATIO - 1) / PAGE_COUNTER_RATIO; |
1768 | 1737 | ||
1769 | BUG_ON(!pages); | 1738 | BUG_ON(!pages); |
@@ -1836,6 +1805,33 @@ out: | |||
1836 | } | 1805 | } |
1837 | EXPORT_SYMBOL_GPL(bitmap_load); | 1806 | EXPORT_SYMBOL_GPL(bitmap_load); |
1838 | 1807 | ||
1808 | void bitmap_status(struct seq_file *seq, struct bitmap *bitmap) | ||
1809 | { | ||
1810 | unsigned long chunk_kb; | ||
1811 | unsigned long flags; | ||
1812 | |||
1813 | if (!bitmap) | ||
1814 | return; | ||
1815 | |||
1816 | spin_lock_irqsave(&bitmap->lock, flags); | ||
1817 | chunk_kb = bitmap->mddev->bitmap_info.chunksize >> 10; | ||
1818 | seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " | ||
1819 | "%lu%s chunk", | ||
1820 | bitmap->pages - bitmap->missing_pages, | ||
1821 | bitmap->pages, | ||
1822 | (bitmap->pages - bitmap->missing_pages) | ||
1823 | << (PAGE_SHIFT - 10), | ||
1824 | chunk_kb ? chunk_kb : bitmap->mddev->bitmap_info.chunksize, | ||
1825 | chunk_kb ? "KB" : "B"); | ||
1826 | if (bitmap->file) { | ||
1827 | seq_printf(seq, ", file: "); | ||
1828 | seq_path(seq, &bitmap->file->f_path, " \t\n"); | ||
1829 | } | ||
1830 | |||
1831 | seq_printf(seq, "\n"); | ||
1832 | spin_unlock_irqrestore(&bitmap->lock, flags); | ||
1833 | } | ||
1834 | |||
1839 | static ssize_t | 1835 | static ssize_t |
1840 | location_show(struct mddev *mddev, char *page) | 1836 | location_show(struct mddev *mddev, char *page) |
1841 | { | 1837 | { |
@@ -1904,6 +1900,8 @@ location_store(struct mddev *mddev, const char *buf, size_t len) | |||
1904 | if (mddev->pers) { | 1900 | if (mddev->pers) { |
1905 | mddev->pers->quiesce(mddev, 1); | 1901 | mddev->pers->quiesce(mddev, 1); |
1906 | rv = bitmap_create(mddev); | 1902 | rv = bitmap_create(mddev); |
1903 | if (!rv) | ||
1904 | rv = bitmap_load(mddev); | ||
1907 | if (rv) { | 1905 | if (rv) { |
1908 | bitmap_destroy(mddev); | 1906 | bitmap_destroy(mddev); |
1909 | mddev->bitmap_info.offset = 0; | 1907 | mddev->bitmap_info.offset = 0; |
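A large share of the bitmap.c churn above is mechanical: the KM_USER0 slot argument is dropped from kmap_atomic()/kunmap_atomic(), whose newer form manages the per-CPU slot implicitly. A minimal kernel-style sketch of the before/after calling convention (illustration only, not buildable outside a kernel tree):

```c
/* Sketch of the kmap_atomic() convention change seen throughout this diff. */
#include <linux/highmem.h>

static void touch_first_byte(struct page *page)
{
	void *addr;

	/* Old style:
	 *	addr = kmap_atomic(page, KM_USER0);
	 *	...
	 *	kunmap_atomic(addr, KM_USER0);
	 */

	/* New style: the atomic-kmap slot is managed implicitly. */
	addr = kmap_atomic(page);
	*(char *)addr = 0;
	kunmap_atomic(addr);
}
```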
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index a15436dd9b3e..55ca5aec84e4 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -13,8 +13,6 @@ | |||
13 | #define BITMAP_MAJOR_HI 4 | 13 | #define BITMAP_MAJOR_HI 4 |
14 | #define BITMAP_MAJOR_HOSTENDIAN 3 | 14 | #define BITMAP_MAJOR_HOSTENDIAN 3 |
15 | 15 | ||
16 | #define BITMAP_MINOR 39 | ||
17 | |||
18 | /* | 16 | /* |
19 | * in-memory bitmap: | 17 | * in-memory bitmap: |
20 | * | 18 | * |
@@ -101,21 +99,10 @@ typedef __u16 bitmap_counter_t; | |||
101 | /* same, except a mask value for more efficient bitops */ | 99 | /* same, except a mask value for more efficient bitops */ |
102 | #define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1) | 100 | #define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1) |
103 | 101 | ||
104 | #define BITMAP_BLOCK_SIZE 512 | ||
105 | #define BITMAP_BLOCK_SHIFT 9 | 102 | #define BITMAP_BLOCK_SHIFT 9 |
106 | 103 | ||
107 | /* how many blocks per chunk? (this is variable) */ | 104 | /* how many blocks per chunk? (this is variable) */ |
108 | #define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->mddev->bitmap_info.chunksize >> BITMAP_BLOCK_SHIFT) | 105 | #define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->mddev->bitmap_info.chunksize >> BITMAP_BLOCK_SHIFT) |
109 | #define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT) | ||
110 | #define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1) | ||
111 | |||
112 | /* when hijacked, the counters and bits represent even larger "chunks" */ | ||
113 | /* there will be 1024 chunks represented by each counter in the page pointers */ | ||
114 | #define PAGEPTR_BLOCK_RATIO(bitmap) \ | ||
115 | (CHUNK_BLOCK_RATIO(bitmap) << PAGE_COUNTER_SHIFT >> 1) | ||
116 | #define PAGEPTR_BLOCK_SHIFT(bitmap) \ | ||
117 | (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1) | ||
118 | #define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1) | ||
119 | 106 | ||
120 | #endif | 107 | #endif |
121 | 108 | ||
@@ -181,12 +168,6 @@ struct bitmap_page { | |||
181 | unsigned int count:31; | 168 | unsigned int count:31; |
182 | }; | 169 | }; |
183 | 170 | ||
184 | /* keep track of bitmap file pages that have pending writes on them */ | ||
185 | struct page_list { | ||
186 | struct list_head list; | ||
187 | struct page *page; | ||
188 | }; | ||
189 | |||
190 | /* the main bitmap structure - one per mddev */ | 171 | /* the main bitmap structure - one per mddev */ |
191 | struct bitmap { | 172 | struct bitmap { |
192 | struct bitmap_page *bp; | 173 | struct bitmap_page *bp; |
@@ -196,7 +177,7 @@ struct bitmap { | |||
196 | struct mddev *mddev; /* the md device that the bitmap is for */ | 177 | struct mddev *mddev; /* the md device that the bitmap is for */ |
197 | 178 | ||
198 | /* bitmap chunksize -- how much data does each bit represent? */ | 179 | /* bitmap chunksize -- how much data does each bit represent? */ |
199 | unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */ | 180 | unsigned long chunkshift; /* chunksize = 2^(chunkshift+9) (for bitops) */ |
200 | unsigned long chunks; /* total number of data chunks for the array */ | 181 | unsigned long chunks; /* total number of data chunks for the array */ |
201 | 182 | ||
202 | __u64 events_cleared; | 183 | __u64 events_cleared; |
@@ -245,6 +226,7 @@ void bitmap_destroy(struct mddev *mddev); | |||
245 | 226 | ||
246 | void bitmap_print_sb(struct bitmap *bitmap); | 227 | void bitmap_print_sb(struct bitmap *bitmap); |
247 | void bitmap_update_sb(struct bitmap *bitmap); | 228 | void bitmap_update_sb(struct bitmap *bitmap); |
229 | void bitmap_status(struct seq_file *seq, struct bitmap *bitmap); | ||
248 | 230 | ||
249 | int bitmap_setallbits(struct bitmap *bitmap); | 231 | int bitmap_setallbits(struct bitmap *bitmap); |
250 | void bitmap_write_all(struct bitmap *bitmap); | 232 | void bitmap_write_all(struct bitmap *bitmap); |
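bitmap.h can drop CHUNK_BLOCK_SHIFT() because bitmap->chunkshift is now stored directly in 512-byte block units (chunksize = 2^(chunkshift + 9)), so a sector offset becomes a chunk index with a plain `offset >> bitmap->chunkshift`. A small userspace sketch of the arithmetic, assuming an illustrative 64 KiB chunk size:

```c
/* Userspace sketch of the chunkshift bookkeeping after this change. */
#include <stdio.h>

#define BITMAP_BLOCK_SHIFT 9			/* 512-byte blocks */

int main(void)
{
	unsigned long chunksize = 64 * 1024;	/* bytes, illustrative */
	/* ffz(~x) on a power of two is the same as counting trailing zeros */
	unsigned long byte_shift = __builtin_ctzl(chunksize);
	unsigned long chunkshift = byte_shift - BITMAP_BLOCK_SHIFT;

	unsigned long long offset = 300000;	/* sector offset, 512-byte units */
	/* Old code: offset >> CHUNK_BLOCK_SHIFT(bitmap)
	 *         = offset >> (byte_shift - BITMAP_BLOCK_SHIFT).
	 * New code stores that difference in chunkshift up front. */
	unsigned long long chunk = offset >> chunkshift;

	printf("chunkshift=%lu  sector %llu -> chunk %llu\n",
	       chunkshift, offset, chunk);
	return 0;
}
```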
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 0a6806f80ab5..cc06a1e52423 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -12,7 +12,6 @@ | |||
12 | #include <linux/dm-io.h> | 12 | #include <linux/dm-io.h> |
13 | #include <linux/slab.h> | 13 | #include <linux/slab.h> |
14 | #include <linux/vmalloc.h> | 14 | #include <linux/vmalloc.h> |
15 | #include <linux/version.h> | ||
16 | #include <linux/shrinker.h> | 15 | #include <linux/shrinker.h> |
17 | #include <linux/module.h> | 16 | #include <linux/module.h> |
18 | 17 | ||
@@ -579,7 +578,7 @@ static void write_endio(struct bio *bio, int error) | |||
579 | struct dm_buffer *b = container_of(bio, struct dm_buffer, bio); | 578 | struct dm_buffer *b = container_of(bio, struct dm_buffer, bio); |
580 | 579 | ||
581 | b->write_error = error; | 580 | b->write_error = error; |
582 | if (error) { | 581 | if (unlikely(error)) { |
583 | struct dm_bufio_client *c = b->c; | 582 | struct dm_bufio_client *c = b->c; |
584 | (void)cmpxchg(&c->async_write_error, 0, error); | 583 | (void)cmpxchg(&c->async_write_error, 0, error); |
585 | } | 584 | } |
@@ -698,13 +697,20 @@ static void __wait_for_free_buffer(struct dm_bufio_client *c) | |||
698 | dm_bufio_lock(c); | 697 | dm_bufio_lock(c); |
699 | } | 698 | } |
700 | 699 | ||
700 | enum new_flag { | ||
701 | NF_FRESH = 0, | ||
702 | NF_READ = 1, | ||
703 | NF_GET = 2, | ||
704 | NF_PREFETCH = 3 | ||
705 | }; | ||
706 | |||
701 | /* | 707 | /* |
702 | * Allocate a new buffer. If the allocation is not possible, wait until | 708 | * Allocate a new buffer. If the allocation is not possible, wait until |
703 | * some other thread frees a buffer. | 709 | * some other thread frees a buffer. |
704 | * | 710 | * |
705 | * May drop the lock and regain it. | 711 | * May drop the lock and regain it. |
706 | */ | 712 | */ |
707 | static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c) | 713 | static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf) |
708 | { | 714 | { |
709 | struct dm_buffer *b; | 715 | struct dm_buffer *b; |
710 | 716 | ||
@@ -727,6 +733,9 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client | |||
727 | return b; | 733 | return b; |
728 | } | 734 | } |
729 | 735 | ||
736 | if (nf == NF_PREFETCH) | ||
737 | return NULL; | ||
738 | |||
730 | if (!list_empty(&c->reserved_buffers)) { | 739 | if (!list_empty(&c->reserved_buffers)) { |
731 | b = list_entry(c->reserved_buffers.next, | 740 | b = list_entry(c->reserved_buffers.next, |
732 | struct dm_buffer, lru_list); | 741 | struct dm_buffer, lru_list); |
@@ -744,9 +753,12 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client | |||
744 | } | 753 | } |
745 | } | 754 | } |
746 | 755 | ||
747 | static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c) | 756 | static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf) |
748 | { | 757 | { |
749 | struct dm_buffer *b = __alloc_buffer_wait_no_callback(c); | 758 | struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf); |
759 | |||
760 | if (!b) | ||
761 | return NULL; | ||
750 | 762 | ||
751 | if (c->alloc_callback) | 763 | if (c->alloc_callback) |
752 | c->alloc_callback(b); | 764 | c->alloc_callback(b); |
@@ -866,32 +878,23 @@ static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block) | |||
866 | * Getting a buffer | 878 | * Getting a buffer |
867 | *--------------------------------------------------------------*/ | 879 | *--------------------------------------------------------------*/ |
868 | 880 | ||
869 | enum new_flag { | ||
870 | NF_FRESH = 0, | ||
871 | NF_READ = 1, | ||
872 | NF_GET = 2 | ||
873 | }; | ||
874 | |||
875 | static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block, | 881 | static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block, |
876 | enum new_flag nf, struct dm_buffer **bp, | 882 | enum new_flag nf, int *need_submit) |
877 | int *need_submit) | ||
878 | { | 883 | { |
879 | struct dm_buffer *b, *new_b = NULL; | 884 | struct dm_buffer *b, *new_b = NULL; |
880 | 885 | ||
881 | *need_submit = 0; | 886 | *need_submit = 0; |
882 | 887 | ||
883 | b = __find(c, block); | 888 | b = __find(c, block); |
884 | if (b) { | 889 | if (b) |
885 | b->hold_count++; | 890 | goto found_buffer; |
886 | __relink_lru(b, test_bit(B_DIRTY, &b->state) || | ||
887 | test_bit(B_WRITING, &b->state)); | ||
888 | return b; | ||
889 | } | ||
890 | 891 | ||
891 | if (nf == NF_GET) | 892 | if (nf == NF_GET) |
892 | return NULL; | 893 | return NULL; |
893 | 894 | ||
894 | new_b = __alloc_buffer_wait(c); | 895 | new_b = __alloc_buffer_wait(c, nf); |
896 | if (!new_b) | ||
897 | return NULL; | ||
895 | 898 | ||
896 | /* | 899 | /* |
897 | * We've had a period where the mutex was unlocked, so need to | 900 | * We've had a period where the mutex was unlocked, so need to |
@@ -900,10 +903,7 @@ static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block, | |||
900 | b = __find(c, block); | 903 | b = __find(c, block); |
901 | if (b) { | 904 | if (b) { |
902 | __free_buffer_wake(new_b); | 905 | __free_buffer_wake(new_b); |
903 | b->hold_count++; | 906 | goto found_buffer; |
904 | __relink_lru(b, test_bit(B_DIRTY, &b->state) || | ||
905 | test_bit(B_WRITING, &b->state)); | ||
906 | return b; | ||
907 | } | 907 | } |
908 | 908 | ||
909 | __check_watermark(c); | 909 | __check_watermark(c); |
@@ -923,6 +923,24 @@ static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block, | |||
923 | *need_submit = 1; | 923 | *need_submit = 1; |
924 | 924 | ||
925 | return b; | 925 | return b; |
926 | |||
927 | found_buffer: | ||
928 | if (nf == NF_PREFETCH) | ||
929 | return NULL; | ||
930 | /* | ||
931 | * Note: it is essential that we don't wait for the buffer to be | ||
932 | * read if dm_bufio_get function is used. Both dm_bufio_get and | ||
933 | * dm_bufio_prefetch can be used in the driver request routine. | ||
934 | * If the user called both dm_bufio_prefetch and dm_bufio_get on | ||
935 | * the same buffer, it would deadlock if we waited. | ||
936 | */ | ||
937 | if (nf == NF_GET && unlikely(test_bit(B_READING, &b->state))) | ||
938 | return NULL; | ||
939 | |||
940 | b->hold_count++; | ||
941 | __relink_lru(b, test_bit(B_DIRTY, &b->state) || | ||
942 | test_bit(B_WRITING, &b->state)); | ||
943 | return b; | ||
926 | } | 944 | } |
927 | 945 | ||
928 | /* | 946 | /* |
@@ -957,10 +975,10 @@ static void *new_read(struct dm_bufio_client *c, sector_t block, | |||
957 | struct dm_buffer *b; | 975 | struct dm_buffer *b; |
958 | 976 | ||
959 | dm_bufio_lock(c); | 977 | dm_bufio_lock(c); |
960 | b = __bufio_new(c, block, nf, bp, &need_submit); | 978 | b = __bufio_new(c, block, nf, &need_submit); |
961 | dm_bufio_unlock(c); | 979 | dm_bufio_unlock(c); |
962 | 980 | ||
963 | if (!b || IS_ERR(b)) | 981 | if (!b) |
964 | return b; | 982 | return b; |
965 | 983 | ||
966 | if (need_submit) | 984 | if (need_submit) |
@@ -1006,13 +1024,47 @@ void *dm_bufio_new(struct dm_bufio_client *c, sector_t block, | |||
1006 | } | 1024 | } |
1007 | EXPORT_SYMBOL_GPL(dm_bufio_new); | 1025 | EXPORT_SYMBOL_GPL(dm_bufio_new); |
1008 | 1026 | ||
1027 | void dm_bufio_prefetch(struct dm_bufio_client *c, | ||
1028 | sector_t block, unsigned n_blocks) | ||
1029 | { | ||
1030 | struct blk_plug plug; | ||
1031 | |||
1032 | blk_start_plug(&plug); | ||
1033 | dm_bufio_lock(c); | ||
1034 | |||
1035 | for (; n_blocks--; block++) { | ||
1036 | int need_submit; | ||
1037 | struct dm_buffer *b; | ||
1038 | b = __bufio_new(c, block, NF_PREFETCH, &need_submit); | ||
1039 | if (unlikely(b != NULL)) { | ||
1040 | dm_bufio_unlock(c); | ||
1041 | |||
1042 | if (need_submit) | ||
1043 | submit_io(b, READ, b->block, read_endio); | ||
1044 | dm_bufio_release(b); | ||
1045 | |||
1046 | dm_bufio_cond_resched(); | ||
1047 | |||
1048 | if (!n_blocks) | ||
1049 | goto flush_plug; | ||
1050 | dm_bufio_lock(c); | ||
1051 | } | ||
1052 | |||
1053 | } | ||
1054 | |||
1055 | dm_bufio_unlock(c); | ||
1056 | |||
1057 | flush_plug: | ||
1058 | blk_finish_plug(&plug); | ||
1059 | } | ||
1060 | EXPORT_SYMBOL_GPL(dm_bufio_prefetch); | ||
1061 | |||
1009 | void dm_bufio_release(struct dm_buffer *b) | 1062 | void dm_bufio_release(struct dm_buffer *b) |
1010 | { | 1063 | { |
1011 | struct dm_bufio_client *c = b->c; | 1064 | struct dm_bufio_client *c = b->c; |
1012 | 1065 | ||
1013 | dm_bufio_lock(c); | 1066 | dm_bufio_lock(c); |
1014 | 1067 | ||
1015 | BUG_ON(test_bit(B_READING, &b->state)); | ||
1016 | BUG_ON(!b->hold_count); | 1068 | BUG_ON(!b->hold_count); |
1017 | 1069 | ||
1018 | b->hold_count--; | 1070 | b->hold_count--; |
@@ -1025,6 +1077,7 @@ void dm_bufio_release(struct dm_buffer *b) | |||
1025 | * invalid buffer. | 1077 | * invalid buffer. |
1026 | */ | 1078 | */ |
1027 | if ((b->read_error || b->write_error) && | 1079 | if ((b->read_error || b->write_error) && |
1080 | !test_bit(B_READING, &b->state) && | ||
1028 | !test_bit(B_WRITING, &b->state) && | 1081 | !test_bit(B_WRITING, &b->state) && |
1029 | !test_bit(B_DIRTY, &b->state)) { | 1082 | !test_bit(B_DIRTY, &b->state)) { |
1030 | __unlink_buffer(b); | 1083 | __unlink_buffer(b); |
@@ -1042,6 +1095,8 @@ void dm_bufio_mark_buffer_dirty(struct dm_buffer *b) | |||
1042 | 1095 | ||
1043 | dm_bufio_lock(c); | 1096 | dm_bufio_lock(c); |
1044 | 1097 | ||
1098 | BUG_ON(test_bit(B_READING, &b->state)); | ||
1099 | |||
1045 | if (!test_and_set_bit(B_DIRTY, &b->state)) | 1100 | if (!test_and_set_bit(B_DIRTY, &b->state)) |
1046 | __relink_lru(b, LIST_DIRTY); | 1101 | __relink_lru(b, LIST_DIRTY); |
1047 | 1102 | ||
diff --git a/drivers/md/dm-bufio.h b/drivers/md/dm-bufio.h
index 5c4c3a04e381..b142946a9e32 100644
--- a/drivers/md/dm-bufio.h
+++ b/drivers/md/dm-bufio.h
@@ -63,6 +63,14 @@ void *dm_bufio_new(struct dm_bufio_client *c, sector_t block, | |||
63 | struct dm_buffer **bp); | 63 | struct dm_buffer **bp); |
64 | 64 | ||
65 | /* | 65 | /* |
66 | * Prefetch the specified blocks to the cache. | ||
67 | * The function starts to read the blocks and returns without waiting for | ||
68 | * I/O to finish. | ||
69 | */ | ||
70 | void dm_bufio_prefetch(struct dm_bufio_client *c, | ||
71 | sector_t block, unsigned n_blocks); | ||
72 | |||
73 | /* | ||
66 | * Release a reference obtained with dm_bufio_{read,get,new}. The data | 74 | * Release a reference obtained with dm_bufio_{read,get,new}. The data |
67 | * pointer and dm_buffer pointer is no longer valid after this call. | 75 | * pointer and dm_buffer pointer is no longer valid after this call. |
68 | */ | 76 | */ |
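The new header comment is the whole contract: dm_bufio_prefetch() starts the reads and returns without waiting for the I/O. A typical caller (hypothetical; the client, block range and helper name below are invented for the sketch) prefetches a run of blocks and then reads them one at a time, so the blocking dm_bufio_read() calls mostly find the data already cached:

```c
/* Hypothetical caller of the new dm-bufio prefetch API (illustration only). */
#include <linux/err.h>
#include <linux/types.h>
#include "dm-bufio.h"

static int sum_first_bytes(struct dm_bufio_client *my_client,
			   sector_t first, unsigned count)
{
	unsigned i;
	int sum = 0;

	/* Kick off reads for the whole range without blocking. */
	dm_bufio_prefetch(my_client, first, count);

	for (i = 0; i < count; i++) {
		struct dm_buffer *b;
		u8 *data = dm_bufio_read(my_client, first + i, &b);

		if (IS_ERR(data))
			return PTR_ERR(data);
		sum += data[0];		/* likely served from the cache by now */
		dm_bufio_release(b);
	}
	return sum;
}
```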
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 8c2a000cf3f5..3f06df59fd82 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -176,7 +176,6 @@ struct crypt_config { | |||
176 | 176 | ||
177 | #define MIN_IOS 16 | 177 | #define MIN_IOS 16 |
178 | #define MIN_POOL_PAGES 32 | 178 | #define MIN_POOL_PAGES 32 |
179 | #define MIN_BIO_PAGES 8 | ||
180 | 179 | ||
181 | static struct kmem_cache *_crypt_io_pool; | 180 | static struct kmem_cache *_crypt_io_pool; |
182 | 181 | ||
@@ -590,9 +589,9 @@ static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv, | |||
590 | int r = 0; | 589 | int r = 0; |
591 | 590 | ||
592 | if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) { | 591 | if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) { |
593 | src = kmap_atomic(sg_page(&dmreq->sg_in), KM_USER0); | 592 | src = kmap_atomic(sg_page(&dmreq->sg_in)); |
594 | r = crypt_iv_lmk_one(cc, iv, dmreq, src + dmreq->sg_in.offset); | 593 | r = crypt_iv_lmk_one(cc, iv, dmreq, src + dmreq->sg_in.offset); |
595 | kunmap_atomic(src, KM_USER0); | 594 | kunmap_atomic(src); |
596 | } else | 595 | } else |
597 | memset(iv, 0, cc->iv_size); | 596 | memset(iv, 0, cc->iv_size); |
598 | 597 | ||
@@ -608,14 +607,14 @@ static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv, | |||
608 | if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) | 607 | if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) |
609 | return 0; | 608 | return 0; |
610 | 609 | ||
611 | dst = kmap_atomic(sg_page(&dmreq->sg_out), KM_USER0); | 610 | dst = kmap_atomic(sg_page(&dmreq->sg_out)); |
612 | r = crypt_iv_lmk_one(cc, iv, dmreq, dst + dmreq->sg_out.offset); | 611 | r = crypt_iv_lmk_one(cc, iv, dmreq, dst + dmreq->sg_out.offset); |
613 | 612 | ||
614 | /* Tweak the first block of plaintext sector */ | 613 | /* Tweak the first block of plaintext sector */ |
615 | if (!r) | 614 | if (!r) |
616 | crypto_xor(dst + dmreq->sg_out.offset, iv, cc->iv_size); | 615 | crypto_xor(dst + dmreq->sg_out.offset, iv, cc->iv_size); |
617 | 616 | ||
618 | kunmap_atomic(dst, KM_USER0); | 617 | kunmap_atomic(dst); |
619 | return r; | 618 | return r; |
620 | } | 619 | } |
621 | 620 | ||
@@ -848,12 +847,11 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size, | |||
848 | } | 847 | } |
849 | 848 | ||
850 | /* | 849 | /* |
851 | * if additional pages cannot be allocated without waiting, | 850 | * If additional pages cannot be allocated without waiting, |
852 | * return a partially allocated bio, the caller will then try | 851 | * return a partially-allocated bio. The caller will then try |
853 | * to allocate additional bios while submitting this partial bio | 852 | * to allocate more bios while submitting this partial bio. |
854 | */ | 853 | */ |
855 | if (i == (MIN_BIO_PAGES - 1)) | 854 | gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT; |
856 | gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT; | ||
857 | 855 | ||
858 | len = (size > PAGE_SIZE) ? PAGE_SIZE : size; | 856 | len = (size > PAGE_SIZE) ? PAGE_SIZE : size; |
859 | 857 | ||
@@ -1046,16 +1044,14 @@ static void kcryptd_queue_io(struct dm_crypt_io *io) | |||
1046 | queue_work(cc->io_queue, &io->work); | 1044 | queue_work(cc->io_queue, &io->work); |
1047 | } | 1045 | } |
1048 | 1046 | ||
1049 | static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, | 1047 | static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async) |
1050 | int error, int async) | ||
1051 | { | 1048 | { |
1052 | struct bio *clone = io->ctx.bio_out; | 1049 | struct bio *clone = io->ctx.bio_out; |
1053 | struct crypt_config *cc = io->target->private; | 1050 | struct crypt_config *cc = io->target->private; |
1054 | 1051 | ||
1055 | if (unlikely(error < 0)) { | 1052 | if (unlikely(io->error < 0)) { |
1056 | crypt_free_buffer_pages(cc, clone); | 1053 | crypt_free_buffer_pages(cc, clone); |
1057 | bio_put(clone); | 1054 | bio_put(clone); |
1058 | io->error = -EIO; | ||
1059 | crypt_dec_pending(io); | 1055 | crypt_dec_pending(io); |
1060 | return; | 1056 | return; |
1061 | } | 1057 | } |
@@ -1106,12 +1102,16 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) | |||
1106 | sector += bio_sectors(clone); | 1102 | sector += bio_sectors(clone); |
1107 | 1103 | ||
1108 | crypt_inc_pending(io); | 1104 | crypt_inc_pending(io); |
1105 | |||
1109 | r = crypt_convert(cc, &io->ctx); | 1106 | r = crypt_convert(cc, &io->ctx); |
1107 | if (r < 0) | ||
1108 | io->error = -EIO; | ||
1109 | |||
1110 | crypt_finished = atomic_dec_and_test(&io->ctx.pending); | 1110 | crypt_finished = atomic_dec_and_test(&io->ctx.pending); |
1111 | 1111 | ||
1112 | /* Encryption was already finished, submit io now */ | 1112 | /* Encryption was already finished, submit io now */ |
1113 | if (crypt_finished) { | 1113 | if (crypt_finished) { |
1114 | kcryptd_crypt_write_io_submit(io, r, 0); | 1114 | kcryptd_crypt_write_io_submit(io, 0); |
1115 | 1115 | ||
1116 | /* | 1116 | /* |
1117 | * If there was an error, do not try next fragments. | 1117 | * If there was an error, do not try next fragments. |
@@ -1162,11 +1162,8 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io) | |||
1162 | crypt_dec_pending(io); | 1162 | crypt_dec_pending(io); |
1163 | } | 1163 | } |
1164 | 1164 | ||
1165 | static void kcryptd_crypt_read_done(struct dm_crypt_io *io, int error) | 1165 | static void kcryptd_crypt_read_done(struct dm_crypt_io *io) |
1166 | { | 1166 | { |
1167 | if (unlikely(error < 0)) | ||
1168 | io->error = -EIO; | ||
1169 | |||
1170 | crypt_dec_pending(io); | 1167 | crypt_dec_pending(io); |
1171 | } | 1168 | } |
1172 | 1169 | ||
@@ -1181,9 +1178,11 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io) | |||
1181 | io->sector); | 1178 | io->sector); |
1182 | 1179 | ||
1183 | r = crypt_convert(cc, &io->ctx); | 1180 | r = crypt_convert(cc, &io->ctx); |
1181 | if (r < 0) | ||
1182 | io->error = -EIO; | ||
1184 | 1183 | ||
1185 | if (atomic_dec_and_test(&io->ctx.pending)) | 1184 | if (atomic_dec_and_test(&io->ctx.pending)) |
1186 | kcryptd_crypt_read_done(io, r); | 1185 | kcryptd_crypt_read_done(io); |
1187 | 1186 | ||
1188 | crypt_dec_pending(io); | 1187 | crypt_dec_pending(io); |
1189 | } | 1188 | } |
@@ -1204,15 +1203,18 @@ static void kcryptd_async_done(struct crypto_async_request *async_req, | |||
1204 | if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post) | 1203 | if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post) |
1205 | error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq); | 1204 | error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq); |
1206 | 1205 | ||
1206 | if (error < 0) | ||
1207 | io->error = -EIO; | ||
1208 | |||
1207 | mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool); | 1209 | mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool); |
1208 | 1210 | ||
1209 | if (!atomic_dec_and_test(&ctx->pending)) | 1211 | if (!atomic_dec_and_test(&ctx->pending)) |
1210 | return; | 1212 | return; |
1211 | 1213 | ||
1212 | if (bio_data_dir(io->base_bio) == READ) | 1214 | if (bio_data_dir(io->base_bio) == READ) |
1213 | kcryptd_crypt_read_done(io, error); | 1215 | kcryptd_crypt_read_done(io); |
1214 | else | 1216 | else |
1215 | kcryptd_crypt_write_io_submit(io, error, 1); | 1217 | kcryptd_crypt_write_io_submit(io, 1); |
1216 | } | 1218 | } |
1217 | 1219 | ||
1218 | static void kcryptd_crypt(struct work_struct *work) | 1220 | static void kcryptd_crypt(struct work_struct *work) |
@@ -1413,6 +1415,7 @@ static int crypt_ctr_cipher(struct dm_target *ti, | |||
1413 | char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount; | 1415 | char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount; |
1414 | char *cipher_api = NULL; | 1416 | char *cipher_api = NULL; |
1415 | int cpu, ret = -EINVAL; | 1417 | int cpu, ret = -EINVAL; |
1418 | char dummy; | ||
1416 | 1419 | ||
1417 | /* Convert to crypto api definition? */ | 1420 | /* Convert to crypto api definition? */ |
1418 | if (strchr(cipher_in, '(')) { | 1421 | if (strchr(cipher_in, '(')) { |
@@ -1434,7 +1437,7 @@ static int crypt_ctr_cipher(struct dm_target *ti, | |||
1434 | 1437 | ||
1435 | if (!keycount) | 1438 | if (!keycount) |
1436 | cc->tfms_count = 1; | 1439 | cc->tfms_count = 1; |
1437 | else if (sscanf(keycount, "%u", &cc->tfms_count) != 1 || | 1440 | else if (sscanf(keycount, "%u%c", &cc->tfms_count, &dummy) != 1 || |
1438 | !is_power_of_2(cc->tfms_count)) { | 1441 | !is_power_of_2(cc->tfms_count)) { |
1439 | ti->error = "Bad cipher key count specification"; | 1442 | ti->error = "Bad cipher key count specification"; |
1440 | return -EINVAL; | 1443 | return -EINVAL; |
@@ -1579,6 +1582,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1579 | int ret; | 1582 | int ret; |
1580 | struct dm_arg_set as; | 1583 | struct dm_arg_set as; |
1581 | const char *opt_string; | 1584 | const char *opt_string; |
1585 | char dummy; | ||
1582 | 1586 | ||
1583 | static struct dm_arg _args[] = { | 1587 | static struct dm_arg _args[] = { |
1584 | {0, 1, "Invalid number of feature args"}, | 1588 | {0, 1, "Invalid number of feature args"}, |
@@ -1636,7 +1640,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1636 | } | 1640 | } |
1637 | 1641 | ||
1638 | ret = -EINVAL; | 1642 | ret = -EINVAL; |
1639 | if (sscanf(argv[2], "%llu", &tmpll) != 1) { | 1643 | if (sscanf(argv[2], "%llu%c", &tmpll, &dummy) != 1) { |
1640 | ti->error = "Invalid iv_offset sector"; | 1644 | ti->error = "Invalid iv_offset sector"; |
1641 | goto bad; | 1645 | goto bad; |
1642 | } | 1646 | } |
@@ -1647,7 +1651,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1647 | goto bad; | 1651 | goto bad; |
1648 | } | 1652 | } |
1649 | 1653 | ||
1650 | if (sscanf(argv[4], "%llu", &tmpll) != 1) { | 1654 | if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1) { |
1651 | ti->error = "Invalid device sector"; | 1655 | ti->error = "Invalid device sector"; |
1652 | goto bad; | 1656 | goto bad; |
1653 | } | 1657 | } |
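The dm-crypt rework above stops threading an error code through every completion helper; conversion and async-completion failures are recorded once in io->error, and the final submit/complete step only acts on it when atomic_dec_and_test() drops the last pending reference. A single-threaded userspace analogue of that pattern (names invented for the sketch):

```c
/*
 * Userspace analogue of the "record the error, finish on the last
 * reference" pattern dm-crypt moves to in this merge.  Single-threaded
 * demo; the kernel code uses its own atomics and locking.
 */
#include <stdatomic.h>
#include <stdio.h>

struct fake_io {
	atomic_int pending;	/* outstanding sub-operations */
	int error;		/* first error seen, 0 if none */
};

static void fake_io_complete(struct fake_io *io)
{
	printf("io finished, error=%d\n", io->error);
}

static void sub_op_done(struct fake_io *io, int err)
{
	if (err < 0 && !io->error)
		io->error = err;	/* remember it, don't act yet */

	/* Equivalent of atomic_dec_and_test(): act only on the last drop. */
	if (atomic_fetch_sub(&io->pending, 1) == 1)
		fake_io_complete(io);
}

int main(void)
{
	struct fake_io io = { .pending = 3, .error = 0 };

	sub_op_done(&io, 0);
	sub_op_done(&io, -5);	/* EIO-style failure recorded, not acted on */
	sub_op_done(&io, 0);	/* last drop triggers completion */
	return 0;
}
```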
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index f18375dcedd9..2dc22dddb2ae 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -131,6 +131,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
131 | { | 131 | { |
132 | struct delay_c *dc; | 132 | struct delay_c *dc; |
133 | unsigned long long tmpll; | 133 | unsigned long long tmpll; |
134 | char dummy; | ||
134 | 135 | ||
135 | if (argc != 3 && argc != 6) { | 136 | if (argc != 3 && argc != 6) { |
136 | ti->error = "requires exactly 3 or 6 arguments"; | 137 | ti->error = "requires exactly 3 or 6 arguments"; |
@@ -145,13 +146,13 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
145 | 146 | ||
146 | dc->reads = dc->writes = 0; | 147 | dc->reads = dc->writes = 0; |
147 | 148 | ||
148 | if (sscanf(argv[1], "%llu", &tmpll) != 1) { | 149 | if (sscanf(argv[1], "%llu%c", &tmpll, &dummy) != 1) { |
149 | ti->error = "Invalid device sector"; | 150 | ti->error = "Invalid device sector"; |
150 | goto bad; | 151 | goto bad; |
151 | } | 152 | } |
152 | dc->start_read = tmpll; | 153 | dc->start_read = tmpll; |
153 | 154 | ||
154 | if (sscanf(argv[2], "%u", &dc->read_delay) != 1) { | 155 | if (sscanf(argv[2], "%u%c", &dc->read_delay, &dummy) != 1) { |
155 | ti->error = "Invalid delay"; | 156 | ti->error = "Invalid delay"; |
156 | goto bad; | 157 | goto bad; |
157 | } | 158 | } |
@@ -166,13 +167,13 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
166 | if (argc == 3) | 167 | if (argc == 3) |
167 | goto out; | 168 | goto out; |
168 | 169 | ||
169 | if (sscanf(argv[4], "%llu", &tmpll) != 1) { | 170 | if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1) { |
170 | ti->error = "Invalid write device sector"; | 171 | ti->error = "Invalid write device sector"; |
171 | goto bad_dev_read; | 172 | goto bad_dev_read; |
172 | } | 173 | } |
173 | dc->start_write = tmpll; | 174 | dc->start_write = tmpll; |
174 | 175 | ||
175 | if (sscanf(argv[5], "%u", &dc->write_delay) != 1) { | 176 | if (sscanf(argv[5], "%u%c", &dc->write_delay, &dummy) != 1) { |
176 | ti->error = "Invalid write delay"; | 177 | ti->error = "Invalid write delay"; |
177 | goto bad_dev_read; | 178 | goto bad_dev_read; |
178 | } | 179 | } |
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c index 042e71996569..aa70f7d43a1a 100644 --- a/drivers/md/dm-exception-store.c +++ b/drivers/md/dm-exception-store.c | |||
@@ -283,7 +283,7 @@ int dm_exception_store_init(void) | |||
283 | return 0; | 283 | return 0; |
284 | 284 | ||
285 | persistent_fail: | 285 | persistent_fail: |
286 | dm_persistent_snapshot_exit(); | 286 | dm_transient_snapshot_exit(); |
287 | transient_fail: | 287 | transient_fail: |
288 | return r; | 288 | return r; |
289 | } | 289 | } |
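
Note: the dm-exception-store one-liner is the classic init-path unwind mistake. When step N of an init sequence fails, the error label must tear down steps 1..N-1, not step N itself; here the persistent_fail label was undoing the persistent store (which never registered) instead of the transient store (which had). A generic sketch of the pattern; the register/unregister functions are placeholders, not the dm-snapshot API:

#include <stdio.h>

/* Placeholder init/exit pairs standing in for the two store types. */
static int  register_a(void) { return 0; }
static void unregister_a(void) { puts("undo a"); }
static int  register_b(void) { return -1; /* simulate failure */ }

static int store_init(void)
{
	int r;

	r = register_a();
	if (r)
		goto a_fail;

	r = register_b();
	if (r)
		goto b_fail;

	return 0;

b_fail:
	/* b failed, so only a needs undoing; the bug was calling the
	 * cleanup routine for the step that had just failed. */
	unregister_a();
a_fail:
	return r;
}

int main(void)
{
	printf("store_init() = %d\n", store_init());
	return 0;
}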
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c index 9fb18c147825..ac49c01f1a44 100644 --- a/drivers/md/dm-flakey.c +++ b/drivers/md/dm-flakey.c | |||
@@ -160,6 +160,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
160 | unsigned long long tmpll; | 160 | unsigned long long tmpll; |
161 | struct dm_arg_set as; | 161 | struct dm_arg_set as; |
162 | const char *devname; | 162 | const char *devname; |
163 | char dummy; | ||
163 | 164 | ||
164 | as.argc = argc; | 165 | as.argc = argc; |
165 | as.argv = argv; | 166 | as.argv = argv; |
@@ -178,7 +179,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
178 | 179 | ||
179 | devname = dm_shift_arg(&as); | 180 | devname = dm_shift_arg(&as); |
180 | 181 | ||
181 | if (sscanf(dm_shift_arg(&as), "%llu", &tmpll) != 1) { | 182 | if (sscanf(dm_shift_arg(&as), "%llu%c", &tmpll, &dummy) != 1) { |
182 | ti->error = "Invalid device sector"; | 183 | ti->error = "Invalid device sector"; |
183 | goto bad; | 184 | goto bad; |
184 | } | 185 | } |
@@ -323,7 +324,7 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio, | |||
323 | * Corrupt successful READs while in down state. | 324 | * Corrupt successful READs while in down state. |
324 | * If flags were specified, only corrupt those that match. | 325 | * If flags were specified, only corrupt those that match. |
325 | */ | 326 | */ |
326 | if (!error && bio_submitted_while_down && | 327 | if (fc->corrupt_bio_byte && !error && bio_submitted_while_down && |
327 | (bio_data_dir(bio) == READ) && (fc->corrupt_bio_rw == READ) && | 328 | (bio_data_dir(bio) == READ) && (fc->corrupt_bio_rw == READ) && |
328 | all_corrupt_bio_flags_match(bio, fc)) | 329 | all_corrupt_bio_flags_match(bio, fc)) |
329 | corrupt_bio_data(bio, fc); | 330 | corrupt_bio_data(bio, fc); |
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c index ad2eba40e319..ea5dd289fe2a 100644 --- a/drivers/md/dm-io.c +++ b/drivers/md/dm-io.c | |||
@@ -296,6 +296,8 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, | |||
296 | unsigned offset; | 296 | unsigned offset; |
297 | unsigned num_bvecs; | 297 | unsigned num_bvecs; |
298 | sector_t remaining = where->count; | 298 | sector_t remaining = where->count; |
299 | struct request_queue *q = bdev_get_queue(where->bdev); | ||
300 | sector_t discard_sectors; | ||
299 | 301 | ||
300 | /* | 302 | /* |
301 | * where->count may be zero if rw holds a flush and we need to | 303 | * where->count may be zero if rw holds a flush and we need to |
@@ -305,9 +307,12 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, | |||
305 | /* | 307 | /* |
306 | * Allocate a suitably sized-bio. | 308 | * Allocate a suitably sized-bio. |
307 | */ | 309 | */ |
308 | num_bvecs = dm_sector_div_up(remaining, | 310 | if (rw & REQ_DISCARD) |
309 | (PAGE_SIZE >> SECTOR_SHIFT)); | 311 | num_bvecs = 1; |
310 | num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev), num_bvecs); | 312 | else |
313 | num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev), | ||
314 | dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT))); | ||
315 | |||
311 | bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios); | 316 | bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios); |
312 | bio->bi_sector = where->sector + (where->count - remaining); | 317 | bio->bi_sector = where->sector + (where->count - remaining); |
313 | bio->bi_bdev = where->bdev; | 318 | bio->bi_bdev = where->bdev; |
@@ -315,10 +320,14 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where, | |||
315 | bio->bi_destructor = dm_bio_destructor; | 320 | bio->bi_destructor = dm_bio_destructor; |
316 | store_io_and_region_in_bio(bio, io, region); | 321 | store_io_and_region_in_bio(bio, io, region); |
317 | 322 | ||
318 | /* | 323 | if (rw & REQ_DISCARD) { |
319 | * Try and add as many pages as possible. | 324 | discard_sectors = min_t(sector_t, q->limits.max_discard_sectors, remaining); |
320 | */ | 325 | bio->bi_size = discard_sectors << SECTOR_SHIFT; |
321 | while (remaining) { | 326 | remaining -= discard_sectors; |
327 | } else while (remaining) { | ||
328 | /* | ||
329 | * Try and add as many pages as possible. | ||
330 | */ | ||
322 | dp->get_page(dp, &page, &len, &offset); | 331 | dp->get_page(dp, &page, &len, &offset); |
323 | len = min(len, to_bytes(remaining)); | 332 | len = min(len, to_bytes(remaining)); |
324 | if (!bio_add_page(bio, page, len, offset)) | 333 | if (!bio_add_page(bio, page, len, offset)) |
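
Note: the dm-io hunk special-cases REQ_DISCARD because a discard bio carries no data pages: it needs only a single bio_vec, and its size is simply the byte count of the range being discarded, clamped to the queue's max_discard_sectors; anything left over is handled by the next bio from the surrounding loop. A rough user-space illustration of the sizing arithmetic (SECTOR_SHIFT follows the kernel convention, but this is not the driver code itself):

#include <stdio.h>
#include <stdint.h>

#define SECTOR_SHIFT 9	/* 512-byte sectors */

/* How many bytes the next discard bio should cover, and how many
 * sectors remain afterwards, mirroring the clamp in do_region(). */
static uint64_t next_discard(uint64_t remaining, uint64_t max_discard_sectors,
			     uint64_t *bio_bytes)
{
	uint64_t sectors = remaining < max_discard_sectors ?
			   remaining : max_discard_sectors;

	*bio_bytes = sectors << SECTOR_SHIFT;
	return remaining - sectors;
}

int main(void)
{
	uint64_t remaining = 10000, bytes;

	while (remaining) {
		remaining = next_discard(remaining, 4096, &bytes);
		printf("bio of %llu bytes, %llu sectors left\n",
		       (unsigned long long)bytes, (unsigned long long)remaining);
	}
	return 0;
}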
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c index 31c2dc25886d..a1a3e6df17b8 100644 --- a/drivers/md/dm-ioctl.c +++ b/drivers/md/dm-ioctl.c | |||
@@ -880,6 +880,7 @@ static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) | |||
880 | struct hd_geometry geometry; | 880 | struct hd_geometry geometry; |
881 | unsigned long indata[4]; | 881 | unsigned long indata[4]; |
882 | char *geostr = (char *) param + param->data_start; | 882 | char *geostr = (char *) param + param->data_start; |
883 | char dummy; | ||
883 | 884 | ||
884 | md = find_device(param); | 885 | md = find_device(param); |
885 | if (!md) | 886 | if (!md) |
@@ -891,8 +892,8 @@ static int dev_set_geometry(struct dm_ioctl *param, size_t param_size) | |||
891 | goto out; | 892 | goto out; |
892 | } | 893 | } |
893 | 894 | ||
894 | x = sscanf(geostr, "%lu %lu %lu %lu", indata, | 895 | x = sscanf(geostr, "%lu %lu %lu %lu%c", indata, |
895 | indata + 1, indata + 2, indata + 3); | 896 | indata + 1, indata + 2, indata + 3, &dummy); |
896 | 897 | ||
897 | if (x != 4) { | 898 | if (x != 4) { |
898 | DMWARN("Unable to interpret geometry settings."); | 899 | DMWARN("Unable to interpret geometry settings."); |
@@ -1437,7 +1438,7 @@ static int target_message(struct dm_ioctl *param, size_t param_size) | |||
1437 | 1438 | ||
1438 | if (!argc) { | 1439 | if (!argc) { |
1439 | DMWARN("Empty message received."); | 1440 | DMWARN("Empty message received."); |
1440 | goto out; | 1441 | goto out_argv; |
1441 | } | 1442 | } |
1442 | 1443 | ||
1443 | table = dm_get_live_table(md); | 1444 | table = dm_get_live_table(md); |
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c index 9728839f844a..3639eeab6042 100644 --- a/drivers/md/dm-linear.c +++ b/drivers/md/dm-linear.c | |||
@@ -29,6 +29,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
29 | { | 29 | { |
30 | struct linear_c *lc; | 30 | struct linear_c *lc; |
31 | unsigned long long tmp; | 31 | unsigned long long tmp; |
32 | char dummy; | ||
32 | 33 | ||
33 | if (argc != 2) { | 34 | if (argc != 2) { |
34 | ti->error = "Invalid argument count"; | 35 | ti->error = "Invalid argument count"; |
@@ -41,7 +42,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
41 | return -ENOMEM; | 42 | return -ENOMEM; |
42 | } | 43 | } |
43 | 44 | ||
44 | if (sscanf(argv[1], "%llu", &tmp) != 1) { | 45 | if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1) { |
45 | ti->error = "dm-linear: Invalid device sector"; | 46 | ti->error = "dm-linear: Invalid device sector"; |
46 | goto bad; | 47 | goto bad; |
47 | } | 48 | } |
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index 3b52bb72bd1f..65ebaebf502b 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c | |||
@@ -369,6 +369,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, | |||
369 | unsigned int region_count; | 369 | unsigned int region_count; |
370 | size_t bitset_size, buf_size; | 370 | size_t bitset_size, buf_size; |
371 | int r; | 371 | int r; |
372 | char dummy; | ||
372 | 373 | ||
373 | if (argc < 1 || argc > 2) { | 374 | if (argc < 1 || argc > 2) { |
374 | DMWARN("wrong number of arguments to dirty region log"); | 375 | DMWARN("wrong number of arguments to dirty region log"); |
@@ -387,7 +388,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, | |||
387 | } | 388 | } |
388 | } | 389 | } |
389 | 390 | ||
390 | if (sscanf(argv[0], "%u", ®ion_size) != 1 || | 391 | if (sscanf(argv[0], "%u%c", ®ion_size, &dummy) != 1 || |
391 | !_check_region_size(ti, region_size)) { | 392 | !_check_region_size(ti, region_size)) { |
392 | DMWARN("invalid region size %s", argv[0]); | 393 | DMWARN("invalid region size %s", argv[0]); |
393 | return -EINVAL; | 394 | return -EINVAL; |
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c index 801d92d237cf..922a3385eead 100644 --- a/drivers/md/dm-mpath.c +++ b/drivers/md/dm-mpath.c | |||
@@ -226,6 +226,27 @@ static void free_multipath(struct multipath *m) | |||
226 | kfree(m); | 226 | kfree(m); |
227 | } | 227 | } |
228 | 228 | ||
229 | static int set_mapinfo(struct multipath *m, union map_info *info) | ||
230 | { | ||
231 | struct dm_mpath_io *mpio; | ||
232 | |||
233 | mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC); | ||
234 | if (!mpio) | ||
235 | return -ENOMEM; | ||
236 | |||
237 | memset(mpio, 0, sizeof(*mpio)); | ||
238 | info->ptr = mpio; | ||
239 | |||
240 | return 0; | ||
241 | } | ||
242 | |||
243 | static void clear_mapinfo(struct multipath *m, union map_info *info) | ||
244 | { | ||
245 | struct dm_mpath_io *mpio = info->ptr; | ||
246 | |||
247 | info->ptr = NULL; | ||
248 | mempool_free(mpio, m->mpio_pool); | ||
249 | } | ||
229 | 250 | ||
230 | /*----------------------------------------------- | 251 | /*----------------------------------------------- |
231 | * Path selection | 252 | * Path selection |
@@ -341,13 +362,14 @@ static int __must_push_back(struct multipath *m) | |||
341 | } | 362 | } |
342 | 363 | ||
343 | static int map_io(struct multipath *m, struct request *clone, | 364 | static int map_io(struct multipath *m, struct request *clone, |
344 | struct dm_mpath_io *mpio, unsigned was_queued) | 365 | union map_info *map_context, unsigned was_queued) |
345 | { | 366 | { |
346 | int r = DM_MAPIO_REMAPPED; | 367 | int r = DM_MAPIO_REMAPPED; |
347 | size_t nr_bytes = blk_rq_bytes(clone); | 368 | size_t nr_bytes = blk_rq_bytes(clone); |
348 | unsigned long flags; | 369 | unsigned long flags; |
349 | struct pgpath *pgpath; | 370 | struct pgpath *pgpath; |
350 | struct block_device *bdev; | 371 | struct block_device *bdev; |
372 | struct dm_mpath_io *mpio = map_context->ptr; | ||
351 | 373 | ||
352 | spin_lock_irqsave(&m->lock, flags); | 374 | spin_lock_irqsave(&m->lock, flags); |
353 | 375 | ||
@@ -423,7 +445,6 @@ static void dispatch_queued_ios(struct multipath *m) | |||
423 | { | 445 | { |
424 | int r; | 446 | int r; |
425 | unsigned long flags; | 447 | unsigned long flags; |
426 | struct dm_mpath_io *mpio; | ||
427 | union map_info *info; | 448 | union map_info *info; |
428 | struct request *clone, *n; | 449 | struct request *clone, *n; |
429 | LIST_HEAD(cl); | 450 | LIST_HEAD(cl); |
@@ -436,16 +457,15 @@ static void dispatch_queued_ios(struct multipath *m) | |||
436 | list_del_init(&clone->queuelist); | 457 | list_del_init(&clone->queuelist); |
437 | 458 | ||
438 | info = dm_get_rq_mapinfo(clone); | 459 | info = dm_get_rq_mapinfo(clone); |
439 | mpio = info->ptr; | ||
440 | 460 | ||
441 | r = map_io(m, clone, mpio, 1); | 461 | r = map_io(m, clone, info, 1); |
442 | if (r < 0) { | 462 | if (r < 0) { |
443 | mempool_free(mpio, m->mpio_pool); | 463 | clear_mapinfo(m, info); |
444 | dm_kill_unmapped_request(clone, r); | 464 | dm_kill_unmapped_request(clone, r); |
445 | } else if (r == DM_MAPIO_REMAPPED) | 465 | } else if (r == DM_MAPIO_REMAPPED) |
446 | dm_dispatch_request(clone); | 466 | dm_dispatch_request(clone); |
447 | else if (r == DM_MAPIO_REQUEUE) { | 467 | else if (r == DM_MAPIO_REQUEUE) { |
448 | mempool_free(mpio, m->mpio_pool); | 468 | clear_mapinfo(m, info); |
449 | dm_requeue_unmapped_request(clone); | 469 | dm_requeue_unmapped_request(clone); |
450 | } | 470 | } |
451 | } | 471 | } |
@@ -908,20 +928,16 @@ static int multipath_map(struct dm_target *ti, struct request *clone, | |||
908 | union map_info *map_context) | 928 | union map_info *map_context) |
909 | { | 929 | { |
910 | int r; | 930 | int r; |
911 | struct dm_mpath_io *mpio; | ||
912 | struct multipath *m = (struct multipath *) ti->private; | 931 | struct multipath *m = (struct multipath *) ti->private; |
913 | 932 | ||
914 | mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC); | 933 | if (set_mapinfo(m, map_context) < 0) |
915 | if (!mpio) | ||
916 | /* ENOMEM, requeue */ | 934 | /* ENOMEM, requeue */ |
917 | return DM_MAPIO_REQUEUE; | 935 | return DM_MAPIO_REQUEUE; |
918 | memset(mpio, 0, sizeof(*mpio)); | ||
919 | 936 | ||
920 | map_context->ptr = mpio; | ||
921 | clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; | 937 | clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; |
922 | r = map_io(m, clone, mpio, 0); | 938 | r = map_io(m, clone, map_context, 0); |
923 | if (r < 0 || r == DM_MAPIO_REQUEUE) | 939 | if (r < 0 || r == DM_MAPIO_REQUEUE) |
924 | mempool_free(mpio, m->mpio_pool); | 940 | clear_mapinfo(m, map_context); |
925 | 941 | ||
926 | return r; | 942 | return r; |
927 | } | 943 | } |
@@ -1054,8 +1070,9 @@ static int switch_pg_num(struct multipath *m, const char *pgstr) | |||
1054 | struct priority_group *pg; | 1070 | struct priority_group *pg; |
1055 | unsigned pgnum; | 1071 | unsigned pgnum; |
1056 | unsigned long flags; | 1072 | unsigned long flags; |
1073 | char dummy; | ||
1057 | 1074 | ||
1058 | if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum || | 1075 | if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum || |
1059 | (pgnum > m->nr_priority_groups)) { | 1076 | (pgnum > m->nr_priority_groups)) { |
1060 | DMWARN("invalid PG number supplied to switch_pg_num"); | 1077 | DMWARN("invalid PG number supplied to switch_pg_num"); |
1061 | return -EINVAL; | 1078 | return -EINVAL; |
@@ -1085,8 +1102,9 @@ static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed) | |||
1085 | { | 1102 | { |
1086 | struct priority_group *pg; | 1103 | struct priority_group *pg; |
1087 | unsigned pgnum; | 1104 | unsigned pgnum; |
1105 | char dummy; | ||
1088 | 1106 | ||
1089 | if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum || | 1107 | if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum || |
1090 | (pgnum > m->nr_priority_groups)) { | 1108 | (pgnum > m->nr_priority_groups)) { |
1091 | DMWARN("invalid PG number supplied to bypass_pg"); | 1109 | DMWARN("invalid PG number supplied to bypass_pg"); |
1092 | return -EINVAL; | 1110 | return -EINVAL; |
@@ -1261,13 +1279,15 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone, | |||
1261 | struct path_selector *ps; | 1279 | struct path_selector *ps; |
1262 | int r; | 1280 | int r; |
1263 | 1281 | ||
1282 | BUG_ON(!mpio); | ||
1283 | |||
1264 | r = do_end_io(m, clone, error, mpio); | 1284 | r = do_end_io(m, clone, error, mpio); |
1265 | if (pgpath) { | 1285 | if (pgpath) { |
1266 | ps = &pgpath->pg->ps; | 1286 | ps = &pgpath->pg->ps; |
1267 | if (ps->type->end_io) | 1287 | if (ps->type->end_io) |
1268 | ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes); | 1288 | ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes); |
1269 | } | 1289 | } |
1270 | mempool_free(mpio, m->mpio_pool); | 1290 | clear_mapinfo(m, map_context); |
1271 | 1291 | ||
1272 | return r; | 1292 | return r; |
1273 | } | 1293 | } |
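
Note: the multipath changes fold the per-request mpio handling into a set_mapinfo()/clear_mapinfo() pair, so map_context->ptr is attached and torn down in exactly one place each (which is what lets end_io BUG on a missing mpio). The shape of that pairing, sketched with plain malloc/free instead of the kernel mempool and with simplified stand-in struct names:

#include <stdlib.h>
#include <string.h>

struct mpath_io { size_t nr_bytes; };	/* per-request state */
struct map_info { void *ptr; };		/* context handed back at end_io */

/* Allocate and attach per-request state; the caller requeues on failure. */
static int set_mapinfo(struct map_info *info)
{
	struct mpath_io *mpio = malloc(sizeof(*mpio));

	if (!mpio)
		return -1;
	memset(mpio, 0, sizeof(*mpio));
	info->ptr = mpio;
	return 0;
}

/* Detach and free it again: the only place info->ptr is torn down. */
static void clear_mapinfo(struct map_info *info)
{
	free(info->ptr);
	info->ptr = NULL;
}

int main(void)
{
	struct map_info info = { 0 };

	if (!set_mapinfo(&info))
		clear_mapinfo(&info);
	return 0;
}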
diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c index 03a837aa5ce6..3941fae0de9f 100644 --- a/drivers/md/dm-queue-length.c +++ b/drivers/md/dm-queue-length.c | |||
@@ -112,6 +112,7 @@ static int ql_add_path(struct path_selector *ps, struct dm_path *path, | |||
112 | struct selector *s = ps->context; | 112 | struct selector *s = ps->context; |
113 | struct path_info *pi; | 113 | struct path_info *pi; |
114 | unsigned repeat_count = QL_MIN_IO; | 114 | unsigned repeat_count = QL_MIN_IO; |
115 | char dummy; | ||
115 | 116 | ||
116 | /* | 117 | /* |
117 | * Arguments: [<repeat_count>] | 118 | * Arguments: [<repeat_count>] |
@@ -123,7 +124,7 @@ static int ql_add_path(struct path_selector *ps, struct dm_path *path, | |||
123 | return -EINVAL; | 124 | return -EINVAL; |
124 | } | 125 | } |
125 | 126 | ||
126 | if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) { | 127 | if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { |
127 | *error = "queue-length ps: invalid repeat count"; | 128 | *error = "queue-length ps: invalid repeat count"; |
128 | return -EINVAL; | 129 | return -EINVAL; |
129 | } | 130 | } |
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 86cb7e5d83d5..b0ba52459ed7 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c | |||
@@ -604,7 +604,9 @@ static int read_disk_sb(struct md_rdev *rdev, int size) | |||
604 | return 0; | 604 | return 0; |
605 | 605 | ||
606 | if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) { | 606 | if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) { |
607 | DMERR("Failed to read device superblock"); | 607 | DMERR("Failed to read superblock of device at position %d", |
608 | rdev->raid_disk); | ||
609 | set_bit(Faulty, &rdev->flags); | ||
608 | return -EINVAL; | 610 | return -EINVAL; |
609 | } | 611 | } |
610 | 612 | ||
@@ -615,14 +617,14 @@ static int read_disk_sb(struct md_rdev *rdev, int size) | |||
615 | 617 | ||
616 | static void super_sync(struct mddev *mddev, struct md_rdev *rdev) | 618 | static void super_sync(struct mddev *mddev, struct md_rdev *rdev) |
617 | { | 619 | { |
618 | struct md_rdev *r, *t; | 620 | struct md_rdev *r; |
619 | uint64_t failed_devices; | 621 | uint64_t failed_devices; |
620 | struct dm_raid_superblock *sb; | 622 | struct dm_raid_superblock *sb; |
621 | 623 | ||
622 | sb = page_address(rdev->sb_page); | 624 | sb = page_address(rdev->sb_page); |
623 | failed_devices = le64_to_cpu(sb->failed_devices); | 625 | failed_devices = le64_to_cpu(sb->failed_devices); |
624 | 626 | ||
625 | rdev_for_each(r, t, mddev) | 627 | rdev_for_each(r, mddev) |
626 | if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags)) | 628 | if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags)) |
627 | failed_devices |= (1ULL << r->raid_disk); | 629 | failed_devices |= (1ULL << r->raid_disk); |
628 | 630 | ||
@@ -668,7 +670,14 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev) | |||
668 | return ret; | 670 | return ret; |
669 | 671 | ||
670 | sb = page_address(rdev->sb_page); | 672 | sb = page_address(rdev->sb_page); |
671 | if (sb->magic != cpu_to_le32(DM_RAID_MAGIC)) { | 673 | |
674 | /* | ||
675 | * Two cases that we want to write new superblocks and rebuild: | ||
676 | * 1) New device (no matching magic number) | ||
677 | * 2) Device specified for rebuild (!In_sync w/ offset == 0) | ||
678 | */ | ||
679 | if ((sb->magic != cpu_to_le32(DM_RAID_MAGIC)) || | ||
680 | (!test_bit(In_sync, &rdev->flags) && !rdev->recovery_offset)) { | ||
672 | super_sync(rdev->mddev, rdev); | 681 | super_sync(rdev->mddev, rdev); |
673 | 682 | ||
674 | set_bit(FirstUse, &rdev->flags); | 683 | set_bit(FirstUse, &rdev->flags); |
@@ -700,7 +709,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev) | |||
700 | struct dm_raid_superblock *sb; | 709 | struct dm_raid_superblock *sb; |
701 | uint32_t new_devs = 0; | 710 | uint32_t new_devs = 0; |
702 | uint32_t rebuilds = 0; | 711 | uint32_t rebuilds = 0; |
703 | struct md_rdev *r, *t; | 712 | struct md_rdev *r; |
704 | struct dm_raid_superblock *sb2; | 713 | struct dm_raid_superblock *sb2; |
705 | 714 | ||
706 | sb = page_address(rdev->sb_page); | 715 | sb = page_address(rdev->sb_page); |
@@ -743,13 +752,10 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev) | |||
743 | * case the In_sync bit will /not/ be set and | 752 | * case the In_sync bit will /not/ be set and |
744 | * recovery_cp must be MaxSector. | 753 | * recovery_cp must be MaxSector. |
745 | */ | 754 | */ |
746 | rdev_for_each(r, t, mddev) { | 755 | rdev_for_each(r, mddev) { |
747 | if (!test_bit(In_sync, &r->flags)) { | 756 | if (!test_bit(In_sync, &r->flags)) { |
748 | if (!test_bit(FirstUse, &r->flags)) | 757 | DMINFO("Device %d specified for rebuild: " |
749 | DMERR("Superblock area of " | 758 | "Clearing superblock", r->raid_disk); |
750 | "rebuild device %d should have been " | ||
751 | "cleared.", r->raid_disk); | ||
752 | set_bit(FirstUse, &r->flags); | ||
753 | rebuilds++; | 759 | rebuilds++; |
754 | } else if (test_bit(FirstUse, &r->flags)) | 760 | } else if (test_bit(FirstUse, &r->flags)) |
755 | new_devs++; | 761 | new_devs++; |
@@ -778,7 +784,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev) | |||
778 | * Now we set the Faulty bit for those devices that are | 784 | * Now we set the Faulty bit for those devices that are |
779 | * recorded in the superblock as failed. | 785 | * recorded in the superblock as failed. |
780 | */ | 786 | */ |
781 | rdev_for_each(r, t, mddev) { | 787 | rdev_for_each(r, mddev) { |
782 | if (!r->sb_page) | 788 | if (!r->sb_page) |
783 | continue; | 789 | continue; |
784 | sb2 = page_address(r->sb_page); | 790 | sb2 = page_address(r->sb_page); |
@@ -851,11 +857,27 @@ static int super_validate(struct mddev *mddev, struct md_rdev *rdev) | |||
851 | static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) | 857 | static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) |
852 | { | 858 | { |
853 | int ret; | 859 | int ret; |
854 | struct md_rdev *rdev, *freshest, *tmp; | 860 | unsigned redundancy = 0; |
861 | struct raid_dev *dev; | ||
862 | struct md_rdev *rdev, *freshest; | ||
855 | struct mddev *mddev = &rs->md; | 863 | struct mddev *mddev = &rs->md; |
856 | 864 | ||
865 | switch (rs->raid_type->level) { | ||
866 | case 1: | ||
867 | redundancy = rs->md.raid_disks - 1; | ||
868 | break; | ||
869 | case 4: | ||
870 | case 5: | ||
871 | case 6: | ||
872 | redundancy = rs->raid_type->parity_devs; | ||
873 | break; | ||
874 | default: | ||
875 | ti->error = "Unknown RAID type"; | ||
876 | return -EINVAL; | ||
877 | } | ||
878 | |||
857 | freshest = NULL; | 879 | freshest = NULL; |
858 | rdev_for_each(rdev, tmp, mddev) { | 880 | rdev_for_each(rdev, mddev) { |
859 | if (!rdev->meta_bdev) | 881 | if (!rdev->meta_bdev) |
860 | continue; | 882 | continue; |
861 | 883 | ||
@@ -868,6 +890,37 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) | |||
868 | case 0: | 890 | case 0: |
869 | break; | 891 | break; |
870 | default: | 892 | default: |
893 | dev = container_of(rdev, struct raid_dev, rdev); | ||
894 | if (redundancy--) { | ||
895 | if (dev->meta_dev) | ||
896 | dm_put_device(ti, dev->meta_dev); | ||
897 | |||
898 | dev->meta_dev = NULL; | ||
899 | rdev->meta_bdev = NULL; | ||
900 | |||
901 | if (rdev->sb_page) | ||
902 | put_page(rdev->sb_page); | ||
903 | |||
904 | rdev->sb_page = NULL; | ||
905 | |||
906 | rdev->sb_loaded = 0; | ||
907 | |||
908 | /* | ||
909 | * We might be able to salvage the data device | ||
910 | * even though the meta device has failed. For | ||
911 | * now, we behave as though '- -' had been | ||
912 | * set for this device in the table. | ||
913 | */ | ||
914 | if (dev->data_dev) | ||
915 | dm_put_device(ti, dev->data_dev); | ||
916 | |||
917 | dev->data_dev = NULL; | ||
918 | rdev->bdev = NULL; | ||
919 | |||
920 | list_del(&rdev->same_set); | ||
921 | |||
922 | continue; | ||
923 | } | ||
871 | ti->error = "Failed to load superblock"; | 924 | ti->error = "Failed to load superblock"; |
872 | return ret; | 925 | return ret; |
873 | } | 926 | } |
@@ -884,7 +937,7 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) | |||
884 | if (super_validate(mddev, freshest)) | 937 | if (super_validate(mddev, freshest)) |
885 | return -EINVAL; | 938 | return -EINVAL; |
886 | 939 | ||
887 | rdev_for_each(rdev, tmp, mddev) | 940 | rdev_for_each(rdev, mddev) |
888 | if ((rdev != freshest) && super_validate(mddev, rdev)) | 941 | if ((rdev != freshest) && super_validate(mddev, rdev)) |
889 | return -EINVAL; | 942 | return -EINVAL; |
890 | 943 | ||
@@ -971,6 +1024,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
971 | 1024 | ||
972 | INIT_WORK(&rs->md.event_work, do_table_event); | 1025 | INIT_WORK(&rs->md.event_work, do_table_event); |
973 | ti->private = rs; | 1026 | ti->private = rs; |
1027 | ti->num_flush_requests = 1; | ||
974 | 1028 | ||
975 | mutex_lock(&rs->md.reconfig_mutex); | 1029 | mutex_lock(&rs->md.reconfig_mutex); |
976 | ret = md_run(&rs->md); | 1030 | ret = md_run(&rs->md); |
@@ -1209,7 +1263,7 @@ static void raid_resume(struct dm_target *ti) | |||
1209 | 1263 | ||
1210 | static struct target_type raid_target = { | 1264 | static struct target_type raid_target = { |
1211 | .name = "raid", | 1265 | .name = "raid", |
1212 | .version = {1, 1, 0}, | 1266 | .version = {1, 2, 0}, |
1213 | .module = THIS_MODULE, | 1267 | .module = THIS_MODULE, |
1214 | .ctr = raid_ctr, | 1268 | .ctr = raid_ctr, |
1215 | .dtr = raid_dtr, | 1269 | .dtr = raid_dtr, |
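
Note: the new analyse_superblocks() logic tolerates metadata-device failures up to the redundancy of the array: n-1 devices for RAID1 and the parity-device count for RAID4/5/6 (assuming the usual parity_devs values of 1 and 2). A small stand-alone version of that switch:

#include <stdio.h>

/* How many member devices may fail before the set loses data.
 * The levels and parity counts mirror the dm-raid table. */
static int max_degraded(int level, int raid_disks)
{
	switch (level) {
	case 1:
		return raid_disks - 1;	/* every mirror but one may go */
	case 4:
	case 5:
		return 1;		/* single parity */
	case 6:
		return 2;		/* double parity */
	default:
		return -1;		/* unknown RAID type */
	}
}

int main(void)
{
	printf("raid1 x4: %d\n", max_degraded(1, 4));
	printf("raid5 x4: %d\n", max_degraded(5, 4));
	printf("raid6 x6: %d\n", max_degraded(6, 6));
	return 0;
}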
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index 9bfd057be686..d039de8322f0 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c | |||
@@ -924,8 +924,9 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti, | |||
924 | unsigned int mirror, char **argv) | 924 | unsigned int mirror, char **argv) |
925 | { | 925 | { |
926 | unsigned long long offset; | 926 | unsigned long long offset; |
927 | char dummy; | ||
927 | 928 | ||
928 | if (sscanf(argv[1], "%llu", &offset) != 1) { | 929 | if (sscanf(argv[1], "%llu%c", &offset, &dummy) != 1) { |
929 | ti->error = "Invalid offset"; | 930 | ti->error = "Invalid offset"; |
930 | return -EINVAL; | 931 | return -EINVAL; |
931 | } | 932 | } |
@@ -953,13 +954,14 @@ static struct dm_dirty_log *create_dirty_log(struct dm_target *ti, | |||
953 | { | 954 | { |
954 | unsigned param_count; | 955 | unsigned param_count; |
955 | struct dm_dirty_log *dl; | 956 | struct dm_dirty_log *dl; |
957 | char dummy; | ||
956 | 958 | ||
957 | if (argc < 2) { | 959 | if (argc < 2) { |
958 | ti->error = "Insufficient mirror log arguments"; | 960 | ti->error = "Insufficient mirror log arguments"; |
959 | return NULL; | 961 | return NULL; |
960 | } | 962 | } |
961 | 963 | ||
962 | if (sscanf(argv[1], "%u", ¶m_count) != 1) { | 964 | if (sscanf(argv[1], "%u%c", ¶m_count, &dummy) != 1) { |
963 | ti->error = "Invalid mirror log argument count"; | 965 | ti->error = "Invalid mirror log argument count"; |
964 | return NULL; | 966 | return NULL; |
965 | } | 967 | } |
@@ -986,13 +988,14 @@ static int parse_features(struct mirror_set *ms, unsigned argc, char **argv, | |||
986 | { | 988 | { |
987 | unsigned num_features; | 989 | unsigned num_features; |
988 | struct dm_target *ti = ms->ti; | 990 | struct dm_target *ti = ms->ti; |
991 | char dummy; | ||
989 | 992 | ||
990 | *args_used = 0; | 993 | *args_used = 0; |
991 | 994 | ||
992 | if (!argc) | 995 | if (!argc) |
993 | return 0; | 996 | return 0; |
994 | 997 | ||
995 | if (sscanf(argv[0], "%u", &num_features) != 1) { | 998 | if (sscanf(argv[0], "%u%c", &num_features, &dummy) != 1) { |
996 | ti->error = "Invalid number of features"; | 999 | ti->error = "Invalid number of features"; |
997 | return -EINVAL; | 1000 | return -EINVAL; |
998 | } | 1001 | } |
@@ -1036,6 +1039,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1036 | unsigned int nr_mirrors, m, args_used; | 1039 | unsigned int nr_mirrors, m, args_used; |
1037 | struct mirror_set *ms; | 1040 | struct mirror_set *ms; |
1038 | struct dm_dirty_log *dl; | 1041 | struct dm_dirty_log *dl; |
1042 | char dummy; | ||
1039 | 1043 | ||
1040 | dl = create_dirty_log(ti, argc, argv, &args_used); | 1044 | dl = create_dirty_log(ti, argc, argv, &args_used); |
1041 | if (!dl) | 1045 | if (!dl) |
@@ -1044,7 +1048,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv) | |||
1044 | argv += args_used; | 1048 | argv += args_used; |
1045 | argc -= args_used; | 1049 | argc -= args_used; |
1046 | 1050 | ||
1047 | if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 || | 1051 | if (!argc || sscanf(argv[0], "%u%c", &nr_mirrors, &dummy) != 1 || |
1048 | nr_mirrors < 2 || nr_mirrors > DM_KCOPYD_MAX_REGIONS + 1) { | 1052 | nr_mirrors < 2 || nr_mirrors > DM_KCOPYD_MAX_REGIONS + 1) { |
1049 | ti->error = "Invalid number of mirrors"; | 1053 | ti->error = "Invalid number of mirrors"; |
1050 | dm_dirty_log_destroy(dl); | 1054 | dm_dirty_log_destroy(dl); |
diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c index 27f1d423b76c..6ab1192cdd5f 100644 --- a/drivers/md/dm-round-robin.c +++ b/drivers/md/dm-round-robin.c | |||
@@ -114,6 +114,7 @@ static int rr_add_path(struct path_selector *ps, struct dm_path *path, | |||
114 | struct selector *s = (struct selector *) ps->context; | 114 | struct selector *s = (struct selector *) ps->context; |
115 | struct path_info *pi; | 115 | struct path_info *pi; |
116 | unsigned repeat_count = RR_MIN_IO; | 116 | unsigned repeat_count = RR_MIN_IO; |
117 | char dummy; | ||
117 | 118 | ||
118 | if (argc > 1) { | 119 | if (argc > 1) { |
119 | *error = "round-robin ps: incorrect number of arguments"; | 120 | *error = "round-robin ps: incorrect number of arguments"; |
@@ -121,7 +122,7 @@ static int rr_add_path(struct path_selector *ps, struct dm_path *path, | |||
121 | } | 122 | } |
122 | 123 | ||
123 | /* First path argument is number of I/Os before switching path */ | 124 | /* First path argument is number of I/Os before switching path */ |
124 | if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) { | 125 | if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { |
125 | *error = "round-robin ps: invalid repeat count"; | 126 | *error = "round-robin ps: invalid repeat count"; |
126 | return -EINVAL; | 127 | return -EINVAL; |
127 | } | 128 | } |
diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c index 59883bd78214..9df8f6bd6418 100644 --- a/drivers/md/dm-service-time.c +++ b/drivers/md/dm-service-time.c | |||
@@ -110,6 +110,7 @@ static int st_add_path(struct path_selector *ps, struct dm_path *path, | |||
110 | struct path_info *pi; | 110 | struct path_info *pi; |
111 | unsigned repeat_count = ST_MIN_IO; | 111 | unsigned repeat_count = ST_MIN_IO; |
112 | unsigned relative_throughput = 1; | 112 | unsigned relative_throughput = 1; |
113 | char dummy; | ||
113 | 114 | ||
114 | /* | 115 | /* |
115 | * Arguments: [<repeat_count> [<relative_throughput>]] | 116 | * Arguments: [<repeat_count> [<relative_throughput>]] |
@@ -128,13 +129,13 @@ static int st_add_path(struct path_selector *ps, struct dm_path *path, | |||
128 | return -EINVAL; | 129 | return -EINVAL; |
129 | } | 130 | } |
130 | 131 | ||
131 | if (argc && (sscanf(argv[0], "%u", &repeat_count) != 1)) { | 132 | if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) { |
132 | *error = "service-time ps: invalid repeat count"; | 133 | *error = "service-time ps: invalid repeat count"; |
133 | return -EINVAL; | 134 | return -EINVAL; |
134 | } | 135 | } |
135 | 136 | ||
136 | if ((argc == 2) && | 137 | if ((argc == 2) && |
137 | (sscanf(argv[1], "%u", &relative_throughput) != 1 || | 138 | (sscanf(argv[1], "%u%c", &relative_throughput, &dummy) != 1 || |
138 | relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) { | 139 | relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) { |
139 | *error = "service-time ps: invalid relative_throughput value"; | 140 | *error = "service-time ps: invalid relative_throughput value"; |
140 | return -EINVAL; | 141 | return -EINVAL; |
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c index 3d80cf0c152d..35c94ff24ad5 100644 --- a/drivers/md/dm-stripe.c +++ b/drivers/md/dm-stripe.c | |||
@@ -75,8 +75,9 @@ static int get_stripe(struct dm_target *ti, struct stripe_c *sc, | |||
75 | unsigned int stripe, char **argv) | 75 | unsigned int stripe, char **argv) |
76 | { | 76 | { |
77 | unsigned long long start; | 77 | unsigned long long start; |
78 | char dummy; | ||
78 | 79 | ||
79 | if (sscanf(argv[1], "%llu", &start) != 1) | 80 | if (sscanf(argv[1], "%llu%c", &start, &dummy) != 1) |
80 | return -EINVAL; | 81 | return -EINVAL; |
81 | 82 | ||
82 | if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), | 83 | if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), |
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 63cc54289aff..2e227fbf1622 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c | |||
@@ -268,8 +268,7 @@ void dm_table_destroy(struct dm_table *t) | |||
268 | vfree(t->highs); | 268 | vfree(t->highs); |
269 | 269 | ||
270 | /* free the device list */ | 270 | /* free the device list */ |
271 | if (t->devices.next != &t->devices) | 271 | free_devices(&t->devices); |
272 | free_devices(&t->devices); | ||
273 | 272 | ||
274 | dm_free_md_mempools(t->mempools); | 273 | dm_free_md_mempools(t->mempools); |
275 | 274 | ||
@@ -464,10 +463,11 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode, | |||
464 | struct dm_dev_internal *dd; | 463 | struct dm_dev_internal *dd; |
465 | unsigned int major, minor; | 464 | unsigned int major, minor; |
466 | struct dm_table *t = ti->table; | 465 | struct dm_table *t = ti->table; |
466 | char dummy; | ||
467 | 467 | ||
468 | BUG_ON(!t); | 468 | BUG_ON(!t); |
469 | 469 | ||
470 | if (sscanf(path, "%u:%u", &major, &minor) == 2) { | 470 | if (sscanf(path, "%u:%u%c", &major, &minor, &dummy) == 2) { |
471 | /* Extract the major/minor numbers */ | 471 | /* Extract the major/minor numbers */ |
472 | dev = MKDEV(major, minor); | 472 | dev = MKDEV(major, minor); |
473 | if (MAJOR(dev) != major || MINOR(dev) != minor) | 473 | if (MAJOR(dev) != major || MINOR(dev) != minor) |
@@ -842,9 +842,10 @@ static int validate_next_arg(struct dm_arg *arg, struct dm_arg_set *arg_set, | |||
842 | unsigned *value, char **error, unsigned grouped) | 842 | unsigned *value, char **error, unsigned grouped) |
843 | { | 843 | { |
844 | const char *arg_str = dm_shift_arg(arg_set); | 844 | const char *arg_str = dm_shift_arg(arg_set); |
845 | char dummy; | ||
845 | 846 | ||
846 | if (!arg_str || | 847 | if (!arg_str || |
847 | (sscanf(arg_str, "%u", value) != 1) || | 848 | (sscanf(arg_str, "%u%c", value, &dummy) != 1) || |
848 | (*value < arg->min) || | 849 | (*value < arg->min) || |
849 | (*value > arg->max) || | 850 | (*value > arg->max) || |
850 | (grouped && arg_set->argc < *value)) { | 851 | (grouped && arg_set->argc < *value)) { |
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c index 59c4f0446ffa..737d38865b69 100644 --- a/drivers/md/dm-thin-metadata.c +++ b/drivers/md/dm-thin-metadata.c | |||
@@ -385,6 +385,7 @@ static int init_pmd(struct dm_pool_metadata *pmd, | |||
385 | data_sm = dm_sm_disk_create(tm, nr_blocks); | 385 | data_sm = dm_sm_disk_create(tm, nr_blocks); |
386 | if (IS_ERR(data_sm)) { | 386 | if (IS_ERR(data_sm)) { |
387 | DMERR("sm_disk_create failed"); | 387 | DMERR("sm_disk_create failed"); |
388 | dm_tm_unlock(tm, sblock); | ||
388 | r = PTR_ERR(data_sm); | 389 | r = PTR_ERR(data_sm); |
389 | goto bad; | 390 | goto bad; |
390 | } | 391 | } |
@@ -613,7 +614,7 @@ static int __commit_transaction(struct dm_pool_metadata *pmd) | |||
613 | if (r < 0) | 614 | if (r < 0) |
614 | goto out; | 615 | goto out; |
615 | 616 | ||
616 | r = dm_sm_root_size(pmd->metadata_sm, &data_len); | 617 | r = dm_sm_root_size(pmd->data_sm, &data_len); |
617 | if (r < 0) | 618 | if (r < 0) |
618 | goto out; | 619 | goto out; |
619 | 620 | ||
@@ -712,6 +713,9 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, | |||
712 | if (r) | 713 | if (r) |
713 | goto bad; | 714 | goto bad; |
714 | 715 | ||
716 | if (bdev_size > THIN_METADATA_MAX_SECTORS) | ||
717 | bdev_size = THIN_METADATA_MAX_SECTORS; | ||
718 | |||
715 | disk_super = dm_block_data(sblock); | 719 | disk_super = dm_block_data(sblock); |
716 | disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC); | 720 | disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC); |
717 | disk_super->version = cpu_to_le32(THIN_VERSION); | 721 | disk_super->version = cpu_to_le32(THIN_VERSION); |
@@ -789,6 +793,11 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd) | |||
789 | return 0; | 793 | return 0; |
790 | } | 794 | } |
791 | 795 | ||
796 | /* | ||
797 | * __open_device: Returns @td corresponding to device with id @dev, | ||
798 | * creating it if @create is set and incrementing @td->open_count. | ||
799 | * On failure, @td is undefined. | ||
800 | */ | ||
792 | static int __open_device(struct dm_pool_metadata *pmd, | 801 | static int __open_device(struct dm_pool_metadata *pmd, |
793 | dm_thin_id dev, int create, | 802 | dm_thin_id dev, int create, |
794 | struct dm_thin_device **td) | 803 | struct dm_thin_device **td) |
@@ -799,10 +808,16 @@ static int __open_device(struct dm_pool_metadata *pmd, | |||
799 | struct disk_device_details details_le; | 808 | struct disk_device_details details_le; |
800 | 809 | ||
801 | /* | 810 | /* |
802 | * Check the device isn't already open. | 811 | * If the device is already open, return it. |
803 | */ | 812 | */ |
804 | list_for_each_entry(td2, &pmd->thin_devices, list) | 813 | list_for_each_entry(td2, &pmd->thin_devices, list) |
805 | if (td2->id == dev) { | 814 | if (td2->id == dev) { |
815 | /* | ||
816 | * May not create an already-open device. | ||
817 | */ | ||
818 | if (create) | ||
819 | return -EEXIST; | ||
820 | |||
806 | td2->open_count++; | 821 | td2->open_count++; |
807 | *td = td2; | 822 | *td = td2; |
808 | return 0; | 823 | return 0; |
@@ -817,6 +832,9 @@ static int __open_device(struct dm_pool_metadata *pmd, | |||
817 | if (r != -ENODATA || !create) | 832 | if (r != -ENODATA || !create) |
818 | return r; | 833 | return r; |
819 | 834 | ||
835 | /* | ||
836 | * Create new device. | ||
837 | */ | ||
820 | changed = 1; | 838 | changed = 1; |
821 | details_le.mapped_blocks = 0; | 839 | details_le.mapped_blocks = 0; |
822 | details_le.transaction_id = cpu_to_le64(pmd->trans_id); | 840 | details_le.transaction_id = cpu_to_le64(pmd->trans_id); |
@@ -882,12 +900,10 @@ static int __create_thin(struct dm_pool_metadata *pmd, | |||
882 | 900 | ||
883 | r = __open_device(pmd, dev, 1, &td); | 901 | r = __open_device(pmd, dev, 1, &td); |
884 | if (r) { | 902 | if (r) { |
885 | __close_device(td); | ||
886 | dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root); | 903 | dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root); |
887 | dm_btree_del(&pmd->bl_info, dev_root); | 904 | dm_btree_del(&pmd->bl_info, dev_root); |
888 | return r; | 905 | return r; |
889 | } | 906 | } |
890 | td->changed = 1; | ||
891 | __close_device(td); | 907 | __close_device(td); |
892 | 908 | ||
893 | return r; | 909 | return r; |
@@ -967,14 +983,14 @@ static int __create_snap(struct dm_pool_metadata *pmd, | |||
967 | goto bad; | 983 | goto bad; |
968 | 984 | ||
969 | r = __set_snapshot_details(pmd, td, origin, pmd->time); | 985 | r = __set_snapshot_details(pmd, td, origin, pmd->time); |
986 | __close_device(td); | ||
987 | |||
970 | if (r) | 988 | if (r) |
971 | goto bad; | 989 | goto bad; |
972 | 990 | ||
973 | __close_device(td); | ||
974 | return 0; | 991 | return 0; |
975 | 992 | ||
976 | bad: | 993 | bad: |
977 | __close_device(td); | ||
978 | dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root); | 994 | dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root); |
979 | dm_btree_remove(&pmd->details_info, pmd->details_root, | 995 | dm_btree_remove(&pmd->details_info, pmd->details_root, |
980 | &key, &pmd->details_root); | 996 | &key, &pmd->details_root); |
@@ -1211,6 +1227,8 @@ static int __remove(struct dm_thin_device *td, dm_block_t block) | |||
1211 | if (r) | 1227 | if (r) |
1212 | return r; | 1228 | return r; |
1213 | 1229 | ||
1230 | td->mapped_blocks--; | ||
1231 | td->changed = 1; | ||
1214 | pmd->need_commit = 1; | 1232 | pmd->need_commit = 1; |
1215 | 1233 | ||
1216 | return 0; | 1234 | return 0; |
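
Note: the new __open_device() comment and the -EEXIST check describe a refcounted-open scheme: an already-open device just gains an open_count reference, asking to create an already-open id is an error, and only an unknown id with create set inserts a new entry. A simplified sketch of that lookup-or-create logic over a plain list (all names here are illustrative, not the dm-thin-metadata structures):

#include <errno.h>
#include <stdlib.h>

struct thin_dev {
	unsigned long long id;
	int open_count;
	struct thin_dev *next;
};

static struct thin_dev *devices;	/* head of the open-device list */

static int open_device(unsigned long long id, int create, struct thin_dev **td)
{
	struct thin_dev *d;

	/* Already open: creating it again is an error, otherwise
	 * just take another reference. */
	for (d = devices; d; d = d->next)
		if (d->id == id) {
			if (create)
				return -EEXIST;
			d->open_count++;
			*td = d;
			return 0;
		}

	if (!create)
		return -ENODATA;	/* unknown id and not asked to create */

	d = calloc(1, sizeof(*d));
	if (!d)
		return -ENOMEM;
	d->id = id;
	d->open_count = 1;
	d->next = devices;
	devices = d;
	*td = d;
	return 0;
}

int main(void)
{
	struct thin_dev *td;

	open_device(1, 1, &td);		/* create id 1 */
	open_device(1, 0, &td);		/* reopen: open_count becomes 2 */
	return open_device(1, 1, &td) == -EEXIST ? 0 : 1;
}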
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h index 859c16896877..ed4725e67c96 100644 --- a/drivers/md/dm-thin-metadata.h +++ b/drivers/md/dm-thin-metadata.h | |||
@@ -11,6 +11,19 @@ | |||
11 | 11 | ||
12 | #define THIN_METADATA_BLOCK_SIZE 4096 | 12 | #define THIN_METADATA_BLOCK_SIZE 4096 |
13 | 13 | ||
14 | /* | ||
15 | * The metadata device is currently limited in size. | ||
16 | * | ||
17 | * We have one block of index, which can hold 255 index entries. Each | ||
18 | * index entry contains allocation info about 16k metadata blocks. | ||
19 | */ | ||
20 | #define THIN_METADATA_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT))) | ||
21 | |||
22 | /* | ||
23 | * A metadata device larger than 16GB triggers a warning. | ||
24 | */ | ||
25 | #define THIN_METADATA_MAX_SECTORS_WARNING (16 * (1024 * 1024 * 1024 >> SECTOR_SHIFT)) | ||
26 | |||
14 | /*----------------------------------------------------------------*/ | 27 | /*----------------------------------------------------------------*/ |
15 | 28 | ||
16 | struct dm_pool_metadata; | 29 | struct dm_pool_metadata; |
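
Note: the limit in THIN_METADATA_MAX_SECTORS works out from the comment above it: 255 index entries, each covering 16384 metadata blocks of 4096 bytes (8 sectors per block), giving 255 * 16384 * 8 = 33,423,360 sectors, slightly under 16 GiB and therefore consistent with the warning threshold defined next to it. A quick check of the arithmetic:

#include <stdio.h>

#define SECTOR_SHIFT 9
#define THIN_METADATA_BLOCK_SIZE 4096

int main(void)
{
	unsigned long long max_sectors =
		255ULL * (1 << 14) * (THIN_METADATA_BLOCK_SIZE >> SECTOR_SHIFT);
	unsigned long long warn_sectors =
		16ULL * 1024 * 1024 * 1024 >> SECTOR_SHIFT;

	printf("max  : %llu sectors (%.2f GiB)\n", max_sectors,
	       (double)(max_sectors << SECTOR_SHIFT) / (1ULL << 30));
	printf("warn : %llu sectors (%.2f GiB)\n", warn_sectors,
	       (double)(warn_sectors << SECTOR_SHIFT) / (1ULL << 30));
	return 0;
}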
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index c3087575fef0..213ae32a0fc4 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c | |||
@@ -23,6 +23,7 @@ | |||
23 | #define DEFERRED_SET_SIZE 64 | 23 | #define DEFERRED_SET_SIZE 64 |
24 | #define MAPPING_POOL_SIZE 1024 | 24 | #define MAPPING_POOL_SIZE 1024 |
25 | #define PRISON_CELLS 1024 | 25 | #define PRISON_CELLS 1024 |
26 | #define COMMIT_PERIOD HZ | ||
26 | 27 | ||
27 | /* | 28 | /* |
28 | * The block size of the device holding pool data must be | 29 | * The block size of the device holding pool data must be |
@@ -32,16 +33,6 @@ | |||
32 | #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT) | 33 | #define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT) |
33 | 34 | ||
34 | /* | 35 | /* |
35 | * The metadata device is currently limited in size. The limitation is | ||
36 | * checked lower down in dm-space-map-metadata, but we also check it here | ||
37 | * so we can fail early. | ||
38 | * | ||
39 | * We have one block of index, which can hold 255 index entries. Each | ||
40 | * index entry contains allocation info about 16k metadata blocks. | ||
41 | */ | ||
42 | #define METADATA_DEV_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT))) | ||
43 | |||
44 | /* | ||
45 | * Device id is restricted to 24 bits. | 36 | * Device id is restricted to 24 bits. |
46 | */ | 37 | */ |
47 | #define MAX_DEV_ID ((1 << 24) - 1) | 38 | #define MAX_DEV_ID ((1 << 24) - 1) |
@@ -72,7 +63,7 @@ | |||
72 | * missed out if the io covers the block. (schedule_copy). | 63 | * missed out if the io covers the block. (schedule_copy). |
73 | * | 64 | * |
74 | * iv) insert the new mapping into the origin's btree | 65 | * iv) insert the new mapping into the origin's btree |
75 | * (process_prepared_mappings). This act of inserting breaks some | 66 | * (process_prepared_mapping). This act of inserting breaks some |
76 | * sharing of btree nodes between the two devices. Breaking sharing only | 67 | * sharing of btree nodes between the two devices. Breaking sharing only |
77 | * effects the btree of that specific device. Btrees for the other | 68 | * effects the btree of that specific device. Btrees for the other |
78 | * devices that share the block never change. The btree for the origin | 69 | * devices that share the block never change. The btree for the origin |
@@ -124,7 +115,7 @@ struct cell { | |||
124 | struct hlist_node list; | 115 | struct hlist_node list; |
125 | struct bio_prison *prison; | 116 | struct bio_prison *prison; |
126 | struct cell_key key; | 117 | struct cell_key key; |
127 | unsigned count; | 118 | struct bio *holder; |
128 | struct bio_list bios; | 119 | struct bio_list bios; |
129 | }; | 120 | }; |
130 | 121 | ||
@@ -220,54 +211,59 @@ static struct cell *__search_bucket(struct hlist_head *bucket, | |||
220 | * This may block if a new cell needs allocating. You must ensure that | 211 | * This may block if a new cell needs allocating. You must ensure that |
221 | * cells will be unlocked even if the calling thread is blocked. | 212 | * cells will be unlocked even if the calling thread is blocked. |
222 | * | 213 | * |
223 | * Returns the number of entries in the cell prior to the new addition | 214 | * Returns 1 if the cell was already held, 0 if @inmate is the new holder. |
224 | * or < 0 on failure. | ||
225 | */ | 215 | */ |
226 | static int bio_detain(struct bio_prison *prison, struct cell_key *key, | 216 | static int bio_detain(struct bio_prison *prison, struct cell_key *key, |
227 | struct bio *inmate, struct cell **ref) | 217 | struct bio *inmate, struct cell **ref) |
228 | { | 218 | { |
229 | int r; | 219 | int r = 1; |
230 | unsigned long flags; | 220 | unsigned long flags; |
231 | uint32_t hash = hash_key(prison, key); | 221 | uint32_t hash = hash_key(prison, key); |
232 | struct cell *uninitialized_var(cell), *cell2 = NULL; | 222 | struct cell *cell, *cell2; |
233 | 223 | ||
234 | BUG_ON(hash > prison->nr_buckets); | 224 | BUG_ON(hash > prison->nr_buckets); |
235 | 225 | ||
236 | spin_lock_irqsave(&prison->lock, flags); | 226 | spin_lock_irqsave(&prison->lock, flags); |
227 | |||
237 | cell = __search_bucket(prison->cells + hash, key); | 228 | cell = __search_bucket(prison->cells + hash, key); |
229 | if (cell) { | ||
230 | bio_list_add(&cell->bios, inmate); | ||
231 | goto out; | ||
232 | } | ||
238 | 233 | ||
239 | if (!cell) { | 234 | /* |
240 | /* | 235 | * Allocate a new cell |
241 | * Allocate a new cell | 236 | */ |
242 | */ | 237 | spin_unlock_irqrestore(&prison->lock, flags); |
243 | spin_unlock_irqrestore(&prison->lock, flags); | 238 | cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO); |
244 | cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO); | 239 | spin_lock_irqsave(&prison->lock, flags); |
245 | spin_lock_irqsave(&prison->lock, flags); | ||
246 | 240 | ||
247 | /* | 241 | /* |
248 | * We've been unlocked, so we have to double check that | 242 | * We've been unlocked, so we have to double check that |
249 | * nobody else has inserted this cell in the meantime. | 243 | * nobody else has inserted this cell in the meantime. |
250 | */ | 244 | */ |
251 | cell = __search_bucket(prison->cells + hash, key); | 245 | cell = __search_bucket(prison->cells + hash, key); |
246 | if (cell) { | ||
247 | mempool_free(cell2, prison->cell_pool); | ||
248 | bio_list_add(&cell->bios, inmate); | ||
249 | goto out; | ||
250 | } | ||
252 | 251 | ||
253 | if (!cell) { | 252 | /* |
254 | cell = cell2; | 253 | * Use new cell. |
255 | cell2 = NULL; | 254 | */ |
255 | cell = cell2; | ||
256 | 256 | ||
257 | cell->prison = prison; | 257 | cell->prison = prison; |
258 | memcpy(&cell->key, key, sizeof(cell->key)); | 258 | memcpy(&cell->key, key, sizeof(cell->key)); |
259 | cell->count = 0; | 259 | cell->holder = inmate; |
260 | bio_list_init(&cell->bios); | 260 | bio_list_init(&cell->bios); |
261 | hlist_add_head(&cell->list, prison->cells + hash); | 261 | hlist_add_head(&cell->list, prison->cells + hash); |
262 | } | ||
263 | } | ||
264 | 262 | ||
265 | r = cell->count++; | 263 | r = 0; |
266 | bio_list_add(&cell->bios, inmate); | ||
267 | spin_unlock_irqrestore(&prison->lock, flags); | ||
268 | 264 | ||
269 | if (cell2) | 265 | out: |
270 | mempool_free(cell2, prison->cell_pool); | 266 | spin_unlock_irqrestore(&prison->lock, flags); |
271 | 267 | ||
272 | *ref = cell; | 268 | *ref = cell; |
273 | 269 | ||
@@ -283,8 +279,8 @@ static void __cell_release(struct cell *cell, struct bio_list *inmates) | |||
283 | 279 | ||
284 | hlist_del(&cell->list); | 280 | hlist_del(&cell->list); |
285 | 281 | ||
286 | if (inmates) | 282 | bio_list_add(inmates, cell->holder); |
287 | bio_list_merge(inmates, &cell->bios); | 283 | bio_list_merge(inmates, &cell->bios); |
288 | 284 | ||
289 | mempool_free(cell, prison->cell_pool); | 285 | mempool_free(cell, prison->cell_pool); |
290 | } | 286 | } |
@@ -305,22 +301,44 @@ static void cell_release(struct cell *cell, struct bio_list *bios) | |||
305 | * bio may be in the cell. This function releases the cell, and also does | 301 | * bio may be in the cell. This function releases the cell, and also does |
306 | * a sanity check. | 302 | * a sanity check. |
307 | */ | 303 | */ |
304 | static void __cell_release_singleton(struct cell *cell, struct bio *bio) | ||
305 | { | ||
306 | hlist_del(&cell->list); | ||
307 | BUG_ON(cell->holder != bio); | ||
308 | BUG_ON(!bio_list_empty(&cell->bios)); | ||
309 | } | ||
310 | |||
308 | static void cell_release_singleton(struct cell *cell, struct bio *bio) | 311 | static void cell_release_singleton(struct cell *cell, struct bio *bio) |
309 | { | 312 | { |
310 | struct bio_prison *prison = cell->prison; | ||
311 | struct bio_list bios; | ||
312 | struct bio *b; | ||
313 | unsigned long flags; | 313 | unsigned long flags; |
314 | 314 | struct bio_prison *prison = cell->prison; | |
315 | bio_list_init(&bios); | ||
316 | 315 | ||
317 | spin_lock_irqsave(&prison->lock, flags); | 316 | spin_lock_irqsave(&prison->lock, flags); |
318 | __cell_release(cell, &bios); | 317 | __cell_release_singleton(cell, bio); |
319 | spin_unlock_irqrestore(&prison->lock, flags); | 318 | spin_unlock_irqrestore(&prison->lock, flags); |
319 | } | ||
320 | |||
321 | /* | ||
322 | * Sometimes we don't want the holder, just the additional bios. | ||
323 | */ | ||
324 | static void __cell_release_no_holder(struct cell *cell, struct bio_list *inmates) | ||
325 | { | ||
326 | struct bio_prison *prison = cell->prison; | ||
327 | |||
328 | hlist_del(&cell->list); | ||
329 | bio_list_merge(inmates, &cell->bios); | ||
320 | 330 | ||
321 | b = bio_list_pop(&bios); | 331 | mempool_free(cell, prison->cell_pool); |
322 | BUG_ON(b != bio); | 332 | } |
323 | BUG_ON(!bio_list_empty(&bios)); | 333 | |
334 | static void cell_release_no_holder(struct cell *cell, struct bio_list *inmates) | ||
335 | { | ||
336 | unsigned long flags; | ||
337 | struct bio_prison *prison = cell->prison; | ||
338 | |||
339 | spin_lock_irqsave(&prison->lock, flags); | ||
340 | __cell_release_no_holder(cell, inmates); | ||
341 | spin_unlock_irqrestore(&prison->lock, flags); | ||
324 | } | 342 | } |
325 | 343 | ||
326 | static void cell_error(struct cell *cell) | 344 | static void cell_error(struct cell *cell) |
@@ -471,6 +489,13 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b, | |||
471 | * devices. | 489 | * devices. |
472 | */ | 490 | */ |
473 | struct new_mapping; | 491 | struct new_mapping; |
492 | |||
493 | struct pool_features { | ||
494 | unsigned zero_new_blocks:1; | ||
495 | unsigned discard_enabled:1; | ||
496 | unsigned discard_passdown:1; | ||
497 | }; | ||
498 | |||
474 | struct pool { | 499 | struct pool { |
475 | struct list_head list; | 500 | struct list_head list; |
476 | struct dm_target *ti; /* Only set if a pool target is bound */ | 501 | struct dm_target *ti; /* Only set if a pool target is bound */ |
@@ -484,7 +509,7 @@ struct pool { | |||
484 | dm_block_t offset_mask; | 509 | dm_block_t offset_mask; |
485 | dm_block_t low_water_blocks; | 510 | dm_block_t low_water_blocks; |
486 | 511 | ||
487 | unsigned zero_new_blocks:1; | 512 | struct pool_features pf; |
488 | unsigned low_water_triggered:1; /* A dm event has been sent */ | 513 | unsigned low_water_triggered:1; /* A dm event has been sent */ |
489 | unsigned no_free_space:1; /* A -ENOSPC warning has been issued */ | 514 | unsigned no_free_space:1; /* A -ENOSPC warning has been issued */ |
490 | 515 | ||
@@ -493,17 +518,21 @@ struct pool { | |||
493 | 518 | ||
494 | struct workqueue_struct *wq; | 519 | struct workqueue_struct *wq; |
495 | struct work_struct worker; | 520 | struct work_struct worker; |
521 | struct delayed_work waker; | ||
496 | 522 | ||
497 | unsigned ref_count; | 523 | unsigned ref_count; |
524 | unsigned long last_commit_jiffies; | ||
498 | 525 | ||
499 | spinlock_t lock; | 526 | spinlock_t lock; |
500 | struct bio_list deferred_bios; | 527 | struct bio_list deferred_bios; |
501 | struct bio_list deferred_flush_bios; | 528 | struct bio_list deferred_flush_bios; |
502 | struct list_head prepared_mappings; | 529 | struct list_head prepared_mappings; |
530 | struct list_head prepared_discards; | ||
503 | 531 | ||
504 | struct bio_list retry_on_resume_list; | 532 | struct bio_list retry_on_resume_list; |
505 | 533 | ||
506 | struct deferred_set ds; /* FIXME: move to thin_c */ | 534 | struct deferred_set shared_read_ds; |
535 | struct deferred_set all_io_ds; | ||
507 | 536 | ||
508 | struct new_mapping *next_mapping; | 537 | struct new_mapping *next_mapping; |
509 | mempool_t *mapping_pool; | 538 | mempool_t *mapping_pool; |
@@ -521,7 +550,7 @@ struct pool_c { | |||
521 | struct dm_target_callbacks callbacks; | 550 | struct dm_target_callbacks callbacks; |
522 | 551 | ||
523 | dm_block_t low_water_blocks; | 552 | dm_block_t low_water_blocks; |
524 | unsigned zero_new_blocks:1; | 553 | struct pool_features pf; |
525 | }; | 554 | }; |
526 | 555 | ||
527 | /* | 556 | /* |
@@ -529,6 +558,7 @@ struct pool_c { | |||
529 | */ | 558 | */ |
530 | struct thin_c { | 559 | struct thin_c { |
531 | struct dm_dev *pool_dev; | 560 | struct dm_dev *pool_dev; |
561 | struct dm_dev *origin_dev; | ||
532 | dm_thin_id dev_id; | 562 | dm_thin_id dev_id; |
533 | 563 | ||
534 | struct pool *pool; | 564 | struct pool *pool; |
@@ -597,6 +627,13 @@ static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev | |||
597 | 627 | ||
598 | /*----------------------------------------------------------------*/ | 628 | /*----------------------------------------------------------------*/ |
599 | 629 | ||
630 | struct endio_hook { | ||
631 | struct thin_c *tc; | ||
632 | struct deferred_entry *shared_read_entry; | ||
633 | struct deferred_entry *all_io_entry; | ||
634 | struct new_mapping *overwrite_mapping; | ||
635 | }; | ||
636 | |||
600 | static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master) | 637 | static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master) |
601 | { | 638 | { |
602 | struct bio *bio; | 639 | struct bio *bio; |
@@ -607,7 +644,8 @@ static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master) | |||
607 | bio_list_init(master); | 644 | bio_list_init(master); |
608 | 645 | ||
609 | while ((bio = bio_list_pop(&bios))) { | 646 | while ((bio = bio_list_pop(&bios))) { |
610 | if (dm_get_mapinfo(bio)->ptr == tc) | 647 | struct endio_hook *h = dm_get_mapinfo(bio)->ptr; |
648 | if (h->tc == tc) | ||
611 | bio_endio(bio, DM_ENDIO_REQUEUE); | 649 | bio_endio(bio, DM_ENDIO_REQUEUE); |
612 | else | 650 | else |
613 | bio_list_add(master, bio); | 651 | bio_list_add(master, bio); |
@@ -646,14 +684,16 @@ static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block) | |||
646 | (bio->bi_sector & pool->offset_mask); | 684 | (bio->bi_sector & pool->offset_mask); |
647 | } | 685 | } |
648 | 686 | ||
649 | static void remap_and_issue(struct thin_c *tc, struct bio *bio, | 687 | static void remap_to_origin(struct thin_c *tc, struct bio *bio) |
650 | dm_block_t block) | 688 | { |
689 | bio->bi_bdev = tc->origin_dev->bdev; | ||
690 | } | ||
691 | |||
692 | static void issue(struct thin_c *tc, struct bio *bio) | ||
651 | { | 693 | { |
652 | struct pool *pool = tc->pool; | 694 | struct pool *pool = tc->pool; |
653 | unsigned long flags; | 695 | unsigned long flags; |
654 | 696 | ||
655 | remap(tc, bio, block); | ||
656 | |||
657 | /* | 697 | /* |
658 | * Batch together any FUA/FLUSH bios we find and then issue | 698 | * Batch together any FUA/FLUSH bios we find and then issue |
659 | * a single commit for them in process_deferred_bios(). | 699 | * a single commit for them in process_deferred_bios(). |
@@ -666,6 +706,19 @@ static void remap_and_issue(struct thin_c *tc, struct bio *bio, | |||
666 | generic_make_request(bio); | 706 | generic_make_request(bio); |
667 | } | 707 | } |
668 | 708 | ||
709 | static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio) | ||
710 | { | ||
711 | remap_to_origin(tc, bio); | ||
712 | issue(tc, bio); | ||
713 | } | ||
714 | |||
715 | static void remap_and_issue(struct thin_c *tc, struct bio *bio, | ||
716 | dm_block_t block) | ||
717 | { | ||
718 | remap(tc, bio, block); | ||
719 | issue(tc, bio); | ||
720 | } | ||
721 | |||
669 | /* | 722 | /* |
670 | * wake_worker() is used when new work is queued and when pool_resume is | 723 | * wake_worker() is used when new work is queued and when pool_resume is |
671 | * ready to continue deferred IO processing. | 724 | * ready to continue deferred IO processing. |
@@ -680,21 +733,17 @@ static void wake_worker(struct pool *pool) | |||
680 | /* | 733 | /* |
681 | * Bio endio functions. | 734 | * Bio endio functions. |
682 | */ | 735 | */ |
683 | struct endio_hook { | ||
684 | struct thin_c *tc; | ||
685 | bio_end_io_t *saved_bi_end_io; | ||
686 | struct deferred_entry *entry; | ||
687 | }; | ||
688 | |||
689 | struct new_mapping { | 736 | struct new_mapping { |
690 | struct list_head list; | 737 | struct list_head list; |
691 | 738 | ||
692 | int prepared; | 739 | unsigned quiesced:1; |
740 | unsigned prepared:1; | ||
741 | unsigned pass_discard:1; | ||
693 | 742 | ||
694 | struct thin_c *tc; | 743 | struct thin_c *tc; |
695 | dm_block_t virt_block; | 744 | dm_block_t virt_block; |
696 | dm_block_t data_block; | 745 | dm_block_t data_block; |
697 | struct cell *cell; | 746 | struct cell *cell, *cell2; |
698 | int err; | 747 | int err; |
699 | 748 | ||
700 | /* | 749 | /* |
@@ -711,7 +760,7 @@ static void __maybe_add_mapping(struct new_mapping *m) | |||
711 | { | 760 | { |
712 | struct pool *pool = m->tc->pool; | 761 | struct pool *pool = m->tc->pool; |
713 | 762 | ||
714 | if (list_empty(&m->list) && m->prepared) { | 763 | if (m->quiesced && m->prepared) { |
715 | list_add(&m->list, &pool->prepared_mappings); | 764 | list_add(&m->list, &pool->prepared_mappings); |
716 | wake_worker(pool); | 765 | wake_worker(pool); |
717 | } | 766 | } |
@@ -734,7 +783,8 @@ static void copy_complete(int read_err, unsigned long write_err, void *context) | |||
734 | static void overwrite_endio(struct bio *bio, int err) | 783 | static void overwrite_endio(struct bio *bio, int err) |
735 | { | 784 | { |
736 | unsigned long flags; | 785 | unsigned long flags; |
737 | struct new_mapping *m = dm_get_mapinfo(bio)->ptr; | 786 | struct endio_hook *h = dm_get_mapinfo(bio)->ptr; |
787 | struct new_mapping *m = h->overwrite_mapping; | ||
738 | struct pool *pool = m->tc->pool; | 788 | struct pool *pool = m->tc->pool; |
739 | 789 | ||
740 | m->err = err; | 790 | m->err = err; |
@@ -745,31 +795,6 @@ static void overwrite_endio(struct bio *bio, int err) | |||
745 | spin_unlock_irqrestore(&pool->lock, flags); | 795 | spin_unlock_irqrestore(&pool->lock, flags); |
746 | } | 796 | } |
747 | 797 | ||
748 | static void shared_read_endio(struct bio *bio, int err) | ||
749 | { | ||
750 | struct list_head mappings; | ||
751 | struct new_mapping *m, *tmp; | ||
752 | struct endio_hook *h = dm_get_mapinfo(bio)->ptr; | ||
753 | unsigned long flags; | ||
754 | struct pool *pool = h->tc->pool; | ||
755 | |||
756 | bio->bi_end_io = h->saved_bi_end_io; | ||
757 | bio_endio(bio, err); | ||
758 | |||
759 | INIT_LIST_HEAD(&mappings); | ||
760 | ds_dec(h->entry, &mappings); | ||
761 | |||
762 | spin_lock_irqsave(&pool->lock, flags); | ||
763 | list_for_each_entry_safe(m, tmp, &mappings, list) { | ||
764 | list_del(&m->list); | ||
765 | INIT_LIST_HEAD(&m->list); | ||
766 | __maybe_add_mapping(m); | ||
767 | } | ||
768 | spin_unlock_irqrestore(&pool->lock, flags); | ||
769 | |||
770 | mempool_free(h, pool->endio_hook_pool); | ||
771 | } | ||
772 | |||
773 | /*----------------------------------------------------------------*/ | 798 | /*----------------------------------------------------------------*/ |
774 | 799 | ||
775 | /* | 800 | /* |
@@ -800,21 +825,16 @@ static void cell_defer(struct thin_c *tc, struct cell *cell, | |||
800 | * Same as cell_defer above, except it omits one particular detainee, | 825 | * Same as cell_defer above, except it omits one particular detainee, |
801 | * a write bio that covers the block and has already been processed. | 826 | * a write bio that covers the block and has already been processed. |
802 | */ | 827 | */ |
803 | static void cell_defer_except(struct thin_c *tc, struct cell *cell, | 828 | static void cell_defer_except(struct thin_c *tc, struct cell *cell) |
804 | struct bio *exception) | ||
805 | { | 829 | { |
806 | struct bio_list bios; | 830 | struct bio_list bios; |
807 | struct bio *bio; | ||
808 | struct pool *pool = tc->pool; | 831 | struct pool *pool = tc->pool; |
809 | unsigned long flags; | 832 | unsigned long flags; |
810 | 833 | ||
811 | bio_list_init(&bios); | 834 | bio_list_init(&bios); |
812 | cell_release(cell, &bios); | ||
813 | 835 | ||
814 | spin_lock_irqsave(&pool->lock, flags); | 836 | spin_lock_irqsave(&pool->lock, flags); |
815 | while ((bio = bio_list_pop(&bios))) | 837 | cell_release_no_holder(cell, &pool->deferred_bios); |
816 | if (bio != exception) | ||
817 | bio_list_add(&pool->deferred_bios, bio); | ||
818 | spin_unlock_irqrestore(&pool->lock, flags); | 838 | spin_unlock_irqrestore(&pool->lock, flags); |
819 | 839 | ||
820 | wake_worker(pool); | 840 | wake_worker(pool); |
@@ -854,7 +874,7 @@ static void process_prepared_mapping(struct new_mapping *m) | |||
854 | * the bios in the cell. | 874 | * the bios in the cell. |
855 | */ | 875 | */ |
856 | if (bio) { | 876 | if (bio) { |
857 | cell_defer_except(tc, m->cell, bio); | 877 | cell_defer_except(tc, m->cell); |
858 | bio_endio(bio, 0); | 878 | bio_endio(bio, 0); |
859 | } else | 879 | } else |
860 | cell_defer(tc, m->cell, m->data_block); | 880 | cell_defer(tc, m->cell, m->data_block); |
@@ -863,7 +883,30 @@ static void process_prepared_mapping(struct new_mapping *m) | |||
863 | mempool_free(m, tc->pool->mapping_pool); | 883 | mempool_free(m, tc->pool->mapping_pool); |
864 | } | 884 | } |
865 | 885 | ||
866 | static void process_prepared_mappings(struct pool *pool) | 886 | static void process_prepared_discard(struct new_mapping *m) |
887 | { | ||
888 | int r; | ||
889 | struct thin_c *tc = m->tc; | ||
890 | |||
891 | r = dm_thin_remove_block(tc->td, m->virt_block); | ||
892 | if (r) | ||
893 | DMERR("dm_thin_remove_block() failed"); | ||
894 | |||
895 | /* | ||
896 | * Pass the discard down to the underlying device? | ||
897 | */ | ||
898 | if (m->pass_discard) | ||
899 | remap_and_issue(tc, m->bio, m->data_block); | ||
900 | else | ||
901 | bio_endio(m->bio, 0); | ||
902 | |||
903 | cell_defer_except(tc, m->cell); | ||
904 | cell_defer_except(tc, m->cell2); | ||
905 | mempool_free(m, tc->pool->mapping_pool); | ||
906 | } | ||
907 | |||
908 | static void process_prepared(struct pool *pool, struct list_head *head, | ||
909 | void (*fn)(struct new_mapping *)) | ||
867 | { | 910 | { |
868 | unsigned long flags; | 911 | unsigned long flags; |
869 | struct list_head maps; | 912 | struct list_head maps; |
@@ -871,21 +914,27 @@ static void process_prepared_mappings(struct pool *pool) | |||
871 | 914 | ||
872 | INIT_LIST_HEAD(&maps); | 915 | INIT_LIST_HEAD(&maps); |
873 | spin_lock_irqsave(&pool->lock, flags); | 916 | spin_lock_irqsave(&pool->lock, flags); |
874 | list_splice_init(&pool->prepared_mappings, &maps); | 917 | list_splice_init(head, &maps); |
875 | spin_unlock_irqrestore(&pool->lock, flags); | 918 | spin_unlock_irqrestore(&pool->lock, flags); |
876 | 919 | ||
877 | list_for_each_entry_safe(m, tmp, &maps, list) | 920 | list_for_each_entry_safe(m, tmp, &maps, list) |
878 | process_prepared_mapping(m); | 921 | fn(m); |
879 | } | 922 | } |
880 | 923 | ||
881 | /* | 924 | /* |
882 | * Deferred bio jobs. | 925 | * Deferred bio jobs. |
883 | */ | 926 | */ |
884 | static int io_overwrites_block(struct pool *pool, struct bio *bio) | 927 | static int io_overlaps_block(struct pool *pool, struct bio *bio) |
885 | { | 928 | { |
886 | return ((bio_data_dir(bio) == WRITE) && | 929 | return !(bio->bi_sector & pool->offset_mask) && |
887 | !(bio->bi_sector & pool->offset_mask)) && | ||
888 | (bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT)); | 930 | (bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT)); |
931 | |||
932 | } | ||
933 | |||
934 | static int io_overwrites_block(struct pool *pool, struct bio *bio) | ||
935 | { | ||
936 | return (bio_data_dir(bio) == WRITE) && | ||
937 | io_overlaps_block(pool, bio); | ||
889 | } | 938 | } |
890 | 939 | ||
891 | static void save_and_set_endio(struct bio *bio, bio_end_io_t **save, | 940 | static void save_and_set_endio(struct bio *bio, bio_end_io_t **save, |
@@ -917,7 +966,8 @@ static struct new_mapping *get_next_mapping(struct pool *pool) | |||
917 | } | 966 | } |
918 | 967 | ||
919 | static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, | 968 | static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, |
920 | dm_block_t data_origin, dm_block_t data_dest, | 969 | struct dm_dev *origin, dm_block_t data_origin, |
970 | dm_block_t data_dest, | ||
921 | struct cell *cell, struct bio *bio) | 971 | struct cell *cell, struct bio *bio) |
922 | { | 972 | { |
923 | int r; | 973 | int r; |
@@ -925,6 +975,7 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, | |||
925 | struct new_mapping *m = get_next_mapping(pool); | 975 | struct new_mapping *m = get_next_mapping(pool); |
926 | 976 | ||
927 | INIT_LIST_HEAD(&m->list); | 977 | INIT_LIST_HEAD(&m->list); |
978 | m->quiesced = 0; | ||
928 | m->prepared = 0; | 979 | m->prepared = 0; |
929 | m->tc = tc; | 980 | m->tc = tc; |
930 | m->virt_block = virt_block; | 981 | m->virt_block = virt_block; |
@@ -933,7 +984,8 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, | |||
933 | m->err = 0; | 984 | m->err = 0; |
934 | m->bio = NULL; | 985 | m->bio = NULL; |
935 | 986 | ||
936 | ds_add_work(&pool->ds, &m->list); | 987 | if (!ds_add_work(&pool->shared_read_ds, &m->list)) |
988 | m->quiesced = 1; | ||
937 | 989 | ||
938 | /* | 990 | /* |
939 | * IO to pool_dev remaps to the pool target's data_dev. | 991 | * IO to pool_dev remaps to the pool target's data_dev. |
@@ -942,14 +994,15 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, | |||
942 | * bio immediately. Otherwise we use kcopyd to clone the data first. | 994 | * bio immediately. Otherwise we use kcopyd to clone the data first. |
943 | */ | 995 | */ |
944 | if (io_overwrites_block(pool, bio)) { | 996 | if (io_overwrites_block(pool, bio)) { |
997 | struct endio_hook *h = dm_get_mapinfo(bio)->ptr; | ||
998 | h->overwrite_mapping = m; | ||
945 | m->bio = bio; | 999 | m->bio = bio; |
946 | save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); | 1000 | save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); |
947 | dm_get_mapinfo(bio)->ptr = m; | ||
948 | remap_and_issue(tc, bio, data_dest); | 1001 | remap_and_issue(tc, bio, data_dest); |
949 | } else { | 1002 | } else { |
950 | struct dm_io_region from, to; | 1003 | struct dm_io_region from, to; |
951 | 1004 | ||
952 | from.bdev = tc->pool_dev->bdev; | 1005 | from.bdev = origin->bdev; |
953 | from.sector = data_origin * pool->sectors_per_block; | 1006 | from.sector = data_origin * pool->sectors_per_block; |
954 | from.count = pool->sectors_per_block; | 1007 | from.count = pool->sectors_per_block; |
955 | 1008 | ||
@@ -967,6 +1020,22 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, | |||
967 | } | 1020 | } |
968 | } | 1021 | } |
969 | 1022 | ||
1023 | static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block, | ||
1024 | dm_block_t data_origin, dm_block_t data_dest, | ||
1025 | struct cell *cell, struct bio *bio) | ||
1026 | { | ||
1027 | schedule_copy(tc, virt_block, tc->pool_dev, | ||
1028 | data_origin, data_dest, cell, bio); | ||
1029 | } | ||
1030 | |||
1031 | static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block, | ||
1032 | dm_block_t data_dest, | ||
1033 | struct cell *cell, struct bio *bio) | ||
1034 | { | ||
1035 | schedule_copy(tc, virt_block, tc->origin_dev, | ||
1036 | virt_block, data_dest, cell, bio); | ||
1037 | } | ||
1038 | |||
970 | static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, | 1039 | static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, |
971 | dm_block_t data_block, struct cell *cell, | 1040 | dm_block_t data_block, struct cell *cell, |
972 | struct bio *bio) | 1041 | struct bio *bio) |
@@ -975,6 +1044,7 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, | |||
975 | struct new_mapping *m = get_next_mapping(pool); | 1044 | struct new_mapping *m = get_next_mapping(pool); |
976 | 1045 | ||
977 | INIT_LIST_HEAD(&m->list); | 1046 | INIT_LIST_HEAD(&m->list); |
1047 | m->quiesced = 1; | ||
978 | m->prepared = 0; | 1048 | m->prepared = 0; |
979 | m->tc = tc; | 1049 | m->tc = tc; |
980 | m->virt_block = virt_block; | 1050 | m->virt_block = virt_block; |
@@ -988,13 +1058,14 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, | |||
988 | * zeroing pre-existing data, we can issue the bio immediately. | 1058 | * zeroing pre-existing data, we can issue the bio immediately. |
989 | * Otherwise we use kcopyd to zero the data first. | 1059 | * Otherwise we use kcopyd to zero the data first. |
990 | */ | 1060 | */ |
991 | if (!pool->zero_new_blocks) | 1061 | if (!pool->pf.zero_new_blocks) |
992 | process_prepared_mapping(m); | 1062 | process_prepared_mapping(m); |
993 | 1063 | ||
994 | else if (io_overwrites_block(pool, bio)) { | 1064 | else if (io_overwrites_block(pool, bio)) { |
1065 | struct endio_hook *h = dm_get_mapinfo(bio)->ptr; | ||
1066 | h->overwrite_mapping = m; | ||
995 | m->bio = bio; | 1067 | m->bio = bio; |
996 | save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); | 1068 | save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); |
997 | dm_get_mapinfo(bio)->ptr = m; | ||
998 | remap_and_issue(tc, bio, data_block); | 1069 | remap_and_issue(tc, bio, data_block); |
999 | 1070 | ||
1000 | } else { | 1071 | } else { |
@@ -1081,7 +1152,8 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result) | |||
1081 | */ | 1152 | */ |
1082 | static void retry_on_resume(struct bio *bio) | 1153 | static void retry_on_resume(struct bio *bio) |
1083 | { | 1154 | { |
1084 | struct thin_c *tc = dm_get_mapinfo(bio)->ptr; | 1155 | struct endio_hook *h = dm_get_mapinfo(bio)->ptr; |
1156 | struct thin_c *tc = h->tc; | ||
1085 | struct pool *pool = tc->pool; | 1157 | struct pool *pool = tc->pool; |
1086 | unsigned long flags; | 1158 | unsigned long flags; |
1087 | 1159 | ||
@@ -1102,6 +1174,86 @@ static void no_space(struct cell *cell) | |||
1102 | retry_on_resume(bio); | 1174 | retry_on_resume(bio); |
1103 | } | 1175 | } |
1104 | 1176 | ||
1177 | static void process_discard(struct thin_c *tc, struct bio *bio) | ||
1178 | { | ||
1179 | int r; | ||
1180 | struct pool *pool = tc->pool; | ||
1181 | struct cell *cell, *cell2; | ||
1182 | struct cell_key key, key2; | ||
1183 | dm_block_t block = get_bio_block(tc, bio); | ||
1184 | struct dm_thin_lookup_result lookup_result; | ||
1185 | struct new_mapping *m; | ||
1186 | |||
1187 | build_virtual_key(tc->td, block, &key); | ||
1188 | if (bio_detain(tc->pool->prison, &key, bio, &cell)) | ||
1189 | return; | ||
1190 | |||
1191 | r = dm_thin_find_block(tc->td, block, 1, &lookup_result); | ||
1192 | switch (r) { | ||
1193 | case 0: | ||
1194 | /* | ||
1195 | * Check nobody is fiddling with this pool block. This can | ||
1196 | * happen if someone's in the process of breaking sharing | ||
1197 | * on this block. | ||
1198 | */ | ||
1199 | build_data_key(tc->td, lookup_result.block, &key2); | ||
1200 | if (bio_detain(tc->pool->prison, &key2, bio, &cell2)) { | ||
1201 | cell_release_singleton(cell, bio); | ||
1202 | break; | ||
1203 | } | ||
1204 | |||
1205 | if (io_overlaps_block(pool, bio)) { | ||
1206 | /* | ||
1207 | * IO may still be going to the destination block. We must | ||
1208 | * quiesce before we can do the removal. | ||
1209 | */ | ||
1210 | m = get_next_mapping(pool); | ||
1211 | m->tc = tc; | ||
1212 | m->pass_discard = (!lookup_result.shared) & pool->pf.discard_passdown; | ||
1213 | m->virt_block = block; | ||
1214 | m->data_block = lookup_result.block; | ||
1215 | m->cell = cell; | ||
1216 | m->cell2 = cell2; | ||
1217 | m->err = 0; | ||
1218 | m->bio = bio; | ||
1219 | |||
1220 | if (!ds_add_work(&pool->all_io_ds, &m->list)) { | ||
1221 | list_add(&m->list, &pool->prepared_discards); | ||
1222 | wake_worker(pool); | ||
1223 | } | ||
1224 | } else { | ||
1225 | /* | ||
1226 | * This path is hit if people are ignoring | ||
1227 | * limits->discard_granularity. It ignores any | ||
1228 | * part of the discard that is in a subsequent | ||
1229 | * block. | ||
1230 | */ | ||
1231 | sector_t offset = bio->bi_sector - (block << pool->block_shift); | ||
1232 | unsigned remaining = (pool->sectors_per_block - offset) << 9; | ||
1233 | bio->bi_size = min(bio->bi_size, remaining); | ||
1234 | |||
1235 | cell_release_singleton(cell, bio); | ||
1236 | cell_release_singleton(cell2, bio); | ||
1237 | remap_and_issue(tc, bio, lookup_result.block); | ||
1238 | } | ||
1239 | break; | ||
1240 | |||
1241 | case -ENODATA: | ||
1242 | /* | ||
1243 | * It isn't provisioned, just forget it. | ||
1244 | */ | ||
1245 | cell_release_singleton(cell, bio); | ||
1246 | bio_endio(bio, 0); | ||
1247 | break; | ||
1248 | |||
1249 | default: | ||
1250 | DMERR("discard: find block unexpectedly returned %d", r); | ||
1251 | cell_release_singleton(cell, bio); | ||
1252 | bio_io_error(bio); | ||
1253 | break; | ||
1254 | } | ||
1255 | } | ||
1256 | |||
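The partial-discard branch above clamps a misaligned discard to the end of its thin block instead of splitting it; anything past the block boundary is simply dropped. A stand-alone sketch of that clamp (not part of the patch; the 128-sector block geometry and the bio values are hypothetical):

#include <stdio.h>

/* Sketch only: mirrors the offset/remaining arithmetic in process_discard(),
 * assuming 128-sector (64 KiB) thin blocks, i.e. block_shift == 7. */
int main(void)
{
	unsigned long long sectors_per_block = 128, block_shift = 7;
	unsigned long long bi_sector = 100;	/* hypothetical discard start (sectors) */
	unsigned long long bi_size = 64 * 1024;	/* hypothetical discard length (bytes) */
	unsigned long long block = bi_sector >> block_shift;
	unsigned long long offset = bi_sector - (block << block_shift);
	unsigned long long remaining = (sectors_per_block - offset) << 9;

	if (bi_size > remaining)
		bi_size = remaining;
	/* Prints 14336: only the 28 sectors left in this block are discarded. */
	printf("clamped discard size: %llu bytes\n", bi_size);
	return 0;
}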
1105 | static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, | 1257 | static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, |
1106 | struct cell_key *key, | 1258 | struct cell_key *key, |
1107 | struct dm_thin_lookup_result *lookup_result, | 1259 | struct dm_thin_lookup_result *lookup_result, |
@@ -1113,8 +1265,8 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, | |||
1113 | r = alloc_data_block(tc, &data_block); | 1265 | r = alloc_data_block(tc, &data_block); |
1114 | switch (r) { | 1266 | switch (r) { |
1115 | case 0: | 1267 | case 0: |
1116 | schedule_copy(tc, block, lookup_result->block, | 1268 | schedule_internal_copy(tc, block, lookup_result->block, |
1117 | data_block, cell, bio); | 1269 | data_block, cell, bio); |
1118 | break; | 1270 | break; |
1119 | 1271 | ||
1120 | case -ENOSPC: | 1272 | case -ENOSPC: |
@@ -1147,13 +1299,9 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio, | |||
1147 | if (bio_data_dir(bio) == WRITE) | 1299 | if (bio_data_dir(bio) == WRITE) |
1148 | break_sharing(tc, bio, block, &key, lookup_result, cell); | 1300 | break_sharing(tc, bio, block, &key, lookup_result, cell); |
1149 | else { | 1301 | else { |
1150 | struct endio_hook *h; | 1302 | struct endio_hook *h = dm_get_mapinfo(bio)->ptr; |
1151 | h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO); | ||
1152 | 1303 | ||
1153 | h->tc = tc; | 1304 | h->shared_read_entry = ds_inc(&pool->shared_read_ds); |
1154 | h->entry = ds_inc(&pool->ds); | ||
1155 | save_and_set_endio(bio, &h->saved_bi_end_io, shared_read_endio); | ||
1156 | dm_get_mapinfo(bio)->ptr = h; | ||
1157 | 1305 | ||
1158 | cell_release_singleton(cell, bio); | 1306 | cell_release_singleton(cell, bio); |
1159 | remap_and_issue(tc, bio, lookup_result->block); | 1307 | remap_and_issue(tc, bio, lookup_result->block); |
@@ -1188,7 +1336,10 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block | |||
1188 | r = alloc_data_block(tc, &data_block); | 1336 | r = alloc_data_block(tc, &data_block); |
1189 | switch (r) { | 1337 | switch (r) { |
1190 | case 0: | 1338 | case 0: |
1191 | schedule_zero(tc, block, data_block, cell, bio); | 1339 | if (tc->origin_dev) |
1340 | schedule_external_copy(tc, block, data_block, cell, bio); | ||
1341 | else | ||
1342 | schedule_zero(tc, block, data_block, cell, bio); | ||
1192 | break; | 1343 | break; |
1193 | 1344 | ||
1194 | case -ENOSPC: | 1345 | case -ENOSPC: |
@@ -1239,16 +1390,27 @@ static void process_bio(struct thin_c *tc, struct bio *bio) | |||
1239 | break; | 1390 | break; |
1240 | 1391 | ||
1241 | case -ENODATA: | 1392 | case -ENODATA: |
1242 | provision_block(tc, bio, block, cell); | 1393 | if (bio_data_dir(bio) == READ && tc->origin_dev) { |
1394 | cell_release_singleton(cell, bio); | ||
1395 | remap_to_origin_and_issue(tc, bio); | ||
1396 | } else | ||
1397 | provision_block(tc, bio, block, cell); | ||
1243 | break; | 1398 | break; |
1244 | 1399 | ||
1245 | default: | 1400 | default: |
1246 | DMERR("dm_thin_find_block() failed, error = %d", r); | 1401 | DMERR("dm_thin_find_block() failed, error = %d", r); |
1402 | cell_release_singleton(cell, bio); | ||
1247 | bio_io_error(bio); | 1403 | bio_io_error(bio); |
1248 | break; | 1404 | break; |
1249 | } | 1405 | } |
1250 | } | 1406 | } |
1251 | 1407 | ||
1408 | static int need_commit_due_to_time(struct pool *pool) | ||
1409 | { | ||
1410 | return jiffies < pool->last_commit_jiffies || | ||
1411 | jiffies > pool->last_commit_jiffies + COMMIT_PERIOD; | ||
1412 | } | ||
1413 | |||
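need_commit_due_to_time() returns true either when more than COMMIT_PERIOD jiffies have passed since the last commit or when the jiffies counter has wrapped below last_commit_jiffies. A minimal user-space sketch of the same test (the tick values and the 100-tick period are placeholders, not taken from the patch):

#include <stdio.h>

/* Sketch only: 'now' and 'last' stand in for jiffies and last_commit_jiffies. */
static int need_commit(unsigned long now, unsigned long last, unsigned long period)
{
	return now < last ||		/* counter wrapped since the last commit */
	       now > last + period;	/* more than one period has elapsed */
}

int main(void)
{
	printf("%d %d %d\n",
	       need_commit(150, 100, 100),		/* 0: still within the period */
	       need_commit(250, 100, 100),		/* 1: period elapsed */
	       need_commit(5, 4000000000UL, 100));	/* 1: counter wrapped */
	return 0;
}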
1252 | static void process_deferred_bios(struct pool *pool) | 1414 | static void process_deferred_bios(struct pool *pool) |
1253 | { | 1415 | { |
1254 | unsigned long flags; | 1416 | unsigned long flags; |
@@ -1264,7 +1426,9 @@ static void process_deferred_bios(struct pool *pool) | |||
1264 | spin_unlock_irqrestore(&pool->lock, flags); | 1426 | spin_unlock_irqrestore(&pool->lock, flags); |
1265 | 1427 | ||
1266 | while ((bio = bio_list_pop(&bios))) { | 1428 | while ((bio = bio_list_pop(&bios))) { |
1267 | struct thin_c *tc = dm_get_mapinfo(bio)->ptr; | 1429 | struct endio_hook *h = dm_get_mapinfo(bio)->ptr; |
1430 | struct thin_c *tc = h->tc; | ||
1431 | |||
1268 | /* | 1432 | /* |
1269 | * If we've got no free new_mapping structs, and processing | 1433 | * If we've got no free new_mapping structs, and processing |
1270 | * this bio might require one, we pause until there are some | 1434 | * this bio might require one, we pause until there are some |
@@ -1277,7 +1441,11 @@ static void process_deferred_bios(struct pool *pool) | |||
1277 | 1441 | ||
1278 | break; | 1442 | break; |
1279 | } | 1443 | } |
1280 | process_bio(tc, bio); | 1444 | |
1445 | if (bio->bi_rw & REQ_DISCARD) | ||
1446 | process_discard(tc, bio); | ||
1447 | else | ||
1448 | process_bio(tc, bio); | ||
1281 | } | 1449 | } |
1282 | 1450 | ||
1283 | /* | 1451 | /* |
@@ -1290,7 +1458,7 @@ static void process_deferred_bios(struct pool *pool) | |||
1290 | bio_list_init(&pool->deferred_flush_bios); | 1458 | bio_list_init(&pool->deferred_flush_bios); |
1291 | spin_unlock_irqrestore(&pool->lock, flags); | 1459 | spin_unlock_irqrestore(&pool->lock, flags); |
1292 | 1460 | ||
1293 | if (bio_list_empty(&bios)) | 1461 | if (bio_list_empty(&bios) && !need_commit_due_to_time(pool)) |
1294 | return; | 1462 | return; |
1295 | 1463 | ||
1296 | r = dm_pool_commit_metadata(pool->pmd); | 1464 | r = dm_pool_commit_metadata(pool->pmd); |
@@ -1301,6 +1469,7 @@ static void process_deferred_bios(struct pool *pool) | |||
1301 | bio_io_error(bio); | 1469 | bio_io_error(bio); |
1302 | return; | 1470 | return; |
1303 | } | 1471 | } |
1472 | pool->last_commit_jiffies = jiffies; | ||
1304 | 1473 | ||
1305 | while ((bio = bio_list_pop(&bios))) | 1474 | while ((bio = bio_list_pop(&bios))) |
1306 | generic_make_request(bio); | 1475 | generic_make_request(bio); |
@@ -1310,10 +1479,22 @@ static void do_worker(struct work_struct *ws) | |||
1310 | { | 1479 | { |
1311 | struct pool *pool = container_of(ws, struct pool, worker); | 1480 | struct pool *pool = container_of(ws, struct pool, worker); |
1312 | 1481 | ||
1313 | process_prepared_mappings(pool); | 1482 | process_prepared(pool, &pool->prepared_mappings, process_prepared_mapping); |
1483 | process_prepared(pool, &pool->prepared_discards, process_prepared_discard); | ||
1314 | process_deferred_bios(pool); | 1484 | process_deferred_bios(pool); |
1315 | } | 1485 | } |
1316 | 1486 | ||
1487 | /* | ||
1488 | * We want to commit periodically so that not too much | ||
1489 | * unwritten data builds up. | ||
1490 | */ | ||
1491 | static void do_waker(struct work_struct *ws) | ||
1492 | { | ||
1493 | struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker); | ||
1494 | wake_worker(pool); | ||
1495 | queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD); | ||
1496 | } | ||
1497 | |||
1317 | /*----------------------------------------------------------------*/ | 1498 | /*----------------------------------------------------------------*/ |
1318 | 1499 | ||
1319 | /* | 1500 | /* |
@@ -1335,6 +1516,19 @@ static void thin_defer_bio(struct thin_c *tc, struct bio *bio) | |||
1335 | wake_worker(pool); | 1516 | wake_worker(pool); |
1336 | } | 1517 | } |
1337 | 1518 | ||
1519 | static struct endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio) | ||
1520 | { | ||
1521 | struct pool *pool = tc->pool; | ||
1522 | struct endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO); | ||
1523 | |||
1524 | h->tc = tc; | ||
1525 | h->shared_read_entry = NULL; | ||
1526 | h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : ds_inc(&pool->all_io_ds); | ||
1527 | h->overwrite_mapping = NULL; | ||
1528 | |||
1529 | return h; | ||
1530 | } | ||
1531 | |||
1338 | /* | 1532 | /* |
1339 | * Non-blocking function called from the thin target's map function. | 1533 | * Non-blocking function called from the thin target's map function. |
1340 | */ | 1534 | */ |
@@ -1347,12 +1541,8 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio, | |||
1347 | struct dm_thin_device *td = tc->td; | 1541 | struct dm_thin_device *td = tc->td; |
1348 | struct dm_thin_lookup_result result; | 1542 | struct dm_thin_lookup_result result; |
1349 | 1543 | ||
1350 | /* | 1544 | map_context->ptr = thin_hook_bio(tc, bio); |
1351 | * Save the thin context for easy access from the deferred bio later. | 1545 | if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) { |
1352 | */ | ||
1353 | map_context->ptr = tc; | ||
1354 | |||
1355 | if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) { | ||
1356 | thin_defer_bio(tc, bio); | 1546 | thin_defer_bio(tc, bio); |
1357 | return DM_MAPIO_SUBMITTED; | 1547 | return DM_MAPIO_SUBMITTED; |
1358 | } | 1548 | } |
@@ -1434,7 +1624,7 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti) | |||
1434 | 1624 | ||
1435 | pool->ti = ti; | 1625 | pool->ti = ti; |
1436 | pool->low_water_blocks = pt->low_water_blocks; | 1626 | pool->low_water_blocks = pt->low_water_blocks; |
1437 | pool->zero_new_blocks = pt->zero_new_blocks; | 1627 | pool->pf = pt->pf; |
1438 | 1628 | ||
1439 | return 0; | 1629 | return 0; |
1440 | } | 1630 | } |
@@ -1448,6 +1638,14 @@ static void unbind_control_target(struct pool *pool, struct dm_target *ti) | |||
1448 | /*---------------------------------------------------------------- | 1638 | /*---------------------------------------------------------------- |
1449 | * Pool creation | 1639 | * Pool creation |
1450 | *--------------------------------------------------------------*/ | 1640 | *--------------------------------------------------------------*/ |
1641 | /* Initialize pool features. */ | ||
1642 | static void pool_features_init(struct pool_features *pf) | ||
1643 | { | ||
1644 | pf->zero_new_blocks = 1; | ||
1645 | pf->discard_enabled = 1; | ||
1646 | pf->discard_passdown = 1; | ||
1647 | } | ||
1648 | |||
1451 | static void __pool_destroy(struct pool *pool) | 1649 | static void __pool_destroy(struct pool *pool) |
1452 | { | 1650 | { |
1453 | __pool_table_remove(pool); | 1651 | __pool_table_remove(pool); |
@@ -1495,7 +1693,7 @@ static struct pool *pool_create(struct mapped_device *pool_md, | |||
1495 | pool->block_shift = ffs(block_size) - 1; | 1693 | pool->block_shift = ffs(block_size) - 1; |
1496 | pool->offset_mask = block_size - 1; | 1694 | pool->offset_mask = block_size - 1; |
1497 | pool->low_water_blocks = 0; | 1695 | pool->low_water_blocks = 0; |
1498 | pool->zero_new_blocks = 1; | 1696 | pool_features_init(&pool->pf); |
1499 | pool->prison = prison_create(PRISON_CELLS); | 1697 | pool->prison = prison_create(PRISON_CELLS); |
1500 | if (!pool->prison) { | 1698 | if (!pool->prison) { |
1501 | *error = "Error creating pool's bio prison"; | 1699 | *error = "Error creating pool's bio prison"; |
@@ -1523,14 +1721,17 @@ static struct pool *pool_create(struct mapped_device *pool_md, | |||
1523 | } | 1721 | } |
1524 | 1722 | ||
1525 | INIT_WORK(&pool->worker, do_worker); | 1723 | INIT_WORK(&pool->worker, do_worker); |
1724 | INIT_DELAYED_WORK(&pool->waker, do_waker); | ||
1526 | spin_lock_init(&pool->lock); | 1725 | spin_lock_init(&pool->lock); |
1527 | bio_list_init(&pool->deferred_bios); | 1726 | bio_list_init(&pool->deferred_bios); |
1528 | bio_list_init(&pool->deferred_flush_bios); | 1727 | bio_list_init(&pool->deferred_flush_bios); |
1529 | INIT_LIST_HEAD(&pool->prepared_mappings); | 1728 | INIT_LIST_HEAD(&pool->prepared_mappings); |
1729 | INIT_LIST_HEAD(&pool->prepared_discards); | ||
1530 | pool->low_water_triggered = 0; | 1730 | pool->low_water_triggered = 0; |
1531 | pool->no_free_space = 0; | 1731 | pool->no_free_space = 0; |
1532 | bio_list_init(&pool->retry_on_resume_list); | 1732 | bio_list_init(&pool->retry_on_resume_list); |
1533 | ds_init(&pool->ds); | 1733 | ds_init(&pool->shared_read_ds); |
1734 | ds_init(&pool->all_io_ds); | ||
1534 | 1735 | ||
1535 | pool->next_mapping = NULL; | 1736 | pool->next_mapping = NULL; |
1536 | pool->mapping_pool = | 1737 | pool->mapping_pool = |
@@ -1549,6 +1750,7 @@ static struct pool *pool_create(struct mapped_device *pool_md, | |||
1549 | goto bad_endio_hook_pool; | 1750 | goto bad_endio_hook_pool; |
1550 | } | 1751 | } |
1551 | pool->ref_count = 1; | 1752 | pool->ref_count = 1; |
1753 | pool->last_commit_jiffies = jiffies; | ||
1552 | pool->pool_md = pool_md; | 1754 | pool->pool_md = pool_md; |
1553 | pool->md_dev = metadata_dev; | 1755 | pool->md_dev = metadata_dev; |
1554 | __pool_table_insert(pool); | 1756 | __pool_table_insert(pool); |
@@ -1588,7 +1790,8 @@ static void __pool_dec(struct pool *pool) | |||
1588 | 1790 | ||
1589 | static struct pool *__pool_find(struct mapped_device *pool_md, | 1791 | static struct pool *__pool_find(struct mapped_device *pool_md, |
1590 | struct block_device *metadata_dev, | 1792 | struct block_device *metadata_dev, |
1591 | unsigned long block_size, char **error) | 1793 | unsigned long block_size, char **error, |
1794 | int *created) | ||
1592 | { | 1795 | { |
1593 | struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev); | 1796 | struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev); |
1594 | 1797 | ||
@@ -1604,8 +1807,10 @@ static struct pool *__pool_find(struct mapped_device *pool_md, | |||
1604 | return ERR_PTR(-EINVAL); | 1807 | return ERR_PTR(-EINVAL); |
1605 | __pool_inc(pool); | 1808 | __pool_inc(pool); |
1606 | 1809 | ||
1607 | } else | 1810 | } else { |
1608 | pool = pool_create(pool_md, metadata_dev, block_size, error); | 1811 | pool = pool_create(pool_md, metadata_dev, block_size, error); |
1812 | *created = 1; | ||
1813 | } | ||
1609 | } | 1814 | } |
1610 | 1815 | ||
1611 | return pool; | 1816 | return pool; |
@@ -1629,10 +1834,6 @@ static void pool_dtr(struct dm_target *ti) | |||
1629 | mutex_unlock(&dm_thin_pool_table.mutex); | 1834 | mutex_unlock(&dm_thin_pool_table.mutex); |
1630 | } | 1835 | } |
1631 | 1836 | ||
1632 | struct pool_features { | ||
1633 | unsigned zero_new_blocks:1; | ||
1634 | }; | ||
1635 | |||
1636 | static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, | 1837 | static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, |
1637 | struct dm_target *ti) | 1838 | struct dm_target *ti) |
1638 | { | 1839 | { |
@@ -1641,7 +1842,7 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, | |||
1641 | const char *arg_name; | 1842 | const char *arg_name; |
1642 | 1843 | ||
1643 | static struct dm_arg _args[] = { | 1844 | static struct dm_arg _args[] = { |
1644 | {0, 1, "Invalid number of pool feature arguments"}, | 1845 | {0, 3, "Invalid number of pool feature arguments"}, |
1645 | }; | 1846 | }; |
1646 | 1847 | ||
1647 | /* | 1848 | /* |
@@ -1661,6 +1862,12 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, | |||
1661 | if (!strcasecmp(arg_name, "skip_block_zeroing")) { | 1862 | if (!strcasecmp(arg_name, "skip_block_zeroing")) { |
1662 | pf->zero_new_blocks = 0; | 1863 | pf->zero_new_blocks = 0; |
1663 | continue; | 1864 | continue; |
1865 | } else if (!strcasecmp(arg_name, "ignore_discard")) { | ||
1866 | pf->discard_enabled = 0; | ||
1867 | continue; | ||
1868 | } else if (!strcasecmp(arg_name, "no_discard_passdown")) { | ||
1869 | pf->discard_passdown = 0; | ||
1870 | continue; | ||
1664 | } | 1871 | } |
1665 | 1872 | ||
1666 | ti->error = "Unrecognised pool feature requested"; | 1873 | ti->error = "Unrecognised pool feature requested"; |
@@ -1678,10 +1885,12 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, | |||
1678 | * | 1885 | * |
1679 | * Optional feature arguments are: | 1886 | * Optional feature arguments are: |
1680 | * skip_block_zeroing: skips the zeroing of newly-provisioned blocks. | 1887 | * skip_block_zeroing: skips the zeroing of newly-provisioned blocks. |
1888 | * ignore_discard: disable discard | ||
1889 | * no_discard_passdown: don't pass discards down to the data device | ||
1681 | */ | 1890 | */ |
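As with other device-mapper targets, these optional features are appended to the pool table line as a count followed by the names. For example (purely illustrative), appending "1 no_discard_passdown" leaves discards enabled on thin devices, so mappings are still removed from the btree, while nothing is forwarded to the data device; the new maximum of three feature arguments corresponds to skip_block_zeroing, ignore_discard and no_discard_passdown all being set at once.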
1682 | static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) | 1891 | static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) |
1683 | { | 1892 | { |
1684 | int r; | 1893 | int r, pool_created = 0; |
1685 | struct pool_c *pt; | 1894 | struct pool_c *pt; |
1686 | struct pool *pool; | 1895 | struct pool *pool; |
1687 | struct pool_features pf; | 1896 | struct pool_features pf; |
@@ -1691,6 +1900,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
1691 | dm_block_t low_water_blocks; | 1900 | dm_block_t low_water_blocks; |
1692 | struct dm_dev *metadata_dev; | 1901 | struct dm_dev *metadata_dev; |
1693 | sector_t metadata_dev_size; | 1902 | sector_t metadata_dev_size; |
1903 | char b[BDEVNAME_SIZE]; | ||
1694 | 1904 | ||
1695 | /* | 1905 | /* |
1696 | * FIXME Remove validation from scope of lock. | 1906 | * FIXME Remove validation from scope of lock. |
@@ -1712,11 +1922,9 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
1712 | } | 1922 | } |
1713 | 1923 | ||
1714 | metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT; | 1924 | metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT; |
1715 | if (metadata_dev_size > METADATA_DEV_MAX_SECTORS) { | 1925 | if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING) |
1716 | ti->error = "Metadata device is too large"; | 1926 | DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.", |
1717 | r = -EINVAL; | 1927 | bdevname(metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS); |
1718 | goto out_metadata; | ||
1719 | } | ||
1720 | 1928 | ||
1721 | r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev); | 1929 | r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev); |
1722 | if (r) { | 1930 | if (r) { |
@@ -1742,8 +1950,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
1742 | /* | 1950 | /* |
1743 | * Set default pool features. | 1951 | * Set default pool features. |
1744 | */ | 1952 | */ |
1745 | memset(&pf, 0, sizeof(pf)); | 1953 | pool_features_init(&pf); |
1746 | pf.zero_new_blocks = 1; | ||
1747 | 1954 | ||
1748 | dm_consume_args(&as, 4); | 1955 | dm_consume_args(&as, 4); |
1749 | r = parse_pool_features(&as, &pf, ti); | 1956 | r = parse_pool_features(&as, &pf, ti); |
@@ -1757,20 +1964,58 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
1757 | } | 1964 | } |
1758 | 1965 | ||
1759 | pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, | 1966 | pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, |
1760 | block_size, &ti->error); | 1967 | block_size, &ti->error, &pool_created); |
1761 | if (IS_ERR(pool)) { | 1968 | if (IS_ERR(pool)) { |
1762 | r = PTR_ERR(pool); | 1969 | r = PTR_ERR(pool); |
1763 | goto out_free_pt; | 1970 | goto out_free_pt; |
1764 | } | 1971 | } |
1765 | 1972 | ||
1973 | /* | ||
1974 | * 'pool_created' reflects whether this is the first table load. | ||
1975 | * Top level discard support is not allowed to be changed after | ||
1976 | * initial load. This would require a pool reload to trigger thin | ||
1977 | * device changes. | ||
1978 | */ | ||
1979 | if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) { | ||
1980 | ti->error = "Discard support cannot be disabled once enabled"; | ||
1981 | r = -EINVAL; | ||
1982 | goto out_flags_changed; | ||
1983 | } | ||
1984 | |||
1985 | /* | ||
1986 | * If discard_passdown was enabled, verify that the data device | ||
1987 | * supports discards. Disable discard_passdown if not; otherwise | ||
1988 | * -EOPNOTSUPP will be returned. | ||
1989 | */ | ||
1990 | if (pf.discard_passdown) { | ||
1991 | struct request_queue *q = bdev_get_queue(data_dev->bdev); | ||
1992 | if (!q || !blk_queue_discard(q)) { | ||
1993 | DMWARN("Discard unsupported by data device: Disabling discard passdown."); | ||
1994 | pf.discard_passdown = 0; | ||
1995 | } | ||
1996 | } | ||
1997 | |||
1766 | pt->pool = pool; | 1998 | pt->pool = pool; |
1767 | pt->ti = ti; | 1999 | pt->ti = ti; |
1768 | pt->metadata_dev = metadata_dev; | 2000 | pt->metadata_dev = metadata_dev; |
1769 | pt->data_dev = data_dev; | 2001 | pt->data_dev = data_dev; |
1770 | pt->low_water_blocks = low_water_blocks; | 2002 | pt->low_water_blocks = low_water_blocks; |
1771 | pt->zero_new_blocks = pf.zero_new_blocks; | 2003 | pt->pf = pf; |
1772 | ti->num_flush_requests = 1; | 2004 | ti->num_flush_requests = 1; |
1773 | ti->num_discard_requests = 0; | 2005 | /* |
2006 | * Only need to enable discards if the pool should pass | ||
2007 | * them down to the data device. The thin device's discard | ||
2008 | * processing will cause mappings to be removed from the btree. | ||
2009 | */ | ||
2010 | if (pf.discard_enabled && pf.discard_passdown) { | ||
2011 | ti->num_discard_requests = 1; | ||
2012 | /* | ||
2013 | * Setting 'discards_supported' circumvents the normal | ||
2014 | * stacking of discard limits (this keeps the pool and | ||
2015 | * thin devices' discard limits consistent). | ||
2016 | */ | ||
2017 | ti->discards_supported = 1; | ||
2018 | } | ||
1774 | ti->private = pt; | 2019 | ti->private = pt; |
1775 | 2020 | ||
1776 | pt->callbacks.congested_fn = pool_is_congested; | 2021 | pt->callbacks.congested_fn = pool_is_congested; |
@@ -1780,6 +2025,8 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
1780 | 2025 | ||
1781 | return 0; | 2026 | return 0; |
1782 | 2027 | ||
2028 | out_flags_changed: | ||
2029 | __pool_dec(pool); | ||
1783 | out_free_pt: | 2030 | out_free_pt: |
1784 | kfree(pt); | 2031 | kfree(pt); |
1785 | out: | 2032 | out: |
@@ -1878,7 +2125,7 @@ static void pool_resume(struct dm_target *ti) | |||
1878 | __requeue_bios(pool); | 2125 | __requeue_bios(pool); |
1879 | spin_unlock_irqrestore(&pool->lock, flags); | 2126 | spin_unlock_irqrestore(&pool->lock, flags); |
1880 | 2127 | ||
1881 | wake_worker(pool); | 2128 | do_waker(&pool->waker.work); |
1882 | } | 2129 | } |
1883 | 2130 | ||
1884 | static void pool_postsuspend(struct dm_target *ti) | 2131 | static void pool_postsuspend(struct dm_target *ti) |
@@ -1887,6 +2134,7 @@ static void pool_postsuspend(struct dm_target *ti) | |||
1887 | struct pool_c *pt = ti->private; | 2134 | struct pool_c *pt = ti->private; |
1888 | struct pool *pool = pt->pool; | 2135 | struct pool *pool = pt->pool; |
1889 | 2136 | ||
2137 | cancel_delayed_work(&pool->waker); | ||
1890 | flush_workqueue(pool->wq); | 2138 | flush_workqueue(pool->wq); |
1891 | 2139 | ||
1892 | r = dm_pool_commit_metadata(pool->pmd); | 2140 | r = dm_pool_commit_metadata(pool->pmd); |
@@ -2067,7 +2315,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv) | |||
2067 | static int pool_status(struct dm_target *ti, status_type_t type, | 2315 | static int pool_status(struct dm_target *ti, status_type_t type, |
2068 | char *result, unsigned maxlen) | 2316 | char *result, unsigned maxlen) |
2069 | { | 2317 | { |
2070 | int r; | 2318 | int r, count; |
2071 | unsigned sz = 0; | 2319 | unsigned sz = 0; |
2072 | uint64_t transaction_id; | 2320 | uint64_t transaction_id; |
2073 | dm_block_t nr_free_blocks_data; | 2321 | dm_block_t nr_free_blocks_data; |
@@ -2130,10 +2378,19 @@ static int pool_status(struct dm_target *ti, status_type_t type, | |||
2130 | (unsigned long)pool->sectors_per_block, | 2378 | (unsigned long)pool->sectors_per_block, |
2131 | (unsigned long long)pt->low_water_blocks); | 2379 | (unsigned long long)pt->low_water_blocks); |
2132 | 2380 | ||
2133 | DMEMIT("%u ", !pool->zero_new_blocks); | 2381 | count = !pool->pf.zero_new_blocks + !pool->pf.discard_enabled + |
2382 | !pool->pf.discard_passdown; | ||
2383 | DMEMIT("%u ", count); | ||
2134 | 2384 | ||
2135 | if (!pool->zero_new_blocks) | 2385 | if (!pool->pf.zero_new_blocks) |
2136 | DMEMIT("skip_block_zeroing "); | 2386 | DMEMIT("skip_block_zeroing "); |
2387 | |||
2388 | if (!pool->pf.discard_enabled) | ||
2389 | DMEMIT("ignore_discard "); | ||
2390 | |||
2391 | if (!pool->pf.discard_passdown) | ||
2392 | DMEMIT("no_discard_passdown "); | ||
2393 | |||
2137 | break; | 2394 | break; |
2138 | } | 2395 | } |
2139 | 2396 | ||
@@ -2162,6 +2419,21 @@ static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm, | |||
2162 | return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); | 2419 | return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); |
2163 | } | 2420 | } |
2164 | 2421 | ||
2422 | static void set_discard_limits(struct pool *pool, struct queue_limits *limits) | ||
2423 | { | ||
2424 | /* | ||
2425 | * FIXME: these limits may be incompatible with the pool's data device | ||
2426 | */ | ||
2427 | limits->max_discard_sectors = pool->sectors_per_block; | ||
2428 | |||
2429 | /* | ||
2430 | * This is just a hint, and not enforced. We have to cope with | ||
2431 | * bios that overlap 2 blocks. | ||
2432 | */ | ||
2433 | limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT; | ||
2434 | limits->discard_zeroes_data = pool->pf.zero_new_blocks; | ||
2435 | } | ||
2436 | |||
2165 | static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) | 2437 | static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) |
2166 | { | 2438 | { |
2167 | struct pool_c *pt = ti->private; | 2439 | struct pool_c *pt = ti->private; |
@@ -2169,13 +2441,15 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) | |||
2169 | 2441 | ||
2170 | blk_limits_io_min(limits, 0); | 2442 | blk_limits_io_min(limits, 0); |
2171 | blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); | 2443 | blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); |
2444 | if (pool->pf.discard_enabled) | ||
2445 | set_discard_limits(pool, limits); | ||
2172 | } | 2446 | } |
2173 | 2447 | ||
2174 | static struct target_type pool_target = { | 2448 | static struct target_type pool_target = { |
2175 | .name = "thin-pool", | 2449 | .name = "thin-pool", |
2176 | .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | | 2450 | .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | |
2177 | DM_TARGET_IMMUTABLE, | 2451 | DM_TARGET_IMMUTABLE, |
2178 | .version = {1, 0, 0}, | 2452 | .version = {1, 1, 0}, |
2179 | .module = THIS_MODULE, | 2453 | .module = THIS_MODULE, |
2180 | .ctr = pool_ctr, | 2454 | .ctr = pool_ctr, |
2181 | .dtr = pool_dtr, | 2455 | .dtr = pool_dtr, |
@@ -2202,6 +2476,8 @@ static void thin_dtr(struct dm_target *ti) | |||
2202 | __pool_dec(tc->pool); | 2476 | __pool_dec(tc->pool); |
2203 | dm_pool_close_thin_device(tc->td); | 2477 | dm_pool_close_thin_device(tc->td); |
2204 | dm_put_device(ti, tc->pool_dev); | 2478 | dm_put_device(ti, tc->pool_dev); |
2479 | if (tc->origin_dev) | ||
2480 | dm_put_device(ti, tc->origin_dev); | ||
2205 | kfree(tc); | 2481 | kfree(tc); |
2206 | 2482 | ||
2207 | mutex_unlock(&dm_thin_pool_table.mutex); | 2483 | mutex_unlock(&dm_thin_pool_table.mutex); |
@@ -2210,21 +2486,25 @@ static void thin_dtr(struct dm_target *ti) | |||
2210 | /* | 2486 | /* |
2211 | * Thin target parameters: | 2487 | * Thin target parameters: |
2212 | * | 2488 | * |
2213 | * <pool_dev> <dev_id> | 2489 | * <pool_dev> <dev_id> [origin_dev] |
2214 | * | 2490 | * |
2215 | * pool_dev: the path to the pool (eg, /dev/mapper/my_pool) | 2491 | * pool_dev: the path to the pool (eg, /dev/mapper/my_pool) |
2216 | * dev_id: the internal device identifier | 2492 | * dev_id: the internal device identifier |
2493 | * origin_dev: a device external to the pool that should act as the origin | ||
2494 | * | ||
2495 | * If the pool device has discards disabled, they get disabled for the thin | ||
2496 | * device as well. | ||
2217 | */ | 2497 | */ |
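For illustration only (device names and sizes are hypothetical), a three-argument table line such as "0 2097152 thin /dev/mapper/pool 1 /dev/vg/origin" creates a 1 GiB thin device with internal id 1 whose unprovisioned blocks are read from /dev/vg/origin and copied from it when first written; dropping the third argument gives the original two-argument form with no external origin.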
2218 | static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) | 2498 | static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) |
2219 | { | 2499 | { |
2220 | int r; | 2500 | int r; |
2221 | struct thin_c *tc; | 2501 | struct thin_c *tc; |
2222 | struct dm_dev *pool_dev; | 2502 | struct dm_dev *pool_dev, *origin_dev; |
2223 | struct mapped_device *pool_md; | 2503 | struct mapped_device *pool_md; |
2224 | 2504 | ||
2225 | mutex_lock(&dm_thin_pool_table.mutex); | 2505 | mutex_lock(&dm_thin_pool_table.mutex); |
2226 | 2506 | ||
2227 | if (argc != 2) { | 2507 | if (argc != 2 && argc != 3) { |
2228 | ti->error = "Invalid argument count"; | 2508 | ti->error = "Invalid argument count"; |
2229 | r = -EINVAL; | 2509 | r = -EINVAL; |
2230 | goto out_unlock; | 2510 | goto out_unlock; |
@@ -2237,6 +2517,15 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
2237 | goto out_unlock; | 2517 | goto out_unlock; |
2238 | } | 2518 | } |
2239 | 2519 | ||
2520 | if (argc == 3) { | ||
2521 | r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev); | ||
2522 | if (r) { | ||
2523 | ti->error = "Error opening origin device"; | ||
2524 | goto bad_origin_dev; | ||
2525 | } | ||
2526 | tc->origin_dev = origin_dev; | ||
2527 | } | ||
2528 | |||
2240 | r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev); | 2529 | r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev); |
2241 | if (r) { | 2530 | if (r) { |
2242 | ti->error = "Error opening pool device"; | 2531 | ti->error = "Error opening pool device"; |
@@ -2273,8 +2562,12 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) | |||
2273 | 2562 | ||
2274 | ti->split_io = tc->pool->sectors_per_block; | 2563 | ti->split_io = tc->pool->sectors_per_block; |
2275 | ti->num_flush_requests = 1; | 2564 | ti->num_flush_requests = 1; |
2276 | ti->num_discard_requests = 0; | 2565 | |
2277 | ti->discards_supported = 0; | 2566 | /* In case the pool supports discards, pass them on. */ |
2567 | if (tc->pool->pf.discard_enabled) { | ||
2568 | ti->discards_supported = 1; | ||
2569 | ti->num_discard_requests = 1; | ||
2570 | } | ||
2278 | 2571 | ||
2279 | dm_put(pool_md); | 2572 | dm_put(pool_md); |
2280 | 2573 | ||
@@ -2289,6 +2582,9 @@ bad_pool_lookup: | |||
2289 | bad_common: | 2582 | bad_common: |
2290 | dm_put_device(ti, tc->pool_dev); | 2583 | dm_put_device(ti, tc->pool_dev); |
2291 | bad_pool_dev: | 2584 | bad_pool_dev: |
2585 | if (tc->origin_dev) | ||
2586 | dm_put_device(ti, tc->origin_dev); | ||
2587 | bad_origin_dev: | ||
2292 | kfree(tc); | 2588 | kfree(tc); |
2293 | out_unlock: | 2589 | out_unlock: |
2294 | mutex_unlock(&dm_thin_pool_table.mutex); | 2590 | mutex_unlock(&dm_thin_pool_table.mutex); |
@@ -2299,11 +2595,46 @@ out_unlock: | |||
2299 | static int thin_map(struct dm_target *ti, struct bio *bio, | 2595 | static int thin_map(struct dm_target *ti, struct bio *bio, |
2300 | union map_info *map_context) | 2596 | union map_info *map_context) |
2301 | { | 2597 | { |
2302 | bio->bi_sector -= ti->begin; | 2598 | bio->bi_sector = dm_target_offset(ti, bio->bi_sector); |
2303 | 2599 | ||
2304 | return thin_bio_map(ti, bio, map_context); | 2600 | return thin_bio_map(ti, bio, map_context); |
2305 | } | 2601 | } |
2306 | 2602 | ||
2603 | static int thin_endio(struct dm_target *ti, | ||
2604 | struct bio *bio, int err, | ||
2605 | union map_info *map_context) | ||
2606 | { | ||
2607 | unsigned long flags; | ||
2608 | struct endio_hook *h = map_context->ptr; | ||
2609 | struct list_head work; | ||
2610 | struct new_mapping *m, *tmp; | ||
2611 | struct pool *pool = h->tc->pool; | ||
2612 | |||
2613 | if (h->shared_read_entry) { | ||
2614 | INIT_LIST_HEAD(&work); | ||
2615 | ds_dec(h->shared_read_entry, &work); | ||
2616 | |||
2617 | spin_lock_irqsave(&pool->lock, flags); | ||
2618 | list_for_each_entry_safe(m, tmp, &work, list) { | ||
2619 | list_del(&m->list); | ||
2620 | m->quiesced = 1; | ||
2621 | __maybe_add_mapping(m); | ||
2622 | } | ||
2623 | spin_unlock_irqrestore(&pool->lock, flags); | ||
2624 | } | ||
2625 | |||
2626 | if (h->all_io_entry) { | ||
2627 | INIT_LIST_HEAD(&work); | ||
2628 | ds_dec(h->all_io_entry, &work); | ||
2629 | list_for_each_entry_safe(m, tmp, &work, list) | ||
2630 | list_add(&m->list, &pool->prepared_discards); | ||
2631 | } | ||
2632 | |||
2633 | mempool_free(h, pool->endio_hook_pool); | ||
2634 | |||
2635 | return 0; | ||
2636 | } | ||
2637 | |||
2307 | static void thin_postsuspend(struct dm_target *ti) | 2638 | static void thin_postsuspend(struct dm_target *ti) |
2308 | { | 2639 | { |
2309 | if (dm_noflush_suspending(ti)) | 2640 | if (dm_noflush_suspending(ti)) |
@@ -2347,6 +2678,8 @@ static int thin_status(struct dm_target *ti, status_type_t type, | |||
2347 | DMEMIT("%s %lu", | 2678 | DMEMIT("%s %lu", |
2348 | format_dev_t(buf, tc->pool_dev->bdev->bd_dev), | 2679 | format_dev_t(buf, tc->pool_dev->bdev->bd_dev), |
2349 | (unsigned long) tc->dev_id); | 2680 | (unsigned long) tc->dev_id); |
2681 | if (tc->origin_dev) | ||
2682 | DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev)); | ||
2350 | break; | 2683 | break; |
2351 | } | 2684 | } |
2352 | } | 2685 | } |
@@ -2377,18 +2710,21 @@ static int thin_iterate_devices(struct dm_target *ti, | |||
2377 | static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) | 2710 | static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) |
2378 | { | 2711 | { |
2379 | struct thin_c *tc = ti->private; | 2712 | struct thin_c *tc = ti->private; |
2713 | struct pool *pool = tc->pool; | ||
2380 | 2714 | ||
2381 | blk_limits_io_min(limits, 0); | 2715 | blk_limits_io_min(limits, 0); |
2382 | blk_limits_io_opt(limits, tc->pool->sectors_per_block << SECTOR_SHIFT); | 2716 | blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); |
2717 | set_discard_limits(pool, limits); | ||
2383 | } | 2718 | } |
2384 | 2719 | ||
2385 | static struct target_type thin_target = { | 2720 | static struct target_type thin_target = { |
2386 | .name = "thin", | 2721 | .name = "thin", |
2387 | .version = {1, 0, 0}, | 2722 | .version = {1, 1, 0}, |
2388 | .module = THIS_MODULE, | 2723 | .module = THIS_MODULE, |
2389 | .ctr = thin_ctr, | 2724 | .ctr = thin_ctr, |
2390 | .dtr = thin_dtr, | 2725 | .dtr = thin_dtr, |
2391 | .map = thin_map, | 2726 | .map = thin_map, |
2727 | .end_io = thin_endio, | ||
2392 | .postsuspend = thin_postsuspend, | 2728 | .postsuspend = thin_postsuspend, |
2393 | .status = thin_status, | 2729 | .status = thin_status, |
2394 | .iterate_devices = thin_iterate_devices, | 2730 | .iterate_devices = thin_iterate_devices, |
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c new file mode 100644 index 000000000000..fa365d39b612 --- /dev/null +++ b/drivers/md/dm-verity.c | |||
@@ -0,0 +1,913 @@ | |||
1 | /* | ||
2 | * Copyright (C) 2012 Red Hat, Inc. | ||
3 | * | ||
4 | * Author: Mikulas Patocka <mpatocka@redhat.com> | ||
5 | * | ||
6 | * Based on Chromium dm-verity driver (C) 2011 The Chromium OS Authors | ||
7 | * | ||
8 | * This file is released under the GPLv2. | ||
9 | * | ||
10 | * In the file "/sys/module/dm_verity/parameters/prefetch_cluster" you can set | ||
11 | * the default prefetch value. Data are read in "prefetch_cluster" chunks from the | ||
12 | * hash device. Setting this greatly improves performance when data and hash | ||
13 | * are on different partitions of the same disk, on devices with poor random | ||
14 | * access behavior. | ||
15 | */ | ||
16 | |||
17 | #include "dm-bufio.h" | ||
18 | |||
19 | #include <linux/module.h> | ||
20 | #include <linux/device-mapper.h> | ||
21 | #include <crypto/hash.h> | ||
22 | |||
23 | #define DM_MSG_PREFIX "verity" | ||
24 | |||
25 | #define DM_VERITY_IO_VEC_INLINE 16 | ||
26 | #define DM_VERITY_MEMPOOL_SIZE 4 | ||
27 | #define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144 | ||
28 | |||
29 | #define DM_VERITY_MAX_LEVELS 63 | ||
30 | |||
31 | static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE; | ||
32 | |||
33 | module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, S_IRUGO | S_IWUSR); | ||
34 | |||
35 | struct dm_verity { | ||
36 | struct dm_dev *data_dev; | ||
37 | struct dm_dev *hash_dev; | ||
38 | struct dm_target *ti; | ||
39 | struct dm_bufio_client *bufio; | ||
40 | char *alg_name; | ||
41 | struct crypto_shash *tfm; | ||
42 | u8 *root_digest; /* digest of the root block */ | ||
43 | u8 *salt; /* salt: its size is salt_size */ | ||
44 | unsigned salt_size; | ||
45 | sector_t data_start; /* data offset in 512-byte sectors */ | ||
46 | sector_t hash_start; /* hash start in blocks */ | ||
47 | sector_t data_blocks; /* the number of data blocks */ | ||
48 | sector_t hash_blocks; /* the number of hash blocks */ | ||
49 | unsigned char data_dev_block_bits; /* log2(data blocksize) */ | ||
50 | unsigned char hash_dev_block_bits; /* log2(hash blocksize) */ | ||
51 | unsigned char hash_per_block_bits; /* log2(hashes in hash block) */ | ||
52 | unsigned char levels; /* the number of tree levels */ | ||
53 | unsigned char version; | ||
54 | unsigned digest_size; /* digest size for the current hash algorithm */ | ||
55 | unsigned shash_descsize;/* the size of temporary space for crypto */ | ||
56 | int hash_failed; /* set to 1 if hash of any block failed */ | ||
57 | |||
58 | mempool_t *io_mempool; /* mempool of struct dm_verity_io */ | ||
59 | mempool_t *vec_mempool; /* mempool of bio vector */ | ||
60 | |||
61 | struct workqueue_struct *verify_wq; | ||
62 | |||
63 | /* starting blocks for each tree level. 0 is the lowest level. */ | ||
64 | sector_t hash_level_block[DM_VERITY_MAX_LEVELS]; | ||
65 | }; | ||
66 | |||
67 | struct dm_verity_io { | ||
68 | struct dm_verity *v; | ||
69 | struct bio *bio; | ||
70 | |||
71 | /* original values of bio->bi_end_io and bio->bi_private */ | ||
72 | bio_end_io_t *orig_bi_end_io; | ||
73 | void *orig_bi_private; | ||
74 | |||
75 | sector_t block; | ||
76 | unsigned n_blocks; | ||
77 | |||
78 | /* saved bio vector */ | ||
79 | struct bio_vec *io_vec; | ||
80 | unsigned io_vec_size; | ||
81 | |||
82 | struct work_struct work; | ||
83 | |||
84 | /* A space for short vectors; longer vectors are allocated separately. */ | ||
85 | struct bio_vec io_vec_inline[DM_VERITY_IO_VEC_INLINE]; | ||
86 | |||
87 | /* | ||
88 | * Three variably-sized fields follow this struct: | ||
89 | * | ||
90 | * u8 hash_desc[v->shash_descsize]; | ||
91 | * u8 real_digest[v->digest_size]; | ||
92 | * u8 want_digest[v->digest_size]; | ||
93 | * | ||
94 | * To access them use: io_hash_desc(), io_real_digest() and io_want_digest(). | ||
95 | */ | ||
96 | }; | ||
97 | |||
98 | static struct shash_desc *io_hash_desc(struct dm_verity *v, struct dm_verity_io *io) | ||
99 | { | ||
100 | return (struct shash_desc *)(io + 1); | ||
101 | } | ||
102 | |||
103 | static u8 *io_real_digest(struct dm_verity *v, struct dm_verity_io *io) | ||
104 | { | ||
105 | return (u8 *)(io + 1) + v->shash_descsize; | ||
106 | } | ||
107 | |||
108 | static u8 *io_want_digest(struct dm_verity *v, struct dm_verity_io *io) | ||
109 | { | ||
110 | return (u8 *)(io + 1) + v->shash_descsize + v->digest_size; | ||
111 | } | ||
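The accessors above assume a fixed layout of the space that trails each struct dm_verity_io; a sketch of that layout, sized to match the mempool_create_kmalloc_pool() call in verity_ctr() further below (the 32-byte digest figure is an assumption for a sha256-like algorithm):

/*
 * [ struct dm_verity_io ][ shash_desc + state ][ real_digest ][ want_digest ]
 *                          v->shash_descsize     digest_size    digest_size
 *                                                (e.g. 32)      (e.g. 32)
 *
 * Each io_mempool element is therefore allocated with size
 *   sizeof(struct dm_verity_io) + v->shash_descsize + v->digest_size * 2
 */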
112 | |||
113 | /* | ||
114 | * Auxiliary structure appended to each dm-bufio buffer. If the value | ||
115 | * hash_verified is nonzero, the hash of the block has been verified. | ||
116 | * | ||
117 | * hash_verified is set to 0 when the buffer is allocated; it may later be | ||
118 | * changed to 1 and is never reset to 0 again. | ||
119 | * | ||
120 | * There is no lock around this value. At worst, a race causes several | ||
121 | * processes to verify the hash of the same buffer concurrently and to | ||
122 | * write 1 to hash_verified at the same time. | ||
123 | * This is harmless, so no locking is needed. | ||
124 | */ | ||
125 | struct buffer_aux { | ||
126 | int hash_verified; | ||
127 | }; | ||
128 | |||
129 | /* | ||
130 | * Initialize struct buffer_aux for a freshly created buffer. | ||
131 | */ | ||
132 | static void dm_bufio_alloc_callback(struct dm_buffer *buf) | ||
133 | { | ||
134 | struct buffer_aux *aux = dm_bufio_get_aux_data(buf); | ||
135 | |||
136 | aux->hash_verified = 0; | ||
137 | } | ||
138 | |||
139 | /* | ||
140 | * Translate input sector number to the sector number on the target device. | ||
141 | */ | ||
142 | static sector_t verity_map_sector(struct dm_verity *v, sector_t bi_sector) | ||
143 | { | ||
144 | return v->data_start + dm_target_offset(v->ti, bi_sector); | ||
145 | } | ||
146 | |||
147 | /* | ||
148 | * Return hash position of a specified block at a specified tree level | ||
149 | * (0 is the lowest level). | ||
150 | * The lowest "hash_per_block_bits"-bits of the result denote hash position | ||
151 | * inside a hash block. The remaining bits denote location of the hash block. | ||
152 | */ | ||
153 | static sector_t verity_position_at_level(struct dm_verity *v, sector_t block, | ||
154 | int level) | ||
155 | { | ||
156 | return block >> (level * v->hash_per_block_bits); | ||
157 | } | ||
158 | |||
159 | static void verity_hash_at_level(struct dm_verity *v, sector_t block, int level, | ||
160 | sector_t *hash_block, unsigned *offset) | ||
161 | { | ||
162 | sector_t position = verity_position_at_level(v, block, level); | ||
163 | unsigned idx; | ||
164 | |||
165 | *hash_block = v->hash_level_block[level] + (position >> v->hash_per_block_bits); | ||
166 | |||
167 | if (!offset) | ||
168 | return; | ||
169 | |||
170 | idx = position & ((1 << v->hash_per_block_bits) - 1); | ||
171 | if (!v->version) | ||
172 | *offset = idx * v->digest_size; | ||
173 | else | ||
174 | *offset = idx << (v->hash_dev_block_bits - v->hash_per_block_bits); | ||
175 | } | ||
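A worked example of the indexing above may help; the 4096-byte hash blocks and 32-byte digests (sha256) are assumptions for illustration only, giving hash_per_block_bits = 7, i.e. 128 hashes per hash block:

/*
 * Illustration only -- 4096-byte hash blocks and 32-byte digests assumed:
 * hash_per_block_bits = fls(4096 / 32) - 1 = 7, so 128 hashes per block.
 *
 * For data block 1000000 at level 0:
 *   position   = 1000000 >> (0 * 7)                   = 1000000
 *   hash_block = hash_level_block[0] + (1000000 >> 7) = hash_level_block[0] + 7812
 *   idx        = 1000000 & 127                        = 64
 *   offset     = idx * 32           = 2048   (version 0)
 *   offset     = idx << (12 - 7)    = 2048   (version >= 1)
 */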
176 | |||
177 | /* | ||
178 | * Verify hash of a metadata block pertaining to the specified data block | ||
179 | * ("block" argument) at a specified level ("level" argument). | ||
180 | * | ||
181 | * On successful return, io_want_digest(v, io) contains the hash value for | ||
182 | * a lower tree level or for the data block (if we're at the lowest level). | ||
183 | * | ||
184 | * If "skip_unverified" is true, an unverified buffer is skipped and 1 is returned. | ||
185 | * If "skip_unverified" is false, an unverified buffer is hashed and verified | ||
186 | * against the current value of io_want_digest(v, io). | ||
187 | */ | ||
188 | static int verity_verify_level(struct dm_verity_io *io, sector_t block, | ||
189 | int level, bool skip_unverified) | ||
190 | { | ||
191 | struct dm_verity *v = io->v; | ||
192 | struct dm_buffer *buf; | ||
193 | struct buffer_aux *aux; | ||
194 | u8 *data; | ||
195 | int r; | ||
196 | sector_t hash_block; | ||
197 | unsigned offset; | ||
198 | |||
199 | verity_hash_at_level(v, block, level, &hash_block, &offset); | ||
200 | |||
201 | data = dm_bufio_read(v->bufio, hash_block, &buf); | ||
202 | if (unlikely(IS_ERR(data))) | ||
203 | return PTR_ERR(data); | ||
204 | |||
205 | aux = dm_bufio_get_aux_data(buf); | ||
206 | |||
207 | if (!aux->hash_verified) { | ||
208 | struct shash_desc *desc; | ||
209 | u8 *result; | ||
210 | |||
211 | if (skip_unverified) { | ||
212 | r = 1; | ||
213 | goto release_ret_r; | ||
214 | } | ||
215 | |||
216 | desc = io_hash_desc(v, io); | ||
217 | desc->tfm = v->tfm; | ||
218 | desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; | ||
219 | r = crypto_shash_init(desc); | ||
220 | if (r < 0) { | ||
221 | DMERR("crypto_shash_init failed: %d", r); | ||
222 | goto release_ret_r; | ||
223 | } | ||
224 | |||
225 | if (likely(v->version >= 1)) { | ||
226 | r = crypto_shash_update(desc, v->salt, v->salt_size); | ||
227 | if (r < 0) { | ||
228 | DMERR("crypto_shash_update failed: %d", r); | ||
229 | goto release_ret_r; | ||
230 | } | ||
231 | } | ||
232 | |||
233 | r = crypto_shash_update(desc, data, 1 << v->hash_dev_block_bits); | ||
234 | if (r < 0) { | ||
235 | DMERR("crypto_shash_update failed: %d", r); | ||
236 | goto release_ret_r; | ||
237 | } | ||
238 | |||
239 | if (!v->version) { | ||
240 | r = crypto_shash_update(desc, v->salt, v->salt_size); | ||
241 | if (r < 0) { | ||
242 | DMERR("crypto_shash_update failed: %d", r); | ||
243 | goto release_ret_r; | ||
244 | } | ||
245 | } | ||
246 | |||
247 | result = io_real_digest(v, io); | ||
248 | r = crypto_shash_final(desc, result); | ||
249 | if (r < 0) { | ||
250 | DMERR("crypto_shash_final failed: %d", r); | ||
251 | goto release_ret_r; | ||
252 | } | ||
253 | if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) { | ||
254 | DMERR_LIMIT("metadata block %llu is corrupted", | ||
255 | (unsigned long long)hash_block); | ||
256 | v->hash_failed = 1; | ||
257 | r = -EIO; | ||
258 | goto release_ret_r; | ||
259 | } else | ||
260 | aux->hash_verified = 1; | ||
261 | } | ||
262 | |||
263 | data += offset; | ||
264 | |||
265 | memcpy(io_want_digest(v, io), data, v->digest_size); | ||
266 | |||
267 | dm_bufio_release(buf); | ||
268 | return 0; | ||
269 | |||
270 | release_ret_r: | ||
271 | dm_bufio_release(buf); | ||
272 | |||
273 | return r; | ||
274 | } | ||
275 | |||
276 | /* | ||
277 | * Verify one "dm_verity_io" structure. | ||
278 | */ | ||
279 | static int verity_verify_io(struct dm_verity_io *io) | ||
280 | { | ||
281 | struct dm_verity *v = io->v; | ||
282 | unsigned b; | ||
283 | int i; | ||
284 | unsigned vector = 0, offset = 0; | ||
285 | |||
286 | for (b = 0; b < io->n_blocks; b++) { | ||
287 | struct shash_desc *desc; | ||
288 | u8 *result; | ||
289 | int r; | ||
290 | unsigned todo; | ||
291 | |||
292 | if (likely(v->levels)) { | ||
293 | /* | ||
294 | * First, we try to get the requested hash for | ||
295 | * the current block. If the hash block itself is | ||
296 | * verified, zero is returned. If it isn't, this | ||
297 | * function returns 0 and we fall back to whole | ||
298 | * chain verification. | ||
299 | */ | ||
300 | int r = verity_verify_level(io, io->block + b, 0, true); | ||
301 | if (likely(!r)) | ||
302 | goto test_block_hash; | ||
303 | if (r < 0) | ||
304 | return r; | ||
305 | } | ||
306 | |||
307 | memcpy(io_want_digest(v, io), v->root_digest, v->digest_size); | ||
308 | |||
309 | for (i = v->levels - 1; i >= 0; i--) { | ||
310 | int r = verity_verify_level(io, io->block + b, i, false); | ||
311 | if (unlikely(r)) | ||
312 | return r; | ||
313 | } | ||
314 | |||
315 | test_block_hash: | ||
316 | desc = io_hash_desc(v, io); | ||
317 | desc->tfm = v->tfm; | ||
318 | desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP; | ||
319 | r = crypto_shash_init(desc); | ||
320 | if (r < 0) { | ||
321 | DMERR("crypto_shash_init failed: %d", r); | ||
322 | return r; | ||
323 | } | ||
324 | |||
325 | if (likely(v->version >= 1)) { | ||
326 | r = crypto_shash_update(desc, v->salt, v->salt_size); | ||
327 | if (r < 0) { | ||
328 | DMERR("crypto_shash_update failed: %d", r); | ||
329 | return r; | ||
330 | } | ||
331 | } | ||
332 | |||
333 | todo = 1 << v->data_dev_block_bits; | ||
334 | do { | ||
335 | struct bio_vec *bv; | ||
336 | u8 *page; | ||
337 | unsigned len; | ||
338 | |||
339 | BUG_ON(vector >= io->io_vec_size); | ||
340 | bv = &io->io_vec[vector]; | ||
341 | page = kmap_atomic(bv->bv_page); | ||
342 | len = bv->bv_len - offset; | ||
343 | if (likely(len >= todo)) | ||
344 | len = todo; | ||
345 | r = crypto_shash_update(desc, | ||
346 | page + bv->bv_offset + offset, len); | ||
347 | kunmap_atomic(page); | ||
348 | if (r < 0) { | ||
349 | DMERR("crypto_shash_update failed: %d", r); | ||
350 | return r; | ||
351 | } | ||
352 | offset += len; | ||
353 | if (likely(offset == bv->bv_len)) { | ||
354 | offset = 0; | ||
355 | vector++; | ||
356 | } | ||
357 | todo -= len; | ||
358 | } while (todo); | ||
359 | |||
360 | if (!v->version) { | ||
361 | r = crypto_shash_update(desc, v->salt, v->salt_size); | ||
362 | if (r < 0) { | ||
363 | DMERR("crypto_shash_update failed: %d", r); | ||
364 | return r; | ||
365 | } | ||
366 | } | ||
367 | |||
368 | result = io_real_digest(v, io); | ||
369 | r = crypto_shash_final(desc, result); | ||
370 | if (r < 0) { | ||
371 | DMERR("crypto_shash_final failed: %d", r); | ||
372 | return r; | ||
373 | } | ||
374 | if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) { | ||
375 | DMERR_LIMIT("data block %llu is corrupted", | ||
376 | (unsigned long long)(io->block + b)); | ||
377 | v->hash_failed = 1; | ||
378 | return -EIO; | ||
379 | } | ||
380 | } | ||
381 | BUG_ON(vector != io->io_vec_size); | ||
382 | BUG_ON(offset); | ||
383 | |||
384 | return 0; | ||
385 | } | ||
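Expressed as a formula, the per-block digest computed above is H(salt || data) for format version 1 and later, and H(data || salt) for version 0; verity_verify_level() applies the same ordering when hashing tree blocks.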
386 | |||
387 | /* | ||
388 | * End one "io" structure with a given error. | ||
389 | */ | ||
390 | static void verity_finish_io(struct dm_verity_io *io, int error) | ||
391 | { | ||
392 | struct bio *bio = io->bio; | ||
393 | struct dm_verity *v = io->v; | ||
394 | |||
395 | bio->bi_end_io = io->orig_bi_end_io; | ||
396 | bio->bi_private = io->orig_bi_private; | ||
397 | |||
398 | if (io->io_vec != io->io_vec_inline) | ||
399 | mempool_free(io->io_vec, v->vec_mempool); | ||
400 | |||
401 | mempool_free(io, v->io_mempool); | ||
402 | |||
403 | bio_endio(bio, error); | ||
404 | } | ||
405 | |||
406 | static void verity_work(struct work_struct *w) | ||
407 | { | ||
408 | struct dm_verity_io *io = container_of(w, struct dm_verity_io, work); | ||
409 | |||
410 | verity_finish_io(io, verity_verify_io(io)); | ||
411 | } | ||
412 | |||
413 | static void verity_end_io(struct bio *bio, int error) | ||
414 | { | ||
415 | struct dm_verity_io *io = bio->bi_private; | ||
416 | |||
417 | if (error) { | ||
418 | verity_finish_io(io, error); | ||
419 | return; | ||
420 | } | ||
421 | |||
422 | INIT_WORK(&io->work, verity_work); | ||
423 | queue_work(io->v->verify_wq, &io->work); | ||
424 | } | ||
425 | |||
426 | /* | ||
427 | * Prefetch buffers for the specified io. | ||
428 | * The root buffer is not prefetched; it is assumed to be cached | ||
429 | * all the time. | ||
430 | */ | ||
431 | static void verity_prefetch_io(struct dm_verity *v, struct dm_verity_io *io) | ||
432 | { | ||
433 | int i; | ||
434 | |||
435 | for (i = v->levels - 2; i >= 0; i--) { | ||
436 | sector_t hash_block_start; | ||
437 | sector_t hash_block_end; | ||
438 | verity_hash_at_level(v, io->block, i, &hash_block_start, NULL); | ||
439 | verity_hash_at_level(v, io->block + io->n_blocks - 1, i, &hash_block_end, NULL); | ||
440 | if (!i) { | ||
441 | unsigned cluster = *(volatile unsigned *)&dm_verity_prefetch_cluster; | ||
442 | |||
443 | cluster >>= v->data_dev_block_bits; | ||
444 | if (unlikely(!cluster)) | ||
445 | goto no_prefetch_cluster; | ||
446 | |||
447 | if (unlikely(cluster & (cluster - 1))) | ||
448 | cluster = 1 << (fls(cluster) - 1); | ||
449 | |||
450 | hash_block_start &= ~(sector_t)(cluster - 1); | ||
451 | hash_block_end |= cluster - 1; | ||
452 | if (unlikely(hash_block_end >= v->hash_blocks)) | ||
453 | hash_block_end = v->hash_blocks - 1; | ||
454 | } | ||
455 | no_prefetch_cluster: | ||
456 | dm_bufio_prefetch(v->bufio, hash_block_start, | ||
457 | hash_block_end - hash_block_start + 1); | ||
458 | } | ||
459 | } | ||
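A worked pass through the clustering above, with numbers chosen only for illustration:

/*
 * Illustration only: a prefetch cluster of 100 blocks is not a power of two,
 * so it is rounded down to 1 << (fls(100) - 1) = 64.  A level-0 range of hash
 * blocks 130..133 is then widened to the aligned window 128..191
 * (130 & ~63 = 128, 133 | 63 = 191), capped at v->hash_blocks - 1.
 */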
460 | |||
461 | /* | ||
462 | * Bio map function. It allocates a dm_verity_io structure and a bio vector, | ||
463 | * fills them, and then issues the prefetches and the I/O. | ||
464 | */ | ||
465 | static int verity_map(struct dm_target *ti, struct bio *bio, | ||
466 | union map_info *map_context) | ||
467 | { | ||
468 | struct dm_verity *v = ti->private; | ||
469 | struct dm_verity_io *io; | ||
470 | |||
471 | bio->bi_bdev = v->data_dev->bdev; | ||
472 | bio->bi_sector = verity_map_sector(v, bio->bi_sector); | ||
473 | |||
474 | if (((unsigned)bio->bi_sector | bio_sectors(bio)) & | ||
475 | ((1 << (v->data_dev_block_bits - SECTOR_SHIFT)) - 1)) { | ||
476 | DMERR_LIMIT("unaligned io"); | ||
477 | return -EIO; | ||
478 | } | ||
479 | |||
480 | if ((bio->bi_sector + bio_sectors(bio)) >> | ||
481 | (v->data_dev_block_bits - SECTOR_SHIFT) > v->data_blocks) { | ||
482 | DMERR_LIMIT("io out of range"); | ||
483 | return -EIO; | ||
484 | } | ||
485 | |||
486 | if (bio_data_dir(bio) == WRITE) | ||
487 | return -EIO; | ||
488 | |||
489 | io = mempool_alloc(v->io_mempool, GFP_NOIO); | ||
490 | io->v = v; | ||
491 | io->bio = bio; | ||
492 | io->orig_bi_end_io = bio->bi_end_io; | ||
493 | io->orig_bi_private = bio->bi_private; | ||
494 | io->block = bio->bi_sector >> (v->data_dev_block_bits - SECTOR_SHIFT); | ||
495 | io->n_blocks = bio->bi_size >> v->data_dev_block_bits; | ||
496 | |||
497 | bio->bi_end_io = verity_end_io; | ||
498 | bio->bi_private = io; | ||
499 | io->io_vec_size = bio->bi_vcnt - bio->bi_idx; | ||
500 | if (io->io_vec_size < DM_VERITY_IO_VEC_INLINE) | ||
501 | io->io_vec = io->io_vec_inline; | ||
502 | else | ||
503 | io->io_vec = mempool_alloc(v->vec_mempool, GFP_NOIO); | ||
504 | memcpy(io->io_vec, bio_iovec(bio), | ||
505 | io->io_vec_size * sizeof(struct bio_vec)); | ||
506 | |||
507 | verity_prefetch_io(v, io); | ||
508 | |||
509 | generic_make_request(bio); | ||
510 | |||
511 | return DM_MAPIO_SUBMITTED; | ||
512 | } | ||
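To make the two sanity checks in verity_map() concrete (the 4096-byte data block size is assumed for illustration): data_dev_block_bits - SECTOR_SHIFT = 12 - 9 = 3, so the mask is 7 and both bi_sector and bio_sectors(bio) must be multiples of 8 sectors, i.e. every bio has to start and end on a 4 KiB block boundary and stay within the first data_blocks blocks.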
513 | |||
514 | /* | ||
515 | * Status: V (valid) or C (corruption found) | ||
516 | */ | ||
517 | static int verity_status(struct dm_target *ti, status_type_t type, | ||
518 | char *result, unsigned maxlen) | ||
519 | { | ||
520 | struct dm_verity *v = ti->private; | ||
521 | unsigned sz = 0; | ||
522 | unsigned x; | ||
523 | |||
524 | switch (type) { | ||
525 | case STATUSTYPE_INFO: | ||
526 | DMEMIT("%c", v->hash_failed ? 'C' : 'V'); | ||
527 | break; | ||
528 | case STATUSTYPE_TABLE: | ||
529 | DMEMIT("%u %s %s %u %u %llu %llu %s ", | ||
530 | v->version, | ||
531 | v->data_dev->name, | ||
532 | v->hash_dev->name, | ||
533 | 1 << v->data_dev_block_bits, | ||
534 | 1 << v->hash_dev_block_bits, | ||
535 | (unsigned long long)v->data_blocks, | ||
536 | (unsigned long long)v->hash_start, | ||
537 | v->alg_name | ||
538 | ); | ||
539 | for (x = 0; x < v->digest_size; x++) | ||
540 | DMEMIT("%02x", v->root_digest[x]); | ||
541 | DMEMIT(" "); | ||
542 | if (!v->salt_size) | ||
543 | DMEMIT("-"); | ||
544 | else | ||
545 | for (x = 0; x < v->salt_size; x++) | ||
546 | DMEMIT("%02x", v->salt[x]); | ||
547 | break; | ||
548 | } | ||
549 | |||
550 | return 0; | ||
551 | } | ||
552 | |||
553 | static int verity_ioctl(struct dm_target *ti, unsigned cmd, | ||
554 | unsigned long arg) | ||
555 | { | ||
556 | struct dm_verity *v = ti->private; | ||
557 | int r = 0; | ||
558 | |||
559 | if (v->data_start || | ||
560 | ti->len != i_size_read(v->data_dev->bdev->bd_inode) >> SECTOR_SHIFT) | ||
561 | r = scsi_verify_blk_ioctl(NULL, cmd); | ||
562 | |||
563 | return r ? : __blkdev_driver_ioctl(v->data_dev->bdev, v->data_dev->mode, | ||
564 | cmd, arg); | ||
565 | } | ||
566 | |||
567 | static int verity_merge(struct dm_target *ti, struct bvec_merge_data *bvm, | ||
568 | struct bio_vec *biovec, int max_size) | ||
569 | { | ||
570 | struct dm_verity *v = ti->private; | ||
571 | struct request_queue *q = bdev_get_queue(v->data_dev->bdev); | ||
572 | |||
573 | if (!q->merge_bvec_fn) | ||
574 | return max_size; | ||
575 | |||
576 | bvm->bi_bdev = v->data_dev->bdev; | ||
577 | bvm->bi_sector = verity_map_sector(v, bvm->bi_sector); | ||
578 | |||
579 | return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); | ||
580 | } | ||
581 | |||
582 | static int verity_iterate_devices(struct dm_target *ti, | ||
583 | iterate_devices_callout_fn fn, void *data) | ||
584 | { | ||
585 | struct dm_verity *v = ti->private; | ||
586 | |||
587 | return fn(ti, v->data_dev, v->data_start, ti->len, data); | ||
588 | } | ||
589 | |||
590 | static void verity_io_hints(struct dm_target *ti, struct queue_limits *limits) | ||
591 | { | ||
592 | struct dm_verity *v = ti->private; | ||
593 | |||
594 | if (limits->logical_block_size < 1 << v->data_dev_block_bits) | ||
595 | limits->logical_block_size = 1 << v->data_dev_block_bits; | ||
596 | |||
597 | if (limits->physical_block_size < 1 << v->data_dev_block_bits) | ||
598 | limits->physical_block_size = 1 << v->data_dev_block_bits; | ||
599 | |||
600 | blk_limits_io_min(limits, limits->logical_block_size); | ||
601 | } | ||
602 | |||
603 | static void verity_dtr(struct dm_target *ti) | ||
604 | { | ||
605 | struct dm_verity *v = ti->private; | ||
606 | |||
607 | if (v->verify_wq) | ||
608 | destroy_workqueue(v->verify_wq); | ||
609 | |||
610 | if (v->vec_mempool) | ||
611 | mempool_destroy(v->vec_mempool); | ||
612 | |||
613 | if (v->io_mempool) | ||
614 | mempool_destroy(v->io_mempool); | ||
615 | |||
616 | if (v->bufio) | ||
617 | dm_bufio_client_destroy(v->bufio); | ||
618 | |||
619 | kfree(v->salt); | ||
620 | kfree(v->root_digest); | ||
621 | |||
622 | if (v->tfm) | ||
623 | crypto_free_shash(v->tfm); | ||
624 | |||
625 | kfree(v->alg_name); | ||
626 | |||
627 | if (v->hash_dev) | ||
628 | dm_put_device(ti, v->hash_dev); | ||
629 | |||
630 | if (v->data_dev) | ||
631 | dm_put_device(ti, v->data_dev); | ||
632 | |||
633 | kfree(v); | ||
634 | } | ||
635 | |||
636 | /* | ||
637 | * Target parameters: | ||
638 | * <version> The current format is version 1. | ||
639 | * Version 0 is compatible with the original Chromium OS releases. | ||
640 | * <data device> | ||
641 | * <hash device> | ||
642 | * <data block size> | ||
643 | * <hash block size> | ||
644 | * <the number of data blocks> | ||
645 | * <hash start block> | ||
646 | * <algorithm> | ||
647 | * <digest> | ||
648 | * <salt> Hex string or "-" if no salt. | ||
649 | */ | ||
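A hypothetical table line matching the parameter list above; the device names, sizes and the dmsetup invocation are illustrative only, and <root-digest> stands for the hex-encoded root hash:

  dmsetup create vroot --table "0 2097152 verity 1 /dev/sda1 /dev/sda2 4096 4096 262144 1 sha256 <root-digest> -"

i.e. a read-only mapping of 2097152 sectors over 262144 data blocks of 4096 bytes each, with the hash tree starting at block 1 of the hash device and no salt ("-").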
650 | static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv) | ||
651 | { | ||
652 | struct dm_verity *v; | ||
653 | unsigned num; | ||
654 | unsigned long long num_ll; | ||
655 | int r; | ||
656 | int i; | ||
657 | sector_t hash_position; | ||
658 | char dummy; | ||
659 | |||
660 | v = kzalloc(sizeof(struct dm_verity), GFP_KERNEL); | ||
661 | if (!v) { | ||
662 | ti->error = "Cannot allocate verity structure"; | ||
663 | return -ENOMEM; | ||
664 | } | ||
665 | ti->private = v; | ||
666 | v->ti = ti; | ||
667 | |||
668 | if ((dm_table_get_mode(ti->table) & ~FMODE_READ)) { | ||
669 | ti->error = "Device must be readonly"; | ||
670 | r = -EINVAL; | ||
671 | goto bad; | ||
672 | } | ||
673 | |||
674 | if (argc != 10) { | ||
675 | ti->error = "Invalid argument count: exactly 10 arguments required"; | ||
676 | r = -EINVAL; | ||
677 | goto bad; | ||
678 | } | ||
679 | |||
680 | if (sscanf(argv[0], "%u%c", &num, &dummy) != 1 || | ||
681 | num > 1) { | ||
682 | ti->error = "Invalid version"; | ||
683 | r = -EINVAL; | ||
684 | goto bad; | ||
685 | } | ||
686 | v->version = num; | ||
687 | |||
688 | r = dm_get_device(ti, argv[1], FMODE_READ, &v->data_dev); | ||
689 | if (r) { | ||
690 | ti->error = "Data device lookup failed"; | ||
691 | goto bad; | ||
692 | } | ||
693 | |||
694 | r = dm_get_device(ti, argv[2], FMODE_READ, &v->hash_dev); | ||
695 | if (r) { | ||
696 | ti->error = "Data device lookup failed"; | ||
697 | goto bad; | ||
698 | } | ||
699 | |||
700 | if (sscanf(argv[3], "%u%c", &num, &dummy) != 1 || | ||
701 | !num || (num & (num - 1)) || | ||
702 | num < bdev_logical_block_size(v->data_dev->bdev) || | ||
703 | num > PAGE_SIZE) { | ||
704 | ti->error = "Invalid data device block size"; | ||
705 | r = -EINVAL; | ||
706 | goto bad; | ||
707 | } | ||
708 | v->data_dev_block_bits = ffs(num) - 1; | ||
709 | |||
710 | if (sscanf(argv[4], "%u%c", &num, &dummy) != 1 || | ||
711 | !num || (num & (num - 1)) || | ||
712 | num < bdev_logical_block_size(v->hash_dev->bdev) || | ||
713 | num > INT_MAX) { | ||
714 | ti->error = "Invalid hash device block size"; | ||
715 | r = -EINVAL; | ||
716 | goto bad; | ||
717 | } | ||
718 | v->hash_dev_block_bits = ffs(num) - 1; | ||
719 | |||
720 | if (sscanf(argv[5], "%llu%c", &num_ll, &dummy) != 1 || | ||
721 | num_ll << (v->data_dev_block_bits - SECTOR_SHIFT) != | ||
722 | (sector_t)num_ll << (v->data_dev_block_bits - SECTOR_SHIFT)) { | ||
723 | ti->error = "Invalid data blocks"; | ||
724 | r = -EINVAL; | ||
725 | goto bad; | ||
726 | } | ||
727 | v->data_blocks = num_ll; | ||
728 | |||
729 | if (ti->len > (v->data_blocks << (v->data_dev_block_bits - SECTOR_SHIFT))) { | ||
730 | ti->error = "Data device is too small"; | ||
731 | r = -EINVAL; | ||
732 | goto bad; | ||
733 | } | ||
734 | |||
735 | if (sscanf(argv[6], "%llu%c", &num_ll, &dummy) != 1 || | ||
736 | num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT) != | ||
737 | (sector_t)num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT)) { | ||
738 | ti->error = "Invalid hash start"; | ||
739 | r = -EINVAL; | ||
740 | goto bad; | ||
741 | } | ||
742 | v->hash_start = num_ll; | ||
743 | |||
744 | v->alg_name = kstrdup(argv[7], GFP_KERNEL); | ||
745 | if (!v->alg_name) { | ||
746 | ti->error = "Cannot allocate algorithm name"; | ||
747 | r = -ENOMEM; | ||
748 | goto bad; | ||
749 | } | ||
750 | |||
751 | v->tfm = crypto_alloc_shash(v->alg_name, 0, 0); | ||
752 | if (IS_ERR(v->tfm)) { | ||
753 | ti->error = "Cannot initialize hash function"; | ||
754 | r = PTR_ERR(v->tfm); | ||
755 | v->tfm = NULL; | ||
756 | goto bad; | ||
757 | } | ||
758 | v->digest_size = crypto_shash_digestsize(v->tfm); | ||
759 | if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) { | ||
760 | ti->error = "Digest size too big"; | ||
761 | r = -EINVAL; | ||
762 | goto bad; | ||
763 | } | ||
764 | v->shash_descsize = | ||
765 | sizeof(struct shash_desc) + crypto_shash_descsize(v->tfm); | ||
766 | |||
767 | v->root_digest = kmalloc(v->digest_size, GFP_KERNEL); | ||
768 | if (!v->root_digest) { | ||
769 | ti->error = "Cannot allocate root digest"; | ||
770 | r = -ENOMEM; | ||
771 | goto bad; | ||
772 | } | ||
773 | if (strlen(argv[8]) != v->digest_size * 2 || | ||
774 | hex2bin(v->root_digest, argv[8], v->digest_size)) { | ||
775 | ti->error = "Invalid root digest"; | ||
776 | r = -EINVAL; | ||
777 | goto bad; | ||
778 | } | ||
779 | |||
780 | if (strcmp(argv[9], "-")) { | ||
781 | v->salt_size = strlen(argv[9]) / 2; | ||
782 | v->salt = kmalloc(v->salt_size, GFP_KERNEL); | ||
783 | if (!v->salt) { | ||
784 | ti->error = "Cannot allocate salt"; | ||
785 | r = -ENOMEM; | ||
786 | goto bad; | ||
787 | } | ||
788 | if (strlen(argv[9]) != v->salt_size * 2 || | ||
789 | hex2bin(v->salt, argv[9], v->salt_size)) { | ||
790 | ti->error = "Invalid salt"; | ||
791 | r = -EINVAL; | ||
792 | goto bad; | ||
793 | } | ||
794 | } | ||
795 | |||
796 | v->hash_per_block_bits = | ||
797 | fls((1 << v->hash_dev_block_bits) / v->digest_size) - 1; | ||
798 | |||
799 | v->levels = 0; | ||
800 | if (v->data_blocks) | ||
801 | while (v->hash_per_block_bits * v->levels < 64 && | ||
802 | (unsigned long long)(v->data_blocks - 1) >> | ||
803 | (v->hash_per_block_bits * v->levels)) | ||
804 | v->levels++; | ||
805 | |||
806 | if (v->levels > DM_VERITY_MAX_LEVELS) { | ||
807 | ti->error = "Too many tree levels"; | ||
808 | r = -E2BIG; | ||
809 | goto bad; | ||
810 | } | ||
811 | |||
812 | hash_position = v->hash_start; | ||
813 | for (i = v->levels - 1; i >= 0; i--) { | ||
814 | sector_t s; | ||
815 | v->hash_level_block[i] = hash_position; | ||
816 | s = verity_position_at_level(v, v->data_blocks, i); | ||
817 | s = (s >> v->hash_per_block_bits) + | ||
818 | !!(s & ((1 << v->hash_per_block_bits) - 1)); | ||
819 | if (hash_position + s < hash_position) { | ||
820 | ti->error = "Hash device offset overflow"; | ||
821 | r = -E2BIG; | ||
822 | goto bad; | ||
823 | } | ||
824 | hash_position += s; | ||
825 | } | ||
826 | v->hash_blocks = hash_position; | ||
827 | |||
828 | v->bufio = dm_bufio_client_create(v->hash_dev->bdev, | ||
829 | 1 << v->hash_dev_block_bits, 1, sizeof(struct buffer_aux), | ||
830 | dm_bufio_alloc_callback, NULL); | ||
831 | if (IS_ERR(v->bufio)) { | ||
832 | ti->error = "Cannot initialize dm-bufio"; | ||
833 | r = PTR_ERR(v->bufio); | ||
834 | v->bufio = NULL; | ||
835 | goto bad; | ||
836 | } | ||
837 | |||
838 | if (dm_bufio_get_device_size(v->bufio) < v->hash_blocks) { | ||
839 | ti->error = "Hash device is too small"; | ||
840 | r = -E2BIG; | ||
841 | goto bad; | ||
842 | } | ||
843 | |||
844 | v->io_mempool = mempool_create_kmalloc_pool(DM_VERITY_MEMPOOL_SIZE, | ||
845 | sizeof(struct dm_verity_io) + v->shash_descsize + v->digest_size * 2); | ||
846 | if (!v->io_mempool) { | ||
847 | ti->error = "Cannot allocate io mempool"; | ||
848 | r = -ENOMEM; | ||
849 | goto bad; | ||
850 | } | ||
851 | |||
852 | v->vec_mempool = mempool_create_kmalloc_pool(DM_VERITY_MEMPOOL_SIZE, | ||
853 | BIO_MAX_PAGES * sizeof(struct bio_vec)); | ||
854 | if (!v->vec_mempool) { | ||
855 | ti->error = "Cannot allocate vector mempool"; | ||
856 | r = -ENOMEM; | ||
857 | goto bad; | ||
858 | } | ||
859 | |||
860 | /* WQ_UNBOUND greatly improves performance when running on ramdisk */ | ||
861 | v->verify_wq = alloc_workqueue("kverityd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, num_online_cpus()); | ||
862 | if (!v->verify_wq) { | ||
863 | ti->error = "Cannot allocate workqueue"; | ||
864 | r = -ENOMEM; | ||
865 | goto bad; | ||
866 | } | ||
867 | |||
868 | return 0; | ||
869 | |||
870 | bad: | ||
871 | verity_dtr(ti); | ||
872 | |||
873 | return r; | ||
874 | } | ||
875 | |||
876 | static struct target_type verity_target = { | ||
877 | .name = "verity", | ||
878 | .version = {1, 0, 0}, | ||
879 | .module = THIS_MODULE, | ||
880 | .ctr = verity_ctr, | ||
881 | .dtr = verity_dtr, | ||
882 | .map = verity_map, | ||
883 | .status = verity_status, | ||
884 | .ioctl = verity_ioctl, | ||
885 | .merge = verity_merge, | ||
886 | .iterate_devices = verity_iterate_devices, | ||
887 | .io_hints = verity_io_hints, | ||
888 | }; | ||
889 | |||
890 | static int __init dm_verity_init(void) | ||
891 | { | ||
892 | int r; | ||
893 | |||
894 | r = dm_register_target(&verity_target); | ||
895 | if (r < 0) | ||
896 | DMERR("register failed %d", r); | ||
897 | |||
898 | return r; | ||
899 | } | ||
900 | |||
901 | static void __exit dm_verity_exit(void) | ||
902 | { | ||
903 | dm_unregister_target(&verity_target); | ||
904 | } | ||
905 | |||
906 | module_init(dm_verity_init); | ||
907 | module_exit(dm_verity_exit); | ||
908 | |||
909 | MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>"); | ||
910 | MODULE_AUTHOR("Mandeep Baines <msb@chromium.org>"); | ||
911 | MODULE_AUTHOR("Will Drewry <wad@chromium.org>"); | ||
912 | MODULE_DESCRIPTION(DM_NAME " target for transparent disk integrity checking"); | ||
913 | MODULE_LICENSE("GPL"); | ||
diff --git a/drivers/md/dm.c b/drivers/md/dm.c index b89c548ec3f8..e24143cc2040 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c | |||
@@ -1016,6 +1016,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone, | |||
1016 | /* | 1016 | /* |
1017 | * Store bio_set for cleanup. | 1017 | * Store bio_set for cleanup. |
1018 | */ | 1018 | */ |
1019 | clone->bi_end_io = NULL; | ||
1019 | clone->bi_private = md->bs; | 1020 | clone->bi_private = md->bs; |
1020 | bio_put(clone); | 1021 | bio_put(clone); |
1021 | free_tio(md, tio); | 1022 | free_tio(md, tio); |
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index feb2c3c7bb44..45135f69509c 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c | |||
@@ -315,7 +315,7 @@ static int run(struct mddev *mddev) | |||
315 | } | 315 | } |
316 | conf->nfaults = 0; | 316 | conf->nfaults = 0; |
317 | 317 | ||
318 | list_for_each_entry(rdev, &mddev->disks, same_set) | 318 | rdev_for_each(rdev, mddev) |
319 | conf->rdev = rdev; | 319 | conf->rdev = rdev; |
320 | 320 | ||
321 | md_set_array_sectors(mddev, faulty_size(mddev, 0, 0)); | 321 | md_set_array_sectors(mddev, faulty_size(mddev, 0, 0)); |
diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 627456542fb3..b0fcc7d02adb 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c | |||
@@ -68,10 +68,19 @@ static int linear_mergeable_bvec(struct request_queue *q, | |||
68 | struct dev_info *dev0; | 68 | struct dev_info *dev0; |
69 | unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9; | 69 | unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9; |
70 | sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); | 70 | sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); |
71 | int maxbytes = biovec->bv_len; | ||
72 | struct request_queue *subq; | ||
71 | 73 | ||
72 | rcu_read_lock(); | 74 | rcu_read_lock(); |
73 | dev0 = which_dev(mddev, sector); | 75 | dev0 = which_dev(mddev, sector); |
74 | maxsectors = dev0->end_sector - sector; | 76 | maxsectors = dev0->end_sector - sector; |
77 | subq = bdev_get_queue(dev0->rdev->bdev); | ||
78 | if (subq->merge_bvec_fn) { | ||
79 | bvm->bi_bdev = dev0->rdev->bdev; | ||
80 | bvm->bi_sector -= dev0->end_sector - dev0->rdev->sectors; | ||
81 | maxbytes = min(maxbytes, subq->merge_bvec_fn(subq, bvm, | ||
82 | biovec)); | ||
83 | } | ||
75 | rcu_read_unlock(); | 84 | rcu_read_unlock(); |
76 | 85 | ||
77 | if (maxsectors < bio_sectors) | 86 | if (maxsectors < bio_sectors) |
@@ -80,12 +89,12 @@ static int linear_mergeable_bvec(struct request_queue *q, | |||
80 | maxsectors -= bio_sectors; | 89 | maxsectors -= bio_sectors; |
81 | 90 | ||
82 | if (maxsectors <= (PAGE_SIZE >> 9 ) && bio_sectors == 0) | 91 | if (maxsectors <= (PAGE_SIZE >> 9 ) && bio_sectors == 0) |
83 | return biovec->bv_len; | 92 | return maxbytes; |
84 | /* The bytes available at this offset could be really big, | 93 | |
85 | * so we cap at 2^31 to avoid overflow */ | 94 | if (maxsectors > (maxbytes >> 9)) |
86 | if (maxsectors > (1 << (31-9))) | 95 | return maxbytes; |
87 | return 1<<31; | 96 | else |
88 | return maxsectors << 9; | 97 | return maxsectors << 9; |
89 | } | 98 | } |
90 | 99 | ||
91 | static int linear_congested(void *data, int bits) | 100 | static int linear_congested(void *data, int bits) |
@@ -138,7 +147,7 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) | |||
138 | cnt = 0; | 147 | cnt = 0; |
139 | conf->array_sectors = 0; | 148 | conf->array_sectors = 0; |
140 | 149 | ||
141 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 150 | rdev_for_each(rdev, mddev) { |
142 | int j = rdev->raid_disk; | 151 | int j = rdev->raid_disk; |
143 | struct dev_info *disk = conf->disks + j; | 152 | struct dev_info *disk = conf->disks + j; |
144 | sector_t sectors; | 153 | sector_t sectors; |
@@ -158,15 +167,6 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) | |||
158 | 167 | ||
159 | disk_stack_limits(mddev->gendisk, rdev->bdev, | 168 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
160 | rdev->data_offset << 9); | 169 | rdev->data_offset << 9); |
161 | /* as we don't honour merge_bvec_fn, we must never risk | ||
162 | * violating it, so limit max_segments to 1 lying within | ||
163 | * a single page. | ||
164 | */ | ||
165 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { | ||
166 | blk_queue_max_segments(mddev->queue, 1); | ||
167 | blk_queue_segment_boundary(mddev->queue, | ||
168 | PAGE_CACHE_SIZE - 1); | ||
169 | } | ||
170 | 170 | ||
171 | conf->array_sectors += rdev->sectors; | 171 | conf->array_sectors += rdev->sectors; |
172 | cnt++; | 172 | cnt++; |
diff --git a/drivers/md/md.c b/drivers/md/md.c index ce88755baf4a..b572e1e386ce 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c | |||
@@ -439,7 +439,7 @@ static void submit_flushes(struct work_struct *ws) | |||
439 | INIT_WORK(&mddev->flush_work, md_submit_flush_data); | 439 | INIT_WORK(&mddev->flush_work, md_submit_flush_data); |
440 | atomic_set(&mddev->flush_pending, 1); | 440 | atomic_set(&mddev->flush_pending, 1); |
441 | rcu_read_lock(); | 441 | rcu_read_lock(); |
442 | list_for_each_entry_rcu(rdev, &mddev->disks, same_set) | 442 | rdev_for_each_rcu(rdev, mddev) |
443 | if (rdev->raid_disk >= 0 && | 443 | if (rdev->raid_disk >= 0 && |
444 | !test_bit(Faulty, &rdev->flags)) { | 444 | !test_bit(Faulty, &rdev->flags)) { |
445 | /* Take two references, one is dropped | 445 | /* Take two references, one is dropped |
@@ -749,7 +749,7 @@ static struct md_rdev * find_rdev_nr(struct mddev *mddev, int nr) | |||
749 | { | 749 | { |
750 | struct md_rdev *rdev; | 750 | struct md_rdev *rdev; |
751 | 751 | ||
752 | list_for_each_entry(rdev, &mddev->disks, same_set) | 752 | rdev_for_each(rdev, mddev) |
753 | if (rdev->desc_nr == nr) | 753 | if (rdev->desc_nr == nr) |
754 | return rdev; | 754 | return rdev; |
755 | 755 | ||
@@ -760,7 +760,7 @@ static struct md_rdev * find_rdev(struct mddev * mddev, dev_t dev) | |||
760 | { | 760 | { |
761 | struct md_rdev *rdev; | 761 | struct md_rdev *rdev; |
762 | 762 | ||
763 | list_for_each_entry(rdev, &mddev->disks, same_set) | 763 | rdev_for_each(rdev, mddev) |
764 | if (rdev->bdev->bd_dev == dev) | 764 | if (rdev->bdev->bd_dev == dev) |
765 | return rdev; | 765 | return rdev; |
766 | 766 | ||
@@ -1342,7 +1342,7 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev) | |||
1342 | sb->state |= (1<<MD_SB_BITMAP_PRESENT); | 1342 | sb->state |= (1<<MD_SB_BITMAP_PRESENT); |
1343 | 1343 | ||
1344 | sb->disks[0].state = (1<<MD_DISK_REMOVED); | 1344 | sb->disks[0].state = (1<<MD_DISK_REMOVED); |
1345 | list_for_each_entry(rdev2, &mddev->disks, same_set) { | 1345 | rdev_for_each(rdev2, mddev) { |
1346 | mdp_disk_t *d; | 1346 | mdp_disk_t *d; |
1347 | int desc_nr; | 1347 | int desc_nr; |
1348 | int is_active = test_bit(In_sync, &rdev2->flags); | 1348 | int is_active = test_bit(In_sync, &rdev2->flags); |
@@ -1805,18 +1805,18 @@ retry: | |||
1805 | | BB_LEN(internal_bb)); | 1805 | | BB_LEN(internal_bb)); |
1806 | *bbp++ = cpu_to_le64(store_bb); | 1806 | *bbp++ = cpu_to_le64(store_bb); |
1807 | } | 1807 | } |
1808 | bb->changed = 0; | ||
1808 | if (read_seqretry(&bb->lock, seq)) | 1809 | if (read_seqretry(&bb->lock, seq)) |
1809 | goto retry; | 1810 | goto retry; |
1810 | 1811 | ||
1811 | bb->sector = (rdev->sb_start + | 1812 | bb->sector = (rdev->sb_start + |
1812 | (int)le32_to_cpu(sb->bblog_offset)); | 1813 | (int)le32_to_cpu(sb->bblog_offset)); |
1813 | bb->size = le16_to_cpu(sb->bblog_size); | 1814 | bb->size = le16_to_cpu(sb->bblog_size); |
1814 | bb->changed = 0; | ||
1815 | } | 1815 | } |
1816 | } | 1816 | } |
1817 | 1817 | ||
1818 | max_dev = 0; | 1818 | max_dev = 0; |
1819 | list_for_each_entry(rdev2, &mddev->disks, same_set) | 1819 | rdev_for_each(rdev2, mddev) |
1820 | if (rdev2->desc_nr+1 > max_dev) | 1820 | if (rdev2->desc_nr+1 > max_dev) |
1821 | max_dev = rdev2->desc_nr+1; | 1821 | max_dev = rdev2->desc_nr+1; |
1822 | 1822 | ||
@@ -1833,7 +1833,7 @@ retry: | |||
1833 | for (i=0; i<max_dev;i++) | 1833 | for (i=0; i<max_dev;i++) |
1834 | sb->dev_roles[i] = cpu_to_le16(0xfffe); | 1834 | sb->dev_roles[i] = cpu_to_le16(0xfffe); |
1835 | 1835 | ||
1836 | list_for_each_entry(rdev2, &mddev->disks, same_set) { | 1836 | rdev_for_each(rdev2, mddev) { |
1837 | i = rdev2->desc_nr; | 1837 | i = rdev2->desc_nr; |
1838 | if (test_bit(Faulty, &rdev2->flags)) | 1838 | if (test_bit(Faulty, &rdev2->flags)) |
1839 | sb->dev_roles[i] = cpu_to_le16(0xfffe); | 1839 | sb->dev_roles[i] = cpu_to_le16(0xfffe); |
@@ -1948,7 +1948,7 @@ int md_integrity_register(struct mddev *mddev) | |||
1948 | return 0; /* nothing to do */ | 1948 | return 0; /* nothing to do */ |
1949 | if (!mddev->gendisk || blk_get_integrity(mddev->gendisk)) | 1949 | if (!mddev->gendisk || blk_get_integrity(mddev->gendisk)) |
1950 | return 0; /* shouldn't register, or already is */ | 1950 | return 0; /* shouldn't register, or already is */ |
1951 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 1951 | rdev_for_each(rdev, mddev) { |
1952 | /* skip spares and non-functional disks */ | 1952 | /* skip spares and non-functional disks */ |
1953 | if (test_bit(Faulty, &rdev->flags)) | 1953 | if (test_bit(Faulty, &rdev->flags)) |
1954 | continue; | 1954 | continue; |
@@ -2175,7 +2175,7 @@ static void export_array(struct mddev *mddev) | |||
2175 | { | 2175 | { |
2176 | struct md_rdev *rdev, *tmp; | 2176 | struct md_rdev *rdev, *tmp; |
2177 | 2177 | ||
2178 | rdev_for_each(rdev, tmp, mddev) { | 2178 | rdev_for_each_safe(rdev, tmp, mddev) { |
2179 | if (!rdev->mddev) { | 2179 | if (!rdev->mddev) { |
2180 | MD_BUG(); | 2180 | MD_BUG(); |
2181 | continue; | 2181 | continue; |
@@ -2307,11 +2307,11 @@ static void md_print_devices(void) | |||
2307 | bitmap_print_sb(mddev->bitmap); | 2307 | bitmap_print_sb(mddev->bitmap); |
2308 | else | 2308 | else |
2309 | printk("%s: ", mdname(mddev)); | 2309 | printk("%s: ", mdname(mddev)); |
2310 | list_for_each_entry(rdev, &mddev->disks, same_set) | 2310 | rdev_for_each(rdev, mddev) |
2311 | printk("<%s>", bdevname(rdev->bdev,b)); | 2311 | printk("<%s>", bdevname(rdev->bdev,b)); |
2312 | printk("\n"); | 2312 | printk("\n"); |
2313 | 2313 | ||
2314 | list_for_each_entry(rdev, &mddev->disks, same_set) | 2314 | rdev_for_each(rdev, mddev) |
2315 | print_rdev(rdev, mddev->major_version); | 2315 | print_rdev(rdev, mddev->major_version); |
2316 | } | 2316 | } |
2317 | printk("md: **********************************\n"); | 2317 | printk("md: **********************************\n"); |
@@ -2328,7 +2328,7 @@ static void sync_sbs(struct mddev * mddev, int nospares) | |||
2328 | * with the rest of the array) | 2328 | * with the rest of the array) |
2329 | */ | 2329 | */ |
2330 | struct md_rdev *rdev; | 2330 | struct md_rdev *rdev; |
2331 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 2331 | rdev_for_each(rdev, mddev) { |
2332 | if (rdev->sb_events == mddev->events || | 2332 | if (rdev->sb_events == mddev->events || |
2333 | (nospares && | 2333 | (nospares && |
2334 | rdev->raid_disk < 0 && | 2334 | rdev->raid_disk < 0 && |
@@ -2351,7 +2351,7 @@ static void md_update_sb(struct mddev * mddev, int force_change) | |||
2351 | 2351 | ||
2352 | repeat: | 2352 | repeat: |
2353 | /* First make sure individual recovery_offsets are correct */ | 2353 | /* First make sure individual recovery_offsets are correct */ |
2354 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 2354 | rdev_for_each(rdev, mddev) { |
2355 | if (rdev->raid_disk >= 0 && | 2355 | if (rdev->raid_disk >= 0 && |
2356 | mddev->delta_disks >= 0 && | 2356 | mddev->delta_disks >= 0 && |
2357 | !test_bit(In_sync, &rdev->flags) && | 2357 | !test_bit(In_sync, &rdev->flags) && |
@@ -2364,8 +2364,9 @@ repeat: | |||
2364 | clear_bit(MD_CHANGE_DEVS, &mddev->flags); | 2364 | clear_bit(MD_CHANGE_DEVS, &mddev->flags); |
2365 | if (!mddev->external) { | 2365 | if (!mddev->external) { |
2366 | clear_bit(MD_CHANGE_PENDING, &mddev->flags); | 2366 | clear_bit(MD_CHANGE_PENDING, &mddev->flags); |
2367 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 2367 | rdev_for_each(rdev, mddev) { |
2368 | if (rdev->badblocks.changed) { | 2368 | if (rdev->badblocks.changed) { |
2369 | rdev->badblocks.changed = 0; | ||
2369 | md_ack_all_badblocks(&rdev->badblocks); | 2370 | md_ack_all_badblocks(&rdev->badblocks); |
2370 | md_error(mddev, rdev); | 2371 | md_error(mddev, rdev); |
2371 | } | 2372 | } |
@@ -2430,7 +2431,7 @@ repeat: | |||
2430 | mddev->events --; | 2431 | mddev->events --; |
2431 | } | 2432 | } |
2432 | 2433 | ||
2433 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 2434 | rdev_for_each(rdev, mddev) { |
2434 | if (rdev->badblocks.changed) | 2435 | if (rdev->badblocks.changed) |
2435 | any_badblocks_changed++; | 2436 | any_badblocks_changed++; |
2436 | if (test_bit(Faulty, &rdev->flags)) | 2437 | if (test_bit(Faulty, &rdev->flags)) |
@@ -2444,7 +2445,7 @@ repeat: | |||
2444 | mdname(mddev), mddev->in_sync); | 2445 | mdname(mddev), mddev->in_sync); |
2445 | 2446 | ||
2446 | bitmap_update_sb(mddev->bitmap); | 2447 | bitmap_update_sb(mddev->bitmap); |
2447 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 2448 | rdev_for_each(rdev, mddev) { |
2448 | char b[BDEVNAME_SIZE]; | 2449 | char b[BDEVNAME_SIZE]; |
2449 | 2450 | ||
2450 | if (rdev->sb_loaded != 1) | 2451 | if (rdev->sb_loaded != 1) |
@@ -2493,7 +2494,7 @@ repeat: | |||
2493 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) | 2494 | if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) |
2494 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); | 2495 | sysfs_notify(&mddev->kobj, NULL, "sync_completed"); |
2495 | 2496 | ||
2496 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 2497 | rdev_for_each(rdev, mddev) { |
2497 | if (test_and_clear_bit(FaultRecorded, &rdev->flags)) | 2498 | if (test_and_clear_bit(FaultRecorded, &rdev->flags)) |
2498 | clear_bit(Blocked, &rdev->flags); | 2499 | clear_bit(Blocked, &rdev->flags); |
2499 | 2500 | ||
@@ -2896,7 +2897,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len) | |||
2896 | struct md_rdev *rdev2; | 2897 | struct md_rdev *rdev2; |
2897 | 2898 | ||
2898 | mddev_lock(mddev); | 2899 | mddev_lock(mddev); |
2899 | list_for_each_entry(rdev2, &mddev->disks, same_set) | 2900 | rdev_for_each(rdev2, mddev) |
2900 | if (rdev->bdev == rdev2->bdev && | 2901 | if (rdev->bdev == rdev2->bdev && |
2901 | rdev != rdev2 && | 2902 | rdev != rdev2 && |
2902 | overlaps(rdev->data_offset, rdev->sectors, | 2903 | overlaps(rdev->data_offset, rdev->sectors, |
@@ -3193,7 +3194,7 @@ static void analyze_sbs(struct mddev * mddev) | |||
3193 | char b[BDEVNAME_SIZE]; | 3194 | char b[BDEVNAME_SIZE]; |
3194 | 3195 | ||
3195 | freshest = NULL; | 3196 | freshest = NULL; |
3196 | rdev_for_each(rdev, tmp, mddev) | 3197 | rdev_for_each_safe(rdev, tmp, mddev) |
3197 | switch (super_types[mddev->major_version]. | 3198 | switch (super_types[mddev->major_version]. |
3198 | load_super(rdev, freshest, mddev->minor_version)) { | 3199 | load_super(rdev, freshest, mddev->minor_version)) { |
3199 | case 1: | 3200 | case 1: |
@@ -3214,7 +3215,7 @@ static void analyze_sbs(struct mddev * mddev) | |||
3214 | validate_super(mddev, freshest); | 3215 | validate_super(mddev, freshest); |
3215 | 3216 | ||
3216 | i = 0; | 3217 | i = 0; |
3217 | rdev_for_each(rdev, tmp, mddev) { | 3218 | rdev_for_each_safe(rdev, tmp, mddev) { |
3218 | if (mddev->max_disks && | 3219 | if (mddev->max_disks && |
3219 | (rdev->desc_nr >= mddev->max_disks || | 3220 | (rdev->desc_nr >= mddev->max_disks || |
3220 | i > mddev->max_disks)) { | 3221 | i > mddev->max_disks)) { |
@@ -3403,7 +3404,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) | |||
3403 | return -EINVAL; | 3404 | return -EINVAL; |
3404 | } | 3405 | } |
3405 | 3406 | ||
3406 | list_for_each_entry(rdev, &mddev->disks, same_set) | 3407 | rdev_for_each(rdev, mddev) |
3407 | rdev->new_raid_disk = rdev->raid_disk; | 3408 | rdev->new_raid_disk = rdev->raid_disk; |
3408 | 3409 | ||
3409 | /* ->takeover must set new_* and/or delta_disks | 3410 | /* ->takeover must set new_* and/or delta_disks |
@@ -3456,7 +3457,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) | |||
3456 | mddev->safemode = 0; | 3457 | mddev->safemode = 0; |
3457 | } | 3458 | } |
3458 | 3459 | ||
3459 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 3460 | rdev_for_each(rdev, mddev) { |
3460 | if (rdev->raid_disk < 0) | 3461 | if (rdev->raid_disk < 0) |
3461 | continue; | 3462 | continue; |
3462 | if (rdev->new_raid_disk >= mddev->raid_disks) | 3463 | if (rdev->new_raid_disk >= mddev->raid_disks) |
@@ -3465,7 +3466,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len) | |||
3465 | continue; | 3466 | continue; |
3466 | sysfs_unlink_rdev(mddev, rdev); | 3467 | sysfs_unlink_rdev(mddev, rdev); |
3467 | } | 3468 | } |
3468 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 3469 | rdev_for_each(rdev, mddev) { |
3469 | if (rdev->raid_disk < 0) | 3470 | if (rdev->raid_disk < 0) |
3470 | continue; | 3471 | continue; |
3471 | if (rdev->new_raid_disk == rdev->raid_disk) | 3472 | if (rdev->new_raid_disk == rdev->raid_disk) |
@@ -4796,7 +4797,7 @@ int md_run(struct mddev *mddev) | |||
4796 | * the only valid external interface is through the md | 4797 | * the only valid external interface is through the md |
4797 | * device. | 4798 | * device. |
4798 | */ | 4799 | */ |
4799 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 4800 | rdev_for_each(rdev, mddev) { |
4800 | if (test_bit(Faulty, &rdev->flags)) | 4801 | if (test_bit(Faulty, &rdev->flags)) |
4801 | continue; | 4802 | continue; |
4802 | sync_blockdev(rdev->bdev); | 4803 | sync_blockdev(rdev->bdev); |
@@ -4867,8 +4868,8 @@ int md_run(struct mddev *mddev) | |||
4867 | struct md_rdev *rdev2; | 4868 | struct md_rdev *rdev2; |
4868 | int warned = 0; | 4869 | int warned = 0; |
4869 | 4870 | ||
4870 | list_for_each_entry(rdev, &mddev->disks, same_set) | 4871 | rdev_for_each(rdev, mddev) |
4871 | list_for_each_entry(rdev2, &mddev->disks, same_set) { | 4872 | rdev_for_each(rdev2, mddev) { |
4872 | if (rdev < rdev2 && | 4873 | if (rdev < rdev2 && |
4873 | rdev->bdev->bd_contains == | 4874 | rdev->bdev->bd_contains == |
4874 | rdev2->bdev->bd_contains) { | 4875 | rdev2->bdev->bd_contains) { |
@@ -4945,7 +4946,7 @@ int md_run(struct mddev *mddev) | |||
4945 | mddev->in_sync = 1; | 4946 | mddev->in_sync = 1; |
4946 | smp_wmb(); | 4947 | smp_wmb(); |
4947 | mddev->ready = 1; | 4948 | mddev->ready = 1; |
4948 | list_for_each_entry(rdev, &mddev->disks, same_set) | 4949 | rdev_for_each(rdev, mddev) |
4949 | if (rdev->raid_disk >= 0) | 4950 | if (rdev->raid_disk >= 0) |
4950 | if (sysfs_link_rdev(mddev, rdev)) | 4951 | if (sysfs_link_rdev(mddev, rdev)) |
4951 | /* failure here is OK */; | 4952 | /* failure here is OK */; |
@@ -5073,6 +5074,7 @@ static void md_clean(struct mddev *mddev) | |||
5073 | mddev->changed = 0; | 5074 | mddev->changed = 0; |
5074 | mddev->degraded = 0; | 5075 | mddev->degraded = 0; |
5075 | mddev->safemode = 0; | 5076 | mddev->safemode = 0; |
5077 | mddev->merge_check_needed = 0; | ||
5076 | mddev->bitmap_info.offset = 0; | 5078 | mddev->bitmap_info.offset = 0; |
5077 | mddev->bitmap_info.default_offset = 0; | 5079 | mddev->bitmap_info.default_offset = 0; |
5078 | mddev->bitmap_info.chunksize = 0; | 5080 | mddev->bitmap_info.chunksize = 0; |
@@ -5175,7 +5177,7 @@ static int do_md_stop(struct mddev * mddev, int mode, int is_open) | |||
5175 | /* tell userspace to handle 'inactive' */ | 5177 | /* tell userspace to handle 'inactive' */ |
5176 | sysfs_notify_dirent_safe(mddev->sysfs_state); | 5178 | sysfs_notify_dirent_safe(mddev->sysfs_state); |
5177 | 5179 | ||
5178 | list_for_each_entry(rdev, &mddev->disks, same_set) | 5180 | rdev_for_each(rdev, mddev) |
5179 | if (rdev->raid_disk >= 0) | 5181 | if (rdev->raid_disk >= 0) |
5180 | sysfs_unlink_rdev(mddev, rdev); | 5182 | sysfs_unlink_rdev(mddev, rdev); |
5181 | 5183 | ||
@@ -5226,7 +5228,7 @@ static void autorun_array(struct mddev *mddev) | |||
5226 | 5228 | ||
5227 | printk(KERN_INFO "md: running: "); | 5229 | printk(KERN_INFO "md: running: "); |
5228 | 5230 | ||
5229 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 5231 | rdev_for_each(rdev, mddev) { |
5230 | char b[BDEVNAME_SIZE]; | 5232 | char b[BDEVNAME_SIZE]; |
5231 | printk("<%s>", bdevname(rdev->bdev,b)); | 5233 | printk("<%s>", bdevname(rdev->bdev,b)); |
5232 | } | 5234 | } |
@@ -5356,7 +5358,7 @@ static int get_array_info(struct mddev * mddev, void __user * arg) | |||
5356 | struct md_rdev *rdev; | 5358 | struct md_rdev *rdev; |
5357 | 5359 | ||
5358 | nr=working=insync=failed=spare=0; | 5360 | nr=working=insync=failed=spare=0; |
5359 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 5361 | rdev_for_each(rdev, mddev) { |
5360 | nr++; | 5362 | nr++; |
5361 | if (test_bit(Faulty, &rdev->flags)) | 5363 | if (test_bit(Faulty, &rdev->flags)) |
5362 | failed++; | 5364 | failed++; |
@@ -5923,7 +5925,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors) | |||
5923 | * grow, and re-add. | 5925 | * grow, and re-add. |
5924 | */ | 5926 | */ |
5925 | return -EBUSY; | 5927 | return -EBUSY; |
5926 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 5928 | rdev_for_each(rdev, mddev) { |
5927 | sector_t avail = rdev->sectors; | 5929 | sector_t avail = rdev->sectors; |
5928 | 5930 | ||
5929 | if (fit && (num_sectors == 0 || num_sectors > avail)) | 5931 | if (fit && (num_sectors == 0 || num_sectors > avail)) |
@@ -6724,7 +6726,6 @@ static int md_seq_show(struct seq_file *seq, void *v) | |||
6724 | struct mddev *mddev = v; | 6726 | struct mddev *mddev = v; |
6725 | sector_t sectors; | 6727 | sector_t sectors; |
6726 | struct md_rdev *rdev; | 6728 | struct md_rdev *rdev; |
6727 | struct bitmap *bitmap; | ||
6728 | 6729 | ||
6729 | if (v == (void*)1) { | 6730 | if (v == (void*)1) { |
6730 | struct md_personality *pers; | 6731 | struct md_personality *pers; |
@@ -6758,7 +6759,7 @@ static int md_seq_show(struct seq_file *seq, void *v) | |||
6758 | } | 6759 | } |
6759 | 6760 | ||
6760 | sectors = 0; | 6761 | sectors = 0; |
6761 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 6762 | rdev_for_each(rdev, mddev) { |
6762 | char b[BDEVNAME_SIZE]; | 6763 | char b[BDEVNAME_SIZE]; |
6763 | seq_printf(seq, " %s[%d]", | 6764 | seq_printf(seq, " %s[%d]", |
6764 | bdevname(rdev->bdev,b), rdev->desc_nr); | 6765 | bdevname(rdev->bdev,b), rdev->desc_nr); |
@@ -6812,27 +6813,7 @@ static int md_seq_show(struct seq_file *seq, void *v) | |||
6812 | } else | 6813 | } else |
6813 | seq_printf(seq, "\n "); | 6814 | seq_printf(seq, "\n "); |
6814 | 6815 | ||
6815 | if ((bitmap = mddev->bitmap)) { | 6816 | bitmap_status(seq, mddev->bitmap); |
6816 | unsigned long chunk_kb; | ||
6817 | unsigned long flags; | ||
6818 | spin_lock_irqsave(&bitmap->lock, flags); | ||
6819 | chunk_kb = mddev->bitmap_info.chunksize >> 10; | ||
6820 | seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " | ||
6821 | "%lu%s chunk", | ||
6822 | bitmap->pages - bitmap->missing_pages, | ||
6823 | bitmap->pages, | ||
6824 | (bitmap->pages - bitmap->missing_pages) | ||
6825 | << (PAGE_SHIFT - 10), | ||
6826 | chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize, | ||
6827 | chunk_kb ? "KB" : "B"); | ||
6828 | if (bitmap->file) { | ||
6829 | seq_printf(seq, ", file: "); | ||
6830 | seq_path(seq, &bitmap->file->f_path, " \t\n"); | ||
6831 | } | ||
6832 | |||
6833 | seq_printf(seq, "\n"); | ||
6834 | spin_unlock_irqrestore(&bitmap->lock, flags); | ||
6835 | } | ||
6836 | 6817 | ||
6837 | seq_printf(seq, "\n"); | 6818 | seq_printf(seq, "\n"); |
6838 | } | 6819 | } |
@@ -7170,7 +7151,7 @@ void md_do_sync(struct mddev *mddev) | |||
7170 | max_sectors = mddev->dev_sectors; | 7151 | max_sectors = mddev->dev_sectors; |
7171 | j = MaxSector; | 7152 | j = MaxSector; |
7172 | rcu_read_lock(); | 7153 | rcu_read_lock(); |
7173 | list_for_each_entry_rcu(rdev, &mddev->disks, same_set) | 7154 | rdev_for_each_rcu(rdev, mddev) |
7174 | if (rdev->raid_disk >= 0 && | 7155 | if (rdev->raid_disk >= 0 && |
7175 | !test_bit(Faulty, &rdev->flags) && | 7156 | !test_bit(Faulty, &rdev->flags) && |
7176 | !test_bit(In_sync, &rdev->flags) && | 7157 | !test_bit(In_sync, &rdev->flags) && |
@@ -7342,7 +7323,7 @@ void md_do_sync(struct mddev *mddev) | |||
7342 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) | 7323 | if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) |
7343 | mddev->curr_resync = MaxSector; | 7324 | mddev->curr_resync = MaxSector; |
7344 | rcu_read_lock(); | 7325 | rcu_read_lock(); |
7345 | list_for_each_entry_rcu(rdev, &mddev->disks, same_set) | 7326 | rdev_for_each_rcu(rdev, mddev) |
7346 | if (rdev->raid_disk >= 0 && | 7327 | if (rdev->raid_disk >= 0 && |
7347 | mddev->delta_disks >= 0 && | 7328 | mddev->delta_disks >= 0 && |
7348 | !test_bit(Faulty, &rdev->flags) && | 7329 | !test_bit(Faulty, &rdev->flags) && |
@@ -7388,7 +7369,7 @@ static int remove_and_add_spares(struct mddev *mddev) | |||
7388 | 7369 | ||
7389 | mddev->curr_resync_completed = 0; | 7370 | mddev->curr_resync_completed = 0; |
7390 | 7371 | ||
7391 | list_for_each_entry(rdev, &mddev->disks, same_set) | 7372 | rdev_for_each(rdev, mddev) |
7392 | if (rdev->raid_disk >= 0 && | 7373 | if (rdev->raid_disk >= 0 && |
7393 | !test_bit(Blocked, &rdev->flags) && | 7374 | !test_bit(Blocked, &rdev->flags) && |
7394 | (test_bit(Faulty, &rdev->flags) || | 7375 | (test_bit(Faulty, &rdev->flags) || |
@@ -7406,7 +7387,7 @@ static int remove_and_add_spares(struct mddev *mddev) | |||
7406 | "degraded"); | 7387 | "degraded"); |
7407 | 7388 | ||
7408 | 7389 | ||
7409 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 7390 | rdev_for_each(rdev, mddev) { |
7410 | if (rdev->raid_disk >= 0 && | 7391 | if (rdev->raid_disk >= 0 && |
7411 | !test_bit(In_sync, &rdev->flags) && | 7392 | !test_bit(In_sync, &rdev->flags) && |
7412 | !test_bit(Faulty, &rdev->flags)) | 7393 | !test_bit(Faulty, &rdev->flags)) |
@@ -7451,7 +7432,7 @@ static void reap_sync_thread(struct mddev *mddev) | |||
7451 | * do the superblock for an incrementally recovered device | 7432 | * do the superblock for an incrementally recovered device |
7452 | * written out. | 7433 | * written out. |
7453 | */ | 7434 | */ |
7454 | list_for_each_entry(rdev, &mddev->disks, same_set) | 7435 | rdev_for_each(rdev, mddev) |
7455 | if (!mddev->degraded || | 7436 | if (!mddev->degraded || |
7456 | test_bit(In_sync, &rdev->flags)) | 7437 | test_bit(In_sync, &rdev->flags)) |
7457 | rdev->saved_raid_disk = -1; | 7438 | rdev->saved_raid_disk = -1; |
@@ -7529,7 +7510,7 @@ void md_check_recovery(struct mddev *mddev) | |||
7529 | * failed devices. | 7510 | * failed devices. |
7530 | */ | 7511 | */ |
7531 | struct md_rdev *rdev; | 7512 | struct md_rdev *rdev; |
7532 | list_for_each_entry(rdev, &mddev->disks, same_set) | 7513 | rdev_for_each(rdev, mddev) |
7533 | if (rdev->raid_disk >= 0 && | 7514 | if (rdev->raid_disk >= 0 && |
7534 | !test_bit(Blocked, &rdev->flags) && | 7515 | !test_bit(Blocked, &rdev->flags) && |
7535 | test_bit(Faulty, &rdev->flags) && | 7516 | test_bit(Faulty, &rdev->flags) && |
@@ -8040,7 +8021,7 @@ void md_ack_all_badblocks(struct badblocks *bb) | |||
8040 | return; | 8021 | return; |
8041 | write_seqlock_irq(&bb->lock); | 8022 | write_seqlock_irq(&bb->lock); |
8042 | 8023 | ||
8043 | if (bb->changed == 0) { | 8024 | if (bb->changed == 0 && bb->unacked_exist) { |
8044 | u64 *p = bb->page; | 8025 | u64 *p = bb->page; |
8045 | int i; | 8026 | int i; |
8046 | for (i = 0; i < bb->count ; i++) { | 8027 | for (i = 0; i < bb->count ; i++) { |
@@ -8157,30 +8138,23 @@ static int md_notify_reboot(struct notifier_block *this, | |||
8157 | struct mddev *mddev; | 8138 | struct mddev *mddev; |
8158 | int need_delay = 0; | 8139 | int need_delay = 0; |
8159 | 8140 | ||
8160 | if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) { | 8141 | for_each_mddev(mddev, tmp) { |
8161 | 8142 | if (mddev_trylock(mddev)) { | |
8162 | printk(KERN_INFO "md: stopping all md devices.\n"); | 8143 | __md_stop_writes(mddev); |
8163 | 8144 | mddev->safemode = 2; | |
8164 | for_each_mddev(mddev, tmp) { | 8145 | mddev_unlock(mddev); |
8165 | if (mddev_trylock(mddev)) { | ||
8166 | /* Force a switch to readonly even array | ||
8167 | * appears to still be in use. Hence | ||
8168 | * the '100'. | ||
8169 | */ | ||
8170 | md_set_readonly(mddev, 100); | ||
8171 | mddev_unlock(mddev); | ||
8172 | } | ||
8173 | need_delay = 1; | ||
8174 | } | 8146 | } |
8175 | /* | 8147 | need_delay = 1; |
8176 | * certain more exotic SCSI devices are known to be | ||
8177 | * volatile wrt too early system reboots. While the | ||
8178 | * right place to handle this issue is the given | ||
8179 | * driver, we do want to have a safe RAID driver ... | ||
8180 | */ | ||
8181 | if (need_delay) | ||
8182 | mdelay(1000*1); | ||
8183 | } | 8148 | } |
8149 | /* | ||
8150 | * certain more exotic SCSI devices are known to be | ||
8151 | * volatile wrt too early system reboots. While the | ||
8152 | * right place to handle this issue is the given | ||
8153 | * driver, we do want to have a safe RAID driver ... | ||
8154 | */ | ||
8155 | if (need_delay) | ||
8156 | mdelay(1000*1); | ||
8157 | |||
8184 | return NOTIFY_DONE; | 8158 | return NOTIFY_DONE; |
8185 | } | 8159 | } |
8186 | 8160 | ||
diff --git a/drivers/md/md.h b/drivers/md/md.h index 44c63dfeeb2b..1c2063ccf48e 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h | |||
@@ -128,6 +128,10 @@ struct md_rdev { | |||
128 | enum flag_bits { | 128 | enum flag_bits { |
129 | Faulty, /* device is known to have a fault */ | 129 | Faulty, /* device is known to have a fault */ |
130 | In_sync, /* device is in_sync with rest of array */ | 130 | In_sync, /* device is in_sync with rest of array */ |
131 | Unmerged, /* device is being added to array and should | ||
132 | * be considered for bvec_merge_fn but not | ||
133 | * yet for actual IO | ||
134 | */ | ||
131 | WriteMostly, /* Avoid reading if at all possible */ | 135 | WriteMostly, /* Avoid reading if at all possible */ |
132 | AutoDetected, /* added by auto-detect */ | 136 | AutoDetected, /* added by auto-detect */ |
133 | Blocked, /* An error occurred but has not yet | 137 | Blocked, /* An error occurred but has not yet |
@@ -345,6 +349,10 @@ struct mddev { | |||
345 | int degraded; /* whether md should consider | 349 | int degraded; /* whether md should consider |
346 | * adding a spare | 350 | * adding a spare |
347 | */ | 351 | */ |
352 | int merge_check_needed; /* at least one | ||
353 | * member device | ||
354 | * has a | ||
355 | * merge_bvec_fn */ | ||
348 | 356 | ||
349 | atomic_t recovery_active; /* blocks scheduled, but not written */ | 357 | atomic_t recovery_active; /* blocks scheduled, but not written */ |
350 | wait_queue_head_t recovery_wait; | 358 | wait_queue_head_t recovery_wait; |
@@ -519,7 +527,10 @@ static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev) | |||
519 | /* | 527 | /* |
520 | * iterates through the 'same array disks' ringlist | 528 | * iterates through the 'same array disks' ringlist |
521 | */ | 529 | */ |
522 | #define rdev_for_each(rdev, tmp, mddev) \ | 530 | #define rdev_for_each(rdev, mddev) \ |
531 | list_for_each_entry(rdev, &((mddev)->disks), same_set) | ||
532 | |||
533 | #define rdev_for_each_safe(rdev, tmp, mddev) \ | ||
523 | list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set) | 534 | list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set) |
524 | 535 | ||
525 | #define rdev_for_each_rcu(rdev, mddev) \ | 536 | #define rdev_for_each_rcu(rdev, mddev) \ |
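The macro rework above drops the unused scratch pointer from the common iterator and adds a _safe variant for walks that may unlink entries. A minimal usage sketch (illustrative only, not part of the patch; the loop bodies are placeholders):

    struct md_rdev *rdev, *tmp;
    int in_sync = 0;

    rdev_for_each(rdev, mddev)                 /* was rdev_for_each(rdev, tmp, mddev) */
            if (test_bit(In_sync, &rdev->flags))
                    in_sync++;                 /* plain walk, no scratch pointer needed */

    rdev_for_each_safe(rdev, tmp, mddev)       /* use when the body may unlink rdev */
            if (test_bit(Faulty, &rdev->flags))
                    list_del_init(&rdev->same_set);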
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index a222f516660e..9339e67fcc79 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c | |||
@@ -428,7 +428,7 @@ static int multipath_run (struct mddev *mddev) | |||
428 | } | 428 | } |
429 | 429 | ||
430 | working_disks = 0; | 430 | working_disks = 0; |
431 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 431 | rdev_for_each(rdev, mddev) { |
432 | disk_idx = rdev->raid_disk; | 432 | disk_idx = rdev->raid_disk; |
433 | if (disk_idx < 0 || | 433 | if (disk_idx < 0 || |
434 | disk_idx >= mddev->raid_disks) | 434 | disk_idx >= mddev->raid_disks) |
diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h index d279c768f8f1..5709bfeab1e8 100644 --- a/drivers/md/persistent-data/dm-btree-internal.h +++ b/drivers/md/persistent-data/dm-btree-internal.h | |||
@@ -108,12 +108,9 @@ static inline void *value_base(struct node *n) | |||
108 | return &n->keys[le32_to_cpu(n->header.max_entries)]; | 108 | return &n->keys[le32_to_cpu(n->header.max_entries)]; |
109 | } | 109 | } |
110 | 110 | ||
111 | /* | 111 | static inline void *value_ptr(struct node *n, uint32_t index) |
112 | * FIXME: Now that value size is stored in node we don't need the third parm. | ||
113 | */ | ||
114 | static inline void *value_ptr(struct node *n, uint32_t index, size_t value_size) | ||
115 | { | 112 | { |
116 | BUG_ON(value_size != le32_to_cpu(n->header.value_size)); | 113 | uint32_t value_size = le32_to_cpu(n->header.value_size); |
117 | return value_base(n) + (value_size * index); | 114 | return value_base(n) + (value_size * index); |
118 | } | 115 | } |
119 | 116 | ||
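With the value size now carried in the node header, callers of value_ptr() drop the redundant size argument, as the dm-btree changes below show. A hypothetical call site, for illustration only:

    /* Sketch only: copy the value at slot 0 out of a leaf node. */
    static void copy_first_value(struct node *n, void *dst)
    {
            uint32_t vsize = le32_to_cpu(n->header.value_size);

            /* before: memcpy(dst, value_ptr(n, 0, vsize), vsize); */
            memcpy(dst, value_ptr(n, 0), vsize);
    }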
diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c index 023fbc2d389e..aa71e2359a07 100644 --- a/drivers/md/persistent-data/dm-btree-remove.c +++ b/drivers/md/persistent-data/dm-btree-remove.c | |||
@@ -61,20 +61,20 @@ static void node_shift(struct node *n, int shift) | |||
61 | if (shift < 0) { | 61 | if (shift < 0) { |
62 | shift = -shift; | 62 | shift = -shift; |
63 | BUG_ON(shift > nr_entries); | 63 | BUG_ON(shift > nr_entries); |
64 | BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift, value_size)); | 64 | BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift)); |
65 | memmove(key_ptr(n, 0), | 65 | memmove(key_ptr(n, 0), |
66 | key_ptr(n, shift), | 66 | key_ptr(n, shift), |
67 | (nr_entries - shift) * sizeof(__le64)); | 67 | (nr_entries - shift) * sizeof(__le64)); |
68 | memmove(value_ptr(n, 0, value_size), | 68 | memmove(value_ptr(n, 0), |
69 | value_ptr(n, shift, value_size), | 69 | value_ptr(n, shift), |
70 | (nr_entries - shift) * value_size); | 70 | (nr_entries - shift) * value_size); |
71 | } else { | 71 | } else { |
72 | BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries)); | 72 | BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries)); |
73 | memmove(key_ptr(n, shift), | 73 | memmove(key_ptr(n, shift), |
74 | key_ptr(n, 0), | 74 | key_ptr(n, 0), |
75 | nr_entries * sizeof(__le64)); | 75 | nr_entries * sizeof(__le64)); |
76 | memmove(value_ptr(n, shift, value_size), | 76 | memmove(value_ptr(n, shift), |
77 | value_ptr(n, 0, value_size), | 77 | value_ptr(n, 0), |
78 | nr_entries * value_size); | 78 | nr_entries * value_size); |
79 | } | 79 | } |
80 | } | 80 | } |
@@ -91,16 +91,16 @@ static void node_copy(struct node *left, struct node *right, int shift) | |||
91 | memcpy(key_ptr(left, nr_left), | 91 | memcpy(key_ptr(left, nr_left), |
92 | key_ptr(right, 0), | 92 | key_ptr(right, 0), |
93 | shift * sizeof(__le64)); | 93 | shift * sizeof(__le64)); |
94 | memcpy(value_ptr(left, nr_left, value_size), | 94 | memcpy(value_ptr(left, nr_left), |
95 | value_ptr(right, 0, value_size), | 95 | value_ptr(right, 0), |
96 | shift * value_size); | 96 | shift * value_size); |
97 | } else { | 97 | } else { |
98 | BUG_ON(shift > le32_to_cpu(right->header.max_entries)); | 98 | BUG_ON(shift > le32_to_cpu(right->header.max_entries)); |
99 | memcpy(key_ptr(right, 0), | 99 | memcpy(key_ptr(right, 0), |
100 | key_ptr(left, nr_left - shift), | 100 | key_ptr(left, nr_left - shift), |
101 | shift * sizeof(__le64)); | 101 | shift * sizeof(__le64)); |
102 | memcpy(value_ptr(right, 0, value_size), | 102 | memcpy(value_ptr(right, 0), |
103 | value_ptr(left, nr_left - shift, value_size), | 103 | value_ptr(left, nr_left - shift), |
104 | shift * value_size); | 104 | shift * value_size); |
105 | } | 105 | } |
106 | } | 106 | } |
@@ -120,26 +120,17 @@ static void delete_at(struct node *n, unsigned index) | |||
120 | key_ptr(n, index + 1), | 120 | key_ptr(n, index + 1), |
121 | nr_to_copy * sizeof(__le64)); | 121 | nr_to_copy * sizeof(__le64)); |
122 | 122 | ||
123 | memmove(value_ptr(n, index, value_size), | 123 | memmove(value_ptr(n, index), |
124 | value_ptr(n, index + 1, value_size), | 124 | value_ptr(n, index + 1), |
125 | nr_to_copy * value_size); | 125 | nr_to_copy * value_size); |
126 | } | 126 | } |
127 | 127 | ||
128 | n->header.nr_entries = cpu_to_le32(nr_entries - 1); | 128 | n->header.nr_entries = cpu_to_le32(nr_entries - 1); |
129 | } | 129 | } |
130 | 130 | ||
131 | static unsigned del_threshold(struct node *n) | ||
132 | { | ||
133 | return le32_to_cpu(n->header.max_entries) / 3; | ||
134 | } | ||
135 | |||
136 | static unsigned merge_threshold(struct node *n) | 131 | static unsigned merge_threshold(struct node *n) |
137 | { | 132 | { |
138 | /* | 133 | return le32_to_cpu(n->header.max_entries) / 3; |
139 | * The extra one is because we know we're potentially going to | ||
140 | * delete an entry. | ||
141 | */ | ||
142 | return 2 * (le32_to_cpu(n->header.max_entries) / 3) + 1; | ||
143 | } | 134 | } |
144 | 135 | ||
145 | struct child { | 136 | struct child { |
@@ -175,7 +166,7 @@ static int init_child(struct dm_btree_info *info, struct node *parent, | |||
175 | if (inc) | 166 | if (inc) |
176 | inc_children(info->tm, result->n, &le64_type); | 167 | inc_children(info->tm, result->n, &le64_type); |
177 | 168 | ||
178 | *((__le64 *) value_ptr(parent, index, sizeof(__le64))) = | 169 | *((__le64 *) value_ptr(parent, index)) = |
179 | cpu_to_le64(dm_block_location(result->block)); | 170 | cpu_to_le64(dm_block_location(result->block)); |
180 | 171 | ||
181 | return 0; | 172 | return 0; |
@@ -188,6 +179,15 @@ static int exit_child(struct dm_btree_info *info, struct child *c) | |||
188 | 179 | ||
189 | static void shift(struct node *left, struct node *right, int count) | 180 | static void shift(struct node *left, struct node *right, int count) |
190 | { | 181 | { |
182 | uint32_t nr_left = le32_to_cpu(left->header.nr_entries); | ||
183 | uint32_t nr_right = le32_to_cpu(right->header.nr_entries); | ||
184 | uint32_t max_entries = le32_to_cpu(left->header.max_entries); | ||
185 | uint32_t r_max_entries = le32_to_cpu(right->header.max_entries); | ||
186 | |||
187 | BUG_ON(max_entries != r_max_entries); | ||
188 | BUG_ON(nr_left - count > max_entries); | ||
189 | BUG_ON(nr_right + count > max_entries); | ||
190 | |||
191 | if (!count) | 191 | if (!count) |
192 | return; | 192 | return; |
193 | 193 | ||
@@ -199,13 +199,8 @@ static void shift(struct node *left, struct node *right, int count) | |||
199 | node_shift(right, count); | 199 | node_shift(right, count); |
200 | } | 200 | } |
201 | 201 | ||
202 | left->header.nr_entries = | 202 | left->header.nr_entries = cpu_to_le32(nr_left - count); |
203 | cpu_to_le32(le32_to_cpu(left->header.nr_entries) - count); | 203 | right->header.nr_entries = cpu_to_le32(nr_right + count); |
204 | BUG_ON(le32_to_cpu(left->header.nr_entries) > le32_to_cpu(left->header.max_entries)); | ||
205 | |||
206 | right->header.nr_entries = | ||
207 | cpu_to_le32(le32_to_cpu(right->header.nr_entries) + count); | ||
208 | BUG_ON(le32_to_cpu(right->header.nr_entries) > le32_to_cpu(right->header.max_entries)); | ||
209 | } | 204 | } |
210 | 205 | ||
211 | static void __rebalance2(struct dm_btree_info *info, struct node *parent, | 206 | static void __rebalance2(struct dm_btree_info *info, struct node *parent, |
@@ -215,8 +210,9 @@ static void __rebalance2(struct dm_btree_info *info, struct node *parent, | |||
215 | struct node *right = r->n; | 210 | struct node *right = r->n; |
216 | uint32_t nr_left = le32_to_cpu(left->header.nr_entries); | 211 | uint32_t nr_left = le32_to_cpu(left->header.nr_entries); |
217 | uint32_t nr_right = le32_to_cpu(right->header.nr_entries); | 212 | uint32_t nr_right = le32_to_cpu(right->header.nr_entries); |
213 | unsigned threshold = 2 * merge_threshold(left) + 1; | ||
218 | 214 | ||
219 | if (nr_left + nr_right <= merge_threshold(left)) { | 215 | if (nr_left + nr_right < threshold) { |
220 | /* | 216 | /* |
221 | * Merge | 217 | * Merge |
222 | */ | 218 | */ |
@@ -234,9 +230,6 @@ static void __rebalance2(struct dm_btree_info *info, struct node *parent, | |||
234 | * Rebalance. | 230 | * Rebalance. |
235 | */ | 231 | */ |
236 | unsigned target_left = (nr_left + nr_right) / 2; | 232 | unsigned target_left = (nr_left + nr_right) / 2; |
237 | unsigned shift_ = nr_left - target_left; | ||
238 | BUG_ON(le32_to_cpu(left->header.max_entries) <= nr_left - shift_); | ||
239 | BUG_ON(le32_to_cpu(right->header.max_entries) <= nr_right + shift_); | ||
240 | shift(left, right, nr_left - target_left); | 233 | shift(left, right, nr_left - target_left); |
241 | *key_ptr(parent, r->index) = right->keys[0]; | 234 | *key_ptr(parent, r->index) = right->keys[0]; |
242 | } | 235 | } |
@@ -272,6 +265,84 @@ static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info, | |||
272 | return exit_child(info, &right); | 265 | return exit_child(info, &right); |
273 | } | 266 | } |
274 | 267 | ||
268 | /* | ||
269 | * We dump as many entries from center as possible into left, then the rest | ||
270 | * in right, then rebalance2. This wastes some cpu, but I want something | ||
271 | * simple atm. | ||
272 | */ | ||
273 | static void delete_center_node(struct dm_btree_info *info, struct node *parent, | ||
274 | struct child *l, struct child *c, struct child *r, | ||
275 | struct node *left, struct node *center, struct node *right, | ||
276 | uint32_t nr_left, uint32_t nr_center, uint32_t nr_right) | ||
277 | { | ||
278 | uint32_t max_entries = le32_to_cpu(left->header.max_entries); | ||
279 | unsigned shift = min(max_entries - nr_left, nr_center); | ||
280 | |||
281 | BUG_ON(nr_left + shift > max_entries); | ||
282 | node_copy(left, center, -shift); | ||
283 | left->header.nr_entries = cpu_to_le32(nr_left + shift); | ||
284 | |||
285 | if (shift != nr_center) { | ||
286 | shift = nr_center - shift; | ||
287 | BUG_ON((nr_right + shift) > max_entries); | ||
288 | node_shift(right, shift); | ||
289 | node_copy(center, right, shift); | ||
290 | right->header.nr_entries = cpu_to_le32(nr_right + shift); | ||
291 | } | ||
292 | *key_ptr(parent, r->index) = right->keys[0]; | ||
293 | |||
294 | delete_at(parent, c->index); | ||
295 | r->index--; | ||
296 | |||
297 | dm_tm_dec(info->tm, dm_block_location(c->block)); | ||
298 | __rebalance2(info, parent, l, r); | ||
299 | } | ||
300 | |||
301 | /* | ||
302 | * Redistributes entries among 3 sibling nodes. | ||
303 | */ | ||
304 | static void redistribute3(struct dm_btree_info *info, struct node *parent, | ||
305 | struct child *l, struct child *c, struct child *r, | ||
306 | struct node *left, struct node *center, struct node *right, | ||
307 | uint32_t nr_left, uint32_t nr_center, uint32_t nr_right) | ||
308 | { | ||
309 | int s; | ||
310 | uint32_t max_entries = le32_to_cpu(left->header.max_entries); | ||
311 | unsigned target = (nr_left + nr_center + nr_right) / 3; | ||
312 | BUG_ON(target > max_entries); | ||
313 | |||
314 | if (nr_left < nr_right) { | ||
315 | s = nr_left - target; | ||
316 | |||
317 | if (s < 0 && nr_center < -s) { | ||
318 | /* not enough in central node */ | ||
319 | shift(left, center, nr_center); | ||
320 | s = nr_center - target; | ||
321 | shift(left, right, s); | ||
322 | nr_right += s; | ||
323 | } else | ||
324 | shift(left, center, s); | ||
325 | |||
326 | shift(center, right, target - nr_right); | ||
327 | |||
328 | } else { | ||
329 | s = target - nr_right; | ||
330 | if (s > 0 && nr_center < s) { | ||
331 | /* not enough in central node */ | ||
332 | shift(center, right, nr_center); | ||
333 | s = target - nr_center; | ||
334 | shift(left, right, s); | ||
335 | nr_left -= s; | ||
336 | } else | ||
337 | shift(center, right, s); | ||
338 | |||
339 | shift(left, center, nr_left - target); | ||
340 | } | ||
341 | |||
342 | *key_ptr(parent, c->index) = center->keys[0]; | ||
343 | *key_ptr(parent, r->index) = right->keys[0]; | ||
344 | } | ||
345 | |||
275 | static void __rebalance3(struct dm_btree_info *info, struct node *parent, | 346 | static void __rebalance3(struct dm_btree_info *info, struct node *parent, |
276 | struct child *l, struct child *c, struct child *r) | 347 | struct child *l, struct child *c, struct child *r) |
277 | { | 348 | { |
@@ -282,62 +353,18 @@ static void __rebalance3(struct dm_btree_info *info, struct node *parent, | |||
282 | uint32_t nr_left = le32_to_cpu(left->header.nr_entries); | 353 | uint32_t nr_left = le32_to_cpu(left->header.nr_entries); |
283 | uint32_t nr_center = le32_to_cpu(center->header.nr_entries); | 354 | uint32_t nr_center = le32_to_cpu(center->header.nr_entries); |
284 | uint32_t nr_right = le32_to_cpu(right->header.nr_entries); | 355 | uint32_t nr_right = le32_to_cpu(right->header.nr_entries); |
285 | uint32_t max_entries = le32_to_cpu(left->header.max_entries); | ||
286 | 356 | ||
287 | unsigned target; | 357 | unsigned threshold = merge_threshold(left) * 4 + 1; |
288 | 358 | ||
289 | BUG_ON(left->header.max_entries != center->header.max_entries); | 359 | BUG_ON(left->header.max_entries != center->header.max_entries); |
290 | BUG_ON(center->header.max_entries != right->header.max_entries); | 360 | BUG_ON(center->header.max_entries != right->header.max_entries); |
291 | 361 | ||
292 | if (((nr_left + nr_center + nr_right) / 2) < merge_threshold(center)) { | 362 | if ((nr_left + nr_center + nr_right) < threshold) |
293 | /* | 363 | delete_center_node(info, parent, l, c, r, left, center, right, |
294 | * Delete center node: | 364 | nr_left, nr_center, nr_right); |
295 | * | 365 | else |
296 | * We dump as many entries from center as possible into | 366 | redistribute3(info, parent, l, c, r, left, center, right, |
297 | * left, then the rest in right, then rebalance2. This | 367 | nr_left, nr_center, nr_right); |
298 | * wastes some cpu, but I want something simple atm. | ||
299 | */ | ||
300 | unsigned shift = min(max_entries - nr_left, nr_center); | ||
301 | |||
302 | BUG_ON(nr_left + shift > max_entries); | ||
303 | node_copy(left, center, -shift); | ||
304 | left->header.nr_entries = cpu_to_le32(nr_left + shift); | ||
305 | |||
306 | if (shift != nr_center) { | ||
307 | shift = nr_center - shift; | ||
308 | BUG_ON((nr_right + shift) >= max_entries); | ||
309 | node_shift(right, shift); | ||
310 | node_copy(center, right, shift); | ||
311 | right->header.nr_entries = cpu_to_le32(nr_right + shift); | ||
312 | } | ||
313 | *key_ptr(parent, r->index) = right->keys[0]; | ||
314 | |||
315 | delete_at(parent, c->index); | ||
316 | r->index--; | ||
317 | |||
318 | dm_tm_dec(info->tm, dm_block_location(c->block)); | ||
319 | __rebalance2(info, parent, l, r); | ||
320 | |||
321 | return; | ||
322 | } | ||
323 | |||
324 | /* | ||
325 | * Rebalance | ||
326 | */ | ||
327 | target = (nr_left + nr_center + nr_right) / 3; | ||
328 | BUG_ON(target > max_entries); | ||
329 | |||
330 | /* | ||
331 | * Adjust the left node | ||
332 | */ | ||
333 | shift(left, center, nr_left - target); | ||
334 | |||
335 | /* | ||
336 | * Adjust the right node | ||
337 | */ | ||
338 | shift(center, right, target - nr_right); | ||
339 | *key_ptr(parent, c->index) = center->keys[0]; | ||
340 | *key_ptr(parent, r->index) = right->keys[0]; | ||
341 | } | 368 | } |
342 | 369 | ||
343 | static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info, | 370 | static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info, |
@@ -441,9 +468,6 @@ static int rebalance_children(struct shadow_spine *s, | |||
441 | if (r) | 468 | if (r) |
442 | return r; | 469 | return r; |
443 | 470 | ||
444 | if (child_entries > del_threshold(n)) | ||
445 | return 0; | ||
446 | |||
447 | has_left_sibling = i > 0; | 471 | has_left_sibling = i > 0; |
448 | has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1); | 472 | has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1); |
449 | 473 | ||
@@ -496,7 +520,7 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info, | |||
496 | */ | 520 | */ |
497 | if (shadow_has_parent(s)) { | 521 | if (shadow_has_parent(s)) { |
498 | __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); | 522 | __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); |
499 | memcpy(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(__le64)), | 523 | memcpy(value_ptr(dm_block_data(shadow_parent(s)), i), |
500 | &location, sizeof(__le64)); | 524 | &location, sizeof(__le64)); |
501 | } | 525 | } |
502 | 526 | ||
@@ -553,7 +577,7 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, | |||
553 | 577 | ||
554 | if (info->value_type.dec) | 578 | if (info->value_type.dec) |
555 | info->value_type.dec(info->value_type.context, | 579 | info->value_type.dec(info->value_type.context, |
556 | value_ptr(n, index, info->value_type.size)); | 580 | value_ptr(n, index)); |
557 | 581 | ||
558 | delete_at(n, index); | 582 | delete_at(n, index); |
559 | } | 583 | } |
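The threshold rework above replaces del_threshold() with explicit limits in the rebalance paths: two siblings are merged while they hold fewer than 2 * merge_threshold() + 1 entries, and the centre of three siblings is deleted while the trio holds fewer than 4 * merge_threshold() + 1. A rough worked example, with numbers chosen only for illustration:

    /* Illustration only: assume btree nodes with max_entries = 126,
     * so merge_threshold() = 126 / 3 = 42.
     *
     *   __rebalance2: merge two nodes while nr_left + nr_right < 85,
     *                 otherwise redistribute their entries evenly.
     *   __rebalance3: delete the centre node while the three nodes
     *                 together hold < 169 entries (they then fit in two),
     *                 otherwise redistribute3() levels all three.
     */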
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c index bd1e7ffbe26c..d12b2cc51f1a 100644 --- a/drivers/md/persistent-data/dm-btree.c +++ b/drivers/md/persistent-data/dm-btree.c | |||
@@ -74,8 +74,7 @@ void inc_children(struct dm_transaction_manager *tm, struct node *n, | |||
74 | dm_tm_inc(tm, value64(n, i)); | 74 | dm_tm_inc(tm, value64(n, i)); |
75 | else if (vt->inc) | 75 | else if (vt->inc) |
76 | for (i = 0; i < nr_entries; i++) | 76 | for (i = 0; i < nr_entries; i++) |
77 | vt->inc(vt->context, | 77 | vt->inc(vt->context, value_ptr(n, i)); |
78 | value_ptr(n, i, vt->size)); | ||
79 | } | 78 | } |
80 | 79 | ||
81 | static int insert_at(size_t value_size, struct node *node, unsigned index, | 80 | static int insert_at(size_t value_size, struct node *node, unsigned index, |
@@ -281,7 +280,7 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root) | |||
281 | 280 | ||
282 | for (i = 0; i < f->nr_children; i++) | 281 | for (i = 0; i < f->nr_children; i++) |
283 | info->value_type.dec(info->value_type.context, | 282 | info->value_type.dec(info->value_type.context, |
284 | value_ptr(f->n, i, info->value_type.size)); | 283 | value_ptr(f->n, i)); |
285 | } | 284 | } |
286 | f->current_child = f->nr_children; | 285 | f->current_child = f->nr_children; |
287 | } | 286 | } |
@@ -320,7 +319,7 @@ static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key, | |||
320 | } while (!(flags & LEAF_NODE)); | 319 | } while (!(flags & LEAF_NODE)); |
321 | 320 | ||
322 | *result_key = le64_to_cpu(ro_node(s)->keys[i]); | 321 | *result_key = le64_to_cpu(ro_node(s)->keys[i]); |
323 | memcpy(v, value_ptr(ro_node(s), i, value_size), value_size); | 322 | memcpy(v, value_ptr(ro_node(s), i), value_size); |
324 | 323 | ||
325 | return 0; | 324 | return 0; |
326 | } | 325 | } |
@@ -432,7 +431,7 @@ static int btree_split_sibling(struct shadow_spine *s, dm_block_t root, | |||
432 | 431 | ||
433 | size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ? | 432 | size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ? |
434 | sizeof(uint64_t) : s->info->value_type.size; | 433 | sizeof(uint64_t) : s->info->value_type.size; |
435 | memcpy(value_ptr(rn, 0, size), value_ptr(ln, nr_left, size), | 434 | memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left), |
436 | size * nr_right); | 435 | size * nr_right); |
437 | 436 | ||
438 | /* | 437 | /* |
@@ -443,7 +442,7 @@ static int btree_split_sibling(struct shadow_spine *s, dm_block_t root, | |||
443 | pn = dm_block_data(parent); | 442 | pn = dm_block_data(parent); |
444 | location = cpu_to_le64(dm_block_location(left)); | 443 | location = cpu_to_le64(dm_block_location(left)); |
445 | __dm_bless_for_disk(&location); | 444 | __dm_bless_for_disk(&location); |
446 | memcpy_disk(value_ptr(pn, parent_index, sizeof(__le64)), | 445 | memcpy_disk(value_ptr(pn, parent_index), |
447 | &location, sizeof(__le64)); | 446 | &location, sizeof(__le64)); |
448 | 447 | ||
449 | location = cpu_to_le64(dm_block_location(right)); | 448 | location = cpu_to_le64(dm_block_location(right)); |
@@ -529,8 +528,8 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key) | |||
529 | 528 | ||
530 | size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ? | 529 | size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ? |
531 | sizeof(__le64) : s->info->value_type.size; | 530 | sizeof(__le64) : s->info->value_type.size; |
532 | memcpy(value_ptr(ln, 0, size), value_ptr(pn, 0, size), nr_left * size); | 531 | memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size); |
533 | memcpy(value_ptr(rn, 0, size), value_ptr(pn, nr_left, size), | 532 | memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left), |
534 | nr_right * size); | 533 | nr_right * size); |
535 | 534 | ||
536 | /* new_parent should just point to l and r now */ | 535 | /* new_parent should just point to l and r now */ |
@@ -545,12 +544,12 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key) | |||
545 | val = cpu_to_le64(dm_block_location(left)); | 544 | val = cpu_to_le64(dm_block_location(left)); |
546 | __dm_bless_for_disk(&val); | 545 | __dm_bless_for_disk(&val); |
547 | pn->keys[0] = ln->keys[0]; | 546 | pn->keys[0] = ln->keys[0]; |
548 | memcpy_disk(value_ptr(pn, 0, sizeof(__le64)), &val, sizeof(__le64)); | 547 | memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64)); |
549 | 548 | ||
550 | val = cpu_to_le64(dm_block_location(right)); | 549 | val = cpu_to_le64(dm_block_location(right)); |
551 | __dm_bless_for_disk(&val); | 550 | __dm_bless_for_disk(&val); |
552 | pn->keys[1] = rn->keys[0]; | 551 | pn->keys[1] = rn->keys[0]; |
553 | memcpy_disk(value_ptr(pn, 1, sizeof(__le64)), &val, sizeof(__le64)); | 552 | memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64)); |
554 | 553 | ||
555 | /* | 554 | /* |
556 | * rejig the spine. This is ugly, since it knows too | 555 | * rejig the spine. This is ugly, since it knows too |
@@ -595,7 +594,7 @@ static int btree_insert_raw(struct shadow_spine *s, dm_block_t root, | |||
595 | __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); | 594 | __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); |
596 | 595 | ||
597 | __dm_bless_for_disk(&location); | 596 | __dm_bless_for_disk(&location); |
598 | memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(uint64_t)), | 597 | memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i), |
599 | &location, sizeof(__le64)); | 598 | &location, sizeof(__le64)); |
600 | } | 599 | } |
601 | 600 | ||
@@ -710,12 +709,12 @@ static int insert(struct dm_btree_info *info, dm_block_t root, | |||
710 | (!info->value_type.equal || | 709 | (!info->value_type.equal || |
711 | !info->value_type.equal( | 710 | !info->value_type.equal( |
712 | info->value_type.context, | 711 | info->value_type.context, |
713 | value_ptr(n, index, info->value_type.size), | 712 | value_ptr(n, index), |
714 | value))) { | 713 | value))) { |
715 | info->value_type.dec(info->value_type.context, | 714 | info->value_type.dec(info->value_type.context, |
716 | value_ptr(n, index, info->value_type.size)); | 715 | value_ptr(n, index)); |
717 | } | 716 | } |
718 | memcpy_disk(value_ptr(n, index, info->value_type.size), | 717 | memcpy_disk(value_ptr(n, index), |
719 | value, info->value_type.size); | 718 | value, info->value_type.size); |
720 | } | 719 | } |
721 | 720 | ||
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c index df2494c06cdc..ff3beed6ad2d 100644 --- a/drivers/md/persistent-data/dm-space-map-common.c +++ b/drivers/md/persistent-data/dm-space-map-common.c | |||
@@ -405,8 +405,6 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b, | |||
405 | if (r < 0) | 405 | if (r < 0) |
406 | return r; | 406 | return r; |
407 | 407 | ||
408 | #if 0 | ||
409 | /* FIXME: dm_btree_remove doesn't handle this yet */ | ||
410 | if (old > 2) { | 408 | if (old > 2) { |
411 | r = dm_btree_remove(&ll->ref_count_info, | 409 | r = dm_btree_remove(&ll->ref_count_info, |
412 | ll->ref_count_root, | 410 | ll->ref_count_root, |
@@ -414,7 +412,6 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b, | |||
414 | if (r) | 412 | if (r) |
415 | return r; | 413 | return r; |
416 | } | 414 | } |
417 | #endif | ||
418 | 415 | ||
419 | } else { | 416 | } else { |
420 | __le32 le_rc = cpu_to_le32(ref_count); | 417 | __le32 le_rc = cpu_to_le32(ref_count); |
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 7294bd115e34..6f31f5596e01 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c | |||
@@ -91,7 +91,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) | |||
91 | 91 | ||
92 | if (!conf) | 92 | if (!conf) |
93 | return -ENOMEM; | 93 | return -ENOMEM; |
94 | list_for_each_entry(rdev1, &mddev->disks, same_set) { | 94 | rdev_for_each(rdev1, mddev) { |
95 | pr_debug("md/raid0:%s: looking at %s\n", | 95 | pr_debug("md/raid0:%s: looking at %s\n", |
96 | mdname(mddev), | 96 | mdname(mddev), |
97 | bdevname(rdev1->bdev, b)); | 97 | bdevname(rdev1->bdev, b)); |
@@ -102,7 +102,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) | |||
102 | sector_div(sectors, mddev->chunk_sectors); | 102 | sector_div(sectors, mddev->chunk_sectors); |
103 | rdev1->sectors = sectors * mddev->chunk_sectors; | 103 | rdev1->sectors = sectors * mddev->chunk_sectors; |
104 | 104 | ||
105 | list_for_each_entry(rdev2, &mddev->disks, same_set) { | 105 | rdev_for_each(rdev2, mddev) { |
106 | pr_debug("md/raid0:%s: comparing %s(%llu)" | 106 | pr_debug("md/raid0:%s: comparing %s(%llu)" |
107 | " with %s(%llu)\n", | 107 | " with %s(%llu)\n", |
108 | mdname(mddev), | 108 | mdname(mddev), |
@@ -157,7 +157,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) | |||
157 | smallest = NULL; | 157 | smallest = NULL; |
158 | dev = conf->devlist; | 158 | dev = conf->devlist; |
159 | err = -EINVAL; | 159 | err = -EINVAL; |
160 | list_for_each_entry(rdev1, &mddev->disks, same_set) { | 160 | rdev_for_each(rdev1, mddev) { |
161 | int j = rdev1->raid_disk; | 161 | int j = rdev1->raid_disk; |
162 | 162 | ||
163 | if (mddev->level == 10) { | 163 | if (mddev->level == 10) { |
@@ -188,16 +188,10 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf) | |||
188 | 188 | ||
189 | disk_stack_limits(mddev->gendisk, rdev1->bdev, | 189 | disk_stack_limits(mddev->gendisk, rdev1->bdev, |
190 | rdev1->data_offset << 9); | 190 | rdev1->data_offset << 9); |
191 | /* as we don't honour merge_bvec_fn, we must never risk | ||
192 | * violating it, so limit ->max_segments to 1, lying within | ||
193 | * a single page. | ||
194 | */ | ||
195 | 191 | ||
196 | if (rdev1->bdev->bd_disk->queue->merge_bvec_fn) { | 192 | if (rdev1->bdev->bd_disk->queue->merge_bvec_fn) |
197 | blk_queue_max_segments(mddev->queue, 1); | 193 | conf->has_merge_bvec = 1; |
198 | blk_queue_segment_boundary(mddev->queue, | 194 | |
199 | PAGE_CACHE_SIZE - 1); | ||
200 | } | ||
201 | if (!smallest || (rdev1->sectors < smallest->sectors)) | 195 | if (!smallest || (rdev1->sectors < smallest->sectors)) |
202 | smallest = rdev1; | 196 | smallest = rdev1; |
203 | cnt++; | 197 | cnt++; |
@@ -290,8 +284,64 @@ abort: | |||
290 | return err; | 284 | return err; |
291 | } | 285 | } |
292 | 286 | ||
287 | /* Find the zone which holds a particular offset | ||
288 | * Update *sectorp to be an offset in that zone | ||
289 | */ | ||
290 | static struct strip_zone *find_zone(struct r0conf *conf, | ||
291 | sector_t *sectorp) | ||
292 | { | ||
293 | int i; | ||
294 | struct strip_zone *z = conf->strip_zone; | ||
295 | sector_t sector = *sectorp; | ||
296 | |||
297 | for (i = 0; i < conf->nr_strip_zones; i++) | ||
298 | if (sector < z[i].zone_end) { | ||
299 | if (i) | ||
300 | *sectorp = sector - z[i-1].zone_end; | ||
301 | return z + i; | ||
302 | } | ||
303 | BUG(); | ||
304 | } | ||
305 | |||
306 | /* | ||
307 | * remaps the bio to the target device. we separate two flows. | ||
308 | * power 2 flow and a general flow for the sake of performance | ||
309 | */ | ||
310 | static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone, | ||
311 | sector_t sector, sector_t *sector_offset) | ||
312 | { | ||
313 | unsigned int sect_in_chunk; | ||
314 | sector_t chunk; | ||
315 | struct r0conf *conf = mddev->private; | ||
316 | int raid_disks = conf->strip_zone[0].nb_dev; | ||
317 | unsigned int chunk_sects = mddev->chunk_sectors; | ||
318 | |||
319 | if (is_power_of_2(chunk_sects)) { | ||
320 | int chunksect_bits = ffz(~chunk_sects); | ||
321 | /* find the sector offset inside the chunk */ | ||
322 | sect_in_chunk = sector & (chunk_sects - 1); | ||
323 | sector >>= chunksect_bits; | ||
324 | /* chunk in zone */ | ||
325 | chunk = *sector_offset; | ||
326 | /* quotient is the chunk in real device*/ | ||
327 | sector_div(chunk, zone->nb_dev << chunksect_bits); | ||
328 | } else{ | ||
329 | sect_in_chunk = sector_div(sector, chunk_sects); | ||
330 | chunk = *sector_offset; | ||
331 | sector_div(chunk, chunk_sects * zone->nb_dev); | ||
332 | } | ||
333 | /* | ||
334 | * position the bio over the real device | ||
335 | * real sector = chunk in device + starting of zone | ||
336 | * + the position in the chunk | ||
337 | */ | ||
338 | *sector_offset = (chunk * chunk_sects) + sect_in_chunk; | ||
339 | return conf->devlist[(zone - conf->strip_zone)*raid_disks | ||
340 | + sector_div(sector, zone->nb_dev)]; | ||
341 | } | ||
342 | |||
293 | /** | 343 | /** |
294 | * raid0_mergeable_bvec -- tell bio layer if a two requests can be merged | 344 | * raid0_mergeable_bvec -- tell bio layer if two requests can be merged |
295 | * @q: request queue | 345 | * @q: request queue |
296 | * @bvm: properties of new bio | 346 | * @bvm: properties of new bio |
297 | * @biovec: the request that could be merged to it. | 347 | * @biovec: the request that could be merged to it. |
@@ -303,10 +353,15 @@ static int raid0_mergeable_bvec(struct request_queue *q, | |||
303 | struct bio_vec *biovec) | 353 | struct bio_vec *biovec) |
304 | { | 354 | { |
305 | struct mddev *mddev = q->queuedata; | 355 | struct mddev *mddev = q->queuedata; |
356 | struct r0conf *conf = mddev->private; | ||
306 | sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); | 357 | sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); |
358 | sector_t sector_offset = sector; | ||
307 | int max; | 359 | int max; |
308 | unsigned int chunk_sectors = mddev->chunk_sectors; | 360 | unsigned int chunk_sectors = mddev->chunk_sectors; |
309 | unsigned int bio_sectors = bvm->bi_size >> 9; | 361 | unsigned int bio_sectors = bvm->bi_size >> 9; |
362 | struct strip_zone *zone; | ||
363 | struct md_rdev *rdev; | ||
364 | struct request_queue *subq; | ||
310 | 365 | ||
311 | if (is_power_of_2(chunk_sectors)) | 366 | if (is_power_of_2(chunk_sectors)) |
312 | max = (chunk_sectors - ((sector & (chunk_sectors-1)) | 367 | max = (chunk_sectors - ((sector & (chunk_sectors-1)) |
@@ -314,10 +369,27 @@ static int raid0_mergeable_bvec(struct request_queue *q, | |||
314 | else | 369 | else |
315 | max = (chunk_sectors - (sector_div(sector, chunk_sectors) | 370 | max = (chunk_sectors - (sector_div(sector, chunk_sectors) |
316 | + bio_sectors)) << 9; | 371 | + bio_sectors)) << 9; |
317 | if (max < 0) max = 0; /* bio_add cannot handle a negative return */ | 372 | if (max < 0) |
373 | max = 0; /* bio_add cannot handle a negative return */ | ||
318 | if (max <= biovec->bv_len && bio_sectors == 0) | 374 | if (max <= biovec->bv_len && bio_sectors == 0) |
319 | return biovec->bv_len; | 375 | return biovec->bv_len; |
320 | else | 376 | if (max < biovec->bv_len) |
377 | /* too small already, no need to check further */ | ||
378 | return max; | ||
379 | if (!conf->has_merge_bvec) | ||
380 | return max; | ||
381 | |||
382 | /* May need to check subordinate device */ | ||
383 | sector = sector_offset; | ||
384 | zone = find_zone(mddev->private, &sector_offset); | ||
385 | rdev = map_sector(mddev, zone, sector, &sector_offset); | ||
386 | subq = bdev_get_queue(rdev->bdev); | ||
387 | if (subq->merge_bvec_fn) { | ||
388 | bvm->bi_bdev = rdev->bdev; | ||
389 | bvm->bi_sector = sector_offset + zone->dev_start + | ||
390 | rdev->data_offset; | ||
391 | return min(max, subq->merge_bvec_fn(subq, bvm, biovec)); | ||
392 | } else | ||
321 | return max; | 393 | return max; |
322 | } | 394 | } |
323 | 395 | ||
@@ -329,7 +401,7 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks | |||
329 | WARN_ONCE(sectors || raid_disks, | 401 | WARN_ONCE(sectors || raid_disks, |
330 | "%s does not support generic reshape\n", __func__); | 402 | "%s does not support generic reshape\n", __func__); |
331 | 403 | ||
332 | list_for_each_entry(rdev, &mddev->disks, same_set) | 404 | rdev_for_each(rdev, mddev) |
333 | array_sectors += rdev->sectors; | 405 | array_sectors += rdev->sectors; |
334 | 406 | ||
335 | return array_sectors; | 407 | return array_sectors; |
@@ -397,62 +469,6 @@ static int raid0_stop(struct mddev *mddev) | |||
397 | return 0; | 469 | return 0; |
398 | } | 470 | } |
399 | 471 | ||
400 | /* Find the zone which holds a particular offset | ||
401 | * Update *sectorp to be an offset in that zone | ||
402 | */ | ||
403 | static struct strip_zone *find_zone(struct r0conf *conf, | ||
404 | sector_t *sectorp) | ||
405 | { | ||
406 | int i; | ||
407 | struct strip_zone *z = conf->strip_zone; | ||
408 | sector_t sector = *sectorp; | ||
409 | |||
410 | for (i = 0; i < conf->nr_strip_zones; i++) | ||
411 | if (sector < z[i].zone_end) { | ||
412 | if (i) | ||
413 | *sectorp = sector - z[i-1].zone_end; | ||
414 | return z + i; | ||
415 | } | ||
416 | BUG(); | ||
417 | } | ||
418 | |||
419 | /* | ||
420 | * remaps the bio to the target device. we separate two flows. | ||
421 | * power 2 flow and a general flow for the sake of performance | ||
422 | */ | ||
423 | static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone, | ||
424 | sector_t sector, sector_t *sector_offset) | ||
425 | { | ||
426 | unsigned int sect_in_chunk; | ||
427 | sector_t chunk; | ||
428 | struct r0conf *conf = mddev->private; | ||
429 | int raid_disks = conf->strip_zone[0].nb_dev; | ||
430 | unsigned int chunk_sects = mddev->chunk_sectors; | ||
431 | |||
432 | if (is_power_of_2(chunk_sects)) { | ||
433 | int chunksect_bits = ffz(~chunk_sects); | ||
434 | /* find the sector offset inside the chunk */ | ||
435 | sect_in_chunk = sector & (chunk_sects - 1); | ||
436 | sector >>= chunksect_bits; | ||
437 | /* chunk in zone */ | ||
438 | chunk = *sector_offset; | ||
439 | /* quotient is the chunk in real device*/ | ||
440 | sector_div(chunk, zone->nb_dev << chunksect_bits); | ||
441 | } else{ | ||
442 | sect_in_chunk = sector_div(sector, chunk_sects); | ||
443 | chunk = *sector_offset; | ||
444 | sector_div(chunk, chunk_sects * zone->nb_dev); | ||
445 | } | ||
446 | /* | ||
447 | * position the bio over the real device | ||
448 | * real sector = chunk in device + starting of zone | ||
449 | * + the position in the chunk | ||
450 | */ | ||
451 | *sector_offset = (chunk * chunk_sects) + sect_in_chunk; | ||
452 | return conf->devlist[(zone - conf->strip_zone)*raid_disks | ||
453 | + sector_div(sector, zone->nb_dev)]; | ||
454 | } | ||
455 | |||
456 | /* | 472 | /* |
457 | * Is the IO distributed over 1 or more chunks? | 473 | * Is the IO distributed over 1 or more chunks? |
458 | */ | 474 | */ |
@@ -505,7 +521,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) | |||
505 | } | 521 | } |
506 | 522 | ||
507 | sector_offset = bio->bi_sector; | 523 | sector_offset = bio->bi_sector; |
508 | zone = find_zone(mddev->private, &sector_offset); | 524 | zone = find_zone(mddev->private, &sector_offset); |
509 | tmp_dev = map_sector(mddev, zone, bio->bi_sector, | 525 | tmp_dev = map_sector(mddev, zone, bio->bi_sector, |
510 | &sector_offset); | 526 | &sector_offset); |
511 | bio->bi_bdev = tmp_dev->bdev; | 527 | bio->bi_bdev = tmp_dev->bdev; |
@@ -543,7 +559,7 @@ static void *raid0_takeover_raid45(struct mddev *mddev) | |||
543 | return ERR_PTR(-EINVAL); | 559 | return ERR_PTR(-EINVAL); |
544 | } | 560 | } |
545 | 561 | ||
546 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 562 | rdev_for_each(rdev, mddev) { |
547 | /* check slot number for a disk */ | 563 | /* check slot number for a disk */ |
548 | if (rdev->raid_disk == mddev->raid_disks-1) { | 564 | if (rdev->raid_disk == mddev->raid_disks-1) { |
549 | printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n", | 565 | printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n", |
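The mergeable_bvec rework above reuses find_zone()/map_sector() (moved earlier in the file) to locate the member device a bio would land on, and only then consults that member's own merge_bvec_fn when conf->has_merge_bvec is set. A worked example of the mapping, with illustrative numbers only:

    /* Illustration only: 3-disk RAID0, a single zone starting at sector 0,
     * chunk_sects = 8 (4 KiB chunks), array sector 100.
     *
     *   sect_in_chunk = 100 & 7        = 4    offset inside its chunk
     *   chunk number  = 100 >> 3       = 12   chunk index across the array
     *   member disk   = 12 % 3         = 0    round-robin over zone->nb_dev
     *   chunk on disk = 100 / (3 * 8)  = 4    chunk index 4 on that disk
     *   member sector = 4 * 8 + 4      = 36   relative to zone->dev_start
     *
     * raid0_mergeable_bvec() performs this mapping and, if devlist[0] has a
     * merge_bvec_fn, returns min(max, merge_bvec_fn(...)) for that device.
     */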
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h index 0884bba8df4c..05539d9c97f0 100644 --- a/drivers/md/raid0.h +++ b/drivers/md/raid0.h | |||
@@ -4,13 +4,16 @@ | |||
4 | struct strip_zone { | 4 | struct strip_zone { |
5 | sector_t zone_end; /* Start of the next zone (in sectors) */ | 5 | sector_t zone_end; /* Start of the next zone (in sectors) */ |
6 | sector_t dev_start; /* Zone offset in real dev (in sectors) */ | 6 | sector_t dev_start; /* Zone offset in real dev (in sectors) */ |
7 | int nb_dev; /* # of devices attached to the zone */ | 7 | int nb_dev; /* # of devices attached to the zone */ |
8 | }; | 8 | }; |
9 | 9 | ||
10 | struct r0conf { | 10 | struct r0conf { |
11 | struct strip_zone *strip_zone; | 11 | struct strip_zone *strip_zone; |
12 | struct md_rdev **devlist; /* lists of rdevs, pointed to by strip_zone->dev */ | 12 | struct md_rdev **devlist; /* lists of rdevs, pointed to |
13 | int nr_strip_zones; | 13 | * by strip_zone->dev */ |
14 | int nr_strip_zones; | ||
15 | int has_merge_bvec; /* at least one member has | ||
16 | * a merge_bvec_fn */ | ||
14 | }; | 17 | }; |
15 | 18 | ||
16 | #endif | 19 | #endif |
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index a368db2431a5..4a40a200d769 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c | |||
@@ -523,6 +523,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect | |||
523 | rdev = rcu_dereference(conf->mirrors[disk].rdev); | 523 | rdev = rcu_dereference(conf->mirrors[disk].rdev); |
524 | if (r1_bio->bios[disk] == IO_BLOCKED | 524 | if (r1_bio->bios[disk] == IO_BLOCKED |
525 | || rdev == NULL | 525 | || rdev == NULL |
526 | || test_bit(Unmerged, &rdev->flags) | ||
526 | || test_bit(Faulty, &rdev->flags)) | 527 | || test_bit(Faulty, &rdev->flags)) |
527 | continue; | 528 | continue; |
528 | if (!test_bit(In_sync, &rdev->flags) && | 529 | if (!test_bit(In_sync, &rdev->flags) && |
@@ -614,6 +615,39 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect | |||
614 | return best_disk; | 615 | return best_disk; |
615 | } | 616 | } |
616 | 617 | ||
618 | static int raid1_mergeable_bvec(struct request_queue *q, | ||
619 | struct bvec_merge_data *bvm, | ||
620 | struct bio_vec *biovec) | ||
621 | { | ||
622 | struct mddev *mddev = q->queuedata; | ||
623 | struct r1conf *conf = mddev->private; | ||
624 | sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); | ||
625 | int max = biovec->bv_len; | ||
626 | |||
627 | if (mddev->merge_check_needed) { | ||
628 | int disk; | ||
629 | rcu_read_lock(); | ||
630 | for (disk = 0; disk < conf->raid_disks * 2; disk++) { | ||
631 | struct md_rdev *rdev = rcu_dereference( | ||
632 | conf->mirrors[disk].rdev); | ||
633 | if (rdev && !test_bit(Faulty, &rdev->flags)) { | ||
634 | struct request_queue *q = | ||
635 | bdev_get_queue(rdev->bdev); | ||
636 | if (q->merge_bvec_fn) { | ||
637 | bvm->bi_sector = sector + | ||
638 | rdev->data_offset; | ||
639 | bvm->bi_bdev = rdev->bdev; | ||
640 | max = min(max, q->merge_bvec_fn( | ||
641 | q, bvm, biovec)); | ||
642 | } | ||
643 | } | ||
644 | } | ||
645 | rcu_read_unlock(); | ||
646 | } | ||
647 | return max; | ||
648 | |||
649 | } | ||
650 | |||
617 | int md_raid1_congested(struct mddev *mddev, int bits) | 651 | int md_raid1_congested(struct mddev *mddev, int bits) |
618 | { | 652 | { |
619 | struct r1conf *conf = mddev->private; | 653 | struct r1conf *conf = mddev->private; |
@@ -624,7 +658,7 @@ int md_raid1_congested(struct mddev *mddev, int bits) | |||
624 | return 1; | 658 | return 1; |
625 | 659 | ||
626 | rcu_read_lock(); | 660 | rcu_read_lock(); |
627 | for (i = 0; i < conf->raid_disks; i++) { | 661 | for (i = 0; i < conf->raid_disks * 2; i++) { |
628 | struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); | 662 | struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); |
629 | if (rdev && !test_bit(Faulty, &rdev->flags)) { | 663 | if (rdev && !test_bit(Faulty, &rdev->flags)) { |
630 | struct request_queue *q = bdev_get_queue(rdev->bdev); | 664 | struct request_queue *q = bdev_get_queue(rdev->bdev); |
@@ -737,9 +771,22 @@ static void wait_barrier(struct r1conf *conf) | |||
737 | spin_lock_irq(&conf->resync_lock); | 771 | spin_lock_irq(&conf->resync_lock); |
738 | if (conf->barrier) { | 772 | if (conf->barrier) { |
739 | conf->nr_waiting++; | 773 | conf->nr_waiting++; |
740 | wait_event_lock_irq(conf->wait_barrier, !conf->barrier, | 774 | /* Wait for the barrier to drop. |
775 | * However if there are already pending | ||
776 | * requests (preventing the barrier from | ||
777 | * rising completely), and the | ||
778 | * pre-process bio queue isn't empty, | ||
779 | * then don't wait, as we need to empty | ||
780 | * that queue to get the nr_pending | ||
781 | * count down. | ||
782 | */ | ||
783 | wait_event_lock_irq(conf->wait_barrier, | ||
784 | !conf->barrier || | ||
785 | (conf->nr_pending && | ||
786 | current->bio_list && | ||
787 | !bio_list_empty(current->bio_list)), | ||
741 | conf->resync_lock, | 788 | conf->resync_lock, |
742 | ); | 789 | ); |
743 | conf->nr_waiting--; | 790 | conf->nr_waiting--; |
744 | } | 791 | } |
745 | conf->nr_pending++; | 792 | conf->nr_pending++; |
@@ -1002,7 +1049,8 @@ read_again: | |||
1002 | break; | 1049 | break; |
1003 | } | 1050 | } |
1004 | r1_bio->bios[i] = NULL; | 1051 | r1_bio->bios[i] = NULL; |
1005 | if (!rdev || test_bit(Faulty, &rdev->flags)) { | 1052 | if (!rdev || test_bit(Faulty, &rdev->flags) |
1053 | || test_bit(Unmerged, &rdev->flags)) { | ||
1006 | if (i < conf->raid_disks) | 1054 | if (i < conf->raid_disks) |
1007 | set_bit(R1BIO_Degraded, &r1_bio->state); | 1055 | set_bit(R1BIO_Degraded, &r1_bio->state); |
1008 | continue; | 1056 | continue; |
@@ -1322,6 +1370,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
1322 | struct mirror_info *p; | 1370 | struct mirror_info *p; |
1323 | int first = 0; | 1371 | int first = 0; |
1324 | int last = conf->raid_disks - 1; | 1372 | int last = conf->raid_disks - 1; |
1373 | struct request_queue *q = bdev_get_queue(rdev->bdev); | ||
1325 | 1374 | ||
1326 | if (mddev->recovery_disabled == conf->recovery_disabled) | 1375 | if (mddev->recovery_disabled == conf->recovery_disabled) |
1327 | return -EBUSY; | 1376 | return -EBUSY; |
@@ -1329,23 +1378,17 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
1329 | if (rdev->raid_disk >= 0) | 1378 | if (rdev->raid_disk >= 0) |
1330 | first = last = rdev->raid_disk; | 1379 | first = last = rdev->raid_disk; |
1331 | 1380 | ||
1381 | if (q->merge_bvec_fn) { | ||
1382 | set_bit(Unmerged, &rdev->flags); | ||
1383 | mddev->merge_check_needed = 1; | ||
1384 | } | ||
1385 | |||
1332 | for (mirror = first; mirror <= last; mirror++) { | 1386 | for (mirror = first; mirror <= last; mirror++) { |
1333 | p = conf->mirrors+mirror; | 1387 | p = conf->mirrors+mirror; |
1334 | if (!p->rdev) { | 1388 | if (!p->rdev) { |
1335 | 1389 | ||
1336 | disk_stack_limits(mddev->gendisk, rdev->bdev, | 1390 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
1337 | rdev->data_offset << 9); | 1391 | rdev->data_offset << 9); |
1338 | /* as we don't honour merge_bvec_fn, we must | ||
1339 | * never risk violating it, so limit | ||
1340 | * ->max_segments to one lying with a single | ||
1341 | * page, as a one page request is never in | ||
1342 | * violation. | ||
1343 | */ | ||
1344 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { | ||
1345 | blk_queue_max_segments(mddev->queue, 1); | ||
1346 | blk_queue_segment_boundary(mddev->queue, | ||
1347 | PAGE_CACHE_SIZE - 1); | ||
1348 | } | ||
1349 | 1392 | ||
1350 | p->head_position = 0; | 1393 | p->head_position = 0; |
1351 | rdev->raid_disk = mirror; | 1394 | rdev->raid_disk = mirror; |
@@ -1370,6 +1413,19 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
1370 | break; | 1413 | break; |
1371 | } | 1414 | } |
1372 | } | 1415 | } |
1416 | if (err == 0 && test_bit(Unmerged, &rdev->flags)) { | ||
1417 | /* Some requests might not have seen this new | ||
1418 | * merge_bvec_fn. We must wait for them to complete | ||
1419 | * before merging the device fully. | ||
1420 | * First we make sure any code which has tested | ||
1421 | * our function has submitted the request, then | ||
1422 | * we wait for all outstanding requests to complete. | ||
1423 | */ | ||
1424 | synchronize_sched(); | ||
1425 | raise_barrier(conf); | ||
1426 | lower_barrier(conf); | ||
1427 | clear_bit(Unmerged, &rdev->flags); | ||
1428 | } | ||
1373 | md_integrity_add_rdev(rdev, mddev); | 1429 | md_integrity_add_rdev(rdev, mddev); |
1374 | print_conf(conf); | 1430 | print_conf(conf); |
1375 | return err; | 1431 | return err; |
@@ -2491,7 +2547,7 @@ static struct r1conf *setup_conf(struct mddev *mddev) | |||
2491 | 2547 | ||
2492 | err = -EINVAL; | 2548 | err = -EINVAL; |
2493 | spin_lock_init(&conf->device_lock); | 2549 | spin_lock_init(&conf->device_lock); |
2494 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 2550 | rdev_for_each(rdev, mddev) { |
2495 | int disk_idx = rdev->raid_disk; | 2551 | int disk_idx = rdev->raid_disk; |
2496 | if (disk_idx >= mddev->raid_disks | 2552 | if (disk_idx >= mddev->raid_disks |
2497 | || disk_idx < 0) | 2553 | || disk_idx < 0) |
@@ -2609,20 +2665,11 @@ static int run(struct mddev *mddev) | |||
2609 | if (IS_ERR(conf)) | 2665 | if (IS_ERR(conf)) |
2610 | return PTR_ERR(conf); | 2666 | return PTR_ERR(conf); |
2611 | 2667 | ||
2612 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 2668 | rdev_for_each(rdev, mddev) { |
2613 | if (!mddev->gendisk) | 2669 | if (!mddev->gendisk) |
2614 | continue; | 2670 | continue; |
2615 | disk_stack_limits(mddev->gendisk, rdev->bdev, | 2671 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
2616 | rdev->data_offset << 9); | 2672 | rdev->data_offset << 9); |
2617 | /* as we don't honour merge_bvec_fn, we must never risk | ||
2618 | * violating it, so limit ->max_segments to 1 lying within | ||
2619 | * a single page, as a one page request is never in violation. | ||
2620 | */ | ||
2621 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { | ||
2622 | blk_queue_max_segments(mddev->queue, 1); | ||
2623 | blk_queue_segment_boundary(mddev->queue, | ||
2624 | PAGE_CACHE_SIZE - 1); | ||
2625 | } | ||
2626 | } | 2673 | } |
2627 | 2674 | ||
2628 | mddev->degraded = 0; | 2675 | mddev->degraded = 0; |
@@ -2656,6 +2703,7 @@ static int run(struct mddev *mddev) | |||
2656 | if (mddev->queue) { | 2703 | if (mddev->queue) { |
2657 | mddev->queue->backing_dev_info.congested_fn = raid1_congested; | 2704 | mddev->queue->backing_dev_info.congested_fn = raid1_congested; |
2658 | mddev->queue->backing_dev_info.congested_data = mddev; | 2705 | mddev->queue->backing_dev_info.congested_data = mddev; |
2706 | blk_queue_merge_bvec(mddev->queue, raid1_mergeable_bvec); | ||
2659 | } | 2707 | } |
2660 | return md_integrity_register(mddev); | 2708 | return md_integrity_register(mddev); |
2661 | } | 2709 | } |
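The raid1 changes above gate real I/O on the new Unmerged flag: a device whose queue has a merge_bvec_fn is first exposed only to raid1_mergeable_bvec(), and is opened up for reads and writes once every request that predates it has drained. Condensed into a sketch (a restatement of raid1_add_disk() above, not additional patch content):

    /* Sketch of the Unmerged handshake in raid1_add_disk(). */
    if (bdev_get_queue(rdev->bdev)->merge_bvec_fn) {
            set_bit(Unmerged, &rdev->flags);   /* skipped by read_balance/make_request */
            mddev->merge_check_needed = 1;     /* raid1_mergeable_bvec starts checking */
    }
    /* ... bind rdev to a free mirror slot ... */
    if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
            synchronize_sched();               /* merge decisions already made get submitted */
            raise_barrier(conf);               /* then wait for those requests to finish */
            lower_barrier(conf);
            clear_bit(Unmerged, &rdev->flags); /* normal I/O may now reach rdev */
    }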
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 6e8aa213f0d5..3540316886f2 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c | |||
@@ -67,6 +67,7 @@ static int max_queued_requests = 1024; | |||
67 | 67 | ||
68 | static void allow_barrier(struct r10conf *conf); | 68 | static void allow_barrier(struct r10conf *conf); |
69 | static void lower_barrier(struct r10conf *conf); | 69 | static void lower_barrier(struct r10conf *conf); |
70 | static int enough(struct r10conf *conf, int ignore); | ||
70 | 71 | ||
71 | static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) | 72 | static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) |
72 | { | 73 | { |
@@ -347,6 +348,19 @@ static void raid10_end_read_request(struct bio *bio, int error) | |||
347 | * wait for the 'master' bio. | 348 | * wait for the 'master' bio. |
348 | */ | 349 | */ |
349 | set_bit(R10BIO_Uptodate, &r10_bio->state); | 350 | set_bit(R10BIO_Uptodate, &r10_bio->state); |
351 | } else { | ||
352 | /* If all other devices that store this block have | ||
353 | * failed, we want to return the error upwards rather | ||
354 | * than fail the last device. Here we redefine | ||
355 | * "uptodate" to mean "Don't want to retry" | ||
356 | */ | ||
357 | unsigned long flags; | ||
358 | spin_lock_irqsave(&conf->device_lock, flags); | ||
359 | if (!enough(conf, rdev->raid_disk)) | ||
360 | uptodate = 1; | ||
361 | spin_unlock_irqrestore(&conf->device_lock, flags); | ||
362 | } | ||
363 | if (uptodate) { | ||
350 | raid_end_bio_io(r10_bio); | 364 | raid_end_bio_io(r10_bio); |
351 | rdev_dec_pending(rdev, conf->mddev); | 365 | rdev_dec_pending(rdev, conf->mddev); |
352 | } else { | 366 | } else { |
@@ -572,25 +586,68 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev) | |||
572 | * @biovec: the request that could be merged to it. | 586 | * @biovec: the request that could be merged to it. |
573 | * | 587 | * |
574 | * Return amount of bytes we can accept at this offset | 588 | * Return amount of bytes we can accept at this offset |
575 | * If near_copies == raid_disk, there are no striping issues, | 589 | * This requires checking for end-of-chunk if near_copies != raid_disks, |
576 | * but in that case, the function isn't called at all. | 590 | * and for subordinate merge_bvec_fns if merge_check_needed. |
577 | */ | 591 | */ |
578 | static int raid10_mergeable_bvec(struct request_queue *q, | 592 | static int raid10_mergeable_bvec(struct request_queue *q, |
579 | struct bvec_merge_data *bvm, | 593 | struct bvec_merge_data *bvm, |
580 | struct bio_vec *biovec) | 594 | struct bio_vec *biovec) |
581 | { | 595 | { |
582 | struct mddev *mddev = q->queuedata; | 596 | struct mddev *mddev = q->queuedata; |
597 | struct r10conf *conf = mddev->private; | ||
583 | sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); | 598 | sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); |
584 | int max; | 599 | int max; |
585 | unsigned int chunk_sectors = mddev->chunk_sectors; | 600 | unsigned int chunk_sectors = mddev->chunk_sectors; |
586 | unsigned int bio_sectors = bvm->bi_size >> 9; | 601 | unsigned int bio_sectors = bvm->bi_size >> 9; |
587 | 602 | ||
588 | max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; | 603 | if (conf->near_copies < conf->raid_disks) { |
589 | if (max < 0) max = 0; /* bio_add cannot handle a negative return */ | 604 | max = (chunk_sectors - ((sector & (chunk_sectors - 1)) |
590 | if (max <= biovec->bv_len && bio_sectors == 0) | 605 | + bio_sectors)) << 9; |
591 | return biovec->bv_len; | 606 | if (max < 0) |
592 | else | 607 | /* bio_add cannot handle a negative return */ |
593 | return max; | 608 | max = 0; |
609 | if (max <= biovec->bv_len && bio_sectors == 0) | ||
610 | return biovec->bv_len; | ||
611 | } else | ||
612 | max = biovec->bv_len; | ||
613 | |||
614 | if (mddev->merge_check_needed) { | ||
615 | struct r10bio r10_bio; | ||
616 | int s; | ||
617 | r10_bio.sector = sector; | ||
618 | raid10_find_phys(conf, &r10_bio); | ||
619 | rcu_read_lock(); | ||
620 | for (s = 0; s < conf->copies; s++) { | ||
621 | int disk = r10_bio.devs[s].devnum; | ||
622 | struct md_rdev *rdev = rcu_dereference( | ||
623 | conf->mirrors[disk].rdev); | ||
624 | if (rdev && !test_bit(Faulty, &rdev->flags)) { | ||
625 | struct request_queue *q = | ||
626 | bdev_get_queue(rdev->bdev); | ||
627 | if (q->merge_bvec_fn) { | ||
628 | bvm->bi_sector = r10_bio.devs[s].addr | ||
629 | + rdev->data_offset; | ||
630 | bvm->bi_bdev = rdev->bdev; | ||
631 | max = min(max, q->merge_bvec_fn( | ||
632 | q, bvm, biovec)); | ||
633 | } | ||
634 | } | ||
635 | rdev = rcu_dereference(conf->mirrors[disk].replacement); | ||
636 | if (rdev && !test_bit(Faulty, &rdev->flags)) { | ||
637 | struct request_queue *q = | ||
638 | bdev_get_queue(rdev->bdev); | ||
639 | if (q->merge_bvec_fn) { | ||
640 | bvm->bi_sector = r10_bio.devs[s].addr | ||
641 | + rdev->data_offset; | ||
642 | bvm->bi_bdev = rdev->bdev; | ||
643 | max = min(max, q->merge_bvec_fn( | ||
644 | q, bvm, biovec)); | ||
645 | } | ||
646 | } | ||
647 | } | ||
648 | rcu_read_unlock(); | ||
649 | } | ||
650 | return max; | ||
594 | } | 651 | } |
595 | 652 | ||
596 | /* | 653 | /* |
@@ -654,11 +711,12 @@ retry: | |||
654 | disk = r10_bio->devs[slot].devnum; | 711 | disk = r10_bio->devs[slot].devnum; |
655 | rdev = rcu_dereference(conf->mirrors[disk].replacement); | 712 | rdev = rcu_dereference(conf->mirrors[disk].replacement); |
656 | if (rdev == NULL || test_bit(Faulty, &rdev->flags) || | 713 | if (rdev == NULL || test_bit(Faulty, &rdev->flags) || |
714 | test_bit(Unmerged, &rdev->flags) || | ||
657 | r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) | 715 | r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) |
658 | rdev = rcu_dereference(conf->mirrors[disk].rdev); | 716 | rdev = rcu_dereference(conf->mirrors[disk].rdev); |
659 | if (rdev == NULL) | 717 | if (rdev == NULL || |
660 | continue; | 718 | test_bit(Faulty, &rdev->flags) || |
661 | if (test_bit(Faulty, &rdev->flags)) | 719 | test_bit(Unmerged, &rdev->flags)) |
662 | continue; | 720 | continue; |
663 | if (!test_bit(In_sync, &rdev->flags) && | 721 | if (!test_bit(In_sync, &rdev->flags) && |
664 | r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) | 722 | r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) |
@@ -849,9 +907,22 @@ static void wait_barrier(struct r10conf *conf) | |||
849 | spin_lock_irq(&conf->resync_lock); | 907 | spin_lock_irq(&conf->resync_lock); |
850 | if (conf->barrier) { | 908 | if (conf->barrier) { |
851 | conf->nr_waiting++; | 909 | conf->nr_waiting++; |
852 | wait_event_lock_irq(conf->wait_barrier, !conf->barrier, | 910 | /* Wait for the barrier to drop. |
911 | * However if there are already pending | ||
912 | * requests (preventing the barrier from | ||
913 | * rising completely), and the | ||
914 | * pre-process bio queue isn't empty, | ||
915 | * then don't wait, as we need to empty | ||
916 | * that queue to get the nr_pending | ||
917 | * count down. | ||
918 | */ | ||
919 | wait_event_lock_irq(conf->wait_barrier, | ||
920 | !conf->barrier || | ||
921 | (conf->nr_pending && | ||
922 | current->bio_list && | ||
923 | !bio_list_empty(current->bio_list)), | ||
853 | conf->resync_lock, | 924 | conf->resync_lock, |
854 | ); | 925 | ); |
855 | conf->nr_waiting--; | 926 | conf->nr_waiting--; |
856 | } | 927 | } |
857 | conf->nr_pending++; | 928 | conf->nr_pending++; |
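The new wait condition in wait_barrier() above encodes a deadlock-avoidance rule: if this thread still holds unsubmitted bios on current->bio_list while pending requests are keeping the barrier from rising completely, sleeping here could never be woken, so the waiter is let through. A small model of that predicate, using simplified stand-in fields rather than the real r10conf layout:

    #include <stdbool.h>
    #include <stdio.h>

    /* Illustrative mirror of the fields the predicate looks at. */
    struct barrier_state {
        int  barrier;            /* resync wants exclusive access        */
        int  nr_pending;         /* normal I/O already inside the array  */
        bool bio_list_nonempty;  /* this thread still has bios to submit */
    };

    /* True when wait_barrier() may proceed (or stop sleeping): either no
     * barrier is requested, or waiting would deadlock because the barrier
     * cannot rise until this thread flushes the bios it is still holding
     * on current->bio_list. */
    static bool may_proceed(const struct barrier_state *s)
    {
        return !s->barrier ||
               (s->nr_pending && s->bio_list_nonempty);
    }

    int main(void)
    {
        struct barrier_state s = { .barrier = 1, .nr_pending = 3,
                                   .bio_list_nonempty = true };
        printf("proceed: %d\n", may_proceed(&s));   /* 1: avoid the deadlock */
        s.bio_list_nonempty = false;
        printf("proceed: %d\n", may_proceed(&s));   /* 0: wait as before     */
        return 0;
    }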
@@ -1107,12 +1178,14 @@ retry_write: | |||
1107 | blocked_rdev = rrdev; | 1178 | blocked_rdev = rrdev; |
1108 | break; | 1179 | break; |
1109 | } | 1180 | } |
1110 | if (rrdev && test_bit(Faulty, &rrdev->flags)) | 1181 | if (rrdev && (test_bit(Faulty, &rrdev->flags) |
1182 | || test_bit(Unmerged, &rrdev->flags))) | ||
1111 | rrdev = NULL; | 1183 | rrdev = NULL; |
1112 | 1184 | ||
1113 | r10_bio->devs[i].bio = NULL; | 1185 | r10_bio->devs[i].bio = NULL; |
1114 | r10_bio->devs[i].repl_bio = NULL; | 1186 | r10_bio->devs[i].repl_bio = NULL; |
1115 | if (!rdev || test_bit(Faulty, &rdev->flags)) { | 1187 | if (!rdev || test_bit(Faulty, &rdev->flags) || |
1188 | test_bit(Unmerged, &rdev->flags)) { | ||
1116 | set_bit(R10BIO_Degraded, &r10_bio->state); | 1189 | set_bit(R10BIO_Degraded, &r10_bio->state); |
1117 | continue; | 1190 | continue; |
1118 | } | 1191 | } |
@@ -1463,18 +1536,24 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
1463 | int mirror; | 1536 | int mirror; |
1464 | int first = 0; | 1537 | int first = 0; |
1465 | int last = conf->raid_disks - 1; | 1538 | int last = conf->raid_disks - 1; |
1539 | struct request_queue *q = bdev_get_queue(rdev->bdev); | ||
1466 | 1540 | ||
1467 | if (mddev->recovery_cp < MaxSector) | 1541 | if (mddev->recovery_cp < MaxSector) |
1468 | /* only hot-add to in-sync arrays, as recovery is | 1542 | /* only hot-add to in-sync arrays, as recovery is |
1469 | * very different from resync | 1543 | * very different from resync |
1470 | */ | 1544 | */ |
1471 | return -EBUSY; | 1545 | return -EBUSY; |
1472 | if (!enough(conf, -1)) | 1546 | if (rdev->saved_raid_disk < 0 && !enough(conf, -1)) |
1473 | return -EINVAL; | 1547 | return -EINVAL; |
1474 | 1548 | ||
1475 | if (rdev->raid_disk >= 0) | 1549 | if (rdev->raid_disk >= 0) |
1476 | first = last = rdev->raid_disk; | 1550 | first = last = rdev->raid_disk; |
1477 | 1551 | ||
1552 | if (q->merge_bvec_fn) { | ||
1553 | set_bit(Unmerged, &rdev->flags); | ||
1554 | mddev->merge_check_needed = 1; | ||
1555 | } | ||
1556 | |||
1478 | if (rdev->saved_raid_disk >= first && | 1557 | if (rdev->saved_raid_disk >= first && |
1479 | conf->mirrors[rdev->saved_raid_disk].rdev == NULL) | 1558 | conf->mirrors[rdev->saved_raid_disk].rdev == NULL) |
1480 | mirror = rdev->saved_raid_disk; | 1559 | mirror = rdev->saved_raid_disk; |
@@ -1494,11 +1573,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
1494 | err = 0; | 1573 | err = 0; |
1495 | disk_stack_limits(mddev->gendisk, rdev->bdev, | 1574 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
1496 | rdev->data_offset << 9); | 1575 | rdev->data_offset << 9); |
1497 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { | ||
1498 | blk_queue_max_segments(mddev->queue, 1); | ||
1499 | blk_queue_segment_boundary(mddev->queue, | ||
1500 | PAGE_CACHE_SIZE - 1); | ||
1501 | } | ||
1502 | conf->fullsync = 1; | 1576 | conf->fullsync = 1; |
1503 | rcu_assign_pointer(p->replacement, rdev); | 1577 | rcu_assign_pointer(p->replacement, rdev); |
1504 | break; | 1578 | break; |
@@ -1506,17 +1580,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
1506 | 1580 | ||
1507 | disk_stack_limits(mddev->gendisk, rdev->bdev, | 1581 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
1508 | rdev->data_offset << 9); | 1582 | rdev->data_offset << 9); |
1509 | /* as we don't honour merge_bvec_fn, we must | ||
1510 | * never risk violating it, so limit | ||
1511 | * ->max_segments to one lying with a single | ||
1512 | * page, as a one page request is never in | ||
1513 | * violation. | ||
1514 | */ | ||
1515 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { | ||
1516 | blk_queue_max_segments(mddev->queue, 1); | ||
1517 | blk_queue_segment_boundary(mddev->queue, | ||
1518 | PAGE_CACHE_SIZE - 1); | ||
1519 | } | ||
1520 | 1583 | ||
1521 | p->head_position = 0; | 1584 | p->head_position = 0; |
1522 | p->recovery_disabled = mddev->recovery_disabled - 1; | 1585 | p->recovery_disabled = mddev->recovery_disabled - 1; |
@@ -1527,7 +1590,19 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
1527 | rcu_assign_pointer(p->rdev, rdev); | 1590 | rcu_assign_pointer(p->rdev, rdev); |
1528 | break; | 1591 | break; |
1529 | } | 1592 | } |
1530 | 1593 | if (err == 0 && test_bit(Unmerged, &rdev->flags)) { | |
1594 | /* Some requests might not have seen this new | ||
1595 | * merge_bvec_fn. We must wait for them to complete | ||
1596 | * before merging the device fully. | ||
1597 | * First we make sure any code which has tested | ||
1598 | * our function has submitted the request, then | ||
1599 | * we wait for all outstanding requests to complete. | ||
1600 | */ | ||
1601 | synchronize_sched(); | ||
1602 | raise_barrier(conf, 0); | ||
1603 | lower_barrier(conf); | ||
1604 | clear_bit(Unmerged, &rdev->flags); | ||
1605 | } | ||
1531 | md_integrity_add_rdev(rdev, mddev); | 1606 | md_integrity_add_rdev(rdev, mddev); |
1532 | print_conf(conf); | 1607 | print_conf(conf); |
1533 | return err; | 1608 | return err; |
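Together with the read_balance and write-path hunks above, raid10_add_disk() now gives a hot-added device that exports its own merge_bvec_fn an "Unmerged" grace period: normal I/O skips the device until synchronize_sched() plus a raise_barrier()/lower_barrier() pair has drained any requests that may have been sized under the old merge rules, and only then is the bit cleared. A hedged sketch of the resulting eligibility test (field and function names are illustrative, not the md API):

    #include <stdbool.h>
    #include <stdio.h>

    /* Illustrative device flags mirroring the checks added in the hunks
     * above. */
    struct dev {
        bool faulty;
        bool in_sync;
        bool unmerged;   /* freshly added, merge rules not yet guaranteed */
    };

    /* A device may serve normal I/O only if it is healthy, in sync, and
     * its Unmerged period is over: raid10_add_disk() sets the bit when
     * the device is added and clears it only after the barrier pair has
     * drained outstanding requests. */
    static bool usable_for_io(const struct dev *d)
    {
        return !d->faulty && !d->unmerged && d->in_sync;
    }

    int main(void)
    {
        struct dev just_added = { .faulty = false, .in_sync = true,
                                  .unmerged = true };
        printf("before barrier drain: %d\n", usable_for_io(&just_added)); /* 0 */
        just_added.unmerged = false;     /* cleared at the end of add_disk */
        printf("after barrier drain:  %d\n", usable_for_io(&just_added)); /* 1 */
        return 0;
    }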
@@ -1668,10 +1743,8 @@ static void end_sync_write(struct bio *bio, int error) | |||
1668 | d = find_bio_disk(conf, r10_bio, bio, &slot, &repl); | 1743 | d = find_bio_disk(conf, r10_bio, bio, &slot, &repl); |
1669 | if (repl) | 1744 | if (repl) |
1670 | rdev = conf->mirrors[d].replacement; | 1745 | rdev = conf->mirrors[d].replacement; |
1671 | if (!rdev) { | 1746 | else |
1672 | smp_mb(); | ||
1673 | rdev = conf->mirrors[d].rdev; | 1747 | rdev = conf->mirrors[d].rdev; |
1674 | } | ||
1675 | 1748 | ||
1676 | if (!uptodate) { | 1749 | if (!uptodate) { |
1677 | if (repl) | 1750 | if (repl) |
@@ -2052,6 +2125,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 | |||
2052 | "md/raid10:%s: %s: Failing raid device\n", | 2125 | "md/raid10:%s: %s: Failing raid device\n", |
2053 | mdname(mddev), b); | 2126 | mdname(mddev), b); |
2054 | md_error(mddev, conf->mirrors[d].rdev); | 2127 | md_error(mddev, conf->mirrors[d].rdev); |
2128 | r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED; | ||
2055 | return; | 2129 | return; |
2056 | } | 2130 | } |
2057 | 2131 | ||
@@ -2072,6 +2146,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 | |||
2072 | d = r10_bio->devs[sl].devnum; | 2146 | d = r10_bio->devs[sl].devnum; |
2073 | rdev = rcu_dereference(conf->mirrors[d].rdev); | 2147 | rdev = rcu_dereference(conf->mirrors[d].rdev); |
2074 | if (rdev && | 2148 | if (rdev && |
2149 | !test_bit(Unmerged, &rdev->flags) && | ||
2075 | test_bit(In_sync, &rdev->flags) && | 2150 | test_bit(In_sync, &rdev->flags) && |
2076 | is_badblock(rdev, r10_bio->devs[sl].addr + sect, s, | 2151 | is_badblock(rdev, r10_bio->devs[sl].addr + sect, s, |
2077 | &first_bad, &bad_sectors) == 0) { | 2152 | &first_bad, &bad_sectors) == 0) { |
@@ -2105,8 +2180,11 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 | |||
2105 | rdev, | 2180 | rdev, |
2106 | r10_bio->devs[r10_bio->read_slot].addr | 2181 | r10_bio->devs[r10_bio->read_slot].addr |
2107 | + sect, | 2182 | + sect, |
2108 | s, 0)) | 2183 | s, 0)) { |
2109 | md_error(mddev, rdev); | 2184 | md_error(mddev, rdev); |
2185 | r10_bio->devs[r10_bio->read_slot].bio | ||
2186 | = IO_BLOCKED; | ||
2187 | } | ||
2110 | break; | 2188 | break; |
2111 | } | 2189 | } |
2112 | 2190 | ||
@@ -2122,6 +2200,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10 | |||
2122 | d = r10_bio->devs[sl].devnum; | 2200 | d = r10_bio->devs[sl].devnum; |
2123 | rdev = rcu_dereference(conf->mirrors[d].rdev); | 2201 | rdev = rcu_dereference(conf->mirrors[d].rdev); |
2124 | if (!rdev || | 2202 | if (!rdev || |
2203 | test_bit(Unmerged, &rdev->flags) || | ||
2125 | !test_bit(In_sync, &rdev->flags)) | 2204 | !test_bit(In_sync, &rdev->flags)) |
2126 | continue; | 2205 | continue; |
2127 | 2206 | ||
@@ -2299,17 +2378,20 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio) | |||
2299 | * This is all done synchronously while the array is | 2378 | * This is all done synchronously while the array is |
2300 | * frozen. | 2379 | * frozen. |
2301 | */ | 2380 | */ |
2381 | bio = r10_bio->devs[slot].bio; | ||
2382 | bdevname(bio->bi_bdev, b); | ||
2383 | bio_put(bio); | ||
2384 | r10_bio->devs[slot].bio = NULL; | ||
2385 | |||
2302 | if (mddev->ro == 0) { | 2386 | if (mddev->ro == 0) { |
2303 | freeze_array(conf); | 2387 | freeze_array(conf); |
2304 | fix_read_error(conf, mddev, r10_bio); | 2388 | fix_read_error(conf, mddev, r10_bio); |
2305 | unfreeze_array(conf); | 2389 | unfreeze_array(conf); |
2306 | } | 2390 | } else |
2391 | r10_bio->devs[slot].bio = IO_BLOCKED; | ||
2392 | |||
2307 | rdev_dec_pending(rdev, mddev); | 2393 | rdev_dec_pending(rdev, mddev); |
2308 | 2394 | ||
2309 | bio = r10_bio->devs[slot].bio; | ||
2310 | bdevname(bio->bi_bdev, b); | ||
2311 | r10_bio->devs[slot].bio = | ||
2312 | mddev->ro ? IO_BLOCKED : NULL; | ||
2313 | read_more: | 2395 | read_more: |
2314 | rdev = read_balance(conf, r10_bio, &max_sectors); | 2396 | rdev = read_balance(conf, r10_bio, &max_sectors); |
2315 | if (rdev == NULL) { | 2397 | if (rdev == NULL) { |
@@ -2318,13 +2400,10 @@ read_more: | |||
2318 | mdname(mddev), b, | 2400 | mdname(mddev), b, |
2319 | (unsigned long long)r10_bio->sector); | 2401 | (unsigned long long)r10_bio->sector); |
2320 | raid_end_bio_io(r10_bio); | 2402 | raid_end_bio_io(r10_bio); |
2321 | bio_put(bio); | ||
2322 | return; | 2403 | return; |
2323 | } | 2404 | } |
2324 | 2405 | ||
2325 | do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC); | 2406 | do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC); |
2326 | if (bio) | ||
2327 | bio_put(bio); | ||
2328 | slot = r10_bio->read_slot; | 2407 | slot = r10_bio->read_slot; |
2329 | printk_ratelimited( | 2408 | printk_ratelimited( |
2330 | KERN_ERR | 2409 | KERN_ERR |
@@ -2360,7 +2439,6 @@ read_more: | |||
2360 | mbio->bi_phys_segments++; | 2439 | mbio->bi_phys_segments++; |
2361 | spin_unlock_irq(&conf->device_lock); | 2440 | spin_unlock_irq(&conf->device_lock); |
2362 | generic_make_request(bio); | 2441 | generic_make_request(bio); |
2363 | bio = NULL; | ||
2364 | 2442 | ||
2365 | r10_bio = mempool_alloc(conf->r10bio_pool, | 2443 | r10_bio = mempool_alloc(conf->r10bio_pool, |
2366 | GFP_NOIO); | 2444 | GFP_NOIO); |
@@ -3225,7 +3303,7 @@ static int run(struct mddev *mddev) | |||
3225 | blk_queue_io_opt(mddev->queue, chunk_size * | 3303 | blk_queue_io_opt(mddev->queue, chunk_size * |
3226 | (conf->raid_disks / conf->near_copies)); | 3304 | (conf->raid_disks / conf->near_copies)); |
3227 | 3305 | ||
3228 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 3306 | rdev_for_each(rdev, mddev) { |
3229 | 3307 | ||
3230 | disk_idx = rdev->raid_disk; | 3308 | disk_idx = rdev->raid_disk; |
3231 | if (disk_idx >= conf->raid_disks | 3309 | if (disk_idx >= conf->raid_disks |
@@ -3243,18 +3321,8 @@ static int run(struct mddev *mddev) | |||
3243 | disk->rdev = rdev; | 3321 | disk->rdev = rdev; |
3244 | } | 3322 | } |
3245 | 3323 | ||
3246 | disk->rdev = rdev; | ||
3247 | disk_stack_limits(mddev->gendisk, rdev->bdev, | 3324 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
3248 | rdev->data_offset << 9); | 3325 | rdev->data_offset << 9); |
3249 | /* as we don't honour merge_bvec_fn, we must never risk | ||
3250 | * violating it, so limit max_segments to 1 lying | ||
3251 | * within a single page. | ||
3252 | */ | ||
3253 | if (rdev->bdev->bd_disk->queue->merge_bvec_fn) { | ||
3254 | blk_queue_max_segments(mddev->queue, 1); | ||
3255 | blk_queue_segment_boundary(mddev->queue, | ||
3256 | PAGE_CACHE_SIZE - 1); | ||
3257 | } | ||
3258 | 3326 | ||
3259 | disk->head_position = 0; | 3327 | disk->head_position = 0; |
3260 | } | 3328 | } |
@@ -3318,8 +3386,7 @@ static int run(struct mddev *mddev) | |||
3318 | mddev->queue->backing_dev_info.ra_pages = 2* stripe; | 3386 | mddev->queue->backing_dev_info.ra_pages = 2* stripe; |
3319 | } | 3387 | } |
3320 | 3388 | ||
3321 | if (conf->near_copies < conf->raid_disks) | 3389 | blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); |
3322 | blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); | ||
3323 | 3390 | ||
3324 | if (md_integrity_register(mddev)) | 3391 | if (md_integrity_register(mddev)) |
3325 | goto out_free_conf; | 3392 | goto out_free_conf; |
@@ -3369,6 +3436,43 @@ static void raid10_quiesce(struct mddev *mddev, int state) | |||
3369 | } | 3436 | } |
3370 | } | 3437 | } |
3371 | 3438 | ||
3439 | static int raid10_resize(struct mddev *mddev, sector_t sectors) | ||
3440 | { | ||
3441 | /* Resize of 'far' arrays is not supported. | ||
3442 | * For 'near' and 'offset' arrays we can set the | ||
3443 | * number of sectors used to be an appropriate multiple | ||
3444 | * of the chunk size. | ||
3445 | * For 'offset', this is far_copies*chunksize. | ||
3446 | * For 'near' the multiplier is the LCM of | ||
3447 | * near_copies and raid_disks. | ||
3448 | * So if far_copies > 1 && !far_offset, fail. | ||
3449 | * Else find LCM(raid_disks, near_copy)*far_copies and | ||
3450 | * multiply by chunk_size. Then round to this number. | ||
3451 | * This is mostly done by raid10_size() | ||
3452 | */ | ||
3453 | struct r10conf *conf = mddev->private; | ||
3454 | sector_t oldsize, size; | ||
3455 | |||
3456 | if (conf->far_copies > 1 && !conf->far_offset) | ||
3457 | return -EINVAL; | ||
3458 | |||
3459 | oldsize = raid10_size(mddev, 0, 0); | ||
3460 | size = raid10_size(mddev, sectors, 0); | ||
3461 | md_set_array_sectors(mddev, size); | ||
3462 | if (mddev->array_sectors > size) | ||
3463 | return -EINVAL; | ||
3464 | set_capacity(mddev->gendisk, mddev->array_sectors); | ||
3465 | revalidate_disk(mddev->gendisk); | ||
3466 | if (sectors > mddev->dev_sectors && | ||
3467 | mddev->recovery_cp > oldsize) { | ||
3468 | mddev->recovery_cp = oldsize; | ||
3469 | set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); | ||
3470 | } | ||
3471 | mddev->dev_sectors = sectors; | ||
3472 | mddev->resync_max_sectors = size; | ||
3473 | return 0; | ||
3474 | } | ||
3475 | |||
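The raid10_resize() comment above spells out the size granule: resizing is refused for 'far' (non-offset) layouts, and otherwise the usable size is rounded to a multiple of LCM(raid_disks, near_copies) * far_copies * chunk_size. Taking that comment literally (in the kernel the actual rounding is performed by raid10_size(), so treat this as an arithmetic illustration only), a standalone sketch:

    #include <stdio.h>

    /* Illustrative subset of the raid10 layout parameters. */
    struct layout {
        unsigned long long raid_disks, near_copies, far_copies, far_offset;
        unsigned long long chunk_sectors;
    };

    static unsigned long long gcd_ull(unsigned long long a, unsigned long long b)
    {
        while (b) { unsigned long long t = a % b; a = b; b = t; }
        return a;
    }

    static unsigned long long lcm_ull(unsigned long long a, unsigned long long b)
    {
        return a / gcd_ull(a, b) * b;
    }

    /* Round 'sectors' down to the granule described in the raid10_resize()
     * comment; 'far' layouts without far_offset cannot be resized at all,
     * so return 0 for them. */
    static unsigned long long round_to_layout(const struct layout *l,
                                              unsigned long long sectors)
    {
        unsigned long long granule;

        if (l->far_copies > 1 && !l->far_offset)
            return 0;
        granule = lcm_ull(l->raid_disks, l->near_copies)
                  * l->far_copies * l->chunk_sectors;
        return sectors - (sectors % granule);
    }

    int main(void)
    {
        struct layout near2 = { .raid_disks = 4, .near_copies = 2,
                                .far_copies = 1, .far_offset = 0,
                                .chunk_sectors = 128 };
        /* LCM(4,2) * 1 * 128 = 512 sectors, so 10000 rounds down to 9728. */
        printf("%llu\n", round_to_layout(&near2, 10000));
        return 0;
    }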
3372 | static void *raid10_takeover_raid0(struct mddev *mddev) | 3476 | static void *raid10_takeover_raid0(struct mddev *mddev) |
3373 | { | 3477 | { |
3374 | struct md_rdev *rdev; | 3478 | struct md_rdev *rdev; |
@@ -3392,7 +3496,7 @@ static void *raid10_takeover_raid0(struct mddev *mddev) | |||
3392 | 3496 | ||
3393 | conf = setup_conf(mddev); | 3497 | conf = setup_conf(mddev); |
3394 | if (!IS_ERR(conf)) { | 3498 | if (!IS_ERR(conf)) { |
3395 | list_for_each_entry(rdev, &mddev->disks, same_set) | 3499 | rdev_for_each(rdev, mddev) |
3396 | if (rdev->raid_disk >= 0) | 3500 | if (rdev->raid_disk >= 0) |
3397 | rdev->new_raid_disk = rdev->raid_disk * 2; | 3501 | rdev->new_raid_disk = rdev->raid_disk * 2; |
3398 | conf->barrier = 1; | 3502 | conf->barrier = 1; |
@@ -3438,6 +3542,7 @@ static struct md_personality raid10_personality = | |||
3438 | .sync_request = sync_request, | 3542 | .sync_request = sync_request, |
3439 | .quiesce = raid10_quiesce, | 3543 | .quiesce = raid10_quiesce, |
3440 | .size = raid10_size, | 3544 | .size = raid10_size, |
3545 | .resize = raid10_resize, | ||
3441 | .takeover = raid10_takeover, | 3546 | .takeover = raid10_takeover, |
3442 | }; | 3547 | }; |
3443 | 3548 | ||
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 360f2b98f62b..23ac880bba9a 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c | |||
@@ -208,11 +208,10 @@ static void __release_stripe(struct r5conf *conf, struct stripe_head *sh) | |||
208 | md_wakeup_thread(conf->mddev->thread); | 208 | md_wakeup_thread(conf->mddev->thread); |
209 | } else { | 209 | } else { |
210 | BUG_ON(stripe_operations_active(sh)); | 210 | BUG_ON(stripe_operations_active(sh)); |
211 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { | 211 | if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) |
212 | atomic_dec(&conf->preread_active_stripes); | 212 | if (atomic_dec_return(&conf->preread_active_stripes) |
213 | if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) | 213 | < IO_THRESHOLD) |
214 | md_wakeup_thread(conf->mddev->thread); | 214 | md_wakeup_thread(conf->mddev->thread); |
215 | } | ||
216 | atomic_dec(&conf->active_stripes); | 215 | atomic_dec(&conf->active_stripes); |
217 | if (!test_bit(STRIPE_EXPANDING, &sh->state)) { | 216 | if (!test_bit(STRIPE_EXPANDING, &sh->state)) { |
218 | list_add_tail(&sh->lru, &conf->inactive_list); | 217 | list_add_tail(&sh->lru, &conf->inactive_list); |
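The raid5 hunk above replaces an atomic_dec() followed by a separate atomic_read() with a single atomic_dec_return(), so the wake-up decision is based on the value this decrement actually produced rather than on a counter another CPU may have changed in between. A userspace illustration with C11 atomics (IO_THRESHOLD, the counter and the wake function are stand-ins for the kernel objects):

    #include <stdatomic.h>
    #include <stdio.h>

    #define IO_THRESHOLD 1            /* illustrative threshold */

    static atomic_int preread_active = 3;
    static int wakeups;

    static void wake_thread(void) { wakeups++; }

    /* Racy pattern replaced by the hunk above: another CPU may change the
     * counter between the decrement and the read, so the read no longer
     * reflects the value this decrement produced. */
    static void release_racy(void)
    {
        atomic_fetch_sub(&preread_active, 1);
        if (atomic_load(&preread_active) < IO_THRESHOLD)
            wake_thread();
    }

    /* Fixed pattern: decide on the value returned by the decrement itself,
     * the moral equivalent of the kernel's atomic_dec_return(). */
    static void release_fixed(void)
    {
        if (atomic_fetch_sub(&preread_active, 1) - 1 < IO_THRESHOLD)
            wake_thread();
    }

    int main(void)
    {
        atomic_store(&preread_active, 3);
        release_racy();               /* same outcome when single-threaded */
        release_fixed();
        release_fixed();              /* third decrement sees 0 < 1, wakes */
        printf("wakeups: %d\n", wakeups);
        return 0;
    }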
@@ -4843,7 +4842,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) | |||
4843 | 4842 | ||
4844 | pr_debug("raid456: run(%s) called.\n", mdname(mddev)); | 4843 | pr_debug("raid456: run(%s) called.\n", mdname(mddev)); |
4845 | 4844 | ||
4846 | list_for_each_entry(rdev, &mddev->disks, same_set) { | 4845 | rdev_for_each(rdev, mddev) { |
4847 | raid_disk = rdev->raid_disk; | 4846 | raid_disk = rdev->raid_disk; |
4848 | if (raid_disk >= max_disks | 4847 | if (raid_disk >= max_disks |
4849 | || raid_disk < 0) | 4848 | || raid_disk < 0) |
@@ -5178,7 +5177,7 @@ static int run(struct mddev *mddev) | |||
5178 | blk_queue_io_opt(mddev->queue, chunk_size * | 5177 | blk_queue_io_opt(mddev->queue, chunk_size * |
5179 | (conf->raid_disks - conf->max_degraded)); | 5178 | (conf->raid_disks - conf->max_degraded)); |
5180 | 5179 | ||
5181 | list_for_each_entry(rdev, &mddev->disks, same_set) | 5180 | rdev_for_each(rdev, mddev) |
5182 | disk_stack_limits(mddev->gendisk, rdev->bdev, | 5181 | disk_stack_limits(mddev->gendisk, rdev->bdev, |
5183 | rdev->data_offset << 9); | 5182 | rdev->data_offset << 9); |
5184 | } | 5183 | } |
@@ -5362,7 +5361,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev) | |||
5362 | if (mddev->recovery_disabled == conf->recovery_disabled) | 5361 | if (mddev->recovery_disabled == conf->recovery_disabled) |
5363 | return -EBUSY; | 5362 | return -EBUSY; |
5364 | 5363 | ||
5365 | if (has_failed(conf)) | 5364 | if (rdev->saved_raid_disk < 0 && has_failed(conf)) |
5366 | /* no point adding a device */ | 5365 | /* no point adding a device */ |
5367 | return -EINVAL; | 5366 | return -EINVAL; |
5368 | 5367 | ||
@@ -5501,7 +5500,7 @@ static int raid5_start_reshape(struct mddev *mddev) | |||
5501 | if (!check_stripe_cache(mddev)) | 5500 | if (!check_stripe_cache(mddev)) |
5502 | return -ENOSPC; | 5501 | return -ENOSPC; |
5503 | 5502 | ||
5504 | list_for_each_entry(rdev, &mddev->disks, same_set) | 5503 | rdev_for_each(rdev, mddev) |
5505 | if (!test_bit(In_sync, &rdev->flags) | 5504 | if (!test_bit(In_sync, &rdev->flags) |
5506 | && !test_bit(Faulty, &rdev->flags)) | 5505 | && !test_bit(Faulty, &rdev->flags)) |
5507 | spares++; | 5506 | spares++; |
@@ -5547,16 +5546,14 @@ static int raid5_start_reshape(struct mddev *mddev) | |||
5547 | * such devices during the reshape and confusion could result. | 5546 | * such devices during the reshape and confusion could result. |
5548 | */ | 5547 | */ |
5549 | if (mddev->delta_disks >= 0) { | 5548 | if (mddev->delta_disks >= 0) { |
5550 | int added_devices = 0; | 5549 | rdev_for_each(rdev, mddev) |
5551 | list_for_each_entry(rdev, &mddev->disks, same_set) | ||
5552 | if (rdev->raid_disk < 0 && | 5550 | if (rdev->raid_disk < 0 && |
5553 | !test_bit(Faulty, &rdev->flags)) { | 5551 | !test_bit(Faulty, &rdev->flags)) { |
5554 | if (raid5_add_disk(mddev, rdev) == 0) { | 5552 | if (raid5_add_disk(mddev, rdev) == 0) { |
5555 | if (rdev->raid_disk | 5553 | if (rdev->raid_disk |
5556 | >= conf->previous_raid_disks) { | 5554 | >= conf->previous_raid_disks) |
5557 | set_bit(In_sync, &rdev->flags); | 5555 | set_bit(In_sync, &rdev->flags); |
5558 | added_devices++; | 5556 | else |
5559 | } else | ||
5560 | rdev->recovery_offset = 0; | 5557 | rdev->recovery_offset = 0; |
5561 | 5558 | ||
5562 | if (sysfs_link_rdev(mddev, rdev)) | 5559 | if (sysfs_link_rdev(mddev, rdev)) |
@@ -5566,7 +5563,6 @@ static int raid5_start_reshape(struct mddev *mddev) | |||
5566 | && !test_bit(Faulty, &rdev->flags)) { | 5563 | && !test_bit(Faulty, &rdev->flags)) { |
5567 | /* This is a spare that was manually added */ | 5564 | /* This is a spare that was manually added */ |
5568 | set_bit(In_sync, &rdev->flags); | 5565 | set_bit(In_sync, &rdev->flags); |
5569 | added_devices++; | ||
5570 | } | 5566 | } |
5571 | 5567 | ||
5572 | /* When a reshape changes the number of devices, | 5568 | /* When a reshape changes the number of devices, |
@@ -5592,6 +5588,7 @@ static int raid5_start_reshape(struct mddev *mddev) | |||
5592 | spin_lock_irq(&conf->device_lock); | 5588 | spin_lock_irq(&conf->device_lock); |
5593 | mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; | 5589 | mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; |
5594 | conf->reshape_progress = MaxSector; | 5590 | conf->reshape_progress = MaxSector; |
5591 | mddev->reshape_position = MaxSector; | ||
5595 | spin_unlock_irq(&conf->device_lock); | 5592 | spin_unlock_irq(&conf->device_lock); |
5596 | return -EAGAIN; | 5593 | return -EAGAIN; |
5597 | } | 5594 | } |