path: root/drivers/md
author     Takashi Iwai <tiwai@suse.de>    2012-04-07 06:28:00 -0400
committer  Takashi Iwai <tiwai@suse.de>    2012-04-07 06:28:00 -0400
commit     c38f62b08d800104fa9b0e9d6e9141459986c06d (patch)
tree       1d04d768c8aa0c1a544d1f068317c7beb0101be2 /drivers/md
parent     250f32747e62cb415b85083e247184188f24e566 (diff)
parent     8abe05c6eb358967f16bce8a02c88d57c82cfbd6 (diff)
Merge tag 'asoc-3.4' of git://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound into for-linus
ASoC: fixes for 3.4

A bunch of driver-specific fixes and one generic fix for the new support for platform DAPM contexts: we were picking the wrong default for the idle_bias_off setting, which meant we weren't actually achieving any useful runtime PM on platform devices.
Diffstat (limited to 'drivers/md')
-rw-r--r--  drivers/md/Kconfig | 28
-rw-r--r--  drivers/md/Makefile | 1
-rw-r--r--  drivers/md/bitmap.c | 194
-rw-r--r--  drivers/md/bitmap.h | 22
-rw-r--r--  drivers/md/dm-bufio.c | 109
-rw-r--r--  drivers/md/dm-bufio.h | 8
-rw-r--r--  drivers/md/dm-crypt.c | 54
-rw-r--r--  drivers/md/dm-delay.c | 9
-rw-r--r--  drivers/md/dm-exception-store.c | 2
-rw-r--r--  drivers/md/dm-flakey.c | 5
-rw-r--r--  drivers/md/dm-io.c | 23
-rw-r--r--  drivers/md/dm-ioctl.c | 7
-rw-r--r--  drivers/md/dm-linear.c | 3
-rw-r--r--  drivers/md/dm-log.c | 3
-rw-r--r--  drivers/md/dm-mpath.c | 52
-rw-r--r--  drivers/md/dm-queue-length.c | 3
-rw-r--r--  drivers/md/dm-raid.c | 86
-rw-r--r--  drivers/md/dm-raid1.c | 12
-rw-r--r--  drivers/md/dm-round-robin.c | 3
-rw-r--r--  drivers/md/dm-service-time.c | 5
-rw-r--r--  drivers/md/dm-stripe.c | 3
-rw-r--r--  drivers/md/dm-table.c | 9
-rw-r--r--  drivers/md/dm-thin-metadata.c | 30
-rw-r--r--  drivers/md/dm-thin-metadata.h | 13
-rw-r--r--  drivers/md/dm-thin.c | 680
-rw-r--r--  drivers/md/dm-verity.c | 913
-rw-r--r--  drivers/md/dm.c | 1
-rw-r--r--  drivers/md/faulty.c | 2
-rw-r--r--  drivers/md/linear.c | 32
-rw-r--r--  drivers/md/md.c | 140
-rw-r--r--  drivers/md/md.h | 13
-rw-r--r--  drivers/md/multipath.c | 2
-rw-r--r--  drivers/md/persistent-data/dm-btree-internal.h | 7
-rw-r--r--  drivers/md/persistent-data/dm-btree-remove.c | 202
-rw-r--r--  drivers/md/persistent-data/dm-btree.c | 27
-rw-r--r--  drivers/md/persistent-data/dm-space-map-common.c | 3
-rw-r--r--  drivers/md/raid0.c | 164
-rw-r--r--  drivers/md/raid0.h | 11
-rw-r--r--  drivers/md/raid1.c | 100
-rw-r--r--  drivers/md/raid10.c | 225
-rw-r--r--  drivers/md/raid5.c | 25
41 files changed, 2424 insertions, 807 deletions
diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig
index faa4741df6d3..10f122a3a856 100644
--- a/drivers/md/Kconfig
+++ b/drivers/md/Kconfig
@@ -277,8 +277,8 @@ config DM_MIRROR
277 needed for live data migration tools such as 'pvmove'. 277 needed for live data migration tools such as 'pvmove'.
278 278
279config DM_RAID 279config DM_RAID
280 tristate "RAID 1/4/5/6 target (EXPERIMENTAL)" 280 tristate "RAID 1/4/5/6 target"
281 depends on BLK_DEV_DM && EXPERIMENTAL 281 depends on BLK_DEV_DM
282 select MD_RAID1 282 select MD_RAID1
283 select MD_RAID456 283 select MD_RAID456
284 select BLK_DEV_MD 284 select BLK_DEV_MD
@@ -359,8 +359,8 @@ config DM_DELAY
359 If unsure, say N. 359 If unsure, say N.
360 360
361config DM_UEVENT 361config DM_UEVENT
362 bool "DM uevents (EXPERIMENTAL)" 362 bool "DM uevents"
363 depends on BLK_DEV_DM && EXPERIMENTAL 363 depends on BLK_DEV_DM
364 ---help--- 364 ---help---
365 Generate udev events for DM events. 365 Generate udev events for DM events.
366 366
@@ -370,4 +370,24 @@ config DM_FLAKEY
370 ---help--- 370 ---help---
371 A target that intermittently fails I/O for debugging purposes. 371 A target that intermittently fails I/O for debugging purposes.
372 372
373config DM_VERITY
374 tristate "Verity target support (EXPERIMENTAL)"
375 depends on BLK_DEV_DM && EXPERIMENTAL
376 select CRYPTO
377 select CRYPTO_HASH
378 select DM_BUFIO
379 ---help---
380 This device-mapper target creates a read-only device that
381 transparently validates the data on one underlying device against
382 a pre-generated tree of cryptographic checksums stored on a second
383 device.
384
385 You'll need to activate the digests you're going to use in the
386 cryptoapi configuration.
387
388 To compile this code as a module, choose M here: the module will
389 be called dm-verity.
390
391 If unsure, say N.
392
373endif # MD 393endif # MD
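The DM_VERITY help text above describes the model: every data block on the read-only device is checked against a digest that was generated ahead of time and stored, as part of a hash tree, on a second device. A minimal conceptual sketch of that per-block check in plain C; compute_digest() is a hypothetical stand-in for whichever cryptoapi hash the user enables, and this is not the dm-verity implementation itself:

    /*
     * Conceptual sketch only, not dm-verity code: verify one data block
     * against the digest that was pre-generated for it on the hash device.
     */
    #include <string.h>

    #define DATA_BLOCK_SIZE 4096
    #define DIGEST_SIZE     32      /* e.g. a 256-bit hash */

    /* hypothetical helper standing in for the configured cryptoapi hash */
    void compute_digest(const unsigned char *data, unsigned long len,
                        unsigned char digest[DIGEST_SIZE]);

    /* 0 if the block matches its stored digest, -1 if it has been altered */
    int verify_block(const unsigned char data[DATA_BLOCK_SIZE],
                     const unsigned char stored[DIGEST_SIZE])
    {
            unsigned char actual[DIGEST_SIZE];

            compute_digest(data, DATA_BLOCK_SIZE, actual);
            return memcmp(actual, stored, DIGEST_SIZE) ? -1 : 0;
    }

Interior hash blocks are validated the same way on the path up to a trusted root digest supplied when the target is set up, so a single mismatch anywhere is detectable.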
diff --git a/drivers/md/Makefile b/drivers/md/Makefile
index 046860c7a166..8b2e0dffe82e 100644
--- a/drivers/md/Makefile
+++ b/drivers/md/Makefile
@@ -42,6 +42,7 @@ obj-$(CONFIG_DM_LOG_USERSPACE) += dm-log-userspace.o
42obj-$(CONFIG_DM_ZERO) += dm-zero.o 42obj-$(CONFIG_DM_ZERO) += dm-zero.o
43obj-$(CONFIG_DM_RAID) += dm-raid.o 43obj-$(CONFIG_DM_RAID) += dm-raid.o
44obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o 44obj-$(CONFIG_DM_THIN_PROVISIONING) += dm-thin-pool.o
45obj-$(CONFIG_DM_VERITY) += dm-verity.o
45 46
46ifeq ($(CONFIG_DM_UEVENT),y) 47ifeq ($(CONFIG_DM_UEVENT),y)
47dm-mod-objs += dm-uevent.o 48dm-mod-objs += dm-uevent.o
diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c
index cdf36b1e9aa6..3d0dfa7a89a2 100644
--- a/drivers/md/bitmap.c
+++ b/drivers/md/bitmap.c
@@ -26,6 +26,7 @@
26#include <linux/file.h> 26#include <linux/file.h>
27#include <linux/mount.h> 27#include <linux/mount.h>
28#include <linux/buffer_head.h> 28#include <linux/buffer_head.h>
29#include <linux/seq_file.h>
29#include "md.h" 30#include "md.h"
30#include "bitmap.h" 31#include "bitmap.h"
31 32
@@ -35,31 +36,6 @@ static inline char *bmname(struct bitmap *bitmap)
35} 36}
36 37
37/* 38/*
38 * just a placeholder - calls kmalloc for bitmap pages
39 */
40static unsigned char *bitmap_alloc_page(struct bitmap *bitmap)
41{
42 unsigned char *page;
43
44 page = kzalloc(PAGE_SIZE, GFP_NOIO);
45 if (!page)
46 printk("%s: bitmap_alloc_page FAILED\n", bmname(bitmap));
47 else
48 pr_debug("%s: bitmap_alloc_page: allocated page at %p\n",
49 bmname(bitmap), page);
50 return page;
51}
52
53/*
54 * for now just a placeholder -- just calls kfree for bitmap pages
55 */
56static void bitmap_free_page(struct bitmap *bitmap, unsigned char *page)
57{
58 pr_debug("%s: bitmap_free_page: free page %p\n", bmname(bitmap), page);
59 kfree(page);
60}
61
62/*
63 * check a page and, if necessary, allocate it (or hijack it if the alloc fails) 39 * check a page and, if necessary, allocate it (or hijack it if the alloc fails)
64 * 40 *
65 * 1) check to see if this page is allocated, if it's not then try to alloc 41 * 1) check to see if this page is allocated, if it's not then try to alloc
@@ -96,7 +72,7 @@ __acquires(bitmap->lock)
96 /* this page has not been allocated yet */ 72 /* this page has not been allocated yet */
97 73
98 spin_unlock_irq(&bitmap->lock); 74 spin_unlock_irq(&bitmap->lock);
99 mappage = bitmap_alloc_page(bitmap); 75 mappage = kzalloc(PAGE_SIZE, GFP_NOIO);
100 spin_lock_irq(&bitmap->lock); 76 spin_lock_irq(&bitmap->lock);
101 77
102 if (mappage == NULL) { 78 if (mappage == NULL) {
@@ -109,7 +85,7 @@ __acquires(bitmap->lock)
109 } else if (bitmap->bp[page].map || 85 } else if (bitmap->bp[page].map ||
110 bitmap->bp[page].hijacked) { 86 bitmap->bp[page].hijacked) {
111 /* somebody beat us to getting the page */ 87 /* somebody beat us to getting the page */
112 bitmap_free_page(bitmap, mappage); 88 kfree(mappage);
113 return 0; 89 return 0;
114 } else { 90 } else {
115 91
@@ -141,7 +117,7 @@ static void bitmap_checkfree(struct bitmap *bitmap, unsigned long page)
141 ptr = bitmap->bp[page].map; 117 ptr = bitmap->bp[page].map;
142 bitmap->bp[page].map = NULL; 118 bitmap->bp[page].map = NULL;
143 bitmap->missing_pages++; 119 bitmap->missing_pages++;
144 bitmap_free_page(bitmap, ptr); 120 kfree(ptr);
145 } 121 }
146} 122}
147 123
@@ -171,7 +147,7 @@ static struct page *read_sb_page(struct mddev *mddev, loff_t offset,
171 did_alloc = 1; 147 did_alloc = 1;
172 } 148 }
173 149
174 list_for_each_entry(rdev, &mddev->disks, same_set) { 150 rdev_for_each(rdev, mddev) {
175 if (! test_bit(In_sync, &rdev->flags) 151 if (! test_bit(In_sync, &rdev->flags)
176 || test_bit(Faulty, &rdev->flags)) 152 || test_bit(Faulty, &rdev->flags))
177 continue; 153 continue;
@@ -445,19 +421,14 @@ out:
445void bitmap_update_sb(struct bitmap *bitmap) 421void bitmap_update_sb(struct bitmap *bitmap)
446{ 422{
447 bitmap_super_t *sb; 423 bitmap_super_t *sb;
448 unsigned long flags;
449 424
450 if (!bitmap || !bitmap->mddev) /* no bitmap for this array */ 425 if (!bitmap || !bitmap->mddev) /* no bitmap for this array */
451 return; 426 return;
452 if (bitmap->mddev->bitmap_info.external) 427 if (bitmap->mddev->bitmap_info.external)
453 return; 428 return;
454 spin_lock_irqsave(&bitmap->lock, flags); 429 if (!bitmap->sb_page) /* no superblock */
455 if (!bitmap->sb_page) { /* no superblock */
456 spin_unlock_irqrestore(&bitmap->lock, flags);
457 return; 430 return;
458 } 431 sb = kmap_atomic(bitmap->sb_page);
459 spin_unlock_irqrestore(&bitmap->lock, flags);
460 sb = kmap_atomic(bitmap->sb_page, KM_USER0);
461 sb->events = cpu_to_le64(bitmap->mddev->events); 432 sb->events = cpu_to_le64(bitmap->mddev->events);
462 if (bitmap->mddev->events < bitmap->events_cleared) 433 if (bitmap->mddev->events < bitmap->events_cleared)
463 /* rocking back to read-only */ 434 /* rocking back to read-only */
@@ -467,7 +438,7 @@ void bitmap_update_sb(struct bitmap *bitmap)
467 /* Just in case these have been changed via sysfs: */ 438 /* Just in case these have been changed via sysfs: */
468 sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ); 439 sb->daemon_sleep = cpu_to_le32(bitmap->mddev->bitmap_info.daemon_sleep/HZ);
469 sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind); 440 sb->write_behind = cpu_to_le32(bitmap->mddev->bitmap_info.max_write_behind);
470 kunmap_atomic(sb, KM_USER0); 441 kunmap_atomic(sb);
471 write_page(bitmap, bitmap->sb_page, 1); 442 write_page(bitmap, bitmap->sb_page, 1);
472} 443}
473 444
@@ -478,7 +449,7 @@ void bitmap_print_sb(struct bitmap *bitmap)
478 449
479 if (!bitmap || !bitmap->sb_page) 450 if (!bitmap || !bitmap->sb_page)
480 return; 451 return;
481 sb = kmap_atomic(bitmap->sb_page, KM_USER0); 452 sb = kmap_atomic(bitmap->sb_page);
482 printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap)); 453 printk(KERN_DEBUG "%s: bitmap file superblock:\n", bmname(bitmap));
483 printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic)); 454 printk(KERN_DEBUG " magic: %08x\n", le32_to_cpu(sb->magic));
484 printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version)); 455 printk(KERN_DEBUG " version: %d\n", le32_to_cpu(sb->version));
@@ -497,7 +468,7 @@ void bitmap_print_sb(struct bitmap *bitmap)
497 printk(KERN_DEBUG " sync size: %llu KB\n", 468 printk(KERN_DEBUG " sync size: %llu KB\n",
498 (unsigned long long)le64_to_cpu(sb->sync_size)/2); 469 (unsigned long long)le64_to_cpu(sb->sync_size)/2);
499 printk(KERN_DEBUG "max write behind: %d\n", le32_to_cpu(sb->write_behind)); 470 printk(KERN_DEBUG "max write behind: %d\n", le32_to_cpu(sb->write_behind));
500 kunmap_atomic(sb, KM_USER0); 471 kunmap_atomic(sb);
501} 472}
502 473
503/* 474/*
@@ -525,7 +496,7 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap)
525 } 496 }
526 bitmap->sb_page->index = 0; 497 bitmap->sb_page->index = 0;
527 498
528 sb = kmap_atomic(bitmap->sb_page, KM_USER0); 499 sb = kmap_atomic(bitmap->sb_page);
529 500
530 sb->magic = cpu_to_le32(BITMAP_MAGIC); 501 sb->magic = cpu_to_le32(BITMAP_MAGIC);
531 sb->version = cpu_to_le32(BITMAP_MAJOR_HI); 502 sb->version = cpu_to_le32(BITMAP_MAJOR_HI);
@@ -533,7 +504,7 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap)
533 chunksize = bitmap->mddev->bitmap_info.chunksize; 504 chunksize = bitmap->mddev->bitmap_info.chunksize;
534 BUG_ON(!chunksize); 505 BUG_ON(!chunksize);
535 if (!is_power_of_2(chunksize)) { 506 if (!is_power_of_2(chunksize)) {
536 kunmap_atomic(sb, KM_USER0); 507 kunmap_atomic(sb);
537 printk(KERN_ERR "bitmap chunksize not a power of 2\n"); 508 printk(KERN_ERR "bitmap chunksize not a power of 2\n");
538 return -EINVAL; 509 return -EINVAL;
539 } 510 }
@@ -571,7 +542,7 @@ static int bitmap_new_disk_sb(struct bitmap *bitmap)
571 bitmap->flags |= BITMAP_HOSTENDIAN; 542 bitmap->flags |= BITMAP_HOSTENDIAN;
572 sb->version = cpu_to_le32(BITMAP_MAJOR_HOSTENDIAN); 543 sb->version = cpu_to_le32(BITMAP_MAJOR_HOSTENDIAN);
573 544
574 kunmap_atomic(sb, KM_USER0); 545 kunmap_atomic(sb);
575 546
576 return 0; 547 return 0;
577} 548}
@@ -603,7 +574,7 @@ static int bitmap_read_sb(struct bitmap *bitmap)
603 return err; 574 return err;
604 } 575 }
605 576
606 sb = kmap_atomic(bitmap->sb_page, KM_USER0); 577 sb = kmap_atomic(bitmap->sb_page);
607 578
608 chunksize = le32_to_cpu(sb->chunksize); 579 chunksize = le32_to_cpu(sb->chunksize);
609 daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ; 580 daemon_sleep = le32_to_cpu(sb->daemon_sleep) * HZ;
@@ -632,26 +603,28 @@ static int bitmap_read_sb(struct bitmap *bitmap)
632 /* keep the array size field of the bitmap superblock up to date */ 603 /* keep the array size field of the bitmap superblock up to date */
633 sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors); 604 sb->sync_size = cpu_to_le64(bitmap->mddev->resync_max_sectors);
634 605
635 if (!bitmap->mddev->persistent) 606 if (bitmap->mddev->persistent) {
636 goto success; 607 /*
637 608 * We have a persistent array superblock, so compare the
638 /* 609 * bitmap's UUID and event counter to the mddev's
639 * if we have a persistent array superblock, compare the 610 */
640 * bitmap's UUID and event counter to the mddev's 611 if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) {
641 */ 612 printk(KERN_INFO
642 if (memcmp(sb->uuid, bitmap->mddev->uuid, 16)) { 613 "%s: bitmap superblock UUID mismatch\n",
643 printk(KERN_INFO "%s: bitmap superblock UUID mismatch\n", 614 bmname(bitmap));
644 bmname(bitmap)); 615 goto out;
645 goto out; 616 }
646 } 617 events = le64_to_cpu(sb->events);
647 events = le64_to_cpu(sb->events); 618 if (events < bitmap->mddev->events) {
648 if (events < bitmap->mddev->events) { 619 printk(KERN_INFO
649 printk(KERN_INFO "%s: bitmap file is out of date (%llu < %llu) " 620 "%s: bitmap file is out of date (%llu < %llu) "
650 "-- forcing full recovery\n", bmname(bitmap), events, 621 "-- forcing full recovery\n",
651 (unsigned long long) bitmap->mddev->events); 622 bmname(bitmap), events,
652 sb->state |= cpu_to_le32(BITMAP_STALE); 623 (unsigned long long) bitmap->mddev->events);
624 sb->state |= cpu_to_le32(BITMAP_STALE);
625 }
653 } 626 }
654success: 627
655 /* assign fields using values from superblock */ 628 /* assign fields using values from superblock */
656 bitmap->mddev->bitmap_info.chunksize = chunksize; 629 bitmap->mddev->bitmap_info.chunksize = chunksize;
657 bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep; 630 bitmap->mddev->bitmap_info.daemon_sleep = daemon_sleep;
@@ -664,7 +637,7 @@ success:
664 bitmap->events_cleared = bitmap->mddev->events; 637 bitmap->events_cleared = bitmap->mddev->events;
665 err = 0; 638 err = 0;
666out: 639out:
667 kunmap_atomic(sb, KM_USER0); 640 kunmap_atomic(sb);
668 if (err) 641 if (err)
669 bitmap_print_sb(bitmap); 642 bitmap_print_sb(bitmap);
670 return err; 643 return err;
@@ -680,16 +653,11 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
680 enum bitmap_mask_op op) 653 enum bitmap_mask_op op)
681{ 654{
682 bitmap_super_t *sb; 655 bitmap_super_t *sb;
683 unsigned long flags;
684 int old; 656 int old;
685 657
686 spin_lock_irqsave(&bitmap->lock, flags); 658 if (!bitmap->sb_page) /* can't set the state */
687 if (!bitmap->sb_page) { /* can't set the state */
688 spin_unlock_irqrestore(&bitmap->lock, flags);
689 return 0; 659 return 0;
690 } 660 sb = kmap_atomic(bitmap->sb_page);
691 spin_unlock_irqrestore(&bitmap->lock, flags);
692 sb = kmap_atomic(bitmap->sb_page, KM_USER0);
693 old = le32_to_cpu(sb->state) & bits; 661 old = le32_to_cpu(sb->state) & bits;
694 switch (op) { 662 switch (op) {
695 case MASK_SET: 663 case MASK_SET:
@@ -703,7 +671,7 @@ static int bitmap_mask_state(struct bitmap *bitmap, enum bitmap_state bits,
703 default: 671 default:
704 BUG(); 672 BUG();
705 } 673 }
706 kunmap_atomic(sb, KM_USER0); 674 kunmap_atomic(sb);
707 return old; 675 return old;
708} 676}
709 677
@@ -870,7 +838,7 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
870 unsigned long bit; 838 unsigned long bit;
871 struct page *page; 839 struct page *page;
872 void *kaddr; 840 void *kaddr;
873 unsigned long chunk = block >> CHUNK_BLOCK_SHIFT(bitmap); 841 unsigned long chunk = block >> bitmap->chunkshift;
874 842
875 if (!bitmap->filemap) 843 if (!bitmap->filemap)
876 return; 844 return;
@@ -881,12 +849,12 @@ static void bitmap_file_set_bit(struct bitmap *bitmap, sector_t block)
881 bit = file_page_offset(bitmap, chunk); 849 bit = file_page_offset(bitmap, chunk);
882 850
883 /* set the bit */ 851 /* set the bit */
884 kaddr = kmap_atomic(page, KM_USER0); 852 kaddr = kmap_atomic(page);
885 if (bitmap->flags & BITMAP_HOSTENDIAN) 853 if (bitmap->flags & BITMAP_HOSTENDIAN)
886 set_bit(bit, kaddr); 854 set_bit(bit, kaddr);
887 else 855 else
888 __set_bit_le(bit, kaddr); 856 __set_bit_le(bit, kaddr);
889 kunmap_atomic(kaddr, KM_USER0); 857 kunmap_atomic(kaddr);
890 pr_debug("set file bit %lu page %lu\n", bit, page->index); 858 pr_debug("set file bit %lu page %lu\n", bit, page->index);
891 /* record page number so it gets flushed to disk when unplug occurs */ 859 /* record page number so it gets flushed to disk when unplug occurs */
892 set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY); 860 set_page_attr(bitmap, page, BITMAP_PAGE_DIRTY);
@@ -1050,10 +1018,10 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
1050 * if bitmap is out of date, dirty the 1018 * if bitmap is out of date, dirty the
1051 * whole page and write it out 1019 * whole page and write it out
1052 */ 1020 */
1053 paddr = kmap_atomic(page, KM_USER0); 1021 paddr = kmap_atomic(page);
1054 memset(paddr + offset, 0xff, 1022 memset(paddr + offset, 0xff,
1055 PAGE_SIZE - offset); 1023 PAGE_SIZE - offset);
1056 kunmap_atomic(paddr, KM_USER0); 1024 kunmap_atomic(paddr);
1057 write_page(bitmap, page, 1); 1025 write_page(bitmap, page, 1);
1058 1026
1059 ret = -EIO; 1027 ret = -EIO;
@@ -1061,18 +1029,18 @@ static int bitmap_init_from_disk(struct bitmap *bitmap, sector_t start)
1061 goto err; 1029 goto err;
1062 } 1030 }
1063 } 1031 }
1064 paddr = kmap_atomic(page, KM_USER0); 1032 paddr = kmap_atomic(page);
1065 if (bitmap->flags & BITMAP_HOSTENDIAN) 1033 if (bitmap->flags & BITMAP_HOSTENDIAN)
1066 b = test_bit(bit, paddr); 1034 b = test_bit(bit, paddr);
1067 else 1035 else
1068 b = test_bit_le(bit, paddr); 1036 b = test_bit_le(bit, paddr);
1069 kunmap_atomic(paddr, KM_USER0); 1037 kunmap_atomic(paddr);
1070 if (b) { 1038 if (b) {
1071 /* if the disk bit is set, set the memory bit */ 1039 /* if the disk bit is set, set the memory bit */
1072 int needed = ((sector_t)(i+1) << (CHUNK_BLOCK_SHIFT(bitmap)) 1040 int needed = ((sector_t)(i+1) << bitmap->chunkshift
1073 >= start); 1041 >= start);
1074 bitmap_set_memory_bits(bitmap, 1042 bitmap_set_memory_bits(bitmap,
1075 (sector_t)i << CHUNK_BLOCK_SHIFT(bitmap), 1043 (sector_t)i << bitmap->chunkshift,
1076 needed); 1044 needed);
1077 bit_cnt++; 1045 bit_cnt++;
1078 } 1046 }
@@ -1116,7 +1084,7 @@ void bitmap_write_all(struct bitmap *bitmap)
1116 1084
1117static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc) 1085static void bitmap_count_page(struct bitmap *bitmap, sector_t offset, int inc)
1118{ 1086{
1119 sector_t chunk = offset >> CHUNK_BLOCK_SHIFT(bitmap); 1087 sector_t chunk = offset >> bitmap->chunkshift;
1120 unsigned long page = chunk >> PAGE_COUNTER_SHIFT; 1088 unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
1121 bitmap->bp[page].count += inc; 1089 bitmap->bp[page].count += inc;
1122 bitmap_checkfree(bitmap, page); 1090 bitmap_checkfree(bitmap, page);
@@ -1209,10 +1177,10 @@ void bitmap_daemon_work(struct mddev *mddev)
1209 mddev->bitmap_info.external == 0) { 1177 mddev->bitmap_info.external == 0) {
1210 bitmap_super_t *sb; 1178 bitmap_super_t *sb;
1211 bitmap->need_sync = 0; 1179 bitmap->need_sync = 0;
1212 sb = kmap_atomic(bitmap->sb_page, KM_USER0); 1180 sb = kmap_atomic(bitmap->sb_page);
1213 sb->events_cleared = 1181 sb->events_cleared =
1214 cpu_to_le64(bitmap->events_cleared); 1182 cpu_to_le64(bitmap->events_cleared);
1215 kunmap_atomic(sb, KM_USER0); 1183 kunmap_atomic(sb);
1216 write_page(bitmap, bitmap->sb_page, 1); 1184 write_page(bitmap, bitmap->sb_page, 1);
1217 } 1185 }
1218 spin_lock_irqsave(&bitmap->lock, flags); 1186 spin_lock_irqsave(&bitmap->lock, flags);
@@ -1222,7 +1190,7 @@ void bitmap_daemon_work(struct mddev *mddev)
1222 bitmap->allclean = 0; 1190 bitmap->allclean = 0;
1223 } 1191 }
1224 bmc = bitmap_get_counter(bitmap, 1192 bmc = bitmap_get_counter(bitmap,
1225 (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap), 1193 (sector_t)j << bitmap->chunkshift,
1226 &blocks, 0); 1194 &blocks, 0);
1227 if (!bmc) 1195 if (!bmc)
1228 j |= PAGE_COUNTER_MASK; 1196 j |= PAGE_COUNTER_MASK;
@@ -1231,11 +1199,11 @@ void bitmap_daemon_work(struct mddev *mddev)
1231 /* we can clear the bit */ 1199 /* we can clear the bit */
1232 *bmc = 0; 1200 *bmc = 0;
1233 bitmap_count_page(bitmap, 1201 bitmap_count_page(bitmap,
1234 (sector_t)j << CHUNK_BLOCK_SHIFT(bitmap), 1202 (sector_t)j << bitmap->chunkshift,
1235 -1); 1203 -1);
1236 1204
1237 /* clear the bit */ 1205 /* clear the bit */
1238 paddr = kmap_atomic(page, KM_USER0); 1206 paddr = kmap_atomic(page);
1239 if (bitmap->flags & BITMAP_HOSTENDIAN) 1207 if (bitmap->flags & BITMAP_HOSTENDIAN)
1240 clear_bit(file_page_offset(bitmap, j), 1208 clear_bit(file_page_offset(bitmap, j),
1241 paddr); 1209 paddr);
@@ -1244,7 +1212,7 @@ void bitmap_daemon_work(struct mddev *mddev)
1244 file_page_offset(bitmap, 1212 file_page_offset(bitmap,
1245 j), 1213 j),
1246 paddr); 1214 paddr);
1247 kunmap_atomic(paddr, KM_USER0); 1215 kunmap_atomic(paddr);
1248 } else if (*bmc <= 2) { 1216 } else if (*bmc <= 2) {
1249 *bmc = 1; /* maybe clear the bit next time */ 1217 *bmc = 1; /* maybe clear the bit next time */
1250 set_page_attr(bitmap, page, BITMAP_PAGE_PENDING); 1218 set_page_attr(bitmap, page, BITMAP_PAGE_PENDING);
@@ -1285,7 +1253,7 @@ __acquires(bitmap->lock)
1285 * The lock must have been taken with interrupts enabled. 1253 * The lock must have been taken with interrupts enabled.
1286 * If !create, we don't release the lock. 1254 * If !create, we don't release the lock.
1287 */ 1255 */
1288 sector_t chunk = offset >> CHUNK_BLOCK_SHIFT(bitmap); 1256 sector_t chunk = offset >> bitmap->chunkshift;
1289 unsigned long page = chunk >> PAGE_COUNTER_SHIFT; 1257 unsigned long page = chunk >> PAGE_COUNTER_SHIFT;
1290 unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT; 1258 unsigned long pageoff = (chunk & PAGE_COUNTER_MASK) << COUNTER_BYTE_SHIFT;
1291 sector_t csize; 1259 sector_t csize;
@@ -1295,10 +1263,10 @@ __acquires(bitmap->lock)
1295 1263
1296 if (bitmap->bp[page].hijacked || 1264 if (bitmap->bp[page].hijacked ||
1297 bitmap->bp[page].map == NULL) 1265 bitmap->bp[page].map == NULL)
1298 csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap) + 1266 csize = ((sector_t)1) << (bitmap->chunkshift +
1299 PAGE_COUNTER_SHIFT - 1); 1267 PAGE_COUNTER_SHIFT - 1);
1300 else 1268 else
1301 csize = ((sector_t)1) << (CHUNK_BLOCK_SHIFT(bitmap)); 1269 csize = ((sector_t)1) << bitmap->chunkshift;
1302 *blocks = csize - (offset & (csize - 1)); 1270 *blocks = csize - (offset & (csize - 1));
1303 1271
1304 if (err < 0) 1272 if (err < 0)
@@ -1424,7 +1392,7 @@ void bitmap_endwrite(struct bitmap *bitmap, sector_t offset, unsigned long secto
1424 set_page_attr(bitmap, 1392 set_page_attr(bitmap,
1425 filemap_get_page( 1393 filemap_get_page(
1426 bitmap, 1394 bitmap,
1427 offset >> CHUNK_BLOCK_SHIFT(bitmap)), 1395 offset >> bitmap->chunkshift),
1428 BITMAP_PAGE_PENDING); 1396 BITMAP_PAGE_PENDING);
1429 bitmap->allclean = 0; 1397 bitmap->allclean = 0;
1430 } 1398 }
@@ -1512,7 +1480,7 @@ void bitmap_end_sync(struct bitmap *bitmap, sector_t offset, sector_t *blocks, i
1512 else { 1480 else {
1513 if (*bmc <= 2) { 1481 if (*bmc <= 2) {
1514 set_page_attr(bitmap, 1482 set_page_attr(bitmap,
1515 filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)), 1483 filemap_get_page(bitmap, offset >> bitmap->chunkshift),
1516 BITMAP_PAGE_PENDING); 1484 BITMAP_PAGE_PENDING);
1517 bitmap->allclean = 0; 1485 bitmap->allclean = 0;
1518 } 1486 }
@@ -1559,7 +1527,7 @@ void bitmap_cond_end_sync(struct bitmap *bitmap, sector_t sector)
1559 1527
1560 bitmap->mddev->curr_resync_completed = sector; 1528 bitmap->mddev->curr_resync_completed = sector;
1561 set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags); 1529 set_bit(MD_CHANGE_CLEAN, &bitmap->mddev->flags);
1562 sector &= ~((1ULL << CHUNK_BLOCK_SHIFT(bitmap)) - 1); 1530 sector &= ~((1ULL << bitmap->chunkshift) - 1);
1563 s = 0; 1531 s = 0;
1564 while (s < sector && s < bitmap->mddev->resync_max_sectors) { 1532 while (s < sector && s < bitmap->mddev->resync_max_sectors) {
1565 bitmap_end_sync(bitmap, s, &blocks, 0); 1533 bitmap_end_sync(bitmap, s, &blocks, 0);
@@ -1589,7 +1557,7 @@ static void bitmap_set_memory_bits(struct bitmap *bitmap, sector_t offset, int n
1589 struct page *page; 1557 struct page *page;
1590 *bmc = 2 | (needed ? NEEDED_MASK : 0); 1558 *bmc = 2 | (needed ? NEEDED_MASK : 0);
1591 bitmap_count_page(bitmap, offset, 1); 1559 bitmap_count_page(bitmap, offset, 1);
1592 page = filemap_get_page(bitmap, offset >> CHUNK_BLOCK_SHIFT(bitmap)); 1560 page = filemap_get_page(bitmap, offset >> bitmap->chunkshift);
1593 set_page_attr(bitmap, page, BITMAP_PAGE_PENDING); 1561 set_page_attr(bitmap, page, BITMAP_PAGE_PENDING);
1594 bitmap->allclean = 0; 1562 bitmap->allclean = 0;
1595 } 1563 }
@@ -1602,7 +1570,7 @@ void bitmap_dirty_bits(struct bitmap *bitmap, unsigned long s, unsigned long e)
1602 unsigned long chunk; 1570 unsigned long chunk;
1603 1571
1604 for (chunk = s; chunk <= e; chunk++) { 1572 for (chunk = s; chunk <= e; chunk++) {
1605 sector_t sec = (sector_t)chunk << CHUNK_BLOCK_SHIFT(bitmap); 1573 sector_t sec = (sector_t)chunk << bitmap->chunkshift;
1606 bitmap_set_memory_bits(bitmap, sec, 1); 1574 bitmap_set_memory_bits(bitmap, sec, 1);
1607 spin_lock_irq(&bitmap->lock); 1575 spin_lock_irq(&bitmap->lock);
1608 bitmap_file_set_bit(bitmap, sec); 1576 bitmap_file_set_bit(bitmap, sec);
@@ -1759,11 +1727,12 @@ int bitmap_create(struct mddev *mddev)
1759 goto error; 1727 goto error;
1760 1728
1761 bitmap->daemon_lastrun = jiffies; 1729 bitmap->daemon_lastrun = jiffies;
1762 bitmap->chunkshift = ffz(~mddev->bitmap_info.chunksize); 1730 bitmap->chunkshift = (ffz(~mddev->bitmap_info.chunksize)
1731 - BITMAP_BLOCK_SHIFT);
1763 1732
1764 /* now that chunksize and chunkshift are set, we can use these macros */ 1733 /* now that chunksize and chunkshift are set, we can use these macros */
1765 chunks = (blocks + CHUNK_BLOCK_RATIO(bitmap) - 1) >> 1734 chunks = (blocks + bitmap->chunkshift - 1) >>
1766 CHUNK_BLOCK_SHIFT(bitmap); 1735 bitmap->chunkshift;
1767 pages = (chunks + PAGE_COUNTER_RATIO - 1) / PAGE_COUNTER_RATIO; 1736 pages = (chunks + PAGE_COUNTER_RATIO - 1) / PAGE_COUNTER_RATIO;
1768 1737
1769 BUG_ON(!pages); 1738 BUG_ON(!pages);
@@ -1836,6 +1805,33 @@ out:
1836} 1805}
1837EXPORT_SYMBOL_GPL(bitmap_load); 1806EXPORT_SYMBOL_GPL(bitmap_load);
1838 1807
1808void bitmap_status(struct seq_file *seq, struct bitmap *bitmap)
1809{
1810 unsigned long chunk_kb;
1811 unsigned long flags;
1812
1813 if (!bitmap)
1814 return;
1815
1816 spin_lock_irqsave(&bitmap->lock, flags);
1817 chunk_kb = bitmap->mddev->bitmap_info.chunksize >> 10;
1818 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
1819 "%lu%s chunk",
1820 bitmap->pages - bitmap->missing_pages,
1821 bitmap->pages,
1822 (bitmap->pages - bitmap->missing_pages)
1823 << (PAGE_SHIFT - 10),
1824 chunk_kb ? chunk_kb : bitmap->mddev->bitmap_info.chunksize,
1825 chunk_kb ? "KB" : "B");
1826 if (bitmap->file) {
1827 seq_printf(seq, ", file: ");
1828 seq_path(seq, &bitmap->file->f_path, " \t\n");
1829 }
1830
1831 seq_printf(seq, "\n");
1832 spin_unlock_irqrestore(&bitmap->lock, flags);
1833}
1834
1839static ssize_t 1835static ssize_t
1840location_show(struct mddev *mddev, char *page) 1836location_show(struct mddev *mddev, char *page)
1841{ 1837{
@@ -1904,6 +1900,8 @@ location_store(struct mddev *mddev, const char *buf, size_t len)
1904 if (mddev->pers) { 1900 if (mddev->pers) {
1905 mddev->pers->quiesce(mddev, 1); 1901 mddev->pers->quiesce(mddev, 1);
1906 rv = bitmap_create(mddev); 1902 rv = bitmap_create(mddev);
1903 if (!rv)
1904 rv = bitmap_load(mddev);
1907 if (rv) { 1905 if (rv) {
1908 bitmap_destroy(mddev); 1906 bitmap_destroy(mddev);
1909 mddev->bitmap_info.offset = 0; 1907 mddev->bitmap_info.offset = 0;
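A side note on the kmap_atomic() conversions that run through bitmap.c above (and dm-crypt.c further down): the atomic-kmap API in this kernel generation stops taking an explicit KM_USER0 slot, so callers now pass only the page and hand the mapped address back to kunmap_atomic(). A minimal sketch of the new calling convention; zero_one_page() is a hypothetical helper, and the page is assumed to be touched only briefly without sleeping:

    #include <linux/highmem.h>
    #include <linux/string.h>

    /* old form (removed):  addr = kmap_atomic(page, KM_USER0);
     *                      ...
     *                      kunmap_atomic(addr, KM_USER0);
     */
    static void zero_one_page(struct page *page)
    {
            void *addr = kmap_atomic(page);     /* slot handled implicitly */

            memset(addr, 0, PAGE_SIZE);         /* no sleeping while mapped */
            kunmap_atomic(addr);                /* takes the address, not the page */
    }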
diff --git a/drivers/md/bitmap.h b/drivers/md/bitmap.h
index a15436dd9b3e..55ca5aec84e4 100644
--- a/drivers/md/bitmap.h
+++ b/drivers/md/bitmap.h
@@ -13,8 +13,6 @@
13#define BITMAP_MAJOR_HI 4 13#define BITMAP_MAJOR_HI 4
14#define BITMAP_MAJOR_HOSTENDIAN 3 14#define BITMAP_MAJOR_HOSTENDIAN 3
15 15
16#define BITMAP_MINOR 39
17
18/* 16/*
19 * in-memory bitmap: 17 * in-memory bitmap:
20 * 18 *
@@ -101,21 +99,10 @@ typedef __u16 bitmap_counter_t;
101/* same, except a mask value for more efficient bitops */ 99/* same, except a mask value for more efficient bitops */
102#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1) 100#define PAGE_COUNTER_MASK (PAGE_COUNTER_RATIO - 1)
103 101
104#define BITMAP_BLOCK_SIZE 512
105#define BITMAP_BLOCK_SHIFT 9 102#define BITMAP_BLOCK_SHIFT 9
106 103
107/* how many blocks per chunk? (this is variable) */ 104/* how many blocks per chunk? (this is variable) */
108#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->mddev->bitmap_info.chunksize >> BITMAP_BLOCK_SHIFT) 105#define CHUNK_BLOCK_RATIO(bitmap) ((bitmap)->mddev->bitmap_info.chunksize >> BITMAP_BLOCK_SHIFT)
109#define CHUNK_BLOCK_SHIFT(bitmap) ((bitmap)->chunkshift - BITMAP_BLOCK_SHIFT)
110#define CHUNK_BLOCK_MASK(bitmap) (CHUNK_BLOCK_RATIO(bitmap) - 1)
111
112/* when hijacked, the counters and bits represent even larger "chunks" */
113/* there will be 1024 chunks represented by each counter in the page pointers */
114#define PAGEPTR_BLOCK_RATIO(bitmap) \
115 (CHUNK_BLOCK_RATIO(bitmap) << PAGE_COUNTER_SHIFT >> 1)
116#define PAGEPTR_BLOCK_SHIFT(bitmap) \
117 (CHUNK_BLOCK_SHIFT(bitmap) + PAGE_COUNTER_SHIFT - 1)
118#define PAGEPTR_BLOCK_MASK(bitmap) (PAGEPTR_BLOCK_RATIO(bitmap) - 1)
119 106
120#endif 107#endif
121 108
@@ -181,12 +168,6 @@ struct bitmap_page {
181 unsigned int count:31; 168 unsigned int count:31;
182}; 169};
183 170
184/* keep track of bitmap file pages that have pending writes on them */
185struct page_list {
186 struct list_head list;
187 struct page *page;
188};
189
190/* the main bitmap structure - one per mddev */ 171/* the main bitmap structure - one per mddev */
191struct bitmap { 172struct bitmap {
192 struct bitmap_page *bp; 173 struct bitmap_page *bp;
@@ -196,7 +177,7 @@ struct bitmap {
196 struct mddev *mddev; /* the md device that the bitmap is for */ 177 struct mddev *mddev; /* the md device that the bitmap is for */
197 178
198 /* bitmap chunksize -- how much data does each bit represent? */ 179 /* bitmap chunksize -- how much data does each bit represent? */
199 unsigned long chunkshift; /* chunksize = 2^chunkshift (for bitops) */ 180 unsigned long chunkshift; /* chunksize = 2^(chunkshift+9) (for bitops) */
200 unsigned long chunks; /* total number of data chunks for the array */ 181 unsigned long chunks; /* total number of data chunks for the array */
201 182
202 __u64 events_cleared; 183 __u64 events_cleared;
@@ -245,6 +226,7 @@ void bitmap_destroy(struct mddev *mddev);
245 226
246void bitmap_print_sb(struct bitmap *bitmap); 227void bitmap_print_sb(struct bitmap *bitmap);
247void bitmap_update_sb(struct bitmap *bitmap); 228void bitmap_update_sb(struct bitmap *bitmap);
229void bitmap_status(struct seq_file *seq, struct bitmap *bitmap);
248 230
249int bitmap_setallbits(struct bitmap *bitmap); 231int bitmap_setallbits(struct bitmap *bitmap);
250void bitmap_write_all(struct bitmap *bitmap); 232void bitmap_write_all(struct bitmap *bitmap);
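The bitmap.h hunk above redefines chunkshift so that the chunk size in bytes is 2^(chunkshift + 9); in other words chunkshift now counts 512-byte blocks directly, which is why bitmap_create() subtracts BITMAP_BLOCK_SHIFT when deriving it and why the CHUNK_BLOCK_SHIFT() macro could be dropped in favour of plain bitmap->chunkshift. A small sketch of the arithmetic, assuming a power-of-two chunksize in bytes; the helpers are illustrative, not kernel functions:

    #include <linux/bitops.h>
    #include <linux/types.h>

    #define BITMAP_BLOCK_SHIFT 9                    /* 512-byte blocks */

    /* chunkshift as stored after this series: log2(chunksize in blocks) */
    static unsigned long chunkshift_from_bytes(unsigned long chunksize)
    {
            /* ffz(~x) == log2(x) when x is a power of two */
            return ffz(~chunksize) - BITMAP_BLOCK_SHIFT;
    }

    /* map a sector offset to its bitmap chunk, as the patched code now does */
    static unsigned long chunk_of(sector_t offset, unsigned long chunkshift)
    {
            return offset >> chunkshift;
    }

For a 512 KiB chunk, for instance, chunkshift comes out as 19 - 9 = 10, so a sector offset is shifted right by 10 bits to find its chunk.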
diff --git a/drivers/md/dm-bufio.c b/drivers/md/dm-bufio.c
index 0a6806f80ab5..cc06a1e52423 100644
--- a/drivers/md/dm-bufio.c
+++ b/drivers/md/dm-bufio.c
@@ -12,7 +12,6 @@
12#include <linux/dm-io.h> 12#include <linux/dm-io.h>
13#include <linux/slab.h> 13#include <linux/slab.h>
14#include <linux/vmalloc.h> 14#include <linux/vmalloc.h>
15#include <linux/version.h>
16#include <linux/shrinker.h> 15#include <linux/shrinker.h>
17#include <linux/module.h> 16#include <linux/module.h>
18 17
@@ -579,7 +578,7 @@ static void write_endio(struct bio *bio, int error)
579 struct dm_buffer *b = container_of(bio, struct dm_buffer, bio); 578 struct dm_buffer *b = container_of(bio, struct dm_buffer, bio);
580 579
581 b->write_error = error; 580 b->write_error = error;
582 if (error) { 581 if (unlikely(error)) {
583 struct dm_bufio_client *c = b->c; 582 struct dm_bufio_client *c = b->c;
584 (void)cmpxchg(&c->async_write_error, 0, error); 583 (void)cmpxchg(&c->async_write_error, 0, error);
585 } 584 }
@@ -698,13 +697,20 @@ static void __wait_for_free_buffer(struct dm_bufio_client *c)
698 dm_bufio_lock(c); 697 dm_bufio_lock(c);
699} 698}
700 699
700enum new_flag {
701 NF_FRESH = 0,
702 NF_READ = 1,
703 NF_GET = 2,
704 NF_PREFETCH = 3
705};
706
701/* 707/*
702 * Allocate a new buffer. If the allocation is not possible, wait until 708 * Allocate a new buffer. If the allocation is not possible, wait until
703 * some other thread frees a buffer. 709 * some other thread frees a buffer.
704 * 710 *
705 * May drop the lock and regain it. 711 * May drop the lock and regain it.
706 */ 712 */
707static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c) 713static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client *c, enum new_flag nf)
708{ 714{
709 struct dm_buffer *b; 715 struct dm_buffer *b;
710 716
@@ -727,6 +733,9 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client
727 return b; 733 return b;
728 } 734 }
729 735
736 if (nf == NF_PREFETCH)
737 return NULL;
738
730 if (!list_empty(&c->reserved_buffers)) { 739 if (!list_empty(&c->reserved_buffers)) {
731 b = list_entry(c->reserved_buffers.next, 740 b = list_entry(c->reserved_buffers.next,
732 struct dm_buffer, lru_list); 741 struct dm_buffer, lru_list);
@@ -744,9 +753,12 @@ static struct dm_buffer *__alloc_buffer_wait_no_callback(struct dm_bufio_client
744 } 753 }
745} 754}
746 755
747static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c) 756static struct dm_buffer *__alloc_buffer_wait(struct dm_bufio_client *c, enum new_flag nf)
748{ 757{
749 struct dm_buffer *b = __alloc_buffer_wait_no_callback(c); 758 struct dm_buffer *b = __alloc_buffer_wait_no_callback(c, nf);
759
760 if (!b)
761 return NULL;
750 762
751 if (c->alloc_callback) 763 if (c->alloc_callback)
752 c->alloc_callback(b); 764 c->alloc_callback(b);
@@ -866,32 +878,23 @@ static struct dm_buffer *__find(struct dm_bufio_client *c, sector_t block)
866 * Getting a buffer 878 * Getting a buffer
867 *--------------------------------------------------------------*/ 879 *--------------------------------------------------------------*/
868 880
869enum new_flag {
870 NF_FRESH = 0,
871 NF_READ = 1,
872 NF_GET = 2
873};
874
875static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block, 881static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
876 enum new_flag nf, struct dm_buffer **bp, 882 enum new_flag nf, int *need_submit)
877 int *need_submit)
878{ 883{
879 struct dm_buffer *b, *new_b = NULL; 884 struct dm_buffer *b, *new_b = NULL;
880 885
881 *need_submit = 0; 886 *need_submit = 0;
882 887
883 b = __find(c, block); 888 b = __find(c, block);
884 if (b) { 889 if (b)
885 b->hold_count++; 890 goto found_buffer;
886 __relink_lru(b, test_bit(B_DIRTY, &b->state) ||
887 test_bit(B_WRITING, &b->state));
888 return b;
889 }
890 891
891 if (nf == NF_GET) 892 if (nf == NF_GET)
892 return NULL; 893 return NULL;
893 894
894 new_b = __alloc_buffer_wait(c); 895 new_b = __alloc_buffer_wait(c, nf);
896 if (!new_b)
897 return NULL;
895 898
896 /* 899 /*
897 * We've had a period where the mutex was unlocked, so need to 900 * We've had a period where the mutex was unlocked, so need to
@@ -900,10 +903,7 @@ static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
900 b = __find(c, block); 903 b = __find(c, block);
901 if (b) { 904 if (b) {
902 __free_buffer_wake(new_b); 905 __free_buffer_wake(new_b);
903 b->hold_count++; 906 goto found_buffer;
904 __relink_lru(b, test_bit(B_DIRTY, &b->state) ||
905 test_bit(B_WRITING, &b->state));
906 return b;
907 } 907 }
908 908
909 __check_watermark(c); 909 __check_watermark(c);
@@ -923,6 +923,24 @@ static struct dm_buffer *__bufio_new(struct dm_bufio_client *c, sector_t block,
923 *need_submit = 1; 923 *need_submit = 1;
924 924
925 return b; 925 return b;
926
927found_buffer:
928 if (nf == NF_PREFETCH)
929 return NULL;
930 /*
931 * Note: it is essential that we don't wait for the buffer to be
932 * read if dm_bufio_get function is used. Both dm_bufio_get and
933 * dm_bufio_prefetch can be used in the driver request routine.
934 * If the user called both dm_bufio_prefetch and dm_bufio_get on
935 * the same buffer, it would deadlock if we waited.
936 */
937 if (nf == NF_GET && unlikely(test_bit(B_READING, &b->state)))
938 return NULL;
939
940 b->hold_count++;
941 __relink_lru(b, test_bit(B_DIRTY, &b->state) ||
942 test_bit(B_WRITING, &b->state));
943 return b;
926} 944}
927 945
928/* 946/*
@@ -957,10 +975,10 @@ static void *new_read(struct dm_bufio_client *c, sector_t block,
957 struct dm_buffer *b; 975 struct dm_buffer *b;
958 976
959 dm_bufio_lock(c); 977 dm_bufio_lock(c);
960 b = __bufio_new(c, block, nf, bp, &need_submit); 978 b = __bufio_new(c, block, nf, &need_submit);
961 dm_bufio_unlock(c); 979 dm_bufio_unlock(c);
962 980
963 if (!b || IS_ERR(b)) 981 if (!b)
964 return b; 982 return b;
965 983
966 if (need_submit) 984 if (need_submit)
@@ -1006,13 +1024,47 @@ void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
1006} 1024}
1007EXPORT_SYMBOL_GPL(dm_bufio_new); 1025EXPORT_SYMBOL_GPL(dm_bufio_new);
1008 1026
1027void dm_bufio_prefetch(struct dm_bufio_client *c,
1028 sector_t block, unsigned n_blocks)
1029{
1030 struct blk_plug plug;
1031
1032 blk_start_plug(&plug);
1033 dm_bufio_lock(c);
1034
1035 for (; n_blocks--; block++) {
1036 int need_submit;
1037 struct dm_buffer *b;
1038 b = __bufio_new(c, block, NF_PREFETCH, &need_submit);
1039 if (unlikely(b != NULL)) {
1040 dm_bufio_unlock(c);
1041
1042 if (need_submit)
1043 submit_io(b, READ, b->block, read_endio);
1044 dm_bufio_release(b);
1045
1046 dm_bufio_cond_resched();
1047
1048 if (!n_blocks)
1049 goto flush_plug;
1050 dm_bufio_lock(c);
1051 }
1052
1053 }
1054
1055 dm_bufio_unlock(c);
1056
1057flush_plug:
1058 blk_finish_plug(&plug);
1059}
1060EXPORT_SYMBOL_GPL(dm_bufio_prefetch);
1061
1009void dm_bufio_release(struct dm_buffer *b) 1062void dm_bufio_release(struct dm_buffer *b)
1010{ 1063{
1011 struct dm_bufio_client *c = b->c; 1064 struct dm_bufio_client *c = b->c;
1012 1065
1013 dm_bufio_lock(c); 1066 dm_bufio_lock(c);
1014 1067
1015 BUG_ON(test_bit(B_READING, &b->state));
1016 BUG_ON(!b->hold_count); 1068 BUG_ON(!b->hold_count);
1017 1069
1018 b->hold_count--; 1070 b->hold_count--;
@@ -1025,6 +1077,7 @@ void dm_bufio_release(struct dm_buffer *b)
1025 * invalid buffer. 1077 * invalid buffer.
1026 */ 1078 */
1027 if ((b->read_error || b->write_error) && 1079 if ((b->read_error || b->write_error) &&
1080 !test_bit(B_READING, &b->state) &&
1028 !test_bit(B_WRITING, &b->state) && 1081 !test_bit(B_WRITING, &b->state) &&
1029 !test_bit(B_DIRTY, &b->state)) { 1082 !test_bit(B_DIRTY, &b->state)) {
1030 __unlink_buffer(b); 1083 __unlink_buffer(b);
@@ -1042,6 +1095,8 @@ void dm_bufio_mark_buffer_dirty(struct dm_buffer *b)
1042 1095
1043 dm_bufio_lock(c); 1096 dm_bufio_lock(c);
1044 1097
1098 BUG_ON(test_bit(B_READING, &b->state));
1099
1045 if (!test_and_set_bit(B_DIRTY, &b->state)) 1100 if (!test_and_set_bit(B_DIRTY, &b->state))
1046 __relink_lru(b, LIST_DIRTY); 1101 __relink_lru(b, LIST_DIRTY);
1047 1102
diff --git a/drivers/md/dm-bufio.h b/drivers/md/dm-bufio.h
index 5c4c3a04e381..b142946a9e32 100644
--- a/drivers/md/dm-bufio.h
+++ b/drivers/md/dm-bufio.h
@@ -63,6 +63,14 @@ void *dm_bufio_new(struct dm_bufio_client *c, sector_t block,
63 struct dm_buffer **bp); 63 struct dm_buffer **bp);
64 64
65/* 65/*
66 * Prefetch the specified blocks to the cache.
67 * The function starts to read the blocks and returns without waiting for
68 * I/O to finish.
69 */
70void dm_bufio_prefetch(struct dm_bufio_client *c,
71 sector_t block, unsigned n_blocks);
72
73/*
66 * Release a reference obtained with dm_bufio_{read,get,new}. The data 74 * Release a reference obtained with dm_bufio_{read,get,new}. The data
67 * pointer and dm_buffer pointer is no longer valid after this call. 75 * pointer and dm_buffer pointer is no longer valid after this call.
68 */ 76 */
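The new dm_bufio_prefetch() above starts reads for a run of blocks and returns without waiting for the I/O, and the comment added to __bufio_new() explains why the non-blocking dm_bufio_get() must never wait on a buffer that is still being read. A hedged sketch of how a caller might combine prefetch with the ordinary blocking dm_bufio_read(); the client, block range and process_block() helper are hypothetical, and the error handling assumes dm_bufio_read() reports a failed read as an ERR_PTR:

    #include <linux/err.h>
    #include "dm-bufio.h"

    static void process_block(void *data);          /* hypothetical consumer */

    static int process_range(struct dm_bufio_client *c,
                             sector_t first, unsigned count)
    {
            unsigned i;

            /* kick off I/O for the whole range; returns immediately */
            dm_bufio_prefetch(c, first, count);

            for (i = 0; i < count; i++) {
                    struct dm_buffer *buf;
                    void *data = dm_bufio_read(c, first + i, &buf);

                    if (IS_ERR(data))
                            return PTR_ERR(data);   /* read error */

                    process_block(data);
                    dm_bufio_release(buf);          /* data/buf invalid after this */
            }
            return 0;
    }

The dm-verity target added in the same series selects DM_BUFIO and is the sort of consumer this interface is aimed at.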
diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c
index 8c2a000cf3f5..3f06df59fd82 100644
--- a/drivers/md/dm-crypt.c
+++ b/drivers/md/dm-crypt.c
@@ -176,7 +176,6 @@ struct crypt_config {
176 176
177#define MIN_IOS 16 177#define MIN_IOS 16
178#define MIN_POOL_PAGES 32 178#define MIN_POOL_PAGES 32
179#define MIN_BIO_PAGES 8
180 179
181static struct kmem_cache *_crypt_io_pool; 180static struct kmem_cache *_crypt_io_pool;
182 181
@@ -590,9 +589,9 @@ static int crypt_iv_lmk_gen(struct crypt_config *cc, u8 *iv,
590 int r = 0; 589 int r = 0;
591 590
592 if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) { 591 if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) {
593 src = kmap_atomic(sg_page(&dmreq->sg_in), KM_USER0); 592 src = kmap_atomic(sg_page(&dmreq->sg_in));
594 r = crypt_iv_lmk_one(cc, iv, dmreq, src + dmreq->sg_in.offset); 593 r = crypt_iv_lmk_one(cc, iv, dmreq, src + dmreq->sg_in.offset);
595 kunmap_atomic(src, KM_USER0); 594 kunmap_atomic(src);
596 } else 595 } else
597 memset(iv, 0, cc->iv_size); 596 memset(iv, 0, cc->iv_size);
598 597
@@ -608,14 +607,14 @@ static int crypt_iv_lmk_post(struct crypt_config *cc, u8 *iv,
608 if (bio_data_dir(dmreq->ctx->bio_in) == WRITE) 607 if (bio_data_dir(dmreq->ctx->bio_in) == WRITE)
609 return 0; 608 return 0;
610 609
611 dst = kmap_atomic(sg_page(&dmreq->sg_out), KM_USER0); 610 dst = kmap_atomic(sg_page(&dmreq->sg_out));
612 r = crypt_iv_lmk_one(cc, iv, dmreq, dst + dmreq->sg_out.offset); 611 r = crypt_iv_lmk_one(cc, iv, dmreq, dst + dmreq->sg_out.offset);
613 612
614 /* Tweak the first block of plaintext sector */ 613 /* Tweak the first block of plaintext sector */
615 if (!r) 614 if (!r)
616 crypto_xor(dst + dmreq->sg_out.offset, iv, cc->iv_size); 615 crypto_xor(dst + dmreq->sg_out.offset, iv, cc->iv_size);
617 616
618 kunmap_atomic(dst, KM_USER0); 617 kunmap_atomic(dst);
619 return r; 618 return r;
620} 619}
621 620
@@ -848,12 +847,11 @@ static struct bio *crypt_alloc_buffer(struct dm_crypt_io *io, unsigned size,
848 } 847 }
849 848
850 /* 849 /*
851 * if additional pages cannot be allocated without waiting, 850 * If additional pages cannot be allocated without waiting,
852 * return a partially allocated bio, the caller will then try 851 * return a partially-allocated bio. The caller will then try
853 * to allocate additional bios while submitting this partial bio 852 * to allocate more bios while submitting this partial bio.
854 */ 853 */
855 if (i == (MIN_BIO_PAGES - 1)) 854 gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT;
856 gfp_mask = (gfp_mask | __GFP_NOWARN) & ~__GFP_WAIT;
857 855
858 len = (size > PAGE_SIZE) ? PAGE_SIZE : size; 856 len = (size > PAGE_SIZE) ? PAGE_SIZE : size;
859 857
@@ -1046,16 +1044,14 @@ static void kcryptd_queue_io(struct dm_crypt_io *io)
1046 queue_work(cc->io_queue, &io->work); 1044 queue_work(cc->io_queue, &io->work);
1047} 1045}
1048 1046
1049static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, 1047static void kcryptd_crypt_write_io_submit(struct dm_crypt_io *io, int async)
1050 int error, int async)
1051{ 1048{
1052 struct bio *clone = io->ctx.bio_out; 1049 struct bio *clone = io->ctx.bio_out;
1053 struct crypt_config *cc = io->target->private; 1050 struct crypt_config *cc = io->target->private;
1054 1051
1055 if (unlikely(error < 0)) { 1052 if (unlikely(io->error < 0)) {
1056 crypt_free_buffer_pages(cc, clone); 1053 crypt_free_buffer_pages(cc, clone);
1057 bio_put(clone); 1054 bio_put(clone);
1058 io->error = -EIO;
1059 crypt_dec_pending(io); 1055 crypt_dec_pending(io);
1060 return; 1056 return;
1061 } 1057 }
@@ -1106,12 +1102,16 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
1106 sector += bio_sectors(clone); 1102 sector += bio_sectors(clone);
1107 1103
1108 crypt_inc_pending(io); 1104 crypt_inc_pending(io);
1105
1109 r = crypt_convert(cc, &io->ctx); 1106 r = crypt_convert(cc, &io->ctx);
1107 if (r < 0)
1108 io->error = -EIO;
1109
1110 crypt_finished = atomic_dec_and_test(&io->ctx.pending); 1110 crypt_finished = atomic_dec_and_test(&io->ctx.pending);
1111 1111
1112 /* Encryption was already finished, submit io now */ 1112 /* Encryption was already finished, submit io now */
1113 if (crypt_finished) { 1113 if (crypt_finished) {
1114 kcryptd_crypt_write_io_submit(io, r, 0); 1114 kcryptd_crypt_write_io_submit(io, 0);
1115 1115
1116 /* 1116 /*
1117 * If there was an error, do not try next fragments. 1117 * If there was an error, do not try next fragments.
@@ -1162,11 +1162,8 @@ static void kcryptd_crypt_write_convert(struct dm_crypt_io *io)
1162 crypt_dec_pending(io); 1162 crypt_dec_pending(io);
1163} 1163}
1164 1164
1165static void kcryptd_crypt_read_done(struct dm_crypt_io *io, int error) 1165static void kcryptd_crypt_read_done(struct dm_crypt_io *io)
1166{ 1166{
1167 if (unlikely(error < 0))
1168 io->error = -EIO;
1169
1170 crypt_dec_pending(io); 1167 crypt_dec_pending(io);
1171} 1168}
1172 1169
@@ -1181,9 +1178,11 @@ static void kcryptd_crypt_read_convert(struct dm_crypt_io *io)
1181 io->sector); 1178 io->sector);
1182 1179
1183 r = crypt_convert(cc, &io->ctx); 1180 r = crypt_convert(cc, &io->ctx);
1181 if (r < 0)
1182 io->error = -EIO;
1184 1183
1185 if (atomic_dec_and_test(&io->ctx.pending)) 1184 if (atomic_dec_and_test(&io->ctx.pending))
1186 kcryptd_crypt_read_done(io, r); 1185 kcryptd_crypt_read_done(io);
1187 1186
1188 crypt_dec_pending(io); 1187 crypt_dec_pending(io);
1189} 1188}
@@ -1204,15 +1203,18 @@ static void kcryptd_async_done(struct crypto_async_request *async_req,
1204 if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post) 1203 if (!error && cc->iv_gen_ops && cc->iv_gen_ops->post)
1205 error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq); 1204 error = cc->iv_gen_ops->post(cc, iv_of_dmreq(cc, dmreq), dmreq);
1206 1205
1206 if (error < 0)
1207 io->error = -EIO;
1208
1207 mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool); 1209 mempool_free(req_of_dmreq(cc, dmreq), cc->req_pool);
1208 1210
1209 if (!atomic_dec_and_test(&ctx->pending)) 1211 if (!atomic_dec_and_test(&ctx->pending))
1210 return; 1212 return;
1211 1213
1212 if (bio_data_dir(io->base_bio) == READ) 1214 if (bio_data_dir(io->base_bio) == READ)
1213 kcryptd_crypt_read_done(io, error); 1215 kcryptd_crypt_read_done(io);
1214 else 1216 else
1215 kcryptd_crypt_write_io_submit(io, error, 1); 1217 kcryptd_crypt_write_io_submit(io, 1);
1216} 1218}
1217 1219
1218static void kcryptd_crypt(struct work_struct *work) 1220static void kcryptd_crypt(struct work_struct *work)
@@ -1413,6 +1415,7 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1413 char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount; 1415 char *tmp, *cipher, *chainmode, *ivmode, *ivopts, *keycount;
1414 char *cipher_api = NULL; 1416 char *cipher_api = NULL;
1415 int cpu, ret = -EINVAL; 1417 int cpu, ret = -EINVAL;
1418 char dummy;
1416 1419
1417 /* Convert to crypto api definition? */ 1420 /* Convert to crypto api definition? */
1418 if (strchr(cipher_in, '(')) { 1421 if (strchr(cipher_in, '(')) {
@@ -1434,7 +1437,7 @@ static int crypt_ctr_cipher(struct dm_target *ti,
1434 1437
1435 if (!keycount) 1438 if (!keycount)
1436 cc->tfms_count = 1; 1439 cc->tfms_count = 1;
1437 else if (sscanf(keycount, "%u", &cc->tfms_count) != 1 || 1440 else if (sscanf(keycount, "%u%c", &cc->tfms_count, &dummy) != 1 ||
1438 !is_power_of_2(cc->tfms_count)) { 1441 !is_power_of_2(cc->tfms_count)) {
1439 ti->error = "Bad cipher key count specification"; 1442 ti->error = "Bad cipher key count specification";
1440 return -EINVAL; 1443 return -EINVAL;
@@ -1579,6 +1582,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1579 int ret; 1582 int ret;
1580 struct dm_arg_set as; 1583 struct dm_arg_set as;
1581 const char *opt_string; 1584 const char *opt_string;
1585 char dummy;
1582 1586
1583 static struct dm_arg _args[] = { 1587 static struct dm_arg _args[] = {
1584 {0, 1, "Invalid number of feature args"}, 1588 {0, 1, "Invalid number of feature args"},
@@ -1636,7 +1640,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1636 } 1640 }
1637 1641
1638 ret = -EINVAL; 1642 ret = -EINVAL;
1639 if (sscanf(argv[2], "%llu", &tmpll) != 1) { 1643 if (sscanf(argv[2], "%llu%c", &tmpll, &dummy) != 1) {
1640 ti->error = "Invalid iv_offset sector"; 1644 ti->error = "Invalid iv_offset sector";
1641 goto bad; 1645 goto bad;
1642 } 1646 }
@@ -1647,7 +1651,7 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1647 goto bad; 1651 goto bad;
1648 } 1652 }
1649 1653
1650 if (sscanf(argv[4], "%llu", &tmpll) != 1) { 1654 if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1) {
1651 ti->error = "Invalid device sector"; 1655 ti->error = "Invalid device sector";
1652 goto bad; 1656 goto bad;
1653 } 1657 }
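The reworked comment in crypt_alloc_buffer() above describes the allocation strategy that now applies from the second page onward: once one page has been obtained, further allocations must not wait, and any shortfall simply produces a partially-allocated bio that the caller submits before trying again. A generic sketch of that opportunistic pattern outside dm-crypt, assuming a caller that can live with fewer pages than requested and using the gfp flag names of this kernel generation:

    #include <linux/gfp.h>
    #include <linux/mm.h>

    /*
     * Allocate up to nr_pages, blocking (and reclaiming) only for the
     * first one; later failures just shorten the result, mirroring the
     * partial-bio behaviour of crypt_alloc_buffer().
     */
    static unsigned alloc_pages_opportunistic(struct page **pages,
                                              unsigned nr_pages)
    {
            gfp_t gfp = GFP_NOIO;
            unsigned i;

            for (i = 0; i < nr_pages; i++) {
                    pages[i] = alloc_page(gfp);
                    if (!pages[i])
                            break;                  /* caller copes with i pages */

                    /* after the first success, never wait for reclaim */
                    gfp = (gfp | __GFP_NOWARN) & ~__GFP_WAIT;
            }
            return i;
    }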
diff --git a/drivers/md/dm-delay.c b/drivers/md/dm-delay.c
index f18375dcedd9..2dc22dddb2ae 100644
--- a/drivers/md/dm-delay.c
+++ b/drivers/md/dm-delay.c
@@ -131,6 +131,7 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
131{ 131{
132 struct delay_c *dc; 132 struct delay_c *dc;
133 unsigned long long tmpll; 133 unsigned long long tmpll;
134 char dummy;
134 135
135 if (argc != 3 && argc != 6) { 136 if (argc != 3 && argc != 6) {
136 ti->error = "requires exactly 3 or 6 arguments"; 137 ti->error = "requires exactly 3 or 6 arguments";
@@ -145,13 +146,13 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
145 146
146 dc->reads = dc->writes = 0; 147 dc->reads = dc->writes = 0;
147 148
148 if (sscanf(argv[1], "%llu", &tmpll) != 1) { 149 if (sscanf(argv[1], "%llu%c", &tmpll, &dummy) != 1) {
149 ti->error = "Invalid device sector"; 150 ti->error = "Invalid device sector";
150 goto bad; 151 goto bad;
151 } 152 }
152 dc->start_read = tmpll; 153 dc->start_read = tmpll;
153 154
154 if (sscanf(argv[2], "%u", &dc->read_delay) != 1) { 155 if (sscanf(argv[2], "%u%c", &dc->read_delay, &dummy) != 1) {
155 ti->error = "Invalid delay"; 156 ti->error = "Invalid delay";
156 goto bad; 157 goto bad;
157 } 158 }
@@ -166,13 +167,13 @@ static int delay_ctr(struct dm_target *ti, unsigned int argc, char **argv)
166 if (argc == 3) 167 if (argc == 3)
167 goto out; 168 goto out;
168 169
169 if (sscanf(argv[4], "%llu", &tmpll) != 1) { 170 if (sscanf(argv[4], "%llu%c", &tmpll, &dummy) != 1) {
170 ti->error = "Invalid write device sector"; 171 ti->error = "Invalid write device sector";
171 goto bad_dev_read; 172 goto bad_dev_read;
172 } 173 }
173 dc->start_write = tmpll; 174 dc->start_write = tmpll;
174 175
175 if (sscanf(argv[5], "%u", &dc->write_delay) != 1) { 176 if (sscanf(argv[5], "%u%c", &dc->write_delay, &dummy) != 1) {
176 ti->error = "Invalid write delay"; 177 ti->error = "Invalid write delay";
177 goto bad_dev_read; 178 goto bad_dev_read;
178 } 179 }
diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c
index 042e71996569..aa70f7d43a1a 100644
--- a/drivers/md/dm-exception-store.c
+++ b/drivers/md/dm-exception-store.c
@@ -283,7 +283,7 @@ int dm_exception_store_init(void)
283 return 0; 283 return 0;
284 284
285persistent_fail: 285persistent_fail:
286 dm_persistent_snapshot_exit(); 286 dm_transient_snapshot_exit();
287transient_fail: 287transient_fail:
288 return r; 288 return r;
289} 289}
diff --git a/drivers/md/dm-flakey.c b/drivers/md/dm-flakey.c
index 9fb18c147825..ac49c01f1a44 100644
--- a/drivers/md/dm-flakey.c
+++ b/drivers/md/dm-flakey.c
@@ -160,6 +160,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
160 unsigned long long tmpll; 160 unsigned long long tmpll;
161 struct dm_arg_set as; 161 struct dm_arg_set as;
162 const char *devname; 162 const char *devname;
163 char dummy;
163 164
164 as.argc = argc; 165 as.argc = argc;
165 as.argv = argv; 166 as.argv = argv;
@@ -178,7 +179,7 @@ static int flakey_ctr(struct dm_target *ti, unsigned int argc, char **argv)
178 179
179 devname = dm_shift_arg(&as); 180 devname = dm_shift_arg(&as);
180 181
181 if (sscanf(dm_shift_arg(&as), "%llu", &tmpll) != 1) { 182 if (sscanf(dm_shift_arg(&as), "%llu%c", &tmpll, &dummy) != 1) {
182 ti->error = "Invalid device sector"; 183 ti->error = "Invalid device sector";
183 goto bad; 184 goto bad;
184 } 185 }
@@ -323,7 +324,7 @@ static int flakey_end_io(struct dm_target *ti, struct bio *bio,
323 * Corrupt successful READs while in down state. 324 * Corrupt successful READs while in down state.
324 * If flags were specified, only corrupt those that match. 325 * If flags were specified, only corrupt those that match.
325 */ 326 */
326 if (!error && bio_submitted_while_down && 327 if (fc->corrupt_bio_byte && !error && bio_submitted_while_down &&
327 (bio_data_dir(bio) == READ) && (fc->corrupt_bio_rw == READ) && 328 (bio_data_dir(bio) == READ) && (fc->corrupt_bio_rw == READ) &&
328 all_corrupt_bio_flags_match(bio, fc)) 329 all_corrupt_bio_flags_match(bio, fc))
329 corrupt_bio_data(bio, fc); 330 corrupt_bio_data(bio, fc);
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index ad2eba40e319..ea5dd289fe2a 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -296,6 +296,8 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
296 unsigned offset; 296 unsigned offset;
297 unsigned num_bvecs; 297 unsigned num_bvecs;
298 sector_t remaining = where->count; 298 sector_t remaining = where->count;
299 struct request_queue *q = bdev_get_queue(where->bdev);
300 sector_t discard_sectors;
299 301
300 /* 302 /*
301 * where->count may be zero if rw holds a flush and we need to 303 * where->count may be zero if rw holds a flush and we need to
@@ -305,9 +307,12 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
305 /* 307 /*
306 * Allocate a suitably sized-bio. 308 * Allocate a suitably sized-bio.
307 */ 309 */
308 num_bvecs = dm_sector_div_up(remaining, 310 if (rw & REQ_DISCARD)
309 (PAGE_SIZE >> SECTOR_SHIFT)); 311 num_bvecs = 1;
310 num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev), num_bvecs); 312 else
313 num_bvecs = min_t(int, bio_get_nr_vecs(where->bdev),
314 dm_sector_div_up(remaining, (PAGE_SIZE >> SECTOR_SHIFT)));
315
311 bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios); 316 bio = bio_alloc_bioset(GFP_NOIO, num_bvecs, io->client->bios);
312 bio->bi_sector = where->sector + (where->count - remaining); 317 bio->bi_sector = where->sector + (where->count - remaining);
313 bio->bi_bdev = where->bdev; 318 bio->bi_bdev = where->bdev;
@@ -315,10 +320,14 @@ static void do_region(int rw, unsigned region, struct dm_io_region *where,
315 bio->bi_destructor = dm_bio_destructor; 320 bio->bi_destructor = dm_bio_destructor;
316 store_io_and_region_in_bio(bio, io, region); 321 store_io_and_region_in_bio(bio, io, region);
317 322
318 /* 323 if (rw & REQ_DISCARD) {
319 * Try and add as many pages as possible. 324 discard_sectors = min_t(sector_t, q->limits.max_discard_sectors, remaining);
320 */ 325 bio->bi_size = discard_sectors << SECTOR_SHIFT;
321 while (remaining) { 326 remaining -= discard_sectors;
327 } else while (remaining) {
328 /*
329 * Try and add as many pages as possible.
330 */
322 dp->get_page(dp, &page, &len, &offset); 331 dp->get_page(dp, &page, &len, &offset);
323 len = min(len, to_bytes(remaining)); 332 len = min(len, to_bytes(remaining));
324 if (!bio_add_page(bio, page, len, offset)) 333 if (!bio_add_page(bio, page, len, offset))
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index 31c2dc25886d..a1a3e6df17b8 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -880,6 +880,7 @@ static int dev_set_geometry(struct dm_ioctl *param, size_t param_size)
880 struct hd_geometry geometry; 880 struct hd_geometry geometry;
881 unsigned long indata[4]; 881 unsigned long indata[4];
882 char *geostr = (char *) param + param->data_start; 882 char *geostr = (char *) param + param->data_start;
883 char dummy;
883 884
884 md = find_device(param); 885 md = find_device(param);
885 if (!md) 886 if (!md)
@@ -891,8 +892,8 @@ static int dev_set_geometry(struct dm_ioctl *param, size_t param_size)
891 goto out; 892 goto out;
892 } 893 }
893 894
894 x = sscanf(geostr, "%lu %lu %lu %lu", indata, 895 x = sscanf(geostr, "%lu %lu %lu %lu%c", indata,
895 indata + 1, indata + 2, indata + 3); 896 indata + 1, indata + 2, indata + 3, &dummy);
896 897
897 if (x != 4) { 898 if (x != 4) {
898 DMWARN("Unable to interpret geometry settings."); 899 DMWARN("Unable to interpret geometry settings.");
@@ -1437,7 +1438,7 @@ static int target_message(struct dm_ioctl *param, size_t param_size)
1437 1438
1438 if (!argc) { 1439 if (!argc) {
1439 DMWARN("Empty message received."); 1440 DMWARN("Empty message received.");
1440 goto out; 1441 goto out_argv;
1441 } 1442 }
1442 1443
1443 table = dm_get_live_table(md); 1444 table = dm_get_live_table(md);
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index 9728839f844a..3639eeab6042 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -29,6 +29,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
29{ 29{
30 struct linear_c *lc; 30 struct linear_c *lc;
31 unsigned long long tmp; 31 unsigned long long tmp;
32 char dummy;
32 33
33 if (argc != 2) { 34 if (argc != 2) {
34 ti->error = "Invalid argument count"; 35 ti->error = "Invalid argument count";
@@ -41,7 +42,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
41 return -ENOMEM; 42 return -ENOMEM;
42 } 43 }
43 44
44 if (sscanf(argv[1], "%llu", &tmp) != 1) { 45 if (sscanf(argv[1], "%llu%c", &tmp, &dummy) != 1) {
45 ti->error = "dm-linear: Invalid device sector"; 46 ti->error = "dm-linear: Invalid device sector";
46 goto bad; 47 goto bad;
47 } 48 }
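This is the first of many conversions in this series to the sscanf() "%<fmt>%c" idiom: the trailing %c only matches if characters follow the number, so any return value other than the expected count rejects input such as "1024x" that a bare "%llu" would silently accept. A small standalone (userspace) demonstration of the idiom; parse_sector() is a made-up helper, not a dm interface.

#include <stdio.h>

static int parse_sector(const char *str, unsigned long long *out)
{
	unsigned long long tmp;
	char dummy;

	/* A return of 1 means the number matched and nothing trailed it. */
	if (sscanf(str, "%llu%c", &tmp, &dummy) != 1)
		return -1;
	*out = tmp;
	return 0;
}

int main(void)
{
	unsigned long long v;

	printf("\"1024\"  -> %d\n", parse_sector("1024", &v));	/* 0: accepted */
	printf("\"1024x\" -> %d\n", parse_sector("1024x", &v));	/* -1: trailing junk */
	printf("\"\"      -> %d\n", parse_sector("", &v));	/* -1: no number */
	return 0;
}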
diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c
index 3b52bb72bd1f..65ebaebf502b 100644
--- a/drivers/md/dm-log.c
+++ b/drivers/md/dm-log.c
@@ -369,6 +369,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
369 unsigned int region_count; 369 unsigned int region_count;
370 size_t bitset_size, buf_size; 370 size_t bitset_size, buf_size;
371 int r; 371 int r;
372 char dummy;
372 373
373 if (argc < 1 || argc > 2) { 374 if (argc < 1 || argc > 2) {
374 DMWARN("wrong number of arguments to dirty region log"); 375 DMWARN("wrong number of arguments to dirty region log");
@@ -387,7 +388,7 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti,
387 } 388 }
388 } 389 }
389 390
390 if (sscanf(argv[0], "%u", &region_size) != 1 || 391 if (sscanf(argv[0], "%u%c", &region_size, &dummy) != 1 ||
391 !_check_region_size(ti, region_size)) { 392 !_check_region_size(ti, region_size)) {
392 DMWARN("invalid region size %s", argv[0]); 393 DMWARN("invalid region size %s", argv[0]);
393 return -EINVAL; 394 return -EINVAL;
diff --git a/drivers/md/dm-mpath.c b/drivers/md/dm-mpath.c
index 801d92d237cf..922a3385eead 100644
--- a/drivers/md/dm-mpath.c
+++ b/drivers/md/dm-mpath.c
@@ -226,6 +226,27 @@ static void free_multipath(struct multipath *m)
226 kfree(m); 226 kfree(m);
227} 227}
228 228
229static int set_mapinfo(struct multipath *m, union map_info *info)
230{
231 struct dm_mpath_io *mpio;
232
233 mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC);
234 if (!mpio)
235 return -ENOMEM;
236
237 memset(mpio, 0, sizeof(*mpio));
238 info->ptr = mpio;
239
240 return 0;
241}
242
243static void clear_mapinfo(struct multipath *m, union map_info *info)
244{
245 struct dm_mpath_io *mpio = info->ptr;
246
247 info->ptr = NULL;
248 mempool_free(mpio, m->mpio_pool);
249}
229 250
230/*----------------------------------------------- 251/*-----------------------------------------------
231 * Path selection 252 * Path selection
@@ -341,13 +362,14 @@ static int __must_push_back(struct multipath *m)
341} 362}
342 363
343static int map_io(struct multipath *m, struct request *clone, 364static int map_io(struct multipath *m, struct request *clone,
344 struct dm_mpath_io *mpio, unsigned was_queued) 365 union map_info *map_context, unsigned was_queued)
345{ 366{
346 int r = DM_MAPIO_REMAPPED; 367 int r = DM_MAPIO_REMAPPED;
347 size_t nr_bytes = blk_rq_bytes(clone); 368 size_t nr_bytes = blk_rq_bytes(clone);
348 unsigned long flags; 369 unsigned long flags;
349 struct pgpath *pgpath; 370 struct pgpath *pgpath;
350 struct block_device *bdev; 371 struct block_device *bdev;
372 struct dm_mpath_io *mpio = map_context->ptr;
351 373
352 spin_lock_irqsave(&m->lock, flags); 374 spin_lock_irqsave(&m->lock, flags);
353 375
@@ -423,7 +445,6 @@ static void dispatch_queued_ios(struct multipath *m)
423{ 445{
424 int r; 446 int r;
425 unsigned long flags; 447 unsigned long flags;
426 struct dm_mpath_io *mpio;
427 union map_info *info; 448 union map_info *info;
428 struct request *clone, *n; 449 struct request *clone, *n;
429 LIST_HEAD(cl); 450 LIST_HEAD(cl);
@@ -436,16 +457,15 @@ static void dispatch_queued_ios(struct multipath *m)
436 list_del_init(&clone->queuelist); 457 list_del_init(&clone->queuelist);
437 458
438 info = dm_get_rq_mapinfo(clone); 459 info = dm_get_rq_mapinfo(clone);
439 mpio = info->ptr;
440 460
441 r = map_io(m, clone, mpio, 1); 461 r = map_io(m, clone, info, 1);
442 if (r < 0) { 462 if (r < 0) {
443 mempool_free(mpio, m->mpio_pool); 463 clear_mapinfo(m, info);
444 dm_kill_unmapped_request(clone, r); 464 dm_kill_unmapped_request(clone, r);
445 } else if (r == DM_MAPIO_REMAPPED) 465 } else if (r == DM_MAPIO_REMAPPED)
446 dm_dispatch_request(clone); 466 dm_dispatch_request(clone);
447 else if (r == DM_MAPIO_REQUEUE) { 467 else if (r == DM_MAPIO_REQUEUE) {
448 mempool_free(mpio, m->mpio_pool); 468 clear_mapinfo(m, info);
449 dm_requeue_unmapped_request(clone); 469 dm_requeue_unmapped_request(clone);
450 } 470 }
451 } 471 }
@@ -908,20 +928,16 @@ static int multipath_map(struct dm_target *ti, struct request *clone,
908 union map_info *map_context) 928 union map_info *map_context)
909{ 929{
910 int r; 930 int r;
911 struct dm_mpath_io *mpio;
912 struct multipath *m = (struct multipath *) ti->private; 931 struct multipath *m = (struct multipath *) ti->private;
913 932
914 mpio = mempool_alloc(m->mpio_pool, GFP_ATOMIC); 933 if (set_mapinfo(m, map_context) < 0)
915 if (!mpio)
916 /* ENOMEM, requeue */ 934 /* ENOMEM, requeue */
917 return DM_MAPIO_REQUEUE; 935 return DM_MAPIO_REQUEUE;
918 memset(mpio, 0, sizeof(*mpio));
919 936
920 map_context->ptr = mpio;
921 clone->cmd_flags |= REQ_FAILFAST_TRANSPORT; 937 clone->cmd_flags |= REQ_FAILFAST_TRANSPORT;
922 r = map_io(m, clone, mpio, 0); 938 r = map_io(m, clone, map_context, 0);
923 if (r < 0 || r == DM_MAPIO_REQUEUE) 939 if (r < 0 || r == DM_MAPIO_REQUEUE)
924 mempool_free(mpio, m->mpio_pool); 940 clear_mapinfo(m, map_context);
925 941
926 return r; 942 return r;
927} 943}
@@ -1054,8 +1070,9 @@ static int switch_pg_num(struct multipath *m, const char *pgstr)
1054 struct priority_group *pg; 1070 struct priority_group *pg;
1055 unsigned pgnum; 1071 unsigned pgnum;
1056 unsigned long flags; 1072 unsigned long flags;
1073 char dummy;
1057 1074
1058 if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum || 1075 if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
1059 (pgnum > m->nr_priority_groups)) { 1076 (pgnum > m->nr_priority_groups)) {
1060 DMWARN("invalid PG number supplied to switch_pg_num"); 1077 DMWARN("invalid PG number supplied to switch_pg_num");
1061 return -EINVAL; 1078 return -EINVAL;
@@ -1085,8 +1102,9 @@ static int bypass_pg_num(struct multipath *m, const char *pgstr, int bypassed)
1085{ 1102{
1086 struct priority_group *pg; 1103 struct priority_group *pg;
1087 unsigned pgnum; 1104 unsigned pgnum;
1105 char dummy;
1088 1106
1089 if (!pgstr || (sscanf(pgstr, "%u", &pgnum) != 1) || !pgnum || 1107 if (!pgstr || (sscanf(pgstr, "%u%c", &pgnum, &dummy) != 1) || !pgnum ||
1090 (pgnum > m->nr_priority_groups)) { 1108 (pgnum > m->nr_priority_groups)) {
1091 DMWARN("invalid PG number supplied to bypass_pg"); 1109 DMWARN("invalid PG number supplied to bypass_pg");
1092 return -EINVAL; 1110 return -EINVAL;
@@ -1261,13 +1279,15 @@ static int multipath_end_io(struct dm_target *ti, struct request *clone,
1261 struct path_selector *ps; 1279 struct path_selector *ps;
1262 int r; 1280 int r;
1263 1281
1282 BUG_ON(!mpio);
1283
1264 r = do_end_io(m, clone, error, mpio); 1284 r = do_end_io(m, clone, error, mpio);
1265 if (pgpath) { 1285 if (pgpath) {
1266 ps = &pgpath->pg->ps; 1286 ps = &pgpath->pg->ps;
1267 if (ps->type->end_io) 1287 if (ps->type->end_io)
1268 ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes); 1288 ps->type->end_io(ps, &pgpath->path, mpio->nr_bytes);
1269 } 1289 }
1270 mempool_free(mpio, m->mpio_pool); 1290 clear_mapinfo(m, map_context);
1271 1291
1272 return r; 1292 return r;
1273} 1293}
diff --git a/drivers/md/dm-queue-length.c b/drivers/md/dm-queue-length.c
index 03a837aa5ce6..3941fae0de9f 100644
--- a/drivers/md/dm-queue-length.c
+++ b/drivers/md/dm-queue-length.c
@@ -112,6 +112,7 @@ static int ql_add_path(struct path_selector *ps, struct dm_path *path,
112 struct selector *s = ps->context; 112 struct selector *s = ps->context;
113 struct path_info *pi; 113 struct path_info *pi;
114 unsigned repeat_count = QL_MIN_IO; 114 unsigned repeat_count = QL_MIN_IO;
115 char dummy;
115 116
116 /* 117 /*
117 * Arguments: [<repeat_count>] 118 * Arguments: [<repeat_count>]
@@ -123,7 +124,7 @@ static int ql_add_path(struct path_selector *ps, struct dm_path *path,
123 return -EINVAL; 124 return -EINVAL;
124 } 125 }
125 126
126 if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) { 127 if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
127 *error = "queue-length ps: invalid repeat count"; 128 *error = "queue-length ps: invalid repeat count";
128 return -EINVAL; 129 return -EINVAL;
129 } 130 }
diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c
index 86cb7e5d83d5..b0ba52459ed7 100644
--- a/drivers/md/dm-raid.c
+++ b/drivers/md/dm-raid.c
@@ -604,7 +604,9 @@ static int read_disk_sb(struct md_rdev *rdev, int size)
604 return 0; 604 return 0;
605 605
606 if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) { 606 if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, 1)) {
607 DMERR("Failed to read device superblock"); 607 DMERR("Failed to read superblock of device at position %d",
608 rdev->raid_disk);
609 set_bit(Faulty, &rdev->flags);
608 return -EINVAL; 610 return -EINVAL;
609 } 611 }
610 612
@@ -615,14 +617,14 @@ static int read_disk_sb(struct md_rdev *rdev, int size)
615 617
616static void super_sync(struct mddev *mddev, struct md_rdev *rdev) 618static void super_sync(struct mddev *mddev, struct md_rdev *rdev)
617{ 619{
618 struct md_rdev *r, *t; 620 struct md_rdev *r;
619 uint64_t failed_devices; 621 uint64_t failed_devices;
620 struct dm_raid_superblock *sb; 622 struct dm_raid_superblock *sb;
621 623
622 sb = page_address(rdev->sb_page); 624 sb = page_address(rdev->sb_page);
623 failed_devices = le64_to_cpu(sb->failed_devices); 625 failed_devices = le64_to_cpu(sb->failed_devices);
624 626
625 rdev_for_each(r, t, mddev) 627 rdev_for_each(r, mddev)
626 if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags)) 628 if ((r->raid_disk >= 0) && test_bit(Faulty, &r->flags))
627 failed_devices |= (1ULL << r->raid_disk); 629 failed_devices |= (1ULL << r->raid_disk);
628 630
@@ -668,7 +670,14 @@ static int super_load(struct md_rdev *rdev, struct md_rdev *refdev)
668 return ret; 670 return ret;
669 671
670 sb = page_address(rdev->sb_page); 672 sb = page_address(rdev->sb_page);
671 if (sb->magic != cpu_to_le32(DM_RAID_MAGIC)) { 673
674 /*
675 * Two cases that we want to write new superblocks and rebuild:
676 * 1) New device (no matching magic number)
677 * 2) Device specified for rebuild (!In_sync w/ offset == 0)
678 */
679 if ((sb->magic != cpu_to_le32(DM_RAID_MAGIC)) ||
680 (!test_bit(In_sync, &rdev->flags) && !rdev->recovery_offset)) {
672 super_sync(rdev->mddev, rdev); 681 super_sync(rdev->mddev, rdev);
673 682
674 set_bit(FirstUse, &rdev->flags); 683 set_bit(FirstUse, &rdev->flags);
@@ -700,7 +709,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
700 struct dm_raid_superblock *sb; 709 struct dm_raid_superblock *sb;
701 uint32_t new_devs = 0; 710 uint32_t new_devs = 0;
702 uint32_t rebuilds = 0; 711 uint32_t rebuilds = 0;
703 struct md_rdev *r, *t; 712 struct md_rdev *r;
704 struct dm_raid_superblock *sb2; 713 struct dm_raid_superblock *sb2;
705 714
706 sb = page_address(rdev->sb_page); 715 sb = page_address(rdev->sb_page);
@@ -743,13 +752,10 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
743 * case the In_sync bit will /not/ be set and 752 * case the In_sync bit will /not/ be set and
744 * recovery_cp must be MaxSector. 753 * recovery_cp must be MaxSector.
745 */ 754 */
746 rdev_for_each(r, t, mddev) { 755 rdev_for_each(r, mddev) {
747 if (!test_bit(In_sync, &r->flags)) { 756 if (!test_bit(In_sync, &r->flags)) {
748 if (!test_bit(FirstUse, &r->flags)) 757 DMINFO("Device %d specified for rebuild: "
749 DMERR("Superblock area of " 758 "Clearing superblock", r->raid_disk);
750 "rebuild device %d should have been "
751 "cleared.", r->raid_disk);
752 set_bit(FirstUse, &r->flags);
753 rebuilds++; 759 rebuilds++;
754 } else if (test_bit(FirstUse, &r->flags)) 760 } else if (test_bit(FirstUse, &r->flags))
755 new_devs++; 761 new_devs++;
@@ -778,7 +784,7 @@ static int super_init_validation(struct mddev *mddev, struct md_rdev *rdev)
778 * Now we set the Faulty bit for those devices that are 784 * Now we set the Faulty bit for those devices that are
779 * recorded in the superblock as failed. 785 * recorded in the superblock as failed.
780 */ 786 */
781 rdev_for_each(r, t, mddev) { 787 rdev_for_each(r, mddev) {
782 if (!r->sb_page) 788 if (!r->sb_page)
783 continue; 789 continue;
784 sb2 = page_address(r->sb_page); 790 sb2 = page_address(r->sb_page);
@@ -851,11 +857,27 @@ static int super_validate(struct mddev *mddev, struct md_rdev *rdev)
851static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs) 857static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
852{ 858{
853 int ret; 859 int ret;
854 struct md_rdev *rdev, *freshest, *tmp; 860 unsigned redundancy = 0;
861 struct raid_dev *dev;
862 struct md_rdev *rdev, *freshest;
855 struct mddev *mddev = &rs->md; 863 struct mddev *mddev = &rs->md;
856 864
865 switch (rs->raid_type->level) {
866 case 1:
867 redundancy = rs->md.raid_disks - 1;
868 break;
869 case 4:
870 case 5:
871 case 6:
872 redundancy = rs->raid_type->parity_devs;
873 break;
874 default:
875 ti->error = "Unknown RAID type";
876 return -EINVAL;
877 }
878
857 freshest = NULL; 879 freshest = NULL;
858 rdev_for_each(rdev, tmp, mddev) { 880 rdev_for_each(rdev, mddev) {
859 if (!rdev->meta_bdev) 881 if (!rdev->meta_bdev)
860 continue; 882 continue;
861 883
@@ -868,6 +890,37 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
868 case 0: 890 case 0:
869 break; 891 break;
870 default: 892 default:
893 dev = container_of(rdev, struct raid_dev, rdev);
894 if (redundancy--) {
895 if (dev->meta_dev)
896 dm_put_device(ti, dev->meta_dev);
897
898 dev->meta_dev = NULL;
899 rdev->meta_bdev = NULL;
900
901 if (rdev->sb_page)
902 put_page(rdev->sb_page);
903
904 rdev->sb_page = NULL;
905
906 rdev->sb_loaded = 0;
907
908 /*
909 * We might be able to salvage the data device
910 * even though the meta device has failed. For
911 * now, we behave as though '- -' had been
912 * set for this device in the table.
913 */
914 if (dev->data_dev)
915 dm_put_device(ti, dev->data_dev);
916
917 dev->data_dev = NULL;
918 rdev->bdev = NULL;
919
920 list_del(&rdev->same_set);
921
922 continue;
923 }
871 ti->error = "Failed to load superblock"; 924 ti->error = "Failed to load superblock";
872 return ret; 925 return ret;
873 } 926 }
@@ -884,7 +937,7 @@ static int analyse_superblocks(struct dm_target *ti, struct raid_set *rs)
884 if (super_validate(mddev, freshest)) 937 if (super_validate(mddev, freshest))
885 return -EINVAL; 938 return -EINVAL;
886 939
887 rdev_for_each(rdev, tmp, mddev) 940 rdev_for_each(rdev, mddev)
888 if ((rdev != freshest) && super_validate(mddev, rdev)) 941 if ((rdev != freshest) && super_validate(mddev, rdev))
889 return -EINVAL; 942 return -EINVAL;
890 943
@@ -971,6 +1024,7 @@ static int raid_ctr(struct dm_target *ti, unsigned argc, char **argv)
971 1024
972 INIT_WORK(&rs->md.event_work, do_table_event); 1025 INIT_WORK(&rs->md.event_work, do_table_event);
973 ti->private = rs; 1026 ti->private = rs;
1027 ti->num_flush_requests = 1;
974 1028
975 mutex_lock(&rs->md.reconfig_mutex); 1029 mutex_lock(&rs->md.reconfig_mutex);
976 ret = md_run(&rs->md); 1030 ret = md_run(&rs->md);
@@ -1209,7 +1263,7 @@ static void raid_resume(struct dm_target *ti)
1209 1263
1210static struct target_type raid_target = { 1264static struct target_type raid_target = {
1211 .name = "raid", 1265 .name = "raid",
1212 .version = {1, 1, 0}, 1266 .version = {1, 2, 0},
1213 .module = THIS_MODULE, 1267 .module = THIS_MODULE,
1214 .ctr = raid_ctr, 1268 .ctr = raid_ctr,
1215 .dtr = raid_dtr, 1269 .dtr = raid_dtr,
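The analyse_superblocks() rework above lets a raid set tolerate metadata-device failures up to the array's redundancy: raid_disks - 1 for raid1, and parity_devs (1 for raid4/5, 2 for raid6) otherwise; a device whose superblock cannot be read is degraded to the equivalent of a '- -' table entry instead of failing the constructor. A standalone illustration of that accounting with assumed values; redundancy() simply mirrors the switch in the patch and is not dm-raid code.

#include <stdio.h>

static int redundancy(int level, int raid_disks, int parity_devs)
{
	switch (level) {
	case 1:
		return raid_disks - 1;	/* all mirrors but one may lose metadata */
	case 4:
	case 5:
	case 6:
		return parity_devs;	/* 1 for raid4/5, 2 for raid6 */
	default:
		return -1;		/* unknown RAID type */
	}
}

int main(void)
{
	printf("raid1, 3 devices: %d tolerated\n", redundancy(1, 3, 0));	/* 2 */
	printf("raid5, 4 devices: %d tolerated\n", redundancy(5, 4, 1));	/* 1 */
	printf("raid6, 5 devices: %d tolerated\n", redundancy(6, 5, 2));	/* 2 */
	return 0;
}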
diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c
index 9bfd057be686..d039de8322f0 100644
--- a/drivers/md/dm-raid1.c
+++ b/drivers/md/dm-raid1.c
@@ -924,8 +924,9 @@ static int get_mirror(struct mirror_set *ms, struct dm_target *ti,
924 unsigned int mirror, char **argv) 924 unsigned int mirror, char **argv)
925{ 925{
926 unsigned long long offset; 926 unsigned long long offset;
927 char dummy;
927 928
928 if (sscanf(argv[1], "%llu", &offset) != 1) { 929 if (sscanf(argv[1], "%llu%c", &offset, &dummy) != 1) {
929 ti->error = "Invalid offset"; 930 ti->error = "Invalid offset";
930 return -EINVAL; 931 return -EINVAL;
931 } 932 }
@@ -953,13 +954,14 @@ static struct dm_dirty_log *create_dirty_log(struct dm_target *ti,
953{ 954{
954 unsigned param_count; 955 unsigned param_count;
955 struct dm_dirty_log *dl; 956 struct dm_dirty_log *dl;
957 char dummy;
956 958
957 if (argc < 2) { 959 if (argc < 2) {
958 ti->error = "Insufficient mirror log arguments"; 960 ti->error = "Insufficient mirror log arguments";
959 return NULL; 961 return NULL;
960 } 962 }
961 963
962 if (sscanf(argv[1], "%u", &param_count) != 1) { 964 if (sscanf(argv[1], "%u%c", &param_count, &dummy) != 1) {
963 ti->error = "Invalid mirror log argument count"; 965 ti->error = "Invalid mirror log argument count";
964 return NULL; 966 return NULL;
965 } 967 }
@@ -986,13 +988,14 @@ static int parse_features(struct mirror_set *ms, unsigned argc, char **argv,
986{ 988{
987 unsigned num_features; 989 unsigned num_features;
988 struct dm_target *ti = ms->ti; 990 struct dm_target *ti = ms->ti;
991 char dummy;
989 992
990 *args_used = 0; 993 *args_used = 0;
991 994
992 if (!argc) 995 if (!argc)
993 return 0; 996 return 0;
994 997
995 if (sscanf(argv[0], "%u", &num_features) != 1) { 998 if (sscanf(argv[0], "%u%c", &num_features, &dummy) != 1) {
996 ti->error = "Invalid number of features"; 999 ti->error = "Invalid number of features";
997 return -EINVAL; 1000 return -EINVAL;
998 } 1001 }
@@ -1036,6 +1039,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1036 unsigned int nr_mirrors, m, args_used; 1039 unsigned int nr_mirrors, m, args_used;
1037 struct mirror_set *ms; 1040 struct mirror_set *ms;
1038 struct dm_dirty_log *dl; 1041 struct dm_dirty_log *dl;
1042 char dummy;
1039 1043
1040 dl = create_dirty_log(ti, argc, argv, &args_used); 1044 dl = create_dirty_log(ti, argc, argv, &args_used);
1041 if (!dl) 1045 if (!dl)
@@ -1044,7 +1048,7 @@ static int mirror_ctr(struct dm_target *ti, unsigned int argc, char **argv)
1044 argv += args_used; 1048 argv += args_used;
1045 argc -= args_used; 1049 argc -= args_used;
1046 1050
1047 if (!argc || sscanf(argv[0], "%u", &nr_mirrors) != 1 || 1051 if (!argc || sscanf(argv[0], "%u%c", &nr_mirrors, &dummy) != 1 ||
1048 nr_mirrors < 2 || nr_mirrors > DM_KCOPYD_MAX_REGIONS + 1) { 1052 nr_mirrors < 2 || nr_mirrors > DM_KCOPYD_MAX_REGIONS + 1) {
1049 ti->error = "Invalid number of mirrors"; 1053 ti->error = "Invalid number of mirrors";
1050 dm_dirty_log_destroy(dl); 1054 dm_dirty_log_destroy(dl);
diff --git a/drivers/md/dm-round-robin.c b/drivers/md/dm-round-robin.c
index 27f1d423b76c..6ab1192cdd5f 100644
--- a/drivers/md/dm-round-robin.c
+++ b/drivers/md/dm-round-robin.c
@@ -114,6 +114,7 @@ static int rr_add_path(struct path_selector *ps, struct dm_path *path,
114 struct selector *s = (struct selector *) ps->context; 114 struct selector *s = (struct selector *) ps->context;
115 struct path_info *pi; 115 struct path_info *pi;
116 unsigned repeat_count = RR_MIN_IO; 116 unsigned repeat_count = RR_MIN_IO;
117 char dummy;
117 118
118 if (argc > 1) { 119 if (argc > 1) {
119 *error = "round-robin ps: incorrect number of arguments"; 120 *error = "round-robin ps: incorrect number of arguments";
@@ -121,7 +122,7 @@ static int rr_add_path(struct path_selector *ps, struct dm_path *path,
121 } 122 }
122 123
123 /* First path argument is number of I/Os before switching path */ 124 /* First path argument is number of I/Os before switching path */
124 if ((argc == 1) && (sscanf(argv[0], "%u", &repeat_count) != 1)) { 125 if ((argc == 1) && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
125 *error = "round-robin ps: invalid repeat count"; 126 *error = "round-robin ps: invalid repeat count";
126 return -EINVAL; 127 return -EINVAL;
127 } 128 }
diff --git a/drivers/md/dm-service-time.c b/drivers/md/dm-service-time.c
index 59883bd78214..9df8f6bd6418 100644
--- a/drivers/md/dm-service-time.c
+++ b/drivers/md/dm-service-time.c
@@ -110,6 +110,7 @@ static int st_add_path(struct path_selector *ps, struct dm_path *path,
110 struct path_info *pi; 110 struct path_info *pi;
111 unsigned repeat_count = ST_MIN_IO; 111 unsigned repeat_count = ST_MIN_IO;
112 unsigned relative_throughput = 1; 112 unsigned relative_throughput = 1;
113 char dummy;
113 114
114 /* 115 /*
115 * Arguments: [<repeat_count> [<relative_throughput>]] 116 * Arguments: [<repeat_count> [<relative_throughput>]]
@@ -128,13 +129,13 @@ static int st_add_path(struct path_selector *ps, struct dm_path *path,
128 return -EINVAL; 129 return -EINVAL;
129 } 130 }
130 131
131 if (argc && (sscanf(argv[0], "%u", &repeat_count) != 1)) { 132 if (argc && (sscanf(argv[0], "%u%c", &repeat_count, &dummy) != 1)) {
132 *error = "service-time ps: invalid repeat count"; 133 *error = "service-time ps: invalid repeat count";
133 return -EINVAL; 134 return -EINVAL;
134 } 135 }
135 136
136 if ((argc == 2) && 137 if ((argc == 2) &&
137 (sscanf(argv[1], "%u", &relative_throughput) != 1 || 138 (sscanf(argv[1], "%u%c", &relative_throughput, &dummy) != 1 ||
138 relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) { 139 relative_throughput > ST_MAX_RELATIVE_THROUGHPUT)) {
139 *error = "service-time ps: invalid relative_throughput value"; 140 *error = "service-time ps: invalid relative_throughput value";
140 return -EINVAL; 141 return -EINVAL;
diff --git a/drivers/md/dm-stripe.c b/drivers/md/dm-stripe.c
index 3d80cf0c152d..35c94ff24ad5 100644
--- a/drivers/md/dm-stripe.c
+++ b/drivers/md/dm-stripe.c
@@ -75,8 +75,9 @@ static int get_stripe(struct dm_target *ti, struct stripe_c *sc,
75 unsigned int stripe, char **argv) 75 unsigned int stripe, char **argv)
76{ 76{
77 unsigned long long start; 77 unsigned long long start;
78 char dummy;
78 79
79 if (sscanf(argv[1], "%llu", &start) != 1) 80 if (sscanf(argv[1], "%llu%c", &start, &dummy) != 1)
80 return -EINVAL; 81 return -EINVAL;
81 82
82 if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), 83 if (dm_get_device(ti, argv[0], dm_table_get_mode(ti->table),
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index 63cc54289aff..2e227fbf1622 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -268,8 +268,7 @@ void dm_table_destroy(struct dm_table *t)
268 vfree(t->highs); 268 vfree(t->highs);
269 269
270 /* free the device list */ 270 /* free the device list */
271 if (t->devices.next != &t->devices) 271 free_devices(&t->devices);
272 free_devices(&t->devices);
273 272
274 dm_free_md_mempools(t->mempools); 273 dm_free_md_mempools(t->mempools);
275 274
@@ -464,10 +463,11 @@ int dm_get_device(struct dm_target *ti, const char *path, fmode_t mode,
464 struct dm_dev_internal *dd; 463 struct dm_dev_internal *dd;
465 unsigned int major, minor; 464 unsigned int major, minor;
466 struct dm_table *t = ti->table; 465 struct dm_table *t = ti->table;
466 char dummy;
467 467
468 BUG_ON(!t); 468 BUG_ON(!t);
469 469
470 if (sscanf(path, "%u:%u", &major, &minor) == 2) { 470 if (sscanf(path, "%u:%u%c", &major, &minor, &dummy) == 2) {
471 /* Extract the major/minor numbers */ 471 /* Extract the major/minor numbers */
472 dev = MKDEV(major, minor); 472 dev = MKDEV(major, minor);
473 if (MAJOR(dev) != major || MINOR(dev) != minor) 473 if (MAJOR(dev) != major || MINOR(dev) != minor)
@@ -842,9 +842,10 @@ static int validate_next_arg(struct dm_arg *arg, struct dm_arg_set *arg_set,
842 unsigned *value, char **error, unsigned grouped) 842 unsigned *value, char **error, unsigned grouped)
843{ 843{
844 const char *arg_str = dm_shift_arg(arg_set); 844 const char *arg_str = dm_shift_arg(arg_set);
845 char dummy;
845 846
846 if (!arg_str || 847 if (!arg_str ||
847 (sscanf(arg_str, "%u", value) != 1) || 848 (sscanf(arg_str, "%u%c", value, &dummy) != 1) ||
848 (*value < arg->min) || 849 (*value < arg->min) ||
849 (*value > arg->max) || 850 (*value > arg->max) ||
850 (grouped && arg_set->argc < *value)) { 851 (grouped && arg_set->argc < *value)) {
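dm_get_device() uses a variant of the same idiom: demanding exactly two conversions from "%u:%u%c" means a path is treated as a <major>:<minor> pair only when nothing trails the minor number; anything else falls through to the normal device path lookup. A small userspace check of the return values; looks_like_major_minor() is a made-up name for illustration.

#include <stdio.h>

static int looks_like_major_minor(const char *path,
				  unsigned *major, unsigned *minor)
{
	char dummy;

	/* Exactly two conversions: both numbers matched, no trailing junk. */
	return sscanf(path, "%u:%u%c", major, minor, &dummy) == 2;
}

int main(void)
{
	unsigned ma, mi;

	printf("\"8:16\"     -> %d\n", looks_like_major_minor("8:16", &ma, &mi));	/* 1 */
	printf("\"8:16x\"    -> %d\n", looks_like_major_minor("8:16x", &ma, &mi));	/* 0 */
	printf("\"/dev/sdb\" -> %d\n", looks_like_major_minor("/dev/sdb", &ma, &mi));	/* 0 */
	return 0;
}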
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 59c4f0446ffa..737d38865b69 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -385,6 +385,7 @@ static int init_pmd(struct dm_pool_metadata *pmd,
385 data_sm = dm_sm_disk_create(tm, nr_blocks); 385 data_sm = dm_sm_disk_create(tm, nr_blocks);
386 if (IS_ERR(data_sm)) { 386 if (IS_ERR(data_sm)) {
387 DMERR("sm_disk_create failed"); 387 DMERR("sm_disk_create failed");
388 dm_tm_unlock(tm, sblock);
388 r = PTR_ERR(data_sm); 389 r = PTR_ERR(data_sm);
389 goto bad; 390 goto bad;
390 } 391 }
@@ -613,7 +614,7 @@ static int __commit_transaction(struct dm_pool_metadata *pmd)
613 if (r < 0) 614 if (r < 0)
614 goto out; 615 goto out;
615 616
616 r = dm_sm_root_size(pmd->metadata_sm, &data_len); 617 r = dm_sm_root_size(pmd->data_sm, &data_len);
617 if (r < 0) 618 if (r < 0)
618 goto out; 619 goto out;
619 620
@@ -712,6 +713,9 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
712 if (r) 713 if (r)
713 goto bad; 714 goto bad;
714 715
716 if (bdev_size > THIN_METADATA_MAX_SECTORS)
717 bdev_size = THIN_METADATA_MAX_SECTORS;
718
715 disk_super = dm_block_data(sblock); 719 disk_super = dm_block_data(sblock);
716 disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC); 720 disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC);
717 disk_super->version = cpu_to_le32(THIN_VERSION); 721 disk_super->version = cpu_to_le32(THIN_VERSION);
@@ -789,6 +793,11 @@ int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
789 return 0; 793 return 0;
790} 794}
791 795
796/*
797 * __open_device: Returns @td corresponding to device with id @dev,
798 * creating it if @create is set and incrementing @td->open_count.
799 * On failure, @td is undefined.
800 */
792static int __open_device(struct dm_pool_metadata *pmd, 801static int __open_device(struct dm_pool_metadata *pmd,
793 dm_thin_id dev, int create, 802 dm_thin_id dev, int create,
794 struct dm_thin_device **td) 803 struct dm_thin_device **td)
@@ -799,10 +808,16 @@ static int __open_device(struct dm_pool_metadata *pmd,
799 struct disk_device_details details_le; 808 struct disk_device_details details_le;
800 809
801 /* 810 /*
802 * Check the device isn't already open. 811 * If the device is already open, return it.
803 */ 812 */
804 list_for_each_entry(td2, &pmd->thin_devices, list) 813 list_for_each_entry(td2, &pmd->thin_devices, list)
805 if (td2->id == dev) { 814 if (td2->id == dev) {
815 /*
816 * May not create an already-open device.
817 */
818 if (create)
819 return -EEXIST;
820
806 td2->open_count++; 821 td2->open_count++;
807 *td = td2; 822 *td = td2;
808 return 0; 823 return 0;
@@ -817,6 +832,9 @@ static int __open_device(struct dm_pool_metadata *pmd,
817 if (r != -ENODATA || !create) 832 if (r != -ENODATA || !create)
818 return r; 833 return r;
819 834
835 /*
836 * Create new device.
837 */
820 changed = 1; 838 changed = 1;
821 details_le.mapped_blocks = 0; 839 details_le.mapped_blocks = 0;
822 details_le.transaction_id = cpu_to_le64(pmd->trans_id); 840 details_le.transaction_id = cpu_to_le64(pmd->trans_id);
@@ -882,12 +900,10 @@ static int __create_thin(struct dm_pool_metadata *pmd,
882 900
883 r = __open_device(pmd, dev, 1, &td); 901 r = __open_device(pmd, dev, 1, &td);
884 if (r) { 902 if (r) {
885 __close_device(td);
886 dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root); 903 dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
887 dm_btree_del(&pmd->bl_info, dev_root); 904 dm_btree_del(&pmd->bl_info, dev_root);
888 return r; 905 return r;
889 } 906 }
890 td->changed = 1;
891 __close_device(td); 907 __close_device(td);
892 908
893 return r; 909 return r;
@@ -967,14 +983,14 @@ static int __create_snap(struct dm_pool_metadata *pmd,
967 goto bad; 983 goto bad;
968 984
969 r = __set_snapshot_details(pmd, td, origin, pmd->time); 985 r = __set_snapshot_details(pmd, td, origin, pmd->time);
986 __close_device(td);
987
970 if (r) 988 if (r)
971 goto bad; 989 goto bad;
972 990
973 __close_device(td);
974 return 0; 991 return 0;
975 992
976bad: 993bad:
977 __close_device(td);
978 dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root); 994 dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
979 dm_btree_remove(&pmd->details_info, pmd->details_root, 995 dm_btree_remove(&pmd->details_info, pmd->details_root,
980 &key, &pmd->details_root); 996 &key, &pmd->details_root);
@@ -1211,6 +1227,8 @@ static int __remove(struct dm_thin_device *td, dm_block_t block)
1211 if (r) 1227 if (r)
1212 return r; 1228 return r;
1213 1229
1230 td->mapped_blocks--;
1231 td->changed = 1;
1214 pmd->need_commit = 1; 1232 pmd->need_commit = 1;
1215 1233
1216 return 0; 1234 return 0;
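The new comment above __open_device() pins down its contract: opening an already-open device just bumps open_count, asking to create a device that is already open fails with -EEXIST, and a genuinely new device is created only when create is set. A toy userspace analogue of that contract, using an assumed fixed-size table instead of the pool metadata's btrees; open_device() and struct device_entry are invented for the illustration.

#include <errno.h>
#include <stdio.h>

struct device_entry {
	unsigned id;
	int open_count;
	int in_use;
};

static int open_device(struct device_entry *table, int n,
		       unsigned id, int create, struct device_entry **out)
{
	int i, free_slot = -1;

	for (i = 0; i < n; i++) {
		if (table[i].in_use && table[i].id == id) {
			if (create)
				return -EEXIST;	/* may not create an open device */
			table[i].open_count++;
			*out = &table[i];
			return 0;
		}
		if (!table[i].in_use && free_slot < 0)
			free_slot = i;
	}

	if (!create)
		return -ENODATA;	/* unknown device */
	if (free_slot < 0)
		return -ENOSPC;

	/* Create a new entry. */
	table[free_slot].in_use = 1;
	table[free_slot].id = id;
	table[free_slot].open_count = 1;
	*out = &table[free_slot];
	return 0;
}

int main(void)
{
	struct device_entry table[4] = { { 0 } };
	struct device_entry *td;

	printf("create 7 : %d\n", open_device(table, 4, 7, 1, &td));	/* 0 */
	printf("create 7 : %d\n", open_device(table, 4, 7, 1, &td));	/* -EEXIST */
	printf("open 7   : %d\n", open_device(table, 4, 7, 0, &td));	/* 0, open_count now 2 */
	printf("open 9   : %d\n", open_device(table, 4, 9, 0, &td));	/* -ENODATA */
	return 0;
}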
diff --git a/drivers/md/dm-thin-metadata.h b/drivers/md/dm-thin-metadata.h
index 859c16896877..ed4725e67c96 100644
--- a/drivers/md/dm-thin-metadata.h
+++ b/drivers/md/dm-thin-metadata.h
@@ -11,6 +11,19 @@
11 11
12#define THIN_METADATA_BLOCK_SIZE 4096 12#define THIN_METADATA_BLOCK_SIZE 4096
13 13
14/*
15 * The metadata device is currently limited in size.
16 *
17 * We have one block of index, which can hold 255 index entries. Each
18 * index entry contains allocation info about 16k metadata blocks.
19 */
20#define THIN_METADATA_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT)))
21
22/*
23 * A metadata device larger than 16GB triggers a warning.
24 */
25#define THIN_METADATA_MAX_SECTORS_WARNING (16 * (1024 * 1024 * 1024 >> SECTOR_SHIFT))
26
14/*----------------------------------------------------------------*/ 27/*----------------------------------------------------------------*/
15 28
16struct dm_pool_metadata; 29struct dm_pool_metadata;
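The limit moved into this header works out as follows, assuming 512-byte sectors: one index block holds 255 entries, each entry covers 1 << 14 = 16384 metadata blocks, and each 4096-byte metadata block is 8 sectors, so 255 * 16384 * 8 = 33,423,360 sectors, a little under 16 GiB. The separate THIN_METADATA_MAX_SECTORS_WARNING constant (exactly 16 GiB) flags metadata devices whose extra space can never be used. A trivial program reproducing the arithmetic:

#include <stdio.h>

int main(void)
{
	const unsigned long long block_size = 4096;	/* THIN_METADATA_BLOCK_SIZE */
	const unsigned long long sectors_per_block = block_size / 512;
	const unsigned long long max_sectors =
		255ULL * (1 << 14) * sectors_per_block;

	printf("max metadata sectors: %llu (%.2f GiB)\n",
	       max_sectors, max_sectors * 512.0 / (1ULL << 30));
	return 0;
}

Output: "max metadata sectors: 33423360 (15.94 GiB)".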
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index c3087575fef0..213ae32a0fc4 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -23,6 +23,7 @@
23#define DEFERRED_SET_SIZE 64 23#define DEFERRED_SET_SIZE 64
24#define MAPPING_POOL_SIZE 1024 24#define MAPPING_POOL_SIZE 1024
25#define PRISON_CELLS 1024 25#define PRISON_CELLS 1024
26#define COMMIT_PERIOD HZ
26 27
27/* 28/*
28 * The block size of the device holding pool data must be 29 * The block size of the device holding pool data must be
@@ -32,16 +33,6 @@
32#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT) 33#define DATA_DEV_BLOCK_SIZE_MAX_SECTORS (1024 * 1024 * 1024 >> SECTOR_SHIFT)
33 34
34/* 35/*
35 * The metadata device is currently limited in size. The limitation is
36 * checked lower down in dm-space-map-metadata, but we also check it here
37 * so we can fail early.
38 *
39 * We have one block of index, which can hold 255 index entries. Each
40 * index entry contains allocation info about 16k metadata blocks.
41 */
42#define METADATA_DEV_MAX_SECTORS (255 * (1 << 14) * (THIN_METADATA_BLOCK_SIZE / (1 << SECTOR_SHIFT)))
43
44/*
45 * Device id is restricted to 24 bits. 36 * Device id is restricted to 24 bits.
46 */ 37 */
47#define MAX_DEV_ID ((1 << 24) - 1) 38#define MAX_DEV_ID ((1 << 24) - 1)
@@ -72,7 +63,7 @@
72 * missed out if the io covers the block. (schedule_copy). 63 * missed out if the io covers the block. (schedule_copy).
73 * 64 *
74 * iv) insert the new mapping into the origin's btree 65 * iv) insert the new mapping into the origin's btree
75 * (process_prepared_mappings). This act of inserting breaks some 66 * (process_prepared_mapping). This act of inserting breaks some
76 * sharing of btree nodes between the two devices. Breaking sharing only 67 * sharing of btree nodes between the two devices. Breaking sharing only
77 * effects the btree of that specific device. Btrees for the other 68 * effects the btree of that specific device. Btrees for the other
78 * devices that share the block never change. The btree for the origin 69 * devices that share the block never change. The btree for the origin
@@ -124,7 +115,7 @@ struct cell {
124 struct hlist_node list; 115 struct hlist_node list;
125 struct bio_prison *prison; 116 struct bio_prison *prison;
126 struct cell_key key; 117 struct cell_key key;
127 unsigned count; 118 struct bio *holder;
128 struct bio_list bios; 119 struct bio_list bios;
129}; 120};
130 121
@@ -220,54 +211,59 @@ static struct cell *__search_bucket(struct hlist_head *bucket,
220 * This may block if a new cell needs allocating. You must ensure that 211 * This may block if a new cell needs allocating. You must ensure that
221 * cells will be unlocked even if the calling thread is blocked. 212 * cells will be unlocked even if the calling thread is blocked.
222 * 213 *
223 * Returns the number of entries in the cell prior to the new addition 214 * Returns 1 if the cell was already held, 0 if @inmate is the new holder.
224 * or < 0 on failure.
225 */ 215 */
226static int bio_detain(struct bio_prison *prison, struct cell_key *key, 216static int bio_detain(struct bio_prison *prison, struct cell_key *key,
227 struct bio *inmate, struct cell **ref) 217 struct bio *inmate, struct cell **ref)
228{ 218{
229 int r; 219 int r = 1;
230 unsigned long flags; 220 unsigned long flags;
231 uint32_t hash = hash_key(prison, key); 221 uint32_t hash = hash_key(prison, key);
232 struct cell *uninitialized_var(cell), *cell2 = NULL; 222 struct cell *cell, *cell2;
233 223
234 BUG_ON(hash > prison->nr_buckets); 224 BUG_ON(hash > prison->nr_buckets);
235 225
236 spin_lock_irqsave(&prison->lock, flags); 226 spin_lock_irqsave(&prison->lock, flags);
227
237 cell = __search_bucket(prison->cells + hash, key); 228 cell = __search_bucket(prison->cells + hash, key);
229 if (cell) {
230 bio_list_add(&cell->bios, inmate);
231 goto out;
232 }
238 233
239 if (!cell) { 234 /*
240 /* 235 * Allocate a new cell
241 * Allocate a new cell 236 */
242 */ 237 spin_unlock_irqrestore(&prison->lock, flags);
243 spin_unlock_irqrestore(&prison->lock, flags); 238 cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO);
244 cell2 = mempool_alloc(prison->cell_pool, GFP_NOIO); 239 spin_lock_irqsave(&prison->lock, flags);
245 spin_lock_irqsave(&prison->lock, flags);
246 240
247 /* 241 /*
248 * We've been unlocked, so we have to double check that 242 * We've been unlocked, so we have to double check that
249 * nobody else has inserted this cell in the meantime. 243 * nobody else has inserted this cell in the meantime.
250 */ 244 */
251 cell = __search_bucket(prison->cells + hash, key); 245 cell = __search_bucket(prison->cells + hash, key);
246 if (cell) {
247 mempool_free(cell2, prison->cell_pool);
248 bio_list_add(&cell->bios, inmate);
249 goto out;
250 }
252 251
253 if (!cell) { 252 /*
254 cell = cell2; 253 * Use new cell.
255 cell2 = NULL; 254 */
255 cell = cell2;
256 256
257 cell->prison = prison; 257 cell->prison = prison;
258 memcpy(&cell->key, key, sizeof(cell->key)); 258 memcpy(&cell->key, key, sizeof(cell->key));
259 cell->count = 0; 259 cell->holder = inmate;
260 bio_list_init(&cell->bios); 260 bio_list_init(&cell->bios);
261 hlist_add_head(&cell->list, prison->cells + hash); 261 hlist_add_head(&cell->list, prison->cells + hash);
262 }
263 }
264 262
265 r = cell->count++; 263 r = 0;
266 bio_list_add(&cell->bios, inmate);
267 spin_unlock_irqrestore(&prison->lock, flags);
268 264
269 if (cell2) 265out:
270 mempool_free(cell2, prison->cell_pool); 266 spin_unlock_irqrestore(&prison->lock, flags);
271 267
272 *ref = cell; 268 *ref = cell;
273 269
@@ -283,8 +279,8 @@ static void __cell_release(struct cell *cell, struct bio_list *inmates)
283 279
284 hlist_del(&cell->list); 280 hlist_del(&cell->list);
285 281
286 if (inmates) 282 bio_list_add(inmates, cell->holder);
287 bio_list_merge(inmates, &cell->bios); 283 bio_list_merge(inmates, &cell->bios);
288 284
289 mempool_free(cell, prison->cell_pool); 285 mempool_free(cell, prison->cell_pool);
290} 286}
@@ -305,22 +301,44 @@ static void cell_release(struct cell *cell, struct bio_list *bios)
305 * bio may be in the cell. This function releases the cell, and also does 301 * bio may be in the cell. This function releases the cell, and also does
306 * a sanity check. 302 * a sanity check.
307 */ 303 */
304static void __cell_release_singleton(struct cell *cell, struct bio *bio)
305{
306 hlist_del(&cell->list);
307 BUG_ON(cell->holder != bio);
308 BUG_ON(!bio_list_empty(&cell->bios));
309}
310
308static void cell_release_singleton(struct cell *cell, struct bio *bio) 311static void cell_release_singleton(struct cell *cell, struct bio *bio)
309{ 312{
310 struct bio_prison *prison = cell->prison;
311 struct bio_list bios;
312 struct bio *b;
313 unsigned long flags; 313 unsigned long flags;
314 314 struct bio_prison *prison = cell->prison;
315 bio_list_init(&bios);
316 315
317 spin_lock_irqsave(&prison->lock, flags); 316 spin_lock_irqsave(&prison->lock, flags);
318 __cell_release(cell, &bios); 317 __cell_release_singleton(cell, bio);
319 spin_unlock_irqrestore(&prison->lock, flags); 318 spin_unlock_irqrestore(&prison->lock, flags);
319}
320
321/*
322 * Sometimes we don't want the holder, just the additional bios.
323 */
324static void __cell_release_no_holder(struct cell *cell, struct bio_list *inmates)
325{
326 struct bio_prison *prison = cell->prison;
327
328 hlist_del(&cell->list);
329 bio_list_merge(inmates, &cell->bios);
320 330
321 b = bio_list_pop(&bios); 331 mempool_free(cell, prison->cell_pool);
322 BUG_ON(b != bio); 332}
323 BUG_ON(!bio_list_empty(&bios)); 333
334static void cell_release_no_holder(struct cell *cell, struct bio_list *inmates)
335{
336 unsigned long flags;
337 struct bio_prison *prison = cell->prison;
338
339 spin_lock_irqsave(&prison->lock, flags);
340 __cell_release_no_holder(cell, inmates);
341 spin_unlock_irqrestore(&prison->lock, flags);
324} 342}
325 343
326static void cell_error(struct cell *cell) 344static void cell_error(struct cell *cell)
@@ -471,6 +489,13 @@ static void build_virtual_key(struct dm_thin_device *td, dm_block_t b,
471 * devices. 489 * devices.
472 */ 490 */
473struct new_mapping; 491struct new_mapping;
492
493struct pool_features {
494 unsigned zero_new_blocks:1;
495 unsigned discard_enabled:1;
496 unsigned discard_passdown:1;
497};
498
474struct pool { 499struct pool {
475 struct list_head list; 500 struct list_head list;
476 struct dm_target *ti; /* Only set if a pool target is bound */ 501 struct dm_target *ti; /* Only set if a pool target is bound */
@@ -484,7 +509,7 @@ struct pool {
484 dm_block_t offset_mask; 509 dm_block_t offset_mask;
485 dm_block_t low_water_blocks; 510 dm_block_t low_water_blocks;
486 511
487 unsigned zero_new_blocks:1; 512 struct pool_features pf;
488 unsigned low_water_triggered:1; /* A dm event has been sent */ 513 unsigned low_water_triggered:1; /* A dm event has been sent */
489 unsigned no_free_space:1; /* A -ENOSPC warning has been issued */ 514 unsigned no_free_space:1; /* A -ENOSPC warning has been issued */
490 515
@@ -493,17 +518,21 @@ struct pool {
493 518
494 struct workqueue_struct *wq; 519 struct workqueue_struct *wq;
495 struct work_struct worker; 520 struct work_struct worker;
521 struct delayed_work waker;
496 522
497 unsigned ref_count; 523 unsigned ref_count;
524 unsigned long last_commit_jiffies;
498 525
499 spinlock_t lock; 526 spinlock_t lock;
500 struct bio_list deferred_bios; 527 struct bio_list deferred_bios;
501 struct bio_list deferred_flush_bios; 528 struct bio_list deferred_flush_bios;
502 struct list_head prepared_mappings; 529 struct list_head prepared_mappings;
530 struct list_head prepared_discards;
503 531
504 struct bio_list retry_on_resume_list; 532 struct bio_list retry_on_resume_list;
505 533
506 struct deferred_set ds; /* FIXME: move to thin_c */ 534 struct deferred_set shared_read_ds;
535 struct deferred_set all_io_ds;
507 536
508 struct new_mapping *next_mapping; 537 struct new_mapping *next_mapping;
509 mempool_t *mapping_pool; 538 mempool_t *mapping_pool;
@@ -521,7 +550,7 @@ struct pool_c {
521 struct dm_target_callbacks callbacks; 550 struct dm_target_callbacks callbacks;
522 551
523 dm_block_t low_water_blocks; 552 dm_block_t low_water_blocks;
524 unsigned zero_new_blocks:1; 553 struct pool_features pf;
525}; 554};
526 555
527/* 556/*
@@ -529,6 +558,7 @@ struct pool_c {
529 */ 558 */
530struct thin_c { 559struct thin_c {
531 struct dm_dev *pool_dev; 560 struct dm_dev *pool_dev;
561 struct dm_dev *origin_dev;
532 dm_thin_id dev_id; 562 dm_thin_id dev_id;
533 563
534 struct pool *pool; 564 struct pool *pool;
@@ -597,6 +627,13 @@ static struct pool *__pool_table_lookup_metadata_dev(struct block_device *md_dev
597 627
598/*----------------------------------------------------------------*/ 628/*----------------------------------------------------------------*/
599 629
630struct endio_hook {
631 struct thin_c *tc;
632 struct deferred_entry *shared_read_entry;
633 struct deferred_entry *all_io_entry;
634 struct new_mapping *overwrite_mapping;
635};
636
600static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master) 637static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
601{ 638{
602 struct bio *bio; 639 struct bio *bio;
@@ -607,7 +644,8 @@ static void __requeue_bio_list(struct thin_c *tc, struct bio_list *master)
607 bio_list_init(master); 644 bio_list_init(master);
608 645
609 while ((bio = bio_list_pop(&bios))) { 646 while ((bio = bio_list_pop(&bios))) {
610 if (dm_get_mapinfo(bio)->ptr == tc) 647 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
648 if (h->tc == tc)
611 bio_endio(bio, DM_ENDIO_REQUEUE); 649 bio_endio(bio, DM_ENDIO_REQUEUE);
612 else 650 else
613 bio_list_add(master, bio); 651 bio_list_add(master, bio);
@@ -646,14 +684,16 @@ static void remap(struct thin_c *tc, struct bio *bio, dm_block_t block)
646 (bio->bi_sector & pool->offset_mask); 684 (bio->bi_sector & pool->offset_mask);
647} 685}
648 686
649static void remap_and_issue(struct thin_c *tc, struct bio *bio, 687static void remap_to_origin(struct thin_c *tc, struct bio *bio)
650 dm_block_t block) 688{
689 bio->bi_bdev = tc->origin_dev->bdev;
690}
691
692static void issue(struct thin_c *tc, struct bio *bio)
651{ 693{
652 struct pool *pool = tc->pool; 694 struct pool *pool = tc->pool;
653 unsigned long flags; 695 unsigned long flags;
654 696
655 remap(tc, bio, block);
656
657 /* 697 /*
658 * Batch together any FUA/FLUSH bios we find and then issue 698 * Batch together any FUA/FLUSH bios we find and then issue
659 * a single commit for them in process_deferred_bios(). 699 * a single commit for them in process_deferred_bios().
@@ -666,6 +706,19 @@ static void remap_and_issue(struct thin_c *tc, struct bio *bio,
666 generic_make_request(bio); 706 generic_make_request(bio);
667} 707}
668 708
709static void remap_to_origin_and_issue(struct thin_c *tc, struct bio *bio)
710{
711 remap_to_origin(tc, bio);
712 issue(tc, bio);
713}
714
715static void remap_and_issue(struct thin_c *tc, struct bio *bio,
716 dm_block_t block)
717{
718 remap(tc, bio, block);
719 issue(tc, bio);
720}
721
669/* 722/*
670 * wake_worker() is used when new work is queued and when pool_resume is 723 * wake_worker() is used when new work is queued and when pool_resume is
671 * ready to continue deferred IO processing. 724 * ready to continue deferred IO processing.
@@ -680,21 +733,17 @@ static void wake_worker(struct pool *pool)
680/* 733/*
681 * Bio endio functions. 734 * Bio endio functions.
682 */ 735 */
683struct endio_hook {
684 struct thin_c *tc;
685 bio_end_io_t *saved_bi_end_io;
686 struct deferred_entry *entry;
687};
688
689struct new_mapping { 736struct new_mapping {
690 struct list_head list; 737 struct list_head list;
691 738
692 int prepared; 739 unsigned quiesced:1;
740 unsigned prepared:1;
741 unsigned pass_discard:1;
693 742
694 struct thin_c *tc; 743 struct thin_c *tc;
695 dm_block_t virt_block; 744 dm_block_t virt_block;
696 dm_block_t data_block; 745 dm_block_t data_block;
697 struct cell *cell; 746 struct cell *cell, *cell2;
698 int err; 747 int err;
699 748
700 /* 749 /*
@@ -711,7 +760,7 @@ static void __maybe_add_mapping(struct new_mapping *m)
711{ 760{
712 struct pool *pool = m->tc->pool; 761 struct pool *pool = m->tc->pool;
713 762
714 if (list_empty(&m->list) && m->prepared) { 763 if (m->quiesced && m->prepared) {
715 list_add(&m->list, &pool->prepared_mappings); 764 list_add(&m->list, &pool->prepared_mappings);
716 wake_worker(pool); 765 wake_worker(pool);
717 } 766 }
@@ -734,7 +783,8 @@ static void copy_complete(int read_err, unsigned long write_err, void *context)
734static void overwrite_endio(struct bio *bio, int err) 783static void overwrite_endio(struct bio *bio, int err)
735{ 784{
736 unsigned long flags; 785 unsigned long flags;
737 struct new_mapping *m = dm_get_mapinfo(bio)->ptr; 786 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
787 struct new_mapping *m = h->overwrite_mapping;
738 struct pool *pool = m->tc->pool; 788 struct pool *pool = m->tc->pool;
739 789
740 m->err = err; 790 m->err = err;
@@ -745,31 +795,6 @@ static void overwrite_endio(struct bio *bio, int err)
745 spin_unlock_irqrestore(&pool->lock, flags); 795 spin_unlock_irqrestore(&pool->lock, flags);
746} 796}
747 797
748static void shared_read_endio(struct bio *bio, int err)
749{
750 struct list_head mappings;
751 struct new_mapping *m, *tmp;
752 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
753 unsigned long flags;
754 struct pool *pool = h->tc->pool;
755
756 bio->bi_end_io = h->saved_bi_end_io;
757 bio_endio(bio, err);
758
759 INIT_LIST_HEAD(&mappings);
760 ds_dec(h->entry, &mappings);
761
762 spin_lock_irqsave(&pool->lock, flags);
763 list_for_each_entry_safe(m, tmp, &mappings, list) {
764 list_del(&m->list);
765 INIT_LIST_HEAD(&m->list);
766 __maybe_add_mapping(m);
767 }
768 spin_unlock_irqrestore(&pool->lock, flags);
769
770 mempool_free(h, pool->endio_hook_pool);
771}
772
773/*----------------------------------------------------------------*/ 798/*----------------------------------------------------------------*/
774 799
775/* 800/*
@@ -800,21 +825,16 @@ static void cell_defer(struct thin_c *tc, struct cell *cell,
800 * Same as cell_defer above, except it omits one particular detainee, 825 * Same as cell_defer above, except it omits one particular detainee,
801 * a write bio that covers the block and has already been processed. 826 * a write bio that covers the block and has already been processed.
802 */ 827 */
803static void cell_defer_except(struct thin_c *tc, struct cell *cell, 828static void cell_defer_except(struct thin_c *tc, struct cell *cell)
804 struct bio *exception)
805{ 829{
806 struct bio_list bios; 830 struct bio_list bios;
807 struct bio *bio;
808 struct pool *pool = tc->pool; 831 struct pool *pool = tc->pool;
809 unsigned long flags; 832 unsigned long flags;
810 833
811 bio_list_init(&bios); 834 bio_list_init(&bios);
812 cell_release(cell, &bios);
813 835
814 spin_lock_irqsave(&pool->lock, flags); 836 spin_lock_irqsave(&pool->lock, flags);
815 while ((bio = bio_list_pop(&bios))) 837 cell_release_no_holder(cell, &pool->deferred_bios);
816 if (bio != exception)
817 bio_list_add(&pool->deferred_bios, bio);
818 spin_unlock_irqrestore(&pool->lock, flags); 838 spin_unlock_irqrestore(&pool->lock, flags);
819 839
820 wake_worker(pool); 840 wake_worker(pool);
@@ -854,7 +874,7 @@ static void process_prepared_mapping(struct new_mapping *m)
854 * the bios in the cell. 874 * the bios in the cell.
855 */ 875 */
856 if (bio) { 876 if (bio) {
857 cell_defer_except(tc, m->cell, bio); 877 cell_defer_except(tc, m->cell);
858 bio_endio(bio, 0); 878 bio_endio(bio, 0);
859 } else 879 } else
860 cell_defer(tc, m->cell, m->data_block); 880 cell_defer(tc, m->cell, m->data_block);
@@ -863,7 +883,30 @@ static void process_prepared_mapping(struct new_mapping *m)
863 mempool_free(m, tc->pool->mapping_pool); 883 mempool_free(m, tc->pool->mapping_pool);
864} 884}
865 885
866static void process_prepared_mappings(struct pool *pool) 886static void process_prepared_discard(struct new_mapping *m)
887{
888 int r;
889 struct thin_c *tc = m->tc;
890
891 r = dm_thin_remove_block(tc->td, m->virt_block);
892 if (r)
893 DMERR("dm_thin_remove_block() failed");
894
895 /*
896 * Pass the discard down to the underlying device?
897 */
898 if (m->pass_discard)
899 remap_and_issue(tc, m->bio, m->data_block);
900 else
901 bio_endio(m->bio, 0);
902
903 cell_defer_except(tc, m->cell);
904 cell_defer_except(tc, m->cell2);
905 mempool_free(m, tc->pool->mapping_pool);
906}
907
908static void process_prepared(struct pool *pool, struct list_head *head,
909 void (*fn)(struct new_mapping *))
867{ 910{
868 unsigned long flags; 911 unsigned long flags;
869 struct list_head maps; 912 struct list_head maps;
@@ -871,21 +914,27 @@ static void process_prepared_mappings(struct pool *pool)
871 914
872 INIT_LIST_HEAD(&maps); 915 INIT_LIST_HEAD(&maps);
873 spin_lock_irqsave(&pool->lock, flags); 916 spin_lock_irqsave(&pool->lock, flags);
874 list_splice_init(&pool->prepared_mappings, &maps); 917 list_splice_init(head, &maps);
875 spin_unlock_irqrestore(&pool->lock, flags); 918 spin_unlock_irqrestore(&pool->lock, flags);
876 919
877 list_for_each_entry_safe(m, tmp, &maps, list) 920 list_for_each_entry_safe(m, tmp, &maps, list)
878 process_prepared_mapping(m); 921 fn(m);
879} 922}
880 923
881/* 924/*
882 * Deferred bio jobs. 925 * Deferred bio jobs.
883 */ 926 */
884static int io_overwrites_block(struct pool *pool, struct bio *bio) 927static int io_overlaps_block(struct pool *pool, struct bio *bio)
885{ 928{
886 return ((bio_data_dir(bio) == WRITE) && 929 return !(bio->bi_sector & pool->offset_mask) &&
887 !(bio->bi_sector & pool->offset_mask)) &&
888 (bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT)); 930 (bio->bi_size == (pool->sectors_per_block << SECTOR_SHIFT));
931
932}
933
934static int io_overwrites_block(struct pool *pool, struct bio *bio)
935{
936 return (bio_data_dir(bio) == WRITE) &&
937 io_overlaps_block(pool, bio);
889} 938}
890 939
891static void save_and_set_endio(struct bio *bio, bio_end_io_t **save, 940static void save_and_set_endio(struct bio *bio, bio_end_io_t **save,
@@ -917,7 +966,8 @@ static struct new_mapping *get_next_mapping(struct pool *pool)
917} 966}
918 967
919static void schedule_copy(struct thin_c *tc, dm_block_t virt_block, 968static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
920 dm_block_t data_origin, dm_block_t data_dest, 969 struct dm_dev *origin, dm_block_t data_origin,
970 dm_block_t data_dest,
921 struct cell *cell, struct bio *bio) 971 struct cell *cell, struct bio *bio)
922{ 972{
923 int r; 973 int r;
@@ -925,6 +975,7 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
925 struct new_mapping *m = get_next_mapping(pool); 975 struct new_mapping *m = get_next_mapping(pool);
926 976
927 INIT_LIST_HEAD(&m->list); 977 INIT_LIST_HEAD(&m->list);
978 m->quiesced = 0;
928 m->prepared = 0; 979 m->prepared = 0;
929 m->tc = tc; 980 m->tc = tc;
930 m->virt_block = virt_block; 981 m->virt_block = virt_block;
@@ -933,7 +984,8 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
933 m->err = 0; 984 m->err = 0;
934 m->bio = NULL; 985 m->bio = NULL;
935 986
936 ds_add_work(&pool->ds, &m->list); 987 if (!ds_add_work(&pool->shared_read_ds, &m->list))
988 m->quiesced = 1;
937 989
938 /* 990 /*
939 * IO to pool_dev remaps to the pool target's data_dev. 991 * IO to pool_dev remaps to the pool target's data_dev.
@@ -942,14 +994,15 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
942 * bio immediately. Otherwise we use kcopyd to clone the data first. 994 * bio immediately. Otherwise we use kcopyd to clone the data first.
943 */ 995 */
944 if (io_overwrites_block(pool, bio)) { 996 if (io_overwrites_block(pool, bio)) {
997 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
998 h->overwrite_mapping = m;
945 m->bio = bio; 999 m->bio = bio;
946 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); 1000 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
947 dm_get_mapinfo(bio)->ptr = m;
948 remap_and_issue(tc, bio, data_dest); 1001 remap_and_issue(tc, bio, data_dest);
949 } else { 1002 } else {
950 struct dm_io_region from, to; 1003 struct dm_io_region from, to;
951 1004
952 from.bdev = tc->pool_dev->bdev; 1005 from.bdev = origin->bdev;
953 from.sector = data_origin * pool->sectors_per_block; 1006 from.sector = data_origin * pool->sectors_per_block;
954 from.count = pool->sectors_per_block; 1007 from.count = pool->sectors_per_block;
955 1008
@@ -967,6 +1020,22 @@ static void schedule_copy(struct thin_c *tc, dm_block_t virt_block,
967 } 1020 }
968} 1021}
969 1022
1023static void schedule_internal_copy(struct thin_c *tc, dm_block_t virt_block,
1024 dm_block_t data_origin, dm_block_t data_dest,
1025 struct cell *cell, struct bio *bio)
1026{
1027 schedule_copy(tc, virt_block, tc->pool_dev,
1028 data_origin, data_dest, cell, bio);
1029}
1030
1031static void schedule_external_copy(struct thin_c *tc, dm_block_t virt_block,
1032 dm_block_t data_dest,
1033 struct cell *cell, struct bio *bio)
1034{
1035 schedule_copy(tc, virt_block, tc->origin_dev,
1036 virt_block, data_dest, cell, bio);
1037}
1038
970static void schedule_zero(struct thin_c *tc, dm_block_t virt_block, 1039static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
971 dm_block_t data_block, struct cell *cell, 1040 dm_block_t data_block, struct cell *cell,
972 struct bio *bio) 1041 struct bio *bio)
@@ -975,6 +1044,7 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
975 struct new_mapping *m = get_next_mapping(pool); 1044 struct new_mapping *m = get_next_mapping(pool);
976 1045
977 INIT_LIST_HEAD(&m->list); 1046 INIT_LIST_HEAD(&m->list);
1047 m->quiesced = 1;
978 m->prepared = 0; 1048 m->prepared = 0;
979 m->tc = tc; 1049 m->tc = tc;
980 m->virt_block = virt_block; 1050 m->virt_block = virt_block;
@@ -988,13 +1058,14 @@ static void schedule_zero(struct thin_c *tc, dm_block_t virt_block,
988 * zeroing pre-existing data, we can issue the bio immediately. 1058 * zeroing pre-existing data, we can issue the bio immediately.
989 * Otherwise we use kcopyd to zero the data first. 1059 * Otherwise we use kcopyd to zero the data first.
990 */ 1060 */
991 if (!pool->zero_new_blocks) 1061 if (!pool->pf.zero_new_blocks)
992 process_prepared_mapping(m); 1062 process_prepared_mapping(m);
993 1063
994 else if (io_overwrites_block(pool, bio)) { 1064 else if (io_overwrites_block(pool, bio)) {
1065 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
1066 h->overwrite_mapping = m;
995 m->bio = bio; 1067 m->bio = bio;
996 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio); 1068 save_and_set_endio(bio, &m->saved_bi_end_io, overwrite_endio);
997 dm_get_mapinfo(bio)->ptr = m;
998 remap_and_issue(tc, bio, data_block); 1069 remap_and_issue(tc, bio, data_block);
999 1070
1000 } else { 1071 } else {
@@ -1081,7 +1152,8 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
1081 */ 1152 */
1082static void retry_on_resume(struct bio *bio) 1153static void retry_on_resume(struct bio *bio)
1083{ 1154{
1084 struct thin_c *tc = dm_get_mapinfo(bio)->ptr; 1155 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
1156 struct thin_c *tc = h->tc;
1085 struct pool *pool = tc->pool; 1157 struct pool *pool = tc->pool;
1086 unsigned long flags; 1158 unsigned long flags;
1087 1159
@@ -1102,6 +1174,86 @@ static void no_space(struct cell *cell)
1102 retry_on_resume(bio); 1174 retry_on_resume(bio);
1103} 1175}
1104 1176
1177static void process_discard(struct thin_c *tc, struct bio *bio)
1178{
1179 int r;
1180 struct pool *pool = tc->pool;
1181 struct cell *cell, *cell2;
1182 struct cell_key key, key2;
1183 dm_block_t block = get_bio_block(tc, bio);
1184 struct dm_thin_lookup_result lookup_result;
1185 struct new_mapping *m;
1186
1187 build_virtual_key(tc->td, block, &key);
1188 if (bio_detain(tc->pool->prison, &key, bio, &cell))
1189 return;
1190
1191 r = dm_thin_find_block(tc->td, block, 1, &lookup_result);
1192 switch (r) {
1193 case 0:
1194 /*
1195 * Check nobody is fiddling with this pool block. This can
1196 * happen if someone's in the process of breaking sharing
1197 * on this block.
1198 */
1199 build_data_key(tc->td, lookup_result.block, &key2);
1200 if (bio_detain(tc->pool->prison, &key2, bio, &cell2)) {
1201 cell_release_singleton(cell, bio);
1202 break;
1203 }
1204
1205 if (io_overlaps_block(pool, bio)) {
1206 /*
1207 * IO may still be going to the destination block. We must
1208 * quiesce before we can do the removal.
1209 */
1210 m = get_next_mapping(pool);
1211 m->tc = tc;
1212 m->pass_discard = (!lookup_result.shared) & pool->pf.discard_passdown;
1213 m->virt_block = block;
1214 m->data_block = lookup_result.block;
1215 m->cell = cell;
1216 m->cell2 = cell2;
1217 m->err = 0;
1218 m->bio = bio;
1219
1220 if (!ds_add_work(&pool->all_io_ds, &m->list)) {
1221 list_add(&m->list, &pool->prepared_discards);
1222 wake_worker(pool);
1223 }
1224 } else {
1225 /*
1226 * This path is hit if people are ignoring
1227 * limits->discard_granularity. It ignores any
1228 * part of the discard that is in a subsequent
1229 * block.
1230 */
1231 sector_t offset = bio->bi_sector - (block << pool->block_shift);
1232 unsigned remaining = (pool->sectors_per_block - offset) << 9;
1233 bio->bi_size = min(bio->bi_size, remaining);
1234
1235 cell_release_singleton(cell, bio);
1236 cell_release_singleton(cell2, bio);
1237 remap_and_issue(tc, bio, lookup_result.block);
1238 }
1239 break;
1240
1241 case -ENODATA:
1242 /*
1243 * It isn't provisioned, just forget it.
1244 */
1245 cell_release_singleton(cell, bio);
1246 bio_endio(bio, 0);
1247 break;
1248
1249 default:
1250 DMERR("discard: find block unexpectedly returned %d", r);
1251 cell_release_singleton(cell, bio);
1252 bio_io_error(bio);
1253 break;
1254 }
1255}
1256
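The else branch above handles callers that ignore limits->discard_granularity: only the part of the discard that falls inside the current pool block is kept. A minimal userspace sketch of that trimming arithmetic, using invented sector counts rather than the driver's pool fields:

#include <stdint.h>
#include <stdio.h>

/*
 * Hypothetical illustration of trimming a discard to one pool block.
 * block_shift and sectors_per_block mirror the pool fields; the values
 * are made up for the example.
 */
int main(void)
{
	uint64_t sectors_per_block = 128;	/* 64KiB blocks */
	unsigned block_shift = 7;		/* log2(128) */
	uint64_t bi_sector = 300;		/* discard start, in sectors */
	uint64_t bi_size = 96 * 512;		/* discard length, in bytes */

	uint64_t block = bi_sector >> block_shift;		/* pool block 2 */
	uint64_t offset = bi_sector - (block << block_shift);	/* 44 sectors in */
	uint64_t remaining = (sectors_per_block - offset) << 9;

	if (bi_size > remaining)
		bi_size = remaining;	/* trimmed to the 84 sectors left in block 2 */

	printf("block %llu, trimmed to %llu bytes\n",
	       (unsigned long long)block, (unsigned long long)bi_size);
	return 0;
}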
1105static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block, 1257static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
1106 struct cell_key *key, 1258 struct cell_key *key,
1107 struct dm_thin_lookup_result *lookup_result, 1259 struct dm_thin_lookup_result *lookup_result,
@@ -1113,8 +1265,8 @@ static void break_sharing(struct thin_c *tc, struct bio *bio, dm_block_t block,
1113 r = alloc_data_block(tc, &data_block); 1265 r = alloc_data_block(tc, &data_block);
1114 switch (r) { 1266 switch (r) {
1115 case 0: 1267 case 0:
1116 schedule_copy(tc, block, lookup_result->block, 1268 schedule_internal_copy(tc, block, lookup_result->block,
1117 data_block, cell, bio); 1269 data_block, cell, bio);
1118 break; 1270 break;
1119 1271
1120 case -ENOSPC: 1272 case -ENOSPC:
@@ -1147,13 +1299,9 @@ static void process_shared_bio(struct thin_c *tc, struct bio *bio,
1147 if (bio_data_dir(bio) == WRITE) 1299 if (bio_data_dir(bio) == WRITE)
1148 break_sharing(tc, bio, block, &key, lookup_result, cell); 1300 break_sharing(tc, bio, block, &key, lookup_result, cell);
1149 else { 1301 else {
1150 struct endio_hook *h; 1302 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
1151 h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);
1152 1303
1153 h->tc = tc; 1304 h->shared_read_entry = ds_inc(&pool->shared_read_ds);
1154 h->entry = ds_inc(&pool->ds);
1155 save_and_set_endio(bio, &h->saved_bi_end_io, shared_read_endio);
1156 dm_get_mapinfo(bio)->ptr = h;
1157 1305
1158 cell_release_singleton(cell, bio); 1306 cell_release_singleton(cell, bio);
1159 remap_and_issue(tc, bio, lookup_result->block); 1307 remap_and_issue(tc, bio, lookup_result->block);
@@ -1188,7 +1336,10 @@ static void provision_block(struct thin_c *tc, struct bio *bio, dm_block_t block
1188 r = alloc_data_block(tc, &data_block); 1336 r = alloc_data_block(tc, &data_block);
1189 switch (r) { 1337 switch (r) {
1190 case 0: 1338 case 0:
1191 schedule_zero(tc, block, data_block, cell, bio); 1339 if (tc->origin_dev)
1340 schedule_external_copy(tc, block, data_block, cell, bio);
1341 else
1342 schedule_zero(tc, block, data_block, cell, bio);
1192 break; 1343 break;
1193 1344
1194 case -ENOSPC: 1345 case -ENOSPC:
@@ -1239,16 +1390,27 @@ static void process_bio(struct thin_c *tc, struct bio *bio)
1239 break; 1390 break;
1240 1391
1241 case -ENODATA: 1392 case -ENODATA:
1242 provision_block(tc, bio, block, cell); 1393 if (bio_data_dir(bio) == READ && tc->origin_dev) {
1394 cell_release_singleton(cell, bio);
1395 remap_to_origin_and_issue(tc, bio);
1396 } else
1397 provision_block(tc, bio, block, cell);
1243 break; 1398 break;
1244 1399
1245 default: 1400 default:
1246 DMERR("dm_thin_find_block() failed, error = %d", r); 1401 DMERR("dm_thin_find_block() failed, error = %d", r);
1402 cell_release_singleton(cell, bio);
1247 bio_io_error(bio); 1403 bio_io_error(bio);
1248 break; 1404 break;
1249 } 1405 }
1250} 1406}
1251 1407
1408static int need_commit_due_to_time(struct pool *pool)
1409{
1410 return jiffies < pool->last_commit_jiffies ||
1411 jiffies > pool->last_commit_jiffies + COMMIT_PERIOD;
1412}
1413
1252static void process_deferred_bios(struct pool *pool) 1414static void process_deferred_bios(struct pool *pool)
1253{ 1415{
1254 unsigned long flags; 1416 unsigned long flags;
@@ -1264,7 +1426,9 @@ static void process_deferred_bios(struct pool *pool)
1264 spin_unlock_irqrestore(&pool->lock, flags); 1426 spin_unlock_irqrestore(&pool->lock, flags);
1265 1427
1266 while ((bio = bio_list_pop(&bios))) { 1428 while ((bio = bio_list_pop(&bios))) {
1267 struct thin_c *tc = dm_get_mapinfo(bio)->ptr; 1429 struct endio_hook *h = dm_get_mapinfo(bio)->ptr;
1430 struct thin_c *tc = h->tc;
1431
1268 /* 1432 /*
1269 * If we've got no free new_mapping structs, and processing 1433 * If we've got no free new_mapping structs, and processing
1270 * this bio might require one, we pause until there are some 1434 * this bio might require one, we pause until there are some
@@ -1277,7 +1441,11 @@ static void process_deferred_bios(struct pool *pool)
1277 1441
1278 break; 1442 break;
1279 } 1443 }
1280 process_bio(tc, bio); 1444
1445 if (bio->bi_rw & REQ_DISCARD)
1446 process_discard(tc, bio);
1447 else
1448 process_bio(tc, bio);
1281 } 1449 }
1282 1450
1283 /* 1451 /*
@@ -1290,7 +1458,7 @@ static void process_deferred_bios(struct pool *pool)
1290 bio_list_init(&pool->deferred_flush_bios); 1458 bio_list_init(&pool->deferred_flush_bios);
1291 spin_unlock_irqrestore(&pool->lock, flags); 1459 spin_unlock_irqrestore(&pool->lock, flags);
1292 1460
1293 if (bio_list_empty(&bios)) 1461 if (bio_list_empty(&bios) && !need_commit_due_to_time(pool))
1294 return; 1462 return;
1295 1463
1296 r = dm_pool_commit_metadata(pool->pmd); 1464 r = dm_pool_commit_metadata(pool->pmd);
@@ -1301,6 +1469,7 @@ static void process_deferred_bios(struct pool *pool)
1301 bio_io_error(bio); 1469 bio_io_error(bio);
1302 return; 1470 return;
1303 } 1471 }
1472 pool->last_commit_jiffies = jiffies;
1304 1473
1305 while ((bio = bio_list_pop(&bios))) 1474 while ((bio = bio_list_pop(&bios)))
1306 generic_make_request(bio); 1475 generic_make_request(bio);
@@ -1310,10 +1479,22 @@ static void do_worker(struct work_struct *ws)
1310{ 1479{
1311 struct pool *pool = container_of(ws, struct pool, worker); 1480 struct pool *pool = container_of(ws, struct pool, worker);
1312 1481
1313 process_prepared_mappings(pool); 1482 process_prepared(pool, &pool->prepared_mappings, process_prepared_mapping);
1483 process_prepared(pool, &pool->prepared_discards, process_prepared_discard);
1314 process_deferred_bios(pool); 1484 process_deferred_bios(pool);
1315} 1485}
1316 1486
1487/*
1488 * We want to commit periodically so that not too much
1489 * unwritten data builds up.
1490 */
1491static void do_waker(struct work_struct *ws)
1492{
1493 struct pool *pool = container_of(to_delayed_work(ws), struct pool, waker);
1494 wake_worker(pool);
1495 queue_delayed_work(pool->wq, &pool->waker, COMMIT_PERIOD);
1496}
1497
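need_commit_due_to_time() above also treats a tick count that appears to have moved backwards as "commit now", which covers jiffies wrap. A small userspace sketch of the combined decision made in process_deferred_bios(), with a made-up tick counter standing in for jiffies:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define COMMIT_PERIOD 100	/* ticks; hypothetical */

/* Commit if the period elapsed or the counter jumped backwards. */
static bool need_commit_due_to_time(uint64_t now, uint64_t last_commit)
{
	return now < last_commit || now > last_commit + COMMIT_PERIOD;
}

int main(void)
{
	uint64_t last_commit = 500, now = 650;
	int pending_flush_bios = 0;

	/* Skip the commit only if no flush bios are waiting and the period hasn't elapsed. */
	if (pending_flush_bios || need_commit_due_to_time(now, last_commit)) {
		last_commit = now;	/* the driver records last_commit_jiffies here */
		printf("committed at tick %llu\n", (unsigned long long)last_commit);
	}
	return 0;
}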
1317/*----------------------------------------------------------------*/ 1498/*----------------------------------------------------------------*/
1318 1499
1319/* 1500/*
@@ -1335,6 +1516,19 @@ static void thin_defer_bio(struct thin_c *tc, struct bio *bio)
1335 wake_worker(pool); 1516 wake_worker(pool);
1336} 1517}
1337 1518
1519static struct endio_hook *thin_hook_bio(struct thin_c *tc, struct bio *bio)
1520{
1521 struct pool *pool = tc->pool;
1522 struct endio_hook *h = mempool_alloc(pool->endio_hook_pool, GFP_NOIO);
1523
1524 h->tc = tc;
1525 h->shared_read_entry = NULL;
1526 h->all_io_entry = bio->bi_rw & REQ_DISCARD ? NULL : ds_inc(&pool->all_io_ds);
1527 h->overwrite_mapping = NULL;
1528
1529 return h;
1530}
1531
1338/* 1532/*
1339 * Non-blocking function called from the thin target's map function. 1533 * Non-blocking function called from the thin target's map function.
1340 */ 1534 */
@@ -1347,12 +1541,8 @@ static int thin_bio_map(struct dm_target *ti, struct bio *bio,
1347 struct dm_thin_device *td = tc->td; 1541 struct dm_thin_device *td = tc->td;
1348 struct dm_thin_lookup_result result; 1542 struct dm_thin_lookup_result result;
1349 1543
1350 /* 1544 map_context->ptr = thin_hook_bio(tc, bio);
1351 * Save the thin context for easy access from the deferred bio later. 1545 if (bio->bi_rw & (REQ_DISCARD | REQ_FLUSH | REQ_FUA)) {
1352 */
1353 map_context->ptr = tc;
1354
1355 if (bio->bi_rw & (REQ_FLUSH | REQ_FUA)) {
1356 thin_defer_bio(tc, bio); 1546 thin_defer_bio(tc, bio);
1357 return DM_MAPIO_SUBMITTED; 1547 return DM_MAPIO_SUBMITTED;
1358 } 1548 }
@@ -1434,7 +1624,7 @@ static int bind_control_target(struct pool *pool, struct dm_target *ti)
1434 1624
1435 pool->ti = ti; 1625 pool->ti = ti;
1436 pool->low_water_blocks = pt->low_water_blocks; 1626 pool->low_water_blocks = pt->low_water_blocks;
1437 pool->zero_new_blocks = pt->zero_new_blocks; 1627 pool->pf = pt->pf;
1438 1628
1439 return 0; 1629 return 0;
1440} 1630}
@@ -1448,6 +1638,14 @@ static void unbind_control_target(struct pool *pool, struct dm_target *ti)
1448/*---------------------------------------------------------------- 1638/*----------------------------------------------------------------
1449 * Pool creation 1639 * Pool creation
1450 *--------------------------------------------------------------*/ 1640 *--------------------------------------------------------------*/
1641/* Initialize pool features. */
1642static void pool_features_init(struct pool_features *pf)
1643{
1644 pf->zero_new_blocks = 1;
1645 pf->discard_enabled = 1;
1646 pf->discard_passdown = 1;
1647}
1648
1451static void __pool_destroy(struct pool *pool) 1649static void __pool_destroy(struct pool *pool)
1452{ 1650{
1453 __pool_table_remove(pool); 1651 __pool_table_remove(pool);
@@ -1495,7 +1693,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
1495 pool->block_shift = ffs(block_size) - 1; 1693 pool->block_shift = ffs(block_size) - 1;
1496 pool->offset_mask = block_size - 1; 1694 pool->offset_mask = block_size - 1;
1497 pool->low_water_blocks = 0; 1695 pool->low_water_blocks = 0;
1498 pool->zero_new_blocks = 1; 1696 pool_features_init(&pool->pf);
1499 pool->prison = prison_create(PRISON_CELLS); 1697 pool->prison = prison_create(PRISON_CELLS);
1500 if (!pool->prison) { 1698 if (!pool->prison) {
1501 *error = "Error creating pool's bio prison"; 1699 *error = "Error creating pool's bio prison";
@@ -1523,14 +1721,17 @@ static struct pool *pool_create(struct mapped_device *pool_md,
1523 } 1721 }
1524 1722
1525 INIT_WORK(&pool->worker, do_worker); 1723 INIT_WORK(&pool->worker, do_worker);
1724 INIT_DELAYED_WORK(&pool->waker, do_waker);
1526 spin_lock_init(&pool->lock); 1725 spin_lock_init(&pool->lock);
1527 bio_list_init(&pool->deferred_bios); 1726 bio_list_init(&pool->deferred_bios);
1528 bio_list_init(&pool->deferred_flush_bios); 1727 bio_list_init(&pool->deferred_flush_bios);
1529 INIT_LIST_HEAD(&pool->prepared_mappings); 1728 INIT_LIST_HEAD(&pool->prepared_mappings);
1729 INIT_LIST_HEAD(&pool->prepared_discards);
1530 pool->low_water_triggered = 0; 1730 pool->low_water_triggered = 0;
1531 pool->no_free_space = 0; 1731 pool->no_free_space = 0;
1532 bio_list_init(&pool->retry_on_resume_list); 1732 bio_list_init(&pool->retry_on_resume_list);
1533 ds_init(&pool->ds); 1733 ds_init(&pool->shared_read_ds);
1734 ds_init(&pool->all_io_ds);
1534 1735
1535 pool->next_mapping = NULL; 1736 pool->next_mapping = NULL;
1536 pool->mapping_pool = 1737 pool->mapping_pool =
@@ -1549,6 +1750,7 @@ static struct pool *pool_create(struct mapped_device *pool_md,
1549 goto bad_endio_hook_pool; 1750 goto bad_endio_hook_pool;
1550 } 1751 }
1551 pool->ref_count = 1; 1752 pool->ref_count = 1;
1753 pool->last_commit_jiffies = jiffies;
1552 pool->pool_md = pool_md; 1754 pool->pool_md = pool_md;
1553 pool->md_dev = metadata_dev; 1755 pool->md_dev = metadata_dev;
1554 __pool_table_insert(pool); 1756 __pool_table_insert(pool);
@@ -1588,7 +1790,8 @@ static void __pool_dec(struct pool *pool)
1588 1790
1589static struct pool *__pool_find(struct mapped_device *pool_md, 1791static struct pool *__pool_find(struct mapped_device *pool_md,
1590 struct block_device *metadata_dev, 1792 struct block_device *metadata_dev,
1591 unsigned long block_size, char **error) 1793 unsigned long block_size, char **error,
1794 int *created)
1592{ 1795{
1593 struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev); 1796 struct pool *pool = __pool_table_lookup_metadata_dev(metadata_dev);
1594 1797
@@ -1604,8 +1807,10 @@ static struct pool *__pool_find(struct mapped_device *pool_md,
1604 return ERR_PTR(-EINVAL); 1807 return ERR_PTR(-EINVAL);
1605 __pool_inc(pool); 1808 __pool_inc(pool);
1606 1809
1607 } else 1810 } else {
1608 pool = pool_create(pool_md, metadata_dev, block_size, error); 1811 pool = pool_create(pool_md, metadata_dev, block_size, error);
1812 *created = 1;
1813 }
1609 } 1814 }
1610 1815
1611 return pool; 1816 return pool;
@@ -1629,10 +1834,6 @@ static void pool_dtr(struct dm_target *ti)
1629 mutex_unlock(&dm_thin_pool_table.mutex); 1834 mutex_unlock(&dm_thin_pool_table.mutex);
1630} 1835}
1631 1836
1632struct pool_features {
1633 unsigned zero_new_blocks:1;
1634};
1635
1636static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf, 1837static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
1637 struct dm_target *ti) 1838 struct dm_target *ti)
1638{ 1839{
@@ -1641,7 +1842,7 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
1641 const char *arg_name; 1842 const char *arg_name;
1642 1843
1643 static struct dm_arg _args[] = { 1844 static struct dm_arg _args[] = {
1644 {0, 1, "Invalid number of pool feature arguments"}, 1845 {0, 3, "Invalid number of pool feature arguments"},
1645 }; 1846 };
1646 1847
1647 /* 1848 /*
@@ -1661,6 +1862,12 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
1661 if (!strcasecmp(arg_name, "skip_block_zeroing")) { 1862 if (!strcasecmp(arg_name, "skip_block_zeroing")) {
1662 pf->zero_new_blocks = 0; 1863 pf->zero_new_blocks = 0;
1663 continue; 1864 continue;
1865 } else if (!strcasecmp(arg_name, "ignore_discard")) {
1866 pf->discard_enabled = 0;
1867 continue;
1868 } else if (!strcasecmp(arg_name, "no_discard_passdown")) {
1869 pf->discard_passdown = 0;
1870 continue;
1664 } 1871 }
1665 1872
1666 ti->error = "Unrecognised pool feature requested"; 1873 ti->error = "Unrecognised pool feature requested";
@@ -1678,10 +1885,12 @@ static int parse_pool_features(struct dm_arg_set *as, struct pool_features *pf,
1678 * 1885 *
1679 * Optional feature arguments are: 1886 * Optional feature arguments are:
1680 * skip_block_zeroing: skips the zeroing of newly-provisioned blocks. 1887 * skip_block_zeroing: skips the zeroing of newly-provisioned blocks.
1888 * ignore_discard: disable discard
1889 * no_discard_passdown: don't pass discards down to the data device
1681 */ 1890 */
1682static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv) 1891static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1683{ 1892{
1684 int r; 1893 int r, pool_created = 0;
1685 struct pool_c *pt; 1894 struct pool_c *pt;
1686 struct pool *pool; 1895 struct pool *pool;
1687 struct pool_features pf; 1896 struct pool_features pf;
@@ -1691,6 +1900,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1691 dm_block_t low_water_blocks; 1900 dm_block_t low_water_blocks;
1692 struct dm_dev *metadata_dev; 1901 struct dm_dev *metadata_dev;
1693 sector_t metadata_dev_size; 1902 sector_t metadata_dev_size;
1903 char b[BDEVNAME_SIZE];
1694 1904
1695 /* 1905 /*
1696 * FIXME Remove validation from scope of lock. 1906 * FIXME Remove validation from scope of lock.
@@ -1712,11 +1922,9 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1712 } 1922 }
1713 1923
1714 metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT; 1924 metadata_dev_size = i_size_read(metadata_dev->bdev->bd_inode) >> SECTOR_SHIFT;
1715 if (metadata_dev_size > METADATA_DEV_MAX_SECTORS) { 1925 if (metadata_dev_size > THIN_METADATA_MAX_SECTORS_WARNING)
1716 ti->error = "Metadata device is too large"; 1926 DMWARN("Metadata device %s is larger than %u sectors: excess space will not be used.",
1717 r = -EINVAL; 1927 bdevname(metadata_dev->bdev, b), THIN_METADATA_MAX_SECTORS);
1718 goto out_metadata;
1719 }
1720 1928
1721 r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev); 1929 r = dm_get_device(ti, argv[1], FMODE_READ | FMODE_WRITE, &data_dev);
1722 if (r) { 1930 if (r) {
@@ -1742,8 +1950,7 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1742 /* 1950 /*
1743 * Set default pool features. 1951 * Set default pool features.
1744 */ 1952 */
1745 memset(&pf, 0, sizeof(pf)); 1953 pool_features_init(&pf);
1746 pf.zero_new_blocks = 1;
1747 1954
1748 dm_consume_args(&as, 4); 1955 dm_consume_args(&as, 4);
1749 r = parse_pool_features(&as, &pf, ti); 1956 r = parse_pool_features(&as, &pf, ti);
@@ -1757,20 +1964,58 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1757 } 1964 }
1758 1965
1759 pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev, 1966 pool = __pool_find(dm_table_get_md(ti->table), metadata_dev->bdev,
1760 block_size, &ti->error); 1967 block_size, &ti->error, &pool_created);
1761 if (IS_ERR(pool)) { 1968 if (IS_ERR(pool)) {
1762 r = PTR_ERR(pool); 1969 r = PTR_ERR(pool);
1763 goto out_free_pt; 1970 goto out_free_pt;
1764 } 1971 }
1765 1972
1973 /*
1974 * 'pool_created' reflects whether this is the first table load.
1975 * Top level discard support is not allowed to be changed after
1976 * initial load. This would require a pool reload to trigger thin
1977 * device changes.
1978 */
1979 if (!pool_created && pf.discard_enabled != pool->pf.discard_enabled) {
1980 ti->error = "Discard support cannot be disabled once enabled";
1981 r = -EINVAL;
1982 goto out_flags_changed;
1983 }
1984
1985 /*
1986 * If discard_passdown was enabled verify that the data device
1987 * supports discards. Disable discard_passdown if not; otherwise
1988 * -EOPNOTSUPP will be returned.
1989 */
1990 if (pf.discard_passdown) {
1991 struct request_queue *q = bdev_get_queue(data_dev->bdev);
1992 if (!q || !blk_queue_discard(q)) {
1993 DMWARN("Discard unsupported by data device: Disabling discard passdown.");
1994 pf.discard_passdown = 0;
1995 }
1996 }
1997
1766 pt->pool = pool; 1998 pt->pool = pool;
1767 pt->ti = ti; 1999 pt->ti = ti;
1768 pt->metadata_dev = metadata_dev; 2000 pt->metadata_dev = metadata_dev;
1769 pt->data_dev = data_dev; 2001 pt->data_dev = data_dev;
1770 pt->low_water_blocks = low_water_blocks; 2002 pt->low_water_blocks = low_water_blocks;
1771 pt->zero_new_blocks = pf.zero_new_blocks; 2003 pt->pf = pf;
1772 ti->num_flush_requests = 1; 2004 ti->num_flush_requests = 1;
1773 ti->num_discard_requests = 0; 2005 /*
2006 * Only need to enable discards if the pool should pass
2007 * them down to the data device. The thin device's discard
2008 * processing will cause mappings to be removed from the btree.
2009 */
2010 if (pf.discard_enabled && pf.discard_passdown) {
2011 ti->num_discard_requests = 1;
2012 /*
2013 * Setting 'discards_supported' circumvents the normal
2014 * stacking of discard limits (this keeps the pool and
2015 * thin devices' discard limits consistent).
2016 */
2017 ti->discards_supported = 1;
2018 }
1774 ti->private = pt; 2019 ti->private = pt;
1775 2020
1776 pt->callbacks.congested_fn = pool_is_congested; 2021 pt->callbacks.congested_fn = pool_is_congested;
@@ -1780,6 +2025,8 @@ static int pool_ctr(struct dm_target *ti, unsigned argc, char **argv)
1780 2025
1781 return 0; 2026 return 0;
1782 2027
2028out_flags_changed:
2029 __pool_dec(pool);
1783out_free_pt: 2030out_free_pt:
1784 kfree(pt); 2031 kfree(pt);
1785out: 2032out:
@@ -1878,7 +2125,7 @@ static void pool_resume(struct dm_target *ti)
1878 __requeue_bios(pool); 2125 __requeue_bios(pool);
1879 spin_unlock_irqrestore(&pool->lock, flags); 2126 spin_unlock_irqrestore(&pool->lock, flags);
1880 2127
1881 wake_worker(pool); 2128 do_waker(&pool->waker.work);
1882} 2129}
1883 2130
1884static void pool_postsuspend(struct dm_target *ti) 2131static void pool_postsuspend(struct dm_target *ti)
@@ -1887,6 +2134,7 @@ static void pool_postsuspend(struct dm_target *ti)
1887 struct pool_c *pt = ti->private; 2134 struct pool_c *pt = ti->private;
1888 struct pool *pool = pt->pool; 2135 struct pool *pool = pt->pool;
1889 2136
2137 cancel_delayed_work(&pool->waker);
1890 flush_workqueue(pool->wq); 2138 flush_workqueue(pool->wq);
1891 2139
1892 r = dm_pool_commit_metadata(pool->pmd); 2140 r = dm_pool_commit_metadata(pool->pmd);
@@ -2067,7 +2315,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
2067static int pool_status(struct dm_target *ti, status_type_t type, 2315static int pool_status(struct dm_target *ti, status_type_t type,
2068 char *result, unsigned maxlen) 2316 char *result, unsigned maxlen)
2069{ 2317{
2070 int r; 2318 int r, count;
2071 unsigned sz = 0; 2319 unsigned sz = 0;
2072 uint64_t transaction_id; 2320 uint64_t transaction_id;
2073 dm_block_t nr_free_blocks_data; 2321 dm_block_t nr_free_blocks_data;
@@ -2130,10 +2378,19 @@ static int pool_status(struct dm_target *ti, status_type_t type,
2130 (unsigned long)pool->sectors_per_block, 2378 (unsigned long)pool->sectors_per_block,
2131 (unsigned long long)pt->low_water_blocks); 2379 (unsigned long long)pt->low_water_blocks);
2132 2380
2133 DMEMIT("%u ", !pool->zero_new_blocks); 2381 count = !pool->pf.zero_new_blocks + !pool->pf.discard_enabled +
2382 !pool->pf.discard_passdown;
2383 DMEMIT("%u ", count);
2134 2384
2135 if (!pool->zero_new_blocks) 2385 if (!pool->pf.zero_new_blocks)
2136 DMEMIT("skip_block_zeroing "); 2386 DMEMIT("skip_block_zeroing ");
2387
2388 if (!pool->pf.discard_enabled)
2389 DMEMIT("ignore_discard ");
2390
2391 if (!pool->pf.discard_passdown)
2392 DMEMIT("no_discard_passdown ");
2393
2137 break; 2394 break;
2138 } 2395 }
2139 2396
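For reference, here is how the STATUSTYPE_TABLE branch above encodes the optional features: a count of non-default settings followed by their names. The snippet below is a userspace sketch of just that output format; the struct and sample values are invented:

#include <stdio.h>

/* Hypothetical mirror of struct pool_features. */
struct features {
	unsigned zero_new_blocks:1;
	unsigned discard_enabled:1;
	unsigned discard_passdown:1;
};

static void emit_features(const struct features *pf)
{
	int count = !pf->zero_new_blocks + !pf->discard_enabled +
		    !pf->discard_passdown;

	printf("%d ", count);
	if (!pf->zero_new_blocks)
		printf("skip_block_zeroing ");
	if (!pf->discard_enabled)
		printf("ignore_discard ");
	if (!pf->discard_passdown)
		printf("no_discard_passdown ");
	printf("\n");
}

int main(void)
{
	struct features pf = { .zero_new_blocks = 0,
			       .discard_enabled = 1,
			       .discard_passdown = 0 };

	emit_features(&pf);	/* prints: 2 skip_block_zeroing no_discard_passdown */
	return 0;
}

The same three keywords are what parse_pool_features() accepts in the pool constructor, so status output and table arguments stay symmetrical.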
@@ -2162,6 +2419,21 @@ static int pool_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
2162 return min(max_size, q->merge_bvec_fn(q, bvm, biovec)); 2419 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
2163} 2420}
2164 2421
2422static void set_discard_limits(struct pool *pool, struct queue_limits *limits)
2423{
2424 /*
2425 * FIXME: these limits may be incompatible with the pool's data device
2426 */
2427 limits->max_discard_sectors = pool->sectors_per_block;
2428
2429 /*
2430 * This is just a hint, and not enforced. We have to cope with
2431 * bios that overlap 2 blocks.
2432 */
2433 limits->discard_granularity = pool->sectors_per_block << SECTOR_SHIFT;
2434 limits->discard_zeroes_data = pool->pf.zero_new_blocks;
2435}
2436
2165static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits) 2437static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
2166{ 2438{
2167 struct pool_c *pt = ti->private; 2439 struct pool_c *pt = ti->private;
@@ -2169,13 +2441,15 @@ static void pool_io_hints(struct dm_target *ti, struct queue_limits *limits)
2169 2441
2170 blk_limits_io_min(limits, 0); 2442 blk_limits_io_min(limits, 0);
2171 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT); 2443 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
2444 if (pool->pf.discard_enabled)
2445 set_discard_limits(pool, limits);
2172} 2446}
2173 2447
2174static struct target_type pool_target = { 2448static struct target_type pool_target = {
2175 .name = "thin-pool", 2449 .name = "thin-pool",
2176 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE | 2450 .features = DM_TARGET_SINGLETON | DM_TARGET_ALWAYS_WRITEABLE |
2177 DM_TARGET_IMMUTABLE, 2451 DM_TARGET_IMMUTABLE,
2178 .version = {1, 0, 0}, 2452 .version = {1, 1, 0},
2179 .module = THIS_MODULE, 2453 .module = THIS_MODULE,
2180 .ctr = pool_ctr, 2454 .ctr = pool_ctr,
2181 .dtr = pool_dtr, 2455 .dtr = pool_dtr,
@@ -2202,6 +2476,8 @@ static void thin_dtr(struct dm_target *ti)
2202 __pool_dec(tc->pool); 2476 __pool_dec(tc->pool);
2203 dm_pool_close_thin_device(tc->td); 2477 dm_pool_close_thin_device(tc->td);
2204 dm_put_device(ti, tc->pool_dev); 2478 dm_put_device(ti, tc->pool_dev);
2479 if (tc->origin_dev)
2480 dm_put_device(ti, tc->origin_dev);
2205 kfree(tc); 2481 kfree(tc);
2206 2482
2207 mutex_unlock(&dm_thin_pool_table.mutex); 2483 mutex_unlock(&dm_thin_pool_table.mutex);
@@ -2210,21 +2486,25 @@ static void thin_dtr(struct dm_target *ti)
2210/* 2486/*
2211 * Thin target parameters: 2487 * Thin target parameters:
2212 * 2488 *
2213 * <pool_dev> <dev_id> 2489 * <pool_dev> <dev_id> [origin_dev]
2214 * 2490 *
2215 * pool_dev: the path to the pool (eg, /dev/mapper/my_pool) 2491 * pool_dev: the path to the pool (eg, /dev/mapper/my_pool)
2216 * dev_id: the internal device identifier 2492 * dev_id: the internal device identifier
2493 * origin_dev: a device external to the pool that should act as the origin
2494 *
2495 * If the pool device has discards disabled, they get disabled for the thin
2496 * device as well.
2217 */ 2497 */
2218static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv) 2498static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2219{ 2499{
2220 int r; 2500 int r;
2221 struct thin_c *tc; 2501 struct thin_c *tc;
2222 struct dm_dev *pool_dev; 2502 struct dm_dev *pool_dev, *origin_dev;
2223 struct mapped_device *pool_md; 2503 struct mapped_device *pool_md;
2224 2504
2225 mutex_lock(&dm_thin_pool_table.mutex); 2505 mutex_lock(&dm_thin_pool_table.mutex);
2226 2506
2227 if (argc != 2) { 2507 if (argc != 2 && argc != 3) {
2228 ti->error = "Invalid argument count"; 2508 ti->error = "Invalid argument count";
2229 r = -EINVAL; 2509 r = -EINVAL;
2230 goto out_unlock; 2510 goto out_unlock;
@@ -2237,6 +2517,15 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2237 goto out_unlock; 2517 goto out_unlock;
2238 } 2518 }
2239 2519
2520 if (argc == 3) {
2521 r = dm_get_device(ti, argv[2], FMODE_READ, &origin_dev);
2522 if (r) {
2523 ti->error = "Error opening origin device";
2524 goto bad_origin_dev;
2525 }
2526 tc->origin_dev = origin_dev;
2527 }
2528
2240 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev); 2529 r = dm_get_device(ti, argv[0], dm_table_get_mode(ti->table), &pool_dev);
2241 if (r) { 2530 if (r) {
2242 ti->error = "Error opening pool device"; 2531 ti->error = "Error opening pool device";
@@ -2273,8 +2562,12 @@ static int thin_ctr(struct dm_target *ti, unsigned argc, char **argv)
2273 2562
2274 ti->split_io = tc->pool->sectors_per_block; 2563 ti->split_io = tc->pool->sectors_per_block;
2275 ti->num_flush_requests = 1; 2564 ti->num_flush_requests = 1;
2276 ti->num_discard_requests = 0; 2565
2277 ti->discards_supported = 0; 2566 /* In case the pool supports discards, pass them on. */
2567 if (tc->pool->pf.discard_enabled) {
2568 ti->discards_supported = 1;
2569 ti->num_discard_requests = 1;
2570 }
2278 2571
2279 dm_put(pool_md); 2572 dm_put(pool_md);
2280 2573
@@ -2289,6 +2582,9 @@ bad_pool_lookup:
2289bad_common: 2582bad_common:
2290 dm_put_device(ti, tc->pool_dev); 2583 dm_put_device(ti, tc->pool_dev);
2291bad_pool_dev: 2584bad_pool_dev:
2585 if (tc->origin_dev)
2586 dm_put_device(ti, tc->origin_dev);
2587bad_origin_dev:
2292 kfree(tc); 2588 kfree(tc);
2293out_unlock: 2589out_unlock:
2294 mutex_unlock(&dm_thin_pool_table.mutex); 2590 mutex_unlock(&dm_thin_pool_table.mutex);
@@ -2299,11 +2595,46 @@ out_unlock:
2299static int thin_map(struct dm_target *ti, struct bio *bio, 2595static int thin_map(struct dm_target *ti, struct bio *bio,
2300 union map_info *map_context) 2596 union map_info *map_context)
2301{ 2597{
2302 bio->bi_sector -= ti->begin; 2598 bio->bi_sector = dm_target_offset(ti, bio->bi_sector);
2303 2599
2304 return thin_bio_map(ti, bio, map_context); 2600 return thin_bio_map(ti, bio, map_context);
2305} 2601}
2306 2602
2603static int thin_endio(struct dm_target *ti,
2604 struct bio *bio, int err,
2605 union map_info *map_context)
2606{
2607 unsigned long flags;
2608 struct endio_hook *h = map_context->ptr;
2609 struct list_head work;
2610 struct new_mapping *m, *tmp;
2611 struct pool *pool = h->tc->pool;
2612
2613 if (h->shared_read_entry) {
2614 INIT_LIST_HEAD(&work);
2615 ds_dec(h->shared_read_entry, &work);
2616
2617 spin_lock_irqsave(&pool->lock, flags);
2618 list_for_each_entry_safe(m, tmp, &work, list) {
2619 list_del(&m->list);
2620 m->quiesced = 1;
2621 __maybe_add_mapping(m);
2622 }
2623 spin_unlock_irqrestore(&pool->lock, flags);
2624 }
2625
2626 if (h->all_io_entry) {
2627 INIT_LIST_HEAD(&work);
2628 ds_dec(h->all_io_entry, &work);
2629 list_for_each_entry_safe(m, tmp, &work, list)
2630 list_add(&m->list, &pool->prepared_discards);
2631 }
2632
2633 mempool_free(h, pool->endio_hook_pool);
2634
2635 return 0;
2636}
2637
2307static void thin_postsuspend(struct dm_target *ti) 2638static void thin_postsuspend(struct dm_target *ti)
2308{ 2639{
2309 if (dm_noflush_suspending(ti)) 2640 if (dm_noflush_suspending(ti))
@@ -2347,6 +2678,8 @@ static int thin_status(struct dm_target *ti, status_type_t type,
2347 DMEMIT("%s %lu", 2678 DMEMIT("%s %lu",
2348 format_dev_t(buf, tc->pool_dev->bdev->bd_dev), 2679 format_dev_t(buf, tc->pool_dev->bdev->bd_dev),
2349 (unsigned long) tc->dev_id); 2680 (unsigned long) tc->dev_id);
2681 if (tc->origin_dev)
2682 DMEMIT(" %s", format_dev_t(buf, tc->origin_dev->bdev->bd_dev));
2350 break; 2683 break;
2351 } 2684 }
2352 } 2685 }
@@ -2377,18 +2710,21 @@ static int thin_iterate_devices(struct dm_target *ti,
2377static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits) 2710static void thin_io_hints(struct dm_target *ti, struct queue_limits *limits)
2378{ 2711{
2379 struct thin_c *tc = ti->private; 2712 struct thin_c *tc = ti->private;
2713 struct pool *pool = tc->pool;
2380 2714
2381 blk_limits_io_min(limits, 0); 2715 blk_limits_io_min(limits, 0);
2382 blk_limits_io_opt(limits, tc->pool->sectors_per_block << SECTOR_SHIFT); 2716 blk_limits_io_opt(limits, pool->sectors_per_block << SECTOR_SHIFT);
2717 set_discard_limits(pool, limits);
2383} 2718}
2384 2719
2385static struct target_type thin_target = { 2720static struct target_type thin_target = {
2386 .name = "thin", 2721 .name = "thin",
2387 .version = {1, 0, 0}, 2722 .version = {1, 1, 0},
2388 .module = THIS_MODULE, 2723 .module = THIS_MODULE,
2389 .ctr = thin_ctr, 2724 .ctr = thin_ctr,
2390 .dtr = thin_dtr, 2725 .dtr = thin_dtr,
2391 .map = thin_map, 2726 .map = thin_map,
2727 .end_io = thin_endio,
2392 .postsuspend = thin_postsuspend, 2728 .postsuspend = thin_postsuspend,
2393 .status = thin_status, 2729 .status = thin_status,
2394 .iterate_devices = thin_iterate_devices, 2730 .iterate_devices = thin_iterate_devices,
diff --git a/drivers/md/dm-verity.c b/drivers/md/dm-verity.c
new file mode 100644
index 000000000000..fa365d39b612
--- /dev/null
+++ b/drivers/md/dm-verity.c
@@ -0,0 +1,913 @@
1/*
2 * Copyright (C) 2012 Red Hat, Inc.
3 *
4 * Author: Mikulas Patocka <mpatocka@redhat.com>
5 *
6 * Based on Chromium dm-verity driver (C) 2011 The Chromium OS Authors
7 *
8 * This file is released under the GPLv2.
9 *
10 * In the file "/sys/module/dm_verity/parameters/prefetch_cluster" you can set
11 * default prefetch value. Data are read in "prefetch_cluster" chunks from the
12 * hash device. Setting this greatly improves performance when data and hash
13 * are on the same disk on different partitions on devices with poor random
14 * access behavior.
15 */
16
17#include "dm-bufio.h"
18
19#include <linux/module.h>
20#include <linux/device-mapper.h>
21#include <crypto/hash.h>
22
23#define DM_MSG_PREFIX "verity"
24
25#define DM_VERITY_IO_VEC_INLINE 16
26#define DM_VERITY_MEMPOOL_SIZE 4
27#define DM_VERITY_DEFAULT_PREFETCH_SIZE 262144
28
29#define DM_VERITY_MAX_LEVELS 63
30
31static unsigned dm_verity_prefetch_cluster = DM_VERITY_DEFAULT_PREFETCH_SIZE;
32
33module_param_named(prefetch_cluster, dm_verity_prefetch_cluster, uint, S_IRUGO | S_IWUSR);
34
35struct dm_verity {
36 struct dm_dev *data_dev;
37 struct dm_dev *hash_dev;
38 struct dm_target *ti;
39 struct dm_bufio_client *bufio;
40 char *alg_name;
41 struct crypto_shash *tfm;
42 u8 *root_digest; /* digest of the root block */
43 u8 *salt; /* salt: its size is salt_size */
44 unsigned salt_size;
45 sector_t data_start; /* data offset in 512-byte sectors */
46 sector_t hash_start; /* hash start in blocks */
47 sector_t data_blocks; /* the number of data blocks */
48 sector_t hash_blocks; /* the number of hash blocks */
49 unsigned char data_dev_block_bits; /* log2(data blocksize) */
50 unsigned char hash_dev_block_bits; /* log2(hash blocksize) */
51 unsigned char hash_per_block_bits; /* log2(hashes in hash block) */
52 unsigned char levels; /* the number of tree levels */
53 unsigned char version;
54 unsigned digest_size; /* digest size for the current hash algorithm */
55 unsigned shash_descsize;/* the size of temporary space for crypto */
56 int hash_failed; /* set to 1 if hash of any block failed */
57
58 mempool_t *io_mempool; /* mempool of struct dm_verity_io */
59 mempool_t *vec_mempool; /* mempool of bio vector */
60
61 struct workqueue_struct *verify_wq;
62
63 /* starting blocks for each tree level. 0 is the lowest level. */
64 sector_t hash_level_block[DM_VERITY_MAX_LEVELS];
65};
66
67struct dm_verity_io {
68 struct dm_verity *v;
69 struct bio *bio;
70
71 /* original values of bio->bi_end_io and bio->bi_private */
72 bio_end_io_t *orig_bi_end_io;
73 void *orig_bi_private;
74
75 sector_t block;
76 unsigned n_blocks;
77
78 /* saved bio vector */
79 struct bio_vec *io_vec;
80 unsigned io_vec_size;
81
82 struct work_struct work;
83
84 /* A space for short vectors; longer vectors are allocated separately. */
85 struct bio_vec io_vec_inline[DM_VERITY_IO_VEC_INLINE];
86
87 /*
88 * Three variably-size fields follow this struct:
89 *
90 * u8 hash_desc[v->shash_descsize];
91 * u8 real_digest[v->digest_size];
92 * u8 want_digest[v->digest_size];
93 *
94 * To access them use: io_hash_desc(), io_real_digest() and io_want_digest().
95 */
96};
97
98static struct shash_desc *io_hash_desc(struct dm_verity *v, struct dm_verity_io *io)
99{
100 return (struct shash_desc *)(io + 1);
101}
102
103static u8 *io_real_digest(struct dm_verity *v, struct dm_verity_io *io)
104{
105 return (u8 *)(io + 1) + v->shash_descsize;
106}
107
108static u8 *io_want_digest(struct dm_verity *v, struct dm_verity_io *io)
109{
110 return (u8 *)(io + 1) + v->shash_descsize + v->digest_size;
111}
112
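The accessors above work because struct dm_verity_io is allocated with room for three variably-sized areas right behind it. A hedged userspace sketch of the same trailing-buffer layout, with invented sizes:

#include <stdlib.h>
#include <string.h>

struct io_example {
	int dummy;
	/* followed by: desc[descsize], real[digestsize], want[digestsize] */
};

static unsigned char *io_real(struct io_example *io, size_t descsize)
{
	return (unsigned char *)(io + 1) + descsize;
}

static unsigned char *io_want(struct io_example *io, size_t descsize,
			      size_t digestsize)
{
	return (unsigned char *)(io + 1) + descsize + digestsize;
}

int main(void)
{
	size_t descsize = 64, digestsize = 32;	/* made-up sizes */
	struct io_example *io = malloc(sizeof(*io) + descsize + 2 * digestsize);

	if (!io)
		return 1;
	memset(io_real(io, descsize), 0, digestsize);
	memset(io_want(io, descsize, digestsize), 0, digestsize);
	free(io);
	return 0;
}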
113/*
114 * Auxiliary structure appended to each dm-bufio buffer. If the value
115 * hash_verified is nonzero, hash of the block has been verified.
116 *
117 * The variable hash_verified is set to 0 when allocating the buffer, then
118 * it can be changed to 1 and it is never reset to 0 again.
119 *
 120 * There is no lock around this value; a race can at worst cause multiple
 121 * processes to verify the hash of the same buffer simultaneously and all
 122 * write 1 to hash_verified.
123 * This condition is harmless, so we don't need locking.
124 */
125struct buffer_aux {
126 int hash_verified;
127};
128
129/*
130 * Initialize struct buffer_aux for a freshly created buffer.
131 */
132static void dm_bufio_alloc_callback(struct dm_buffer *buf)
133{
134 struct buffer_aux *aux = dm_bufio_get_aux_data(buf);
135
136 aux->hash_verified = 0;
137}
138
139/*
140 * Translate input sector number to the sector number on the target device.
141 */
142static sector_t verity_map_sector(struct dm_verity *v, sector_t bi_sector)
143{
144 return v->data_start + dm_target_offset(v->ti, bi_sector);
145}
146
147/*
148 * Return hash position of a specified block at a specified tree level
149 * (0 is the lowest level).
150 * The lowest "hash_per_block_bits"-bits of the result denote hash position
151 * inside a hash block. The remaining bits denote location of the hash block.
152 */
153static sector_t verity_position_at_level(struct dm_verity *v, sector_t block,
154 int level)
155{
156 return block >> (level * v->hash_per_block_bits);
157}
158
159static void verity_hash_at_level(struct dm_verity *v, sector_t block, int level,
160 sector_t *hash_block, unsigned *offset)
161{
162 sector_t position = verity_position_at_level(v, block, level);
163 unsigned idx;
164
165 *hash_block = v->hash_level_block[level] + (position >> v->hash_per_block_bits);
166
167 if (!offset)
168 return;
169
170 idx = position & ((1 << v->hash_per_block_bits) - 1);
171 if (!v->version)
172 *offset = idx * v->digest_size;
173 else
174 *offset = idx << (v->hash_dev_block_bits - v->hash_per_block_bits);
175}
176
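To make the addressing concrete, the standalone sketch below reproduces the position/offset math for a hypothetical tree with 4KiB hash blocks and 32-byte digests, i.e. 128 hashes per block and hash_per_block_bits = 7; the hash_level_block starting points are invented and the version-0 offset layout is used:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned hash_per_block_bits = 7;	/* 128 digests per 4KiB block */
	unsigned digest_size = 32;
	uint64_t hash_level_block[2] = { 10, 2 };	/* invented level starts */
	uint64_t block = 1000;				/* data block to verify */
	int level;

	for (level = 0; level < 2; level++) {
		uint64_t position = block >> (level * hash_per_block_bits);
		uint64_t hash_block = hash_level_block[level] +
				      (position >> hash_per_block_bits);
		unsigned idx = position & ((1u << hash_per_block_bits) - 1);
		unsigned offset = idx * digest_size;	/* version-0 layout */

		printf("level %d: hash block %llu, byte offset %u\n",
		       level, (unsigned long long)hash_block, offset);
	}
	return 0;
}

For data block 1000 this prints hash block 17 at offset 3328 for level 0 and hash block 2 at offset 224 for level 1.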
177/*
178 * Verify hash of a metadata block pertaining to the specified data block
179 * ("block" argument) at a specified level ("level" argument).
180 *
181 * On successful return, io_want_digest(v, io) contains the hash value for
 182 * a lower tree level or for the data block (if we're at the lowest level).
183 *
184 * If "skip_unverified" is true, unverified buffer is skipped and 1 is returned.
185 * If "skip_unverified" is false, unverified buffer is hashed and verified
186 * against current value of io_want_digest(v, io).
187 */
188static int verity_verify_level(struct dm_verity_io *io, sector_t block,
189 int level, bool skip_unverified)
190{
191 struct dm_verity *v = io->v;
192 struct dm_buffer *buf;
193 struct buffer_aux *aux;
194 u8 *data;
195 int r;
196 sector_t hash_block;
197 unsigned offset;
198
199 verity_hash_at_level(v, block, level, &hash_block, &offset);
200
201 data = dm_bufio_read(v->bufio, hash_block, &buf);
202 if (unlikely(IS_ERR(data)))
203 return PTR_ERR(data);
204
205 aux = dm_bufio_get_aux_data(buf);
206
207 if (!aux->hash_verified) {
208 struct shash_desc *desc;
209 u8 *result;
210
211 if (skip_unverified) {
212 r = 1;
213 goto release_ret_r;
214 }
215
216 desc = io_hash_desc(v, io);
217 desc->tfm = v->tfm;
218 desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
219 r = crypto_shash_init(desc);
220 if (r < 0) {
221 DMERR("crypto_shash_init failed: %d", r);
222 goto release_ret_r;
223 }
224
225 if (likely(v->version >= 1)) {
226 r = crypto_shash_update(desc, v->salt, v->salt_size);
227 if (r < 0) {
228 DMERR("crypto_shash_update failed: %d", r);
229 goto release_ret_r;
230 }
231 }
232
233 r = crypto_shash_update(desc, data, 1 << v->hash_dev_block_bits);
234 if (r < 0) {
235 DMERR("crypto_shash_update failed: %d", r);
236 goto release_ret_r;
237 }
238
239 if (!v->version) {
240 r = crypto_shash_update(desc, v->salt, v->salt_size);
241 if (r < 0) {
242 DMERR("crypto_shash_update failed: %d", r);
243 goto release_ret_r;
244 }
245 }
246
247 result = io_real_digest(v, io);
248 r = crypto_shash_final(desc, result);
249 if (r < 0) {
250 DMERR("crypto_shash_final failed: %d", r);
251 goto release_ret_r;
252 }
253 if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) {
254 DMERR_LIMIT("metadata block %llu is corrupted",
255 (unsigned long long)hash_block);
256 v->hash_failed = 1;
257 r = -EIO;
258 goto release_ret_r;
259 } else
260 aux->hash_verified = 1;
261 }
262
263 data += offset;
264
265 memcpy(io_want_digest(v, io), data, v->digest_size);
266
267 dm_bufio_release(buf);
268 return 0;
269
270release_ret_r:
271 dm_bufio_release(buf);
272
273 return r;
274}
275
276/*
277 * Verify one "dm_verity_io" structure.
278 */
279static int verity_verify_io(struct dm_verity_io *io)
280{
281 struct dm_verity *v = io->v;
282 unsigned b;
283 int i;
284 unsigned vector = 0, offset = 0;
285
286 for (b = 0; b < io->n_blocks; b++) {
287 struct shash_desc *desc;
288 u8 *result;
289 int r;
290 unsigned todo;
291
292 if (likely(v->levels)) {
293 /*
294 * First, we try to get the requested hash for
295 * the current block. If the hash block itself is
296 * verified, zero is returned. If it isn't, this
297 * function returns 0 and we fall back to whole
298 * chain verification.
299 */
300 int r = verity_verify_level(io, io->block + b, 0, true);
301 if (likely(!r))
302 goto test_block_hash;
303 if (r < 0)
304 return r;
305 }
306
307 memcpy(io_want_digest(v, io), v->root_digest, v->digest_size);
308
309 for (i = v->levels - 1; i >= 0; i--) {
310 int r = verity_verify_level(io, io->block + b, i, false);
311 if (unlikely(r))
312 return r;
313 }
314
315test_block_hash:
316 desc = io_hash_desc(v, io);
317 desc->tfm = v->tfm;
318 desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
319 r = crypto_shash_init(desc);
320 if (r < 0) {
321 DMERR("crypto_shash_init failed: %d", r);
322 return r;
323 }
324
325 if (likely(v->version >= 1)) {
326 r = crypto_shash_update(desc, v->salt, v->salt_size);
327 if (r < 0) {
328 DMERR("crypto_shash_update failed: %d", r);
329 return r;
330 }
331 }
332
333 todo = 1 << v->data_dev_block_bits;
334 do {
335 struct bio_vec *bv;
336 u8 *page;
337 unsigned len;
338
339 BUG_ON(vector >= io->io_vec_size);
340 bv = &io->io_vec[vector];
341 page = kmap_atomic(bv->bv_page);
342 len = bv->bv_len - offset;
343 if (likely(len >= todo))
344 len = todo;
345 r = crypto_shash_update(desc,
346 page + bv->bv_offset + offset, len);
347 kunmap_atomic(page);
348 if (r < 0) {
349 DMERR("crypto_shash_update failed: %d", r);
350 return r;
351 }
352 offset += len;
353 if (likely(offset == bv->bv_len)) {
354 offset = 0;
355 vector++;
356 }
357 todo -= len;
358 } while (todo);
359
360 if (!v->version) {
361 r = crypto_shash_update(desc, v->salt, v->salt_size);
362 if (r < 0) {
363 DMERR("crypto_shash_update failed: %d", r);
364 return r;
365 }
366 }
367
368 result = io_real_digest(v, io);
369 r = crypto_shash_final(desc, result);
370 if (r < 0) {
371 DMERR("crypto_shash_final failed: %d", r);
372 return r;
373 }
374 if (unlikely(memcmp(result, io_want_digest(v, io), v->digest_size))) {
375 DMERR_LIMIT("data block %llu is corrupted",
376 (unsigned long long)(io->block + b));
377 v->hash_failed = 1;
378 return -EIO;
379 }
380 }
381 BUG_ON(vector != io->io_vec_size);
382 BUG_ON(offset);
383
384 return 0;
385}
386
387/*
388 * End one "io" structure with a given error.
389 */
390static void verity_finish_io(struct dm_verity_io *io, int error)
391{
392 struct bio *bio = io->bio;
393 struct dm_verity *v = io->v;
394
395 bio->bi_end_io = io->orig_bi_end_io;
396 bio->bi_private = io->orig_bi_private;
397
398 if (io->io_vec != io->io_vec_inline)
399 mempool_free(io->io_vec, v->vec_mempool);
400
401 mempool_free(io, v->io_mempool);
402
403 bio_endio(bio, error);
404}
405
406static void verity_work(struct work_struct *w)
407{
408 struct dm_verity_io *io = container_of(w, struct dm_verity_io, work);
409
410 verity_finish_io(io, verity_verify_io(io));
411}
412
413static void verity_end_io(struct bio *bio, int error)
414{
415 struct dm_verity_io *io = bio->bi_private;
416
417 if (error) {
418 verity_finish_io(io, error);
419 return;
420 }
421
422 INIT_WORK(&io->work, verity_work);
423 queue_work(io->v->verify_wq, &io->work);
424}
425
426/*
427 * Prefetch buffers for the specified io.
428 * The root buffer is not prefetched, it is assumed that it will be cached
429 * all the time.
430 */
431static void verity_prefetch_io(struct dm_verity *v, struct dm_verity_io *io)
432{
433 int i;
434
435 for (i = v->levels - 2; i >= 0; i--) {
436 sector_t hash_block_start;
437 sector_t hash_block_end;
438 verity_hash_at_level(v, io->block, i, &hash_block_start, NULL);
439 verity_hash_at_level(v, io->block + io->n_blocks - 1, i, &hash_block_end, NULL);
440 if (!i) {
441 unsigned cluster = *(volatile unsigned *)&dm_verity_prefetch_cluster;
442
443 cluster >>= v->data_dev_block_bits;
444 if (unlikely(!cluster))
445 goto no_prefetch_cluster;
446
447 if (unlikely(cluster & (cluster - 1)))
448 cluster = 1 << (fls(cluster) - 1);
449
450 hash_block_start &= ~(sector_t)(cluster - 1);
451 hash_block_end |= cluster - 1;
452 if (unlikely(hash_block_end >= v->hash_blocks))
453 hash_block_end = v->hash_blocks - 1;
454 }
455no_prefetch_cluster:
456 dm_bufio_prefetch(v->bufio, hash_block_start,
457 hash_block_end - hash_block_start + 1);
458 }
459}
460
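The level-0 branch widens the prefetch to a power-of-two cluster before issuing it. A tiny sketch of that alignment with invented numbers:

#include <stdio.h>

int main(void)
{
	unsigned cluster = 48;			/* blocks; not a power of two */
	unsigned long long start = 1000, end = 1003;

	while (cluster & (cluster - 1))
		cluster &= cluster - 1;		/* round down: 48 -> 32 */

	start &= ~(unsigned long long)(cluster - 1);	/* 992 */
	end |= cluster - 1;				/* 1023 */

	printf("prefetch hash blocks %llu..%llu (cluster %u)\n",
	       start, end, cluster);
	return 0;
}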
461/*
462 * Bio map function. It allocates dm_verity_io structure and bio vector and
463 * fills them. Then it issues prefetches and the I/O.
464 */
465static int verity_map(struct dm_target *ti, struct bio *bio,
466 union map_info *map_context)
467{
468 struct dm_verity *v = ti->private;
469 struct dm_verity_io *io;
470
471 bio->bi_bdev = v->data_dev->bdev;
472 bio->bi_sector = verity_map_sector(v, bio->bi_sector);
473
474 if (((unsigned)bio->bi_sector | bio_sectors(bio)) &
475 ((1 << (v->data_dev_block_bits - SECTOR_SHIFT)) - 1)) {
476 DMERR_LIMIT("unaligned io");
477 return -EIO;
478 }
479
480 if ((bio->bi_sector + bio_sectors(bio)) >>
481 (v->data_dev_block_bits - SECTOR_SHIFT) > v->data_blocks) {
482 DMERR_LIMIT("io out of range");
483 return -EIO;
484 }
485
486 if (bio_data_dir(bio) == WRITE)
487 return -EIO;
488
489 io = mempool_alloc(v->io_mempool, GFP_NOIO);
490 io->v = v;
491 io->bio = bio;
492 io->orig_bi_end_io = bio->bi_end_io;
493 io->orig_bi_private = bio->bi_private;
494 io->block = bio->bi_sector >> (v->data_dev_block_bits - SECTOR_SHIFT);
495 io->n_blocks = bio->bi_size >> v->data_dev_block_bits;
496
497 bio->bi_end_io = verity_end_io;
498 bio->bi_private = io;
499 io->io_vec_size = bio->bi_vcnt - bio->bi_idx;
500 if (io->io_vec_size < DM_VERITY_IO_VEC_INLINE)
501 io->io_vec = io->io_vec_inline;
502 else
503 io->io_vec = mempool_alloc(v->vec_mempool, GFP_NOIO);
504 memcpy(io->io_vec, bio_iovec(bio),
505 io->io_vec_size * sizeof(struct bio_vec));
506
507 verity_prefetch_io(v, io);
508
509 generic_make_request(bio);
510
511 return DM_MAPIO_SUBMITTED;
512}
513
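verity_map() converts the bio's sector range into whole data blocks after checking alignment. A minimal sketch of that conversion, assuming 4KiB data blocks (data_dev_block_bits = 12) and the usual 512-byte sectors:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	unsigned data_dev_block_bits = 12, sector_shift = 9;
	uint64_t bi_sector = 2048;	/* bio start, in sectors */
	unsigned bi_size = 32768;	/* bio length, in bytes */
	unsigned bio_sectors = bi_size >> sector_shift;

	/* Reject I/O that is not aligned to the data block size. */
	if ((bi_sector | bio_sectors) &
	    ((1u << (data_dev_block_bits - sector_shift)) - 1)) {
		fprintf(stderr, "unaligned io\n");
		return 1;
	}

	printf("first block %llu, %u blocks\n",
	       (unsigned long long)(bi_sector >> (data_dev_block_bits - sector_shift)),
	       bi_size >> data_dev_block_bits);	/* 256, 8 */
	return 0;
}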
514/*
515 * Status: V (valid) or C (corruption found)
516 */
517static int verity_status(struct dm_target *ti, status_type_t type,
518 char *result, unsigned maxlen)
519{
520 struct dm_verity *v = ti->private;
521 unsigned sz = 0;
522 unsigned x;
523
524 switch (type) {
525 case STATUSTYPE_INFO:
526 DMEMIT("%c", v->hash_failed ? 'C' : 'V');
527 break;
528 case STATUSTYPE_TABLE:
529 DMEMIT("%u %s %s %u %u %llu %llu %s ",
530 v->version,
531 v->data_dev->name,
532 v->hash_dev->name,
533 1 << v->data_dev_block_bits,
534 1 << v->hash_dev_block_bits,
535 (unsigned long long)v->data_blocks,
536 (unsigned long long)v->hash_start,
537 v->alg_name
538 );
539 for (x = 0; x < v->digest_size; x++)
540 DMEMIT("%02x", v->root_digest[x]);
541 DMEMIT(" ");
542 if (!v->salt_size)
543 DMEMIT("-");
544 else
545 for (x = 0; x < v->salt_size; x++)
546 DMEMIT("%02x", v->salt[x]);
547 break;
548 }
549
550 return 0;
551}
552
553static int verity_ioctl(struct dm_target *ti, unsigned cmd,
554 unsigned long arg)
555{
556 struct dm_verity *v = ti->private;
557 int r = 0;
558
559 if (v->data_start ||
560 ti->len != i_size_read(v->data_dev->bdev->bd_inode) >> SECTOR_SHIFT)
561 r = scsi_verify_blk_ioctl(NULL, cmd);
562
563 return r ? : __blkdev_driver_ioctl(v->data_dev->bdev, v->data_dev->mode,
564 cmd, arg);
565}
566
567static int verity_merge(struct dm_target *ti, struct bvec_merge_data *bvm,
568 struct bio_vec *biovec, int max_size)
569{
570 struct dm_verity *v = ti->private;
571 struct request_queue *q = bdev_get_queue(v->data_dev->bdev);
572
573 if (!q->merge_bvec_fn)
574 return max_size;
575
576 bvm->bi_bdev = v->data_dev->bdev;
577 bvm->bi_sector = verity_map_sector(v, bvm->bi_sector);
578
579 return min(max_size, q->merge_bvec_fn(q, bvm, biovec));
580}
581
582static int verity_iterate_devices(struct dm_target *ti,
583 iterate_devices_callout_fn fn, void *data)
584{
585 struct dm_verity *v = ti->private;
586
587 return fn(ti, v->data_dev, v->data_start, ti->len, data);
588}
589
590static void verity_io_hints(struct dm_target *ti, struct queue_limits *limits)
591{
592 struct dm_verity *v = ti->private;
593
594 if (limits->logical_block_size < 1 << v->data_dev_block_bits)
595 limits->logical_block_size = 1 << v->data_dev_block_bits;
596
597 if (limits->physical_block_size < 1 << v->data_dev_block_bits)
598 limits->physical_block_size = 1 << v->data_dev_block_bits;
599
600 blk_limits_io_min(limits, limits->logical_block_size);
601}
602
603static void verity_dtr(struct dm_target *ti)
604{
605 struct dm_verity *v = ti->private;
606
607 if (v->verify_wq)
608 destroy_workqueue(v->verify_wq);
609
610 if (v->vec_mempool)
611 mempool_destroy(v->vec_mempool);
612
613 if (v->io_mempool)
614 mempool_destroy(v->io_mempool);
615
616 if (v->bufio)
617 dm_bufio_client_destroy(v->bufio);
618
619 kfree(v->salt);
620 kfree(v->root_digest);
621
622 if (v->tfm)
623 crypto_free_shash(v->tfm);
624
625 kfree(v->alg_name);
626
627 if (v->hash_dev)
628 dm_put_device(ti, v->hash_dev);
629
630 if (v->data_dev)
631 dm_put_device(ti, v->data_dev);
632
633 kfree(v);
634}
635
636/*
637 * Target parameters:
638 * <version> The current format is version 1.
639 * Vsn 0 is compatible with original Chromium OS releases.
640 * <data device>
641 * <hash device>
642 * <data block size>
643 * <hash block size>
644 * <the number of data blocks>
645 * <hash start block>
646 * <algorithm>
647 * <digest>
648 * <salt> Hex string or "-" if no salt.
649 */
650static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
651{
652 struct dm_verity *v;
653 unsigned num;
654 unsigned long long num_ll;
655 int r;
656 int i;
657 sector_t hash_position;
658 char dummy;
659
660 v = kzalloc(sizeof(struct dm_verity), GFP_KERNEL);
661 if (!v) {
662 ti->error = "Cannot allocate verity structure";
663 return -ENOMEM;
664 }
665 ti->private = v;
666 v->ti = ti;
667
668 if ((dm_table_get_mode(ti->table) & ~FMODE_READ)) {
669 ti->error = "Device must be readonly";
670 r = -EINVAL;
671 goto bad;
672 }
673
674 if (argc != 10) {
675 ti->error = "Invalid argument count: exactly 10 arguments required";
676 r = -EINVAL;
677 goto bad;
678 }
679
680 if (sscanf(argv[0], "%d%c", &num, &dummy) != 1 ||
681 num < 0 || num > 1) {
682 ti->error = "Invalid version";
683 r = -EINVAL;
684 goto bad;
685 }
686 v->version = num;
687
688 r = dm_get_device(ti, argv[1], FMODE_READ, &v->data_dev);
689 if (r) {
690 ti->error = "Data device lookup failed";
691 goto bad;
692 }
693
694 r = dm_get_device(ti, argv[2], FMODE_READ, &v->hash_dev);
695 if (r) {
 696 ti->error = "Hash device lookup failed";
697 goto bad;
698 }
699
700 if (sscanf(argv[3], "%u%c", &num, &dummy) != 1 ||
701 !num || (num & (num - 1)) ||
702 num < bdev_logical_block_size(v->data_dev->bdev) ||
703 num > PAGE_SIZE) {
704 ti->error = "Invalid data device block size";
705 r = -EINVAL;
706 goto bad;
707 }
708 v->data_dev_block_bits = ffs(num) - 1;
709
710 if (sscanf(argv[4], "%u%c", &num, &dummy) != 1 ||
711 !num || (num & (num - 1)) ||
712 num < bdev_logical_block_size(v->hash_dev->bdev) ||
713 num > INT_MAX) {
714 ti->error = "Invalid hash device block size";
715 r = -EINVAL;
716 goto bad;
717 }
718 v->hash_dev_block_bits = ffs(num) - 1;
719
720 if (sscanf(argv[5], "%llu%c", &num_ll, &dummy) != 1 ||
721 num_ll << (v->data_dev_block_bits - SECTOR_SHIFT) !=
722 (sector_t)num_ll << (v->data_dev_block_bits - SECTOR_SHIFT)) {
723 ti->error = "Invalid data blocks";
724 r = -EINVAL;
725 goto bad;
726 }
727 v->data_blocks = num_ll;
728
729 if (ti->len > (v->data_blocks << (v->data_dev_block_bits - SECTOR_SHIFT))) {
730 ti->error = "Data device is too small";
731 r = -EINVAL;
732 goto bad;
733 }
734
735 if (sscanf(argv[6], "%llu%c", &num_ll, &dummy) != 1 ||
736 num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT) !=
737 (sector_t)num_ll << (v->hash_dev_block_bits - SECTOR_SHIFT)) {
738 ti->error = "Invalid hash start";
739 r = -EINVAL;
740 goto bad;
741 }
742 v->hash_start = num_ll;
743
744 v->alg_name = kstrdup(argv[7], GFP_KERNEL);
745 if (!v->alg_name) {
746 ti->error = "Cannot allocate algorithm name";
747 r = -ENOMEM;
748 goto bad;
749 }
750
751 v->tfm = crypto_alloc_shash(v->alg_name, 0, 0);
752 if (IS_ERR(v->tfm)) {
753 ti->error = "Cannot initialize hash function";
754 r = PTR_ERR(v->tfm);
755 v->tfm = NULL;
756 goto bad;
757 }
758 v->digest_size = crypto_shash_digestsize(v->tfm);
759 if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) {
760 ti->error = "Digest size too big";
761 r = -EINVAL;
762 goto bad;
763 }
764 v->shash_descsize =
765 sizeof(struct shash_desc) + crypto_shash_descsize(v->tfm);
766
767 v->root_digest = kmalloc(v->digest_size, GFP_KERNEL);
768 if (!v->root_digest) {
769 ti->error = "Cannot allocate root digest";
770 r = -ENOMEM;
771 goto bad;
772 }
773 if (strlen(argv[8]) != v->digest_size * 2 ||
774 hex2bin(v->root_digest, argv[8], v->digest_size)) {
775 ti->error = "Invalid root digest";
776 r = -EINVAL;
777 goto bad;
778 }
779
780 if (strcmp(argv[9], "-")) {
781 v->salt_size = strlen(argv[9]) / 2;
782 v->salt = kmalloc(v->salt_size, GFP_KERNEL);
783 if (!v->salt) {
784 ti->error = "Cannot allocate salt";
785 r = -ENOMEM;
786 goto bad;
787 }
788 if (strlen(argv[9]) != v->salt_size * 2 ||
789 hex2bin(v->salt, argv[9], v->salt_size)) {
790 ti->error = "Invalid salt";
791 r = -EINVAL;
792 goto bad;
793 }
794 }
795
796 v->hash_per_block_bits =
797 fls((1 << v->hash_dev_block_bits) / v->digest_size) - 1;
798
799 v->levels = 0;
800 if (v->data_blocks)
801 while (v->hash_per_block_bits * v->levels < 64 &&
802 (unsigned long long)(v->data_blocks - 1) >>
803 (v->hash_per_block_bits * v->levels))
804 v->levels++;
805
806 if (v->levels > DM_VERITY_MAX_LEVELS) {
807 ti->error = "Too many tree levels";
808 r = -E2BIG;
809 goto bad;
810 }
811
812 hash_position = v->hash_start;
813 for (i = v->levels - 1; i >= 0; i--) {
814 sector_t s;
815 v->hash_level_block[i] = hash_position;
816 s = verity_position_at_level(v, v->data_blocks, i);
817 s = (s >> v->hash_per_block_bits) +
818 !!(s & ((1 << v->hash_per_block_bits) - 1));
819 if (hash_position + s < hash_position) {
820 ti->error = "Hash device offset overflow";
821 r = -E2BIG;
822 goto bad;
823 }
824 hash_position += s;
825 }
826 v->hash_blocks = hash_position;
827
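A worked example of the sizing loop above, under assumed parameters (one million data blocks, 128 hashes per hash block, hash area starting at block 1); it sketches the same arithmetic rather than reusing the driver's structures:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t data_blocks = 1000000;		/* invented */
	unsigned hash_per_block_bits = 7;	/* 128 digests per hash block */
	uint64_t hash_position = 1;		/* assumed hash_start */
	unsigned levels = 0;
	int i;

	while ((data_blocks - 1) >> (hash_per_block_bits * levels))
		levels++;			/* 3 levels for one million blocks */

	for (i = levels - 1; i >= 0; i--) {
		uint64_t s = data_blocks >> (i * hash_per_block_bits);

		/* Round up to whole hash blocks, as the constructor does. */
		s = (s >> hash_per_block_bits) +
		    ((s & ((1u << hash_per_block_bits) - 1)) != 0);
		printf("level %d starts at hash block %llu and needs %llu blocks\n",
		       i, (unsigned long long)hash_position, (unsigned long long)s);
		hash_position += s;
	}
	printf("hash_blocks = %llu\n", (unsigned long long)hash_position);
	return 0;
}

With these numbers the levels take 1, 62 and 7813 hash blocks, giving hash_blocks = 7877.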
828 v->bufio = dm_bufio_client_create(v->hash_dev->bdev,
829 1 << v->hash_dev_block_bits, 1, sizeof(struct buffer_aux),
830 dm_bufio_alloc_callback, NULL);
831 if (IS_ERR(v->bufio)) {
832 ti->error = "Cannot initialize dm-bufio";
833 r = PTR_ERR(v->bufio);
834 v->bufio = NULL;
835 goto bad;
836 }
837
838 if (dm_bufio_get_device_size(v->bufio) < v->hash_blocks) {
839 ti->error = "Hash device is too small";
840 r = -E2BIG;
841 goto bad;
842 }
843
844 v->io_mempool = mempool_create_kmalloc_pool(DM_VERITY_MEMPOOL_SIZE,
845 sizeof(struct dm_verity_io) + v->shash_descsize + v->digest_size * 2);
846 if (!v->io_mempool) {
847 ti->error = "Cannot allocate io mempool";
848 r = -ENOMEM;
849 goto bad;
850 }
851
852 v->vec_mempool = mempool_create_kmalloc_pool(DM_VERITY_MEMPOOL_SIZE,
853 BIO_MAX_PAGES * sizeof(struct bio_vec));
854 if (!v->vec_mempool) {
855 ti->error = "Cannot allocate vector mempool";
856 r = -ENOMEM;
857 goto bad;
858 }
859
860 /* WQ_UNBOUND greatly improves performance when running on ramdisk */
861 v->verify_wq = alloc_workqueue("kverityd", WQ_CPU_INTENSIVE | WQ_MEM_RECLAIM | WQ_UNBOUND, num_online_cpus());
862 if (!v->verify_wq) {
863 ti->error = "Cannot allocate workqueue";
864 r = -ENOMEM;
865 goto bad;
866 }
867
868 return 0;
869
870bad:
871 verity_dtr(ti);
872
873 return r;
874}
875
876static struct target_type verity_target = {
877 .name = "verity",
878 .version = {1, 0, 0},
879 .module = THIS_MODULE,
880 .ctr = verity_ctr,
881 .dtr = verity_dtr,
882 .map = verity_map,
883 .status = verity_status,
884 .ioctl = verity_ioctl,
885 .merge = verity_merge,
886 .iterate_devices = verity_iterate_devices,
887 .io_hints = verity_io_hints,
888};
889
890static int __init dm_verity_init(void)
891{
892 int r;
893
894 r = dm_register_target(&verity_target);
895 if (r < 0)
896 DMERR("register failed %d", r);
897
898 return r;
899}
900
901static void __exit dm_verity_exit(void)
902{
903 dm_unregister_target(&verity_target);
904}
905
906module_init(dm_verity_init);
907module_exit(dm_verity_exit);
908
909MODULE_AUTHOR("Mikulas Patocka <mpatocka@redhat.com>");
910MODULE_AUTHOR("Mandeep Baines <msb@chromium.org>");
911MODULE_AUTHOR("Will Drewry <wad@chromium.org>");
912MODULE_DESCRIPTION(DM_NAME " target for transparent disk integrity checking");
913MODULE_LICENSE("GPL");
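For concreteness, here is a minimal user-space sketch of the hash-tree geometry that verity_ctr() computes above: it adds levels until one block of hashes covers the whole data range, then lays the levels out root-first starting at hash_start. The sketch assumes a 4096-byte hash block and a 32-byte digest (128 hashes per block, so hash_per_block_bits = 7) and that verity_position_at_level() shifts the block number by level * hash_per_block_bits, as the constructor loop implies; the numbers and the standalone framing are illustrative, not driver code.

/* Standalone sketch, not driver code: reproduce the level count and the
 * per-level starting blocks computed by verity_ctr(). */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t data_blocks = 1000000;   /* example data device size in blocks */
	unsigned hash_per_block_bits = 7; /* fls(4096 / 32) - 1 */
	uint64_t hash_position = 0;       /* the driver starts at v->hash_start */
	unsigned levels = 0;
	int i;

	/* Same loop as the constructor (which also guards data_blocks != 0):
	 * add levels until the top level covers block data_blocks - 1. */
	while (hash_per_block_bits * levels < 64 &&
	       ((data_blocks - 1) >> (hash_per_block_bits * levels)))
		levels++;
	printf("levels = %u\n", levels);

	/* Levels are laid out root-first; level i needs
	 * ceil(data_blocks / 128^(i+1)) hash blocks. */
	for (i = (int)levels - 1; i >= 0; i--) {
		uint64_t at_level = data_blocks >> (hash_per_block_bits * i);
		uint64_t blocks = (at_level >> hash_per_block_bits) +
			!!(at_level & ((1u << hash_per_block_bits) - 1));

		printf("level %d: starts at hash block %llu, %llu blocks\n",
		       i, (unsigned long long)hash_position,
		       (unsigned long long)blocks);
		hash_position += blocks;
	}
	printf("hash blocks total = %llu\n", (unsigned long long)hash_position);
	return 0;
}

For 1,000,000 data blocks this gives three levels of 1, 62 and 7813 hash blocks, 7876 in total, which is the value checked against the hash device size below.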
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index b89c548ec3f8..e24143cc2040 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -1016,6 +1016,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
1016 /* 1016 /*
1017 * Store bio_set for cleanup. 1017 * Store bio_set for cleanup.
1018 */ 1018 */
1019 clone->bi_end_io = NULL;
1019 clone->bi_private = md->bs; 1020 clone->bi_private = md->bs;
1020 bio_put(clone); 1021 bio_put(clone);
1021 free_tio(md, tio); 1022 free_tio(md, tio);
diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c
index feb2c3c7bb44..45135f69509c 100644
--- a/drivers/md/faulty.c
+++ b/drivers/md/faulty.c
@@ -315,7 +315,7 @@ static int run(struct mddev *mddev)
315 } 315 }
316 conf->nfaults = 0; 316 conf->nfaults = 0;
317 317
318 list_for_each_entry(rdev, &mddev->disks, same_set) 318 rdev_for_each(rdev, mddev)
319 conf->rdev = rdev; 319 conf->rdev = rdev;
320 320
321 md_set_array_sectors(mddev, faulty_size(mddev, 0, 0)); 321 md_set_array_sectors(mddev, faulty_size(mddev, 0, 0));
diff --git a/drivers/md/linear.c b/drivers/md/linear.c
index 627456542fb3..b0fcc7d02adb 100644
--- a/drivers/md/linear.c
+++ b/drivers/md/linear.c
@@ -68,10 +68,19 @@ static int linear_mergeable_bvec(struct request_queue *q,
68 struct dev_info *dev0; 68 struct dev_info *dev0;
69 unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9; 69 unsigned long maxsectors, bio_sectors = bvm->bi_size >> 9;
70 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 70 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
71 int maxbytes = biovec->bv_len;
72 struct request_queue *subq;
71 73
72 rcu_read_lock(); 74 rcu_read_lock();
73 dev0 = which_dev(mddev, sector); 75 dev0 = which_dev(mddev, sector);
74 maxsectors = dev0->end_sector - sector; 76 maxsectors = dev0->end_sector - sector;
77 subq = bdev_get_queue(dev0->rdev->bdev);
78 if (subq->merge_bvec_fn) {
79 bvm->bi_bdev = dev0->rdev->bdev;
80 bvm->bi_sector -= dev0->end_sector - dev0->rdev->sectors;
81 maxbytes = min(maxbytes, subq->merge_bvec_fn(subq, bvm,
82 biovec));
83 }
75 rcu_read_unlock(); 84 rcu_read_unlock();
76 85
77 if (maxsectors < bio_sectors) 86 if (maxsectors < bio_sectors)
@@ -80,12 +89,12 @@ static int linear_mergeable_bvec(struct request_queue *q,
80 maxsectors -= bio_sectors; 89 maxsectors -= bio_sectors;
81 90
82 if (maxsectors <= (PAGE_SIZE >> 9 ) && bio_sectors == 0) 91 if (maxsectors <= (PAGE_SIZE >> 9 ) && bio_sectors == 0)
83 return biovec->bv_len; 92 return maxbytes;
84 /* The bytes available at this offset could be really big, 93
85 * so we cap at 2^31 to avoid overflow */ 94 if (maxsectors > (maxbytes >> 9))
86 if (maxsectors > (1 << (31-9))) 95 return maxbytes;
87 return 1<<31; 96 else
88 return maxsectors << 9; 97 return maxsectors << 9;
89} 98}
90 99
91static int linear_congested(void *data, int bits) 100static int linear_congested(void *data, int bits)
@@ -138,7 +147,7 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
138 cnt = 0; 147 cnt = 0;
139 conf->array_sectors = 0; 148 conf->array_sectors = 0;
140 149
141 list_for_each_entry(rdev, &mddev->disks, same_set) { 150 rdev_for_each(rdev, mddev) {
142 int j = rdev->raid_disk; 151 int j = rdev->raid_disk;
143 struct dev_info *disk = conf->disks + j; 152 struct dev_info *disk = conf->disks + j;
144 sector_t sectors; 153 sector_t sectors;
@@ -158,15 +167,6 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks)
158 167
159 disk_stack_limits(mddev->gendisk, rdev->bdev, 168 disk_stack_limits(mddev->gendisk, rdev->bdev,
160 rdev->data_offset << 9); 169 rdev->data_offset << 9);
161 /* as we don't honour merge_bvec_fn, we must never risk
162 * violating it, so limit max_segments to 1 lying within
163 * a single page.
164 */
165 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
166 blk_queue_max_segments(mddev->queue, 1);
167 blk_queue_segment_boundary(mddev->queue,
168 PAGE_CACHE_SIZE - 1);
169 }
170 170
171 conf->array_sectors += rdev->sectors; 171 conf->array_sectors += rdev->sectors;
172 cnt++; 172 cnt++;
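The net effect of the linear.c hunks above is that linear_mergeable_bvec() now translates the proposed bio into the member device's coordinates and defers to that device's own merge_bvec_fn, instead of pessimistically limiting the whole array to single-page segments. A standalone sketch of just the coordinate translation, with made-up numbers (end_sector and dev_sectors stand in for dev0->end_sector and dev0->rdev->sectors from the code above; the values are illustrative only):

/* Standalone sketch, not driver code: the translation performed before
 * calling the member queue's merge_bvec_fn.  Assume the member ends at
 * array sector 2000 and is 1000 sectors long, so it starts at 1000. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t end_sector  = 2000;   /* dev0->end_sector            */
	uint64_t dev_sectors = 1000;   /* dev0->rdev->sectors         */
	uint64_t bi_sector   = 1500;   /* array sector of the new bio */

	/* bvm->bi_sector -= dev0->end_sector - dev0->rdev->sectors; */
	uint64_t member_sector = bi_sector - (end_sector - dev_sectors);

	printf("array sector %llu -> member sector %llu\n",
	       (unsigned long long)bi_sector,
	       (unsigned long long)member_sector);   /* 1500 -> 500 */
	return 0;
}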
diff --git a/drivers/md/md.c b/drivers/md/md.c
index ce88755baf4a..b572e1e386ce 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -439,7 +439,7 @@ static void submit_flushes(struct work_struct *ws)
439 INIT_WORK(&mddev->flush_work, md_submit_flush_data); 439 INIT_WORK(&mddev->flush_work, md_submit_flush_data);
440 atomic_set(&mddev->flush_pending, 1); 440 atomic_set(&mddev->flush_pending, 1);
441 rcu_read_lock(); 441 rcu_read_lock();
442 list_for_each_entry_rcu(rdev, &mddev->disks, same_set) 442 rdev_for_each_rcu(rdev, mddev)
443 if (rdev->raid_disk >= 0 && 443 if (rdev->raid_disk >= 0 &&
444 !test_bit(Faulty, &rdev->flags)) { 444 !test_bit(Faulty, &rdev->flags)) {
445 /* Take two references, one is dropped 445 /* Take two references, one is dropped
@@ -749,7 +749,7 @@ static struct md_rdev * find_rdev_nr(struct mddev *mddev, int nr)
749{ 749{
750 struct md_rdev *rdev; 750 struct md_rdev *rdev;
751 751
752 list_for_each_entry(rdev, &mddev->disks, same_set) 752 rdev_for_each(rdev, mddev)
753 if (rdev->desc_nr == nr) 753 if (rdev->desc_nr == nr)
754 return rdev; 754 return rdev;
755 755
@@ -760,7 +760,7 @@ static struct md_rdev * find_rdev(struct mddev * mddev, dev_t dev)
760{ 760{
761 struct md_rdev *rdev; 761 struct md_rdev *rdev;
762 762
763 list_for_each_entry(rdev, &mddev->disks, same_set) 763 rdev_for_each(rdev, mddev)
764 if (rdev->bdev->bd_dev == dev) 764 if (rdev->bdev->bd_dev == dev)
765 return rdev; 765 return rdev;
766 766
@@ -1342,7 +1342,7 @@ static void super_90_sync(struct mddev *mddev, struct md_rdev *rdev)
1342 sb->state |= (1<<MD_SB_BITMAP_PRESENT); 1342 sb->state |= (1<<MD_SB_BITMAP_PRESENT);
1343 1343
1344 sb->disks[0].state = (1<<MD_DISK_REMOVED); 1344 sb->disks[0].state = (1<<MD_DISK_REMOVED);
1345 list_for_each_entry(rdev2, &mddev->disks, same_set) { 1345 rdev_for_each(rdev2, mddev) {
1346 mdp_disk_t *d; 1346 mdp_disk_t *d;
1347 int desc_nr; 1347 int desc_nr;
1348 int is_active = test_bit(In_sync, &rdev2->flags); 1348 int is_active = test_bit(In_sync, &rdev2->flags);
@@ -1805,18 +1805,18 @@ retry:
1805 | BB_LEN(internal_bb)); 1805 | BB_LEN(internal_bb));
1806 *bbp++ = cpu_to_le64(store_bb); 1806 *bbp++ = cpu_to_le64(store_bb);
1807 } 1807 }
1808 bb->changed = 0;
1808 if (read_seqretry(&bb->lock, seq)) 1809 if (read_seqretry(&bb->lock, seq))
1809 goto retry; 1810 goto retry;
1810 1811
1811 bb->sector = (rdev->sb_start + 1812 bb->sector = (rdev->sb_start +
1812 (int)le32_to_cpu(sb->bblog_offset)); 1813 (int)le32_to_cpu(sb->bblog_offset));
1813 bb->size = le16_to_cpu(sb->bblog_size); 1814 bb->size = le16_to_cpu(sb->bblog_size);
1814 bb->changed = 0;
1815 } 1815 }
1816 } 1816 }
1817 1817
1818 max_dev = 0; 1818 max_dev = 0;
1819 list_for_each_entry(rdev2, &mddev->disks, same_set) 1819 rdev_for_each(rdev2, mddev)
1820 if (rdev2->desc_nr+1 > max_dev) 1820 if (rdev2->desc_nr+1 > max_dev)
1821 max_dev = rdev2->desc_nr+1; 1821 max_dev = rdev2->desc_nr+1;
1822 1822
@@ -1833,7 +1833,7 @@ retry:
1833 for (i=0; i<max_dev;i++) 1833 for (i=0; i<max_dev;i++)
1834 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1834 sb->dev_roles[i] = cpu_to_le16(0xfffe);
1835 1835
1836 list_for_each_entry(rdev2, &mddev->disks, same_set) { 1836 rdev_for_each(rdev2, mddev) {
1837 i = rdev2->desc_nr; 1837 i = rdev2->desc_nr;
1838 if (test_bit(Faulty, &rdev2->flags)) 1838 if (test_bit(Faulty, &rdev2->flags))
1839 sb->dev_roles[i] = cpu_to_le16(0xfffe); 1839 sb->dev_roles[i] = cpu_to_le16(0xfffe);
@@ -1948,7 +1948,7 @@ int md_integrity_register(struct mddev *mddev)
1948 return 0; /* nothing to do */ 1948 return 0; /* nothing to do */
1949 if (!mddev->gendisk || blk_get_integrity(mddev->gendisk)) 1949 if (!mddev->gendisk || blk_get_integrity(mddev->gendisk))
1950 return 0; /* shouldn't register, or already is */ 1950 return 0; /* shouldn't register, or already is */
1951 list_for_each_entry(rdev, &mddev->disks, same_set) { 1951 rdev_for_each(rdev, mddev) {
1952 /* skip spares and non-functional disks */ 1952 /* skip spares and non-functional disks */
1953 if (test_bit(Faulty, &rdev->flags)) 1953 if (test_bit(Faulty, &rdev->flags))
1954 continue; 1954 continue;
@@ -2175,7 +2175,7 @@ static void export_array(struct mddev *mddev)
2175{ 2175{
2176 struct md_rdev *rdev, *tmp; 2176 struct md_rdev *rdev, *tmp;
2177 2177
2178 rdev_for_each(rdev, tmp, mddev) { 2178 rdev_for_each_safe(rdev, tmp, mddev) {
2179 if (!rdev->mddev) { 2179 if (!rdev->mddev) {
2180 MD_BUG(); 2180 MD_BUG();
2181 continue; 2181 continue;
@@ -2307,11 +2307,11 @@ static void md_print_devices(void)
2307 bitmap_print_sb(mddev->bitmap); 2307 bitmap_print_sb(mddev->bitmap);
2308 else 2308 else
2309 printk("%s: ", mdname(mddev)); 2309 printk("%s: ", mdname(mddev));
2310 list_for_each_entry(rdev, &mddev->disks, same_set) 2310 rdev_for_each(rdev, mddev)
2311 printk("<%s>", bdevname(rdev->bdev,b)); 2311 printk("<%s>", bdevname(rdev->bdev,b));
2312 printk("\n"); 2312 printk("\n");
2313 2313
2314 list_for_each_entry(rdev, &mddev->disks, same_set) 2314 rdev_for_each(rdev, mddev)
2315 print_rdev(rdev, mddev->major_version); 2315 print_rdev(rdev, mddev->major_version);
2316 } 2316 }
2317 printk("md: **********************************\n"); 2317 printk("md: **********************************\n");
@@ -2328,7 +2328,7 @@ static void sync_sbs(struct mddev * mddev, int nospares)
2328 * with the rest of the array) 2328 * with the rest of the array)
2329 */ 2329 */
2330 struct md_rdev *rdev; 2330 struct md_rdev *rdev;
2331 list_for_each_entry(rdev, &mddev->disks, same_set) { 2331 rdev_for_each(rdev, mddev) {
2332 if (rdev->sb_events == mddev->events || 2332 if (rdev->sb_events == mddev->events ||
2333 (nospares && 2333 (nospares &&
2334 rdev->raid_disk < 0 && 2334 rdev->raid_disk < 0 &&
@@ -2351,7 +2351,7 @@ static void md_update_sb(struct mddev * mddev, int force_change)
2351 2351
2352repeat: 2352repeat:
2353 /* First make sure individual recovery_offsets are correct */ 2353 /* First make sure individual recovery_offsets are correct */
2354 list_for_each_entry(rdev, &mddev->disks, same_set) { 2354 rdev_for_each(rdev, mddev) {
2355 if (rdev->raid_disk >= 0 && 2355 if (rdev->raid_disk >= 0 &&
2356 mddev->delta_disks >= 0 && 2356 mddev->delta_disks >= 0 &&
2357 !test_bit(In_sync, &rdev->flags) && 2357 !test_bit(In_sync, &rdev->flags) &&
@@ -2364,8 +2364,9 @@ repeat:
2364 clear_bit(MD_CHANGE_DEVS, &mddev->flags); 2364 clear_bit(MD_CHANGE_DEVS, &mddev->flags);
2365 if (!mddev->external) { 2365 if (!mddev->external) {
2366 clear_bit(MD_CHANGE_PENDING, &mddev->flags); 2366 clear_bit(MD_CHANGE_PENDING, &mddev->flags);
2367 list_for_each_entry(rdev, &mddev->disks, same_set) { 2367 rdev_for_each(rdev, mddev) {
2368 if (rdev->badblocks.changed) { 2368 if (rdev->badblocks.changed) {
2369 rdev->badblocks.changed = 0;
2369 md_ack_all_badblocks(&rdev->badblocks); 2370 md_ack_all_badblocks(&rdev->badblocks);
2370 md_error(mddev, rdev); 2371 md_error(mddev, rdev);
2371 } 2372 }
@@ -2430,7 +2431,7 @@ repeat:
2430 mddev->events --; 2431 mddev->events --;
2431 } 2432 }
2432 2433
2433 list_for_each_entry(rdev, &mddev->disks, same_set) { 2434 rdev_for_each(rdev, mddev) {
2434 if (rdev->badblocks.changed) 2435 if (rdev->badblocks.changed)
2435 any_badblocks_changed++; 2436 any_badblocks_changed++;
2436 if (test_bit(Faulty, &rdev->flags)) 2437 if (test_bit(Faulty, &rdev->flags))
@@ -2444,7 +2445,7 @@ repeat:
2444 mdname(mddev), mddev->in_sync); 2445 mdname(mddev), mddev->in_sync);
2445 2446
2446 bitmap_update_sb(mddev->bitmap); 2447 bitmap_update_sb(mddev->bitmap);
2447 list_for_each_entry(rdev, &mddev->disks, same_set) { 2448 rdev_for_each(rdev, mddev) {
2448 char b[BDEVNAME_SIZE]; 2449 char b[BDEVNAME_SIZE];
2449 2450
2450 if (rdev->sb_loaded != 1) 2451 if (rdev->sb_loaded != 1)
@@ -2493,7 +2494,7 @@ repeat:
2493 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery)) 2494 if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
2494 sysfs_notify(&mddev->kobj, NULL, "sync_completed"); 2495 sysfs_notify(&mddev->kobj, NULL, "sync_completed");
2495 2496
2496 list_for_each_entry(rdev, &mddev->disks, same_set) { 2497 rdev_for_each(rdev, mddev) {
2497 if (test_and_clear_bit(FaultRecorded, &rdev->flags)) 2498 if (test_and_clear_bit(FaultRecorded, &rdev->flags))
2498 clear_bit(Blocked, &rdev->flags); 2499 clear_bit(Blocked, &rdev->flags);
2499 2500
@@ -2896,7 +2897,7 @@ rdev_size_store(struct md_rdev *rdev, const char *buf, size_t len)
2896 struct md_rdev *rdev2; 2897 struct md_rdev *rdev2;
2897 2898
2898 mddev_lock(mddev); 2899 mddev_lock(mddev);
2899 list_for_each_entry(rdev2, &mddev->disks, same_set) 2900 rdev_for_each(rdev2, mddev)
2900 if (rdev->bdev == rdev2->bdev && 2901 if (rdev->bdev == rdev2->bdev &&
2901 rdev != rdev2 && 2902 rdev != rdev2 &&
2902 overlaps(rdev->data_offset, rdev->sectors, 2903 overlaps(rdev->data_offset, rdev->sectors,
@@ -3193,7 +3194,7 @@ static void analyze_sbs(struct mddev * mddev)
3193 char b[BDEVNAME_SIZE]; 3194 char b[BDEVNAME_SIZE];
3194 3195
3195 freshest = NULL; 3196 freshest = NULL;
3196 rdev_for_each(rdev, tmp, mddev) 3197 rdev_for_each_safe(rdev, tmp, mddev)
3197 switch (super_types[mddev->major_version]. 3198 switch (super_types[mddev->major_version].
3198 load_super(rdev, freshest, mddev->minor_version)) { 3199 load_super(rdev, freshest, mddev->minor_version)) {
3199 case 1: 3200 case 1:
@@ -3214,7 +3215,7 @@ static void analyze_sbs(struct mddev * mddev)
3214 validate_super(mddev, freshest); 3215 validate_super(mddev, freshest);
3215 3216
3216 i = 0; 3217 i = 0;
3217 rdev_for_each(rdev, tmp, mddev) { 3218 rdev_for_each_safe(rdev, tmp, mddev) {
3218 if (mddev->max_disks && 3219 if (mddev->max_disks &&
3219 (rdev->desc_nr >= mddev->max_disks || 3220 (rdev->desc_nr >= mddev->max_disks ||
3220 i > mddev->max_disks)) { 3221 i > mddev->max_disks)) {
@@ -3403,7 +3404,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
3403 return -EINVAL; 3404 return -EINVAL;
3404 } 3405 }
3405 3406
3406 list_for_each_entry(rdev, &mddev->disks, same_set) 3407 rdev_for_each(rdev, mddev)
3407 rdev->new_raid_disk = rdev->raid_disk; 3408 rdev->new_raid_disk = rdev->raid_disk;
3408 3409
3409 /* ->takeover must set new_* and/or delta_disks 3410 /* ->takeover must set new_* and/or delta_disks
@@ -3456,7 +3457,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
3456 mddev->safemode = 0; 3457 mddev->safemode = 0;
3457 } 3458 }
3458 3459
3459 list_for_each_entry(rdev, &mddev->disks, same_set) { 3460 rdev_for_each(rdev, mddev) {
3460 if (rdev->raid_disk < 0) 3461 if (rdev->raid_disk < 0)
3461 continue; 3462 continue;
3462 if (rdev->new_raid_disk >= mddev->raid_disks) 3463 if (rdev->new_raid_disk >= mddev->raid_disks)
@@ -3465,7 +3466,7 @@ level_store(struct mddev *mddev, const char *buf, size_t len)
3465 continue; 3466 continue;
3466 sysfs_unlink_rdev(mddev, rdev); 3467 sysfs_unlink_rdev(mddev, rdev);
3467 } 3468 }
3468 list_for_each_entry(rdev, &mddev->disks, same_set) { 3469 rdev_for_each(rdev, mddev) {
3469 if (rdev->raid_disk < 0) 3470 if (rdev->raid_disk < 0)
3470 continue; 3471 continue;
3471 if (rdev->new_raid_disk == rdev->raid_disk) 3472 if (rdev->new_raid_disk == rdev->raid_disk)
@@ -4796,7 +4797,7 @@ int md_run(struct mddev *mddev)
4796 * the only valid external interface is through the md 4797 * the only valid external interface is through the md
4797 * device. 4798 * device.
4798 */ 4799 */
4799 list_for_each_entry(rdev, &mddev->disks, same_set) { 4800 rdev_for_each(rdev, mddev) {
4800 if (test_bit(Faulty, &rdev->flags)) 4801 if (test_bit(Faulty, &rdev->flags))
4801 continue; 4802 continue;
4802 sync_blockdev(rdev->bdev); 4803 sync_blockdev(rdev->bdev);
@@ -4867,8 +4868,8 @@ int md_run(struct mddev *mddev)
4867 struct md_rdev *rdev2; 4868 struct md_rdev *rdev2;
4868 int warned = 0; 4869 int warned = 0;
4869 4870
4870 list_for_each_entry(rdev, &mddev->disks, same_set) 4871 rdev_for_each(rdev, mddev)
4871 list_for_each_entry(rdev2, &mddev->disks, same_set) { 4872 rdev_for_each(rdev2, mddev) {
4872 if (rdev < rdev2 && 4873 if (rdev < rdev2 &&
4873 rdev->bdev->bd_contains == 4874 rdev->bdev->bd_contains ==
4874 rdev2->bdev->bd_contains) { 4875 rdev2->bdev->bd_contains) {
@@ -4945,7 +4946,7 @@ int md_run(struct mddev *mddev)
4945 mddev->in_sync = 1; 4946 mddev->in_sync = 1;
4946 smp_wmb(); 4947 smp_wmb();
4947 mddev->ready = 1; 4948 mddev->ready = 1;
4948 list_for_each_entry(rdev, &mddev->disks, same_set) 4949 rdev_for_each(rdev, mddev)
4949 if (rdev->raid_disk >= 0) 4950 if (rdev->raid_disk >= 0)
4950 if (sysfs_link_rdev(mddev, rdev)) 4951 if (sysfs_link_rdev(mddev, rdev))
4951 /* failure here is OK */; 4952 /* failure here is OK */;
@@ -5073,6 +5074,7 @@ static void md_clean(struct mddev *mddev)
5073 mddev->changed = 0; 5074 mddev->changed = 0;
5074 mddev->degraded = 0; 5075 mddev->degraded = 0;
5075 mddev->safemode = 0; 5076 mddev->safemode = 0;
5077 mddev->merge_check_needed = 0;
5076 mddev->bitmap_info.offset = 0; 5078 mddev->bitmap_info.offset = 0;
5077 mddev->bitmap_info.default_offset = 0; 5079 mddev->bitmap_info.default_offset = 0;
5078 mddev->bitmap_info.chunksize = 0; 5080 mddev->bitmap_info.chunksize = 0;
@@ -5175,7 +5177,7 @@ static int do_md_stop(struct mddev * mddev, int mode, int is_open)
5175 /* tell userspace to handle 'inactive' */ 5177 /* tell userspace to handle 'inactive' */
5176 sysfs_notify_dirent_safe(mddev->sysfs_state); 5178 sysfs_notify_dirent_safe(mddev->sysfs_state);
5177 5179
5178 list_for_each_entry(rdev, &mddev->disks, same_set) 5180 rdev_for_each(rdev, mddev)
5179 if (rdev->raid_disk >= 0) 5181 if (rdev->raid_disk >= 0)
5180 sysfs_unlink_rdev(mddev, rdev); 5182 sysfs_unlink_rdev(mddev, rdev);
5181 5183
@@ -5226,7 +5228,7 @@ static void autorun_array(struct mddev *mddev)
5226 5228
5227 printk(KERN_INFO "md: running: "); 5229 printk(KERN_INFO "md: running: ");
5228 5230
5229 list_for_each_entry(rdev, &mddev->disks, same_set) { 5231 rdev_for_each(rdev, mddev) {
5230 char b[BDEVNAME_SIZE]; 5232 char b[BDEVNAME_SIZE];
5231 printk("<%s>", bdevname(rdev->bdev,b)); 5233 printk("<%s>", bdevname(rdev->bdev,b));
5232 } 5234 }
@@ -5356,7 +5358,7 @@ static int get_array_info(struct mddev * mddev, void __user * arg)
5356 struct md_rdev *rdev; 5358 struct md_rdev *rdev;
5357 5359
5358 nr=working=insync=failed=spare=0; 5360 nr=working=insync=failed=spare=0;
5359 list_for_each_entry(rdev, &mddev->disks, same_set) { 5361 rdev_for_each(rdev, mddev) {
5360 nr++; 5362 nr++;
5361 if (test_bit(Faulty, &rdev->flags)) 5363 if (test_bit(Faulty, &rdev->flags))
5362 failed++; 5364 failed++;
@@ -5923,7 +5925,7 @@ static int update_size(struct mddev *mddev, sector_t num_sectors)
5923 * grow, and re-add. 5925 * grow, and re-add.
5924 */ 5926 */
5925 return -EBUSY; 5927 return -EBUSY;
5926 list_for_each_entry(rdev, &mddev->disks, same_set) { 5928 rdev_for_each(rdev, mddev) {
5927 sector_t avail = rdev->sectors; 5929 sector_t avail = rdev->sectors;
5928 5930
5929 if (fit && (num_sectors == 0 || num_sectors > avail)) 5931 if (fit && (num_sectors == 0 || num_sectors > avail))
@@ -6724,7 +6726,6 @@ static int md_seq_show(struct seq_file *seq, void *v)
6724 struct mddev *mddev = v; 6726 struct mddev *mddev = v;
6725 sector_t sectors; 6727 sector_t sectors;
6726 struct md_rdev *rdev; 6728 struct md_rdev *rdev;
6727 struct bitmap *bitmap;
6728 6729
6729 if (v == (void*)1) { 6730 if (v == (void*)1) {
6730 struct md_personality *pers; 6731 struct md_personality *pers;
@@ -6758,7 +6759,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
6758 } 6759 }
6759 6760
6760 sectors = 0; 6761 sectors = 0;
6761 list_for_each_entry(rdev, &mddev->disks, same_set) { 6762 rdev_for_each(rdev, mddev) {
6762 char b[BDEVNAME_SIZE]; 6763 char b[BDEVNAME_SIZE];
6763 seq_printf(seq, " %s[%d]", 6764 seq_printf(seq, " %s[%d]",
6764 bdevname(rdev->bdev,b), rdev->desc_nr); 6765 bdevname(rdev->bdev,b), rdev->desc_nr);
@@ -6812,27 +6813,7 @@ static int md_seq_show(struct seq_file *seq, void *v)
6812 } else 6813 } else
6813 seq_printf(seq, "\n "); 6814 seq_printf(seq, "\n ");
6814 6815
6815 if ((bitmap = mddev->bitmap)) { 6816 bitmap_status(seq, mddev->bitmap);
6816 unsigned long chunk_kb;
6817 unsigned long flags;
6818 spin_lock_irqsave(&bitmap->lock, flags);
6819 chunk_kb = mddev->bitmap_info.chunksize >> 10;
6820 seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], "
6821 "%lu%s chunk",
6822 bitmap->pages - bitmap->missing_pages,
6823 bitmap->pages,
6824 (bitmap->pages - bitmap->missing_pages)
6825 << (PAGE_SHIFT - 10),
6826 chunk_kb ? chunk_kb : mddev->bitmap_info.chunksize,
6827 chunk_kb ? "KB" : "B");
6828 if (bitmap->file) {
6829 seq_printf(seq, ", file: ");
6830 seq_path(seq, &bitmap->file->f_path, " \t\n");
6831 }
6832
6833 seq_printf(seq, "\n");
6834 spin_unlock_irqrestore(&bitmap->lock, flags);
6835 }
6836 6817
6837 seq_printf(seq, "\n"); 6818 seq_printf(seq, "\n");
6838 } 6819 }
@@ -7170,7 +7151,7 @@ void md_do_sync(struct mddev *mddev)
7170 max_sectors = mddev->dev_sectors; 7151 max_sectors = mddev->dev_sectors;
7171 j = MaxSector; 7152 j = MaxSector;
7172 rcu_read_lock(); 7153 rcu_read_lock();
7173 list_for_each_entry_rcu(rdev, &mddev->disks, same_set) 7154 rdev_for_each_rcu(rdev, mddev)
7174 if (rdev->raid_disk >= 0 && 7155 if (rdev->raid_disk >= 0 &&
7175 !test_bit(Faulty, &rdev->flags) && 7156 !test_bit(Faulty, &rdev->flags) &&
7176 !test_bit(In_sync, &rdev->flags) && 7157 !test_bit(In_sync, &rdev->flags) &&
@@ -7342,7 +7323,7 @@ void md_do_sync(struct mddev *mddev)
7342 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) 7323 if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery))
7343 mddev->curr_resync = MaxSector; 7324 mddev->curr_resync = MaxSector;
7344 rcu_read_lock(); 7325 rcu_read_lock();
7345 list_for_each_entry_rcu(rdev, &mddev->disks, same_set) 7326 rdev_for_each_rcu(rdev, mddev)
7346 if (rdev->raid_disk >= 0 && 7327 if (rdev->raid_disk >= 0 &&
7347 mddev->delta_disks >= 0 && 7328 mddev->delta_disks >= 0 &&
7348 !test_bit(Faulty, &rdev->flags) && 7329 !test_bit(Faulty, &rdev->flags) &&
@@ -7388,7 +7369,7 @@ static int remove_and_add_spares(struct mddev *mddev)
7388 7369
7389 mddev->curr_resync_completed = 0; 7370 mddev->curr_resync_completed = 0;
7390 7371
7391 list_for_each_entry(rdev, &mddev->disks, same_set) 7372 rdev_for_each(rdev, mddev)
7392 if (rdev->raid_disk >= 0 && 7373 if (rdev->raid_disk >= 0 &&
7393 !test_bit(Blocked, &rdev->flags) && 7374 !test_bit(Blocked, &rdev->flags) &&
7394 (test_bit(Faulty, &rdev->flags) || 7375 (test_bit(Faulty, &rdev->flags) ||
@@ -7406,7 +7387,7 @@ static int remove_and_add_spares(struct mddev *mddev)
7406 "degraded"); 7387 "degraded");
7407 7388
7408 7389
7409 list_for_each_entry(rdev, &mddev->disks, same_set) { 7390 rdev_for_each(rdev, mddev) {
7410 if (rdev->raid_disk >= 0 && 7391 if (rdev->raid_disk >= 0 &&
7411 !test_bit(In_sync, &rdev->flags) && 7392 !test_bit(In_sync, &rdev->flags) &&
7412 !test_bit(Faulty, &rdev->flags)) 7393 !test_bit(Faulty, &rdev->flags))
@@ -7451,7 +7432,7 @@ static void reap_sync_thread(struct mddev *mddev)
7451 * do the superblock for an incrementally recovered device 7432 * do the superblock for an incrementally recovered device
7452 * written out. 7433 * written out.
7453 */ 7434 */
7454 list_for_each_entry(rdev, &mddev->disks, same_set) 7435 rdev_for_each(rdev, mddev)
7455 if (!mddev->degraded || 7436 if (!mddev->degraded ||
7456 test_bit(In_sync, &rdev->flags)) 7437 test_bit(In_sync, &rdev->flags))
7457 rdev->saved_raid_disk = -1; 7438 rdev->saved_raid_disk = -1;
@@ -7529,7 +7510,7 @@ void md_check_recovery(struct mddev *mddev)
7529 * failed devices. 7510 * failed devices.
7530 */ 7511 */
7531 struct md_rdev *rdev; 7512 struct md_rdev *rdev;
7532 list_for_each_entry(rdev, &mddev->disks, same_set) 7513 rdev_for_each(rdev, mddev)
7533 if (rdev->raid_disk >= 0 && 7514 if (rdev->raid_disk >= 0 &&
7534 !test_bit(Blocked, &rdev->flags) && 7515 !test_bit(Blocked, &rdev->flags) &&
7535 test_bit(Faulty, &rdev->flags) && 7516 test_bit(Faulty, &rdev->flags) &&
@@ -8040,7 +8021,7 @@ void md_ack_all_badblocks(struct badblocks *bb)
8040 return; 8021 return;
8041 write_seqlock_irq(&bb->lock); 8022 write_seqlock_irq(&bb->lock);
8042 8023
8043 if (bb->changed == 0) { 8024 if (bb->changed == 0 && bb->unacked_exist) {
8044 u64 *p = bb->page; 8025 u64 *p = bb->page;
8045 int i; 8026 int i;
8046 for (i = 0; i < bb->count ; i++) { 8027 for (i = 0; i < bb->count ; i++) {
@@ -8157,30 +8138,23 @@ static int md_notify_reboot(struct notifier_block *this,
8157 struct mddev *mddev; 8138 struct mddev *mddev;
8158 int need_delay = 0; 8139 int need_delay = 0;
8159 8140
8160 if ((code == SYS_DOWN) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) { 8141 for_each_mddev(mddev, tmp) {
8161 8142 if (mddev_trylock(mddev)) {
8162 printk(KERN_INFO "md: stopping all md devices.\n"); 8143 __md_stop_writes(mddev);
8163 8144 mddev->safemode = 2;
8164 for_each_mddev(mddev, tmp) { 8145 mddev_unlock(mddev);
8165 if (mddev_trylock(mddev)) {
8166 /* Force a switch to readonly even array
8167 * appears to still be in use. Hence
8168 * the '100'.
8169 */
8170 md_set_readonly(mddev, 100);
8171 mddev_unlock(mddev);
8172 }
8173 need_delay = 1;
8174 } 8146 }
8175 /* 8147 need_delay = 1;
8176 * certain more exotic SCSI devices are known to be
8177 * volatile wrt too early system reboots. While the
8178 * right place to handle this issue is the given
8179 * driver, we do want to have a safe RAID driver ...
8180 */
8181 if (need_delay)
8182 mdelay(1000*1);
8183 } 8148 }
8149 /*
8150 * certain more exotic SCSI devices are known to be
8151 * volatile wrt too early system reboots. While the
8152 * right place to handle this issue is the given
8153 * driver, we do want to have a safe RAID driver ...
8154 */
8155 if (need_delay)
8156 mdelay(1000*1);
8157
8184 return NOTIFY_DONE; 8158 return NOTIFY_DONE;
8185} 8159}
8186 8160
diff --git a/drivers/md/md.h b/drivers/md/md.h
index 44c63dfeeb2b..1c2063ccf48e 100644
--- a/drivers/md/md.h
+++ b/drivers/md/md.h
@@ -128,6 +128,10 @@ struct md_rdev {
128enum flag_bits { 128enum flag_bits {
129 Faulty, /* device is known to have a fault */ 129 Faulty, /* device is known to have a fault */
130 In_sync, /* device is in_sync with rest of array */ 130 In_sync, /* device is in_sync with rest of array */
131 Unmerged, /* device is being added to array and should
132 * be considerred for bvec_merge_fn but not
133 * yet for actual IO
134 */
131 WriteMostly, /* Avoid reading if at all possible */ 135 WriteMostly, /* Avoid reading if at all possible */
132 AutoDetected, /* added by auto-detect */ 136 AutoDetected, /* added by auto-detect */
133 Blocked, /* An error occurred but has not yet 137 Blocked, /* An error occurred but has not yet
@@ -345,6 +349,10 @@ struct mddev {
345 int degraded; /* whether md should consider 349 int degraded; /* whether md should consider
346 * adding a spare 350 * adding a spare
347 */ 351 */
352 int merge_check_needed; /* at least one
353 * member device
354 * has a
355 * merge_bvec_fn */
348 356
349 atomic_t recovery_active; /* blocks scheduled, but not written */ 357 atomic_t recovery_active; /* blocks scheduled, but not written */
350 wait_queue_head_t recovery_wait; 358 wait_queue_head_t recovery_wait;
@@ -519,7 +527,10 @@ static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev)
519/* 527/*
520 * iterates through the 'same array disks' ringlist 528 * iterates through the 'same array disks' ringlist
521 */ 529 */
522#define rdev_for_each(rdev, tmp, mddev) \ 530#define rdev_for_each(rdev, mddev) \
531 list_for_each_entry(rdev, &((mddev)->disks), same_set)
532
533#define rdev_for_each_safe(rdev, tmp, mddev) \
523 list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set) 534 list_for_each_entry_safe(rdev, tmp, &((mddev)->disks), same_set)
524 535
525#define rdev_for_each_rcu(rdev, mddev) \ 536#define rdev_for_each_rcu(rdev, mddev) \
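Most of the churn in md.c above is the mechanical conversion enabled by this macro change: the old three-argument rdev_for_each() was really a safe iteration and becomes rdev_for_each_safe(), while plain walks drop the scratch variable. A kernel-context fragment, not a standalone program (do_something(), maybe_remove() and my_mddev are placeholders), showing what a converted caller looks like:

	struct md_rdev *rdev, *tmp;

	/* old style: open-coded walk, or rdev_for_each(rdev, tmp, mddev) */
	list_for_each_entry(rdev, &my_mddev->disks, same_set)
		do_something(rdev);

	/* new style: plain iteration needs no scratch variable ... */
	rdev_for_each(rdev, my_mddev)
		do_something(rdev);

	/* ... and callers that may unlink rdevs while walking say so explicitly */
	rdev_for_each_safe(rdev, tmp, my_mddev)
		maybe_remove(rdev);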
diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c
index a222f516660e..9339e67fcc79 100644
--- a/drivers/md/multipath.c
+++ b/drivers/md/multipath.c
@@ -428,7 +428,7 @@ static int multipath_run (struct mddev *mddev)
428 } 428 }
429 429
430 working_disks = 0; 430 working_disks = 0;
431 list_for_each_entry(rdev, &mddev->disks, same_set) { 431 rdev_for_each(rdev, mddev) {
432 disk_idx = rdev->raid_disk; 432 disk_idx = rdev->raid_disk;
433 if (disk_idx < 0 || 433 if (disk_idx < 0 ||
434 disk_idx >= mddev->raid_disks) 434 disk_idx >= mddev->raid_disks)
diff --git a/drivers/md/persistent-data/dm-btree-internal.h b/drivers/md/persistent-data/dm-btree-internal.h
index d279c768f8f1..5709bfeab1e8 100644
--- a/drivers/md/persistent-data/dm-btree-internal.h
+++ b/drivers/md/persistent-data/dm-btree-internal.h
@@ -108,12 +108,9 @@ static inline void *value_base(struct node *n)
108 return &n->keys[le32_to_cpu(n->header.max_entries)]; 108 return &n->keys[le32_to_cpu(n->header.max_entries)];
109} 109}
110 110
111/* 111static inline void *value_ptr(struct node *n, uint32_t index)
112 * FIXME: Now that value size is stored in node we don't need the third parm.
113 */
114static inline void *value_ptr(struct node *n, uint32_t index, size_t value_size)
115{ 112{
116 BUG_ON(value_size != le32_to_cpu(n->header.value_size)); 113 uint32_t value_size = le32_to_cpu(n->header.value_size);
117 return value_base(n) + (value_size * index); 114 return value_base(n) + (value_size * index);
118} 115}
119 116
diff --git a/drivers/md/persistent-data/dm-btree-remove.c b/drivers/md/persistent-data/dm-btree-remove.c
index 023fbc2d389e..aa71e2359a07 100644
--- a/drivers/md/persistent-data/dm-btree-remove.c
+++ b/drivers/md/persistent-data/dm-btree-remove.c
@@ -61,20 +61,20 @@ static void node_shift(struct node *n, int shift)
61 if (shift < 0) { 61 if (shift < 0) {
62 shift = -shift; 62 shift = -shift;
63 BUG_ON(shift > nr_entries); 63 BUG_ON(shift > nr_entries);
64 BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift, value_size)); 64 BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift));
65 memmove(key_ptr(n, 0), 65 memmove(key_ptr(n, 0),
66 key_ptr(n, shift), 66 key_ptr(n, shift),
67 (nr_entries - shift) * sizeof(__le64)); 67 (nr_entries - shift) * sizeof(__le64));
68 memmove(value_ptr(n, 0, value_size), 68 memmove(value_ptr(n, 0),
69 value_ptr(n, shift, value_size), 69 value_ptr(n, shift),
70 (nr_entries - shift) * value_size); 70 (nr_entries - shift) * value_size);
71 } else { 71 } else {
72 BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries)); 72 BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries));
73 memmove(key_ptr(n, shift), 73 memmove(key_ptr(n, shift),
74 key_ptr(n, 0), 74 key_ptr(n, 0),
75 nr_entries * sizeof(__le64)); 75 nr_entries * sizeof(__le64));
76 memmove(value_ptr(n, shift, value_size), 76 memmove(value_ptr(n, shift),
77 value_ptr(n, 0, value_size), 77 value_ptr(n, 0),
78 nr_entries * value_size); 78 nr_entries * value_size);
79 } 79 }
80} 80}
@@ -91,16 +91,16 @@ static void node_copy(struct node *left, struct node *right, int shift)
91 memcpy(key_ptr(left, nr_left), 91 memcpy(key_ptr(left, nr_left),
92 key_ptr(right, 0), 92 key_ptr(right, 0),
93 shift * sizeof(__le64)); 93 shift * sizeof(__le64));
94 memcpy(value_ptr(left, nr_left, value_size), 94 memcpy(value_ptr(left, nr_left),
95 value_ptr(right, 0, value_size), 95 value_ptr(right, 0),
96 shift * value_size); 96 shift * value_size);
97 } else { 97 } else {
98 BUG_ON(shift > le32_to_cpu(right->header.max_entries)); 98 BUG_ON(shift > le32_to_cpu(right->header.max_entries));
99 memcpy(key_ptr(right, 0), 99 memcpy(key_ptr(right, 0),
100 key_ptr(left, nr_left - shift), 100 key_ptr(left, nr_left - shift),
101 shift * sizeof(__le64)); 101 shift * sizeof(__le64));
102 memcpy(value_ptr(right, 0, value_size), 102 memcpy(value_ptr(right, 0),
103 value_ptr(left, nr_left - shift, value_size), 103 value_ptr(left, nr_left - shift),
104 shift * value_size); 104 shift * value_size);
105 } 105 }
106} 106}
@@ -120,26 +120,17 @@ static void delete_at(struct node *n, unsigned index)
120 key_ptr(n, index + 1), 120 key_ptr(n, index + 1),
121 nr_to_copy * sizeof(__le64)); 121 nr_to_copy * sizeof(__le64));
122 122
123 memmove(value_ptr(n, index, value_size), 123 memmove(value_ptr(n, index),
124 value_ptr(n, index + 1, value_size), 124 value_ptr(n, index + 1),
125 nr_to_copy * value_size); 125 nr_to_copy * value_size);
126 } 126 }
127 127
128 n->header.nr_entries = cpu_to_le32(nr_entries - 1); 128 n->header.nr_entries = cpu_to_le32(nr_entries - 1);
129} 129}
130 130
131static unsigned del_threshold(struct node *n)
132{
133 return le32_to_cpu(n->header.max_entries) / 3;
134}
135
136static unsigned merge_threshold(struct node *n) 131static unsigned merge_threshold(struct node *n)
137{ 132{
138 /* 133 return le32_to_cpu(n->header.max_entries) / 3;
139 * The extra one is because we know we're potentially going to
140 * delete an entry.
141 */
142 return 2 * (le32_to_cpu(n->header.max_entries) / 3) + 1;
143} 134}
144 135
145struct child { 136struct child {
@@ -175,7 +166,7 @@ static int init_child(struct dm_btree_info *info, struct node *parent,
175 if (inc) 166 if (inc)
176 inc_children(info->tm, result->n, &le64_type); 167 inc_children(info->tm, result->n, &le64_type);
177 168
178 *((__le64 *) value_ptr(parent, index, sizeof(__le64))) = 169 *((__le64 *) value_ptr(parent, index)) =
179 cpu_to_le64(dm_block_location(result->block)); 170 cpu_to_le64(dm_block_location(result->block));
180 171
181 return 0; 172 return 0;
@@ -188,6 +179,15 @@ static int exit_child(struct dm_btree_info *info, struct child *c)
188 179
189static void shift(struct node *left, struct node *right, int count) 180static void shift(struct node *left, struct node *right, int count)
190{ 181{
182 uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
183 uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
184 uint32_t max_entries = le32_to_cpu(left->header.max_entries);
185 uint32_t r_max_entries = le32_to_cpu(right->header.max_entries);
186
187 BUG_ON(max_entries != r_max_entries);
188 BUG_ON(nr_left - count > max_entries);
189 BUG_ON(nr_right + count > max_entries);
190
191 if (!count) 191 if (!count)
192 return; 192 return;
193 193
@@ -199,13 +199,8 @@ static void shift(struct node *left, struct node *right, int count)
199 node_shift(right, count); 199 node_shift(right, count);
200 } 200 }
201 201
202 left->header.nr_entries = 202 left->header.nr_entries = cpu_to_le32(nr_left - count);
203 cpu_to_le32(le32_to_cpu(left->header.nr_entries) - count); 203 right->header.nr_entries = cpu_to_le32(nr_right + count);
204 BUG_ON(le32_to_cpu(left->header.nr_entries) > le32_to_cpu(left->header.max_entries));
205
206 right->header.nr_entries =
207 cpu_to_le32(le32_to_cpu(right->header.nr_entries) + count);
208 BUG_ON(le32_to_cpu(right->header.nr_entries) > le32_to_cpu(right->header.max_entries));
209} 204}
210 205
211static void __rebalance2(struct dm_btree_info *info, struct node *parent, 206static void __rebalance2(struct dm_btree_info *info, struct node *parent,
@@ -215,8 +210,9 @@ static void __rebalance2(struct dm_btree_info *info, struct node *parent,
215 struct node *right = r->n; 210 struct node *right = r->n;
216 uint32_t nr_left = le32_to_cpu(left->header.nr_entries); 211 uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
217 uint32_t nr_right = le32_to_cpu(right->header.nr_entries); 212 uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
213 unsigned threshold = 2 * merge_threshold(left) + 1;
218 214
219 if (nr_left + nr_right <= merge_threshold(left)) { 215 if (nr_left + nr_right < threshold) {
220 /* 216 /*
221 * Merge 217 * Merge
222 */ 218 */
@@ -234,9 +230,6 @@ static void __rebalance2(struct dm_btree_info *info, struct node *parent,
234 * Rebalance. 230 * Rebalance.
235 */ 231 */
236 unsigned target_left = (nr_left + nr_right) / 2; 232 unsigned target_left = (nr_left + nr_right) / 2;
237 unsigned shift_ = nr_left - target_left;
238 BUG_ON(le32_to_cpu(left->header.max_entries) <= nr_left - shift_);
239 BUG_ON(le32_to_cpu(right->header.max_entries) <= nr_right + shift_);
240 shift(left, right, nr_left - target_left); 233 shift(left, right, nr_left - target_left);
241 *key_ptr(parent, r->index) = right->keys[0]; 234 *key_ptr(parent, r->index) = right->keys[0];
242 } 235 }
@@ -272,6 +265,84 @@ static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
272 return exit_child(info, &right); 265 return exit_child(info, &right);
273} 266}
274 267
268/*
269 * We dump as many entries from center as possible into left, then the rest
270 * in right, then rebalance2. This wastes some cpu, but I want something
271 * simple atm.
272 */
273static void delete_center_node(struct dm_btree_info *info, struct node *parent,
274 struct child *l, struct child *c, struct child *r,
275 struct node *left, struct node *center, struct node *right,
276 uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
277{
278 uint32_t max_entries = le32_to_cpu(left->header.max_entries);
279 unsigned shift = min(max_entries - nr_left, nr_center);
280
281 BUG_ON(nr_left + shift > max_entries);
282 node_copy(left, center, -shift);
283 left->header.nr_entries = cpu_to_le32(nr_left + shift);
284
285 if (shift != nr_center) {
286 shift = nr_center - shift;
287 BUG_ON((nr_right + shift) > max_entries);
288 node_shift(right, shift);
289 node_copy(center, right, shift);
290 right->header.nr_entries = cpu_to_le32(nr_right + shift);
291 }
292 *key_ptr(parent, r->index) = right->keys[0];
293
294 delete_at(parent, c->index);
295 r->index--;
296
297 dm_tm_dec(info->tm, dm_block_location(c->block));
298 __rebalance2(info, parent, l, r);
299}
300
301/*
302 * Redistributes entries among 3 sibling nodes.
303 */
304static void redistribute3(struct dm_btree_info *info, struct node *parent,
305 struct child *l, struct child *c, struct child *r,
306 struct node *left, struct node *center, struct node *right,
307 uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
308{
309 int s;
310 uint32_t max_entries = le32_to_cpu(left->header.max_entries);
311 unsigned target = (nr_left + nr_center + nr_right) / 3;
312 BUG_ON(target > max_entries);
313
314 if (nr_left < nr_right) {
315 s = nr_left - target;
316
317 if (s < 0 && nr_center < -s) {
318 /* not enough in central node */
319 shift(left, center, nr_center);
320 s = nr_center - target;
321 shift(left, right, s);
322 nr_right += s;
323 } else
324 shift(left, center, s);
325
326 shift(center, right, target - nr_right);
327
328 } else {
329 s = target - nr_right;
330 if (s > 0 && nr_center < s) {
331 /* not enough in central node */
332 shift(center, right, nr_center);
333 s = target - nr_center;
334 shift(left, right, s);
335 nr_left -= s;
336 } else
337 shift(center, right, s);
338
339 shift(left, center, nr_left - target);
340 }
341
342 *key_ptr(parent, c->index) = center->keys[0];
343 *key_ptr(parent, r->index) = right->keys[0];
344}
345
275static void __rebalance3(struct dm_btree_info *info, struct node *parent, 346static void __rebalance3(struct dm_btree_info *info, struct node *parent,
276 struct child *l, struct child *c, struct child *r) 347 struct child *l, struct child *c, struct child *r)
277{ 348{
@@ -282,62 +353,18 @@ static void __rebalance3(struct dm_btree_info *info, struct node *parent,
282 uint32_t nr_left = le32_to_cpu(left->header.nr_entries); 353 uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
283 uint32_t nr_center = le32_to_cpu(center->header.nr_entries); 354 uint32_t nr_center = le32_to_cpu(center->header.nr_entries);
284 uint32_t nr_right = le32_to_cpu(right->header.nr_entries); 355 uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
285 uint32_t max_entries = le32_to_cpu(left->header.max_entries);
286 356
287 unsigned target; 357 unsigned threshold = merge_threshold(left) * 4 + 1;
288 358
289 BUG_ON(left->header.max_entries != center->header.max_entries); 359 BUG_ON(left->header.max_entries != center->header.max_entries);
290 BUG_ON(center->header.max_entries != right->header.max_entries); 360 BUG_ON(center->header.max_entries != right->header.max_entries);
291 361
292 if (((nr_left + nr_center + nr_right) / 2) < merge_threshold(center)) { 362 if ((nr_left + nr_center + nr_right) < threshold)
293 /* 363 delete_center_node(info, parent, l, c, r, left, center, right,
294 * Delete center node: 364 nr_left, nr_center, nr_right);
295 * 365 else
296 * We dump as many entries from center as possible into 366 redistribute3(info, parent, l, c, r, left, center, right,
297 * left, then the rest in right, then rebalance2. This 367 nr_left, nr_center, nr_right);
298 * wastes some cpu, but I want something simple atm.
299 */
300 unsigned shift = min(max_entries - nr_left, nr_center);
301
302 BUG_ON(nr_left + shift > max_entries);
303 node_copy(left, center, -shift);
304 left->header.nr_entries = cpu_to_le32(nr_left + shift);
305
306 if (shift != nr_center) {
307 shift = nr_center - shift;
308 BUG_ON((nr_right + shift) >= max_entries);
309 node_shift(right, shift);
310 node_copy(center, right, shift);
311 right->header.nr_entries = cpu_to_le32(nr_right + shift);
312 }
313 *key_ptr(parent, r->index) = right->keys[0];
314
315 delete_at(parent, c->index);
316 r->index--;
317
318 dm_tm_dec(info->tm, dm_block_location(c->block));
319 __rebalance2(info, parent, l, r);
320
321 return;
322 }
323
324 /*
325 * Rebalance
326 */
327 target = (nr_left + nr_center + nr_right) / 3;
328 BUG_ON(target > max_entries);
329
330 /*
331 * Adjust the left node
332 */
333 shift(left, center, nr_left - target);
334
335 /*
336 * Adjust the right node
337 */
338 shift(center, right, target - nr_right);
339 *key_ptr(parent, c->index) = center->keys[0];
340 *key_ptr(parent, r->index) = right->keys[0];
341} 368}
342 369
343static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info, 370static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
@@ -441,9 +468,6 @@ static int rebalance_children(struct shadow_spine *s,
441 if (r) 468 if (r)
442 return r; 469 return r;
443 470
444 if (child_entries > del_threshold(n))
445 return 0;
446
447 has_left_sibling = i > 0; 471 has_left_sibling = i > 0;
448 has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1); 472 has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1);
449 473
@@ -496,7 +520,7 @@ static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
496 */ 520 */
497 if (shadow_has_parent(s)) { 521 if (shadow_has_parent(s)) {
498 __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); 522 __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
499 memcpy(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(__le64)), 523 memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
500 &location, sizeof(__le64)); 524 &location, sizeof(__le64));
501 } 525 }
502 526
@@ -553,7 +577,7 @@ int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
553 577
554 if (info->value_type.dec) 578 if (info->value_type.dec)
555 info->value_type.dec(info->value_type.context, 579 info->value_type.dec(info->value_type.context,
556 value_ptr(n, index, info->value_type.size)); 580 value_ptr(n, index));
557 581
558 delete_at(n, index); 582 delete_at(n, index);
559 } 583 }
diff --git a/drivers/md/persistent-data/dm-btree.c b/drivers/md/persistent-data/dm-btree.c
index bd1e7ffbe26c..d12b2cc51f1a 100644
--- a/drivers/md/persistent-data/dm-btree.c
+++ b/drivers/md/persistent-data/dm-btree.c
@@ -74,8 +74,7 @@ void inc_children(struct dm_transaction_manager *tm, struct node *n,
74 dm_tm_inc(tm, value64(n, i)); 74 dm_tm_inc(tm, value64(n, i));
75 else if (vt->inc) 75 else if (vt->inc)
76 for (i = 0; i < nr_entries; i++) 76 for (i = 0; i < nr_entries; i++)
77 vt->inc(vt->context, 77 vt->inc(vt->context, value_ptr(n, i));
78 value_ptr(n, i, vt->size));
79} 78}
80 79
81static int insert_at(size_t value_size, struct node *node, unsigned index, 80static int insert_at(size_t value_size, struct node *node, unsigned index,
@@ -281,7 +280,7 @@ int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
281 280
282 for (i = 0; i < f->nr_children; i++) 281 for (i = 0; i < f->nr_children; i++)
283 info->value_type.dec(info->value_type.context, 282 info->value_type.dec(info->value_type.context,
284 value_ptr(f->n, i, info->value_type.size)); 283 value_ptr(f->n, i));
285 } 284 }
286 f->current_child = f->nr_children; 285 f->current_child = f->nr_children;
287 } 286 }
@@ -320,7 +319,7 @@ static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
320 } while (!(flags & LEAF_NODE)); 319 } while (!(flags & LEAF_NODE));
321 320
322 *result_key = le64_to_cpu(ro_node(s)->keys[i]); 321 *result_key = le64_to_cpu(ro_node(s)->keys[i]);
323 memcpy(v, value_ptr(ro_node(s), i, value_size), value_size); 322 memcpy(v, value_ptr(ro_node(s), i), value_size);
324 323
325 return 0; 324 return 0;
326} 325}
@@ -432,7 +431,7 @@ static int btree_split_sibling(struct shadow_spine *s, dm_block_t root,
432 431
433 size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ? 432 size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
434 sizeof(uint64_t) : s->info->value_type.size; 433 sizeof(uint64_t) : s->info->value_type.size;
435 memcpy(value_ptr(rn, 0, size), value_ptr(ln, nr_left, size), 434 memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left),
436 size * nr_right); 435 size * nr_right);
437 436
438 /* 437 /*
@@ -443,7 +442,7 @@ static int btree_split_sibling(struct shadow_spine *s, dm_block_t root,
443 pn = dm_block_data(parent); 442 pn = dm_block_data(parent);
444 location = cpu_to_le64(dm_block_location(left)); 443 location = cpu_to_le64(dm_block_location(left));
445 __dm_bless_for_disk(&location); 444 __dm_bless_for_disk(&location);
446 memcpy_disk(value_ptr(pn, parent_index, sizeof(__le64)), 445 memcpy_disk(value_ptr(pn, parent_index),
447 &location, sizeof(__le64)); 446 &location, sizeof(__le64));
448 447
449 location = cpu_to_le64(dm_block_location(right)); 448 location = cpu_to_le64(dm_block_location(right));
@@ -529,8 +528,8 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
529 528
530 size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ? 529 size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
531 sizeof(__le64) : s->info->value_type.size; 530 sizeof(__le64) : s->info->value_type.size;
532 memcpy(value_ptr(ln, 0, size), value_ptr(pn, 0, size), nr_left * size); 531 memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);
533 memcpy(value_ptr(rn, 0, size), value_ptr(pn, nr_left, size), 532 memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
534 nr_right * size); 533 nr_right * size);
535 534
536 /* new_parent should just point to l and r now */ 535 /* new_parent should just point to l and r now */
@@ -545,12 +544,12 @@ static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
545 val = cpu_to_le64(dm_block_location(left)); 544 val = cpu_to_le64(dm_block_location(left));
546 __dm_bless_for_disk(&val); 545 __dm_bless_for_disk(&val);
547 pn->keys[0] = ln->keys[0]; 546 pn->keys[0] = ln->keys[0];
548 memcpy_disk(value_ptr(pn, 0, sizeof(__le64)), &val, sizeof(__le64)); 547 memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));
549 548
550 val = cpu_to_le64(dm_block_location(right)); 549 val = cpu_to_le64(dm_block_location(right));
551 __dm_bless_for_disk(&val); 550 __dm_bless_for_disk(&val);
552 pn->keys[1] = rn->keys[0]; 551 pn->keys[1] = rn->keys[0];
553 memcpy_disk(value_ptr(pn, 1, sizeof(__le64)), &val, sizeof(__le64)); 552 memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));
554 553
555 /* 554 /*
556 * rejig the spine. This is ugly, since it knows too 555 * rejig the spine. This is ugly, since it knows too
@@ -595,7 +594,7 @@ static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
595 __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); 594 __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
596 595
597 __dm_bless_for_disk(&location); 596 __dm_bless_for_disk(&location);
598 memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i, sizeof(uint64_t)), 597 memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
599 &location, sizeof(__le64)); 598 &location, sizeof(__le64));
600 } 599 }
601 600
@@ -710,12 +709,12 @@ static int insert(struct dm_btree_info *info, dm_block_t root,
710 (!info->value_type.equal || 709 (!info->value_type.equal ||
711 !info->value_type.equal( 710 !info->value_type.equal(
712 info->value_type.context, 711 info->value_type.context,
713 value_ptr(n, index, info->value_type.size), 712 value_ptr(n, index),
714 value))) { 713 value))) {
715 info->value_type.dec(info->value_type.context, 714 info->value_type.dec(info->value_type.context,
716 value_ptr(n, index, info->value_type.size)); 715 value_ptr(n, index));
717 } 716 }
718 memcpy_disk(value_ptr(n, index, info->value_type.size), 717 memcpy_disk(value_ptr(n, index),
719 value, info->value_type.size); 718 value, info->value_type.size);
720 } 719 }
721 720
diff --git a/drivers/md/persistent-data/dm-space-map-common.c b/drivers/md/persistent-data/dm-space-map-common.c
index df2494c06cdc..ff3beed6ad2d 100644
--- a/drivers/md/persistent-data/dm-space-map-common.c
+++ b/drivers/md/persistent-data/dm-space-map-common.c
@@ -405,8 +405,6 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
405 if (r < 0) 405 if (r < 0)
406 return r; 406 return r;
407 407
408#if 0
409 /* FIXME: dm_btree_remove doesn't handle this yet */
410 if (old > 2) { 408 if (old > 2) {
411 r = dm_btree_remove(&ll->ref_count_info, 409 r = dm_btree_remove(&ll->ref_count_info,
412 ll->ref_count_root, 410 ll->ref_count_root,
@@ -414,7 +412,6 @@ int sm_ll_insert(struct ll_disk *ll, dm_block_t b,
414 if (r) 412 if (r)
415 return r; 413 return r;
416 } 414 }
417#endif
418 415
419 } else { 416 } else {
420 __le32 le_rc = cpu_to_le32(ref_count); 417 __le32 le_rc = cpu_to_le32(ref_count);
diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index 7294bd115e34..6f31f5596e01 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -91,7 +91,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
91 91
92 if (!conf) 92 if (!conf)
93 return -ENOMEM; 93 return -ENOMEM;
94 list_for_each_entry(rdev1, &mddev->disks, same_set) { 94 rdev_for_each(rdev1, mddev) {
95 pr_debug("md/raid0:%s: looking at %s\n", 95 pr_debug("md/raid0:%s: looking at %s\n",
96 mdname(mddev), 96 mdname(mddev),
97 bdevname(rdev1->bdev, b)); 97 bdevname(rdev1->bdev, b));
@@ -102,7 +102,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
102 sector_div(sectors, mddev->chunk_sectors); 102 sector_div(sectors, mddev->chunk_sectors);
103 rdev1->sectors = sectors * mddev->chunk_sectors; 103 rdev1->sectors = sectors * mddev->chunk_sectors;
104 104
105 list_for_each_entry(rdev2, &mddev->disks, same_set) { 105 rdev_for_each(rdev2, mddev) {
106 pr_debug("md/raid0:%s: comparing %s(%llu)" 106 pr_debug("md/raid0:%s: comparing %s(%llu)"
107 " with %s(%llu)\n", 107 " with %s(%llu)\n",
108 mdname(mddev), 108 mdname(mddev),
@@ -157,7 +157,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
157 smallest = NULL; 157 smallest = NULL;
158 dev = conf->devlist; 158 dev = conf->devlist;
159 err = -EINVAL; 159 err = -EINVAL;
160 list_for_each_entry(rdev1, &mddev->disks, same_set) { 160 rdev_for_each(rdev1, mddev) {
161 int j = rdev1->raid_disk; 161 int j = rdev1->raid_disk;
162 162
163 if (mddev->level == 10) { 163 if (mddev->level == 10) {
@@ -188,16 +188,10 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
188 188
189 disk_stack_limits(mddev->gendisk, rdev1->bdev, 189 disk_stack_limits(mddev->gendisk, rdev1->bdev,
190 rdev1->data_offset << 9); 190 rdev1->data_offset << 9);
191 /* as we don't honour merge_bvec_fn, we must never risk
192 * violating it, so limit ->max_segments to 1, lying within
193 * a single page.
194 */
195 191
196 if (rdev1->bdev->bd_disk->queue->merge_bvec_fn) { 192 if (rdev1->bdev->bd_disk->queue->merge_bvec_fn)
197 blk_queue_max_segments(mddev->queue, 1); 193 conf->has_merge_bvec = 1;
198 blk_queue_segment_boundary(mddev->queue, 194
199 PAGE_CACHE_SIZE - 1);
200 }
201 if (!smallest || (rdev1->sectors < smallest->sectors)) 195 if (!smallest || (rdev1->sectors < smallest->sectors))
202 smallest = rdev1; 196 smallest = rdev1;
203 cnt++; 197 cnt++;
@@ -290,8 +284,64 @@ abort:
290 return err; 284 return err;
291} 285}
292 286
287/* Find the zone which holds a particular offset
288 * Update *sectorp to be an offset in that zone
289 */
290static struct strip_zone *find_zone(struct r0conf *conf,
291 sector_t *sectorp)
292{
293 int i;
294 struct strip_zone *z = conf->strip_zone;
295 sector_t sector = *sectorp;
296
297 for (i = 0; i < conf->nr_strip_zones; i++)
298 if (sector < z[i].zone_end) {
299 if (i)
300 *sectorp = sector - z[i-1].zone_end;
301 return z + i;
302 }
303 BUG();
304}
305
306/*
307 * remaps the bio to the target device. we separate two flows.
308 * power 2 flow and a general flow for the sake of performance
309*/
310static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone,
311 sector_t sector, sector_t *sector_offset)
312{
313 unsigned int sect_in_chunk;
314 sector_t chunk;
315 struct r0conf *conf = mddev->private;
316 int raid_disks = conf->strip_zone[0].nb_dev;
317 unsigned int chunk_sects = mddev->chunk_sectors;
318
319 if (is_power_of_2(chunk_sects)) {
320 int chunksect_bits = ffz(~chunk_sects);
321 /* find the sector offset inside the chunk */
322 sect_in_chunk = sector & (chunk_sects - 1);
323 sector >>= chunksect_bits;
324 /* chunk in zone */
325 chunk = *sector_offset;
326 /* quotient is the chunk in real device*/
327 sector_div(chunk, zone->nb_dev << chunksect_bits);
328 } else{
329 sect_in_chunk = sector_div(sector, chunk_sects);
330 chunk = *sector_offset;
331 sector_div(chunk, chunk_sects * zone->nb_dev);
332 }
333 /*
334 * position the bio over the real device
335 * real sector = chunk in device + starting of zone
336 * + the position in the chunk
337 */
338 *sector_offset = (chunk * chunk_sects) + sect_in_chunk;
339 return conf->devlist[(zone - conf->strip_zone)*raid_disks
340 + sector_div(sector, zone->nb_dev)];
341}
342
293/** 343/**
294 * raid0_mergeable_bvec -- tell bio layer if a two requests can be merged 344 * raid0_mergeable_bvec -- tell bio layer if two requests can be merged
295 * @q: request queue 345 * @q: request queue
296 * @bvm: properties of new bio 346 * @bvm: properties of new bio
297 * @biovec: the request that could be merged to it. 347 * @biovec: the request that could be merged to it.
@@ -303,10 +353,15 @@ static int raid0_mergeable_bvec(struct request_queue *q,
303 struct bio_vec *biovec) 353 struct bio_vec *biovec)
304{ 354{
305 struct mddev *mddev = q->queuedata; 355 struct mddev *mddev = q->queuedata;
356 struct r0conf *conf = mddev->private;
306 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 357 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
358 sector_t sector_offset = sector;
307 int max; 359 int max;
308 unsigned int chunk_sectors = mddev->chunk_sectors; 360 unsigned int chunk_sectors = mddev->chunk_sectors;
309 unsigned int bio_sectors = bvm->bi_size >> 9; 361 unsigned int bio_sectors = bvm->bi_size >> 9;
362 struct strip_zone *zone;
363 struct md_rdev *rdev;
364 struct request_queue *subq;
310 365
311 if (is_power_of_2(chunk_sectors)) 366 if (is_power_of_2(chunk_sectors))
312 max = (chunk_sectors - ((sector & (chunk_sectors-1)) 367 max = (chunk_sectors - ((sector & (chunk_sectors-1))
@@ -314,10 +369,27 @@ static int raid0_mergeable_bvec(struct request_queue *q,
314 else 369 else
315 max = (chunk_sectors - (sector_div(sector, chunk_sectors) 370 max = (chunk_sectors - (sector_div(sector, chunk_sectors)
316 + bio_sectors)) << 9; 371 + bio_sectors)) << 9;
317 if (max < 0) max = 0; /* bio_add cannot handle a negative return */ 372 if (max < 0)
373 max = 0; /* bio_add cannot handle a negative return */
318 if (max <= biovec->bv_len && bio_sectors == 0) 374 if (max <= biovec->bv_len && bio_sectors == 0)
319 return biovec->bv_len; 375 return biovec->bv_len;
320 else 376 if (max < biovec->bv_len)
377 /* too small already, no need to check further */
378 return max;
379 if (!conf->has_merge_bvec)
380 return max;
381
382 /* May need to check subordinate device */
383 sector = sector_offset;
384 zone = find_zone(mddev->private, &sector_offset);
385 rdev = map_sector(mddev, zone, sector, &sector_offset);
386 subq = bdev_get_queue(rdev->bdev);
387 if (subq->merge_bvec_fn) {
388 bvm->bi_bdev = rdev->bdev;
389 bvm->bi_sector = sector_offset + zone->dev_start +
390 rdev->data_offset;
391 return min(max, subq->merge_bvec_fn(subq, bvm, biovec));
392 } else
321 return max; 393 return max;
322} 394}
323 395
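
The hunk above first clamps a candidate merge at the chunk boundary and, only when the array has at least one member with its own merge_bvec_fn (conf->has_merge_bvec), remaps the probe and asks that member as well. The small model below illustrates the resulting cascade of limits; chunk_boundary_limit() and member_bvec_limit() are hypothetical stand-ins for the real queries, not kernel APIs.

    #include <stdio.h>

    /* Hypothetical stand-ins for the real queries made by the hunk above. */
    static int chunk_boundary_limit(void) { return 4096; } /* bytes left before the chunk boundary */
    static int member_bvec_limit(void)    { return 1024; } /* member's merge_bvec_fn answer, in bytes */

    /* Model of the cascade: the chunk boundary gives an upper bound; only if
     * the array has a member with a merge_bvec_fn is that member asked too,
     * and the final answer is the smaller of the two limits. */
    static int mergeable_limit(int has_merge_bvec, int bv_len)
    {
            int max = chunk_boundary_limit();
            int sub;

            if (max < 0)
                    max = 0;        /* bio_add cannot handle a negative return */
            if (max < bv_len)
                    return max;     /* too small already, no need to ask the member */
            if (!has_merge_bvec)
                    return max;
            sub = member_bvec_limit();
            return max < sub ? max : sub;
    }

    int main(void)
    {
            printf("without member check: %d\n", mergeable_limit(0, 512));
            printf("with member check:    %d\n", mergeable_limit(1, 512));
            return 0;
    }
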
@@ -329,7 +401,7 @@ static sector_t raid0_size(struct mddev *mddev, sector_t sectors, int raid_disks
329 WARN_ONCE(sectors || raid_disks, 401 WARN_ONCE(sectors || raid_disks,
330 "%s does not support generic reshape\n", __func__); 402 "%s does not support generic reshape\n", __func__);
331 403
332 list_for_each_entry(rdev, &mddev->disks, same_set) 404 rdev_for_each(rdev, mddev)
333 array_sectors += rdev->sectors; 405 array_sectors += rdev->sectors;
334 406
335 return array_sectors; 407 return array_sectors;
@@ -397,62 +469,6 @@ static int raid0_stop(struct mddev *mddev)
397 return 0; 469 return 0;
398} 470}
399 471
400/* Find the zone which holds a particular offset
401 * Update *sectorp to be an offset in that zone
402 */
403static struct strip_zone *find_zone(struct r0conf *conf,
404 sector_t *sectorp)
405{
406 int i;
407 struct strip_zone *z = conf->strip_zone;
408 sector_t sector = *sectorp;
409
410 for (i = 0; i < conf->nr_strip_zones; i++)
411 if (sector < z[i].zone_end) {
412 if (i)
413 *sectorp = sector - z[i-1].zone_end;
414 return z + i;
415 }
416 BUG();
417}
418
419/*
420 * remaps the bio to the target device. we separate two flows.
421 * power 2 flow and a general flow for the sake of perfromance
422*/
423static struct md_rdev *map_sector(struct mddev *mddev, struct strip_zone *zone,
424 sector_t sector, sector_t *sector_offset)
425{
426 unsigned int sect_in_chunk;
427 sector_t chunk;
428 struct r0conf *conf = mddev->private;
429 int raid_disks = conf->strip_zone[0].nb_dev;
430 unsigned int chunk_sects = mddev->chunk_sectors;
431
432 if (is_power_of_2(chunk_sects)) {
433 int chunksect_bits = ffz(~chunk_sects);
434 /* find the sector offset inside the chunk */
435 sect_in_chunk = sector & (chunk_sects - 1);
436 sector >>= chunksect_bits;
437 /* chunk in zone */
438 chunk = *sector_offset;
439 /* quotient is the chunk in real device*/
440 sector_div(chunk, zone->nb_dev << chunksect_bits);
441 } else{
442 sect_in_chunk = sector_div(sector, chunk_sects);
443 chunk = *sector_offset;
444 sector_div(chunk, chunk_sects * zone->nb_dev);
445 }
446 /*
447 * position the bio over the real device
448 * real sector = chunk in device + starting of zone
449 * + the position in the chunk
450 */
451 *sector_offset = (chunk * chunk_sects) + sect_in_chunk;
452 return conf->devlist[(zone - conf->strip_zone)*raid_disks
453 + sector_div(sector, zone->nb_dev)];
454}
455
456/* 472/*
457 * Is the I/O distributed over one or more chunks? 473 * Is the I/O distributed over one or more chunks?
458*/ 474*/
@@ -505,7 +521,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio)
505 } 521 }
506 522
507 sector_offset = bio->bi_sector; 523 sector_offset = bio->bi_sector;
508 zone = find_zone(mddev->private, &sector_offset); 524 zone = find_zone(mddev->private, &sector_offset);
509 tmp_dev = map_sector(mddev, zone, bio->bi_sector, 525 tmp_dev = map_sector(mddev, zone, bio->bi_sector,
510 &sector_offset); 526 &sector_offset);
511 bio->bi_bdev = tmp_dev->bdev; 527 bio->bi_bdev = tmp_dev->bdev;
@@ -543,7 +559,7 @@ static void *raid0_takeover_raid45(struct mddev *mddev)
543 return ERR_PTR(-EINVAL); 559 return ERR_PTR(-EINVAL);
544 } 560 }
545 561
546 list_for_each_entry(rdev, &mddev->disks, same_set) { 562 rdev_for_each(rdev, mddev) {
547 /* check slot number for a disk */ 563 /* check slot number for a disk */
548 if (rdev->raid_disk == mddev->raid_disks-1) { 564 if (rdev->raid_disk == mddev->raid_disks-1) {
549 printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n", 565 printk(KERN_ERR "md/raid0:%s: raid5 must have missing parity disk!\n",
diff --git a/drivers/md/raid0.h b/drivers/md/raid0.h
index 0884bba8df4c..05539d9c97f0 100644
--- a/drivers/md/raid0.h
+++ b/drivers/md/raid0.h
@@ -4,13 +4,16 @@
4struct strip_zone { 4struct strip_zone {
5 sector_t zone_end; /* Start of the next zone (in sectors) */ 5 sector_t zone_end; /* Start of the next zone (in sectors) */
6 sector_t dev_start; /* Zone offset in real dev (in sectors) */ 6 sector_t dev_start; /* Zone offset in real dev (in sectors) */
7 int nb_dev; /* # of devices attached to the zone */ 7 int nb_dev; /* # of devices attached to the zone */
8}; 8};
9 9
10struct r0conf { 10struct r0conf {
11 struct strip_zone *strip_zone; 11 struct strip_zone *strip_zone;
12 struct md_rdev **devlist; /* lists of rdevs, pointed to by strip_zone->dev */ 12 struct md_rdev **devlist; /* lists of rdevs, pointed to
13 int nr_strip_zones; 13 * by strip_zone->dev */
14 int nr_strip_zones;
15 int has_merge_bvec; /* at least one member has
16 * a merge_bvec_fn */
14}; 17};
15 18
16#endif 19#endif
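
The new has_merge_bvec field caches whether any member device exports a merge_bvec_fn, so raid0_mergeable_bvec() can skip the per-request remapping for plain members. How the flag is populated is not shown in this part of the diff; the snippet below is only a userspace model of the idea, with a made-up struct member type.

    #include <stdbool.h>
    #include <stdio.h>

    struct member { bool has_merge_bvec_fn; };

    /* Model of what r0conf.has_merge_bvec records: "does any member device
     * export a merge_bvec_fn", so the per-request remapping can be skipped
     * when no member needs it. */
    static int compute_has_merge_bvec(const struct member *m, int n)
    {
            for (int i = 0; i < n; i++)
                    if (m[i].has_merge_bvec_fn)
                            return 1;
            return 0;
    }

    int main(void)
    {
            struct member members[3] = { {false}, {true}, {false} };
            printf("has_merge_bvec = %d\n", compute_has_merge_bvec(members, 3));
            return 0;
    }
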
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index a368db2431a5..4a40a200d769 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -523,6 +523,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
523 rdev = rcu_dereference(conf->mirrors[disk].rdev); 523 rdev = rcu_dereference(conf->mirrors[disk].rdev);
524 if (r1_bio->bios[disk] == IO_BLOCKED 524 if (r1_bio->bios[disk] == IO_BLOCKED
525 || rdev == NULL 525 || rdev == NULL
526 || test_bit(Unmerged, &rdev->flags)
526 || test_bit(Faulty, &rdev->flags)) 527 || test_bit(Faulty, &rdev->flags))
527 continue; 528 continue;
528 if (!test_bit(In_sync, &rdev->flags) && 529 if (!test_bit(In_sync, &rdev->flags) &&
@@ -614,6 +615,39 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect
614 return best_disk; 615 return best_disk;
615} 616}
616 617
618static int raid1_mergeable_bvec(struct request_queue *q,
619 struct bvec_merge_data *bvm,
620 struct bio_vec *biovec)
621{
622 struct mddev *mddev = q->queuedata;
623 struct r1conf *conf = mddev->private;
624 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
625 int max = biovec->bv_len;
626
627 if (mddev->merge_check_needed) {
628 int disk;
629 rcu_read_lock();
630 for (disk = 0; disk < conf->raid_disks * 2; disk++) {
631 struct md_rdev *rdev = rcu_dereference(
632 conf->mirrors[disk].rdev);
633 if (rdev && !test_bit(Faulty, &rdev->flags)) {
634 struct request_queue *q =
635 bdev_get_queue(rdev->bdev);
636 if (q->merge_bvec_fn) {
637 bvm->bi_sector = sector +
638 rdev->data_offset;
639 bvm->bi_bdev = rdev->bdev;
640 max = min(max, q->merge_bvec_fn(
641 q, bvm, biovec));
642 }
643 }
644 }
645 rcu_read_unlock();
646 }
647 return max;
648
649}
650
617int md_raid1_congested(struct mddev *mddev, int bits) 651int md_raid1_congested(struct mddev *mddev, int bits)
618{ 652{
619 struct r1conf *conf = mddev->private; 653 struct r1conf *conf = mddev->private;
@@ -624,7 +658,7 @@ int md_raid1_congested(struct mddev *mddev, int bits)
624 return 1; 658 return 1;
625 659
626 rcu_read_lock(); 660 rcu_read_lock();
627 for (i = 0; i < conf->raid_disks; i++) { 661 for (i = 0; i < conf->raid_disks * 2; i++) {
628 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); 662 struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev);
629 if (rdev && !test_bit(Faulty, &rdev->flags)) { 663 if (rdev && !test_bit(Faulty, &rdev->flags)) {
630 struct request_queue *q = bdev_get_queue(rdev->bdev); 664 struct request_queue *q = bdev_get_queue(rdev->bdev);
@@ -737,9 +771,22 @@ static void wait_barrier(struct r1conf *conf)
737 spin_lock_irq(&conf->resync_lock); 771 spin_lock_irq(&conf->resync_lock);
738 if (conf->barrier) { 772 if (conf->barrier) {
739 conf->nr_waiting++; 773 conf->nr_waiting++;
740 wait_event_lock_irq(conf->wait_barrier, !conf->barrier, 774 /* Wait for the barrier to drop.
775 * However if there are already pending
776 * requests (preventing the barrier from
777 * rising completely), and the
778 * pre-process bio queue isn't empty,
779 * then don't wait, as we need to empty
780 * that queue to get the nr_pending
781 * count down.
782 */
783 wait_event_lock_irq(conf->wait_barrier,
784 !conf->barrier ||
785 (conf->nr_pending &&
786 current->bio_list &&
787 !bio_list_empty(current->bio_list)),
741 conf->resync_lock, 788 conf->resync_lock,
742 ); 789 );
743 conf->nr_waiting--; 790 conf->nr_waiting--;
744 } 791 }
745 conf->nr_pending++; 792 conf->nr_pending++;
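
The comment added to wait_barrier() above describes a potential deadlock: the resync barrier cannot rise while requests are pending, and some of those pending requests may still be queued on the waiting thread's own bio list. The predicate below is a minimal model of the new wait condition (illustrative names only, not the kernel API).

    #include <stdbool.h>
    #include <stdio.h>

    /* Model of the wait condition: proceed either when the barrier has
     * dropped, or when there are pending requests and our own bio list is
     * non-empty (waiting here would keep nr_pending from ever reaching zero
     * and so deadlock against raise_barrier()). */
    static bool may_proceed(int barrier, int nr_pending, bool own_bio_list_nonempty)
    {
            return !barrier || (nr_pending && own_bio_list_nonempty);
    }

    int main(void)
    {
            printf("%d\n", may_proceed(1, 0, false)); /* 0: must wait for resync */
            printf("%d\n", may_proceed(0, 0, false)); /* 1: no barrier raised */
            printf("%d\n", may_proceed(1, 3, true));  /* 1: drain our own queue first */
            return 0;
    }
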
@@ -1002,7 +1049,8 @@ read_again:
1002 break; 1049 break;
1003 } 1050 }
1004 r1_bio->bios[i] = NULL; 1051 r1_bio->bios[i] = NULL;
1005 if (!rdev || test_bit(Faulty, &rdev->flags)) { 1052 if (!rdev || test_bit(Faulty, &rdev->flags)
1053 || test_bit(Unmerged, &rdev->flags)) {
1006 if (i < conf->raid_disks) 1054 if (i < conf->raid_disks)
1007 set_bit(R1BIO_Degraded, &r1_bio->state); 1055 set_bit(R1BIO_Degraded, &r1_bio->state);
1008 continue; 1056 continue;
@@ -1322,6 +1370,7 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1322 struct mirror_info *p; 1370 struct mirror_info *p;
1323 int first = 0; 1371 int first = 0;
1324 int last = conf->raid_disks - 1; 1372 int last = conf->raid_disks - 1;
1373 struct request_queue *q = bdev_get_queue(rdev->bdev);
1325 1374
1326 if (mddev->recovery_disabled == conf->recovery_disabled) 1375 if (mddev->recovery_disabled == conf->recovery_disabled)
1327 return -EBUSY; 1376 return -EBUSY;
@@ -1329,23 +1378,17 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1329 if (rdev->raid_disk >= 0) 1378 if (rdev->raid_disk >= 0)
1330 first = last = rdev->raid_disk; 1379 first = last = rdev->raid_disk;
1331 1380
1381 if (q->merge_bvec_fn) {
1382 set_bit(Unmerged, &rdev->flags);
1383 mddev->merge_check_needed = 1;
1384 }
1385
1332 for (mirror = first; mirror <= last; mirror++) { 1386 for (mirror = first; mirror <= last; mirror++) {
1333 p = conf->mirrors+mirror; 1387 p = conf->mirrors+mirror;
1334 if (!p->rdev) { 1388 if (!p->rdev) {
1335 1389
1336 disk_stack_limits(mddev->gendisk, rdev->bdev, 1390 disk_stack_limits(mddev->gendisk, rdev->bdev,
1337 rdev->data_offset << 9); 1391 rdev->data_offset << 9);
1338 /* as we don't honour merge_bvec_fn, we must
1339 * never risk violating it, so limit
1340 * ->max_segments to one lying with a single
1341 * page, as a one page request is never in
1342 * violation.
1343 */
1344 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
1345 blk_queue_max_segments(mddev->queue, 1);
1346 blk_queue_segment_boundary(mddev->queue,
1347 PAGE_CACHE_SIZE - 1);
1348 }
1349 1392
1350 p->head_position = 0; 1393 p->head_position = 0;
1351 rdev->raid_disk = mirror; 1394 rdev->raid_disk = mirror;
@@ -1370,6 +1413,19 @@ static int raid1_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1370 break; 1413 break;
1371 } 1414 }
1372 } 1415 }
1416 if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
1417 /* Some requests might not have seen this new
1418 * merge_bvec_fn. We must wait for them to complete
1419 * before merging the device fully.
1420 * First we make sure any code which has tested
1421 * our function has submitted the request, then
1422 * we wait for all outstanding requests to complete.
1423 */
1424 synchronize_sched();
1425 raise_barrier(conf);
1426 lower_barrier(conf);
1427 clear_bit(Unmerged, &rdev->flags);
1428 }
1373 md_integrity_add_rdev(rdev, mddev); 1429 md_integrity_add_rdev(rdev, mddev);
1374 print_conf(conf); 1430 print_conf(conf);
1375 return err; 1431 return err;
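
The Unmerged handling added above follows a publish-then-quiesce pattern: the new rdev is flagged so the I/O paths ignore it for merging decisions, every request that might have sampled the old state is allowed to finish (synchronize_sched() plus one raise/lower barrier cycle), and only then is the flag cleared. The sketch below is a schematic of that ordering; the two stub functions are placeholders, not the kernel primitives.

    #include <stdio.h>

    /* Stubs standing in for the kernel primitives used above. */
    static void wait_for_current_submitters(void) { puts("synchronize with submitters"); }
    static void drain_in_flight_requests(void)    { puts("raise + lower barrier"); }

    static int unmerged;    /* models the Unmerged rdev flag */

    /* Schematic of the hot-add path: publish the device as not-yet-mergeable,
     * let every request that may have tested the merge function complete,
     * then allow full merging. */
    static void enable_merging(void)
    {
            unmerged = 1;                   /* set_bit(Unmerged, &rdev->flags) */
            wait_for_current_submitters();  /* synchronize_sched() */
            drain_in_flight_requests();     /* raise_barrier(); lower_barrier(); */
            unmerged = 0;                   /* clear_bit(Unmerged, &rdev->flags) */
    }

    int main(void)
    {
            enable_merging();
            printf("unmerged=%d\n", unmerged);
            return 0;
    }
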
@@ -2491,7 +2547,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
2491 2547
2492 err = -EINVAL; 2548 err = -EINVAL;
2493 spin_lock_init(&conf->device_lock); 2549 spin_lock_init(&conf->device_lock);
2494 list_for_each_entry(rdev, &mddev->disks, same_set) { 2550 rdev_for_each(rdev, mddev) {
2495 int disk_idx = rdev->raid_disk; 2551 int disk_idx = rdev->raid_disk;
2496 if (disk_idx >= mddev->raid_disks 2552 if (disk_idx >= mddev->raid_disks
2497 || disk_idx < 0) 2553 || disk_idx < 0)
@@ -2609,20 +2665,11 @@ static int run(struct mddev *mddev)
2609 if (IS_ERR(conf)) 2665 if (IS_ERR(conf))
2610 return PTR_ERR(conf); 2666 return PTR_ERR(conf);
2611 2667
2612 list_for_each_entry(rdev, &mddev->disks, same_set) { 2668 rdev_for_each(rdev, mddev) {
2613 if (!mddev->gendisk) 2669 if (!mddev->gendisk)
2614 continue; 2670 continue;
2615 disk_stack_limits(mddev->gendisk, rdev->bdev, 2671 disk_stack_limits(mddev->gendisk, rdev->bdev,
2616 rdev->data_offset << 9); 2672 rdev->data_offset << 9);
2617 /* as we don't honour merge_bvec_fn, we must never risk
2618 * violating it, so limit ->max_segments to 1 lying within
2619 * a single page, as a one page request is never in violation.
2620 */
2621 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
2622 blk_queue_max_segments(mddev->queue, 1);
2623 blk_queue_segment_boundary(mddev->queue,
2624 PAGE_CACHE_SIZE - 1);
2625 }
2626 } 2673 }
2627 2674
2628 mddev->degraded = 0; 2675 mddev->degraded = 0;
@@ -2656,6 +2703,7 @@ static int run(struct mddev *mddev)
2656 if (mddev->queue) { 2703 if (mddev->queue) {
2657 mddev->queue->backing_dev_info.congested_fn = raid1_congested; 2704 mddev->queue->backing_dev_info.congested_fn = raid1_congested;
2658 mddev->queue->backing_dev_info.congested_data = mddev; 2705 mddev->queue->backing_dev_info.congested_data = mddev;
2706 blk_queue_merge_bvec(mddev->queue, raid1_mergeable_bvec);
2659 } 2707 }
2660 return md_integrity_register(mddev); 2708 return md_integrity_register(mddev);
2661} 2709}
diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 6e8aa213f0d5..3540316886f2 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -67,6 +67,7 @@ static int max_queued_requests = 1024;
67 67
68static void allow_barrier(struct r10conf *conf); 68static void allow_barrier(struct r10conf *conf);
69static void lower_barrier(struct r10conf *conf); 69static void lower_barrier(struct r10conf *conf);
70static int enough(struct r10conf *conf, int ignore);
70 71
71static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) 72static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data)
72{ 73{
@@ -347,6 +348,19 @@ static void raid10_end_read_request(struct bio *bio, int error)
347 * wait for the 'master' bio. 348 * wait for the 'master' bio.
348 */ 349 */
349 set_bit(R10BIO_Uptodate, &r10_bio->state); 350 set_bit(R10BIO_Uptodate, &r10_bio->state);
351 } else {
352 /* If all other devices that store this block have
353 * failed, we want to return the error upwards rather
354 * than fail the last device. Here we redefine
355 * "uptodate" to mean "Don't want to retry"
356 */
357 unsigned long flags;
358 spin_lock_irqsave(&conf->device_lock, flags);
359 if (!enough(conf, rdev->raid_disk))
360 uptodate = 1;
361 spin_unlock_irqrestore(&conf->device_lock, flags);
362 }
363 if (uptodate) {
350 raid_end_bio_io(r10_bio); 364 raid_end_bio_io(r10_bio);
351 rdev_dec_pending(rdev, conf->mddev); 365 rdev_dec_pending(rdev, conf->mddev);
352 } else { 366 } else {
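
The else branch added above redefines "uptodate" when a read fails on the last readable copy: if enough() reports that no other device can serve the block, the error is completed upwards instead of being retried forever. A tiny model of that decision follows; working_copies_excluding() is a hypothetical stand-in for enough().

    #include <stdbool.h>
    #include <stdio.h>

    /* Hypothetical stand-in for enough(): how many other devices still hold a
     * readable copy of this block once the failing one is ignored? */
    static int working_copies_excluding(int failed_disk)
    {
            (void)failed_disk;
            return 0;       /* pretend every other copy is gone */
    }

    /* Model of the decision above: retry the read on another copy if one
     * exists, otherwise report the error rather than fail the last device. */
    static bool treat_as_uptodate(bool read_ok, int failed_disk)
    {
            if (read_ok)
                    return true;
            return working_copies_excluding(failed_disk) == 0;
    }

    int main(void)
    {
            if (treat_as_uptodate(false, 2))
                    puts("no usable copy left: complete the bio with the error");
            else
                    puts("another copy exists: retry the read");
            return 0;
    }
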
@@ -572,25 +586,68 @@ static sector_t raid10_find_virt(struct r10conf *conf, sector_t sector, int dev)
572 * @biovec: the request that could be merged to it. 586 * @biovec: the request that could be merged to it.
573 * 587 *
574 * Return amount of bytes we can accept at this offset 588 * Return amount of bytes we can accept at this offset
575 * If near_copies == raid_disk, there are no striping issues, 589 * This requires checking for end-of-chunk if near_copies != raid_disks,
576 * but in that case, the function isn't called at all. 590 * and for subordinate merge_bvec_fns if merge_check_needed.
577 */ 591 */
578static int raid10_mergeable_bvec(struct request_queue *q, 592static int raid10_mergeable_bvec(struct request_queue *q,
579 struct bvec_merge_data *bvm, 593 struct bvec_merge_data *bvm,
580 struct bio_vec *biovec) 594 struct bio_vec *biovec)
581{ 595{
582 struct mddev *mddev = q->queuedata; 596 struct mddev *mddev = q->queuedata;
597 struct r10conf *conf = mddev->private;
583 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev); 598 sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
584 int max; 599 int max;
585 unsigned int chunk_sectors = mddev->chunk_sectors; 600 unsigned int chunk_sectors = mddev->chunk_sectors;
586 unsigned int bio_sectors = bvm->bi_size >> 9; 601 unsigned int bio_sectors = bvm->bi_size >> 9;
587 602
588 max = (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9; 603 if (conf->near_copies < conf->raid_disks) {
589 if (max < 0) max = 0; /* bio_add cannot handle a negative return */ 604 max = (chunk_sectors - ((sector & (chunk_sectors - 1))
590 if (max <= biovec->bv_len && bio_sectors == 0) 605 + bio_sectors)) << 9;
591 return biovec->bv_len; 606 if (max < 0)
592 else 607 /* bio_add cannot handle a negative return */
593 return max; 608 max = 0;
609 if (max <= biovec->bv_len && bio_sectors == 0)
610 return biovec->bv_len;
611 } else
612 max = biovec->bv_len;
613
614 if (mddev->merge_check_needed) {
615 struct r10bio r10_bio;
616 int s;
617 r10_bio.sector = sector;
618 raid10_find_phys(conf, &r10_bio);
619 rcu_read_lock();
620 for (s = 0; s < conf->copies; s++) {
621 int disk = r10_bio.devs[s].devnum;
622 struct md_rdev *rdev = rcu_dereference(
623 conf->mirrors[disk].rdev);
624 if (rdev && !test_bit(Faulty, &rdev->flags)) {
625 struct request_queue *q =
626 bdev_get_queue(rdev->bdev);
627 if (q->merge_bvec_fn) {
628 bvm->bi_sector = r10_bio.devs[s].addr
629 + rdev->data_offset;
630 bvm->bi_bdev = rdev->bdev;
631 max = min(max, q->merge_bvec_fn(
632 q, bvm, biovec));
633 }
634 }
635 rdev = rcu_dereference(conf->mirrors[disk].replacement);
636 if (rdev && !test_bit(Faulty, &rdev->flags)) {
637 struct request_queue *q =
638 bdev_get_queue(rdev->bdev);
639 if (q->merge_bvec_fn) {
640 bvm->bi_sector = r10_bio.devs[s].addr
641 + rdev->data_offset;
642 bvm->bi_bdev = rdev->bdev;
643 max = min(max, q->merge_bvec_fn(
644 q, bvm, biovec));
645 }
646 }
647 }
648 rcu_read_unlock();
649 }
650 return max;
594} 651}
595 652
596/* 653/*
@@ -654,11 +711,12 @@ retry:
654 disk = r10_bio->devs[slot].devnum; 711 disk = r10_bio->devs[slot].devnum;
655 rdev = rcu_dereference(conf->mirrors[disk].replacement); 712 rdev = rcu_dereference(conf->mirrors[disk].replacement);
656 if (rdev == NULL || test_bit(Faulty, &rdev->flags) || 713 if (rdev == NULL || test_bit(Faulty, &rdev->flags) ||
714 test_bit(Unmerged, &rdev->flags) ||
657 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) 715 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
658 rdev = rcu_dereference(conf->mirrors[disk].rdev); 716 rdev = rcu_dereference(conf->mirrors[disk].rdev);
659 if (rdev == NULL) 717 if (rdev == NULL ||
660 continue; 718 test_bit(Faulty, &rdev->flags) ||
661 if (test_bit(Faulty, &rdev->flags)) 719 test_bit(Unmerged, &rdev->flags))
662 continue; 720 continue;
663 if (!test_bit(In_sync, &rdev->flags) && 721 if (!test_bit(In_sync, &rdev->flags) &&
664 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset) 722 r10_bio->devs[slot].addr + sectors > rdev->recovery_offset)
@@ -849,9 +907,22 @@ static void wait_barrier(struct r10conf *conf)
849 spin_lock_irq(&conf->resync_lock); 907 spin_lock_irq(&conf->resync_lock);
850 if (conf->barrier) { 908 if (conf->barrier) {
851 conf->nr_waiting++; 909 conf->nr_waiting++;
852 wait_event_lock_irq(conf->wait_barrier, !conf->barrier, 910 /* Wait for the barrier to drop.
911 * However if there are already pending
912 * requests (preventing the barrier from
913 * rising completely), and the
914 * pre-process bio queue isn't empty,
915 * then don't wait, as we need to empty
916 * that queue to get the nr_pending
917 * count down.
918 */
919 wait_event_lock_irq(conf->wait_barrier,
920 !conf->barrier ||
921 (conf->nr_pending &&
922 current->bio_list &&
923 !bio_list_empty(current->bio_list)),
853 conf->resync_lock, 924 conf->resync_lock,
854 ); 925 );
855 conf->nr_waiting--; 926 conf->nr_waiting--;
856 } 927 }
857 conf->nr_pending++; 928 conf->nr_pending++;
@@ -1107,12 +1178,14 @@ retry_write:
1107 blocked_rdev = rrdev; 1178 blocked_rdev = rrdev;
1108 break; 1179 break;
1109 } 1180 }
1110 if (rrdev && test_bit(Faulty, &rrdev->flags)) 1181 if (rrdev && (test_bit(Faulty, &rrdev->flags)
1182 || test_bit(Unmerged, &rrdev->flags)))
1111 rrdev = NULL; 1183 rrdev = NULL;
1112 1184
1113 r10_bio->devs[i].bio = NULL; 1185 r10_bio->devs[i].bio = NULL;
1114 r10_bio->devs[i].repl_bio = NULL; 1186 r10_bio->devs[i].repl_bio = NULL;
1115 if (!rdev || test_bit(Faulty, &rdev->flags)) { 1187 if (!rdev || test_bit(Faulty, &rdev->flags) ||
1188 test_bit(Unmerged, &rdev->flags)) {
1116 set_bit(R10BIO_Degraded, &r10_bio->state); 1189 set_bit(R10BIO_Degraded, &r10_bio->state);
1117 continue; 1190 continue;
1118 } 1191 }
@@ -1463,18 +1536,24 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1463 int mirror; 1536 int mirror;
1464 int first = 0; 1537 int first = 0;
1465 int last = conf->raid_disks - 1; 1538 int last = conf->raid_disks - 1;
1539 struct request_queue *q = bdev_get_queue(rdev->bdev);
1466 1540
1467 if (mddev->recovery_cp < MaxSector) 1541 if (mddev->recovery_cp < MaxSector)
1468 /* only hot-add to in-sync arrays, as recovery is 1542 /* only hot-add to in-sync arrays, as recovery is
1469 * very different from resync 1543 * very different from resync
1470 */ 1544 */
1471 return -EBUSY; 1545 return -EBUSY;
1472 if (!enough(conf, -1)) 1546 if (rdev->saved_raid_disk < 0 && !enough(conf, -1))
1473 return -EINVAL; 1547 return -EINVAL;
1474 1548
1475 if (rdev->raid_disk >= 0) 1549 if (rdev->raid_disk >= 0)
1476 first = last = rdev->raid_disk; 1550 first = last = rdev->raid_disk;
1477 1551
1552 if (q->merge_bvec_fn) {
1553 set_bit(Unmerged, &rdev->flags);
1554 mddev->merge_check_needed = 1;
1555 }
1556
1478 if (rdev->saved_raid_disk >= first && 1557 if (rdev->saved_raid_disk >= first &&
1479 conf->mirrors[rdev->saved_raid_disk].rdev == NULL) 1558 conf->mirrors[rdev->saved_raid_disk].rdev == NULL)
1480 mirror = rdev->saved_raid_disk; 1559 mirror = rdev->saved_raid_disk;
@@ -1494,11 +1573,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1494 err = 0; 1573 err = 0;
1495 disk_stack_limits(mddev->gendisk, rdev->bdev, 1574 disk_stack_limits(mddev->gendisk, rdev->bdev,
1496 rdev->data_offset << 9); 1575 rdev->data_offset << 9);
1497 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
1498 blk_queue_max_segments(mddev->queue, 1);
1499 blk_queue_segment_boundary(mddev->queue,
1500 PAGE_CACHE_SIZE - 1);
1501 }
1502 conf->fullsync = 1; 1576 conf->fullsync = 1;
1503 rcu_assign_pointer(p->replacement, rdev); 1577 rcu_assign_pointer(p->replacement, rdev);
1504 break; 1578 break;
@@ -1506,17 +1580,6 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1506 1580
1507 disk_stack_limits(mddev->gendisk, rdev->bdev, 1581 disk_stack_limits(mddev->gendisk, rdev->bdev,
1508 rdev->data_offset << 9); 1582 rdev->data_offset << 9);
1509 /* as we don't honour merge_bvec_fn, we must
1510 * never risk violating it, so limit
1511 * ->max_segments to one lying with a single
1512 * page, as a one page request is never in
1513 * violation.
1514 */
1515 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
1516 blk_queue_max_segments(mddev->queue, 1);
1517 blk_queue_segment_boundary(mddev->queue,
1518 PAGE_CACHE_SIZE - 1);
1519 }
1520 1583
1521 p->head_position = 0; 1584 p->head_position = 0;
1522 p->recovery_disabled = mddev->recovery_disabled - 1; 1585 p->recovery_disabled = mddev->recovery_disabled - 1;
@@ -1527,7 +1590,19 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev)
1527 rcu_assign_pointer(p->rdev, rdev); 1590 rcu_assign_pointer(p->rdev, rdev);
1528 break; 1591 break;
1529 } 1592 }
1530 1593 if (err == 0 && test_bit(Unmerged, &rdev->flags)) {
1594 /* Some requests might not have seen this new
1595 * merge_bvec_fn. We must wait for them to complete
1596 * before merging the device fully.
1597 * First we make sure any code which has tested
1598 * our function has submitted the request, then
1599 * we wait for all outstanding requests to complete.
1600 */
1601 synchronize_sched();
1602 raise_barrier(conf, 0);
1603 lower_barrier(conf);
1604 clear_bit(Unmerged, &rdev->flags);
1605 }
1531 md_integrity_add_rdev(rdev, mddev); 1606 md_integrity_add_rdev(rdev, mddev);
1532 print_conf(conf); 1607 print_conf(conf);
1533 return err; 1608 return err;
@@ -1668,10 +1743,8 @@ static void end_sync_write(struct bio *bio, int error)
1668 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl); 1743 d = find_bio_disk(conf, r10_bio, bio, &slot, &repl);
1669 if (repl) 1744 if (repl)
1670 rdev = conf->mirrors[d].replacement; 1745 rdev = conf->mirrors[d].replacement;
1671 if (!rdev) { 1746 else
1672 smp_mb();
1673 rdev = conf->mirrors[d].rdev; 1747 rdev = conf->mirrors[d].rdev;
1674 }
1675 1748
1676 if (!uptodate) { 1749 if (!uptodate) {
1677 if (repl) 1750 if (repl)
@@ -2052,6 +2125,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
2052 "md/raid10:%s: %s: Failing raid device\n", 2125 "md/raid10:%s: %s: Failing raid device\n",
2053 mdname(mddev), b); 2126 mdname(mddev), b);
2054 md_error(mddev, conf->mirrors[d].rdev); 2127 md_error(mddev, conf->mirrors[d].rdev);
2128 r10_bio->devs[r10_bio->read_slot].bio = IO_BLOCKED;
2055 return; 2129 return;
2056 } 2130 }
2057 2131
@@ -2072,6 +2146,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
2072 d = r10_bio->devs[sl].devnum; 2146 d = r10_bio->devs[sl].devnum;
2073 rdev = rcu_dereference(conf->mirrors[d].rdev); 2147 rdev = rcu_dereference(conf->mirrors[d].rdev);
2074 if (rdev && 2148 if (rdev &&
2149 !test_bit(Unmerged, &rdev->flags) &&
2075 test_bit(In_sync, &rdev->flags) && 2150 test_bit(In_sync, &rdev->flags) &&
2076 is_badblock(rdev, r10_bio->devs[sl].addr + sect, s, 2151 is_badblock(rdev, r10_bio->devs[sl].addr + sect, s,
2077 &first_bad, &bad_sectors) == 0) { 2152 &first_bad, &bad_sectors) == 0) {
@@ -2105,8 +2180,11 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
2105 rdev, 2180 rdev,
2106 r10_bio->devs[r10_bio->read_slot].addr 2181 r10_bio->devs[r10_bio->read_slot].addr
2107 + sect, 2182 + sect,
2108 s, 0)) 2183 s, 0)) {
2109 md_error(mddev, rdev); 2184 md_error(mddev, rdev);
2185 r10_bio->devs[r10_bio->read_slot].bio
2186 = IO_BLOCKED;
2187 }
2110 break; 2188 break;
2111 } 2189 }
2112 2190
@@ -2122,6 +2200,7 @@ static void fix_read_error(struct r10conf *conf, struct mddev *mddev, struct r10
2122 d = r10_bio->devs[sl].devnum; 2200 d = r10_bio->devs[sl].devnum;
2123 rdev = rcu_dereference(conf->mirrors[d].rdev); 2201 rdev = rcu_dereference(conf->mirrors[d].rdev);
2124 if (!rdev || 2202 if (!rdev ||
2203 test_bit(Unmerged, &rdev->flags) ||
2125 !test_bit(In_sync, &rdev->flags)) 2204 !test_bit(In_sync, &rdev->flags))
2126 continue; 2205 continue;
2127 2206
@@ -2299,17 +2378,20 @@ static void handle_read_error(struct mddev *mddev, struct r10bio *r10_bio)
2299 * This is all done synchronously while the array is 2378 * This is all done synchronously while the array is
2300 * frozen. 2379 * frozen.
2301 */ 2380 */
2381 bio = r10_bio->devs[slot].bio;
2382 bdevname(bio->bi_bdev, b);
2383 bio_put(bio);
2384 r10_bio->devs[slot].bio = NULL;
2385
2302 if (mddev->ro == 0) { 2386 if (mddev->ro == 0) {
2303 freeze_array(conf); 2387 freeze_array(conf);
2304 fix_read_error(conf, mddev, r10_bio); 2388 fix_read_error(conf, mddev, r10_bio);
2305 unfreeze_array(conf); 2389 unfreeze_array(conf);
2306 } 2390 } else
2391 r10_bio->devs[slot].bio = IO_BLOCKED;
2392
2307 rdev_dec_pending(rdev, mddev); 2393 rdev_dec_pending(rdev, mddev);
2308 2394
2309 bio = r10_bio->devs[slot].bio;
2310 bdevname(bio->bi_bdev, b);
2311 r10_bio->devs[slot].bio =
2312 mddev->ro ? IO_BLOCKED : NULL;
2313read_more: 2395read_more:
2314 rdev = read_balance(conf, r10_bio, &max_sectors); 2396 rdev = read_balance(conf, r10_bio, &max_sectors);
2315 if (rdev == NULL) { 2397 if (rdev == NULL) {
@@ -2318,13 +2400,10 @@ read_more:
2318 mdname(mddev), b, 2400 mdname(mddev), b,
2319 (unsigned long long)r10_bio->sector); 2401 (unsigned long long)r10_bio->sector);
2320 raid_end_bio_io(r10_bio); 2402 raid_end_bio_io(r10_bio);
2321 bio_put(bio);
2322 return; 2403 return;
2323 } 2404 }
2324 2405
2325 do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC); 2406 do_sync = (r10_bio->master_bio->bi_rw & REQ_SYNC);
2326 if (bio)
2327 bio_put(bio);
2328 slot = r10_bio->read_slot; 2407 slot = r10_bio->read_slot;
2329 printk_ratelimited( 2408 printk_ratelimited(
2330 KERN_ERR 2409 KERN_ERR
@@ -2360,7 +2439,6 @@ read_more:
2360 mbio->bi_phys_segments++; 2439 mbio->bi_phys_segments++;
2361 spin_unlock_irq(&conf->device_lock); 2440 spin_unlock_irq(&conf->device_lock);
2362 generic_make_request(bio); 2441 generic_make_request(bio);
2363 bio = NULL;
2364 2442
2365 r10_bio = mempool_alloc(conf->r10bio_pool, 2443 r10_bio = mempool_alloc(conf->r10bio_pool,
2366 GFP_NOIO); 2444 GFP_NOIO);
@@ -3225,7 +3303,7 @@ static int run(struct mddev *mddev)
3225 blk_queue_io_opt(mddev->queue, chunk_size * 3303 blk_queue_io_opt(mddev->queue, chunk_size *
3226 (conf->raid_disks / conf->near_copies)); 3304 (conf->raid_disks / conf->near_copies));
3227 3305
3228 list_for_each_entry(rdev, &mddev->disks, same_set) { 3306 rdev_for_each(rdev, mddev) {
3229 3307
3230 disk_idx = rdev->raid_disk; 3308 disk_idx = rdev->raid_disk;
3231 if (disk_idx >= conf->raid_disks 3309 if (disk_idx >= conf->raid_disks
@@ -3243,18 +3321,8 @@ static int run(struct mddev *mddev)
3243 disk->rdev = rdev; 3321 disk->rdev = rdev;
3244 } 3322 }
3245 3323
3246 disk->rdev = rdev;
3247 disk_stack_limits(mddev->gendisk, rdev->bdev, 3324 disk_stack_limits(mddev->gendisk, rdev->bdev,
3248 rdev->data_offset << 9); 3325 rdev->data_offset << 9);
3249 /* as we don't honour merge_bvec_fn, we must never risk
3250 * violating it, so limit max_segments to 1 lying
3251 * within a single page.
3252 */
3253 if (rdev->bdev->bd_disk->queue->merge_bvec_fn) {
3254 blk_queue_max_segments(mddev->queue, 1);
3255 blk_queue_segment_boundary(mddev->queue,
3256 PAGE_CACHE_SIZE - 1);
3257 }
3258 3326
3259 disk->head_position = 0; 3327 disk->head_position = 0;
3260 } 3328 }
@@ -3318,8 +3386,7 @@ static int run(struct mddev *mddev)
3318 mddev->queue->backing_dev_info.ra_pages = 2* stripe; 3386 mddev->queue->backing_dev_info.ra_pages = 2* stripe;
3319 } 3387 }
3320 3388
3321 if (conf->near_copies < conf->raid_disks) 3389 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
3322 blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec);
3323 3390
3324 if (md_integrity_register(mddev)) 3391 if (md_integrity_register(mddev))
3325 goto out_free_conf; 3392 goto out_free_conf;
@@ -3369,6 +3436,43 @@ static void raid10_quiesce(struct mddev *mddev, int state)
3369 } 3436 }
3370} 3437}
3371 3438
3439static int raid10_resize(struct mddev *mddev, sector_t sectors)
3440{
3441 /* Resize of 'far' arrays is not supported.
3442 * For 'near' and 'offset' arrays we can set the
3443 * number of sectors used to be an appropriate multiple
3444 * of the chunk size.
3445 * For 'offset', this is far_copies*chunksize.
3446 * For 'near' the multiplier is the LCM of
3447 * near_copies and raid_disks.
3448 * So if far_copies > 1 && !far_offset, fail.
3450 * Else find LCM(raid_disks, near_copies)*far_copies and
3450 * multiply by chunk_size. Then round to this number.
3451 * This is mostly done by raid10_size()
3452 */
3453 struct r10conf *conf = mddev->private;
3454 sector_t oldsize, size;
3455
3456 if (conf->far_copies > 1 && !conf->far_offset)
3457 return -EINVAL;
3458
3459 oldsize = raid10_size(mddev, 0, 0);
3460 size = raid10_size(mddev, sectors, 0);
3461 md_set_array_sectors(mddev, size);
3462 if (mddev->array_sectors > size)
3463 return -EINVAL;
3464 set_capacity(mddev->gendisk, mddev->array_sectors);
3465 revalidate_disk(mddev->gendisk);
3466 if (sectors > mddev->dev_sectors &&
3467 mddev->recovery_cp > oldsize) {
3468 mddev->recovery_cp = oldsize;
3469 set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
3470 }
3471 mddev->dev_sectors = sectors;
3472 mddev->resync_max_sectors = size;
3473 return 0;
3474}
3475
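
The raid10_resize() comment above says the usable size must be a multiple of LCM(raid_disks, near_copies) * far_copies chunks for 'near' and 'offset' layouts (the driver gets this via raid10_size()). Purely as an illustration of that rounding rule, under the stated assumptions about the layout parameters:

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t gcd(uint64_t a, uint64_t b)
    {
            while (b) {
                    uint64_t t = a % b;
                    a = b;
                    b = t;
            }
            return a;
    }

    /* Round a requested size (in sectors) down to the multiple described in
     * the comment: LCM(raid_disks, near_copies) * far_copies chunks. */
    static uint64_t round_to_raid10_multiple(uint64_t sectors, unsigned chunk_sectors,
                                             unsigned raid_disks, unsigned near_copies,
                                             unsigned far_copies)
    {
            uint64_t lcm = (uint64_t)raid_disks / gcd(raid_disks, near_copies) * near_copies;
            uint64_t step = lcm * far_copies * chunk_sectors;

            return sectors - sectors % step;
    }

    int main(void)
    {
            /* 4 disks, near_copies=2, far_copies=1, 1024-sector chunks:
             * step = LCM(4,2) * 1 * 1024 = 4096 sectors. */
            printf("%llu\n", (unsigned long long)
                   round_to_raid10_multiple(1000000, 1024, 4, 2, 1));
            return 0;
    }
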
3372static void *raid10_takeover_raid0(struct mddev *mddev) 3476static void *raid10_takeover_raid0(struct mddev *mddev)
3373{ 3477{
3374 struct md_rdev *rdev; 3478 struct md_rdev *rdev;
@@ -3392,7 +3496,7 @@ static void *raid10_takeover_raid0(struct mddev *mddev)
3392 3496
3393 conf = setup_conf(mddev); 3497 conf = setup_conf(mddev);
3394 if (!IS_ERR(conf)) { 3498 if (!IS_ERR(conf)) {
3395 list_for_each_entry(rdev, &mddev->disks, same_set) 3499 rdev_for_each(rdev, mddev)
3396 if (rdev->raid_disk >= 0) 3500 if (rdev->raid_disk >= 0)
3397 rdev->new_raid_disk = rdev->raid_disk * 2; 3501 rdev->new_raid_disk = rdev->raid_disk * 2;
3398 conf->barrier = 1; 3502 conf->barrier = 1;
@@ -3438,6 +3542,7 @@ static struct md_personality raid10_personality =
3438 .sync_request = sync_request, 3542 .sync_request = sync_request,
3439 .quiesce = raid10_quiesce, 3543 .quiesce = raid10_quiesce,
3440 .size = raid10_size, 3544 .size = raid10_size,
3545 .resize = raid10_resize,
3441 .takeover = raid10_takeover, 3546 .takeover = raid10_takeover,
3442}; 3547};
3443 3548
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 360f2b98f62b..23ac880bba9a 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -208,11 +208,10 @@ static void __release_stripe(struct r5conf *conf, struct stripe_head *sh)
208 md_wakeup_thread(conf->mddev->thread); 208 md_wakeup_thread(conf->mddev->thread);
209 } else { 209 } else {
210 BUG_ON(stripe_operations_active(sh)); 210 BUG_ON(stripe_operations_active(sh));
211 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) { 211 if (test_and_clear_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
212 atomic_dec(&conf->preread_active_stripes); 212 if (atomic_dec_return(&conf->preread_active_stripes)
213 if (atomic_read(&conf->preread_active_stripes) < IO_THRESHOLD) 213 < IO_THRESHOLD)
214 md_wakeup_thread(conf->mddev->thread); 214 md_wakeup_thread(conf->mddev->thread);
215 }
216 atomic_dec(&conf->active_stripes); 215 atomic_dec(&conf->active_stripes);
217 if (!test_bit(STRIPE_EXPANDING, &sh->state)) { 216 if (!test_bit(STRIPE_EXPANDING, &sh->state)) {
218 list_add_tail(&sh->lru, &conf->inactive_list); 217 list_add_tail(&sh->lru, &conf->inactive_list);
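
The __release_stripe() change above folds the decrement and the threshold test into a single atomic_dec_return(), so the value compared against IO_THRESHOLD is exactly the one this decrement produced rather than whatever a later atomic_read() observes after other CPUs have touched the counter. The C11 snippet below contrasts the two patterns with illustrative names; it is not the kernel atomics API.

    #include <stdatomic.h>
    #include <stdio.h>

    #define IO_THRESHOLD 1

    static atomic_int preread_active = 3;

    /* Racy variant: another CPU may change the counter between the decrement
     * and the read, so the wakeup test can see a stale value. */
    static int release_racy(void)
    {
            atomic_fetch_sub(&preread_active, 1);
            return atomic_load(&preread_active) < IO_THRESHOLD;
    }

    /* Pattern used by the patch: test the value produced by this decrement
     * (atomic_fetch_sub returns the old value, so subtract 1). */
    static int release_combined(void)
    {
            return atomic_fetch_sub(&preread_active, 1) - 1 < IO_THRESHOLD;
    }

    int main(void)
    {
            printf("wake after racy release:     %d\n", release_racy());
            printf("wake after combined release: %d\n", release_combined());
            return 0;
    }
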
@@ -4843,7 +4842,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
4843 4842
4844 pr_debug("raid456: run(%s) called.\n", mdname(mddev)); 4843 pr_debug("raid456: run(%s) called.\n", mdname(mddev));
4845 4844
4846 list_for_each_entry(rdev, &mddev->disks, same_set) { 4845 rdev_for_each(rdev, mddev) {
4847 raid_disk = rdev->raid_disk; 4846 raid_disk = rdev->raid_disk;
4848 if (raid_disk >= max_disks 4847 if (raid_disk >= max_disks
4849 || raid_disk < 0) 4848 || raid_disk < 0)
@@ -5178,7 +5177,7 @@ static int run(struct mddev *mddev)
5178 blk_queue_io_opt(mddev->queue, chunk_size * 5177 blk_queue_io_opt(mddev->queue, chunk_size *
5179 (conf->raid_disks - conf->max_degraded)); 5178 (conf->raid_disks - conf->max_degraded));
5180 5179
5181 list_for_each_entry(rdev, &mddev->disks, same_set) 5180 rdev_for_each(rdev, mddev)
5182 disk_stack_limits(mddev->gendisk, rdev->bdev, 5181 disk_stack_limits(mddev->gendisk, rdev->bdev,
5183 rdev->data_offset << 9); 5182 rdev->data_offset << 9);
5184 } 5183 }
@@ -5362,7 +5361,7 @@ static int raid5_add_disk(struct mddev *mddev, struct md_rdev *rdev)
5362 if (mddev->recovery_disabled == conf->recovery_disabled) 5361 if (mddev->recovery_disabled == conf->recovery_disabled)
5363 return -EBUSY; 5362 return -EBUSY;
5364 5363
5365 if (has_failed(conf)) 5364 if (rdev->saved_raid_disk < 0 && has_failed(conf))
5366 /* no point adding a device */ 5365 /* no point adding a device */
5367 return -EINVAL; 5366 return -EINVAL;
5368 5367
@@ -5501,7 +5500,7 @@ static int raid5_start_reshape(struct mddev *mddev)
5501 if (!check_stripe_cache(mddev)) 5500 if (!check_stripe_cache(mddev))
5502 return -ENOSPC; 5501 return -ENOSPC;
5503 5502
5504 list_for_each_entry(rdev, &mddev->disks, same_set) 5503 rdev_for_each(rdev, mddev)
5505 if (!test_bit(In_sync, &rdev->flags) 5504 if (!test_bit(In_sync, &rdev->flags)
5506 && !test_bit(Faulty, &rdev->flags)) 5505 && !test_bit(Faulty, &rdev->flags))
5507 spares++; 5506 spares++;
@@ -5547,16 +5546,14 @@ static int raid5_start_reshape(struct mddev *mddev)
5547 * such devices during the reshape and confusion could result. 5546 * such devices during the reshape and confusion could result.
5548 */ 5547 */
5549 if (mddev->delta_disks >= 0) { 5548 if (mddev->delta_disks >= 0) {
5550 int added_devices = 0; 5549 rdev_for_each(rdev, mddev)
5551 list_for_each_entry(rdev, &mddev->disks, same_set)
5552 if (rdev->raid_disk < 0 && 5550 if (rdev->raid_disk < 0 &&
5553 !test_bit(Faulty, &rdev->flags)) { 5551 !test_bit(Faulty, &rdev->flags)) {
5554 if (raid5_add_disk(mddev, rdev) == 0) { 5552 if (raid5_add_disk(mddev, rdev) == 0) {
5555 if (rdev->raid_disk 5553 if (rdev->raid_disk
5556 >= conf->previous_raid_disks) { 5554 >= conf->previous_raid_disks)
5557 set_bit(In_sync, &rdev->flags); 5555 set_bit(In_sync, &rdev->flags);
5558 added_devices++; 5556 else
5559 } else
5560 rdev->recovery_offset = 0; 5557 rdev->recovery_offset = 0;
5561 5558
5562 if (sysfs_link_rdev(mddev, rdev)) 5559 if (sysfs_link_rdev(mddev, rdev))
@@ -5566,7 +5563,6 @@ static int raid5_start_reshape(struct mddev *mddev)
5566 && !test_bit(Faulty, &rdev->flags)) { 5563 && !test_bit(Faulty, &rdev->flags)) {
5567 /* This is a spare that was manually added */ 5564 /* This is a spare that was manually added */
5568 set_bit(In_sync, &rdev->flags); 5565 set_bit(In_sync, &rdev->flags);
5569 added_devices++;
5570 } 5566 }
5571 5567
5572 /* When a reshape changes the number of devices, 5568 /* When a reshape changes the number of devices,
@@ -5592,6 +5588,7 @@ static int raid5_start_reshape(struct mddev *mddev)
5592 spin_lock_irq(&conf->device_lock); 5588 spin_lock_irq(&conf->device_lock);
5593 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks; 5589 mddev->raid_disks = conf->raid_disks = conf->previous_raid_disks;
5594 conf->reshape_progress = MaxSector; 5590 conf->reshape_progress = MaxSector;
5591 mddev->reshape_position = MaxSector;
5595 spin_unlock_irq(&conf->device_lock); 5592 spin_unlock_irq(&conf->device_lock);
5596 return -EAGAIN; 5593 return -EAGAIN;
5597 } 5594 }